├── .github
│   └── workflows
│       ├── publish-docker.yml
│       ├── release.yml
│       └── test.yml
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── .name
│   ├── csv-editor.xml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── jsLibraryMappings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── strkit.iml
│   └── vcs.xml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── catalogs
│   └── pathogenic_assoc.hg38.tsv
├── docs
│   ├── caller_catalog.md
│   ├── caller_usage.md
│   ├── images
│   │   ├── browser_hist.png
│   │   ├── browser_igv.png
│   │   ├── call_method_flow.png
│   │   ├── strkit_logo_open_graph.png
│   │   └── strkit_logo_small.png
│   ├── output_formats.md
│   └── trio_analyses.md
├── pyproject.toml
├── requirements.txt
├── setup.py
├── strkit
│   ├── VERSION
│   ├── __init__.py
│   ├── call
│   │   ├── __init__.py
│   │   ├── align_matrix.py
│   │   ├── allele.py
│   │   ├── call_locus.py
│   │   ├── call_sample.py
│   │   ├── cigar.py
│   │   ├── non_daemonic_pool.py
│   │   ├── output
│   │   │   ├── __init__.py
│   │   │   ├── json_report.py
│   │   │   ├── tsv.py
│   │   │   └── vcf.py
│   │   ├── params.py
│   │   ├── realign.py
│   │   ├── repeats.py
│   │   ├── snvs.py
│   │   ├── types.py
│   │   ├── utils.py
│   │   └── validation.py
│   ├── catalog
│   │   ├── __init__.py
│   │   └── combine.py
│   ├── constants.py
│   ├── convert
│   │   ├── __init__.py
│   │   ├── _bed_4.py
│   │   ├── constants.py
│   │   ├── converter.py
│   │   ├── expansionhunter.py
│   │   ├── gangstr.py
│   │   ├── hipstr.py
│   │   └── trgt.py
│   ├── entry.py
│   ├── exceptions.py
│   ├── iupac.py
│   ├── json.py
│   ├── logger.py
│   ├── mi
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── expansionhunter.py
│   │   ├── gangstr.py
│   │   ├── generic_vcf.py
│   │   ├── intervals.py
│   │   ├── repeathmm.py
│   │   ├── result.py
│   │   ├── straglr.py
│   │   ├── strkit.py
│   │   ├── tandem_genotypes.py
│   │   ├── trgt.py
│   │   └── vcf_utils.py
│   ├── utils.py
│   └── viz
│       ├── __init__.py
│       ├── server.py
│       ├── static
│       │   └── logo.png
│       └── templates
│           └── browser.html
└── tests
    ├── data
    │   └── test_loci.bed
    ├── test_caller_locus_validation.py
    ├── test_caller_utils.py
    ├── test_iupac.py
    └── test_mi_intervals.py
/.github/workflows/publish-docker.yml:
--------------------------------------------------------------------------------
1 | name: Publish STRkit Docker image
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | publish:
9 | runs-on: ubuntu-latest
10 |
11 | permissions:
12 | packages: write
13 | contents: read
14 |
15 | steps:
16 | - uses: actions/checkout@v4
17 |
18 | - name: Set up QEMU
19 | uses: docker/setup-qemu-action@v3
20 | with:
21 | platforms: linux/amd64,linux/arm64
22 |
23 | - uses: docker/metadata-action@v5
24 | id: meta
25 | with:
26 | images: ghcr.io/davidlougheed/strkit
27 | tags: |
28 | type=semver,pattern={{version}}
29 | type=semver,pattern={{major}}.{{minor}}
30 |
31 | - uses: docker/setup-buildx-action@v3
32 |
33 | - uses: docker/login-action@v3
34 | with:
35 | registry: ghcr.io
36 | username: ${{ github.actor }}
37 | password: ${{ secrets.GITHUB_TOKEN }}
38 |
39 | - uses: docker/build-push-action@v5
40 | with:
41 | context: .
42 | push: true
43 | tags: ${{ steps.meta.outputs.tags }}
44 | labels: ${{ steps.meta.outputs.labels }}
45 | platforms: linux/amd64,linux/arm64
46 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Publish PyPI release
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | publish:
9 | runs-on: ubuntu-latest
10 |
11 | permissions:
12 | contents: read
13 | id-token: write
14 |
15 | environment:
16 | name: release
17 | url: https://pypi.org/p/strkit
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 |
22 | - uses: actions/setup-python@v5
23 | with:
24 | python-version: '3.10'
25 |
26 | - name: Install pypa/build
27 | run: python -m pip install build --user
28 |
29 | - name: Build
30 | run: python -m build --sdist --wheel --outdir dist/ .
31 |
32 | - name: Publish to PyPI
33 | uses: pypa/gh-action-pypi-publish@release/v1
34 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 |
9 | jobs:
10 | test:
11 | runs-on: ubuntu-latest
12 | strategy:
13 | matrix:
14 | python-version: [ "3.10", "3.11", "3.12" ]
15 |
16 | steps:
17 | - uses: actions/checkout@v4
18 | - uses: actions/setup-python@v5
19 | name: Set up Python
20 | with:
21 | python-version: ${{ matrix.python-version }}
22 | - name: Install dependencies
23 | run: pip install -r requirements.txt
24 | - name: Install STRkit
25 | run: pip install .
26 | - name: Test
27 | run: pytest -svv --cov=strkit --cov-branch
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /env
2 | /envp11
3 | __pycache__
4 |
5 | /build
6 | /dist
7 | /strkit.egg-info
8 | # ignore WIP cohort code for now
9 | /strkit/cohort
10 |
11 | *.bam
12 | *.bai
13 | *.fa.gz
14 | *.fa.gz.fai
15 | *.fa.gz.gzi
16 | *.bed
17 | !tests/data/*.bed
18 | /*.json
19 | /*.tsv
20 | *.vcf.gz*
21 | *.vcf
22 |
23 | *.token
24 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | strkit
--------------------------------------------------------------------------------
/.idea/csv-editor.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/jsLibraryMappings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/strkit.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.12-bookworm
2 |
3 | WORKDIR /strkit
4 |
5 | COPY LICENSE .
6 | COPY MANIFEST.in .
7 | COPY pyproject.toml .
8 | COPY README.md .
9 | COPY setup.py .
10 | COPY strkit strkit
11 |
12 | RUN curl https://sh.rustup.rs -sSf > rustup-init.sh
13 | RUN sh ./rustup-init.sh -y
14 | ENV PATH="/root/.cargo/bin:${PATH}"
15 |
16 | RUN pip install -U pip
17 | RUN pip install --no-cache-dir -v .
18 |
19 | CMD [ "strkit" ]
20 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include strkit/VERSION
2 | include strkit/viz/static/logo.png
3 | include strkit/viz/templates/*.html
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # STRkit - short tandem repeat genotyping with long reads
2 |
3 | [](https://badge.fury.io/py/strkit)
4 | [](https://doi.org/10.1101/2025.03.25.645269)
5 | [](https://doi.org/10.5281/zenodo.12689906)
6 |
7 | STRkit is a short tandem repeat (STR) genotyping and analysis toolkit for long read sequencing data, especially
8 | PacBio HiFi data. The STRkit software package is written in Python and is available in the PyPI package registry or as
9 | a Docker container.
10 |
11 | If you use STRkit in published work, please cite our preprint:
12 |
13 | > [STRkit: precise, read-level genotyping of short tandem repeats using long reads and single-nucleotide variation.](https://doi.org/10.1101/2025.03.25.645269)
14 | > David R Lougheed, Tomi Pastinen, Guillaume Bourque. *BioRxiv preprint*.
15 | > DOI: [10.1101/2025.03.25.645269](https://doi.org/10.1101/2025.03.25.645269)
16 |
17 |
18 |
19 |
20 | ## Table of Contents
21 |
22 | * [Installation](#installation)
23 | * [Via PyPI](#via-pypi)
24 | * [As a Docker container](#as-a-docker-container)
25 | * [Commands](#commands)
26 | * [`strkit call`: Genotype caller with bootstrapped confidence intervals](#strkit-call-genotype-caller-with-bootstrapped-confidence-intervals)
27 | * [Features](#features)
28 | * [Usage](#usage)
29 | * [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format)
30 | * [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer)
31 | * [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis)
32 | * [Usage](#usage-1)
33 | * [Further documentation](#further-documentation)
34 | * [`strkit convert`: STR catalog conversion](#strkit-convert-str-catalog-conversion)
35 | * [Usage](#usage-2)
36 | * [Copyright and License](#copyright-and-license)
37 | * [Notice](#notice)
38 | * [Exceptions](#exceptions)
39 |
40 |
41 | ## Installation
42 |
43 | ### Via PyPI
44 |
45 | STRkit requires Python 3.10+ and can be installed from PyPI via `pip`
46 | with the following command:
47 |
48 | ```bash
49 | python -m pip install strkit
50 | ```
51 |
52 | You may need to install the [Rust toolchain](https://www.rust-lang.org/tools/install)
53 | and a C compiler (e.g., `gcc`, `clang`), as well as `cmake`, to compile the `strkit_rust_ext` wheel,
54 | although prebuilt wheels for this module are available for some platforms. Compiling the wheel may take quite
55 | a long time (in the tens of minutes).
56 |
57 | On Digital Research Alliance of Canada/Compute Canada clusters, this involves loading a few modules:
58 |
59 | ```bash
60 | module load rust/1.85.0 clang/18.1.8 python/3.11 scipy-stack/2025a parasail/2.6.2
61 | python -m pip install strkit
62 | ```
63 |
64 | STRkit should then be available in your Python environment as a command-line tool:
65 |
66 | ```bash
67 | strkit --help
68 | ```
69 |
70 | ### As a Docker container
71 |
72 | STRkit is also available as a [Docker container](https://github.com/davidlougheed/strkit/pkgs/container/strkit), stored
73 | in the GitHub Container Registry.
74 |
75 | It can be pulled using the following command:
76 |
77 | ```bash
78 | docker pull ghcr.io/davidlougheed/strkit:latest
79 | ```
80 |
81 | Then, STRkit commands can be run mostly as normal using the Docker image:
82 |
83 | ```bash
84 | docker run -it ghcr.io/davidlougheed/strkit --help
85 | ```
86 |
87 |
88 | ## Commands
89 |
90 | ### `strkit call`: Genotype caller with bootstrapped confidence intervals
91 |
92 | A Gaussian mixture model tandem repeat genotype caller for long read data.
93 | STRkit is tuned specifically for high-fidelity long reads, although other
94 | long read data should still work.
95 |
96 | ![STRkit call method flow chart](./docs/images/call_method_flow.png)
97 |
98 | #### Features:
99 |
100 | * Performant, vectorized (thanks to [parasail](https://github.com/jeffdaily/parasail))
101 | estimates of repeat counts from high-fidelity long reads and a supplied
102 | catalog of TR loci and motifs.
103 | * Re-weighting of longer reads, to compensate for their lower likelihood of observation.
104 | * Whole-genome and targeted genotyping modes to adjust this re-weighting.
105 | * Incorporation of single-nucleotide variation (SNVs) for better and faster calling plus
106 | additional downstream analysis possibilities.
107 | * Recommended for **HiFi and ONT R10 data only**. In my testing, SNV incorporation worsens runtime and call quality
108 | for ONT ultra-long-read data, but speeds up the tool and improves call quality for HiFi/ONT R10 data.
109 | * Parallelized for faster computing on clusters and for ad-hoc fast analysis of single samples.
110 | * 95% confidence intervals on calls via a user-configurable optional parametric bootstrapping process.
111 |
112 |
113 | #### Usage:
114 |
115 | See all parameters and example usage with a Slurm cluster:
116 | [Advanced caller usage and configuration](./docs/caller_usage.md)
117 |
118 | ##### EXAMPLE USAGE
119 |
120 | ```bash
121 | # For the dbSNP VCF used below for SNV incorporation, see https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/
122 | # (00-common_all.vcf.gz)
123 | #
124 | # "Accurate reads" here means HiFi / ONT R10 duplex reads, but in practice may also include ONT R10 simplex reads.
125 |
126 | strkit call \
127 | path/to/read/file.bam \ # [REQUIRED] One indexed read file (BAM/CRAM)
128 | --hq \ # If using accurate reads, enable this to get better genotyping & more robust expansion detection
129 | --realign \ # If using accurate reads, enable this to enable local realignment / read recovery. Good for detecting expansions, but slows down calling.
130 | --ref path/to/reference.fa.gz \ # [REQUIRED] Indexed FASTA-formatted reference genome
131 | --loci path/to/loci.bed \ # [REQUIRED] TRF-formatted (or 4-col, with motif as last column) sorted list of loci to genotype
132 | --incorporate-snvs path/to/dbsnp/00-common_all.vcf.gz \ # If you want, specify a SNV catalogue to help phase STRs & speed up calling
133 | --vcf my-calls.vcf \ # Calculate consensus sequences for alleles and output a .vcf (or .vcf.gz) with call data
134 | --seed 183 \ # Fixed random number generator seed for replicability
135 | --processes 10 \ # Number of parallel processes to use; DEFAULT: 1
136 | --no-tsv # If VCF output is enabled as above, we don't need TSV genotype output to stdout (which is the default)
137 | ```
138 |
139 | ##### REGARDING ALIGNMENTS
140 |
141 | Ideally, you should be using a read file aligned with parameters tuned for tandem repeats.
142 | PacBio provides a
143 | [recommended workflow](https://github.com/PacificBiosciences/apps-scripts/tree/master/RepeatAnalysisTools)
144 | for CCS alignment in this scenario. However, regular aligned readsets are fine and have been tested
145 | extensively.
146 |
147 | If you're using accurate long reads (e.g., HiFi, ONT R10 duplex) as input, **use the `--hq` and
148 | `--realign` options** to get better genotype calculation and a greater proportion of reads
149 | incorporated into the computed genotypes, respectively. These should not add much performance
150 | overhead. *In practice, these options may also aid calling with slightly-less-accurate reads.*
151 |
152 | If you want to **incorporate haplotagging from an alignment file (`HP` tags)** into the
153 | process, which should speed up runtime and potentially improve calling results, you must pass
154 | the `--use-hp` flag.
155 |
156 | ##### REGARDING SNV INCORPORATION
157 |
158 | If you want to **incorporate SNV calling** into the process, which speeds up runtime and gives
159 | marginally better calling results, you must provide an indexed, `bgzip`-compressed SNV catalog
160 | VCF which matches your reference genome. You can find dbSNP VCFs at
161 | [`https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/`](https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/).
162 | The file for GRCh38 is called `00-common_all.vcf.gz` as of time of writing.
163 | **Note that this does not need to be an SNV call file for your sample, specifically**; just one
164 | which has positions, reference/alternate alleles, and the `ID` field populated.
165 |
166 | ##### REGARDING OUTPUT
167 |
168 | If you want to output a full call report, you can use the `--json output-file.json` argument to
169 | specify a path to output a more detailed JSON document to. This document contains 99% CIs, peak
170 | labels, and some other information that isn't included in the normal TSV file. If you want this
171 | file to be indented and human-readable, use the `--indent-json` flag in addition to `--json ...`.
172 |
173 | If you want to output a VCF file (STRs and SNVs if called; currently not phased), use the
174 | `--vcf ...` argument. If you pass `--vcf stdout`, the VCF will be written to `stdout` instead of a
175 | file.
176 |
177 | For more information, see also documentation on the [Output formats](./docs/output_formats.md).
178 |
179 | ##### REGARDING REFERENCE GENOMES
180 |
181 | The reference genome provided must be BGZipped and indexed using `samtools faidx`:
182 |
183 | ```bash
184 | # Starting from a .fa:
185 | bgzip my-reference.fa # Replaces .fa with a .fa.gz file
186 | samtools faidx my-reference.fa.gz # Generates a .fai index file
187 | ```
188 |
189 | ##### OTHER PARAMETERS
190 |
191 | See the '[Caller catalog format & choosing a catalog](./docs/caller_catalog.md)' page for more on
192 | how to format a locus catalog or choose from existing available catalogs.
193 |
194 |
195 | #### Further documentation on the STRkit caller, including output format:
196 |
197 | * [Advanced caller usage and configuration](./docs/caller_usage.md)
198 | * [Caller catalog format & choosing a catalog](./docs/caller_catalog.md)
199 | * [Output formats](./docs/output_formats.md)
200 |
201 |
202 | ### `strkit visualize`: Call visualizer
203 |
204 | STRkit bundles a call visualization tool which takes as input a BAM file and
205 | a JSON call file from using the `--json` flag with `strkit call`.
206 |
207 | It starts a web server on your local machine; the visualizations can be
208 | interacted with in a web browser.
209 |
210 | To use the tool, run the following command:
211 |
212 | ```bash
213 | strkit visualize path/to/my-alignment.bam \
214 | --ref hg38 \ # or hg19
215 | --json path/to/my-calls.json \
216 | -i 1 # 1-indexed offset in JSON file for locus of interest. Default is 1 if left out.
217 | ```
218 |
219 | This will output something like the following:
220 |
221 | ```
222 | * Serving Flask app 'strkit.viz.server' (lazy loading)
223 | * Environment: production
224 | WARNING: This is a development server. Do not use it in a production deployment.
225 | Use a production WSGI server instead.
226 | * Debug mode: on
227 | * Running on http://localhost:5011 (Press CTRL+C to quit)
228 | ...
229 | ```
230 |
231 | You can then go to the URL listed, `http://localhost:5011`, on your local machine
232 | to see the visualization tool:
233 |
234 | ![STRkit browser histogram](./docs/images/browser_hist.png)
235 | *STRkit browser histogram, showing an expansion in the HTT gene.*
236 |
237 | ![The same expansion shown in the igv.js browser](./docs/images/browser_igv.png)
238 | *The same expansion, shown in the igv.js browser. Note the insertions on
239 | the left-hand side in most reads, and the heterozygous copy number pattern.*
240 |
241 | To exit the tool, press `Ctrl-C` in your command line window as mentioned in
242 | the start-up instructions.
243 |
244 |
245 |
246 | ### `strkit mi`: Mendelian inheritance analysis
247 |
248 | Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered
249 | by looking at inheritance patterns. This tool provides a few different ways to do this, via:
250 |
251 | * Mendelian inheritance % (MI) calculations for many common TR genotyping tools for both long/short reads,
252 | including support for genotyping methods which report confidence intervals.
253 | * Reports of loci (potentially of interest) which do not respect MI.
254 |
255 | #### Usage
256 |
257 | For a basic JSON report on Mendelian inheritance with a trio of STRkit VCFs (compressed and indexed with BGZip), use
258 | something like the following command:
259 |
260 | ```bash
261 | # In addition to summary figures on Mendelian inheritance, this tool outputs loci which do not respect MI, which may be
262 | # useful as candidate de novo mutations. The --mismatch-out-mi flag controls which form of MI metric is used for
263 | # deciding which loci to output. Options for this flag are:
264 | # strict (strict copy number MI),
265 | # pm1 (copy number MI ± 1 repeat unit),
266 | # ci_95 (copy number 95% confidence interval),
267 | # ci_99 (copy number 99% confidence interval),
268 | # seq ([allele] sequence MI),
269 | # sl ([allele] sequence length MI),
270 | # sl_pm1 ([allele] sequence length MI ± 1 base pair)
271 | strkit mi \
272 | --caller strkit-vcf \
273 | --json mi-report.json \
274 | --mismatch-out-mi seq \
275 | child-calls.vcf.gz \
276 | mother-calls.vcf.gz \
277 | father-calls.vcf.gz
278 | # This will also output a TSV report to stdout. If this is not desired, use --no-tsv to suppress TSV output.
279 | ```
280 |
281 | For other options and what they do, run `strkit mi` (with no other arguments) or `strkit mi --help`.
282 |
283 | #### Further documentation
284 |
285 | **For more information on what kind of analyses can be done with this data**, see the
286 | [Trio analyses with STRkit](./docs/trio_analyses.md) page.
287 |
288 |
289 | ### `strkit convert`: STR catalog conversion
290 |
291 | STRkit takes as input a four-or-more-column BED file, structured like:
292 |
293 | ```
294 | contig start end [0 or more extraneous columns] motif
295 | ```
296 |
297 | Any extraneous columns are removed (internally), leaving a four-column STR locus representation.
298 | Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a four-column STR
299 | BED as locus catalog input. However, other formats representing a catalog of STRs exist:
300 |
301 | * [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a TSV/BED with a lot
302 | of information. This can be used as-is with STRkit, but for use with other tools it is safer to convert it to
303 | a four-column BED format.
304 | * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md),
305 | which can specify more advanced STR structures.
306 |
307 | #### Usage
308 |
309 | The `strkit convert` sub-command requires an input format (`trf` or `trgt`), an output format
310 | (many, see `strkit convert --help`), and an input file. Output is written to `stdout`.
311 |
312 | *Note:* Not all input/output format pairs have available converter functions; an error will be
313 | printed to `stderr` if one does not exist.
314 |
315 | For example, to convert from a TRF BED to a TRGT repeat definition BED file:
316 |
317 | ```bash
318 | strkit convert --in-format trf --out-format trgt in_file.trf.bed > out_file.bed
319 | ```
320 |
321 | To attempt a conversion from a TRGT repeat definition file to a STRkit/four-column motif BED:
322 |
323 | ```bash
324 | strkit convert --in-format trgt --out-format strkit in_file.trgt.bed > out_file.bed
325 | ```
326 |
327 | Note that TRGT can represent STRs with complex structure that STRkit cannot, so some of these loci
328 | may not be converted (these will be logged to `stderr`).
329 |
330 |
331 | ## Copyright and License
332 |
333 | * 2021-2023 (versions up to and including `0.8.0a1`): © David Lougheed (DL) and McGill University,
334 | created during graduate research by DL.
335 | * 2023+ (versions beyond `0.8.0a1`):
336 | * Portions © DL and McGill University 2021-2023
337 | * Portions © McGill University 2024-2025
338 | * Portions © DL 2024-2025
339 |
340 |
341 | ### Notice
342 |
343 | This program is free software: you can redistribute it and/or modify
344 | it under the terms of the GNU General Public License as published by
345 | the Free Software Foundation, either version 3 of the License, or
346 | (at your option) any later version.
347 |
348 | This program is distributed in the hope that it will be useful,
349 | but WITHOUT ANY WARRANTY; without even the implied warranty of
350 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
351 | GNU General Public License for more details.
352 |
353 | You should have received a copy of the GNU General Public License
354 | along with this program. If not, see <https://www.gnu.org/licenses/>.
355 |
356 | ### Exceptions
357 |
358 | **Some exclusions to this license apply; specifically portions of
359 | [`strkit/viz/templates/browser.html`](strkit/viz/templates/browser.html) and
360 | the STRkit logo files ([./docs/images/strkit_logo_small.png](./docs/images/strkit_logo_small.png)
361 | and [./strkit/viz/static/logo.png](./strkit/viz/static/logo.png)).**
362 |
363 | The STRkit logo is © David Lougheed 2022, and was designed by Evelyn Lougheed. It is not licensed
364 | under the terms of the GPL 3.0; it is instead licensed under the terms of the
365 | [CC BY-ND 4.0](https://creativecommons.org/licenses/by-nd/4.0/).
366 |
367 | Portions of `viz/templates/browser.html` copyright (C) 2021-2022 Observable, Inc.
368 | Used under the terms of the ISC license.
369 |
--------------------------------------------------------------------------------
/catalogs/pathogenic_assoc.hg38.tsv:
--------------------------------------------------------------------------------
1 | # Citations mostly obtained from Gall-Duncan et al. 2022 . . . . . . . .
2 | # contig start end disease inheritance gene citation pathogenic form notes motif
3 | chr1 57367043 57367125 SCA37 AD DAB1 Seixas et al. 2017 RAAAT
4 | chr1 94418421 94418442 OPDM AD ABCD3 Cortese et al. 2024 CN>=118 GCC
5 | chr1 149390802 149390842 NIID;EssentialTremor;ALS;OPDM3 AD;Assoc;Assoc;Familial NOTCH2NLC Tian et al. 2019;Sun et al. 2020;Yuan et al. 2020;Yu et al. 2021 GGC
6 | chr2 96197066 96197124 BAFME2 AD STARD7 Corbett et al. 2019 AAAWK
7 | chr2 100104798 100104824 Developmental anomalies PossibleAssoc AFF3 Metsu et al. 2014 GCC
8 | chr2 176093058 176093099 SPD1 AD HOXD13 Gong et al. 2011 CN>=22 GCN
9 | chr2 190880872 190880920 GD AR GLS van Kuilenburg et al. 2019 GCA
10 | chr3 63912684 63912715 SCA7 Familial ATXN7 Stevanin et al. 1998 CN>=37 GCA
11 | chr3 129172576 129172733 DM2 AD CNBP Liquori et al. 2001 CASR
12 | chr3 183712176 183712226 BAFME4 AD YEATS2 Yeetong et al. 2019 ATTTY
13 | chr4 3074876 3074940 HD AD HTT HDCRG 1993 CN>=36 CAG
14 | chr4 39348424 39348479 CANVAS AR RFC1 Cortese et al. 2019&Rafehi et al. 2019 AARRG
15 | chr4 41745975 41746022 CCHS Familial PHOX2B Amiel et al. 2003 GCC
16 | chr5 10356338 10356411 BAFME3 AD MARCHF6 Florian et al. 2019 TTTYA
17 | chr5 146878727 146878759 SCA12 AD PPP2R2B Holmes et al. 1999 GCT
18 | chr6 16327633 16327724 SCA1;ALS AD;Assoc ATXN1 Orr et al. 1993;Lattante et al. 2018,Tazelaar et al. 2020 TGC
19 | chr6 45422749 45422794 CCD AD RUNX2 Mundlos et al. 1997 GGC
20 | chr6 170561906 170562017 SCA17 AD TBP Koide et al. 1999 CN>=43 GCA
21 | chr7 27199679 27199732 HFGS AD HOXA13 Utsch et al. 2002 GCN
22 | chr8 118366812 118366918 BAFME1 AD SAMD12 Ishiura et al. 2018 AARTA
23 | chr9 27573484 27573546 ALS/FTD AD C9orf72 Renton et al. 2011&DeJesus-Hernandez et al. 2011 GCCCCG
24 | chr10 79826380 79826404 OPDM Assoc NUTM2B-AS1 Gu et al. 2024 CGG
25 | chr11 119206289 119206323 JS Assoc CBL2 Michaelis et al. 1998 CGG
26 | chr12 6936716 6936775 DRPLA AD ATN1 Nagafuchi et al. 1994,Koide et al. 1994,Chaudhry et al. 2021 CAG
27 | chr12 50505001 50505024 ID;LGS Assoc;Assoc DIP2B Winnepenninckx et al. 2007;Qaiser et al. 2021 GGC
28 | chr13 70139351 70139429 SCA8;LGS AD;Assoc ATXN8OS Koob et al. 1999;Qaiser et al. 2021 TRC
29 | chr13 99985448 99985494 HPE5 AD ZIC2 Brown et al. 1998 GCG
30 | chr14 23321464 23321543 OPMD AD PABP2 Brais et al. 1998 GCG
31 | chr15 22786671 22786703 ALS Assoc NIPA1 Blauw et al. 2012 GCG
32 | chr16 17470920 17470921 BSS AR XYLT1 LaCroix et al. 2019 GGC
33 | chr16 24613438 24613532 BAFME6 AD TNRC6A Ishiura et al. 2018 ATTTY
34 | chr16 66490398 66490466 SCA31 AD ENSG00000260851 Sato et al. 2009 TRRAA
35 | chr18 55586153 55586229 FECD Assoc TCF4 Wieben et al. 2012 AGC
36 | chr19 13207858 13207898 SCA6 AD CACNA1 Zhuchenko et al. 1997 CTG
37 | chr19 14496041 14496085 OPDM2 Familial GIPC1 Deng et al. 2020 CCG
38 | chr19 18786027 18786050 PSACH AD COMP Deere et al. 1999 CGT
39 | chr19 45770204 45770266 DM1 AD DMPK Many CAG
40 | chr20 2652732 2652775 SCA36 AD NOP56 Kobayashi et al. 2011 GGGCCT
41 | chr21 43776442 43776479 EPM1 AR CSTB Lalioti et al. 1998 GCGCGGGGCGGG
42 | chr22 45795354 45795424 SCA10 AD SCA10 Matsuura et al. 2000,Matsuura et al. 2006 CN>=280 Variable penetrance in intermediate range ~280-800 ATTCT
43 | chrX 67545316 67545419 SBMA X-linked AR La Spada et al. 1991,Fratta et al. 2014 CN>=38 GCA
44 | chrX 71453054 71453129 XDP X-linked TAF1 Bragg et al. 2017 GAGAGG
45 | chrX 147912036 147912111 FXS;FXPOI;FXTAS X-linked FMR1 Many CN>=200;55<=CN<200;55<=CN<200 GGC
46 | chrX 148500604 148500753 FRAXE X-linked AFF2 Knight et al. 1993,Gu et al. 1996 CN>=200 GCC
47 |
--------------------------------------------------------------------------------
/docs/caller_catalog.md:
--------------------------------------------------------------------------------
1 | # Caller catalog format & choosing a catalog
2 |
3 | ## Caller catalog format
4 |
5 | For the `--loci` argument, `strkit call` takes a list of loci in a modified BED / TSV format,
6 | similar to methods like Straglr/Tandem-genotypes/GangSTR.
7 |
8 | The file must be structured with a row per locus, where each row looks like:
9 |
10 | ```
11 | chr# 10000 10101 [...] AC
12 | ```
13 |
14 | The important requirements here are:
15 |
16 | * The fields are tab-separated
17 | * The rows are sorted by contig, and then by starting position
18 | * Locus coordinates are 0-based and half-open (start is inclusive, end is exclusive)
19 | * The locus motif must come **last** in the row, but *any number of fields* can separate
20 | the end position and the motif.
21 |
22 | As a result, STRkit can take myriad different TSV-type catalog formats as input, including
23 | those produced from the TRF UCSC browser track, or for GangSTR, or for Straglr.
24 |
25 | Here are a few notes on catalogs:
26 |
27 | * Coordinates are used to locate the STR locus in the reference genome, but may be slightly
28 | expanded to better encompass the entire locus.
29 | * Be wary of using Tandem Repeats Finder output directly as a catalog, as it can output multiple
30 | rows for the same locus, or define motifs in a "compound" fashion, e.g., `ATATAT` instead of `AT`.
31 | * Some disease expansions can contain multiple different motifs,
32 | which may be not present in the reference genome at all (for example:
33 | [CANVAS](https://pubmed.ncbi.nlm.nih.gov/31230722/), [BAFME2](https://www.nature.com/articles/s41467-019-12671-y)).
34 | As such, we provide a mechanism to specify motifs using any
35 | [IUPAC code](https://www.bioinformatics.org/sms/iupac.html).
36 | Thus, the CANVAS and BAFME2 motifs can be represented as `AARRG` and `AAAWK`, respectively (see the
37 | expansion sketch following this list). We also add a non-IUPAC code, `X`, which behaves like `N` in that it
38 | represents any base, but unlike `N` (which rewards a match with `+2`), it neither rewards nor penalizes a match,
39 | while gaps are still penalized. We use it internally to represent low-confidence base calls.
40 | * Related to the above, this can be important for diseases such as SCA37, where the motif composition
41 | (rather than the actual copy number) is associated with disease
42 | ([Seixas *et al.* 2017](https://doi.org/10.1016%2Fj.ajhg.2017.06.007)). Here, STRkit's motif-sized k-mer counting
43 | function can be used during calling with the `--count-kmers` flag. See the
44 | [advanced usage](https://github.com/davidlougheed/strkit/blob/master/docs/caller_usage.md#all-optional-flags) page
45 | for more.
46 |
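For illustration, here is a small sketch (not part of STRkit itself) of how a degenerate IUPAC motif from a catalog
expands into the concrete motifs it matches; STRkit's own IUPAC table lives in `strkit/iupac.py`:

```python
from itertools import product

# Standard IUPAC nucleotide codes mapped to the concrete bases they represent.
IUPAC_CODES = {
    "A": "A", "C": "C", "G": "G", "T": "T",
    "R": "AG", "Y": "CT", "S": "CG", "W": "AT", "K": "GT", "M": "AC",
    "B": "CGT", "D": "AGT", "H": "ACT", "V": "ACG", "N": "ACGT",
}


def expand_motif(motif: str) -> list[str]:
    """Expand a motif containing IUPAC degenerate bases into every concrete motif it matches."""
    return ["".join(bases) for bases in product(*(IUPAC_CODES[b] for b in motif.upper()))]


# The CANVAS motif from above:
print(expand_motif("AARRG"))  # ['AAAAG', 'AAAGG', 'AAGAG', 'AAGGG']
```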
47 |
48 | ## Choosing an existing catalog
49 |
50 | Other researchers have done extensive work in identifying and cataloguing loci for genotyping:
51 |
52 | * The Tandem Repeats Finder track for the UCSC browser, available as a
53 | [downloadable BED file](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.trf.bed.gz),
54 | with the caveat that this file includes **overlapping entries**, and TRs may not always be represented in
55 | their most 'essential' form (e.g., using the motif `TATATATA` instead of just `TA`). Thus, some work may be
56 | required to create a desirable locus catalog.
57 | * The researchers behind the [GangSTR](https://github.com/gymreklab/GangSTR) short-read STR genotyping method
58 | have prepared [several extensive STR catalogs](https://github.com/gymreklab/GangSTR#gangstr-reference-files)
59 | for different human reference genomes, containing motifs up to 20bp in length. However, **these files use
60 | 1-based closed-interval coordinates**, and should be adjusted (subtracting 1 from all start coordinates) to
61 | transform them into the 0-based half-open interval coordinates expected by STRkit (see the conversion sketch below).
62 | * We have prepared a [catalog of disease-causing or disease-associated loci](../catalogs/pathogenic_assoc.hg38.tsv)
63 | for the `hg38` reference genome, partially based on the review research done by Gall-Duncan *et al.* (2022), as well
64 | as entries from the [STRipy database](https://stripy.org/database)
65 | (DOI: [10.1002/humu.24382](https://doi.org/10.1002/humu.24382)) and our own reading of other articles.
66 |
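As referenced above, a minimal conversion sketch (not part of STRkit; it assumes the common five-column GangSTR
reference layout of contig, 1-based start, end, motif length, motif) might look like:

```python
import sys


def gangstr_to_strkit_bed(in_path: str, out_path: str) -> None:
    with open(in_path) as fin, open(out_path, "w") as fout:
        for line in fin:
            if not line.strip() or line.startswith("#"):
                continue  # skip blank lines and comments
            contig, start, end, _motif_len, motif = line.rstrip("\n").split("\t")[:5]
            # 1-based closed [start, end]  ->  0-based half-open [start - 1, end)
            fout.write(f"{contig}\t{int(start) - 1}\t{end}\t{motif}\n")


if __name__ == "__main__":
    gangstr_to_strkit_bed(sys.argv[1], sys.argv[2])
```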
--------------------------------------------------------------------------------
/docs/caller_usage.md:
--------------------------------------------------------------------------------
1 | # Advanced caller usage and configuration
2 |
3 |
4 | ## All optional flags
5 |
6 | * `--sample-id example_sample`: Set a sample ID, or override the alignment file sample ID. This will be included in JSON
7 | output, but not TSV output.
8 | * `--min-reads ##`: Minimum number of supporting reads needed to make a call. **Default:** 4
9 | * `--min-allele-reads ##`: Minimum number of supporting reads needed to call a specific allele size.
10 | **Default:** 2
11 | * `--max-reads ##`: Maximum number of supporting reads to use for calling a locus. **Default:** 250
12 | * `--min-avg-phred ##`: Minimum average PHRED score for relevant bases (flanking region + tandem repeat).
13 | Read segments with average PHRED scores below this (common with a threshold of ~13 and ONT Ultra Long reads,
14 | for example) will be skipped. **Default:** 13
15 | * `--min-read-align-score #.#`: Minimum normalized read alignment score (fractional; `0.0` to `1.0`) needed to include a
16 | read in a call. A good value for pure tandem repeats is 0.9. A good value for much more lenient genotyping is anywhere
17 | from 0.0-0.4. **Default:** 0.9
18 | * `--max-rcn-iters ##`: Maximum number of read copy-number counting iterations to perform. Loci which require a lot of
19 | iterations are probably impure tandem repeats, for which the resulting copy number will not be very accurate anyway.
20 | **Default:** 50
21 | * `--flank-size ##`: Size of the flanking region to use on either side of a region to properly anchor reads.
22 | **Default:** 70
23 | * `--realign` or `-a`: Whether to perform local re-alignment to attempt recovery of soft-clipped reads. Some aligners
24 | may soft-clip around large insertions, e.g. with an expansion (I've noticed this with *pbmm2*/*minimap2*).
25 | Currently recommended **for HiFi or ONT R10 only**, since this step aggressively filters out realignments with
26 | many mismatches or small indels. Enabling this slows down calling, so it may not be suitable for a very large catalog
27 | of tandem repeats.
28 | * `--hq`: Whether to treat provided reads as "high quality", i.e., fairly close to the actual true sequence. Used when
29 | detecting expansions, to skip a smoothing filter that may ignore disparate, rare expansion-like read counts.
30 | Use for CCS reads or similar data (e.g., R10 nanopore data) ONLY! **Default:** off
31 | * `--use-hp`: Whether to incorporate `HP` tags from a haplotagged alignment file. This should speed up runtime and
32 | will potentially improve calling results. **This flag is experimental, and has not been tested extensively.**
33 | * `--skip-supplementary` or `--skip-supp`: Skip supplementary alignments. **Default:** off
34 | * `--skip-secondary` or `--skip-sec`: Skip secondary alignments. **Default:** off
35 | * `--incorporate-snvs [path]` or `--snv [path]`: A path to a VCF with SNVs to incorporate into the calling process and
36 | final output. This file is just used as an SNV loci catalog; STRkit itself will perform the SNV calling. Empirically
37 | improves calling quality a small amount, speeds up runtime, and gives nearby SNV calls for downstream analysis.
38 | You can find dbSNP VCFs at
39 | [`https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/`](https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/).
40 | The file for GRCh38 is called `00-common_all.vcf.gz` as of time of writing.
41 | * `--snv-min-base-qual [int]` or `--min-sbq [int]`: Minimum PHRED quality score for bases of SNVs to use for phasing.
42 | **Default:** 20
43 | * `--targeted` or `-t`: Turn on targeted genotyping mode, which re-weights longer reads differently. Use this option if
44 | the alignment file contains targeted reads that do not reflect normal mapping patterns, e.g. from PacBio No-Amp
45 | Targeted Sequencing. **Default:** off
46 | * `--respect-ref` or `-e`: Turn off reference TR region 'coordinate extension' from what is specified in the catalog.
47 | TR boundaries can be blurry, so by default we give STRkit an opportunity to extend the provided region to improve
48 | mapped indel capturing and to be consistent with the approach we use to count repeat copies in non-reference samples.
49 | Turning this off should give results closer to other STR callers, at the cost of potentially missing variation.
50 | * `--count-kmers` or `-k`: Turn on motif-sized k-mer counting at the allele level, with `-k peak`, or at the read
51 | level, with `-k read`, or both with `-k both`. If the flag is provided with no value, it will default to `peak`.
52 | Note that k-mer counts will only be reported if a `--json` path is specified. This feature can be used to detect
53 | motif composition differences between alleles or samples. **Default:** `none`
54 | * `--consensus` or `-c`: Turn on consensus calculation for alleles. This adds runtime, but gives a better idea of STR
55 | structure and is useful for comparing alleles beyond copy number. If `--vcf` is set, this option is forced on.
56 | **Default:** off
57 | * `--vcf-anchor-size`: Number of bases upstream (5') of the tandem repeat to include in the VCF output. This can include
58 | small indels, and having a size above `1` may be beneficial or detrimental depending on the use case, but is useful for
59 | benchmarking and in case of slight misalignment. This is clamped to being in the range of `[1, flank_size]`.
60 | **Default:** 5
61 | * `--num-bootstrap ###` or `-b`: How many bootstrap re-samplings to perform. **Default:** 100
62 | * `--sex-chr ??` or `-x`: Sex chromosome configuration. **Without this, loci in sex chromosomes will not be genotyped.**
63 | Can be any configuration of Xs and Ys; only count matters. **Default:** *none*
64 | * `--json [path]` or `-j`: Path to output JSON call data to. JSON call data is more detailed than the `stdout` TSV
65 | output. If the value passed is `stdout`, the JSON data will be written to `stdout` instead of a file.
66 | **Default:** *none*
67 | * `--indent-json` or `-i`: If passed alongside `--json [x]`, the JSON output will be indented to be more human-readable
68 | but less compact. **Default:** off
69 | * `--vcf [path]`: Path to output VCF-formatted call data to. Setting this option forces the `--consensus` option as
70 | well in order to output true REF/ALT values, which slows down runtime somewhat. If the value passed is `stdout`, the
71 | VCF data will be written to `stdout` instead of a file. If a `.vcf.gz` path is specified, a bgzipped file will be
72 | written automatically. **Default:** *none*
73 | * `--no-tsv`: Suppresses TSV output to `stdout`. Without `--json` or `--vcf`, no output will be generated, which isn't
74 | very helpful. **Default:** TSV output on
75 | * `--seed`: Seed the random number generator used for all random sampling, Gaussian mixture modeling, etc.
76 | Useful for replicability.
77 | * `--log-level [level]`: Log level. Value must be one of `error`, `warning`, `info`, or `debug`. Be careful with the
78 | `debug` log level, as it can produce gigabytes of logs for a large run. **Default:** `info`.
79 |
80 |
81 | ## Usage on HPC machines
82 |
83 | We have tested STRkit on three different clusters associated with the
84 | Digital Research Alliance of Canada (formerly Compute Canada).
85 |
86 | Usage is pretty straightforward; for our use cases we set up a Python virtual environment
87 | with the `strkit` package installed, and ran a SLURM batch job which looks something like:
88 |
89 | ```bash
90 | #!/bin/bash
91 | #SBATCH --mem=16G
92 | #SBATCH --ntasks=1
93 | #SBATCH --cpus-per-task=10
94 | #SBATCH --time=1-00
95 | #SBATCH --account=rrg-xxxxx
96 |
97 |
98 | module load StdEnv/2023
99 | module load python/3.11 scipy-stack/2025a parasail/2.6.2
100 |
101 | cd /home/xxxxx || exit
102 | source env/bin/activate
103 |
104 | strkit call \
105 | --loci /path/to/catalog \
106 | --ref /path/to/ref.fa.gz \
107 | --processes 10 \
108 | --seed 342 \
109 | --vcf sample.vcf \
110 | --no-tsv \
111 | path/to/sample.bam
112 |
113 | deactivate
114 |
115 | ```
116 |
--------------------------------------------------------------------------------
/docs/images/browser_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/browser_hist.png
--------------------------------------------------------------------------------
/docs/images/browser_igv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/browser_igv.png
--------------------------------------------------------------------------------
/docs/images/call_method_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/call_method_flow.png
--------------------------------------------------------------------------------
/docs/images/strkit_logo_open_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/strkit_logo_open_graph.png
--------------------------------------------------------------------------------
/docs/images/strkit_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/strkit_logo_small.png
--------------------------------------------------------------------------------
/docs/output_formats.md:
--------------------------------------------------------------------------------
1 | # STRkit output formats
2 |
3 | STRkit can output three different file formats, depending on the set of arguments used:
4 |
5 | * [TSV](#tsv-standard-output): by default, printed to `stdout` when STRkit is run. Good as an overview, but less
6 | informative/interoperable than other formats.
7 | * [JSON](#json-report): a JSON report, containing the maximum amount of information possible. These files can be quite
8 | large, especially if formatted to be human-readable and indented with the `--indent-json` flag.
9 | * [VCF](#vcf): a [VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) file, with STR and SNV genotypes, including
10 | consensus STR sequences.
11 |
12 | **Note:** In general, the JSON format contains the most information about how STRkit was run, and each locus' called
13 | genotype.
14 |
15 |
16 | ## TSV (standard output)
17 |
18 | A tab-separated text file with the following columns:
19 |
20 | * Chromosome
21 | * Starting position (matching input BED file; real coordinates of region may be different if
22 | `--respect-ref` is not used)
23 | * Ending position (matching input BED file; real coordinates of region may be different if
24 | `--respect-ref` is not used)
25 | * Motif sequence (matching input BED file)
26 | * Reference copy number
27 | * Comma-delimited list of copy numbers for all reads successfully extracted for this locus.
28 | * Copy number call, `|`-delimited (one call per allele)
29 | * 95% confidence intervals for copy number calls, `|`-delimited (one `X-Y` 95% CI per allele)
30 | * Calling approach used by STRkit: one of:
31 | * `dist` - clustering based on a copy number distance metric
32 | * `snv+dist` - clustering based on a copy number + nearby SNV genotype difference distance metric
33 | * `snv` - clustering solely based on nearby SNV genotypes
34 |
35 | Here is an example line:
36 |
37 | ```
38 | chr4 5975495 5975530 TTTTG 7 6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8 6|7 6-6|7-7 snv
39 | ```
40 |
41 | Note that quite a bit of information is missing from the TSV, including per-sample copy numbers, read identities,
42 | SNV calls, and STR consensus sequences.
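
To make the column layout concrete, here is a minimal parsing sketch for one line of this TSV output (not part of
STRkit; the record type and field names are illustrative, and it assumes integer copy numbers as in the example above):

```python
from dataclasses import dataclass


@dataclass
class TsvCall:
    contig: str
    start: int
    end: int
    motif: str
    ref_cn: int
    read_cns: list[int]                  # per-read copy numbers
    call: list[int]                      # one copy number per allele
    call_95_cis: list[tuple[int, ...]]   # one (low, high) 95% CI per allele
    assign_method: str                   # dist / snv+dist / snv


def parse_tsv_line(line: str) -> TsvCall:
    contig, start, end, motif, ref_cn, reads, call, cis, method = line.rstrip("\n").split("\t")
    return TsvCall(
        contig,
        int(start),
        int(end),
        motif,
        int(ref_cn),
        [int(c) for c in reads.split(",")],
        [int(c) for c in call.split("|")],
        [tuple(int(v) for v in ci.split("-")) for ci in cis.split("|")],
        method,
    )
```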
43 |
44 |
45 | ## JSON report
46 |
47 | Example report format:
48 |
49 | ```javascript
50 | {
51 | "sample_id": "HG002",
52 | "caller": {
53 | "name": "strkit",
54 | "version": "0.15.0"
55 | },
56 | "parameters": {
57 | "read_files": "HG002.SequelII.ccs.phased.40x.chr4.bam",
58 | "reference_file": "/Users/davidlougheed/git/gt-poc/hg38.analysisSet.fa.gz",
59 | "min_reads": 4,
60 | "min_allele_reads": 2,
61 | "min_avg_phred": 13,
62 | "num_bootstrap": 100,
63 | "flank_size": 70,
64 | "sample_id": "HG002",
65 | "realign": true,
66 | "hq": true,
67 | "snv_vcf": "00-common_all.vcf.gz",
68 | "snv_min_base_qual": 20,
69 | "targeted": false,
70 | "respect_ref": false,
71 | "count_kmers": "none",
72 | "consensus": true,
73 | "log_level": 10,
74 | "seed": 1234,
75 | "processes": 1
76 | },
77 | "runtime": 8.628772,
78 | "contigs": [
79 | "chr4"
80 | ],
81 | "results": [
82 | {
83 | "locus_index": 1,
84 | "contig": "chr4",
85 | "start": 96617,
86 | "end": 96648,
87 | "start_adj": 96617,
88 | "end_adj": 96648,
89 | "motif": "AC",
90 | "ref_cn": 16,
91 | "ref_start_anchor": "t",
92 | "ref_seq": "acacacacacacacacacacacacacacaca",
93 | "reads": {
94 | "m64011_190901_095311/50792740/ccs": {
95 | "s": "-",
96 | "sc": 2.0,
97 | "cn": 15,
98 | "w": 1.0217145751733625,
99 | "snvu": ["G"],
100 | "p": 0
101 | },
102 | // ...
103 | "m64012_190921_234837/4523939/ccs": {
104 | "s": "+",
105 | "sc": 2.0,
106 | "cn": 15,
107 | "w": 1.0217145751733625,
108 | "snvu": ["A"],
109 | "p": 1
110 | },
111 | // ...
112 | },
113 | "snvs": [
114 | {
115 | "id": "rs73213545",
116 | "ref": "G",
117 | "pos": 94593,
118 | "call": ["G", "A"],
119 | "rcs": [20, 23]
120 | }
121 | ],
122 | "assign_method": "snv+dist",
123 | "call": [15, 15],
124 | "call_95_cis": [
125 | [15, 15],
126 | [15, 15]
127 | ],
128 | "call_99_cis": [
129 | [15, 15],
130 | [15, 15]
131 | ],
132 | "mean_model_align_score": 2.0,
133 | "peaks": {
134 | "means": [15, 15],
135 | "weights": [0.5, 0.5],
136 | "stdevs": [0.31622776601683794, 0.3585309239667531],
137 | "modal_n": 2,
138 | "n_reads": [20, 23],
139 | "seqs": [
140 | ["ACACACACACACACACACACACACACACA", "poa"],
141 | ["ACACACACACACACACACACACACACACA", "poa"]
142 | ]
143 | },
144 | "read_peaks_called": true,
145 | "time": 0.1274
146 | },
147 | // ...
148 | ]
149 | }
150 | ```
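
A minimal sketch (not part of STRkit) of reading a report like the one above, using only fields shown in the example:

```python
import json

# e.g., a report produced via `strkit call ... --json report.json`
with open("report.json") as fh:
    report = json.load(fh)

print(report["sample_id"], report["caller"]["name"], report["caller"]["version"])

for res in report["results"]:
    # Copy number call and 95% confidence intervals for each locus
    print(res["contig"], res["start"], res["end"], res["motif"], res["call"], res["call_95_cis"], sep="\t")
```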
151 |
152 |
153 | ## VCF
154 |
155 | VCF format fields (i.e., for each variant sample entry):
156 |
157 | * `AD`: Read depth for each allele
158 | * `CONS`: Consensus methods used for each alt (`single`/`poa`/`best_rep`)
159 | * `DP`: Total read depth
160 | * `DPS`: Total read depth; only supporting reads (for calls with incorporated SNVs mainly; STR calls only)
161 | * `GT`: Genotype
162 | * `MC`: Motif copy number for each allele (STR calls only)
163 | * `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only)
164 | * `MCRL`: Read-level copy number histogram for each allele. Allele entries are comma-delimited, and copy numbers within
165 | an allele's read-set are pipe (`|`)-delimited and formatted as `[copy number]x[reads]`. For example, for two alleles
166 | with 8 and 9 copy-number respectively, we may get `7x1|8x10|9x1,8x2|9x12` — the first allele has one 7-copy read, ten
167 | 8-copy reads, and one 9-copy read. The second allele has two 8-copy reads and twelve 9-copy reads (see the decoding sketch after this list).
168 | * `MMAS`: Mean model (candidate TR sequence) alignment score across reads, for this sample. This score, relative to the
169 | other loci's scores, represents how well a pure tandem repeat stretch with the catalogued motif and the determined
170 | copy number (e.g., `CAGCAGCAG`) aligns to the true sequence.
171 | * `PS`: Phase set
172 | * `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only)
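
As mentioned for `MCRL` above, a small decoding sketch (not part of STRkit) for turning an `MCRL` value into
per-allele copy number histograms:

```python
def parse_mcrl(value: str) -> list[dict[int, int]]:
    """Decode an MCRL sample field value into one {copy number: read count} dict per allele."""
    alleles = []
    for allele_entry in value.split(","):
        hist = {}
        for pair in allele_entry.split("|"):
            copy_number, n_reads = pair.split("x")
            hist[int(copy_number)] = int(n_reads)
        alleles.append(hist)
    return alleles


# The example value from above:
assert parse_mcrl("7x1|8x10|9x1,8x2|9x12") == [{7: 1, 8: 10, 9: 1}, {8: 2, 9: 12}]
```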
173 |
174 | VCF info. fields (i.e., for each STR variant record; not present for SNV records):
175 |
176 | * `VT`: Variant record type (`str` or `snv`)
177 | * `MOTIF`: Motif sequence
178 | * `REFMC`: Motif copy number in the reference genome
179 |
--------------------------------------------------------------------------------
/docs/trio_analyses.md:
--------------------------------------------------------------------------------
1 | # Trio analyses with STRkit
2 |
3 | Trio datasets include genomic sequence data for a child, their mother, and their father (the "trio"). These data
4 | can be used to discover de novo mutation (and incidental genotyping errors).
5 |
6 | STRkit includes a Mendelian inheritance (MI) analysis tool, under the sub-command `strkit mi`.
7 | After genotyping the trio with `strkit call`, this command can be used to discover loci which:
8 |
9 | 1. Do not respect exact MI
10 | 2. Do not respect MI allowing for a ±1 repeat unit difference
11 | (Note: most true STR mutations are single-repeat-unit changes, too!
12 | See [Ellegren, 2004](https://www.nature.com/articles/nrg1348).)
13 | 3. Do not respect MI under the 95% locus confidence intervals
14 | 4. Look like de novo mutation at a read count distribution level, via a Mann-Whitney *U* test (with tie correction).
15 | The alternative hypothesis can be specified as either two-sided or looking for expansion in the offspring.
16 | *The assumptions of this test are violated in cases of mosaicism.*
17 | 5. Look like de novo mutation at a read count distribution level, via a chi-squared independence test,
18 | where the contingency table looks like the following:
19 |
20 | | Read distribution \ Copy number | 11 | 12 | 13 |
21 | |---------------------------------|------|------|------|
22 | | Parent reads (best peak fit) | 20 | 10 | 0 |
23 | | Child reads | 2 | 20 | 10 |
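
As an illustration only (this is not how `strkit mi` is invoked), the same test applied to the example table above
with SciPy:

```python
from scipy.stats import chi2_contingency

# Rows: parent reads (best peak fit), child reads; columns: copy numbers 11, 12, 13
table = [
    [20, 10, 0],
    [2, 20, 10],
]
chi2, p_value, dof, expected = chi2_contingency(table)
print(f"chi2 = {chi2:.2f}, p = {p_value:.3g}, dof = {dof}")
```

A small p-value flags the locus as a candidate de novo mutation, subject to the multiple testing correction
mentioned below.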
24 |
25 |
26 | ## Trio-level
27 |
28 | At a trio level, the chi-squared test gives (optionally multiple testing-corrected) loci with a significant
29 | chance of containing a de novo mutation.
30 |
31 | ## Cohort-level
32 |
33 | At a cohort level, multiple downstream analyses are possible from a collection of trio mutation analyses,
34 | such as:
35 |
36 | 1. Case-control analysis looking for frequency of de novo mutations in specific loci
37 | 2. Case-control analysis looking at the incidence rate of de novo mutation
38 |
39 | Currently, tools to automatically perform these analyses are not available in STRkit.
40 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=57.4.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | blinker==1.7.0
2 | click==8.1.7
3 | coverage==7.8.0
4 | Cython==3.0.12
5 | exceptiongroup==1.2.0
6 | Flask==3.0.3
7 | importlib_metadata==7.1.0
8 | iniconfig==2.0.0
9 | itsdangerous==2.2.0
10 | Jinja2==3.1.4
11 | joblib==1.3.2
12 | MarkupSafe==2.1.5
13 | numpy==1.26.4
14 | orjson==3.10.16
15 | packaging==24.0
16 | pandas==2.2.3
17 | parasail==1.3.4
18 | patsy==0.5.6
19 | pluggy==1.4.0
20 | psutil==6.1.0
21 | pyparsing==3.1.2
22 | pysam==0.23.0
23 | pytest==7.4.4
24 | pytest-cov==4.1.0
25 | python-dateutil==2.8.2
26 | pytz==2025.2
27 | scikit-learn==1.4.2
28 | scipy==1.15.1
29 | six==1.16.0
30 | statsmodels==0.14.4
31 | strkit_rust_ext==0.20.2
32 | threadpoolctl==3.4.0
33 | tomli==2.0.1
34 | tzdata==2024.2
35 | Werkzeug==3.0.4
36 | zipp==3.20.2
37 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import setuptools
3 | from setuptools import setup
4 |
5 | with open("README.md", "r") as fh:
6 | long_description = fh.read()
7 |
8 | with open("./strkit/VERSION", "r") as vf:
9 | version = vf.read().strip()
10 |
11 | setup(
12 | name="strkit",
13 | version=version,
14 |
15 | python_requires="~=3.10",
16 | install_requires=[
17 | "Flask>=2.2.5,<3.1",
18 | "orjson>=3.9.15,<4",
19 | "pysam>=0.19,<0.24",
20 | "numpy>=1.23.4,<1.27",
21 | "parasail>=1.2.4,<1.4",
22 | "scikit-learn>=1.2.1,<1.6",
23 | "scipy>=1.10,<1.16",
24 | "statsmodels>=0.14.0,<0.15",
25 | "strkit_rust_ext==0.20.2",
26 | ],
27 |
28 | description="A toolkit for analyzing variation in short(ish) tandem repeats.",
29 | long_description=long_description,
30 | long_description_content_type="text/markdown",
31 |
32 | url="https://github.com/davidlougheed/strkit",
33 | license="GPLv3",
34 | classifiers=[
35 | "Programming Language :: Python :: 3.10",
36 | "Programming Language :: Python :: 3.11",
37 | "Programming Language :: Python :: 3.12",
38 | "Programming Language :: Python :: 3 :: Only",
39 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
40 | "Operating System :: POSIX",
41 | ],
42 |
43 | author="David Lougheed",
44 | author_email="david.lougheed@gmail.com",
45 |
46 | packages=setuptools.find_namespace_packages(),
47 | include_package_data=True,
48 |
49 | entry_points={
50 | "console_scripts": ["strkit=strkit.entry:main"],
51 | },
52 | )
53 |
--------------------------------------------------------------------------------
/strkit/VERSION:
--------------------------------------------------------------------------------
1 | 0.23.0-dev
2 |
--------------------------------------------------------------------------------
/strkit/__init__.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | __all__ = [
4 | "__version__",
5 | ]
6 |
7 | with open(Path(__file__).parent / "VERSION", "r") as vf:
8 | __version__ = vf.read().strip()
9 |
--------------------------------------------------------------------------------
/strkit/call/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .allele import call_alleles
4 | from .call_sample import call_sample
5 | from .params import CallParams
6 |
7 | __all__ = [
8 | "call_alleles",
9 | "call_sample",
10 | "CallParams",
11 | ]
12 |
--------------------------------------------------------------------------------
/strkit/call/align_matrix.py:
--------------------------------------------------------------------------------
1 | import parasail
2 | from ..iupac import IUPAC_NUCLEOTIDE_CODES
3 |
4 | __all__ = [
5 | "dna_codes",
6 | "match_score",
7 | "mismatch_penalty",
8 | "indel_penalty",
9 | "dna_bases",
10 | "dna_matrix",
11 | ]
12 |
13 |
14 | match_score: int = 2 # TODO: parametrize
15 | mismatch_penalty: int = 7 # TODO: parametrize
16 | indel_penalty: int = 5 # TODO: parametrize
17 |
18 |
19 | # TODO: Customize matrix based on error chances
20 | # Create a substitution matrix for alignment.
21 | # Include IUPAC wildcard bases to allow for motifs with multiple possible bases at a given position.
22 | # Include a wildcard base 'X' for very low-confidence base calls, to prevent needlessly harsh penalties - this is
23 | # inserted into a read in place of bases with low PHRED scores.
24 | dna_bases_str: str = "ACGT" + "".join(IUPAC_NUCLEOTIDE_CODES.keys()) + "X"
25 | dna_bases: dict[str, int] = {b: i for i, b in enumerate(dna_bases_str)}
26 | dna_codes: dict[str, tuple[str, ...]] = {
27 | **IUPAC_NUCLEOTIDE_CODES,
28 | "X": ("A", "C", "G", "T"), # Special character for matching low-quality bases
29 | }
30 | dna_matrix = parasail.matrix_create(dna_bases_str, match_score, -1 * mismatch_penalty)
31 |
32 | for code, code_matches in dna_codes.items():
33 | for cm in code_matches:
34 | dna_matrix[dna_bases[code], dna_bases[cm]] = 2 if code != "X" else 0
35 | dna_matrix[dna_bases[cm], dna_bases[code]] = 2 if code != "X" else 0
36 |
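A minimal sketch of the effect of the 'X' wildcard, using the same sg_dx_trace_scan_16 call and gap penalties that realign.py uses (the sequences below are made up for illustration): bases masked as 'X' score 0 against everything, so they neither count as matches nor incur the mismatch penalty.

import parasail
from strkit.call.align_matrix import dna_matrix, match_score

ref = "ACGTACGT"
read_hq = "ACGTACGT"  # all bases confidently called
read_lq = "ACXTACXT"  # two low-PHRED bases replaced with the 'X' wildcard

r_hq = parasail.sg_dx_trace_scan_16(ref, read_hq, 7, 0, dna_matrix)
r_lq = parasail.sg_dx_trace_scan_16(ref, read_lq, 7, 0, dna_matrix)

# The two masked positions each contribute 0 instead of match_score, so the score
# drops by 2 * match_score (16 -> 12 here) rather than by two full mismatch penalties.
print(r_hq.score, r_lq.score, 2 * match_score)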
--------------------------------------------------------------------------------
/strkit/call/allele.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | # Disable OpenMP/other multithreading since it adds enormous overhead when multiprocessing
4 | import os
5 | os.environ["OMP_NUM_THREADS"] = "1"
6 | os.environ["OPENBLAS_NUM_THREADS"] = "1"
7 | os.environ["MKL_NUM_THREADS"] = "1"
8 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
9 | os.environ["NUMEXPR_NUM_THREADS"] = "1"
10 |
11 | # ----------------------------------------------------------------------------------------------------------------------
12 |
13 | import logging # For type hinting
14 | import numpy as np
15 | import statistics
16 |
17 | from sklearn.exceptions import ConvergenceWarning
18 | from sklearn.mixture import GaussianMixture
19 | from sklearn.preprocessing import normalize
20 | from warnings import simplefilter
21 |
22 | from numpy.typing import NDArray
23 | from typing import Iterable, Literal, TypedDict, Union
24 |
25 | import strkit.constants as cc
26 |
27 | from .params import CallParams
28 | from .utils import get_new_seed
29 |
30 | __all__ = [
31 | "RepeatCounts",
32 | "CallDict",
33 | "get_n_alleles",
34 | "call_alleles",
35 | ]
36 |
37 | RepeatCounts = list[int] | tuple[int, ...] | NDArray[np.int_]
38 |
39 |
40 | # Ignore k-means/GMM convergence warnings - we expect some degenerate fits, e.g. with homozygous alleles
41 | simplefilter("ignore", category=ConvergenceWarning)
42 |
43 | # TODO: parameterize
44 | small_allele_min = 8
45 | expansion_ratio = 5
46 | N_GM_INIT = 3
47 |
48 | WEIGHT_1_0 = np.array([[1.0]])
49 | FLOAT_32_EPSILON = np.finfo(np.float32).eps
50 |
51 | CI_PERCENTILE_RANGES = {
52 | "95": (2.5, 97.5),
53 | "99": (0.5, 99.5),
54 | }
55 |
56 |
57 | def _array_as_int(n: NDArray[np.int_] | NDArray[np.float_]) -> NDArray[np.int32]:
58 | return np.rint(n).astype(np.int32)
59 |
60 |
61 | def _calculate_cis(samples, ci: Literal["95", "99"] = "95") -> NDArray[np.int32]:
62 | percentiles = np.percentile(
63 | samples, CI_PERCENTILE_RANGES[ci], axis=1, method="interpolated_inverted_cdf"
64 | ).transpose()
65 | return _array_as_int(percentiles)
66 |
67 |
68 | def get_n_alleles(default_n_alleles: int, sample_sex_chroms: str | None, contig: str) -> int | None:
69 | if contig in cc.M_CHROMOSOME_NAMES:
70 | return 1
71 |
72 | if contig in cc.SEX_CHROMOSOMES:
73 | if sample_sex_chroms is None:
74 | return None
75 | if contig in cc.X_CHROMOSOME_NAMES:
76 | return sample_sex_chroms.count("X")
77 | if contig in cc.Y_CHROMOSOME_NAMES:
78 | return sample_sex_chroms.count("Y")
79 |
80 | return default_n_alleles
81 |
82 |
83 | def na_length_list(n_alleles: int):
84 | return [list() for _ in range(n_alleles)]
85 |
86 |
87 | GMMInitParamsMethod = Literal["kmeans", "k-means++"]
88 |
89 |
90 | def make_fitted_gmm(n_components: int, sample_rs: NDArray, init_params: GMMInitParamsMethod, rng: np.random.Generator):
91 | return GaussianMixture(
92 | n_components=n_components,
93 | init_params=init_params,
94 | covariance_type="spherical",
95 | n_init=N_GM_INIT,
96 | random_state=get_new_seed(rng),
97 | ).fit(sample_rs)
98 |
99 |
100 | def fit_gmm(
101 | rng: np.random.Generator,
102 | sample: NDArray,
103 | n_alleles: int,
104 | allele_filter: float,
105 | hq: bool,
106 | gm_filter_factor: int,
107 | init_params: GMMInitParamsMethod = "k-means++", # TODO: parameterize outside
108 | ) -> object | None:
109 | sample_rs = sample.reshape(-1, 1)
110 | g: object | None = None
111 |
112 | n_components: int = n_alleles
113 | while n_components > 0:
114 | if n_components == 1: # Don't need to do the full fit for a single peak, just calculate the parameters
115 |             # I've confirmed this gives an ~identical result to fitting a GMM with one component.
116 | fake_g: object = type("", (), {})()
117 | fake_g.means_ = np.array([[np.mean(sample_rs)]])
118 | fake_g.weights_ = WEIGHT_1_0
119 | fake_g.covariances_ = np.array([[np.var(sample_rs)]])
120 | return fake_g
121 |
122 | g = make_fitted_gmm(n_components, sample_rs, init_params, rng)
123 |
124 | # noinspection PyUnresolvedReferences
125 | means_and_weights = np.append(g.means_.transpose(), g.weights_.reshape(1, -1), axis=0)
126 |
127 | # Filter out peaks that aren't supported by ~min_allele_reads reads by probability, with some delta to
128 | # allow for peaks supported by "most of a read".
129 | mw_filter_1 = means_and_weights[1, :] > allele_filter
130 |
131 | # Filter out any peaks below some threshold using this magic constant filter factor
132 | # - Exception: Large expansions can have very few supporting reads due to quirks of sequencing beyond
133 | # just chance/read length distribution; if we have 2 alleles and the large one is a lot bigger than
134 | # the small one, don't apply this filter
135 | # - Discard anything below a specific weight threshold and resample means based on remaining weights
136 | # to fill in the gap. E.g. below 1 / (5 * num alleles) - i.e. 5 times less than we expect with equal
137 | # sharing in the worst case where it represents just one allele
138 | if n_components > 2 or (n_components == 2 and (not hq or (
139 | means_and_weights[0, -1] < expansion_ratio * max(means_and_weights[0, 0], small_allele_min)))):
140 | mw_filter_2 = means_and_weights[1, :] > (1 / (gm_filter_factor * n_components))
141 | else:
142 | mw_filter_2 = means_and_weights[1, :] > FLOAT_32_EPSILON
143 |
144 | mw_filter = mw_filter_1 & mw_filter_2
145 | n_useless = np.size(mw_filter) - np.count_nonzero(mw_filter)
146 | if not n_useless:
147 | # No useless components left to remove, so return the GMM
148 | return g
149 | n_components -= n_useless
150 |
151 | return g
152 |
153 |
154 | class BaseCallDict(TypedDict):
155 | call: Union[NDArray[np.int32], NDArray[np.float_]]
156 | call_95_cis: Union[NDArray[np.int32], NDArray[np.float_]] # 2D arrays
157 | call_99_cis: Union[NDArray[np.int32], NDArray[np.float_]] # 2D arrays
158 | peaks: NDArray[np.float_]
159 | peak_weights: NDArray[np.float_]
160 | peak_stdevs: NDArray[np.float_]
161 | modal_n_peaks: int
162 |
163 |
164 | class CallDict(BaseCallDict, total=False):
165 | ps: int
166 |
167 |
168 | def make_read_weights(read_weights: Iterable[float] | None, num_reads: int) -> NDArray[np.float_]:
169 | return np.array(
170 | read_weights if read_weights is not None else np.array(([1/num_reads] * num_reads) if num_reads else []))
171 |
172 |
173 | def call_alleles(
174 | repeats_fwd: NDArray[np.int32],
175 | repeats_rev: NDArray[np.int32],
176 | read_weights_fwd: Iterable[float] | None,
177 | read_weights_rev: Iterable[float] | None,
178 | params: CallParams,
179 | min_reads: int,
180 | n_alleles: int,
181 | separate_strands: bool,
182 | read_bias_corr_min: int,
183 | gm_filter_factor: int,
184 | seed: int | None,
185 | logger_: logging.Logger,
186 | debug_str: str,
187 | ) -> CallDict | None:
188 | fwd_len = repeats_fwd.shape[0]
189 | rev_len = repeats_rev.shape[0]
190 |
191 | fwd_strand_weights = make_read_weights(read_weights_fwd, fwd_len)
192 | rev_strand_weights = make_read_weights(read_weights_rev, rev_len)
193 |
194 | assert repeats_fwd.shape == fwd_strand_weights.shape
195 | assert repeats_rev.shape == rev_strand_weights.shape
196 |
197 | combined_reads = np.concatenate((repeats_fwd, repeats_rev), axis=None)
198 | combined_weights = np.concatenate((fwd_strand_weights, rev_strand_weights), axis=None)
199 | combined_len = combined_reads.shape[-1]
200 |
201 | if combined_len < min_reads:
202 | return None
203 |
204 | # If the locus/allele only has one value, don't bother bootstrapping
205 | if np.unique(combined_reads).shape[0] == 1:
206 | logger_.debug(f"{debug_str} - skipping bootstrap / GMM fitting for allele(s) (single value)")
207 | cn = combined_reads[0]
208 |
209 | call = _array_as_int(np.full(n_alleles, cn))
210 | call_cis = _array_as_int(np.full((n_alleles, 2), cn))
211 |
212 | peaks: NDArray[np.float_] = call.astype(np.float_)
213 |
214 | return {
215 | "call": call,
216 | "call_95_cis": call_cis,
217 | "call_99_cis": call_cis,
218 | "peaks": peaks,
219 | "peak_weights": np.full(n_alleles, 1.0 / n_alleles),
220 | "peak_stdevs": np.full(n_alleles, 0.0),
221 | "modal_n_peaks": 1, # 1 peak, since we have 1 value
222 | }
223 |
224 | nal = na_length_list(n_alleles)
225 | allele_samples = np.array(nal, dtype=np.float32)
226 | allele_weight_samples = np.array(nal, dtype=np.float32)
227 | allele_stdev_samples = np.array(nal, dtype=np.float32)
228 | sample_peaks = np.array([], dtype=np.int32)
229 |
230 | rng: np.random.Generator = np.random.default_rng(seed=seed)
231 |
232 | # Perform a number of bootstrap iterations to get a 95% CI and more accurate estimate of repeat counts / differences
233 |
234 | if separate_strands and fwd_len >= read_bias_corr_min and rev_len >= read_bias_corr_min:
235 | target_length: int = max(fwd_len, rev_len)
236 |
237 | # Resample original sample, correcting for imbalances between
238 | # forward and reverse-strand reads along the way
239 | # (if we've passed the coverage threshold)
240 |
241 | fwd_strand_samples = rng.choice(
242 | repeats_fwd,
243 | size=(params.num_bootstrap, target_length),
244 | replace=True,
245 | p=fwd_strand_weights,
246 | )
247 |
248 | rev_strand_samples = rng.choice(
249 | repeats_rev,
250 | size=(params.num_bootstrap, target_length),
251 | replace=True,
252 | p=rev_strand_weights,
253 | )
254 |
255 | concat_samples = np.sort(
256 | np.concatenate((fwd_strand_samples, rev_strand_samples), axis=1),
257 | kind="stable")
258 |
259 | else:
260 | concat_samples = np.sort(
261 | rng.choice(
262 | combined_reads,
263 | size=(params.num_bootstrap, combined_len),
264 | replace=True,
265 | p=combined_weights,
266 | ) if params.num_bootstrap > 1 else np.array([combined_reads]),
267 | kind="stable")
268 |
269 | gmm_cache = {}
270 |
271 | def _get_fitted_gmm(s: NDArray[np.int_] | NDArray[np.float_]) -> object | None:
272 | if (s_t := s.tobytes()) not in gmm_cache:
273 | # Fit Gaussian mixture model to the resampled data
274 | gmm_cache[s_t] = fit_gmm(rng, s, n_alleles, allele_filter, params.hq, gm_filter_factor)
275 |
276 | return gmm_cache[s_t]
277 |
278 | # Filter out peaks that aren't supported by ~min_allele_reads reads by probability, with some delta to
279 | # allow for peaks supported by "most of a read".
280 | allele_filter = (params.min_allele_reads - 0.1) / concat_samples.shape[0]
281 |
282 | for i in range(params.num_bootstrap):
283 | sample = concat_samples[i, :]
284 |
285 | g: object | None = _get_fitted_gmm(sample)
286 | if not g:
287 | # Could not fit any Gaussian mixture; skip this allele
288 | return None
289 |
290 | # Keep track of how many alleles were found for
291 | # noinspection PyUnresolvedReferences
292 | sample_peaks = np.append(sample_peaks, g.means_.shape[0])
293 |
294 | # noinspection PyUnresolvedReferences
295 | means_and_weights = np.append(g.means_.transpose(), g.weights_.reshape(1, -1), axis=0)
296 |
297 | means = means_and_weights[0, :]
298 | weights = means_and_weights[1, :]
299 | # noinspection PyUnresolvedReferences
300 | stdevs = np.sqrt(g.covariances_)
301 | n_to_resample = n_alleles - means.shape[0]
302 |
303 | if n_to_resample:
304 | # Re-sample means if any are removed, based on weights (re-normalized), to match total # of alleles
305 | resampled_indices = rng.choice(
306 | np.arange(len(means)),
307 | size=n_to_resample,
308 | p=normalize(weights.reshape(1, -1), norm="l1").flatten())
309 | resampled_means = np.append(means, means[resampled_indices])
310 | resampled_weights = np.append(weights, weights[resampled_indices])
311 | resampled_stdevs = np.append(stdevs, stdevs[resampled_indices])
312 | else:
313 | resampled_means = means
314 | resampled_weights = weights
315 | resampled_stdevs = stdevs
316 |
317 | argsorted_means = np.argsort(resampled_means, axis=0, kind="stable")
318 | sorted_allele_estimates = resampled_means[argsorted_means].reshape(-1, 1)
319 | sorted_allele_weight_estimates = resampled_weights[argsorted_means].reshape(-1, 1)
320 | sorted_allele_stdev_estimates = resampled_stdevs[argsorted_means].reshape(-1, 1)
321 |
322 | allele_samples = np.append(allele_samples, sorted_allele_estimates, axis=1)
323 | allele_weight_samples = np.append(allele_weight_samples, sorted_allele_weight_estimates, axis=1)
324 | allele_stdev_samples = np.append(allele_stdev_samples, sorted_allele_stdev_estimates, axis=1)
325 |
326 | # Calculate 95% and 99% confidence intervals for each allele from the bootstrap distributions.
327 | allele_samples_argsort = allele_samples.argsort(axis=1, kind="stable")
328 | allele_samples = np.take_along_axis(allele_samples, allele_samples_argsort, axis=1)
329 | allele_cis_95 = _calculate_cis(allele_samples, ci="95")
330 | allele_cis_99 = _calculate_cis(allele_samples, ci="99")
331 | allele_weight_samples = np.take_along_axis(allele_weight_samples, allele_samples_argsort, axis=1)
332 | allele_stdev_samples = np.take_along_axis(allele_stdev_samples, allele_samples_argsort, axis=1)
333 |
334 | sample_peaks.sort(kind="stable") # To make mode consistent, given same set of peak #s
335 |
336 | # TODO: Calculate CIs based on Gaussians from allele samples instead? Ask someone...
337 | # - Could take median of 2.5 percentiles and 97.5 percentiles from Gaussians instead, median of means
338 |
339 | # Report the median estimates and the confidence intervals.
340 | # - we choose nearest for median rather than interpolating, so we can get real corresponding weights and stdevs.
341 |
342 |     median_idx = allele_samples.shape[1] // 2
343 | medians_of_means = allele_samples[:, median_idx]
344 | medians_of_means_final = np.rint(medians_of_means).astype(np.int32)
345 | peak_weights = allele_weight_samples[:, median_idx].flatten()
346 | peak_stdevs = allele_stdev_samples[:, median_idx]
347 | modal_n_peaks: int = statistics.mode(sample_peaks).item()
348 |
349 | peak_weights /= peak_weights.sum() # re-normalize weights
350 |
351 | return {
352 | "call": medians_of_means_final.flatten(),
353 | "call_95_cis": allele_cis_95,
354 | "call_99_cis": allele_cis_99,
355 |
356 | "peaks": medians_of_means.flatten(), # Don't round, so we can recover original Gaussian model
357 | "peak_weights": peak_weights,
358 | "peak_stdevs": peak_stdevs.flatten(),
359 | # TODO: should be ok to use this, because resample gets put at end, vertically (3rd allele in a 3-ploid case)
360 | # so taking the first 2 alleles still works in terms of stdev/mean estimates? I think?
361 | # Not quite, cause it's sorted...
362 | # --> Only do the peak assignment with 1/2 peaks, which is the majority of human situations
363 | "modal_n_peaks": modal_n_peaks,
364 | }
365 |
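As a small illustration of the ploidy logic above, a sketch of get_n_alleles; this assumes the usual "chr"-prefixed contig names appear in the chromosome-name sets defined in strkit.constants.

from strkit.call.allele import get_n_alleles

print(get_n_alleles(2, "XY", "chr4"))   # 2    - autosome: fall back to the default
print(get_n_alleles(2, "XY", "chrX"))   # 1    - one X copy in an XY sample
print(get_n_alleles(2, "XX", "chrX"))   # 2    - two X copies in an XX sample
print(get_n_alleles(2, None, "chrX"))   # None - sex chromosomes unspecified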
--------------------------------------------------------------------------------
/strkit/call/cigar.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy.typing import NDArray
3 |
4 | from strkit_rust_ext import get_aligned_pair_matches
5 |
6 | __all__ = [
7 | "decode_cigar_np",
8 | "get_aligned_pair_matches",
9 | ]
10 |
11 |
12 | def decode_cigar_np(encoded_cigar: NDArray[np.uint32]) -> NDArray[np.uint32]:
13 | return np.stack((np.bitwise_and(encoded_cigar, 15), np.right_shift(encoded_cigar, 4)), axis=1)
14 |
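A small sketch of what decode_cigar_np does with htslib-encoded CIGAR operations (each packed as length << 4 | op_code, with op codes such as M = 0 and S = 4); the hypothetical read below has CIGAR 5S10M.

import numpy as np
from strkit.call.cigar import decode_cigar_np

encoded = np.array([(5 << 4) | 4, (10 << 4) | 0], dtype=np.uint32)  # 5S, 10M
print(decode_cigar_np(encoded))
# [[ 4  5]
#  [ 0 10]]  -> rows of (op_code, length), matching pysam-style CIGAR tuples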
--------------------------------------------------------------------------------
/strkit/call/non_daemonic_pool.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 | import multiprocessing.pool  # ensure the mp.pool submodule is loaded before mp.pool.Pool is referenced below
3 | __all__ = [
4 | "NonDaemonicPool",
5 | ]
6 |
7 |
8 | # Need a pool which itself can spawn realignment processes - see https://stackoverflow.com/a/53180921
9 |
10 |
11 | class NonDaemonicProcess(mp.Process):
12 | @property
13 | def daemon(self):
14 | return False
15 |
16 | @daemon.setter
17 | def daemon(self, value):
18 | pass
19 |
20 |
21 | class NonDaemonicContext(type(mp.get_context())):
22 | Process = NonDaemonicProcess
23 |
24 |
25 | class NonDaemonicPool(mp.pool.Pool):
26 | # noinspection PyArgumentList
27 | def __init__(self, *args, **kwargs):
28 | kwargs["context"] = NonDaemonicContext()
29 | super().__init__(*args, **kwargs)
30 |
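A minimal sketch (toy functions, not part of STRkit) of the pattern this class enables: pool workers that themselves start processes, which standard daemonic Pool workers are not allowed to do.

import multiprocessing as mp
from strkit.call.non_daemonic_pool import NonDaemonicPool


def _child(x):
    return x * 2


def _worker(x):
    # With a regular Pool this raises "daemonic processes are not allowed to have children".
    p = mp.Process(target=_child, args=(x,))
    p.start()
    p.join()
    return x


if __name__ == "__main__":
    with NonDaemonicPool(2) as pool:
        print(pool.map(_worker, [1, 2, 3]))  # [1, 2, 3]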
--------------------------------------------------------------------------------
/strkit/call/output/__init__.py:
--------------------------------------------------------------------------------
1 | from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer
2 | from .tsv import output_tsv
3 | from .vcf import build_vcf_header, output_contig_vcf_lines
4 |
5 | __all__ = [
6 | "output_json_report_header",
7 | "output_json_report_results",
8 | "output_json_report_footer",
9 | "output_tsv",
10 | "build_vcf_header",
11 | "output_contig_vcf_lines",
12 | ]
13 |
--------------------------------------------------------------------------------
/strkit/call/output/json_report.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from typing import Callable, Literal
3 |
4 | from strkit import __version__
5 | from strkit.json import Serializable, dumps, dumps_indented
6 |
7 | from ..params import CallParams
8 | from ..types import LocusResult
9 |
10 | __all__ = [
11 | "output_json_report_header",
12 | "output_json_report_results",
13 | "output_json_report_footer",
14 | ]
15 |
16 |
17 | def _get_dfn(indent_json: bool) -> Callable[[Serializable], bytes]:
18 | return dumps_indented if indent_json else dumps
19 |
20 |
21 | def _write_bytes(b: bytes, json_path: str, mode: Literal["wb", "ab"]):
22 | if json_path == "stdout":
23 | sys.stdout.buffer.write(b)
24 | sys.stdout.flush()
25 | else:
26 | with open(json_path, mode) as jf:
27 | # noinspection PyTypeChecker
28 | jf.write(b)
29 |
30 |
31 | def output_json_report_header(params: CallParams, contig_set: set[str], json_path: str, indent_json: bool):
32 | json_report_header = {
33 | "sample_id": params.sample_id,
34 | "caller": {
35 | "name": "strkit",
36 | "version": __version__,
37 | },
38 | "parameters": params.to_dict(as_inputted=True),
39 | "contigs": tuple(contig_set),
40 | }
41 |
42 | dfn = _get_dfn(indent_json)
43 |     header_serialized: bytes = dfn(json_report_header)[:(-2 if indent_json else -1)]  # strip the trailing closing brace (and the newline before it, when indented)
44 |
45 | # kludge: build up a portion of the JSON file, so we can output contig results as they come instead of storing them
46 | # in memory until the end of the run.
47 | header_serialized += b","
48 | if indent_json:
49 | header_serialized += b'\n "results": [\n'
50 | else:
51 | header_serialized += b'"results":['
52 |
53 | # write partial JSON
54 | _write_bytes(header_serialized, json_path, "wb")
55 |
56 |
57 | def output_json_report_results(results: tuple[LocusResult, ...], is_last: bool, json_path: str, indent_json: bool):
58 | dfn = _get_dfn(indent_json)
59 | results_bytes: bytes = dfn(results)
60 |
61 | if indent_json:
62 | results_bytes = results_bytes[2:-2] # remove opening and closing "[]" + trailing newline
63 | if not is_last:
64 | results_bytes += b",\n"
65 | else:
66 | results_bytes = results_bytes[1:-1] # remove opening and closing "[]"
67 | if not is_last:
68 | results_bytes += b","
69 |
70 | # write results "rows"
71 | _write_bytes(results_bytes, json_path, "ab")
72 |
73 |
74 | def output_json_report_footer(time_taken: float, json_path: str, indent_json: bool):
75 | runtime_bytes = dumps(time_taken)
76 | if indent_json:
77 | footer_bytes = b'\n ],\n "runtime": ' + runtime_bytes + b'\n}\n'
78 | else:
79 | footer_bytes = b'],"runtime":' + runtime_bytes + b'}\n'
80 |
81 | # write partial JSON
82 | _write_bytes(footer_bytes, json_path, "ab")
83 |
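A minimal sketch of the intended call order; the `params` (a CallParams instance), `contigs`, and per-contig `contig_results` arguments are hypothetical stand-ins supplied by the caller. Because each call appends to the same file, the three pieces concatenate into a single valid JSON document.

from strkit.call.output.json_report import (
    output_json_report_header,
    output_json_report_results,
    output_json_report_footer,
)


def write_streamed_report(params, contigs, contig_results, time_taken, path="report.json", indent_json=False):
    contig_results = list(contig_results)  # list of per-contig LocusResult tuples
    output_json_report_header(params, set(contigs), path, indent_json)
    for i, results in enumerate(contig_results):
        output_json_report_results(results, i == len(contig_results) - 1, path, indent_json)
    output_json_report_footer(time_taken, path, indent_json)
    # Resulting shape: {"sample_id": ..., "caller": {"name": "strkit", ...},
    #                   "parameters": {...}, "contigs": [...], "results": [...], "runtime": ...}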
--------------------------------------------------------------------------------
/strkit/call/output/tsv.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | __all__ = ["output_tsv"]
4 |
5 |
6 | def _cn_to_str(cn: int | float) -> str:
7 | return f"{cn:.1f}" if isinstance(cn, float) else str(cn)
8 |
9 |
10 | def output_tsv(results: tuple[dict, ...], has_snv_vcf: bool):
11 | for res in results:
12 | has_call = res["call"] is not None
13 | # n_peaks = res["peaks"]["modal_n"]
14 |
15 | ref_cn = res.get("ref_cn")
16 | reads = res.get("reads")
17 |
18 | sys.stdout.write("\t".join((
19 | res["contig"],
20 | str(res["start"]),
21 | str(res["end"]),
22 | res["motif"],
23 | _cn_to_str(ref_cn) if ref_cn is not None else ".",
24 | ",".join(map(_cn_to_str, sorted(r["cn"] for r in reads.values()))) if reads else ".",
25 | "|".join(map(_cn_to_str, res["call"])) if has_call else ".",
26 | ("|".join("-".join(map(_cn_to_str, gc)) for gc in res["call_95_cis"]) if has_call else "."),
27 | # *((res["assign_method"] if has_call else ".",) if incorporate_snvs else ()),
28 | *((res["assign_method"] if has_call else ".",) if has_snv_vcf else ()),
29 |
30 | # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["means"][:n_peaks]))
31 | # if has_call and n_peaks <= 2 else "."),
32 | # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["weights"][:n_peaks]))
33 | # if has_call and n_peaks <= 2 else "."),
34 | # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["stdevs"][:n_peaks]))
35 | # if has_call and n_peaks <= 2 else "."),
36 | )) + "\n")
37 |
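For reference, with made-up numbers, a diploid CAG locus called 15/18 (and no SNV VCF, so no assign-method column) would produce one tab-separated line with these columns:

contig  start    end      motif  ref CN  read CNs              call   95% CIs
chr4    3074876  3074933  CAG    19      14,15,15,15,18,18,18  15|18  14-15|17-19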
--------------------------------------------------------------------------------
/strkit/call/output/vcf.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import logging
3 |
4 | from collections import Counter
5 | from os.path import commonprefix
6 | from pathlib import Path
7 | from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord
8 | from typing import Iterable
9 |
10 | from strkit.utils import cat_strs, is_none, idx_0_getter
11 | from ..allele import get_n_alleles
12 | from ..params import CallParams
13 | from ..utils import cn_getter
14 |
15 | __all__ = [
16 | "build_vcf_header",
17 | "output_contig_vcf_lines",
18 | ]
19 |
20 |
21 | # VCF_ALLELE_CNV_TR = ""
22 |
23 | # VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = (
24 | # ("SVLEN", "A", "Integer", "Length of the structural variant"),
25 | # ("CN", "A", "Float", "Copy number of allele"),
26 | # ("RN", "A", "Integer", "Total number of repeat sequences in this allele"),
27 | # ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"),
28 | # ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"),
29 | # ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"),
30 | # ("CIRUC", ".", "Float", "Confidence interval around RUC"),
31 | # ("CIRB", ".", "Integer", "Confidence interval around RB"),
32 | # )
33 |
34 | VCF_INFO_VT = "VT"
35 | VCF_INFO_MOTIF = "MOTIF"
36 | VCF_INFO_REFMC = "REFMC"
37 | VCF_INFO_ANCH = "ANCH"
38 |
39 | VT_STR = "str"
40 | VT_SNV = "snv"
41 |
42 |
43 | def iter_to_upper(x: Iterable[str]) -> Iterable[str]:
44 | # noinspection PyTypeChecker
45 | return map(str.upper, x)
46 |
47 |
48 | def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
49 | vh = VariantHeader() # automatically sets VCF version to 4.2
50 |
51 | # Add an absolute path to the reference genome
52 | vh.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}")
53 |
54 | # Add all contigs from the reference genome file + lengths
55 | rf = FastaFile(reference_file)
56 | try:
57 | for contig in rf.references:
58 | vh.contigs.add(contig, length=rf.get_reference_length(contig))
59 | finally:
60 | rf.close()
61 |
62 | # Add CNV:TR alt type (symbolic allele: tandem repeat)
63 | # vh.add_meta("ALT", "")
64 |
65 | # Set up basic VCF formats
66 | vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
67 | vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence")
68 | vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)")
69 | vh.formats.add("DP", 1, "Integer", "Read depth")
70 | vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
71 | vh.formats.add("GT", 1, "String", "Genotype")
72 | vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
73 | vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
74 | vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele")
75 | vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.")
76 | vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call")
77 | vh.formats.add("PS", 1, "Integer", "Phase set")
78 | vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")
79 |
80 | # Set up VCF info fields
81 | vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)")
82 | vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string")
83 | vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome")
84 | vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size")
85 |
86 | # Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO
87 | # for iv in VCF_TR_INFO_RECORDS:
88 | # vh.info.add(*iv)
89 |
90 | # Add the sample
91 | vh.add_sample(sample_id)
92 |
93 | return vh
94 |
95 |
96 | def _vr_pos_key(vr: VariantRecord) -> int:
97 | return vr.pos
98 |
99 |
100 | def _reversed_str(s: str) -> str:
101 | return cat_strs(reversed(s))
102 |
103 |
104 | @functools.cache
105 | def _blank_entry(n_alleles: int) -> tuple[None, ...]:
106 | return tuple([None] * n_alleles)
107 |
108 |
109 | def output_contig_vcf_lines(
110 | params: CallParams,
111 | sample_id: str,
112 | variant_file: VariantFile,
113 | results: tuple[dict, ...],
114 | logger: logging.Logger,
115 | ) -> None:
116 | variant_records: list[VariantRecord] = []
117 |
118 | # has_at_least_one_snv_set = next((r.get("snvs") is not None for r in results), None) is not None
119 | snvs_written: set[str] = set()
120 |
121 | for result_idx, result in enumerate(results, 1):
122 | contig = result["contig"]
123 | start = result["start"]
124 |
125 | if "ref_start_anchor" not in result:
126 | logger.debug(f"No ref anchor for {contig}:{start}; skipping VCF output for locus")
127 | continue
128 |
129 | ref_start_anchor = result["ref_start_anchor"].upper()
130 | ref_seq = result["ref_seq"].upper()
131 |
132 | n_alleles: int = get_n_alleles(2, params.sex_chroms, contig) or 2
133 |
134 | res_reads = result["reads"]
135 | res_peaks = result["peaks"] or {}
136 |
137 | peak_seqs_and_methods = {(seq.upper() if seq else seq): method for seq, method in res_peaks.get("seqs", [])}
138 | peak_seqs: tuple[str, ...] = tuple(peak_seqs_and_methods.keys())
139 | peak_start_anchor_seqs: list[str] = list(map(idx_0_getter, res_peaks.get("start_anchor_seqs", [])))
140 |
141 | if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks
142 | logger.error(f"Encountered None in results[{result_idx}].peaks.seqs: {peak_seqs}")
143 | continue
144 |
145 | if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks
146 | logger.error(f"Encountered None in results[{result_idx}].peaks.start_anchor_seqs: {peak_start_anchor_seqs}")
147 | continue
148 |
149 | peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs))
150 | common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper])
151 | # anchor_offset = how many bases we can cut off from the front of the anchor
152 | # since they're shared between all alleles - yields a more compact representation.
153 | # - we need to leave one base as an anchor for VCF compliance though, thus the min(...)
154 | anchor_offset = min(len(common_anchor_prefix), params.vcf_anchor_size - 1)
155 |
156 | ref_start_anchor = ref_start_anchor[anchor_offset:]
157 | ref_seq_with_anchor = ref_start_anchor + ref_seq
158 |
159 | seqs_with_anchors: list[tuple[str, str]] = list(
160 | zip(peak_seqs, map(lambda a: a[anchor_offset:], peak_start_anchor_seqs_upper))
161 | )
162 |
163 | if 0 < len(peak_seqs) < n_alleles:
164 | peak_seqs = tuple([peak_seqs[0]] * n_alleles)
165 | seqs_with_anchors = [seqs_with_anchors[0]] * n_alleles
166 |
167 | seq_alts = sorted(
168 | set(filter(lambda c: not (c[1] + c[0] == ref_seq_with_anchor), seqs_with_anchors)),
169 | key=lambda c: c[1] + c[0]
170 | )
171 |
172 | call = result["call"]
173 | call_95_cis = result["call_95_cis"]
174 |
175 | seq_alleles_raw: tuple[str | None, ...] = (
176 | ((ref_seq, ref_start_anchor), *(seq_alts or (None,)))
177 | if call is not None
178 | else ()
179 | )
180 |
181 | seq_alleles: list[str] = [ref_seq_with_anchor]
182 |
183 | if call is not None and seq_alts:
184 | # If we have a complete deletion, including the anchor, use a symbolic allele meaning "upstream deletion"
185 | for alt_tr_seq, alt_anchor in seq_alts:
186 | if not alt_tr_seq and not alt_anchor:
187 | seq_alleles.append("*")
188 | continue
189 | seq_alleles.append(alt_anchor + alt_tr_seq)
190 | else:
191 | seq_alleles.append(".")
192 |
193 | start = result.get("start_adj", start) - len(ref_start_anchor)
194 |
195 | vr: VariantRecord = variant_file.new_record(
196 | contig=contig,
197 | start=start,
198 | alleles=seq_alleles,
199 | )
200 |
201 | vr.info[VCF_INFO_VT] = VT_STR
202 | vr.info[VCF_INFO_MOTIF] = result["motif"]
203 | vr.info[VCF_INFO_REFMC] = result["ref_cn"]
204 | vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset
205 |
206 | vr.samples[sample_id]["GT"] = (
207 | tuple(map(seq_alleles_raw.index, seqs_with_anchors))
208 | if call is not None and peak_seqs
209 | else _blank_entry(n_alleles)
210 | )
211 |
212 | if am := result.get("assign_method"):
213 | vr.samples[sample_id]["PM"] = am
214 |
215 | str_snvs = result.get("snvs", ())
216 | if str_snvs:
217 | # Record number of support SNVs for the locus
218 | vr.samples[sample_id]["NSNV"] = len(str_snvs)
219 |
220 | vr.samples[sample_id]["DP"] = len(res_reads)
221 | vr.samples[sample_id]["MMAS"] = result.get("mean_model_align_score")
222 |
223 | if call is not None and res_peaks:
224 | vr.samples[sample_id]["DPS"] = sum(res_peaks["n_reads"])
225 | vr.samples[sample_id]["AD"] = tuple(res_peaks["n_reads"])
226 | vr.samples[sample_id]["MC"] = tuple(map(int, call))
227 | vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)
228 |
229 | vr.samples[sample_id]["ANCL"] = tuple(len(ar[1]) for ar in seq_alleles_raw if ar is not None)
230 |
231 | # For each alt, mention which consensus method was used to obtain the sequence.
232 | cons = tuple(
233 | peak_seqs_and_methods[ar[0]] for ar in seq_alleles_raw[1:] if ar is not None
234 | )
235 | vr.samples[sample_id]["CONS"] = cons if cons else (".",)
236 |
237 | # Produces a histogram-like format for read-level copy numbers
238 | # e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7x1|8x10|9x1,8x2|9x12
239 | vr.samples[sample_id]["MCRL"] = tuple(
240 | "|".join(
241 | map(
242 | lambda pair: "x".join(map(str, pair)),
243 | sorted(
244 | Counter(
245 | map(cn_getter, filter(lambda r: r.get("p") == pi, res_reads.values()))
246 | ).items()
247 | )
248 | )
249 | )
250 | for pi in range(res_peaks["modal_n"])
251 | )
252 |
253 | ps = result["ps"]
254 |
255 | try:
256 | if ps is not None: # have phase set on call, so mark as phased
257 | vr.samples[sample_id].phased = True
258 | vr.samples[sample_id]["PS"] = ps
259 | except TypeError:
260 | vr.samples[sample_id].phased = False
261 | logger.error(f"Received bad PS value while writing VCF record at {contig}:{start} - {ps}")
262 | ps = None
263 |
264 | for snv in str_snvs:
265 | snv_id = snv["id"]
266 | if snv_id in snvs_written:
267 | continue
268 | snvs_written.add(snv_id)
269 |
270 | ref = snv["ref"]
271 | snv_alts = tuple(sorted(set(filter(lambda v: v != ref, snv["call"]))))
272 | snv_alleles = (ref, *snv_alts)
273 | snv_pos = snv["pos"]
274 |
275 | if len(snv_alleles) < 2:
276 | logger.error(f"Error while writing VCF: SNV ({snv_id}) at {contig}:{snv_pos+1} has no alts")
277 | continue
278 |
279 | snv_vr: VariantRecord = variant_file.new_record(
280 | contig=contig,
281 | id=snv_id,
282 | start=snv_pos,
283 | stop=snv_pos + 1,
284 | alleles=snv_alleles,
285 | )
286 |
287 | snv_vr.info[VCF_INFO_VT] = VT_SNV
288 |
289 | snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"]))
290 | snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"])
291 | snv_vr.samples[sample_id]["AD"] = snv["rcs"]
292 |
293 | if ps is not None:
294 | snv_vr.samples[sample_id].phased = True
295 | snv_vr.samples[sample_id]["PS"] = ps
296 |
297 | variant_records.append(snv_vr)
298 |
299 | variant_records.append(vr)
300 |
301 | # sort the variant records by position
302 | variant_records.sort(key=_vr_pos_key)
303 |
304 | # write them to the VCF
305 | for vrr in variant_records:
306 | variant_file.write(vrr)
307 |
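A minimal sketch of how these two functions are meant to fit together with pysam; the `params` (CallParams) and per-contig `results` objects here are hypothetical stand-ins supplied by the caller.

import logging
from pysam import VariantFile
from strkit.call.output.vcf import build_vcf_header, output_contig_vcf_lines


def write_contig_vcf(params, results, sample_id="SAMPLE1", out_path="out.vcf"):
    # results: tuple of per-locus result dicts for one contig, as produced by the caller
    logger = logging.getLogger("strkit-vcf-example")
    header = build_vcf_header(sample_id, params.reference_file)
    with VariantFile(out_path, "w", header=header) as vf:
        output_contig_vcf_lines(params, sample_id, vf, results, logger)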
--------------------------------------------------------------------------------
/strkit/call/params.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pathlib
3 |
4 | from pysam import AlignmentFile
5 |
6 | from ..logger import log_levels
7 |
8 | __all__ = ["CallParams"]
9 |
10 |
11 | class CallParams:
12 | def __init__(
13 | self,
14 |
15 | logger: logging.Logger,
16 |
17 | read_file: str,
18 | reference_file: str,
19 | loci_file: str,
20 | sample_id: str | None,
21 | min_reads: int = 4,
22 | min_allele_reads: int = 2,
23 | max_reads: int = 250,
24 | min_avg_phred: int = 13,
25 | min_read_align_score: float = 0.9,
26 | max_rcn_iters: int = 50,
27 | num_bootstrap: int = 100,
28 | flank_size: int = 70,
29 | skip_supplementary: bool = False,
30 | skip_secondary: bool = False,
31 | sex_chroms: str | None = None,
32 | realign: bool = False,
33 | hq: bool = False,
34 | use_hp: bool = False,
35 | snv_vcf: pathlib.Path | None = None,
36 | snv_min_base_qual: int = 20,
37 | targeted: bool = False,
38 | respect_ref: bool = False,
39 | count_kmers: str = "none", # "none" | "peak" | "read"
40 | consensus: bool = False,
41 | vcf_anchor_size: int = 5,
42 | # ---
43 | log_level: int = logging.WARNING,
44 | seed: int | None = None,
45 | processes: int = 1,
46 | ):
47 | self.read_file: str = read_file
48 | self.reference_file: str = reference_file
49 | self.loci_file: str = loci_file
50 | self.min_reads: int = min_reads
51 | self.min_allele_reads: int = min_allele_reads
52 | self.max_reads: int = max_reads
53 | self.min_avg_phred: int = min_avg_phred
54 | self.min_read_align_score: float = min_read_align_score
55 | self.max_rcn_iters: int = max_rcn_iters
56 | self.num_bootstrap: int = num_bootstrap
57 | self.flank_size: int = flank_size
58 | self.skip_supplementary: bool = skip_supplementary
59 | self.skip_secondary: bool = skip_secondary
60 | self.sex_chroms: str | None = sex_chroms
61 | self.realign: bool = realign
62 | self.hq: bool = hq
63 | self.use_hp: bool = use_hp
64 | self.snv_vcf: pathlib.Path | None = snv_vcf
65 | self.snv_min_base_qual: int = snv_min_base_qual
66 | self.targeted: bool = targeted
67 | self.respect_ref: bool = respect_ref
68 | self.count_kmers: str = count_kmers
69 | self.consensus: bool = consensus
70 | self.vcf_anchor_size: int = vcf_anchor_size
71 | # ---
72 | self.log_level: int = log_level
73 | self.seed: int | None = seed
74 | self.processes: int = processes
75 |
76 | bf = AlignmentFile(read_file, reference_filename=reference_file)
77 |
78 | # noinspection PyTypeChecker
79 | bfh = bf.header.to_dict()
80 |
81 | sns: set[str] = {e.get("SM") for e in bfh.get("RG", ()) if e.get("SM")}
82 | bam_sample_id: str | None = None
83 |
84 | if len(sns) > 1:
85 | # Error or warning or what?
86 | sns_str = "', '".join(sns)
87 | logger.warning(f"Found more than one sample ID in BAM file(s): '{sns_str}'")
88 | elif not sns:
89 | if not sample_id:
90 | logger.warning("Could not find sample ID in BAM file(s); sample ID can be set manually via --sample-id")
91 | else:
92 | bam_sample_id = sns.pop()
93 |
94 | self._sample_id_orig: str | None = sample_id
95 | self.sample_id = sample_id or bam_sample_id
96 |
97 | @classmethod
98 | def from_args(cls, logger: logging.Logger, p_args):
99 | return cls(
100 | logger,
101 | p_args.read_file,
102 | p_args.ref,
103 | p_args.loci,
104 | sample_id=p_args.sample_id,
105 | min_reads=p_args.min_reads,
106 | min_allele_reads=p_args.min_allele_reads,
107 | max_reads=p_args.max_reads,
108 | min_avg_phred=p_args.min_avg_phred,
109 | min_read_align_score=p_args.min_read_align_score,
110 | max_rcn_iters=p_args.max_rcn_iters,
111 | num_bootstrap=p_args.num_bootstrap,
112 | flank_size=p_args.flank_size,
113 | skip_supplementary=p_args.skip_supplementary,
114 | skip_secondary=p_args.skip_secondary,
115 | sex_chroms=p_args.sex_chr,
116 | realign=p_args.realign,
117 | hq=p_args.hq,
118 | use_hp=p_args.use_hp,
119 | snv_vcf=p_args.incorporate_snvs,
120 | snv_min_base_qual=p_args.snv_min_base_qual,
121 | targeted=p_args.targeted,
122 | respect_ref=p_args.respect_ref,
123 | count_kmers=p_args.count_kmers,
124 |             consensus=p_args.consensus or bool(p_args.vcf),  # Consensus calculation is required for VCF output.
125 | vcf_anchor_size=min(max(p_args.vcf_anchor_size, 1), p_args.flank_size),
126 | # ---
127 | log_level=log_levels[p_args.log_level],
128 | seed=p_args.seed,
129 | processes=p_args.processes,
130 | )
131 |
132 | def to_dict(self, as_inputted: bool = False):
133 | return {
134 | "read_file": self.read_file,
135 | "reference_file": self.reference_file,
136 | "min_reads": self.min_reads,
137 | "min_allele_reads": self.min_allele_reads,
138 | "max_reads": self.max_reads,
139 | "min_avg_phred": self.min_avg_phred,
140 | "min_read_align_score": self.min_read_align_score,
141 | "max_rcn_iters": self.max_rcn_iters,
142 | "num_bootstrap": self.num_bootstrap,
143 | "flank_size": self.flank_size,
144 | "skip_supplementary": self.skip_supplementary,
145 | "skip_secondary": self.skip_secondary,
146 | "sample_id": self._sample_id_orig if as_inputted else self.sample_id,
147 | "realign": self.realign,
148 | "hq": self.hq,
149 | "use_hp": self.use_hp,
150 | "snv_vcf": str(self.snv_vcf) if self.snv_vcf else None,
151 | "snv_min_base_qual": self.snv_min_base_qual,
152 | "targeted": self.targeted,
153 | "respect_ref": self.respect_ref,
154 | "count_kmers": self.count_kmers,
155 | "consensus": self.consensus,
156 | "vcf_anchor_size": self.vcf_anchor_size,
157 | "log_level": self.log_level,
158 | "seed": self.seed,
159 | "processes": self.processes,
160 | }
161 |
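A minimal construction sketch; the file paths below are hypothetical, and since CallParams reads the BAM header on construction they must point at real files.

import logging
from strkit.call.params import CallParams

logger = logging.getLogger("strkit-example")

params = CallParams(
    logger,
    "/path/to/sample.bam",    # read_file
    "/path/to/reference.fa",  # reference_file
    "/path/to/loci.bed",      # loci_file
    sample_id=None,           # fall back to the SM tag from the BAM's @RG header lines
    num_bootstrap=100,
    processes=4,
)
print(params.sample_id)                  # sample ID discovered in the BAM header, if any
print(params.to_dict(as_inputted=True))  # parameters as supplied, for the JSON report header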
--------------------------------------------------------------------------------
/strkit/call/realign.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import multiprocessing as mp
3 | import numpy as np
4 | import os
5 | import parasail
6 | import queue
7 | import time
8 |
9 | from numpy.typing import NDArray
10 |
11 | from .align_matrix import match_score, dna_matrix
12 | from .cigar import decode_cigar_np, get_aligned_pair_matches
13 | from .params import CallParams
14 | from .utils import calculate_seq_with_wildcards
15 |
16 | __all__ = [
17 | "MatchedCoordPairListOrNone",
18 | "realign_read",
19 | "perform_realign",
20 | ]
21 |
22 |
23 | min_realign_score_ratio: float = 0.95 # TODO: parametrize
24 | realign_indel_open_penalty: int = 7 # TODO: parametrize
25 | max_ref_len_for_same_proc: int = 1200 # TODO: parametrize
26 | max_read_len_for_same_proc: int = 20000 # TODO: parametrize
27 |
28 |
29 | MatchedCoordPairList = tuple[NDArray[np.uint64], NDArray[np.uint64]]
30 | MatchedCoordPairListOrNone = MatchedCoordPairList | None
31 |
32 |
33 | def realign_read(
34 | ref_seq: str,
35 | query_seq: str,
36 | left_flank_coord: int,
37 | flank_size: int,
38 | rn: str,
39 | t_idx: int,
40 | always_realign: bool,
41 | q, # mp.Queue | None
42 | log_level: int = logging.WARNING,
43 | ) -> MatchedCoordPairListOrNone:
44 | # Have to re-attach logger in separate process I guess
45 |
46 | def ret_q(v: MatchedCoordPairListOrNone) -> MatchedCoordPairListOrNone:
47 | if q:
48 | q.put(v)
49 | q.close()
50 | return v
51 |
52 | from strkit.logger import create_process_logger
53 | lg = create_process_logger(os.getpid(), log_level)
54 |
55 | # flipped: 'ref sequence' as query here, since it should in general be shorter (!)
56 | pr = parasail.sg_dx_trace_scan_16(
57 | # fetch an extra base for the right flank coordinate check later (needs to be >= the exclusive coord)
58 | ref_seq, query_seq, realign_indel_open_penalty, 0, dna_matrix)
59 |
60 | if pr.score < (th := min_realign_score_ratio * (flank_size * 2 * match_score - realign_indel_open_penalty)):
61 | lg.debug(f"Realignment for {rn} scored below threshold ({pr.score} < {th:.2f})")
62 | return ret_q(None)
63 |
64 | lg.debug(
65 | f"Realigned {rn} in locus {t_idx}{' (due to soft clipping)' if not always_realign else ''}: scored {pr.score}; "
66 | f"Flipped CIGAR: {pr.cigar.decode.decode('ascii')}")
67 |
68 | matches = get_aligned_pair_matches(decode_cigar_np(pr.cigar.seq), left_flank_coord, 0)
69 | res: MatchedCoordPairList = (matches[1], matches[0])
70 | return ret_q(res)
71 |
72 |
73 | def perform_realign(
74 | t_idx: int,
75 | left_flank_coord: int,
76 | ref_total_seq: str,
77 | rn: str,
78 | qs: str,
79 | fqqs: NDArray[np.uint8],
80 | # ---
81 | params: CallParams,
82 | realign_timeout: int,
83 | force_realign: bool,
84 | # ---
85 | logger_: logging.Logger,
86 | locus_log_str: str,
87 | ) -> MatchedCoordPairListOrNone:
88 | qs_wc = calculate_seq_with_wildcards(qs, fqqs)
89 |
90 | ref_seq_len = len(ref_total_seq)
91 | qs_len = len(qs_wc)
92 |
93 | if ref_seq_len <= max_ref_len_for_same_proc and qs_len <= max_read_len_for_same_proc:
94 | # Don't start process for short realigns, since then process startup dominates the total time taken
95 | # TODO: more robust solution; realign worker somehow? How to do timeout?
96 | return realign_read(
97 | ref_total_seq, qs_wc, left_flank_coord, params.flank_size, rn, t_idx, force_realign, None, params.log_level
98 | )
99 |
100 | t = time.time()
101 |
102 | q: mp.Queue = mp.Queue()
103 | proc = mp.Process(target=realign_read, daemon=False, kwargs=dict(
104 | # fetch an extra base for the right flank coordinate check later (needs to be >= the exclusive coord)
105 | ref_seq=ref_total_seq, # TODO: with the plus 1, really?
106 | query_seq=qs_wc,
107 | left_flank_coord=left_flank_coord,
108 | flank_size=params.flank_size,
109 | rn=rn,
110 | t_idx=t_idx,
111 | always_realign=force_realign,
112 | q=q,
113 | log_level=params.log_level,
114 | ))
115 | proc.start()
116 |
117 | pairs_new = None
118 | try:
119 | pairs_new = q.get(timeout=realign_timeout)
120 | proc.join()
121 | except queue.Empty:
122 | logger_.warning(
123 | f"{locus_log_str} - experienced timeout while re-aligning read {rn}. Reverting to initial "
124 | f"alignment.")
125 | proc.terminate()
126 | time.sleep(0.1) # wait a little for the process to terminate
127 | finally:
128 | wait_count: int = 0
129 | while proc.is_alive():
130 | logger_.warning(f"{locus_log_str} - realign job has still not exited. Waiting 0.5 seconds...")
131 | time.sleep(0.5)
132 | wait_count += 1
133 | if wait_count > 30:
134 | logger_.fatal(f"{locus_log_str} - realign job never exited. Terminating...")
135 | exit(1)
136 | proc.close()
137 |
138 | logger_.debug(
139 | f"{locus_log_str} - {rn}: long realign job completed in {time.time() - t:.4f}s ({ref_seq_len=}, {qs_len=})")
140 |
141 | return pairs_new
142 |
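To make the acceptance threshold above concrete: with the module constants here and the CallParams default flank_size of 70, the minimum realignment score works out as follows.

from strkit.call.align_matrix import match_score
from strkit.call.realign import min_realign_score_ratio, realign_indel_open_penalty

flank_size = 70  # CallParams default
threshold = min_realign_score_ratio * (flank_size * 2 * match_score - realign_indel_open_penalty)
print(threshold)  # 0.95 * (2 * 70 * 2 - 7) = 0.95 * 273 = 259.35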
--------------------------------------------------------------------------------
/strkit/call/repeats.py:
--------------------------------------------------------------------------------
1 | import parasail
2 |
3 | from functools import lru_cache
4 | from typing import Literal
5 |
6 | from strkit_rust_ext import get_repeat_count as _get_repeat_count
7 | from strkit.utils import idx_1_getter
8 |
9 | from .align_matrix import dna_matrix, indel_penalty
10 |
11 | __all__ = [
12 | "get_repeat_count",
13 | "get_ref_repeat_count",
14 | ]
15 |
16 |
17 | DEFAULT_LOCAL_SEARCH_RANGE = 3
18 |
19 |
20 | def score_candidate_with_string(db_seq_profile: parasail.Profile, tr_seq: str) -> int:
21 | # TODO: sub-flank again, to avoid more errors in flanking region contributing to score?
22 | # Always assign parasail results to variables due to funky memory allocation behaviour
23 | # - switch 'db' and 'query' here so we can use the db sequence as the profile for a "database" search against
24 | # candidate sequences. order doesn't end up mattering, since we're using semi-global alignment.
25 | r = parasail.sg_striped_profile_sat(db_seq_profile, tr_seq, indel_penalty, indel_penalty)
26 | return r.score
27 |
28 |
29 | def score_candidate(
30 | db_seq_profile: parasail.Profile,
31 | motif: str,
32 | motif_count: int,
33 | flank_left_seq: str,
34 | flank_right_seq: str,
35 | ) -> int:
36 | return score_candidate_with_string(db_seq_profile, f"{flank_left_seq}{motif * motif_count}{flank_right_seq}")
37 |
38 |
39 | def score_ref_boundaries(
40 | db_seq_profile: parasail.Profile,
41 | db_seq_rev_profile: parasail.Profile,
42 | tr_candidate: str,
43 | flank_left_seq: str,
44 | flank_right_seq: str,
45 | ref_size: int,
46 | ) -> tuple[tuple[int, int], tuple[int, int]]:
47 | # Always assign parasail results to variables due to funky memory allocation behaviour
48 | ext_r_seq = f"{flank_left_seq}{tr_candidate}"
49 | r_fwd = parasail.sg_qe_scan_profile_sat(db_seq_profile, ext_r_seq, indel_penalty, indel_penalty)
50 | r_adj = r_fwd.end_query + 1 - len(flank_left_seq) - ref_size # Amount to tweak boundary on the right side by
51 |
52 | # Used to be flank_right_seq[max(r_adj, 0):] but I think that adjustment makes this score worse than it should and
53 | # wasn't valid, since what matters is the delta over the limit...
54 | ext_l_seq = f"{tr_candidate}{flank_right_seq}"[::-1] # reverse
55 |
56 | r_rev = parasail.sg_qe_scan_profile_sat(db_seq_rev_profile, ext_l_seq, indel_penalty, indel_penalty)
57 | l_adj = r_rev.end_query + 1 - len(flank_right_seq) - ref_size # Amount to tweak boundary on the left side by
58 |
59 | return (r_fwd.score, r_adj), (r_rev.score, l_adj)
60 |
61 |
62 | # TODO: instead of lru_cache, some more custom mechanism for sharing?
63 | @lru_cache(maxsize=512)
64 | def get_repeat_count(
65 | start_count: int,
66 | tr_seq: str,
67 | flank_left_seq: str,
68 | flank_right_seq: str,
69 | motif: str,
70 | max_iters: int,
71 | local_search_range: int = DEFAULT_LOCAL_SEARCH_RANGE, # TODO: Parametrize for user
72 | step_size: int = 1,
73 | ) -> tuple[tuple[int, int], int, int]:
74 | return _get_repeat_count(
75 | start_count, tr_seq, flank_left_seq, flank_right_seq, motif, max_iters, local_search_range, step_size
76 | )
77 |
78 |
79 | def get_ref_repeat_count(
80 | start_count: int,
81 | tr_seq: str,
82 | flank_left_seq: str,
83 | flank_right_seq: str,
84 | motif: str,
85 | ref_size: int,
86 | vcf_anchor_size: int,
87 | max_iters: int,
88 | respect_coords: bool = False,
89 | local_search_range: int = DEFAULT_LOCAL_SEARCH_RANGE, # TODO: Parametrize for user
90 | step_size: int = 1,
91 | ) -> tuple[tuple[int | float, int], int, int, tuple[int, int], tuple[str, str, str]]:
92 | l_offset: int = 0
93 | r_offset: int = 0
94 |
95 | db_seq: str = f"{flank_left_seq}{tr_seq}{flank_right_seq}"
96 | db_seq_profile: parasail.Profile = parasail.profile_create_sat(db_seq, dna_matrix)
97 | db_seq_rev_profile: parasail.Profile = parasail.profile_create_sat(db_seq[::-1], dna_matrix)
98 |
99 | motif_size = len(motif)
100 |
101 | n_offset_scores: int = 0
102 |
103 | if not respect_coords: # Extend out coordinates from initial definition
104 | to_explore: list[tuple[int, Literal[-1, 0, 1]]] = [
105 | (start_count - step_size, -1), (start_count + step_size, 1), (start_count, 0)]
106 |
107 | fwd_sizes_scores_adj: dict[int | float, tuple[int, int]] = {}
108 | rev_sizes_scores_adj: dict[int | float, tuple[int, int]] = {}
109 |
110 | while to_explore and n_offset_scores < max_iters:
111 | size_to_explore, direction = to_explore.pop()
112 | if size_to_explore < 0:
113 | continue
114 |
115 | fwd_scores: list[tuple[float | int, tuple[int, int], int]] = [] # For right-side adjustment
116 | rev_scores: list[tuple[float | int, tuple[int, int], int]] = [] # For left-side adjustment
117 |
118 | start_size = max(
119 | size_to_explore - (local_search_range if (direction < 1 or step_size > local_search_range) else 0), 0)
120 | end_size = size_to_explore + (local_search_range if (direction > -1 or step_size > local_search_range)
121 | else 0)
122 |
123 | for i in range(start_size, end_size + 1):
124 | fwd_rs = fwd_sizes_scores_adj.get(i)
125 | rev_rs = rev_sizes_scores_adj.get(i)
126 |
127 | if fwd_rs is None or rev_rs is None:
128 | res = score_ref_boundaries(
129 | db_seq_profile, db_seq_rev_profile, motif * i, flank_left_seq, flank_right_seq, ref_size)
130 |
131 | fwd_sizes_scores_adj[i] = fwd_rs = res[0]
132 | rev_sizes_scores_adj[i] = rev_rs = res[1]
133 |
134 | n_offset_scores += 1
135 |
136 | fwd_scores.append((i, fwd_rs, i))
137 | rev_scores.append((i, rev_rs, i))
138 |
139 | mv: tuple[float | int, tuple[int, int], int] = max((*fwd_scores, *rev_scores), key=idx_1_getter)
140 | if mv[2] > size_to_explore and (
141 | (new_rc := mv[2] + step_size) not in fwd_sizes_scores_adj or new_rc not in rev_sizes_scores_adj):
142 | if new_rc >= 0:
143 | to_explore.append((new_rc, 1))
144 | if mv[2] < size_to_explore and (
145 | (new_rc := mv[2] - step_size) not in fwd_sizes_scores_adj or new_rc not in rev_sizes_scores_adj):
146 | if new_rc >= 0:
147 | to_explore.append((new_rc, -1))
148 |
149 | # noinspection PyTypeChecker
150 | fwd_top_res: tuple[int | float, tuple] = max(fwd_sizes_scores_adj.items(), key=lambda x: x[1][0])
151 | # noinspection PyTypeChecker
152 | rev_top_res: tuple[int | float, tuple] = max(rev_sizes_scores_adj.items(), key=lambda x: x[1][0])
153 |
154 | # Ignore negative differences (contractions vs TRF definition), but follow expansions
155 | # TODO: Should we incorporate contractions? How would that work?
156 |
157 | l_offset = rev_top_res[1][1]
158 | r_offset = fwd_top_res[1][1]
159 |
160 | if l_offset >= len(flank_left_seq) - vcf_anchor_size:
161 | # don't do anything weird if we're removing the entire flank sequence
162 | # TODO: this can be caused by NNNNNNN - see chr5:139453668-139454525 in GRCh38
163 | l_offset = 0
164 | if r_offset >= len(flank_right_seq):
165 | r_offset = 0 # same here
166 |
167 | if l_offset > 0:
168 | tr_seq = flank_left_seq[-1*l_offset:] + tr_seq # first, move a chunk of the left flank to the TR seq
169 | flank_left_seq = flank_left_seq[:-1*l_offset] # then, remove that chunk from the left flank
170 | if r_offset > 0:
171 | tr_seq = tr_seq + flank_right_seq[:r_offset] # same, but for the right flank
172 | flank_right_seq = flank_right_seq[r_offset:]
173 |
174 | # ------------------------------------------------------------------------------------------------------------------
175 |
176 | final_res, n_iters_final_count, _ = get_repeat_count(
177 | # always start with int here:
178 | round(((start_count * motif_size) + (max(0, l_offset) + max(0, r_offset))) / motif_size),
179 | tr_seq,
180 | flank_left_seq,
181 | flank_right_seq,
182 | motif,
183 | max_iters=max_iters,
184 | step_size=step_size,
185 | )
186 |
187 | return (
188 | final_res, l_offset, r_offset, (n_offset_scores, n_iters_final_count), (flank_left_seq, tr_seq, flank_right_seq)
189 | )
190 |
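A minimal sketch of the read-level counting entry point with made-up sequences; going by the type annotations above, the first element of the result is a (copy number, alignment score) pair and the second is an iteration/scoring count.

from strkit.call.repeats import get_repeat_count

tr_seq = "CAG" * 12
(copy_number, score), n_scored, _ = get_repeat_count(
    start_count=10,  # initial estimate, e.g. from the catalog/reference definition
    tr_seq=tr_seq,
    flank_left_seq="TTGAGTAACCTGATTTCCAA",
    flank_right_seq="GGATCTTACGTTTCAAGGCT",
    motif="CAG",
    max_iters=50,
)
print(copy_number)  # should converge on 12 for this synthetic sequence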
--------------------------------------------------------------------------------
/strkit/call/snvs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import multiprocessing.managers as mmg
3 |
4 | from collections import Counter
5 |
6 | from strkit_rust_ext import get_read_snvs, process_read_snvs_for_locus_and_calculate_useful_snvs, CandidateSNVs
7 | from strkit.utils import idx_1_getter
8 |
9 | from .types import ReadDict, CalledSNV
10 |
11 |
12 | __all__ = [
13 | "SNV_OUT_OF_RANGE_CHAR",
14 | "SNV_GAP_CHAR",
15 | "SNV_NA_CHARS",
16 | "get_read_snvs",
17 | "call_and_filter_useful_snvs",
18 | "process_read_snvs_for_locus_and_calculate_useful_snvs",
19 | ]
20 |
21 | SNV_OUT_OF_RANGE_CHAR = "-"
22 | SNV_GAP_CHAR = "_"
23 | SNV_NA_CHARS = (SNV_OUT_OF_RANGE_CHAR, SNV_GAP_CHAR)
24 |
25 |
26 | def call_and_filter_useful_snvs(
27 | contig: str,
28 | n_alleles: int,
29 | read_dict: dict[str, ReadDict],
30 | useful_snvs: list[tuple[int, int]],
31 | candidate_snvs: CandidateSNVs,
32 | # ---
33 | snv_quality_threshold: int,
34 | # ---
35 | snv_genotype_cache: mmg.DictProxy,
36 | # ---
37 | locus_log_str: str,
38 | logger_: logging.Logger,
39 | ) -> list[CalledSNV]:
40 | """
41 | Call useful SNVs at a locus level from read-level SNV data.
42 | :param contig: The contig of the SNVs. Used for generating an ID if one does not exist.
43 | :param n_alleles: The number of alleles called for this locus.
44 | :param read_dict: Dictionary of read data. Must already have peaks assigned.
45 | :param useful_snvs: List of tuples representing useful SNVs: (SNV index, reference position)
46 |     :param candidate_snvs: A collection of candidate SNVs (CandidateSNVs), indexed by reference position. Used to look up SNV IDs and reference bases.
47 | :param snv_quality_threshold: Minimum PHRED score needed to incorporate a read base into the genotype.
48 | :param snv_genotype_cache: Cache for SNV genotype/phase set information.
49 | :param locus_log_str: Locus string representation for logging purposes.
50 | :param logger_: Python logger object.
51 | :return: List of called SNVs for the locus.
52 | """
53 |
54 | # Since these have already been classified as 'useful' earlier in the pipeline,
55 | # we have some guarantees that these values should be fairly internally consistent
56 | # for a given peak... most of the time.
57 |
58 | allele_range = tuple(range(n_alleles))
59 | peak_base_counts: dict[int, dict[int, Counter]] = {
60 | u_ref: {p: Counter() for p in allele_range}
61 | for _, u_ref in useful_snvs
62 | }
63 |
64 | for rn, read in read_dict.items():
65 | p: int | None = read.get("p")
66 | if p is None: # No peak; read wasn't used to call peaks
67 | continue
68 | for u_idx, (_, u_ref) in enumerate(useful_snvs):
69 | su, su_q = read["snvu"][u_idx]
70 |
71 | if su == SNV_GAP_CHAR or su_q >= snv_quality_threshold:
72 | peak_base_counts[u_ref][p].update((su,))
73 |
74 | called_snvs: list[dict] = []
75 | skipped_snvs: set[int] = set()
76 |
77 | for u_idx, (u_ref, peak_counts) in enumerate(peak_base_counts.items()):
78 | call: list[str] = []
79 | rs: list[int] = []
80 |
81 | skipped: bool = False
82 |
83 | for a in allele_range:
84 | if skipped:
85 | break
86 |
87 | peak_counts_a = peak_counts[a]
88 | a_total = peak_counts[a].total()
89 |
90 | if a_total == 0: # probably due to quality filtering
91 | skipped = True
92 | logger_.warning(f"{locus_log_str} - for SNV {u_ref}, found a 0-total for allele {a} (a)")
93 | break
94 |
95 | mc = peak_counts_a.most_common(2)
96 | mcc = mc[0]
97 |
98 | try:
99 |                 if mcc[0] == SNV_OUT_OF_RANGE_CHAR:  # Choose the most common called (non-'-') value instead
100 | mcc = mc[1]
101 |
102 | for b in allele_range:
103 | if b == a:
104 | continue
105 |
106 | peak_counts_b = peak_counts[b]
107 | b_total = peak_counts_b.total()
108 |
109 | if b_total == 0: # probably due to quality filtering
110 | skipped = True
111 | logger_.warning(f"{locus_log_str} - for SNV {u_ref}, found a 0-total for allele {b} (b)")
112 | break
113 |
114 | if (peak_counts_b[mcc[0]] / b_total) > (peak_counts_a[mcc[0]] / a_total / 2): # TODO: parametrize
115 | logger_.debug(
116 | f"{locus_log_str} - for SNV position {u_ref}: got uninformative peak counts (cross-talk) - "
117 | f"{peak_counts=}")
118 | skipped = True
119 | break
120 |
121 | except IndexError: # '-' is the only value, somehow
122 | logger_.debug(
123 | f"{locus_log_str} - for SNV {u_ref}, found only '{SNV_OUT_OF_RANGE_CHAR}' with {mcc[1]} reads")
124 | logger_.debug(f"{locus_log_str} - for SNV position {u_ref}: {mc=}, {peak_counts[a]=}")
125 | skipped = True
126 | break
127 |
128 | if not skipped:
129 | call.append(mcc[0])
130 | rs.append(mcc[1])
131 |
132 | snv_call_set = set(call)
133 |
134 | if not skipped and len(snv_call_set) == 1:
135 | logger_.warning(
136 | f"{locus_log_str} - for SNV position {u_ref}: got degenerate call {call} from {peak_counts=}")
137 | skipped = True
138 |
139 | snv_rec = candidate_snvs.get(u_ref)
140 | if snv_rec is not None:
141 | snv_id = snv_rec["id"]
142 | if snv_id == ".":
143 | snv_id = f"{contig}_{u_ref}"
144 | else:
145 | snv_id = f"{contig}_{u_ref}"
146 |
147 | if not skipped:
148 | cached_snv_genotype = snv_genotype_cache.get(snv_id)
149 | if cached_snv_genotype is not None and (cgt := set(cached_snv_genotype[0])) != snv_call_set:
150 | logger_.warning(
151 | f"{locus_log_str} - got mismatch for SNV {snv_id} (position {u_ref}); cache genotype set {cgt} != "
152 | f"current genotype set {snv_call_set}")
153 | skipped = True
154 |
155 | if skipped:
156 | skipped_snvs.add(u_idx) # Skip this useful SNV, since it isn't actually useful
157 | continue
158 |
159 | called_snvs.append({
160 | "id": snv_id,
161 | **({"ref": snv_rec["ref_base"]} if snv_rec is not None else {}),
162 | "pos": u_ref,
163 | "call": tuple(call),
164 | "rcs": rs,
165 | })
166 |
167 | # If we've skipped any SNVs, filter them out of the read dict - MUTATION
168 | if skipped_snvs:
169 | for read in read_dict.values():
170 | if "snvu" not in read:
171 | continue
172 | read["snvu"] = tuple(map(idx_1_getter, filter(lambda e: e[0] not in skipped_snvs, enumerate(read["snvu"]))))
173 | logger_.debug(f"{locus_log_str} - filtered out {len(skipped_snvs)} not-actually-useful SNVs")
174 |
175 | return called_snvs
176 |
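# A minimal sketch of the intermediate peak_base_counts structure built by the locus-level
# SNV-calling function above, assuming a diploid locus and two useful SNVs (hypothetical
# reference positions and counts; "-" is the out-of-range character):
from collections import Counter

peak_base_counts_example = {
    1000123: {0: Counter({"A": 14, "-": 1}), 1: Counter({"G": 12})},
    1000456: {0: Counter({"T": 13}), 1: Counter({"C": 11, "-": 2})},
}
# Allele 0 supports A/T and allele 1 supports G/C at the two positions, so both SNVs would be
# called; a position where both alleles share the same most common base would instead be
# flagged as a degenerate call and skipped.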
--------------------------------------------------------------------------------
/strkit/call/types.py:
--------------------------------------------------------------------------------
1 | # import pysam
2 | import numpy as np
3 | from typing import Literal, TypedDict
4 | from numpy.typing import NDArray
5 |
6 |
7 | __all__ = [
8 | "VCFContigFormat",
9 | "AssignMethod",
10 | "AssignMethodWithHP",
11 | "ConsensusMethod",
12 | # ---
13 | "ReadDict",
14 | "ReadDictExtra",
15 | "CalledSNV",
16 | "LocusResult",
17 | ]
18 |
19 | # TODO: py3.11: new Required[] TypedDict structuring
20 |
21 |
22 | VCFContigFormat = Literal["chr", "num", "acc", ""]
23 |
24 | AssignMethod = Literal["dist", "snv", "snv+dist", "single"]
25 | AssignMethodWithHP = AssignMethod | Literal["hp"]
26 |
27 | ConsensusMethod = Literal["single", "poa", "best_rep"]
28 |
29 |
30 | class _ReadDictBase(TypedDict):
31 | s: Literal["-", "+"] # DNA strand alignment
32 | cn: int | float # Copy number
33 | w: float # Weight
34 | sc: float | None # Adjusted read model align score (None if TR is missing)
35 |
36 |
37 | class ReadDict(_ReadDictBase, total=False):
38 | # Whether the read was realigned by hand using a local alignment algorithm.
39 | realn: bool
40 |
41 | # Whether the read appears to be chimeric within the locus region,
42 | # i.e. aligned twice with different soft-clipping.
43 | chimeric_in_region: bool
44 |
45 | p: int # Peak (allele)
46 |
47 | kmers: dict[str, int] # Dictionary of {kmer: count}
48 |
49 | # Only added if HP tags from a haplotagged alignment file are being incorporated:
50 | hp: int
51 | ps: int
52 |
53 | # Only added if SNVs are being incorporated:
54 |     # - After filtering to only the useful SNVs, this contains a (base, quality) tuple for each of those SNVs
55 | snvu: tuple[tuple[str, int], ...]
56 |
57 |
58 | class ReadDictExtra(TypedDict, total=False):
59 | _ref_start: int # Read start in ref coordinates
60 | _ref_end: int # Read end in ref coordinates
61 |
62 | # BEGIN: only added if consensus is being calculated
63 | _start_anchor: str # Left anchor for calculated allele sequence (usually 1 base)
64 | _tr_seq: str # Tandem repeat sequence
65 | # END: only added if consensus is being calculated
66 |
67 | # Below are only added if SNVs are being incorporated:
68 |
69 | _qs: str # Query (read) sequence
70 | _fqqs: NDArray[np.uint8] # Query (read) base qualities
71 |
72 | sig_clip_left: bool # Significant amounts of clipping (5' of read)
73 | sig_clip_right: bool # Significant amounts of clipping (3' of read)
74 |
75 | snv: dict[int, str] # Intermediate result: dictionary of a bunch of SNVs for this read {position: base}
76 | # Intermediate result: tuple of bases/qualities for the set of SNVs across all reads
77 | snv_bases: tuple[tuple[str, int], ...]
78 |
79 |
80 | class _CalledSNVBase(TypedDict):
81 | id: str
82 | pos: int
83 | call: tuple[str, ...]
84 | rcs: list[int]
85 |
86 |
87 | class CalledSNV(_CalledSNVBase, total=False):
88 | ref: str
89 |
90 |
91 | class BasePeakData(TypedDict):
92 | means: NDArray[np.float32]
93 | weights: NDArray[np.float32]
94 | stdevs: NDArray[np.float32]
95 | modal_int: int
96 | n_reads: list[int]
97 |
98 |
99 | class PeakData(BasePeakData):
100 | kmers: dict[str, int]
101 |     seqs: list[tuple[str, ConsensusMethod]]  # one (consensus sequence, consensus method) tuple per peak
102 |
103 |
104 | class BaseLocusResult(TypedDict):
105 | locus_index: int
106 | contig: str
107 | start: int
108 | end: int
109 |
110 | motif: str
111 |
112 | assign_method: AssignMethodWithHP | None
113 | call: list[int] | None
114 | call_95_cis: list[list[int]] | None
115 | call_99_cis: list[list[int]] | None
116 |
117 | # Mean model (candidate TR sequence) alignment score across reads.
118 | mean_model_align_score: float | None
119 |
120 |
121 | class LocusResult(BaseLocusResult, total=False):
122 | start_adj: int
123 | end_adj: int
124 |
125 | ref_cn: int
126 |
127 | ps: int | None
128 | peaks: PeakData | None
129 | read_peaks_called: bool
130 | time: float
131 |
132 | # if we're in consensus mode: ---
133 | ref_start_anchor: str
134 | ref_seq: str
135 | # ---
136 |
137 | reads: dict[str, ReadDict]
138 | snvs: list[CalledSNV]
139 |
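# A minimal sketch of how calling code might build a ReadDict: only the _ReadDictBase keys are
# required, while the total=False fields (peak, k-mers, HP/PS tags, SNV bases) are filled in as
# later pipeline stages run. All values here are hypothetical.
from strkit.call.types import ReadDict

example_read: ReadDict = {
    "s": "+",              # forward-strand alignment
    "cn": 12.0,            # estimated copy number for this read
    "w": 1.0,              # weight
    "sc": 0.93,            # adjusted read model alignment score
    "p": 0,                # peak (allele) assignment, added once peaks are called
    "kmers": {"CAG": 11},  # k-mer counts, added when k-mer counting is enabled
}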
--------------------------------------------------------------------------------
/strkit/call/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import operator
3 |
4 | from functools import cache, partial
5 | from numpy.typing import NDArray
6 |
7 | from ..utils import cat_strs
8 |
9 | __all__ = [
10 | "cn_getter",
11 | "neq_blank",
12 | "find_pair_by_ref_pos",
13 | "normalize_contig",
14 | "round_to_base_pos",
15 | "get_new_seed",
16 | "calculate_seq_with_wildcards",
17 | ]
18 |
19 |
20 | # index/property getters and other partials
21 | cn_getter = operator.itemgetter("cn")
22 | neq_blank = partial(operator.ne, "")
23 |
24 |
25 | def find_pair_by_ref_pos(r_coords: NDArray[np.uint64], target: int, start_left: int = 0) -> tuple[int, bool]:
26 | n_pairs: int = len(r_coords)
27 | idx = start_left + np.searchsorted(r_coords[start_left:], target)
28 | return idx, idx < n_pairs and r_coords[idx] == target
29 |
30 |
31 | def normalize_contig(contig: str, has_chr: bool) -> str:
32 | return ("chr" if has_chr else "") + contig.replace("chr", "")
33 |
34 |
35 | def round_to_base_pos(x, motif_size: int) -> float:
36 | return round(float(x) * motif_size) / motif_size
37 |
38 |
39 | def get_new_seed(rng: np.random.Generator) -> int:
40 | return rng.integers(0, 4096, dtype=int)
41 |
42 |
43 | @cache # TODO: parametrize base_wildcard_threshold
44 | def _mask_low_q_base(base_and_qual: tuple[str, int], base_wildcard_threshold: int = 3) -> str:
45 | return base_and_qual[0] if base_and_qual[1] > base_wildcard_threshold else "X"
46 |
47 |
48 | def calculate_seq_with_wildcards(qs: str, quals: NDArray[np.uint8] | None) -> str:
49 | if quals is None:
50 | return qs # No quality information, so don't do anything
51 | return cat_strs(map(_mask_low_q_base, zip(qs, quals)))
52 |
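# A minimal usage sketch of the helpers above (hypothetical read values):
import numpy as np
from strkit.call.utils import calculate_seq_with_wildcards, normalize_contig

quals = np.array([40, 2, 35, 1], dtype=np.uint8)
masked = calculate_seq_with_wildcards("ACGT", quals)  # -> "AXGX": bases with quality <= 3 become "X"
contig = normalize_contig("chr5", has_chr=False)      # -> "5"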
--------------------------------------------------------------------------------
/strkit/call/validation.py:
--------------------------------------------------------------------------------
1 | import re
2 | from logging import Logger
3 |
4 | __all__ = [
5 | "LocusValidationError",
6 | "valid_motif",
7 | "validate_locus",
8 | ]
9 |
10 | # patterns
11 | RE_VALID_MOTIF = re.compile(r"^[ACGTRYSWKMBDHVN]+$")
12 |
13 |
14 | # exceptions
15 |
16 | class LocusValidationError(ValueError):
17 | def __init__(self, error_str: str, hint_msg: str):
18 | self._error_str = error_str
19 | self._hint_msg = hint_msg
20 | super().__init__(error_str)
21 |
22 | def log_error(self, logger: Logger) -> None:
23 | logger.critical(self._error_str)
24 | logger.critical(self._hint_msg)
25 |
26 |
27 | # functions
28 |
29 | def valid_motif(motif: str) -> bool:
30 | """
31 | Determines whether a motif is valid, i.e., can be used by `strkit call`. Here, valid means "composed of IUPAC
32 | nucleotide codes and no other characters."
33 | :param motif: The motif to assess the validity of.
34 | :return: Whether the motif is valid or not.
35 | """
36 | return RE_VALID_MOTIF.match(motif) is not None
37 |
38 |
39 | def validate_locus(line: int, start: int, end: int, motif: str) -> None:
40 | """
41 | Validate a locus definition for use by STRkit.
42 | :param line: Line number, for logging errors in a catalog BED file.
43 | :param start: Start coordinate; 0-based, inclusive.
44 | :param end: End coordinate; 0-based, exclusive.
45 | :param motif: Motif sequence (to be validated).
46 | """
47 |
48 | if start >= end:
49 | raise LocusValidationError(
50 | f"BED catalog format error: invalid coordinates on line {line}: start ({start}) >= end ({end})",
51 | "BED catalog: coordinates must be 0-based, half-open - [start, end)",
52 | )
53 |
54 | if not valid_motif(motif):
55 | raise LocusValidationError(
56 | f"BED catalog format error: invalid motif on line {line}: {motif}",
57 | "BED catalog: motifs must contain only valid IUPAC nucleotide codes.",
58 | )
59 |
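# A minimal usage sketch: validating a hypothetical catalog BED line and surfacing
# the error string plus format hint through a logger.
import logging
from strkit.call.validation import LocusValidationError, validate_locus

logger = logging.getLogger(__name__)

try:
    validate_locus(line=7, start=1000, end=900, motif="CAG")  # start >= end -> invalid
except LocusValidationError as e:
    e.log_error(logger)  # logs the error message and the BED format hint at CRITICAL level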
--------------------------------------------------------------------------------
/strkit/catalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/catalog/__init__.py
--------------------------------------------------------------------------------
/strkit/catalog/combine.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sys
4 | from ..constants import CALLER_STRAGLR, CHROMOSOMES
5 |
6 | __all__ = [
7 | "combine_catalogs",
8 | ]
9 |
10 |
11 | def combine_catalogs(caller: str, paths: list[str]) -> int:
12 | if caller != CALLER_STRAGLR:
13 | sys.stderr.write(f"Error: This command only supports caller '{CALLER_STRAGLR}'\n")
14 | return 1
15 |
16 | lines = set()
17 |
18 | for path in paths:
19 | if not path.endswith(".bed"):
20 | sys.stderr.write(f"Error: Please supply only .bed files from '{CALLER_STRAGLR}'\n")
21 | return 1
22 |
23 | with open(path, "r") as fh:
24 | for line in fh:
25 | if line.startswith("#"):
26 | continue
27 |
28 | raw_data = line.strip().split("\t")
29 | lines.add((raw_data[0], int(raw_data[1]), int(raw_data[2]), raw_data[3]))
30 |
31 | for line in sorted(lines, key=lambda x: (CHROMOSOMES.index(x[0]), x[1])):
32 | sys.stdout.write("\t".join(map(str, line)) + "\n")
33 |
34 | return 0
35 |
--------------------------------------------------------------------------------
/strkit/constants.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "CALLER_EXPANSIONHUNTER",
3 | "CALLER_HIPSTR",
4 | "CALLER_GANGSTR",
5 | "CALLER_REPEATHMM",
6 | "CALLER_STRAGLR",
7 | "CALLER_TANDEM_GENOTYPES",
8 |
9 | "M_CHROMOSOME_NAMES",
10 | "X_CHROMOSOME_NAMES",
11 | "Y_CHROMOSOME_NAMES",
12 | "SEX_CHROMOSOMES",
13 | "AUTOSOMES",
14 | "CHROMOSOMES",
15 |
16 | "MI_CALLERS",
17 | ]
18 |
19 | CALLER_EXPANSIONHUNTER = "expansionhunter"
20 | CALLER_HIPSTR = "hipstr"
21 | CALLER_LONGTR = "longtr"
22 | CALLER_GANGSTR = "gangstr"
23 | CALLER_GENERIC_VCF = "generic-vcf"
24 | CALLER_REPEATHMM = "repeathmm"
25 | CALLER_STRDUST = "strdust"
26 | CALLER_STRAGLR = "straglr"
27 | CALLER_STRKIT = "strkit"
28 | CALLER_STRKIT_JSON = "strkit-json"
29 | CALLER_STRKIT_VCF = "strkit-vcf"
30 | CALLER_TANDEM_GENOTYPES = "tandem-genotypes"
31 | CALLER_TRGT = "trgt"
32 |
33 | M_CHROMOSOME_NAMES = ("chrM", "M")
34 | X_CHROMOSOME_NAMES = ("chrX", "X")
35 | Y_CHROMOSOME_NAMES = ("chrY", "Y")
36 | SEX_CHROMOSOMES = (*X_CHROMOSOME_NAMES, *Y_CHROMOSOME_NAMES)
37 |
38 | AUTOSOMES = (
39 | *map(str, range(1, 23)),
40 | *(f"chr{i}" for i in range(1, 23)),
41 | )
42 |
43 | CHROMOSOMES = (
44 | *AUTOSOMES,
45 | *SEX_CHROMOSOMES,
46 | )
47 |
48 |
49 | MI_CALLERS = (
50 | CALLER_EXPANSIONHUNTER,
51 | CALLER_GANGSTR,
52 | CALLER_GENERIC_VCF,
53 | CALLER_LONGTR,
54 | CALLER_REPEATHMM,
55 | CALLER_STRDUST,
56 | CALLER_STRAGLR,
57 | CALLER_STRKIT,
58 | CALLER_STRKIT_JSON,
59 | CALLER_STRKIT_VCF,
60 | CALLER_TANDEM_GENOTYPES,
61 | CALLER_TRGT,
62 | )
63 |
--------------------------------------------------------------------------------
/strkit/convert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/convert/__init__.py
--------------------------------------------------------------------------------
/strkit/convert/_bed_4.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from logging import Logger
3 |
4 | __all__ = [
5 | "trf_to_bed_4",
6 | ]
7 |
8 |
9 | def trf_to_bed_4(trf_data: list, _logger: Logger):
10 | for item in trf_data:
11 | sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n")
12 |
--------------------------------------------------------------------------------
/strkit/convert/constants.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "IN_FORMAT_TRF",
3 | "IN_FORMAT_TRGT",
4 | "CONVERTER_IN_FORMATS",
5 | ]
6 |
7 | IN_FORMAT_TRF = "trf"
8 | IN_FORMAT_TRGT = "trgt"
9 |
10 | CONVERTER_IN_FORMATS = (
11 | IN_FORMAT_TRF,
12 | IN_FORMAT_TRGT,
13 | )
14 |
--------------------------------------------------------------------------------
/strkit/convert/converter.py:
--------------------------------------------------------------------------------
1 | from logging import Logger
2 | from typing import Callable
3 |
4 | from ._bed_4 import trf_to_bed_4
5 | from .constants import IN_FORMAT_TRF, IN_FORMAT_TRGT, CONVERTER_IN_FORMATS
6 | from .expansionhunter import trf_bed_to_eh
7 | from .hipstr import trf_bed_to_hipstr
8 | from .gangstr import trf_bed_to_gangstr
9 | from .trgt import trgt_bed_to_bed4, trf_or_strkit_bed_to_trgt
10 |
11 | import strkit.constants as c
12 |
13 | __all__ = [
14 | "CONVERTER_OUTPUT_FORMATS",
15 | "convert",
16 | ]
17 |
18 | convert_formats: dict[tuple[str, str], Callable[[list, Logger], None]] = {
19 | # TRF converters:
20 | (IN_FORMAT_TRF, c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh,
21 | (IN_FORMAT_TRF, c.CALLER_HIPSTR): trf_bed_to_hipstr,
22 | (IN_FORMAT_TRF, c.CALLER_GANGSTR): trf_bed_to_gangstr,
23 |     (IN_FORMAT_TRF, c.CALLER_REPEATHMM): lambda _data, _logger: None,  # unreachable via convert(); RepeatHMM accepts TRF BEDs directly
24 | (IN_FORMAT_TRF, c.CALLER_STRAGLR): trf_to_bed_4,
25 |     (IN_FORMAT_TRF, c.CALLER_STRKIT): trf_to_bed_4,  # or can just leave as-is
26 | (IN_FORMAT_TRF, c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4,
27 | (IN_FORMAT_TRF, c.CALLER_TRGT): trf_or_strkit_bed_to_trgt,
28 | # TRGT converters:
29 | (IN_FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4,
30 | (IN_FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_bed4,
31 | (IN_FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4,
32 | }
33 |
34 | CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats)))
35 |
36 |
37 | def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int:
38 | out_format = out_format.lower()
39 |
40 | if in_format == IN_FORMAT_TRF:
41 | if out_format == c.CALLER_REPEATHMM:
42 | logger.critical(f"No need to convert for '{out_format}'; TRF BED files are accepted as input")
43 | return 1
44 | elif out_format == c.CALLER_STRKIT:
45 | logger.info("STRkit can use TRF BED files as-is; will convert to a BED4 file")
46 |
47 | if in_format not in CONVERTER_IN_FORMATS:
48 |         logger.critical(f"Unsupported input format: {in_format}")
49 |         return 1
50 | if (in_format, out_format) not in convert_formats:
51 | logger.critical(f"Unsupported conversion: {in_format} -> {out_format} (no converter defined)")
52 | return 1
53 |
54 | with open(in_file, "r") as tf:
55 | data = [line.strip().split("\t") for line in tf]
56 |
57 | convert_formats[(in_format, out_format)](data, logger)
58 | return 0
59 |
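# A minimal usage sketch: converting a hypothetical TRF-formatted BED catalog into GangSTR's
# expected format. Converted records are written to stdout; the return value follows the
# 0 = success / 1 = error convention used above.
import logging
from strkit.convert.converter import convert

exit_code = convert("catalog.trf.bed", "trf", "gangstr", logging.getLogger(__name__))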
--------------------------------------------------------------------------------
/strkit/convert/expansionhunter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from logging import Logger
4 |
5 | __all__ = [
6 | "trf_bed_to_eh",
7 | ]
8 |
9 |
10 | def trf_bed_to_eh(trf_data: list, _logger: Logger):
11 | eh_formatted_loci = []
12 |
13 | for i, item in enumerate(trf_data, 1):
14 | eh_formatted_loci.append({
15 | "LocusId": f"Locus{i}",
16 | "LocusStructure": f"({item[-1]})*",
17 | "ReferenceRegion": f"{item[0]}:{item[1]}-{item[2]}",
18 | "VariantType": "Repeat",
19 | })
20 |
21 | sys.stdout.write(json.dumps(eh_formatted_loci, indent=2))
22 |
--------------------------------------------------------------------------------
/strkit/convert/gangstr.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from logging import Logger
3 |
4 | __all__ = [
5 | "trf_bed_to_gangstr",
6 | ]
7 |
8 |
9 | def trf_bed_to_gangstr(trf_data: list, _logger: Logger):
10 | for i, item in enumerate(trf_data, 1):
11 | sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n")
12 |
--------------------------------------------------------------------------------
/strkit/convert/hipstr.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from logging import Logger
3 |
4 | __all__ = [
5 | "trf_bed_to_hipstr",
6 | ]
7 |
8 |
9 | def trf_bed_to_hipstr(trf_data: list, _logger: Logger):
10 | for i, item in enumerate(trf_data, 1):
11 | sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n")
12 |
--------------------------------------------------------------------------------
/strkit/convert/trgt.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from logging import Logger
3 |
4 | __all__ = [
5 | "trgt_bed_to_bed4",
6 | "trf_or_strkit_bed_to_trgt",
7 | ]
8 |
9 | from strkit.iupac import get_iupac_code_for_nt_set
10 |
11 |
12 | def trgt_bed_to_bed4(trgt_data: list, logger: Logger):
13 | """
14 | Converts a TRGT repeat catalog to the STRkit/BED4 catalog format.
15 | :param trgt_data: The loaded TRGT catalog (split by tab).
16 | :param logger: A logger instance for issuing conversion failure warnings.
17 | """
18 |
19 | for line, data in enumerate(trgt_data, 1):
20 | structure_data = {j[0]: j[1] for j in (i.split("=") for i in data[3].split(";"))}
21 | motifs = structure_data["MOTIFS"].split(",")
22 |
23 | if len(motifs) > 1:
24 | # We can do some basic IUPAC code normalization here for simple compound STR structures in TRGT catalogs:
25 | if (
26 | structure_data["STRUC"] in {"".join(f"({m})n" for m in motifs), f"<{structure_data['ID']}>"}
27 | and len({len(m) for m in motifs}) == 1
28 | ):
29 | failed: bool = False
30 | combined_motif_bases = []
31 | for bases in zip(*motifs):
32 | bases_set = set(bases)
33 | if len(bases_set) == 1: # same base in all motifs
34 | combined_motif_bases.append(next(iter(bases_set)))
35 | elif iupac_code := get_iupac_code_for_nt_set(bases_set):
36 | # find IUPAC code representing consensus "base" and append it to the motif
37 | combined_motif_bases.append(iupac_code)
38 | else: # something went wrong (invalid base?)
39 | failed = True
40 | break
41 |
42 | if not failed: # found a consensus base for the multiple-motif STR, so we can convert it
43 | sys.stdout.write("\t".join((*data[:3], "".join(combined_motif_bases))) + "\n")
44 | continue
45 |
46 | data_str = "\t".join(data)
47 | logger.warning(f"Could not convert complex locus at line {line}: {data_str}")
48 | continue
49 |
50 | sys.stdout.write("\t".join((*data[:3], motifs[0])) + "\n")
51 |
52 |
53 | def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger):
54 | """
55 |     Converts a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog.
56 | :param trf_data: The loaded BED catalog data.
57 | :param _logger: Logger instance (unused).
58 | """
59 |
60 |     for i, item in enumerate(trf_data, 1):
61 |         motif = item[-1]
62 |         sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")
63 |
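# A minimal sketch of the IUPAC merging performed by trgt_bed_to_bed4, using a hypothetical
# two-motif TRGT record: CAG and CAA differ only at the third base (G/A), which collapses to
# the IUPAC code R, so the locus is emitted with motif CAR.
import logging
from strkit.convert.trgt import trgt_bed_to_bed4

trgt_bed_to_bed4(
    [["chr1", "100", "145", "ID=L1;MOTIFS=CAG,CAA;STRUC=(CAG)n(CAA)n"]],
    logging.getLogger(__name__),
)
# stdout: "chr1\t100\t145\tCAR\n"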
--------------------------------------------------------------------------------
/strkit/exceptions.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "ParamError",
3 | "InputError",
4 | ]
5 |
6 |
7 | class ParamError(Exception):
8 | pass
9 |
10 |
11 | class InputError(Exception):
12 | pass
13 |
--------------------------------------------------------------------------------
/strkit/iupac.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "IUPAC_NUCLEOTIDE_CODES",
3 | "IUPAC_NUCLEOTIDE_CODES_REVERSE",
4 | "get_iupac_code_for_nt_set",
5 | ]
6 |
7 | # IUPAC nucleotide codes representing >1 nucleotide (quasi-"wildcards"):
8 | # - It's important that the values remain sorted, so we can do a reverse-lookup (see below)
9 | IUPAC_NUCLEOTIDE_CODES: dict[str, tuple[str, ...]] = {
10 | "R": ("A", "G"),
11 | "Y": ("C", "T"),
12 | "S": ("C", "G"),
13 | "W": ("A", "T"),
14 | "K": ("G", "T"),
15 | "M": ("A", "C"),
16 | "B": ("C", "G", "T"),
17 |     "D": ("A", "G", "T"),
18 | "H": ("A", "C", "T"),
19 | "V": ("A", "C", "G"),
20 | "N": ("A", "C", "G", "T"),
21 | }
22 |
23 | # Lookup table of {(sorted nucleotides): IUPAC code}
24 | IUPAC_NUCLEOTIDE_CODES_REVERSE: dict[tuple[str, ...], str] = {
25 | v: k for k, v in IUPAC_NUCLEOTIDE_CODES.items()
26 | }
27 |
28 |
29 | def get_iupac_code_for_nt_set(nt_set: set[str]) -> str | None:
30 | """
31 | Given a set of standard nucleotides (ATGC), return an IUPAC code which represents the set.
32 | :param nt_set: A set of nucleotides (A, T, G, or C). Any other base will result in a None return.
33 | :return: An IUPAC nucleotide code representing the set of nucleotides, or None given an invalid nucleotide set.
34 | """
35 | return IUPAC_NUCLEOTIDE_CODES_REVERSE.get(tuple(sorted(nt_set)))
36 |
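# A minimal usage sketch for the reverse lookup above:
from strkit.iupac import get_iupac_code_for_nt_set

get_iupac_code_for_nt_set({"A", "G"})       # -> "R"
get_iupac_code_for_nt_set({"A", "G", "T"})  # -> "D"
get_iupac_code_for_nt_set({"A", "Z"})       # -> None (not a standard nucleotide set)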
--------------------------------------------------------------------------------
/strkit/json.py:
--------------------------------------------------------------------------------
1 | import orjson as json
2 |
3 |
4 | __all__ = [
5 | "Serializable",
6 | "json",
7 | "dumps",
8 | "dumps_indented",
9 | ]
10 |
11 |
12 | Serializable = dict | list | tuple | str | int | float
13 |
14 |
15 | def dumps(v: Serializable) -> bytes:
16 | return json.dumps(v, option=json.OPT_NON_STR_KEYS | json.OPT_SERIALIZE_NUMPY)
17 |
18 |
19 | def dumps_indented(v: Serializable) -> bytes:
20 | return json.dumps(v, option=json.OPT_NON_STR_KEYS | json.OPT_INDENT_2 | json.OPT_SERIALIZE_NUMPY)
21 |
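# A minimal usage sketch: the option flags above let NumPy arrays and non-string keys pass
# straight through serialization (hypothetical payload):
import numpy as np
from strkit.json import dumps

payload = {"call": np.array([12, 15]), 4: "non-string key"}
dumps(payload)  # -> b'{"call":[12,15],"4":"non-string key"}'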
--------------------------------------------------------------------------------
/strkit/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 |
4 | __all__ = [
5 | "get_main_logger",
6 | "attach_stream_handler",
7 | "create_process_logger",
8 | "log_levels",
9 | ]
10 |
11 | fmt = logging.Formatter(fmt="%(name)s:\t[%(levelname)s]\t%(message)s")
12 |
13 |
14 | def get_main_logger(level: int = logging.DEBUG):
15 | logger = logging.getLogger("strkit-main")
16 | logger.setLevel(level)
17 | return logger
18 |
19 |
20 | def attach_stream_handler(level: int, logger_=None):
21 | ch = logging.StreamHandler(sys.stderr)
22 | ch.setLevel(level)
23 | ch.setFormatter(fmt)
24 | logger_.addHandler(ch)
25 |
26 |
27 | def create_process_logger(pid: int, level: int):
28 | lg = logging.getLogger(f"strkit-{pid}")
29 | lg.setLevel(level)
30 | if not lg.handlers:
31 | attach_stream_handler(level, logger_=lg)
32 | return lg
33 |
34 |
35 | log_levels = {
36 | "debug": logging.DEBUG,
37 | "info": logging.INFO,
38 | "warning": logging.WARNING,
39 | "error": logging.ERROR,
40 | }
41 |
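# A minimal usage sketch: a main logger with a stderr stream handler for the parent process,
# plus a per-worker logger keyed by PID (hypothetical log messages):
import logging
import os
from strkit.logger import attach_stream_handler, create_process_logger, get_main_logger, log_levels

main_logger = get_main_logger(log_levels["info"])
attach_stream_handler(logging.INFO, logger_=main_logger)
main_logger.info("starting up")

worker_logger = create_process_logger(os.getpid(), logging.DEBUG)
worker_logger.debug("worker-level message")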
--------------------------------------------------------------------------------
/strkit/mi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/mi/__init__.py
--------------------------------------------------------------------------------
/strkit/mi/base.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import uuid
5 | from abc import ABC, abstractmethod
6 | from pathlib import Path
7 | from typing import Any
8 |
9 | from strkit.logger import get_main_logger
10 | from .intervals import (
11 | LociDictOfDict,
12 | LociDictOfList,
13 | build_loci_dict_of_dict_from_file,
14 | build_loci_dict_of_list_from_file,
15 | overlapping_loci_dict_of_dict,
16 | overlapping_loci_dict_of_list,
17 | )
18 | from .result import MIKind, MIContigResult, MIResult
19 |
20 | __all__ = [
21 | "SEX_CHROMOSOMES",
22 | "BaseCalculator",
23 | ]
24 |
25 |
26 | SEX_CHROMOSOMES = {"chrX", "X", "chrY", "Y"} # TODO: proper parametrization
27 |
28 |
29 | # noinspection PyUnusedLocal
30 | class BaseCalculator(ABC):
31 | def __init__(
32 | self,
33 | child_call_file: Path,
34 | mother_call_file: Path,
35 | father_call_file: Path,
36 |
37 | child_id: str | None = None,
38 | mother_id: str | None = None,
39 | father_id: str | None = None,
40 |
41 | loci_file: str | None = None,
42 | exclude_file: str | None = None,
43 | one_based_loci: bool = False,
44 |
45 | widen: float = 0,
46 |
47 | mismatch_out_mi: MIKind = "pm1",
48 | test_to_perform: str = "none", # means mismatch_out_mi has no effect
49 | sig_level: float = 0.05,
50 | mt_corr: str = "none",
51 | only_phased: bool = False,
52 |
53 | debug: bool = False,
54 | logger: logging.Logger | None = None,
55 | ):
56 | self._debug: bool = debug
57 | self._logger: logging.Logger = logger or get_main_logger()
58 |
59 | self._child_call_file: Path = child_call_file
60 | self._mother_call_file: Path = mother_call_file
61 | self._father_call_file: Path = father_call_file
62 |
63 | self._child_id: str | None = child_id
64 | self._mother_id: str | None = mother_id
65 | self._father_id: str | None = father_id
66 |
67 | self._loci_file: str | None = loci_file
68 | self._loci_dict: LociDictOfDict = build_loci_dict_of_dict_from_file(loci_file, one_based_loci)
69 | self._loci_dict_cache_key: str = str(uuid.uuid4())
70 | if self._loci_file is not None:
71 | self._logger.debug(
72 | "Built loci dict of size %d with contigs %s",
73 | sum(len(loc) for loc in self._loci_dict.values()),
74 | tuple(self._loci_dict.keys()),
75 | )
76 |
77 | self._exclude_file: str | None = exclude_file
78 | self._exclude_dict: LociDictOfList = build_loci_dict_of_list_from_file(exclude_file, one_based_loci)
79 | if self._exclude_file is not None:
80 | self._logger.debug(
81 | "Built exclude dict of size %d with contigs %s",
82 |                 sum(len(excl) for excl in self._exclude_dict.values()),
83 | tuple(self._exclude_dict.keys()),
84 | )
85 |
86 | self._decimal_threshold: float = 0.5
87 | self._widen: float = widen
88 |
89 | self._mismatch_out_mi: MIKind = mismatch_out_mi
90 |
91 | self._test_to_perform: str = test_to_perform
92 | self._sig_level: float = sig_level
93 | self._mt_corr: str = mt_corr
94 | self._only_phased: bool = only_phased
95 |
96 | self._cache: dict[str, Any] = {}
97 |
98 | @property
99 | def test_to_perform(self) -> str:
100 | return self._test_to_perform
101 |
102 | @property
103 | def sig_level(self) -> float:
104 | return self._sig_level
105 |
106 | @property
107 | def mt_corr(self) -> str:
108 | return self._mt_corr
109 |
110 | def get_loci_overlapping(
111 | self, contig: str, start: int, end: int, first_only: bool
112 | ) -> list[tuple[int, int, list[str]]]:
113 | return overlapping_loci_dict_of_dict(
114 | contig, start, end, self._loci_dict, first_only, dict_cache_key=self._loci_dict_cache_key
115 | )
116 |
117 | def should_exclude_locus(self, contig: str, start: int, end: int) -> bool:
118 | return any(True for _ in overlapping_loci_dict_of_list(contig, start, end, self._exclude_dict, True))
119 |
120 | def should_skip_locus(
121 | self, contig: str, start: int, end: int, cached_overlapping: list | None = None
122 | ) -> str | None:
123 | # Returns either a reason string (if yes) or None (=== no)
124 |
125 | # Check to make sure call is present in TRF BED file, if it is specified
126 | # Check to make sure the locus is not excluded via overlap with exclude BED
127 |
128 | if not self._loci_file or not self._loci_dict:
129 | return None
130 |
131 | if not (cached_overlapping or self.get_loci_overlapping(contig, start, end, True)):
132 | return "no overlapping loci"
133 |
134 | if self.should_exclude_locus(contig, start, end):
135 | return "should_exclude_locus returned True"
136 |
137 | return None
138 |
139 | @abstractmethod
140 | def _get_sample_contigs(self) -> tuple[set, set, set]:
141 | return set(), set(), set()
142 |
143 | def get_trio_contigs(self, include_sex_chromosomes: bool = False) -> set:
144 | mc, fc, cc = self._get_sample_contigs()
145 |
146 | contig_set = mc.intersection(fc).intersection(cc)
147 |
148 | if include_sex_chromosomes: # TODO: proper parametrization
149 | if "Y" in cc:
150 | contig_set = contig_set.union({"X", "Y"})
151 | elif "chrY" in cc:
152 | contig_set = contig_set.union({"chrX", "chrY"})
153 | elif "X" in cc:
154 | contig_set = contig_set.union({"X"})
155 | elif "chrX" in cc:
156 | contig_set = contig_set.union({"chrX"})
157 | else:
158 | contig_set = contig_set.difference(SEX_CHROMOSOMES)
159 |
160 | if self._loci_dict:
161 | # Limit contig set to only contigs which are in the locus dictionary if one is specified.
162 | contig_set = contig_set.intersection(self._loci_dict.keys())
163 |
164 | self._logger.debug("Got %d intersection trio contigs", len(contig_set))
165 |
166 | return contig_set
167 |
168 | @abstractmethod
169 | def calculate_contig(self, contig: str) -> MIContigResult:
170 | return MIContigResult(contig)
171 |
172 | @staticmethod
173 | def _updated_mi_res(res: float | None, v: int | float | None) -> float | None:
174 | return None if v is None else ((res or 0) + v)
175 |
176 | def calculate(self, included_contigs: set) -> MIResult | None:
177 | # copy number
178 | res: float = 0
179 | res_pm1: float = 0
180 | res_95_ci: float | None = None
181 | res_99_ci: float | None = None
182 | # sequence
183 | res_seq: float | None = None
184 | res_sl: float | None = None
185 | res_sl_pm1: float | None = None
186 |
187 | n_total: int = 0
188 |
189 | contig_results = []
190 | output_loci = []
191 |
192 | for contig in sorted(included_contigs):
193 | self._logger.info("Processing contig %s", contig)
194 |
195 | contig_result = self.calculate_contig(contig)
196 | contig_results.append(contig_result)
197 |
198 | r, nm = contig_result.process_loci(
199 | mismatch_out_mi=self._mismatch_out_mi, calculate_non_matching=self.test_to_perform == "none"
200 | )
201 |
202 | value_95_ci = r["ci_95"]
203 | value_99_ci = r["ci_99"]
204 | value_seq = r["seq"]
205 | value_sl = r["sl"]
206 | value_sl_pm1 = r["sl_pm1"]
207 |
208 | res += r["strict"]
209 | res_pm1 += r["pm1"]
210 | res_95_ci = self._updated_mi_res(res_95_ci, value_95_ci)
211 | res_99_ci = self._updated_mi_res(res_99_ci, value_99_ci)
212 | res_seq = self._updated_mi_res(res_seq, value_seq)
213 | res_sl = self._updated_mi_res(res_sl, value_sl)
214 | res_sl_pm1 = self._updated_mi_res(res_sl_pm1, value_sl_pm1)
215 |
216 | n_total += len(contig_result)
217 | output_loci.extend(nm)
218 |
219 | logger_fmt = "Finished processing contig %s; n_total=%d. Current value: %.2f%%, ±1: %.2f%%"
220 | logger_args = [contig_result.contig, n_total, res / n_total * 100, res_pm1 / n_total * 100]
221 |
222 | extras = (
223 | (res_95_ci, "95%% CI"),
224 | (res_99_ci, "99%% CI"),
225 | (res_seq, "seq"),
226 | (res_sl, "s.l."),
227 | (res_sl_pm1, "s.l.±1"),
228 | )
229 |
230 | for val, fmt_txt in extras:
231 | if val is not None:
232 | logger_fmt += f", {fmt_txt}: %.2f%%"
233 | logger_args.append(val / n_total * 100)
234 |
235 | self._logger.info(logger_fmt, *logger_args)
236 |
237 | if n_total == 0:
238 | self._logger.warning("No common loci found")
239 | return None
240 |
241 | res /= n_total
242 | res_pm1 /= n_total
243 | res_95_ci = None if res_95_ci is None else (res_95_ci / n_total)
244 | res_99_ci = None if res_99_ci is None else (res_99_ci / n_total)
245 | res_seq = None if res_seq is None else (res_seq / n_total)
246 | res_sl = None if res_sl is None else (res_sl / n_total)
247 |         res_sl_pm1 = None if res_sl_pm1 is None else (res_sl_pm1 / n_total)
248 |
249 | mi_res = MIResult(
250 | {
251 | "strict": res,
252 | "pm1": res_pm1,
253 | "ci_95": res_95_ci,
254 | "ci_99": res_99_ci,
255 | "seq": res_seq,
256 | "sl": res_sl,
257 | "sl_pm1": res_sl_pm1,
258 | },
259 | contig_results,
260 | output_loci,
261 | self._widen,
262 | self.test_to_perform,
263 | self.sig_level,
264 | self.mt_corr,
265 | logger=self._logger,
266 | )
267 |
268 | if self.test_to_perform != "none":
269 | mi_res.correct_for_multiple_testing() # Also calculates new output loci
270 |
271 | return mi_res
272 |
--------------------------------------------------------------------------------
/strkit/mi/expansionhunter.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pysam
4 |
5 | from .base import BaseCalculator
6 | from .result import MIContigResult, MILocusData
7 | from .vcf_utils import VCFCalculatorMixin
8 | from ..utils import parse_cis
9 |
10 | __all__ = ["ExpansionHunterCalculator"]
11 |
12 |
13 | def _parse_allele(a: int | str | None) -> int | None:
14 | if isinstance(a, str):
15 | if a == ".":
16 | return None
17 | return int(a)
18 | return a
19 |
20 |
21 | def _unzip_gt(vals) -> tuple[tuple[int | float | None, ...], tuple[int | float | None, ...]]:
22 | try:
23 | return (_parse_allele(vals[0][0]), _parse_allele(vals[1][0])), parse_cis((vals[0][1], vals[1][1]))
24 | except ValueError:
25 | return (None, None), (None, None)
26 |
27 |
28 | class ExpansionHunterCalculator(BaseCalculator, VCFCalculatorMixin):
29 | def _get_sample_contigs(self) -> tuple[set, set, set]:
30 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
31 |
32 | def calculate_contig(self, contig: str) -> MIContigResult:
33 | cr = MIContigResult(contig, includes_95_ci=True)
34 |
35 | mvf = pysam.VariantFile(str(self._mother_call_file))
36 | fvf = pysam.VariantFile(str(self._father_call_file))
37 | cvf = pysam.VariantFile(str(self._child_call_file))
38 |
39 | # We want all common loci, so loop through the child and then look for the loci in the parent calls
40 | # TODO: What to do about filtering etc? !!!!!!!!!!!!!!!!!!!!!!!!
41 | # !!!!!!!!!!!!!!!!
42 | # - Q score
43 | # - CIs are "proper" - not inverted or weird
44 |
45 | for cv in cvf.fetch(contig):
46 | mv = next(mvf.fetch(contig, cv.start, cv.stop), None)
47 | fv = next(fvf.fetch(contig, cv.start, cv.stop), None)
48 |
49 | # TODO: Handle sex chromosomes
50 |
51 | k = (contig, cv.start, cv.stop)
52 |
53 | if self.should_skip_locus(*k):
54 | continue
55 |
56 | cr.seen_locus(*k)
57 |
58 | if mv is None or fv is None:
59 | # Variant isn't found in at least one of the parents, so we can't do anything with it.
60 | # TODO: We need to actually check calls, and check with sample ID, not just assume
61 | continue
62 |
63 | # TODO: Handle missing samples gracefully
64 | # TODO: Handle wrong formatted VCFs gracefully
65 |
66 | cs = cv.samples[self._child_id or 0]
67 | ms = mv.samples[self._mother_id or 0]
68 | fs = fv.samples[self._father_id or 0]
69 |
70 | cs_reps = tuple(sorted(zip(cs["REPCN"].split("/"), cs["REPCI"].split("/")), key=lambda x: x[0]))
71 | ms_reps = tuple(sorted(zip(ms["REPCN"].split("/"), ms["REPCI"].split("/")), key=lambda x: x[0]))
72 | fs_reps = tuple(sorted(zip(fs["REPCN"].split("/"), fs["REPCI"].split("/")), key=lambda x: x[0]))
73 |
74 | c_gt, c_gt_95_ci = _unzip_gt(cs_reps)
75 | m_gt, m_gt_95_ci = _unzip_gt(ms_reps)
76 | f_gt, f_gt_95_ci = _unzip_gt(fs_reps)
77 |
78 | if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None:
79 | # None call in VCF, skip this call
80 | continue
81 |
82 | cr.append(MILocusData(
83 | contig=contig,
84 | start=cv.start,
85 | end=cv.stop,
86 | motif=cv.info["RU"],
87 |
88 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
89 | child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
90 |
91 | reference_copies=cv.info["REF"],
92 | ))
93 |
94 | return cr
95 |
--------------------------------------------------------------------------------
/strkit/mi/gangstr.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pysam
4 |
5 | from .base import BaseCalculator
6 | from .result import MIContigResult, MILocusData
7 | from .vcf_utils import VCFCalculatorMixin
8 | from ..utils import parse_cis
9 |
10 | __all__ = ["GangSTRCalculator"]
11 |
12 |
13 | class GangSTRCalculator(BaseCalculator, VCFCalculatorMixin):
14 | def _get_sample_contigs(self) -> tuple[set, set, set]:
15 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
16 |
17 | def calculate_contig(self, contig: str) -> MIContigResult:
18 | cr = MIContigResult(contig, includes_95_ci=True)
19 |
20 | mvf = pysam.VariantFile(str(self._mother_call_file))
21 | fvf = pysam.VariantFile(str(self._father_call_file))
22 | cvf = pysam.VariantFile(str(self._child_call_file))
23 |
24 | # We want all common loci, so loop through the child and then look for the loci in the parent calls
25 | # TODO: What to do about filtering etc? !!!!!!!!!!!!!!!!!!!!!!!!
26 | # !!!!!!!!!!!!!!!!
27 | # - Q score
28 | # - CIs are "proper" - not inverted or weird
29 |
30 | for cv in cvf.fetch(contig):
31 | mv = next(mvf.fetch(contig, cv.start, cv.stop), None)
32 | fv = next(fvf.fetch(contig, cv.start, cv.stop), None)
33 |
34 | # TODO: Handle sex chromosomes
35 |
36 | # Check to make sure call is present in TRF BED file, if it is specified
37 | k1 = (contig, cv.start, cv.stop)
38 | k2 = (contig, cv.start + 1, cv.stop + 1)
39 |
40 | if self.should_skip_locus(*k1) or self.should_skip_locus(*k2):
41 | continue
42 |
43 | cr.seen_locus(*k1)
44 |
45 | if mv is None or fv is None:
46 | # Variant isn't found in at least one of the parents, so we can't do anything with it.
47 | # TODO: We need to actually check calls, and check with sample ID, not just assume
48 | continue
49 |
50 | # TODO: Handle missing samples gracefully
51 | # TODO: Handle wrong formatted VCFs gracefully
52 |
53 | cs = cv.samples[self._child_id or 0]
54 | ms = mv.samples[self._mother_id or 0]
55 | fs = fv.samples[self._father_id or 0]
56 |
57 | c_gt = cs["REPCN"]
58 | m_gt = ms["REPCN"]
59 | f_gt = fs["REPCN"]
60 |
61 | try:
62 | c_gt_95_ci = parse_cis(cs["REPCI"])
63 | m_gt_95_ci = parse_cis(ms["REPCI"])
64 | f_gt_95_ci = parse_cis(fs["REPCI"])
65 | except (ValueError, TypeError):
66 | # None call in VCF, skip this call
67 | continue
68 |
69 | if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None:
70 | # None call in VCF, skip this call
71 | continue
72 |
73 | cr.append(MILocusData(
74 | contig=contig,
75 | start=cv.start,
76 | end=cv.stop,
77 | motif=cv.info["RU"],
78 |
79 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
80 | child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
81 |
82 | reference_copies=cv.info["REF"],
83 | ))
84 |
85 | return cr
86 |
--------------------------------------------------------------------------------
/strkit/mi/generic_vcf.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pysam
4 |
5 | from .base import BaseCalculator
6 | from .result import MIContigResult, MILocusData
7 | from .vcf_utils import VCFCalculatorMixin
8 |
9 | __all__ = ["GenericVCFLengthCalculator"]
10 |
11 |
12 | class GenericVCFLengthCalculator(BaseCalculator, VCFCalculatorMixin):
13 | def _get_sample_contigs(self) -> tuple[set, set, set]:
14 | contigs = self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
15 | self._logger.debug(
16 | "Got trio contigs - child: %d, mother: %d, father: %d",
17 | len(contigs[2]), len(contigs[0]), len(contigs[1]),
18 | )
19 | return contigs
20 |
21 | def calculate_contig(self, contig: str) -> MIContigResult:
22 | cr = MIContigResult(contig, includes_seq=True)
23 |
24 | mvf = pysam.VariantFile(str(self._mother_call_file))
25 | fvf = pysam.VariantFile(str(self._father_call_file))
26 | cvf = pysam.VariantFile(str(self._child_call_file))
27 |
28 | # We want all common loci, so loop through the child and then look for the loci in the parent calls
29 |
30 | for cv in cvf.fetch(contig):
31 | # child variant start/end, as determined by the reference allele sequence
32 | cv_start = cv.start
33 | cv_stop = cv.stop
34 |
35 | # hack for LongTR: if we override start/end in INFO, use those values as the true start/end in the context
36 | # of the locus boundaries
37 | if "START" in cv.info:
38 | cv_start = int(cv.info["START"]) - 1
39 | if "END" in cv.info:
40 | cv_stop = int(cv.info["END"])
41 |
42 | mv = next(mvf.fetch(contig, cv_start, cv_stop), None)
43 | fv = next(fvf.fetch(contig, cv_start, cv_stop), None)
44 |
45 | # TODO: Handle sex chromosomes
46 |
47 | k = (contig, cv_start, cv_stop)
48 |
49 | overlapping = self.get_loci_overlapping(k[0], k[1], k[2], True)
50 |
51 | if r := self.should_skip_locus(k[0], k[1], k[2], cached_overlapping=overlapping):
52 | self._logger.debug(f"Skipping locus {k}: {r}")
53 | continue
54 |
55 | cr.seen_locus(*k)
56 |
57 | if mv is None or fv is None:
58 | # Variant isn't found in at least one of the parents, so we can't do anything with it.
59 | # TODO: We need to actually check calls, and check with sample ID, not just assume
60 | self._logger.debug(f"Skipping locus {k}: mv or fv is None")
61 | continue
62 |
63 | # TODO: Handle missing samples gracefully
64 | # TODO: Handle wrong formatted VCFs gracefully
65 |
66 | # Need to dig up original motif from the locus file - thus, the original locus file is required.
67 | motif: str = overlapping[0][-1][0]
68 | if not motif:
69 | self._logger.debug(f"Skipping locus {k}: motif is false-y")
70 | continue
71 |
72 | motif_len = len(motif)
73 |
74 | cs = cv.samples[self._child_id or 0]
75 | ms = mv.samples[self._mother_id or 0]
76 | fs = fv.samples[self._father_id or 0]
77 |
78 | c_seq_gt = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) if None not in cs["GT"] else None
79 | c_gt = tuple(round(len(a) / motif_len) for a in c_seq_gt) if c_seq_gt is not None else None
80 | m_seq_gt = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) if None not in ms["GT"] else None
81 | m_gt = tuple(round(len(a) / motif_len) for a in m_seq_gt) if m_seq_gt is not None else None
82 | f_seq_gt = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) if None not in fs["GT"] else None
83 | f_gt = tuple(round(len(a) / motif_len) for a in f_seq_gt) if f_seq_gt is not None else None
84 |
85 | if c_gt is None or m_gt is None or f_gt is None:
86 | # None call in VCF, skip this call
87 | continue
88 |
89 | cr.append(MILocusData(
90 | contig=contig,
91 | start=cv_start,
92 | end=cv_stop,
93 | motif=motif,
94 |
95 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
96 |
97 | # sequence may not line up with start/end if VCF record INFO START/END entries are used
98 | child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt,
99 | ))
100 |
101 | return cr
102 |
--------------------------------------------------------------------------------
/strkit/mi/intervals.py:
--------------------------------------------------------------------------------
1 | import bisect
2 | from pathlib import Path
3 | from typing import Iterable
4 |
5 | from strkit.utils import idx_0_getter, idx_1_getter
6 |
7 |
8 | def _line_filter_fn(s: str) -> bool:
9 | """
10 | Filter function to skip blank lines and comments
11 | :param s: line of a file
12 | :return: whether the line is not blank and is not a comment
13 | """
14 | return s and not s.startswith("#")
15 |
16 |
17 | # key: contig, value: dict of (key: coordinate interval, value: list of extra values)
18 | LociDictOfDict = dict[str, dict[tuple[int, int], list[str]]]
19 |
20 | # key: contig, value: list of coordinate intervals
21 | LociDictOfList = dict[str, list[tuple[int, int]]]
22 |
23 |
24 | def build_loci_dict_of_dict_from_file(loci_path: str | Path | None, one_based: bool) -> LociDictOfDict:
25 | # Assumes standard BED format - 0-based, half-open intervals, unless one_based=True,
26 | # in which case assume 1-based closed intervals and adjust to be 0-based half-closed.
27 |
28 | if not loci_path:
29 | return {}
30 |
31 | start_adj = -1 * int(one_based) # -1 if converting from 1-based closed to 0-based half-open, otherwise do nothing.
32 |
33 | res: LociDictOfDict = {}
34 |
35 | with open(loci_path, "r") as lf:
36 | for line in filter(_line_filter_fn, map(str.strip, lf)):
37 | ls = line.split("\t")
38 |
39 | contig, ss, es = ls[:3]
40 |
41 | if contig not in res:
42 | res[contig] = {}
43 |
44 | res[contig][int(ss) + start_adj, int(es)] = ls[3:]
45 |
46 | return res
47 |
48 |
49 | def build_loci_dict_of_list_from_file(loci_path: str | Path | None, one_based: bool) -> LociDictOfList:
50 | # Assumes standard BED format - 0-based, half-open intervals, unless one_based=True,
51 | # in which case assume 1-based closed intervals and adjust to be 0-based half-closed.
52 |
53 | if not loci_path:
54 | return {}
55 |
56 | start_adj = -1 * int(one_based) # -1 if converting from 1-based closed to 0-based half-open, otherwise do nothing.
57 |
58 | res: dict[str, list[tuple[int, int]]] = {}
59 |
60 | with open(loci_path, "r") as lf:
61 | for line in filter(_line_filter_fn, map(str.strip, lf)):
62 | ls = line.split("\t")
63 |
64 | contig, ss, es = ls[:3]
65 |
66 | if contig not in res:
67 | res[contig] = []
68 |
69 | res[contig].append((int(ss) + start_adj, int(es)))
70 |
71 | return res
72 |
73 |
74 | _overlapping_dict_cache = {}
75 |
76 |
77 | def overlapping_loci_dict_of_dict(
78 | contig: str, start: int, end: int, d: LociDictOfDict, first_only: bool = False, dict_cache_key: str | None = None
79 | ) -> list[tuple[int, int, list[str]]]:
80 | if contig not in d:
81 | return []
82 |
83 | global _overlapping_dict_cache
84 |
85 | full_cache_key = f"{dict_cache_key}--{contig}"
86 |
87 | if full_cache_key in _overlapping_dict_cache:
88 | c_dict, c_keys, c_lhs = _overlapping_dict_cache[full_cache_key]
89 | else:
90 | c_dict = d[contig]
91 | c_keys = tuple(c_dict.keys())
92 | c_lhs = tuple(map(lambda k: k[0], c_keys))
93 |         if dict_cache_key is not None:
94 | _overlapping_dict_cache[full_cache_key] = c_dict, c_keys, c_lhs
95 |
96 | i = bisect.bisect_left(c_lhs, end) # use _left since end is exclusive
97 |
98 | # now sort by [1] (possible overlap end), which should be (almost!) sorted already.
99 | # then, we can get only entries where start < ov[1] via bisect (finding ov[1] <= start and skipping them).
100 | possible_overlaps = sorted(c_keys[:i], key=idx_1_getter)
101 | j = bisect.bisect_right(possible_overlaps, start, key=idx_1_getter) # bisect right because exclusive
102 | possible_overlaps = possible_overlaps[j:]
103 |
104 | acc: list[tuple[int, int, list[str]]] = []
105 |
106 | for ov in possible_overlaps:
107 | acc.append((ov[0], ov[1], c_dict[ov]))
108 | if first_only:
109 | break
110 |
111 | return sorted(acc, key=idx_0_getter)
112 |
113 |
114 | def overlapping_loci_dict_of_list(
115 | contig: str, start: int, end: int, d: LociDictOfList, first_only: bool
116 | ) -> Iterable[tuple[int, int]]:
117 | if contig not in d:
118 | yield from ()
119 | return
120 |
121 | c_ints = d[contig]
122 | c_lhs = tuple(map(lambda k: k[0], c_ints))
123 | i = bisect.bisect_left(c_lhs, end) # use _left since end is exclusive
124 |
125 | for ov in c_ints[:i]:
126 | if start < ov[1]:
127 | yield ov[0], ov[1]
128 | if first_only:
129 | break
130 |
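# A minimal usage sketch of the overlap query above, using an in-memory loci dict rather than
# one built from a BED file (hypothetical coordinates; the list values are the extra BED
# columns, e.g. the motif):
from strkit.mi.intervals import overlapping_loci_dict_of_dict

loci = {"chr1": {(100, 200): ["CAG"], (300, 400): ["AT"]}}

overlapping_loci_dict_of_dict("chr1", 150, 350, loci)
# -> [(100, 200, ["CAG"]), (300, 400, ["AT"])]

overlapping_loci_dict_of_dict("chr1", 200, 250, loci)
# -> [] (half-open intervals: an interval ending at 200 does not overlap a query starting at 200)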
--------------------------------------------------------------------------------
/strkit/mi/repeathmm.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .base import BaseCalculator
4 | from .result import MIContigResult, MILocusData
5 | from ..utils import int_tuple
6 |
7 | __all__ = [
8 | "RepeatHMMCalculator",
9 | ]
10 |
11 |
12 | class RepeatHMMCalculator(BaseCalculator):
13 | @staticmethod
14 | def get_contigs_from_fh(fh) -> set:
15 | return {ls[0] for ls in (line.split(":") for line in fh)}
16 |
17 | @staticmethod
18 | def make_calls_dict(ph, contig):
19 | return {
20 | tuple(k.split(":")): int_tuple(v.split("/"))
21 | for k, v in (pv.split() for pv in ph)
22 | if k.split(":")[0] == contig
23 | }
24 |
25 | def _get_sample_contigs(self) -> tuple[set, set, set]:
26 | with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \
27 | open(self._child_call_file, "r") as cvf:
28 |
29 | mc = self.get_contigs_from_fh(mvf)
30 | fc = self.get_contigs_from_fh(fvf)
31 | cc = self.get_contigs_from_fh(cvf)
32 |
33 | return mc, fc, cc
34 |
35 | def calculate_contig(self, contig: str) -> MIContigResult:
36 | cr = MIContigResult(contig)
37 |
38 | with open(self._mother_call_file) as mh:
39 | mother_calls = self.make_calls_dict(mh, contig)
40 |
41 | with open(self._father_call_file) as fh:
42 | father_calls = self.make_calls_dict(fh, contig)
43 |
44 | with open(self._child_call_file) as ch:
45 | for cv in ch:
46 | locus_data, call = cv.strip().split(" ")
47 | lookup = tuple(locus_data.split(":"))
48 |
49 | if lookup[0] != contig:
50 | continue
51 |
52 | locus_start: int = int(lookup[1])
53 | locus_end: int = int(lookup[2])
54 |
55 | k = (contig, locus_start, locus_end)
56 |
57 | # Check to make sure call is present in TRF BED file, if it is specified
58 | if self.should_skip_locus(*k):
59 | continue
60 |
61 | cr.seen_locus(*k)
62 |
63 | # Check to make sure call is present in all trio individuals
64 | if lookup not in mother_calls or lookup not in father_calls:
65 | continue
66 |
67 | c_gt = int_tuple(call.split("/"))
68 | m_gt = mother_calls[lookup]
69 | f_gt = father_calls[lookup]
70 |
71 | # Failed calls from RepeatHMM seem to be represented as 0/0, so skip this
72 | # TODO… Need to decide if we actually want to include these?
73 | # or at least somehow record them
74 | if (0, 0) in (c_gt, m_gt, f_gt):
75 | continue
76 |
77 | # TODO: Include ref copies... should be in file somewhere?
78 | cr.append(MILocusData(
79 | lookup[0],
80 | locus_start,
81 | locus_end,
82 | lookup[3],
83 |
84 | child_gt=int_tuple(call.split("/")),
85 | mother_gt=mother_calls[lookup],
86 | father_gt=father_calls[lookup],
87 |
88 | logger=self._logger,
89 | ))
90 |
91 | return cr
92 |
--------------------------------------------------------------------------------
/strkit/mi/straglr.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .base import BaseCalculator
4 | from .result import MILocusData, MIContigResult
5 |
6 | __all__ = [
7 | "StraglrCalculator",
8 | ]
9 |
10 |
11 | class StraglrCalculator(BaseCalculator):
12 | @staticmethod
13 | def get_contigs_from_fh(fh) -> set:
14 | return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))}
15 |
16 | def make_calls_dict(self, ph, contig, cr: MIContigResult | None = None):
17 | # For reference, dicts are ordered in Python 3.7+ (guaranteed)
18 |
19 | calls = {}
20 |
21 | for pv in ph:
22 | if pv.startswith("#"):
23 | continue
24 |
25 | line = pv.strip().split("\t")
26 |
27 | if line[0] != contig:
28 | if calls:
29 | # assume ordered BED; break after we've collected all calls for the contig
30 | break
31 | continue
32 |
33 | locus = tuple(line[:3])
34 |
35 | k = (line[0], int(line[1]), int(line[2]))
36 |
37 | overlapping = self.get_loci_overlapping(k[0], k[1], k[2], True)
38 |
39 | if r := self.should_skip_locus(k[0], k[1], k[2], cached_overlapping=overlapping):
40 | self._logger.debug(f"Skipping locus {k}: {r}")
41 | continue
42 |
43 | if cr:
44 | cr.seen_locus(*k)
45 |
46 | orig_motif: str = overlapping[0][-1][0]
47 | if not orig_motif: # false-y/blank
48 | self._logger.debug(f"Skipping locus {k}: motif is false-y")
49 | continue
50 |
51 | # Transform the genotypes into something that is consistent across individuals,
52 | # using the file with the list of loci.
53 | gt_fact = len(line[3]) / len(orig_motif)
54 |
55 | gt = tuple(float(g.split("(")[0]) * gt_fact for g in line[4].split(";"))
56 | if len(gt) == 1: # If it's homozygous, expand it out to length 2
57 | gt = gt + gt
58 |
59 | calls[locus + (orig_motif,)] = gt
60 |
61 | return calls
62 |
63 | def _get_sample_contigs(self) -> tuple[set, set, set]:
64 | with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \
65 | open(self._child_call_file, "r") as cvf:
66 |
67 | mc = self.get_contigs_from_fh(mvf)
68 | fc = self.get_contigs_from_fh(fvf)
69 | cc = self.get_contigs_from_fh(cvf)
70 |
71 | return mc, fc, cc
72 |
73 | def calculate_contig(self, contig: str):
74 | cr = MIContigResult(contig)
75 |
76 | with open(self._mother_call_file, "r") as mh:
77 | mother_calls = self.make_calls_dict(mh, contig)
78 |
79 | with open(self._father_call_file, "r") as fh:
80 | father_calls = self.make_calls_dict(fh, contig)
81 |
82 | with open(self._child_call_file, "r") as ch:
83 | child_calls = self.make_calls_dict(ch, contig, cr)
84 |
85 | for locus_data, c_gt in child_calls.items():
86 | # Check to make sure call is present in all trio individuals
87 | if locus_data not in mother_calls or locus_data not in father_calls:
88 | continue
89 |
90 | cr.append(MILocusData(
91 | contig=locus_data[0],
92 | start=int(locus_data[1]),
93 | end=int(locus_data[2]),
94 | motif=locus_data[3],
95 |
96 | child_gt=c_gt,
97 | mother_gt=mother_calls[locus_data],
98 | father_gt=father_calls[locus_data],
99 |
100 | decimal=True,
101 | ))
102 |
103 | return cr
104 |
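# A worked sketch of the gt_fact normalization in make_calls_dict above, with hypothetical
# values: Straglr may report a motif that is a multiple of the catalog motif, so copy numbers
# are rescaled into catalog-motif units.
straglr_motif = "AGGAGG"   # motif as reported in the Straglr BED (line[3])
catalog_motif = "AGG"      # motif from the loci file (orig_motif)
straglr_copies = 10.0      # copy number reported by Straglr for one allele

gt_fact = len(straglr_motif) / len(catalog_motif)   # 2.0
normalized_copies = straglr_copies * gt_fact        # 20.0 copies of the catalog motif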
--------------------------------------------------------------------------------
/strkit/mi/strkit.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 |
5 | from pysam import VariantFile
6 | from pysam.libcbcf import VariantRecordSample
7 |
8 | from strkit.json import json
9 |
10 | from .base import BaseCalculator
11 | from .result import MIContigResult, MILocusData
12 | from .vcf_utils import VCFCalculatorMixin
13 | from ..utils import int_tuple, parse_cis
14 |
15 | __all__ = [
16 | "StrKitCalculator",
17 | "StrKitJSONCalculator",
18 | "StrKitVCFCalculator",
19 | ]
20 |
21 |
22 | STRKIT_TSV_CALL_INDEX = 6
23 | STRKIT_TSV_CALL_95_CI_INDEX = 7
24 |
25 |
26 | class StrKitCalculator(BaseCalculator):
27 | @staticmethod
28 | def get_contigs_from_fh(fh) -> set[str]:
29 | return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))}
30 |
31 | def _get_sample_contigs(self) -> tuple[set, set, set]:
32 | with open(self._mother_call_file, "r") as mvf:
33 | mc = self.get_contigs_from_fh(mvf)
34 | with open(self._father_call_file, "r") as fvf:
35 | fc = self.get_contigs_from_fh(fvf)
36 | with open(self._child_call_file, "r") as cvf:
37 | cc = self.get_contigs_from_fh(cvf)
38 | return mc, fc, cc
39 |
40 | @staticmethod
41 | def make_calls_dict(ph, contig):
42 | return {
43 | tuple(line[:4]): (
44 | int_tuple(line[STRKIT_TSV_CALL_INDEX].split("|")),
45 | parse_cis(line[STRKIT_TSV_CALL_95_CI_INDEX].split("|")),
46 | None # parse_cis(line[-1:].split("|")),
47 | )
48 | for line in (pv.strip().split("\t") for pv in ph)
49 | if line[0] == contig and "." not in line[STRKIT_TSV_CALL_INDEX]
50 | }
51 |
52 | def calculate_contig(self, contig: str) -> MIContigResult:
53 | cr = MIContigResult(contig, includes_95_ci=True)
54 |
55 | with open(self._mother_call_file) as mh:
56 | mother_calls = self.make_calls_dict(mh, contig)
57 |
58 |         self._logger.debug(f"loaded maternal calls for {contig}")
59 |
60 | with open(self._father_call_file) as fh:
61 | father_calls = self.make_calls_dict(fh, contig)
62 |
63 | self._logger.debug(f"loaded paternal calls for {contig}")
64 |
65 | with open(self._child_call_file) as ch:
66 | for cv in ch:
67 | locus_data = cv.strip().split("\t")
68 |
69 | if locus_data[0] != contig:
70 | continue
71 |
72 | lookup = tuple(locus_data[:4])
73 |
74 | start = int(locus_data[1])
75 | end = int(locus_data[2])
76 |
77 | if self.should_skip_locus(contig, start, end):
78 | continue
79 |
80 | # Check to make sure call is present in all trio individuals
81 | if lookup not in mother_calls or lookup not in father_calls:
82 | continue
83 |
84 | m_gt, m_gt_95_ci, _ = mother_calls[lookup]
85 | f_gt, f_gt_95_ci, _ = father_calls[lookup]
86 |
87 | calls = locus_data[STRKIT_TSV_CALL_INDEX].split("|")
88 |
89 | if "." in calls:
90 | # Failed call
91 | continue
92 |
93 | cr.append(MILocusData(
94 | contig=lookup[0],
95 | start=int(lookup[1]),
96 | end=int(lookup[2]),
97 | motif=lookup[3],
98 |
99 | child_gt=int_tuple(calls),
100 | mother_gt=m_gt,
101 | father_gt=f_gt,
102 |
103 | child_gt_95_ci=parse_cis(locus_data[STRKIT_TSV_CALL_95_CI_INDEX].split("|")),
104 | mother_gt_95_ci=m_gt_95_ci,
105 | father_gt_95_ci=f_gt_95_ci,
106 |
107 | # child_gt_99_ci=parse_cis(locus_data[-1:].split("|")),
108 | # mother_gt_99_ci=m_gt_99_ci,
109 | # father_gt_99_ci=f_gt_99_ci,
110 |
111 | reference_copies=int(locus_data[4]),
112 |
113 | decimal=False,
114 | ))
115 |
116 | return cr
117 |
118 |
119 | class StrKitJSONCalculator(BaseCalculator):
120 | def __init__(self, *args, **kwargs):
121 | super().__init__(*args, **kwargs)
122 |
123 | with open(self._mother_call_file, "r") as mvf:
124 | self._cache["mother_data"] = json.loads(mvf.read())
125 | with open(self._father_call_file, "r") as fvf:
126 | self._cache["father_data"] = json.loads(fvf.read())
127 | with open(self._child_call_file, "r") as cvf:
128 | self._cache["child_data"] = json.loads(cvf.read())
129 |
130 | @staticmethod
131 | def get_contigs_from_data(report) -> set:
132 | if (report_contigs := report.get("contigs")) is not None:
133 | return set(report_contigs)
134 | return {res["contig"] for res in report["results"]}
135 |
136 | def _get_sample_contigs(self, include_sex_chromosomes: bool = False) -> tuple[set, set, set]:
137 | mc = self.get_contigs_from_data(self._cache["mother_data"])
138 | fc = self.get_contigs_from_data(self._cache["father_data"])
139 | cc = self.get_contigs_from_data(self._cache["child_data"])
140 | return mc, fc, cc
141 |
142 | @staticmethod
143 | def get_read_counts(res: dict, dtype=int):
144 | # TODO: This only works with diploids...
145 |
146 | read_cns = []
147 | read_peaks = []
148 |
149 | for r in res["reads"].values():
150 | if (peak := r.get("p")) is None:
151 | continue
152 | read_cns.append(r["cn"])
153 | read_peaks.append(peak)
154 |
155 | n = res["peaks"]["modal_n"]
156 |
157 |         if (n < 2 or len(set(res["call"])) == 1) and res.get("assign_method", "dist") == "dist":
158 | # Split copy numbers evenly in two if we have a homozygous locus called only via distance.
159 | rcs = np.array(read_cns, dtype=dtype)
160 | np.random.shuffle(rcs) # TODO: seed shuffle
161 | part = rcs.shape[0] // 2
162 | return tuple(rcs[:part].tolist()), tuple(rcs[part:].tolist())
163 |
164 | rc = []
165 | for _ in range(n):
166 | rc.append([])
167 | for cn, pk in zip(read_cns, read_peaks):
168 | rc[pk].append(cn)
169 | return tuple(map(tuple, rc))
170 |
171 | @staticmethod
172 | def make_calls_dict(report: dict, contig: str):
173 | return {
174 | (res["contig"], res["start"], res["end"], res["motif"]): (
175 | int_tuple(res["call"]),
176 | tuple(map(lambda x: tuple(map(int, x)), res["call_95_cis"])),
177 | None, # Placeholder for 99% CI
178 | StrKitJSONCalculator.get_read_counts(res, dtype=int),
179 | )
180 | for res in report["results"]
181 | if res["contig"] == contig and res["call"] is not None
182 | }
183 |
184 | def calculate_contig(self, contig: str) -> MIContigResult:
185 | c_report = self._cache["child_data"]
186 |
187 | cr = MIContigResult(contig, includes_95_ci=True)
188 |
189 | mother_data = self.make_calls_dict(self._cache["mother_data"], contig)
190 |         self._logger.debug(f"loaded maternal calls for {contig}")
191 |
192 | father_data = self.make_calls_dict(self._cache["father_data"], contig)
193 | self._logger.debug(f"loaded paternal calls for {contig}")
194 |
195 | for res in c_report["results"]:
196 | if res["contig"] != contig:
197 | continue
198 |
199 | locus_start = res["start"]
200 | locus_end = res["end"]
201 |
202 | lookup = (contig, locus_start, locus_end, res["motif"])
203 |
204 | k = (contig, int(locus_start), int(locus_end))
205 |
206 | # Check to make sure call is present in TRF BED file, if it is specified
207 | if self.should_skip_locus(*k):
208 | continue
209 |
210 | cr.seen_locus(*k)
211 |
212 | # Check to make sure call is present in all trio individuals
213 | if lookup not in mother_data or lookup not in father_data:
214 | continue
215 |
216 | m_gt, m_gt_95_ci, _, m_rcs = mother_data[lookup]
217 | f_gt, f_gt_95_ci, _, f_rcs = father_data[lookup]
218 |
219 | if res["call"] is None:
220 | # Failed call
221 | continue
222 |
223 | call = int_tuple(res["call"])
224 |
225 | cr.append(MILocusData(
226 | contig=lookup[0],
227 | start=locus_start,
228 | end=locus_end,
229 | motif=lookup[3],
230 |
231 |                 child_gt=call,  # already converted via int_tuple above
232 | mother_gt=m_gt,
233 | father_gt=f_gt,
234 |
235 | child_gt_95_ci=tuple(map(lambda x: tuple(map(int, x)), res["call_95_cis"])),
236 | mother_gt_95_ci=m_gt_95_ci,
237 | father_gt_95_ci=f_gt_95_ci,
238 |
239 | # child_gt_99_ci=parse_cis(locus_data[-1:].split("|")),
240 | # mother_gt_99_ci=m_gt_99_ci,
241 | # father_gt_99_ci=f_gt_99_ci,
242 |
243 | child_read_counts=StrKitJSONCalculator.get_read_counts(res, dtype=int),
244 | mother_read_counts=m_rcs,
245 | father_read_counts=f_rcs,
246 |
247 | reference_copies=int(res["ref_cn"]),
248 |
249 | decimal=False,
250 |
251 | test_to_perform=self.test_to_perform,
252 | sig_level=self.sig_level,
253 | ))
254 |
255 | return cr
256 |
257 |
258 | class StrKitVCFCalculator(BaseCalculator, VCFCalculatorMixin):
259 | def _get_sample_contigs(self, include_sex_chromosomes: bool = False) -> tuple[set, set, set]:
260 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
261 |
262 | @staticmethod
263 | def get_peak_cns_from_vcf_line(sample_record: VariantRecordSample):
264 | if "MCRL" not in sample_record:
265 | return None
266 |
267 | res = []
268 |
269 | for enc_peak in sample_record["MCRL"]:
270 | peak = []
271 | for cn_r in enc_peak.split("|"):
272 | cn, cn_c = cn_r.split("x")
273 | peak.extend([int(cn)] * int(cn_c))
274 |
275 | res.append(tuple(peak))
276 |
277 | if len(res) == 1:
278 | # Split one peak into two, interleaving reads between the two peaks
279 | return res[0][::2], res[0][1::2]
280 |
281 | return tuple(res)
282 |
283 | def calculate_contig(self, contig: str) -> MIContigResult:
284 | cr = MIContigResult(contig, includes_95_ci=True, includes_seq=True)
285 |
286 | mvf = VariantFile(str(self._mother_call_file))
287 | fvf = VariantFile(str(self._father_call_file))
288 | cvf = VariantFile(str(self._child_call_file))
289 |
290 | # We want all common loci, so loop through the child and then look for the loci in the parent calls
291 |
292 | for cv in cvf.fetch(contig):
293 | if cv.info["VT"] != "str":
294 | continue
295 |
296 | motif = cv.info["MOTIF"]
297 | k = (contig, cv.start, cv.stop)
298 |
299 | mv = next(filter(lambda v: v.info["VT"] == "str" and v.info["MOTIF"] == motif, mvf.fetch(*k)), None)
300 | fv = next(filter(lambda v: v.info["VT"] == "str" and v.info["MOTIF"] == motif, fvf.fetch(*k)), None)
301 |
302 | # TODO: Handle sex chromosomes
303 |
304 | # Check to make sure call is present in TRF BED file, if it is specified
305 | if self.should_skip_locus(*k):
306 | continue
307 |
308 | cr.seen_locus(*k)
309 |
310 | if mv is None or fv is None:
311 | # Variant isn't found in at least one of the parents, so we can't do anything with it.
312 | # TODO: We need to actually check calls, and check with sample ID, not just assume
313 | continue
314 |
315 | # TODO: Handle missing samples gracefully
316 |                 # TODO: Handle incorrectly-formatted VCFs gracefully
317 |
318 | cs = cv.samples[self._child_id or 0]
319 | ms = mv.samples[self._mother_id or 0]
320 | fs = fv.samples[self._father_id or 0]
321 |
322 | try:
323 | c_gt = cs["MC"]
324 | m_gt = ms["MC"]
325 | f_gt = fs["MC"]
326 | except KeyError:
327 | # None call in VCF, skip this call
328 | continue
329 |
330 | try:
331 | c_gt_95_ci = parse_cis(cs["MCCI"])
332 | m_gt_95_ci = parse_cis(ms["MCCI"])
333 | f_gt_95_ci = parse_cis(fs["MCCI"])
334 | except (ValueError, TypeError):
335 | # None call in VCF, skip this call
336 | continue
337 |
338 | if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None:
339 | # None call in VCF, skip this call
340 | continue
341 |
342 | if self._only_phased and ("PS" not in cs or "PS" not in ms or "PS" not in fs):
343 | # No phasing support across trio, and we're only looking at phased loci --> skip this call
344 | continue
345 |
346 | c_seq_gt = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) if None not in cs["GT"] else None
347 | m_seq_gt = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) if None not in ms["GT"] else None
348 | f_seq_gt = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) if None not in fs["GT"] else None
349 |
350 | cr.append(MILocusData(
351 | contig=contig,
352 | start=cv.start,
353 | end=cv.stop,
354 | motif=motif,
355 |
356 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
357 | child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
358 | child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt,
359 |
360 | reference_copies=cv.info["REFMC"],
361 |
362 | # ---- for de novo mutation detection (this function returns None if MCRL is not in the VCF FORMAT for
363 | # the samples; i.e., with older STRkit versions):
364 |
365 | child_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(cs),
366 | mother_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(ms),
367 | father_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(fs),
368 |
369 | test_to_perform=self.test_to_perform,
370 | sig_level=self.sig_level,
371 | ))
372 |
373 | return cr
374 |
--------------------------------------------------------------------------------
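
As a concrete illustration of the MCRL decoding performed by StrKitVCFCalculator.get_peak_cns_from_vcf_line above, here is a small standalone sketch; the encoded value is hypothetical:

    # Hypothetical MCRL FORMAT value: one entry per called peak, each entry a
    # "|"-separated list of "<copy number>x<read count>" runs.
    mcrl = ("12x3|13x2", "15x4")

    peaks = []
    for enc_peak in mcrl:
        peak = []
        for run in enc_peak.split("|"):
            cn, count = run.split("x")
            peak.extend([int(cn)] * int(count))  # expand the run into per-read copy numbers
        peaks.append(tuple(peak))

    print(peaks)  # [(12, 12, 12, 13, 13), (15, 15, 15, 15)]

    # If only one peak were present, its reads would be interleaved into two pseudo-peaks:
    single = peaks[0]
    print(single[::2], single[1::2])  # (12, 12, 13) (12, 13)
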
/strkit/mi/tandem_genotypes.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from .base import BaseCalculator
4 | from .result import MIContigResult, MILocusData
5 | from ..utils import int_tuple
6 |
7 | __all__ = [
8 | "TandemGenotypesCalculator",
9 | ]
10 |
11 |
12 | class TandemGenotypesCalculator(BaseCalculator):
13 | @staticmethod
14 | def get_contigs_from_fh(fh) -> set[str]:
15 | return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))}
16 |
17 | @staticmethod
18 | def make_calls_dict(ph, contig):
19 | return {
20 | tuple(line[:4]): int_tuple(line[6:8])
21 | for line in (pv.strip().split("\t") for pv in ph if not pv.startswith("#"))
22 | if line[0] == contig and "." not in line[6:8]
23 | }
24 |
25 | def _get_sample_contigs(self) -> tuple[set, set, set]:
26 | with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \
27 | open(self._child_call_file, "r") as cvf:
28 |
29 | mc = self.get_contigs_from_fh(mvf)
30 | fc = self.get_contigs_from_fh(fvf)
31 | cc = self.get_contigs_from_fh(cvf)
32 |
33 | return mc, fc, cc
34 |
35 | def calculate_contig(self, contig: str) -> MIContigResult:
36 | cr = MIContigResult(contig)
37 |
38 | with open(self._mother_call_file) as mh:
39 | mother_calls = self.make_calls_dict(mh, contig)
40 |
41 | with open(self._father_call_file) as fh:
42 | father_calls = self.make_calls_dict(fh, contig)
43 |
44 | with open(self._child_call_file) as ch:
45 | for cv in ch:
46 | locus_data = cv.strip().split("\t")
47 | lookup = tuple(locus_data[:4])
48 |
49 | if locus_data[0] != contig:
50 | continue
51 |
52 | k = (contig, int(lookup[1]), int(lookup[2]))
53 |
54 | if self.should_skip_locus(*k):
55 | continue
56 |
57 | cr.seen_locus(*k)
58 |
59 | # Check to make sure call is present in all trio individuals
60 | if lookup not in mother_calls or lookup not in father_calls:
61 | continue
62 |
63 | child_calls = locus_data[6:8]
64 |
65 | if "." in child_calls:
66 | # Failed call
67 | continue
68 |
69 | cr.append(MILocusData(
70 | contig=contig,
71 | start=k[1],
72 | end=k[2],
73 | motif=lookup[3],
74 |
75 | child_gt=int_tuple(child_calls),
76 | mother_gt=mother_calls[lookup],
77 | father_gt=father_calls[lookup],
78 | ))
79 |
80 | return cr
81 |
--------------------------------------------------------------------------------
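
A short standalone sketch of how the line parsing above treats a tab-separated record; the line is hypothetical, and columns 4 and 5 are placeholders, since only columns 0-3 (the locus key) and 6-7 (the per-allele values) matter to this parser:

    # Hypothetical tab-separated line in the style consumed by make_calls_dict.
    line = "chr1\t100\t200\tCAG\tGENE\t.\t3\t-1".split("\t")

    lookup = tuple(line[:4])   # (contig, start, end, motif), used as the dict key
    child_calls = line[6:8]    # the two allele values, still as strings

    if "." not in child_calls:  # "." marks a failed call
        gt = tuple(map(int, child_calls))  # equivalent to int_tuple(child_calls)
        print(lookup, gt)  # ('chr1', '100', '200', 'CAG') (3, -1)
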
/strkit/mi/trgt.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pysam
4 |
5 | from .base import BaseCalculator
6 | from .result import MIContigResult, MILocusData
7 | from .vcf_utils import VCFCalculatorMixin
8 | from ..utils import parse_ci
9 |
10 | __all__ = ["TRGTCalculator"]
11 |
12 |
13 | def _parse_allele(a: int | str | None) -> int | None:
14 | if isinstance(a, str):
15 | if a == ".":
16 | return None
17 | return int(a)
18 | return a
19 |
20 |
21 | def _unzip_gt(
22 | vals, motif_len: int
23 | ) -> tuple[tuple[int, ...], tuple[tuple[int, ...], tuple[int, ...]]] | tuple[tuple[None, None], tuple[None, None]]:
24 | try:
25 | return (
26 | (
27 | round(_parse_allele(vals[0][0]) / motif_len),
28 | round(_parse_allele(vals[1][0]) / motif_len),
29 | ),
30 | (
31 | tuple(map(lambda x: round(x / motif_len), parse_ci(vals[0][1]))),
32 | tuple(map(lambda x: round(x / motif_len), parse_ci(vals[1][1]))),
33 | ),
34 | )
35 | except (ValueError, TypeError):
36 | return (None, None), (None, None)
37 |
38 |
39 | class TRGTCalculator(BaseCalculator, VCFCalculatorMixin):
40 | def _get_sample_contigs(self) -> tuple[set, set, set]:
41 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
42 |
43 | def calculate_contig(self, contig: str) -> MIContigResult:
44 | cr = MIContigResult(contig, includes_95_ci=True, includes_seq=True)
45 |
46 | mvf = pysam.VariantFile(str(self._mother_call_file))
47 | fvf = pysam.VariantFile(str(self._father_call_file))
48 | cvf = pysam.VariantFile(str(self._child_call_file))
49 |
50 | # We want all common loci, so loop through the child and then look for the loci in the parent calls
51 |
52 | for cv in cvf.fetch(contig):
53 | mv = next(mvf.fetch(contig, cv.start, cv.stop), None)
54 | fv = next(fvf.fetch(contig, cv.start, cv.stop), None)
55 |
56 | # TODO: Handle sex chromosomes
57 |
58 | k = (contig, cv.start, cv.stop)
59 |
60 | if self.should_skip_locus(*k):
61 | continue
62 |
63 | cr.seen_locus(*k)
64 |
65 | if mv is None or fv is None:
66 | # Variant isn't found in at least one of the parents, so we can't do anything with it.
67 | # TODO: We need to actually check calls, and check with sample ID, not just assume
68 | continue
69 |
70 | # TODO: Handle missing samples gracefully
71 |             # TODO: Handle incorrectly-formatted VCFs gracefully
72 |
73 | motif = cv.info["MOTIFS"][0]
74 |
75 | cs = cv.samples[self._child_id or 0]
76 | ms = mv.samples[self._mother_id or 0]
77 | fs = fv.samples[self._father_id or 0]
78 |
79 | if None in cs["GT"] or None in ms["GT"] or None in fs["GT"]:
80 | # None call in VCF, skip this call
81 | continue
82 |
83 | c_gt = tuple(sorted(int(m.split("_")[0]) for m in cs["MC"]))
84 | m_gt = tuple(sorted(int(m.split("_")[0]) for m in ms["MC"]))
85 | f_gt = tuple(sorted(int(m.split("_")[0]) for m in fs["MC"]))
86 |
87 | # Uncomment to use allele length as motif copies:
88 |
89 | # cs_reps = tuple(sorted(zip(cs["AL"], cs["ALLR"]), key=lambda x: x[0]))
90 | # ms_reps = tuple(sorted(zip(ms["AL"], ms["ALLR"]), key=lambda x: x[0]))
91 | # fs_reps = tuple(sorted(zip(fs["AL"], fs["ALLR"]), key=lambda x: x[0]))
92 | #
93 | # c_gt, c_gt_95_ci = _unzip_gt(cs_reps, len(motif))
94 | # m_gt, m_gt_95_ci = _unzip_gt(ms_reps, len(motif))
95 | # f_gt, f_gt_95_ci = _unzip_gt(fs_reps, len(motif))
96 |
97 | # noinspection PyTypeChecker
98 | c_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len))
99 | # noinspection PyTypeChecker
100 | m_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len))
101 | # noinspection PyTypeChecker
102 | f_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len))
103 |
104 | cr.append(MILocusData(
105 | contig=contig,
106 | start=cv.start,
107 | end=cv.stop,
108 | motif=motif,
109 |
110 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
111 | # Uncomment to use allele length as motif copies 95% CI:
112 | # child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
113 | child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt,
114 | ))
115 |
116 | return cr
117 |
--------------------------------------------------------------------------------
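
A small standalone sketch of the MC parsing used in TRGTCalculator.calculate_contig above; the sample values are hypothetical, and any extra underscore-separated parts of each entry are simply dropped, as in the code:

    # Hypothetical TRGT "MC" FORMAT value for one sample: one entry per allele.
    mc = ("20_3", "22_3")

    # Keep the first underscore-separated copy number for each allele, sorted ascending.
    gt = tuple(sorted(int(m.split("_")[0]) for m in mc))
    print(gt)  # (20, 22)
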
/strkit/mi/vcf_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pysam
4 |
5 | __all__ = ["VCFCalculatorMixin"]
6 |
7 |
8 | class VCFCalculatorMixin:
9 | @staticmethod
10 | def get_contigs_from_files(mother_call_file, father_call_file, child_call_file) -> tuple[set, set, set]:
11 | with pysam.VariantFile(str(mother_call_file)) as mvf:
12 | mc = set(mvf.header.contigs)
13 |
14 | with pysam.VariantFile(str(father_call_file)) as fvf:
15 | fc = set(fvf.header.contigs)
16 |
17 | with pysam.VariantFile(str(child_call_file)) as cvf:
18 | cc = set(cvf.header.contigs)
19 |
20 | return mc, fc, cc
21 |
--------------------------------------------------------------------------------
/strkit/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import math
4 | import operator
5 | from functools import partial
6 | from typing import Any, Callable, Iterable
7 |
8 | __all__ = [
9 | "cat_strs",
10 | "is_none",
11 | "idx_0_getter",
12 | "idx_1_getter",
13 | "apply_or_none",
14 | "int_tuple",
15 | "float_tuple",
16 | "parse_ci",
17 | "parse_cis",
18 | "cis_overlap",
19 | "sign",
20 | ]
21 |
22 |
23 | # index/property getters and other partials
24 | cat_strs = "".join
25 | is_none = partial(operator.is_, None)
26 | idx_0_getter = operator.itemgetter(0)
27 | idx_1_getter = operator.itemgetter(1)
28 |
29 |
30 | def apply_or_none(fn: Callable, x: Any) -> Any:
31 | # Python: add any type of monad functionality challenge [IMPOSSIBLE]
32 | return fn(x) if x is not None else None
33 |
34 |
35 | def int_tuple(x: Iterable) -> tuple[int, ...]:
36 | return tuple(map(int, x))
37 |
38 |
39 | def float_tuple(x: Iterable) -> tuple[float, ...]:
40 | return tuple(map(float, x))
41 |
42 |
43 | def parse_ci(ci: str, commas=False, dtype=int) -> tuple[int, int] | tuple[float, float]:
44 | ci_s = ci.split("," if commas else "-")
45 | return dtype(ci_s[0]), dtype(ci_s[1])
46 |
47 |
48 | def parse_cis(
49 | cis: Iterable[str], commas=False, dtype=int
50 | ) -> tuple[tuple[int, ...], ...] | tuple[tuple[float, ...], ...]:
51 | return tuple(map(lambda ci: parse_ci(ci, commas, dtype), cis))
52 |
53 |
54 | def cis_overlap(ci1, ci2) -> bool:
55 | epsilon = -0.0001
56 |
57 | # []: ci1
58 | # (): ci2
59 | # [ ( ] ) or [ ( ) ] or ( [ ) ] or ( [ ] )
60 | # int logic: ci1[0] <= ci2[1] and ci2[0] <= ci1[1]
61 |     # float logic: let's add a small epsilon to avoid floating-point edge cases
62 | return (ci2[1] - ci1[0]) > epsilon and (ci1[1] - ci2[0]) > epsilon
63 |
64 |
65 | def sign(x: int | float) -> int:
66 | return round(math.copysign(1, x))
67 |
--------------------------------------------------------------------------------
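
A brief usage sketch for the confidence-interval helpers defined above (the values are arbitrary):

    from strkit.utils import parse_ci, parse_cis, cis_overlap, sign

    # Dash-separated CIs by default; comma-separated with commas=True.
    assert parse_ci("30-35") == (30, 35)
    assert parse_cis(["30-35", "41-44"]) == ((30, 35), (41, 44))
    assert parse_ci("29.5,34.5", commas=True, dtype=float) == (29.5, 34.5)

    # Overlap test between two intervals, tolerant of small float error.
    assert cis_overlap((30, 35), (34, 40))
    assert not cis_overlap((30, 35), (36, 40))

    assert sign(-2.5) == -1 and sign(3) == 1
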
/strkit/viz/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/viz/__init__.py
--------------------------------------------------------------------------------
/strkit/viz/server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request, send_file
2 | from werkzeug.exceptions import NotFound
3 |
4 | __all__ = [
5 | "run_server",
6 | ]
7 |
8 | app = Flask(__name__)
9 |
10 |
11 | @app.route("/")
12 | def browser():
13 | return render_template(
14 | "browser.html",
15 | **app.config["PARAMS"])
16 |
17 |
18 | @app.route("/report-metadata")
19 | def get_report_metadata():
20 | return {k: v for k, v in app.config["CALL_REPORT"].items() if k != "results"}
21 |
22 |
23 | @app.route("/params")
24 | def get_params():
25 | return {
26 | "cmd": app.config["PARAMS"],
27 | "report": app.config["CALL_REPORT"]["parameters"],
28 | }
29 |
30 |
31 | @app.route("/loci")
32 | def get_loci():
33 | cr = app.config["CALL_REPORT"]
34 | ecd = list(enumerate(cr["results"])) # TODO: cache
35 |
36 | q = request.args.get("q", "").strip()
37 | if q:
38 | res = list(filter(lambda x: q.lower() in f"{x[1]['contig']}:{x[1]['start']}-{x[1]['end']}", ecd)) # TODO
39 | else:
40 | # TODO: nicer priority
41 | res = ecd[:10]
42 |
43 | return {
44 | "results": list(map(
45 | lambda x: {
46 | "i": x[0],
47 | "contig": x[1]["contig"],
48 | "start": x[1]["start"],
49 | "end": x[1]["end"],
50 | "disabled": x[1]["call"] is None,
51 | },
52 | res)),
53 | }
54 |
55 |
56 | @app.route("/call_data/<int:i>")
57 | def get_call_data(i: int):
58 | cr = app.config["CALL_REPORT"]
59 | cr_res = cr["results"]
60 | if i < 0 or i > len(cr_res) - 1:
61 | raise NotFound()
62 | return cr_res[i]
63 |
64 |
65 | # @app.route("/ref")
66 | # def get_ref_file():
67 | # return send_file(app.config["PARAMS"]["ref"], conditional=True)
68 | #
69 | #
70 | # @app.route("/ref_index")
71 | # def get_ref_index_file():
72 | # return send_file(app.config["PARAMS"]["ref_index"], conditional=True)
73 |
74 |
75 | @app.route("/align_file")
76 | def get_align_file():
77 | return send_file(app.config["PARAMS"]["align_file"], conditional=True)
78 |
79 |
80 | @app.route("/align_index")
81 | def get_align_index_file():
82 | return send_file(app.config["PARAMS"]["align_index"], conditional=True)
83 |
84 |
85 | def run_server(call_report, **kwargs):
86 | app.config.from_mapping(dict(CALL_REPORT=call_report, PARAMS=kwargs))
87 | app.run(host="localhost", port=5011, debug=True)
88 |
--------------------------------------------------------------------------------
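
A rough sketch of launching this server directly, assuming a STRkit JSON call report is available on disk; the file paths are hypothetical, and in normal use the server is started through STRkit's own entry point rather than like this:

    import json

    from strkit.viz.server import run_server

    # Any extra keyword arguments become PARAMS: they are passed to the browser
    # template and returned as the "cmd" parameters by the /params endpoint.
    with open("sample.report.json", "r") as fh:  # hypothetical report path
        report = json.load(fh)

    run_server(
        report,
        align_file="sample.bam",       # served at /align_file
        align_index="sample.bam.bai",  # served at /align_index
    )
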
/strkit/viz/static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/viz/static/logo.png
--------------------------------------------------------------------------------
/tests/data/test_loci.bed:
--------------------------------------------------------------------------------
1 | chr1 200 300 ACAA
2 | chr1 300 400 GA
3 | chr1 350 450 GAGA
4 | chr2 100 200 CAG
5 |
--------------------------------------------------------------------------------
/tests/test_caller_locus_validation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from strkit.call.validation import LocusValidationError, valid_motif, validate_locus
3 |
4 |
5 | @pytest.mark.parametrize("motif,valid", [
6 | ("CAG", True),
7 | ("CAGN", True),
8 | ("CAGX", False),
9 | ("(CAG)n", False),
10 | ("XX", False),
11 | ])
12 | def test_valid_motif(motif, valid):
13 | assert valid_motif(motif) == valid
14 |
15 |
16 | def test_validate_locus():
17 | with pytest.raises(LocusValidationError):
18 | # start > end, invalid
19 | validate_locus(1, 1000, 500, "CAG")
20 |
21 | with pytest.raises(LocusValidationError):
22 | # start == end, invalid
23 | validate_locus(1, 1000, 1000, "CAG")
24 |
25 | with pytest.raises(LocusValidationError):
26 | # invalid motif
27 | validate_locus(1, 1000, 1200, "(CAG)n")
28 |
--------------------------------------------------------------------------------
/tests/test_caller_utils.py:
--------------------------------------------------------------------------------
1 | from strkit.call.utils import find_pair_by_ref_pos, normalize_contig
2 |
3 | # A A T T C G C C C C A A A A A C
4 | PAIRS = [(0, 1000), (1, 1001), (2, 1003), (3, 1004), (4, 1005), (5, 1006), (6, 1008), (7, 1009)]
5 | SNVS = ((1003, "C"), (1009, "A"))
6 | PAIRS_Q = list(p[0] for p in PAIRS)
7 | PAIRS_R = list(p[1] for p in PAIRS)
8 |
9 |
10 | def test_find_pair_by_ref_pos():
11 | assert find_pair_by_ref_pos(PAIRS_R, 1004) == (3, True)
12 | assert find_pair_by_ref_pos(PAIRS_R, 1007) == (6, False)
13 |
14 |
15 | def test_normalize_contig():
16 | assert normalize_contig("chr5", True) == "chr5"
17 | assert normalize_contig("5", True) == "chr5"
18 | assert normalize_contig("X", True) == "chrX"
19 | assert normalize_contig("chr5", False) == "5"
20 | assert normalize_contig("chrX", False) == "X"
21 |
--------------------------------------------------------------------------------
/tests/test_iupac.py:
--------------------------------------------------------------------------------
1 | from strkit.iupac import get_iupac_code_for_nt_set
2 |
3 |
4 | def test_get_iupac_code():
5 | assert get_iupac_code_for_nt_set({"A", "T"}) == "W"
6 | assert get_iupac_code_for_nt_set({"A", "C", "G", "T"}) == "N"
7 | assert get_iupac_code_for_nt_set({"A", "T", "C", "G"}) == "N"
8 | assert get_iupac_code_for_nt_set({"A", "T", "C"}) == "H"
9 | assert get_iupac_code_for_nt_set({"A", "T", "C", "Z"}) is None
10 | assert get_iupac_code_for_nt_set({"A", "T", "C", ":)"}) is None
11 | assert get_iupac_code_for_nt_set({"A", "T", "C", ""}) is None
12 |
--------------------------------------------------------------------------------
/tests/test_mi_intervals.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import pytest
3 |
4 | from strkit.mi.intervals import (
5 | build_loci_dict_of_dict_from_file,
6 | overlapping_loci_dict_of_dict,
7 | build_loci_dict_of_list_from_file,
8 | overlapping_loci_dict_of_list,
9 | )
10 |
11 | TEST_LOCI = pathlib.Path(__file__).parent / "data" / "test_loci.bed"
12 |
13 | BED_CASES = [
14 | ("chr1", 50, 70, 0),
15 | ("chr1", 205, 210, 1),
16 | ("chr1", 50, 1000, 3),
17 | ("chr1", 320, 500, 2),
18 | ("chr1", 400, 450, 1),
19 | ("chr1", 1000, 1001, 0),
20 | ("chr2", 100, 101, 1),
21 | ("chr2", 100, 200, 1),
22 | ("asdf", 50, 1000, 0),
23 | ]
24 |
25 |
26 | @pytest.mark.parametrize("contig,start,end,nr", BED_CASES)
27 | def test_loci_dict_of_dict(contig: str, start: int, end: int, nr: int):
28 | d = build_loci_dict_of_dict_from_file(TEST_LOCI, False)
29 | assert len(overlapping_loci_dict_of_dict(contig, start, end, d)) == nr
30 |
31 |
32 | @pytest.mark.parametrize("contig,start,end,nr", BED_CASES)
33 | def test_loci_dict_of_list(contig: str, start: int, end: int, nr: int):
34 | d = build_loci_dict_of_list_from_file(TEST_LOCI, False)
35 | assert len(tuple(overlapping_loci_dict_of_list(contig, start, end, d, False))) == nr
36 |
--------------------------------------------------------------------------------