├── .github └── workflows │ ├── publish-docker.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .idea ├── .gitignore ├── .name ├── csv-editor.xml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── jsLibraryMappings.xml ├── misc.xml ├── modules.xml ├── strkit.iml └── vcs.xml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── catalogs └── pathogenic_assoc.hg38.tsv ├── docs ├── caller_catalog.md ├── caller_usage.md ├── images │ ├── browser_hist.png │ ├── browser_igv.png │ ├── call_method_flow.png │ ├── strkit_logo_open_graph.png │ └── strkit_logo_small.png ├── output_formats.md └── trio_analyses.md ├── pyproject.toml ├── requirements.txt ├── setup.py ├── strkit ├── VERSION ├── __init__.py ├── call │ ├── __init__.py │ ├── align_matrix.py │ ├── allele.py │ ├── call_locus.py │ ├── call_sample.py │ ├── cigar.py │ ├── non_daemonic_pool.py │ ├── output │ │ ├── __init__.py │ │ ├── json_report.py │ │ ├── tsv.py │ │ └── vcf.py │ ├── params.py │ ├── realign.py │ ├── repeats.py │ ├── snvs.py │ ├── types.py │ ├── utils.py │ └── validation.py ├── catalog │ ├── __init__.py │ └── combine.py ├── constants.py ├── convert │ ├── __init__.py │ ├── _bed_4.py │ ├── constants.py │ ├── converter.py │ ├── expansionhunter.py │ ├── gangstr.py │ ├── hipstr.py │ └── trgt.py ├── entry.py ├── exceptions.py ├── iupac.py ├── json.py ├── logger.py ├── mi │ ├── __init__.py │ ├── base.py │ ├── expansionhunter.py │ ├── gangstr.py │ ├── generic_vcf.py │ ├── intervals.py │ ├── repeathmm.py │ ├── result.py │ ├── straglr.py │ ├── strkit.py │ ├── tandem_genotypes.py │ ├── trgt.py │ └── vcf_utils.py ├── utils.py └── viz │ ├── __init__.py │ ├── server.py │ ├── static │ └── logo.png │ └── templates │ └── browser.html └── tests ├── data └── test_loci.bed ├── test_caller_locus_validation.py ├── test_caller_utils.py ├── test_iupac.py └── test_mi_intervals.py /.github/workflows/publish-docker.yml: -------------------------------------------------------------------------------- 1 | name: Publish STRkit Docker image 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | permissions: 12 | packages: write 13 | contents: read 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Set up QEMU 19 | uses: docker/setup-qemu-action@v3 20 | with: 21 | platforms: linux/amd64,linux/arm64 22 | 23 | - uses: docker/metadata-action@v5 24 | id: meta 25 | with: 26 | images: ghcr.io/davidlougheed/strkit 27 | tags: | 28 | type=semver,pattern={{version}} 29 | type=semver,pattern={{major}}.{{minor}} 30 | 31 | - uses: docker/setup-buildx-action@v3 32 | 33 | - uses: docker/login-action@v3 34 | with: 35 | registry: ghcr.io 36 | username: ${{ github.actor }} 37 | password: ${{ secrets.GITHUB_TOKEN }} 38 | 39 | - uses: docker/build-push-action@v5 40 | with: 41 | context: . 
42 | push: true 43 | tags: ${{ steps.meta.outputs.tags }} 44 | labels: ${{ steps.meta.outputs.labels }} 45 | platforms: linux/amd64,linux/arm64 46 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyPI release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | permissions: 12 | contents: read 13 | id-token: write 14 | 15 | environment: 16 | name: release 17 | url: https://pypi.org/p/strkit 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: actions/setup-python@v5 23 | with: 24 | python-version: '3.10' 25 | 26 | - name: Install pypa/build 27 | run: python -m pip install build --user 28 | 29 | - name: Build 30 | run: python -m build --sdist --wheel --outdir dist/ . 31 | 32 | - name: Publish to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [ "3.10", "3.11", "3.12" ] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: actions/setup-python@v5 19 | name: Set up Python 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: pip install -r requirements.txt 24 | - name: Install STRkit 25 | run: pip install . 26 | - name: Test 27 | run: pytest -svv --cov=strkit --cov-branch 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /env 2 | /envp11 3 | __pycache__ 4 | 5 | /build 6 | /dist 7 | /strkit.egg-info 8 | # ignore WIP cohort code for now 9 | /strkit/cohort 10 | 11 | *.bam 12 | *.bai 13 | *.fa.gz 14 | *.fa.gz.fai 15 | *.fa.gz.gzi 16 | *.bed 17 | !tests/data/*.bed 18 | /*.json 19 | /*.tsv 20 | *.vcf.gz* 21 | *.vcf 22 | 23 | *.token 24 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | strkit -------------------------------------------------------------------------------- /.idea/csv-editor.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 15 | 16 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 27 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/jsLibraryMappings.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/strkit.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-bookworm 2 | 3 | WORKDIR /strkit 4 | 5 | COPY LICENSE . 6 | COPY MANIFEST.in . 7 | COPY pyproject.toml . 8 | COPY README.md . 9 | COPY setup.py . 10 | COPY strkit strkit 11 | 12 | RUN curl https://sh.rustup.rs -sSf > rustup-init.sh 13 | RUN sh ./rustup-init.sh -y 14 | ENV PATH="/root/.cargo/bin:${PATH}" 15 | 16 | RUN pip install -U pip 17 | RUN pip install --no-cache-dir -v . 18 | 19 | CMD [ "strkit" ] 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include strkit/VERSION 2 | include strkit/viz/static/logo.png 3 | include strkit/viz/templates/*.html 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # STRkit - short tandem repeat genotyping with long reads 2 | 3 | [![PyPI version](https://badge.fury.io/py/strkit.svg)](https://badge.fury.io/py/strkit) 4 | [![BioRxiv DOI](https://img.shields.io/badge/bioRxiv-10.1101/2025.03.25.645269-B31B1B.svg)](https://doi.org/10.1101/2025.03.25.645269) 5 | [![Zenodo DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12689906.svg)](https://doi.org/10.5281/zenodo.12689906) 6 | 7 | STRkit is a short tandem repeat (STR) genotyping and analysis toolkit for long read sequencing data, especially 8 | PacBio HiFi data. The STRkit software package is written in Python and is available in the PyPI package registry or as 9 | a Docker container. 10 | 11 | If you use STRkit in published work, please cite our preprint: 12 | 13 | > [STRkit: precise, read-level genotyping of short tandem repeats using long reads and single-nucleotide variation.](https://doi.org/10.1101/2025.03.25.645269) 14 | > David R Lougheed, Tomi Pastinen, Guillaume Bourque. *BioRxiv preprint*. 
15 | > DOI: [10.1101/2025.03.25.645269](https://doi.org/10.1101/2025.03.25.645269) 16 | 17 | 18 | 19 | 20 | ## Table of Contents 21 | 22 | * [Installation](#installation) 23 | * [Via PyPI](#via-pypi) 24 | * [As a Docker container](#as-a-docker-container) 25 | * [Commands](#commands) 26 | * [`strkit call`: Genotype caller with bootstrapped confidence intervals](#strkit-call-genotype-caller-with-bootstrapped-confidence-intervals) 27 | * [Features](#features) 28 | * [Usage](#usage) 29 | * [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format) 30 | * [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer) 31 | * [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis) 32 | * [Usage](#usage-1) 33 | * [Further documentation](#further-documentation) 34 | * [`strkit convert`: STR catalog conversion](#strkit-convert-str-catalog-conversion) 35 | * [Usage](#usage-2) 36 | * [Copyright and License](#copyright-and-license) 37 | * [Notice](#notice) 38 | * [Exceptions](#exceptions) 39 | 40 | 41 | ## Installation 42 | 43 | ### Via PyPI 44 | 45 | STRkit requires Python 3.10+ and can be installed from PyPI via `pip` 46 | with the following command: 47 | 48 | ```bash 49 | python -m pip install strkit 50 | ``` 51 | 52 | You may need to install the [Rust toolchain](https://www.rust-lang.org/tools/install) 53 | and a C compiler (e.g., `gcc`, `clang`), as well as `cmake`, to compile the `strkit_rust_ext` wheel, 54 | although prebuilt wheels for this module are available for some platforms. Compiling the wheel may take quite 55 | a long time (in the tens of minutes). 56 | 57 | On Digital Research Alliance of Canada/Compute Canada clusters, this involves loading a few modules: 58 | 59 | ```bash 60 | module load rust/1.85.0 clang/18.1.8 python/3.11 scipy-stack/2025a parasail/2.6.2 61 | python -m pip install strkit 62 | ``` 63 | 64 | STRkit should then be available in your Python environment as a command-line tool: 65 | 66 | ```bash 67 | strkit --help 68 | ``` 69 | 70 | ### As a Docker container 71 | 72 | STRkit is also available as a [Docker container](https://github.com/davidlougheed/strkit/pkgs/container/strkit), stored 73 | in the GitHub Container Registry. 74 | 75 | It can be pulled using the following command: 76 | 77 | ```bash 78 | docker pull ghcr.io/davidlougheed/strkit:latest 79 | ``` 80 | 81 | Then, STRkit commands can be run mostly as normal using the Docker image: 82 | 83 | ```bash 84 | docker run -it ghcr.io/davidlougheed/strkit --help 85 | ``` 86 | 87 | 88 | ## Commands 89 | 90 | ### `strkit call`: Genotype caller with bootstrapped confidence intervals 91 | 92 | A Gaussian mixture model tandem repeat genotype caller for long read data. 93 | STRkit is tuned specifically for high-fidelity long reads, although other 94 | long read data should still work. 95 | 96 | ![Calling approach flow chart](./docs/images/call_method_flow.png) 97 | 98 | #### Features: 99 | 100 | * Performant, vectorized (thanks to [parasail](https://github.com/jeffdaily/parasail)) 101 | estimates of repeat counts from high-fidelity long reads and a supplied 102 | catalog of TR loci and motifs. 103 | * Re-weighting of longer reads, to compensate for their lower likelihood of observation. 104 | * Whole-genome and targeted genotyping modes to adjust this re-weighting. 
105 | * Incorporation of single-nucleotide variation (SNVs) for better and faster calling plus 106 | additional downstream analysis possibilities. 107 | * Recommended for **HiFi data and ONT R10 data only**. In my testing, this worsens runtime and call quality for 108 | ONT ultra-long-read data, but speeds up the tool and improves call quality for HiFi/ONT R10 data. 109 | * Parallelized for faster computing on clusters and for ad-hoc fast analysis of single samples. 110 | * 95% confidence intervals on calls via a user-configurable optional parametric bootstrapping process. 111 | 112 | 113 | #### Usage: 114 | 115 | See all parameters and example usage with a Slurm cluster: 116 | [Advanced caller usage and configuration](./docs/caller_usage.md) 117 | 118 | ##### EXAMPLE USAGE 119 | 120 | ```bash 121 | # For the dbSNP VCF used below for SNV incorporation, see https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/ 122 | # (00-common_all.vcf.gz) 123 | # 124 | # "Accurate reads" here means HiFi / ONT R10 duplex reads, but in practice may also include ONT R10 simplex reads. 125 | 126 | strkit call \ 127 | path/to/read/file.bam \ # [REQUIRED] One indexed read file (BAM/CRAM) 128 | --hq \ # If using accurate reads, enable this to get better genotyping & more robust expansion detection 129 | --realign \ # If using accurate reads, enable this to enable local realignment / read recovery. Good for detecting expansions, but slows down calling. 130 | --ref path/to/reference.fa.gz \ # [REQUIRED] Indexed FASTA-formatted reference genome 131 | --loci path/to/loci.bed \ # [REQUIRED] TRF-formatted (or 4-col, with motif as last column) sorted list of loci to genotype 132 | --incorporate-snvs path/to/dbsnp/00-common_all.vcf.gz \ # If you want, specify a SNV catalogue to help phase STRs & speed up calling 133 | --vcf my-calls.vcf \ # Calculate consensus sequences for alleles and output a .vcf (or .vcf.gz) with call data 134 | --seed 183 \ # Fixed random number generator seed for replicability 135 | --processes 10 \ # Number of parallel processes to use; DEFAULT: 1 136 | --no-tsv # If VCF output is enabled as above, we don't need TSV genotype output to stdout (which is the default) 137 | ``` 138 | 139 | ##### REGARDING ALIGNMENTS 140 | 141 | Ideally, you should be using a read file aligned with parameters tuned for tandem repeats. 142 | PacBio provides a 143 | [recommended workflow](https://github.com/PacificBiosciences/apps-scripts/tree/master/RepeatAnalysisTools) 144 | for CCS alignment in this scenario. However, regular aligned readsets are fine and have been tested 145 | extensively. 146 | 147 | If you're using accurate long reads (e.g., HiFi, ONT R10 duplex) as input, **use the `--hq` and 148 | `--realign` options** to get better genotype calculation and a greater proportion of reads 149 | incorporated into the computed genotypes, respectively. These should not add much performance 150 | overhead. *In practice, these options may also aid calling with slightly-less-accurate reads.* 151 | 152 | If you want to **incorporate haplotagging from an alignment file (`HP` tags)** into the 153 | process, which should speed up runtime and potentially improve calling results, you must pass 154 | the `--use-hp` flag. 155 | 156 | ##### REGARDING SNV INCORPORATION 157 | 158 | If you want to **incorporate SNV calling** into the process, which speeds up runtime and gives 159 | marginally better calling results, you must provide an indexed, `bgzip`-compressed SNV catalog 160 | VCF which matches your reference genome. 
You can find dbSNP VCFs at 161 | [`https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/`](https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/). 162 | The file for GRCh38 is called `00-common_all.vcf.gz` as of time of writing. 163 | **Note that this does not need to be an SNV call file for your sample, specifically**; just one 164 | which has positions, reference/alternate alleles, and the `ID` field populated. 165 | 166 | ##### REGARDING OUTPUT 167 | 168 | If you want to output a full call report, you can use the `--json output-file.json` argument to 169 | specify a path to output a more detailed JSON document to. This document contains 99% CIs, peak 170 | labels, and some other information that isn't included in the normal TSV file. If you want this 171 | file to be indented and human-readable, use the `--indent-json` flag in addition to `--json ...`. 172 | 173 | If you want to output a VCF file (STRs and SNVs if called; currently not phased), use the 174 | `--vcf ...` argument. If you pass `--vcf stdout`, the VCF will be written to `stdout` instead of a 175 | file. 176 | 177 | For more information, see also documentation on the [Output formats](./docs/output_formats.md). 178 | 179 | ##### REGARDING REFERENCE GENOMES 180 | 181 | The reference genome provided must be BGZipped and indexed using `samtools faidx`: 182 | 183 | ```bash 184 | # Starting from a .fa: 185 | bgzip my-reference.fa # Replaces .fa with a .fa.gz file 186 | samtools faidx my-reference.fa.gz # Generates a .fai index file 187 | ``` 188 | 189 | ##### OTHER PARAMETERS 190 | 191 | See the '[Caller catalog format & choosing a catalog](./docs/caller_catalog.md)' page for more on 192 | how to format a locus catalog or choose from existing available catalogs. 193 | 194 | 195 | #### Further documentation on the STRkit caller, including output format: 196 | 197 | * [Advanced caller usage and configuration](./docs/caller_usage.md) 198 | * [Caller catalog format & choosing a catalog](./docs/caller_catalog.md) 199 | * [Output formats](./docs/output_formats.md) 200 | 201 | 202 | ### `strkit visualize`: Call visualizer 203 | 204 | STRkit bundles a call visualization tool which takes as input a BAM file and 205 | a JSON call file from using the `--json` flag with `strkit call`. 206 | 207 | It starts a web server on your local machine; the visualizations can be 208 | interacted with in a web browser. 209 | 210 | To use the tool, run the following command: 211 | 212 | ```bash 213 | strkit visualize path/to/my-alignment.bam \ 214 | --ref hg38 \ # or hg19 215 | --json path/to/my-calls.json \ 216 | -i 1 # 1-indexed offset in JSON file for locus of interest. Default is 1 if left out. 217 | ``` 218 | 219 | This will output something like the following: 220 | 221 | ``` 222 | * Serving Flask app 'strkit.viz.server' (lazy loading) 223 | * Environment: production 224 | WARNING: This is a development server. Do not use it in a production deployment. 225 | Use a production WSGI server instead. 226 | * Debug mode: on 227 | * Running on http://localhost:5011 (Press CTRL+C to quit) 228 | ... 229 | ``` 230 | 231 | You can then go to the URL listed, `http://localhost:5011`, on your local machine 232 | to see the visualization tool: 233 | 234 | ![Browser Histogram](./docs/images/browser_hist.png) 235 | *STRkit browser histogram, showing an expansion in the HTT gene.* 236 | 237 | ![igv.js Genome Browser](./docs/images/browser_igv.png) 238 | *The same expansion, shown in the igv.js browser. 
Note the insertions on 239 | the left-hand side in most reads, and the heterozygous copy number pattern.* 240 | 241 | To exit the tool, press `Ctrl-C` in your command line window as mentioned in 242 | the start-up instructions. 243 | 244 | 245 | 246 | ### `strkit mi`: Mendelian inheritance analysis 247 | 248 | Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered 249 | by looking at inheritance patterns. This tool provides a few different ways to do this, via: 250 | 251 | * Mendelian inheritance % (MI) calculations for many common TR genotyping tools for both long/short reads, 252 | including support for genotyping methods which report confidence intervals. 253 | * Reports of loci (potentially of interest) which do not respect MI. 254 | 255 | #### Usage 256 | 257 | For a basic JSON report on Mendelian inheritance with a trio of STRkit VCFs (compressed and indexed with BGZip), use 258 | something like the following command: 259 | 260 | ```bash 261 | # In addition to summary figures on Mendelian inheritance, this tool outputs loci which do not respect MI, which may be 262 | # useful as candidate de novo mutations. The --mismatch-out-mi flag controls which form of MI metric is used for 263 | # deciding which loci to output. Options for this flag are: 264 | # strict (strict copy number MI), 265 | # pm1 (copy number MI ± 1 repeat unit), 266 | # ci_95 (copy number 95% confidence interval), 267 | # ci_99 (copy number 99% confidence interval), 268 | # seq ([allele] sequence MI), 269 | # sl ([allele] sequence length MI), 270 | # sl_pm1 ([allele] sequence length MI ± 1 base pair) 271 | strkit mi \ 272 | --caller strkit-vcf \ 273 | --json mi-report.json \ 274 | --mismatch-out-mi seq \ 275 | child-calls.vcf.gz \ 276 | mother-calls.vcf.gz \ 277 | father-calls.vcf.gz 278 | # This will also output a TSV report to stdout. If this is not desired, use --no-tsv to suppress TSV output. 279 | ``` 280 | 281 | For other options and what they do, run `strkit mi` (with no other arguments) or `strkit mi --help`. 282 | 283 | #### Further documentation 284 | 285 | **For more information on what kind of analyses can be done with this data**, see the 286 | [Trio analyses with STRkit](./docs/trio_analyses.md) page. 287 | 288 | 289 | ### `strkit convert`: STR catalog conversion 290 | 291 | STRkit takes as input a four-or-more-column BED file, structured like: 292 | 293 | ``` 294 | contig start end [0 or more extraneous columns] motif 295 | ``` 296 | 297 | Any extraneous columns are removed, (internally) leaving a four-column STR locus representation. 298 | Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a four-column STR 299 | BED as locus catalog input. However, other formats representing a catalog of STRs exist: 300 | 301 | * [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a TSV/BED with a lot 302 | of information. This can be used as-is with STRkit, but it's safer for other tools to convert to 303 | a four-column BED format. 304 | * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md), 305 | which can specify more advanced STR structures. 306 | 307 | #### Usage 308 | 309 | The `strkit convert` sub-command requires an input format (`trf` or `trgt`), an output format 310 | (many, see `strkit convert --help`), and an input file. Output is written to `stdout`. 
311 | 312 | *Note:* Not all input/output format pairs have available converter functions; an error will be 313 | printed to `stderr` if one does not exist. 314 | 315 | For example, to convert from a TRF BED to a TRGT repeat definition BED file: 316 | 317 | ```bash 318 | strkit convert --in-format trf --out-format trgt in_file.trf.bed > out_file.bed 319 | ``` 320 | 321 | To attempt a conversion from a TRGT repeat definition file to a STRkit/four-column motif BED: 322 | 323 | ```bash 324 | strkit convert --in-format trgt --out-format strkit in_file.trgt.bed > out_file.bed 325 | ``` 326 | 327 | Note that TRGT can represent STRs with complex structure that STRkit cannot, so some of these loci 328 | may not be converted (these will be logged to `stderr`). 329 | 330 | 331 | ## Copyright and License 332 | 333 | * 2021-2023: © David Lougheed (DL) and McGill University 2021-2023 (versions up to and including `0.8.0a1`), 334 | created during graduate research by DL. 335 | * 2023+: (versions beyond `0.8.0a1`): 336 | * Portions © DL and McGill University 2021-2023 337 | * Portions © McGill University 2024-2025 338 | * Portions © DL 2024-2025 339 | 340 | 341 | ### Notice 342 | 343 | This program is free software: you can redistribute it and/or modify 344 | it under the terms of the GNU General Public License as published by 345 | the Free Software Foundation, either version 3 of the License, or 346 | (at your option) any later version. 347 | 348 | This program is distributed in the hope that it will be useful, 349 | but WITHOUT ANY WARRANTY; without even the implied warranty of 350 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 351 | GNU General Public License for more details. 352 | 353 | You should have received a copy of the GNU General Public License 354 | along with this program. If not, see . 355 | 356 | ### Exceptions 357 | 358 | **Some exclusions to this license apply; specifically portions of 359 | [`strkit/viz/templates/browser.html`](strkit/viz/templates/browser.html) and 360 | the STRkit logo files ([./docs/images/strkit_logo_small.png](./docs/images/strkit_logo_small.png) 361 | and [./strkit/viz/static/logo.png](./strkit/viz/static/logo.png).)** 362 | 363 | The STRkit logo is © David Lougheed 2022, and was designed by Evelyn Lougheed. It is not licensed 364 | under the terms of the GPL 3.0; it is instead licensed under the terms of the 365 | [CC BY-ND 4.0](https://creativecommons.org/licenses/by-nd/4.0/). 366 | 367 | Portions of `viz/templates/browser.html` copyright (C) 2021-2022 Observable, Inc. 368 | Used under the terms of the ISC license. 369 | -------------------------------------------------------------------------------- /catalogs/pathogenic_assoc.hg38.tsv: -------------------------------------------------------------------------------- 1 | # Citations mostly obtained from Gall-Duncan et al. 2022 . . . . . . . . 2 | # contig start end disease inheritance gene citation pathogenic form notes motif 3 | chr1 57367043 57367125 SCA37 AD DAB1 Seixas et al. 2017 RAAAT 4 | chr1 94418421 94418442 OPDM AD ABCD3 Cortese et al. 2024 CN>=118 GCC 5 | chr1 149390802 149390842 NIID;EssentialTremor;ALS;OPDM3 AD;Assoc;Assoc;Familial NOTCH2NLC Tian et al. 2019;Sun et al. 2020;Yuan et al. 2020;Yu et al. 2021 GGC 6 | chr2 96197066 96197124 BAFME2 AD STARD7 Corbett et al. 2019 AAAWK 7 | chr2 100104798 100104824 Developmental anomalies PossibleAssoc AFF3 Metsu et al. 2014 GCC 8 | chr2 176093058 176093099 SPD1 AD HOXD13 Gong et al. 
2011 CN>=22 GCN 9 | chr2 190880872 190880920 GD AR GLS van Kuilenburg et al. 2019 GCA 10 | chr3 63912684 63912715 SCA7 Familial ATXN7 Stevanin et al. 1998 CN>=37 GCA 11 | chr3 129172576 129172733 DM2 AD CNBP Liquori et al. 2001 CASR 12 | chr3 183712176 183712226 BAFME4 AD YEATS2 Yeetong et al. 2019 ATTTY 13 | chr4 3074876 3074940 HD AD HTT HDCRG 1993 CN>=36 CAG 14 | chr4 39348424 39348479 CANVAS AR RFC1 Cortese et al. 2019&Rafehi et al. 2019 AARRG 15 | chr4 41745975 41746022 CCHS Familial PHOX2B Amiel et al. 2003 GCC 16 | chr5 10356338 10356411 BAFME3 AD MARCHF6 Florian et al. 2019 TTTYA 17 | chr5 146878727 146878759 SCA12 AD PPP2R2B Holmes et al. 1999 GCT 18 | chr6 16327633 16327724 SCA1;ALS AD;Assoc ATXN1 Orr et al. 1993;Lattante et al. 2018,Tazelaar et al. 2020 TGC 19 | chr6 45422749 45422794 CCD AD RUNX2 Mundlos et al. 1997 GGC 20 | chr6 170561906 170562017 SCA17 AD TBP Koide et al. 1999 CN>=43 GCA 21 | chr7 27199679 27199732 HFGS AD HOXA13 Utsch et al. 2002 GCN 22 | chr8 118366812 118366918 BAFME1 AD SAMD12 Ishiura et al. 2018 AARTA 23 | chr9 27573484 27573546 ALS/FTD AD C9orf72 Renton et al. 2011&DeJesus-Hernandez et al. 2011 GCCCCG 24 | chr10 79826380 79826404 OPDM Assoc NUTM2B-AS1 Gu et al. 2024 CGG 25 | chr11 119206289 119206323 JS Assoc CBL2 Michaelis et al. 1998 CGG 26 | chr12 6936716 6936775 DRPLA AD ATN1 Nagafuchi et al. 1994,Koide et al. 1994,Chaudhry et al. 2021 CAG 27 | chr12 50505001 50505024 ID;LGS Assoc;Assoc DIP2B Winnepenninckx et al. 2007;Qaiser et al. 2021 GGC 28 | chr13 70139351 70139429 SCA8;LGS AD;Assoc ATXN8OS Koob et al. 1999;Qaiser et al. 2021 TRC 29 | chr13 99985448 99985494 HPE5 AD ZIC2 Brown et al. 1998 GCG 30 | chr14 23321464 23321543 OPMD AD PABP2 Brais et al. 1998 GCG 31 | chr15 22786671 22786703 ALS Assoc NIPA1 Blauw et al. 2012 GCG 32 | chr16 17470920 17470921 BSS AR XYLT1 LaCroix et al. 2019 GGC 33 | chr16 24613438 24613532 BAFME6 AD TNRC6A Ishiura et al. 2018 ATTTY 34 | chr16 66490398 66490466 SCA31 AD ENSG00000260851 Sato et al. 2009 TRRAA 35 | chr18 55586153 55586229 FECD Assoc TCF4 Wieben et al. 2012 AGC 36 | chr19 13207858 13207898 SCA6 AD CACNA1 Zhuchenko et al. 1997 CTG 37 | chr19 14496041 14496085 OPDM2 Familial GIPC1 Deng et al. 2020 CCG 38 | chr19 18786027 18786050 PSACH AD COMP Deere et al. 1999 CGT 39 | chr19 45770204 45770266 DM1 AD DMPK Many CAG 40 | chr20 2652732 2652775 SCA36 AD NOP56 Kobayashi et al. 2011 GGGCCT 41 | chr21 43776442 43776479 EPM1 AR CSTB Lalioti et al. 1998 GCGCGGGGCGGG 42 | chr22 45795354 45795424 SCA10 AD SCA10 Matsuura et al. 2000,Matsuura et al. 2006 CN>=280 Variable penetrance in intermediate range ~280-800 ATTCT 43 | chrX 67545316 67545419 SBMA X-linked AR La Spada et al. 1991,Fratta et al. 2014 CN>=38 GCA 44 | chrX 71453054 71453129 XDP X-linked TAF1 Bragg et al. 2017 GAGAGG 45 | chrX 147912036 147912111 FXS;FXPOI;FXTAS X-linked FMR1 Many CN>=200;55<=CN<200;55<=CN<200 GGC 46 | chrX 148500604 148500753 FRAXE X-linked AFF2 Knight et al. 1993,Gu et al. 1996 CN>=200 GCC 47 | -------------------------------------------------------------------------------- /docs/caller_catalog.md: -------------------------------------------------------------------------------- 1 | # Caller catalog format & choosing a catalog 2 | 3 | ## Caller catalog format 4 | 5 | For the `--loci` argument, `strkit call` takes a list of loci in a modified BED / TSV format, 6 | similar to methods like Straglr/Tandem-genotypes/GangSTR. 
7 | 8 | The file must be structured with a row per locus, where each row looks like: 9 | 10 | ``` 11 | chr# 10000 10101 [...] AC 12 | ``` 13 | 14 | The important requirements here are: 15 | 16 | * The fields are tab-separated 17 | * The rows are sorted by contig, and then by starting position 18 | * Locus coordinates are 0-based and half-open (start is inclusive, end is exclusive) 19 | * The locus motif must come **last** in the row, but *any number of fields* can separate 20 | the end position and the motif. 21 | 22 | As a result, STRkit can take myrid different TSV-type catalog formats as input, including 23 | those produced from the TRF UCSC browser track, or for GangSTR, or for Straglr. 24 | 25 | Here are a few notes on catalogs: 26 | 27 | * Coordinates are used to locate the STR locus in the reference genome, but may be slightly 28 | expanded to better encompass the entire locus. 29 | * Be wary of using Tandem Repeats Finder output directly as a catalog, as it can output multiple 30 | rows for the same locus, or define motifs in a "compound" fashion, e.g., `ATATAT` instead of `AT`. 31 | * Some disease expansions can contain multiple different motifs, 32 | which may be not present in the reference genome at all (for example: 33 | [CANVAS](https://pubmed.ncbi.nlm.nih.gov/31230722/), [BAFME2](https://www.nature.com/articles/s41467-019-12671-y)). 34 | As such, we provide a mechanism to specify motifs using any 35 | [IUPAC code](https://www.bioinformatics.org/sms/iupac.html). 36 | Thus, the CANVAS and BAFME2 motifs can be represented as `AARRG` and `AAAWK`, respectively. 37 | We also add in a non-IUPAC code, `X`, which behaves like `N` in that it represents any base, 38 | but instead of giving a reward of `+2` it neither penalizes nor rewards alignment, 39 | and penalizes a gap. We use this internally to represent low-confidence base calls. 40 | * Related to the above, this can be important for diseases such as SCA37, where the motif composition 41 | (rather than the actual copy number) is associated with disease 42 | ([Seixas *et al.* 2017](https://doi.org/10.1016%2Fj.ajhg.2017.06.007)). Here, STRkit's motif-sized k-mer counting 43 | function can be used during calling with the `--count-kmers` flag. See the 44 | [advanced usage](https://github.com/davidlougheed/strkit/blob/master/docs/caller_usage.md#all-optional-flags) page 45 | for more. 46 | 47 | 48 | ## Choosing an existing catalog 49 | 50 | Other researchers have done extensive work in identifying and cataloguing loci for genotyping: 51 | 52 | * The Tandem Repeats Finder track for the UCSC browser, available as a 53 | [downloadable BED file](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.trf.bed.gz), 54 | with the caveat that this file includes **overlapping entries**, and TRs may not always be represented in 55 | their most 'essential' form (e.g., using the motif `TATATATA` instead of just `TA`). Thus, some work may be 56 | required to create a desirable locus catalog. 57 | * The researchers behind the [GangSTR](https://github.com/gymreklab/GangSTR) short-read STR genotyping method 58 | have prepared [several extensive STR catalogs](https://github.com/gymreklab/GangSTR#gangstr-reference-files) 59 | for different human reference genomes, containing motifs up to 20bp in length. However, **these files use 60 | 1-based closed-interval coordinates**, and should be adjusted (subtracting 1 from all start coordinates) to 61 | transform them into the 0-based half-open interval coordinates when using them with STRkit. 
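  As a rough illustration of that adjustment (a minimal sketch, not part of STRkit; the script name and
  file names are placeholders, and it assumes a plain tab-separated GangSTR reference file with the
  start coordinate in the second column):

  ```python
  # adjust_gangstr_starts.py : hypothetical helper, not shipped with STRkit.
  # Converts a GangSTR-style reference file from 1-based closed-interval start
  # coordinates to the 0-based half-open coordinates STRkit expects, by
  # subtracting 1 from the start column and leaving everything else untouched.
  import sys

  with open(sys.argv[1]) as fin:
      for line in fin:
          fields = line.rstrip("\n").split("\t")
          if len(fields) < 3:
              continue  # skip blank or malformed lines
          fields[1] = str(int(fields[1]) - 1)  # adjust start only; end stays as-is
          sys.stdout.write("\t".join(fields) + "\n")
  ```

  For example: `python adjust_gangstr_starts.py gangstr_hg38.bed > gangstr_hg38.0based.bed`.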
62 | * We have prepared a [catalog of disease-causing or disease-associated loci](../catalogs/pathogenic_assoc.hg38.tsv) 63 | for the `hg38` reference genome, partially based on the review research done by Gall-Duncan *et al.* (2022), as well 64 | as entries from the [STRipy database](https://stripy.org/database) 65 | (DOI: [10.1002/humu.24382](https://doi.org/10.1002/humu.24382)) and our own reading of other articles. 66 | -------------------------------------------------------------------------------- /docs/caller_usage.md: -------------------------------------------------------------------------------- 1 | # Advanced caller usage and configuration 2 | 3 | 4 | ## All optional flags 5 | 6 | * `--sample-id example_sample`: Set a sample ID, or override the alignment file sample ID. This will be included in JSON 7 | output, but not TSV output. 8 | * `--min-reads ##`: Minimum number of supporting reads needed to make a call. **Default:** 4 9 | * `--min-allele-reads ##`: Minimum number of supporting reads needed to call a specific allele size. 10 | **Default:** 2 11 | * `--max-reads ##`: Maximum number of supporting reads to use for calling a locus. **Default:** 250 12 | * `--min-avg-phred ##`: Minimum average PHRED score for relevant bases (flanking region + tandem repeat). 13 | Read segments with average PHRED scores below this (common with a threshold of ~13 and ONT Ultra Long reads, 14 | for example) will be skipped. **Default:** 13 15 | * `--min-read-align-score #.#`: Minimum normalized read alignment score (fractional; `0.0` to `1.0`) needed to include a 16 | read in a call. A good value for pure tandem repeats is 0.9. A good value for much more lenient genotyping is anywhere 17 | from 0.0-0.4. **Default:** 0.9 18 | * `--max-rcn-iters ##`: Maximum number of read copy-number counting iterations to perform. Loci which require a lot of 19 | iterations are probably impure tandem repeats, for which the resulting copy number will not be very accurate anyway. 20 | **Default:** 50 21 | * `--flank-size ##`: Size of the flanking region to use on either side of a region to properly anchor reads. 22 | **Default:** 70 23 | * `--realign` or `-a`: Whether to perform local re-alignment to attempt recovery of soft-clipped reads. Some aligners 24 | may soft-clip around large insertions, e.g. with an expansion (I've noticed this with *pbmm2*/*minimap2*). 25 | Currently recommended **for HiFi or ONT R10 only**, since this step aggressively filters out realignments with 26 | many mismatches or small indels. Enabling this slows down calling, so it may not be suitable for a very large catalog 27 | of tandem repeats. 28 | * `--hq`: Whether to treat provided reads as "high quality", i.e., fairly close to the actual true sequence. Used when 29 | detecting expansions, to skip a smoothing filter that may ignore disparate, rare expansion-like read counts. 30 | Use for CCS reads or similar data (e.g., R10 nanopore data) ONLY! **Default:** off 31 | * `--use-hp`: Whether to incorporate `HP` tags from a haplotagged alignment file. This should speed up runtime and 32 | will potentially improve calling results. **This flag is experimental, and has not been tested extensively.** 33 | * `--skip-supplementary` or `--skip-supp`: Skip supplementary alignments. **Default:** off 34 | * `--skip-secondary` or `--skip-sec`: Skip secondary alignments. **Default:** off 35 | * `--incorporate-snvs [path]` or `--snv [path]`: A path to a VCF with SNVs to incorporate into the calling process and 36 | final output. 
This file is just used as an SNV loci catalog; STRkit itself will perform the SNV calling. Empirically 37 | improves calling quality a small amount, speeds up runtime, and gives nearby SNV calls for downstream analysis. 38 | You can find dbSNP VCFs at 39 | [`https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/`](https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/). 40 | The file for GRCh38 is called `00-common_all.vcf.gz` as of time of writing. 41 | * `--snv-min-base-qual [int]` or `--min-sbq [int]`: Minimum PHRED quality score for bases of SNVs to use for phasing. 42 | **Default:** 20 43 | * `--targeted` or `-t`: Turn on targeted genotyping mode, which re-weights longer reads differently. Use this option if 44 | the alignment file contains targeted reads that do not reflect normal mapping patterns, e.g. from PacBio No-Amp 45 | Targeted Sequencing. **Default:** off 46 | * `--respect-ref` or `-e`: Turn off reference TR region 'coordinate extension' from what is specified in the catalog. 47 | TR boundaries can be blurry, so by default we give STRkit an opportunity to extend the provided region to improve 48 | mapped indel capturing and to be consistent with the approach we use to count repeat copies in non-reference samples. 49 | Turning this off should give results closer to other STR callers, at the cost of potentially missing variation. 50 | * `--count-kmers` or `-k`: Turn on motif-sized k-mer counting at the allele level, with `-k peak`, or at the read 51 | level, with `-k read`, or both with `-k both`. If the flag is provided with no value, it will default to `peak.` 52 | Note that k-mer counts will only be reported if a `--json` path is specified. This feature can be used to detect 53 | motif composition differences between alleles or samples. **Default:** `none` 54 | * `--consensus` or `-c`: Turn on consensus calculation for alleles. This adds runtime, but gives a better idea of STR 55 | structure and is useful for comparing alleles beyond copy number. If `--vcf` is set, this option is forced on. 56 | **Default:** off 57 | * `--vcf-anchor-size`: Number of bases upstream (5') of the tandem repeat to include in the VCF output. This can include 58 | small indels, and having a size above `1` may be beneficial or detrimental to the use case at hand, but is nice for 59 | benchmarking and in case of slight misalignment. This is clamped to being in the range of `[1, flank_size]`. 60 | **Default:** 5 61 | * `--num-bootstrap ###` or `-b`: Now many bootstrap re-samplings to perform. **Default:** 100 62 | * `--sex-chr ??` or `-x`: Sex chromosome configuration. **Without this, loci in sex chromosomes will not be genotyped.** 63 | Can be any configuration of Xs and Ys; only count matters. **Default:** *none* 64 | * `--json [path]` or `-j`: Path to output JSON call data to. JSON call data is more detailed than the `stdout` TSV 65 | output. If the value passed is `stdout`, the JSON data will be written to `stdout` instead of a file. 66 | **Default:** *none* 67 | * `--indent-json` or `-i`: If passed alongside `--json [x]`, the JSON output will be indented to be more human-readable 68 | but less compact. **Default:** off 69 | * `--vcf [path]`: Path to output VCF-formatted call data to. Setting this option forces the `--consensus` option as 70 | well in order to output true REF/ALT values, which slows down runtime somewhat. If the value passed is `stdout`, the 71 | VCF data will be written to `stdout` instead of a file. 
If a `.vcf.gz` path is specified, a bgzipped file will be 72 | written automatically. **Default:** *none* 73 | * `--no-tsv`: Suppresses TSV output to `stdout`. Without `--json` or `--vcf`, no output will be generated, which isn't 74 | very helpful. **Default:** TSV output on 75 | * `--seed`: Seed the random number generator used for all random sampling, Gaussian mixture modeling, etc. 76 | Useful for replicability. 77 | * `--log-level [level]`: Log level. Value must be of `error`, `warning`, `info`, and `debug`. Be careful with the 78 | `debug` log level, as it can produce gigabytes of logs for a large run. **Default:** `info`. 79 | 80 | 81 | ## Usage on HPC machines 82 | 83 | We have tested STRkit on three different clusters associated with the 84 | Digital Research Alliance of Canada (formerly Compute Canada). 85 | 86 | Usage is pretty straightforward; for our use cases we set up a Python virtual environment 87 | with the `strkit` package installed, and ran a SLURM batch job which looks something like: 88 | 89 | ```bash 90 | #!/bin/bash 91 | #SBATCH --mem=16G 92 | #SBATCH --ntasks=1 93 | #SBATCH --cpus-per-task=10 94 | #SBATCH --time=1-00 95 | #SBATCH --account=rrg-xxxxx 96 | 97 | 98 | module load StdEnv/2023 99 | module load python/3.11 scipy-stack/2025a parasail/2.6.2 100 | 101 | cd /home/xxxxx || exit 102 | source env/bin/activate 103 | 104 | strkit call \ 105 | --loci /path/to/catalog \ 106 | --ref /path/to/ref.fa.gz \ 107 | --processes 10 \ 108 | --seed 342 \ 109 | --vcf sample.vcf \ 110 | --no-tsv \ 111 | path/to/sample.bam 112 | 113 | deactivate 114 | 115 | ``` 116 | -------------------------------------------------------------------------------- /docs/images/browser_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/browser_hist.png -------------------------------------------------------------------------------- /docs/images/browser_igv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/browser_igv.png -------------------------------------------------------------------------------- /docs/images/call_method_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/call_method_flow.png -------------------------------------------------------------------------------- /docs/images/strkit_logo_open_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/strkit_logo_open_graph.png -------------------------------------------------------------------------------- /docs/images/strkit_logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/strkit_logo_small.png -------------------------------------------------------------------------------- /docs/output_formats.md: -------------------------------------------------------------------------------- 1 | # STRkit output formats 2 | 3 | STRkit can output three different file formats, depending on the set of arguments used: 4 | 5 | * 
[TSV](#tsv-standard-output): by default, printed to `stdout` when STRkit is run. Good as an overview, but less 6 | informative/interoperable than other formats. 7 | * [JSON](#json-report): a JSON report, containing the maximum amount of information possible. These files can be quite 8 | large, especially if formatted to be human-readable and indented with the `--indent-json` flag. 9 | * [VCF](#vcf): a [VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) file, with STR and SNV genotypes, including 10 | consensus STR sequences. 11 | 12 | **Note:** In general, the JSON format contains the most information about how STRkit was run, and each locus' called 13 | genotype. 14 | 15 | 16 | ## TSV (standard output) 17 | 18 | A tab-separated text file with the following columns: 19 | 20 | * Chromosome 21 | * Starting position (matching input BED file; real coordinates of region may be different if 22 | `--respect-ref` is not used) 23 | * Ending position (matching input BED file; real coordinates of region may be different if 24 | `--respect-ref` is not used) 25 | * Motif sequence (matching input BED file) 26 | * Reference copy number 27 | * Comma-delimited list of copy numbers for all reads successfully extracted for this locus. 28 | * Copy number call, `|`-delimited (one call per allele) 29 | * 95% confidence intervals for copy number calls, `|`-delimited (one `X-Y` 95% CI per allele) 30 | * Calling approach used by STRkit: one of: 31 | * `dist` - clustering based on a copy number distance metric 32 | * `snv+dist` - clustering based on a copy number + nearby SNV genotype difference distance metric 33 | * `snv` - clustering solely based on nearby SNV genotypes 34 | 35 | Here is an example line: 36 | 37 | ``` 38 | chr4 5975495 5975530 TTTTG 7 6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8 6|7 6-6|7-7 snv 39 | ``` 40 | 41 | Note that quite a bit of information is missing from the TSV, including per-sample copy numbers, read identities, 42 | SNV calls, and STR consensus sequences. 43 | 44 | 45 | ## JSON report 46 | 47 | Example report format: 48 | 49 | ```javascript 50 | { 51 | "sample_id": "HG002", 52 | "caller": { 53 | "name": "strkit", 54 | "version": "0.15.0" 55 | }, 56 | "parameters": { 57 | "read_files": "HG002.SequelII.ccs.phased.40x.chr4.bam", 58 | "reference_file": "/Users/davidlougheed/git/gt-poc/hg38.analysisSet.fa.gz", 59 | "min_reads": 4, 60 | "min_allele_reads": 2, 61 | "min_avg_phred": 13, 62 | "num_bootstrap": 100, 63 | "flank_size": 70, 64 | "sample_id": "HG002", 65 | "realign": true, 66 | "hq": true, 67 | "snv_vcf": "00-common_all.vcf.gz", 68 | "snv_min_base_qual": 20, 69 | "targeted": false, 70 | "respect_ref": false, 71 | "count_kmers": "none", 72 | "consensus": true, 73 | "log_level": 10, 74 | "seed": 1234, 75 | "processes": 1 76 | }, 77 | "runtime": 8.628772, 78 | "contigs": [ 79 | "chr4" 80 | ], 81 | "results": [ 82 | { 83 | "locus_index": 1, 84 | "contig": "chr4", 85 | "start": 96617, 86 | "end": 96648, 87 | "start_adj": 96617, 88 | "end_adj": 96648, 89 | "motif": "AC", 90 | "ref_cn": 16, 91 | "ref_start_anchor": "t", 92 | "ref_seq": "acacacacacacacacacacacacacacaca", 93 | "reads": { 94 | "m64011_190901_095311/50792740/ccs": { 95 | "s": "-", 96 | "sc": 2.0, 97 | "cn": 15, 98 | "w": 1.0217145751733625, 99 | "snvu": ["G"], 100 | "p": 0 101 | }, 102 | // ... 103 | "m64012_190921_234837/4523939/ccs": { 104 | "s": "+", 105 | "sc": 2.0, 106 | "cn": 15, 107 | "w": 1.0217145751733625, 108 | "snvu": ["A"], 109 | "p": 1 110 | }, 111 | // ... 
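      // (one "reads" entry per read successfully extracted for this locus, keyed by read name;
      //  "cn" is that read's motif copy number)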
112 | }, 113 | "snvs": [ 114 | { 115 | "id": "rs73213545", 116 | "ref": "G", 117 | "pos": 94593, 118 | "call": ["G", "A"], 119 | "rcs": [20, 23] 120 | } 121 | ], 122 | "assign_method": "snv+dist", 123 | "call": [15, 15], 124 | "call_95_cis": [ 125 | [15, 15], 126 | [15, 15] 127 | ], 128 | "call_99_cis": [ 129 | [15, 15], 130 | [15, 15] 131 | ], 132 | "mean_model_align_score": 2.0, 133 | "peaks": { 134 | "means": [15, 15], 135 | "weights": [0.5, 0.5], 136 | "stdevs": [0.31622776601683794, 0.3585309239667531], 137 | "modal_n": 2, 138 | "n_reads": [20, 23], 139 | "seqs": [ 140 | ["ACACACACACACACACACACACACACACA", "poa"], 141 | ["ACACACACACACACACACACACACACACA", "poa"] 142 | ] 143 | }, 144 | "read_peaks_called": true, 145 | "time": 0.1274 146 | }, 147 | // ... 148 | ] 149 | } 150 | ``` 151 | 152 | 153 | ## VCF 154 | 155 | VCF format fields (i.e., for each variant sample entry): 156 | 157 | * `AD`: Read depth for each allele 158 | * `CONS`: Consensus methods used for each alt (`single`/`poa`/`best_rep`) 159 | * `DP`: Total read depth 160 | * `DPS`: Total read depth; only supporting reads (for calls with incorporated SNVs mainly; STR calls only) 161 | * `GT`: Genotype 162 | * `MC`: Motif copy number for each allele (STR calls only) 163 | * `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only) 164 | * `MCRL`: Read-level copy number histogram for each allele. Allele entries are comma-delimited, and copy numbers within 165 | an allele's read-set are pipe (`|`)-delimited and formatted as `[copy number]x[reads]`. For example, for two alleles 166 | with 8 and 9 copy-number respectively, we may get `7x1|8x10|9x1,8x2|9x12` — the first allele has one 7-copy read, ten 167 | 8-copy reads, and one 9-copy read. The second allele has two 8-copy reads and twelve 9-copy reads. 168 | * `MMAS`: Mean model (candidate TR sequence) alignment score across reads, for this sample. This score, relative to the 169 | other locis' scores, represents how well a pure tandem repeat stretch with the catalogued motif and the determined 170 | copy number (e.g., `CAGCAGCAG`) aligns to the true sequence. 171 | * `PS`: Phase set 172 | * `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only) 173 | 174 | VCF info. fields (i.e., for each STR variant record; not present for SNV records): 175 | 176 | * `VT`: Variant record type (`str` or `snv`) 177 | * `MOTIF`: Motif sequence 178 | * `REFMC`: Motif copy number in the reference genome 179 | -------------------------------------------------------------------------------- /docs/trio_analyses.md: -------------------------------------------------------------------------------- 1 | # Trio analyses with STRkit 2 | 3 | Trio datasets include genomic sequence data for a child, their mother, and their father (the "trio"). These data 4 | can be used to discover de novo mutation (and incidental genotyping errors). 5 | 6 | STRkit includes a Mendelian inheritance (MI) analysis tool, under the sub-command `strkit mi`. 7 | After genotyping the trio with `strkit call`, this command can be used to discover loci which: 8 | 9 | 1. Do not respect exact MI 10 | 2. Do not respect MI allowing for a ±1 repeat unit difference 11 | (Note: most true mutation occurs in 1-repeat-unit changes too! 12 | See [Ellegren, 2004](https://www.nature.com/articles/nrg1348).) 13 | 3. Do not respect MI under the 95% locus confidence intervals 14 | 4. Look like de novo mutation at a read count distribution level, via a Mann-Whitney *U* test (with tie correction). 
15 | The alternative hypothesis can be specified as either two-sided or looking for expansion in the offspring. 16 | *The requirements for this test are invalidated in cases of mosaicism.* 17 | 5. Look like de novo mutation at a read count distribution level, via a chi-squared independence test, 18 | where the contingency table looks like the following: 19 | 20 | | Read distribution \ Copy number | 11 | 12 | 13 | 21 | |---------------------------------|------|------|------| 22 | | Parent reads (best peak fit) | 20 | 10 | 0 | 23 | | Child reads | 2 | 20 | 10 | 24 | 25 | 26 | ## Trio-level 27 | 28 | At a trio level, the chi-squared test gives (optionally multiple testing-corrected) loci with a significant 29 | chance of containing a de novo mutation. 30 | 31 | ## Cohort-level 32 | 33 | At a cohort level, multiple downstream analyses are possible from a collection of trio mutation analyses, 34 | such as: 35 | 36 | 1. Case-control analysis looking for frequency of de novo mutations in specific loci 37 | 2. Case-control analysis looking at the incidence rate of de novo mutation 38 | 39 | Currently, tools to automatically perform these analyses are not available in STRkit. 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=57.4.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | blinker==1.7.0 2 | click==8.1.7 3 | coverage==7.8.0 4 | Cython==3.0.12 5 | exceptiongroup==1.2.0 6 | Flask==3.0.3 7 | importlib_metadata==7.1.0 8 | iniconfig==2.0.0 9 | itsdangerous==2.2.0 10 | Jinja2==3.1.4 11 | joblib==1.3.2 12 | MarkupSafe==2.1.5 13 | numpy==1.26.4 14 | orjson==3.10.16 15 | packaging==24.0 16 | pandas==2.2.3 17 | parasail==1.3.4 18 | patsy==0.5.6 19 | pluggy==1.4.0 20 | psutil==6.1.0 21 | pyparsing==3.1.2 22 | pysam==0.23.0 23 | pytest==7.4.4 24 | pytest-cov==4.1.0 25 | python-dateutil==2.8.2 26 | pytz==2025.2 27 | scikit-learn==1.4.2 28 | scipy==1.15.1 29 | six==1.16.0 30 | statsmodels==0.14.4 31 | strkit_rust_ext==0.20.2 32 | threadpoolctl==3.4.0 33 | tomli==2.0.1 34 | tzdata==2024.2 35 | Werkzeug==3.0.4 36 | zipp==3.20.2 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import setuptools 3 | from setuptools import setup 4 | 5 | with open("README.md", "r") as fh: 6 | long_description = fh.read() 7 | 8 | with open("./strkit/VERSION", "r") as vf: 9 | version = vf.read().strip() 10 | 11 | setup( 12 | name="strkit", 13 | version=version, 14 | 15 | python_requires="~=3.10", 16 | install_requires=[ 17 | "Flask>=2.2.5,<3.1", 18 | "orjson>=3.9.15,<4", 19 | "pysam>=0.19,<0.24", 20 | "numpy>=1.23.4,<1.27", 21 | "parasail>=1.2.4,<1.4", 22 | "scikit-learn>=1.2.1,<1.6", 23 | "scipy>=1.10,<1.16", 24 | "statsmodels>=0.14.0,<0.15", 25 | "strkit_rust_ext==0.20.2", 26 | ], 27 | 28 | description="A toolkit for analyzing variation in short(ish) tandem repeats.", 29 | long_description=long_description, 30 | long_description_content_type="text/markdown", 31 | 32 | url="https://github.com/davidlougheed/strkit", 33 | license="GPLv3", 34 | classifiers=[ 35 | "Programming Language :: Python :: 3.10", 
36 | "Programming Language :: Python :: 3.11", 37 | "Programming Language :: Python :: 3.12", 38 | "Programming Language :: Python :: 3 :: Only", 39 | "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", 40 | "Operating System :: POSIX", 41 | ], 42 | 43 | author="David Lougheed", 44 | author_email="david.lougheed@gmail.com", 45 | 46 | packages=setuptools.find_namespace_packages(), 47 | include_package_data=True, 48 | 49 | entry_points={ 50 | "console_scripts": ["strkit=strkit.entry:main"], 51 | }, 52 | ) 53 | -------------------------------------------------------------------------------- /strkit/VERSION: -------------------------------------------------------------------------------- 1 | 0.23.0-dev 2 | -------------------------------------------------------------------------------- /strkit/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | __all__ = [ 4 | "__version__", 5 | ] 6 | 7 | with open(Path(__file__).parent / "VERSION", "r") as vf: 8 | __version__ = vf.read().strip() 9 | -------------------------------------------------------------------------------- /strkit/call/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .allele import call_alleles 4 | from .call_sample import call_sample 5 | from .params import CallParams 6 | 7 | __all__ = [ 8 | "call_alleles", 9 | "call_sample", 10 | "CallParams", 11 | ] 12 | -------------------------------------------------------------------------------- /strkit/call/align_matrix.py: -------------------------------------------------------------------------------- 1 | import parasail 2 | from ..iupac import IUPAC_NUCLEOTIDE_CODES 3 | 4 | __all__ = [ 5 | "dna_codes", 6 | "match_score", 7 | "mismatch_penalty", 8 | "indel_penalty", 9 | "dna_bases", 10 | "dna_matrix", 11 | ] 12 | 13 | 14 | match_score: int = 2 # TODO: parametrize 15 | mismatch_penalty: int = 7 # TODO: parametrize 16 | indel_penalty: int = 5 # TODO: parametrize 17 | 18 | 19 | # TODO: Customize matrix based on error chances 20 | # Create a substitution matrix for alignment. 21 | # Include IUPAC wildcard bases to allow for motifs with multiple possible motifs. 22 | # Include a wildcard base 'X' for very low-confidence base calls, to prevent needlessly harsh penalties - this is 23 | # inserted into a read in place of bases with low PHRED scores. 
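# For example, under this scheme the IUPAC wildcard "R" (A or G) gets the +2 match score against
# either A or G, while "X" scores 0 against every base (a low-confidence base call neither rewards
# nor penalizes a substitution), though gaps opposite it are still penalized as usual.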
24 | dna_bases_str: str = "ACGT" + "".join(IUPAC_NUCLEOTIDE_CODES.keys()) + "X" 25 | dna_bases: dict[str, int] = {b: i for i, b in enumerate(dna_bases_str)} 26 | dna_codes: dict[str, tuple[str, ...]] = { 27 | **IUPAC_NUCLEOTIDE_CODES, 28 | "X": ("A", "C", "G", "T"), # Special character for matching low-quality bases 29 | } 30 | dna_matrix = parasail.matrix_create(dna_bases_str, match_score, -1 * mismatch_penalty) 31 | 32 | for code, code_matches in dna_codes.items(): 33 | for cm in code_matches: 34 | dna_matrix[dna_bases[code], dna_bases[cm]] = 2 if code != "X" else 0 35 | dna_matrix[dna_bases[cm], dna_bases[code]] = 2 if code != "X" else 0 36 | -------------------------------------------------------------------------------- /strkit/call/allele.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # Disable OpenMP/other multithreading since it adds enormous overhead when multiprocessing 4 | import os 5 | os.environ["OMP_NUM_THREADS"] = "1" 6 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 7 | os.environ["MKL_NUM_THREADS"] = "1" 8 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1" 9 | os.environ["NUMEXPR_NUM_THREADS"] = "1" 10 | 11 | # ---------------------------------------------------------------------------------------------------------------------- 12 | 13 | import logging # For type hinting 14 | import numpy as np 15 | import statistics 16 | 17 | from sklearn.exceptions import ConvergenceWarning 18 | from sklearn.mixture import GaussianMixture 19 | from sklearn.preprocessing import normalize 20 | from warnings import simplefilter 21 | 22 | from numpy.typing import NDArray 23 | from typing import Iterable, Literal, TypedDict, Union 24 | 25 | import strkit.constants as cc 26 | 27 | from .params import CallParams 28 | from .utils import get_new_seed 29 | 30 | __all__ = [ 31 | "RepeatCounts", 32 | "CallDict", 33 | "get_n_alleles", 34 | "call_alleles", 35 | ] 36 | 37 | RepeatCounts = list[int] | tuple[int, ...] 
| NDArray[np.int_] 38 | 39 | 40 | # K-means convergence errors - we expect convergence to some extent with homozygous alleles 41 | simplefilter("ignore", category=ConvergenceWarning) 42 | 43 | # TODO: parameterize 44 | small_allele_min = 8 45 | expansion_ratio = 5 46 | N_GM_INIT = 3 47 | 48 | WEIGHT_1_0 = np.array([[1.0]]) 49 | FLOAT_32_EPSILON = np.finfo(np.float32).eps 50 | 51 | CI_PERCENTILE_RANGES = { 52 | "95": (2.5, 97.5), 53 | "99": (0.5, 99.5), 54 | } 55 | 56 | 57 | def _array_as_int(n: NDArray[np.int_] | NDArray[np.float_]) -> NDArray[np.int32]: 58 | return np.rint(n).astype(np.int32) 59 | 60 | 61 | def _calculate_cis(samples, ci: str = Literal["95", "99"]) -> NDArray[np.int32]: 62 | percentiles = np.percentile( 63 | samples, CI_PERCENTILE_RANGES[ci], axis=1, method="interpolated_inverted_cdf" 64 | ).transpose() 65 | return _array_as_int(percentiles) 66 | 67 | 68 | def get_n_alleles(default_n_alleles: int, sample_sex_chroms: str | None, contig: str) -> int | None: 69 | if contig in cc.M_CHROMOSOME_NAMES: 70 | return 1 71 | 72 | if contig in cc.SEX_CHROMOSOMES: 73 | if sample_sex_chroms is None: 74 | return None 75 | if contig in cc.X_CHROMOSOME_NAMES: 76 | return sample_sex_chroms.count("X") 77 | if contig in cc.Y_CHROMOSOME_NAMES: 78 | return sample_sex_chroms.count("Y") 79 | 80 | return default_n_alleles 81 | 82 | 83 | def na_length_list(n_alleles: int): 84 | return [list() for _ in range(n_alleles)] 85 | 86 | 87 | GMMInitParamsMethod = Literal["kmeans", "k-means++"] 88 | 89 | 90 | def make_fitted_gmm(n_components: int, sample_rs: NDArray, init_params: GMMInitParamsMethod, rng: np.random.Generator): 91 | return GaussianMixture( 92 | n_components=n_components, 93 | init_params=init_params, 94 | covariance_type="spherical", 95 | n_init=N_GM_INIT, 96 | random_state=get_new_seed(rng), 97 | ).fit(sample_rs) 98 | 99 | 100 | def fit_gmm( 101 | rng: np.random.Generator, 102 | sample: NDArray, 103 | n_alleles: int, 104 | allele_filter: float, 105 | hq: bool, 106 | gm_filter_factor: int, 107 | init_params: GMMInitParamsMethod = "k-means++", # TODO: parameterize outside 108 | ) -> object | None: 109 | sample_rs = sample.reshape(-1, 1) 110 | g: object | None = None 111 | 112 | n_components: int = n_alleles 113 | while n_components > 0: 114 | if n_components == 1: # Don't need to do the full fit for a single peak, just calculate the parameters 115 | # I've confirmed this gives an ~identical result to fitting a GMM with one parameter. 116 | fake_g: object = type("", (), {})() 117 | fake_g.means_ = np.array([[np.mean(sample_rs)]]) 118 | fake_g.weights_ = WEIGHT_1_0 119 | fake_g.covariances_ = np.array([[np.var(sample_rs)]]) 120 | return fake_g 121 | 122 | g = make_fitted_gmm(n_components, sample_rs, init_params, rng) 123 | 124 | # noinspection PyUnresolvedReferences 125 | means_and_weights = np.append(g.means_.transpose(), g.weights_.reshape(1, -1), axis=0) 126 | 127 | # Filter out peaks that aren't supported by ~min_allele_reads reads by probability, with some delta to 128 | # allow for peaks supported by "most of a read". 
129 | mw_filter_1 = means_and_weights[1, :] > allele_filter 130 | 131 | # Filter out any peaks below some threshold using this magic constant filter factor 132 | # - Exception: Large expansions can have very few supporting reads due to quirks of sequencing beyond 133 | # just chance/read length distribution; if we have 2 alleles and the large one is a lot bigger than 134 | # the small one, don't apply this filter 135 | # - Discard anything below a specific weight threshold and resample means based on remaining weights 136 | # to fill in the gap. E.g. below 1 / (5 * num alleles) - i.e. 5 times less than we expect with equal 137 | # sharing in the worst case where it represents just one allele 138 | if n_components > 2 or (n_components == 2 and (not hq or ( 139 | means_and_weights[0, -1] < expansion_ratio * max(means_and_weights[0, 0], small_allele_min)))): 140 | mw_filter_2 = means_and_weights[1, :] > (1 / (gm_filter_factor * n_components)) 141 | else: 142 | mw_filter_2 = means_and_weights[1, :] > FLOAT_32_EPSILON 143 | 144 | mw_filter = mw_filter_1 & mw_filter_2 145 | n_useless = np.size(mw_filter) - np.count_nonzero(mw_filter) 146 | if not n_useless: 147 | # No useless components left to remove, so return the GMM 148 | return g 149 | n_components -= n_useless 150 | 151 | return g 152 | 153 | 154 | class BaseCallDict(TypedDict): 155 | call: Union[NDArray[np.int32], NDArray[np.float_]] 156 | call_95_cis: Union[NDArray[np.int32], NDArray[np.float_]] # 2D arrays 157 | call_99_cis: Union[NDArray[np.int32], NDArray[np.float_]] # 2D arrays 158 | peaks: NDArray[np.float_] 159 | peak_weights: NDArray[np.float_] 160 | peak_stdevs: NDArray[np.float_] 161 | modal_n_peaks: int 162 | 163 | 164 | class CallDict(BaseCallDict, total=False): 165 | ps: int 166 | 167 | 168 | def make_read_weights(read_weights: Iterable[float] | None, num_reads: int) -> NDArray[np.float_]: 169 | return np.array( 170 | read_weights if read_weights is not None else np.array(([1/num_reads] * num_reads) if num_reads else [])) 171 | 172 | 173 | def call_alleles( 174 | repeats_fwd: NDArray[np.int32], 175 | repeats_rev: NDArray[np.int32], 176 | read_weights_fwd: Iterable[float] | None, 177 | read_weights_rev: Iterable[float] | None, 178 | params: CallParams, 179 | min_reads: int, 180 | n_alleles: int, 181 | separate_strands: bool, 182 | read_bias_corr_min: int, 183 | gm_filter_factor: int, 184 | seed: int | None, 185 | logger_: logging.Logger, 186 | debug_str: str, 187 | ) -> CallDict | None: 188 | fwd_len = repeats_fwd.shape[0] 189 | rev_len = repeats_rev.shape[0] 190 | 191 | fwd_strand_weights = make_read_weights(read_weights_fwd, fwd_len) 192 | rev_strand_weights = make_read_weights(read_weights_rev, rev_len) 193 | 194 | assert repeats_fwd.shape == fwd_strand_weights.shape 195 | assert repeats_rev.shape == rev_strand_weights.shape 196 | 197 | combined_reads = np.concatenate((repeats_fwd, repeats_rev), axis=None) 198 | combined_weights = np.concatenate((fwd_strand_weights, rev_strand_weights), axis=None) 199 | combined_len = combined_reads.shape[-1] 200 | 201 | if combined_len < min_reads: 202 | return None 203 | 204 | # If the locus/allele only has one value, don't bother bootstrapping 205 | if np.unique(combined_reads).shape[0] == 1: 206 | logger_.debug(f"{debug_str} - skipping bootstrap / GMM fitting for allele(s) (single value)") 207 | cn = combined_reads[0] 208 | 209 | call = _array_as_int(np.full(n_alleles, cn)) 210 | call_cis = _array_as_int(np.full((n_alleles, 2), cn)) 211 | 212 | peaks: NDArray[np.float_] = 
call.astype(np.float_) 213 | 214 | return { 215 | "call": call, 216 | "call_95_cis": call_cis, 217 | "call_99_cis": call_cis, 218 | "peaks": peaks, 219 | "peak_weights": np.full(n_alleles, 1.0 / n_alleles), 220 | "peak_stdevs": np.full(n_alleles, 0.0), 221 | "modal_n_peaks": 1, # 1 peak, since we have 1 value 222 | } 223 | 224 | nal = na_length_list(n_alleles) 225 | allele_samples = np.array(nal, dtype=np.float32) 226 | allele_weight_samples = np.array(nal, dtype=np.float32) 227 | allele_stdev_samples = np.array(nal, dtype=np.float32) 228 | sample_peaks = np.array([], dtype=np.int32) 229 | 230 | rng: np.random.Generator = np.random.default_rng(seed=seed) 231 | 232 | # Perform a number of bootstrap iterations to get a 95% CI and more accurate estimate of repeat counts / differences 233 | 234 | if separate_strands and fwd_len >= read_bias_corr_min and rev_len >= read_bias_corr_min: 235 | target_length: int = max(fwd_len, rev_len) 236 | 237 | # Resample original sample, correcting for imbalances between 238 | # forward and reverse-strand reads along the way 239 | # (if we've passed the coverage threshold) 240 | 241 | fwd_strand_samples = rng.choice( 242 | repeats_fwd, 243 | size=(params.num_bootstrap, target_length), 244 | replace=True, 245 | p=fwd_strand_weights, 246 | ) 247 | 248 | rev_strand_samples = rng.choice( 249 | repeats_rev, 250 | size=(params.num_bootstrap, target_length), 251 | replace=True, 252 | p=rev_strand_weights, 253 | ) 254 | 255 | concat_samples = np.sort( 256 | np.concatenate((fwd_strand_samples, rev_strand_samples), axis=1), 257 | kind="stable") 258 | 259 | else: 260 | concat_samples = np.sort( 261 | rng.choice( 262 | combined_reads, 263 | size=(params.num_bootstrap, combined_len), 264 | replace=True, 265 | p=combined_weights, 266 | ) if params.num_bootstrap > 1 else np.array([combined_reads]), 267 | kind="stable") 268 | 269 | gmm_cache = {} 270 | 271 | def _get_fitted_gmm(s: NDArray[np.int_] | NDArray[np.float_]) -> object | None: 272 | if (s_t := s.tobytes()) not in gmm_cache: 273 | # Fit Gaussian mixture model to the resampled data 274 | gmm_cache[s_t] = fit_gmm(rng, s, n_alleles, allele_filter, params.hq, gm_filter_factor) 275 | 276 | return gmm_cache[s_t] 277 | 278 | # Filter out peaks that aren't supported by ~min_allele_reads reads by probability, with some delta to 279 | # allow for peaks supported by "most of a read". 
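# Added illustrative arithmetic (not in the original source): with the default
# min_allele_reads = 2 and num_bootstrap = 100, concat_samples has 100 rows, so
# the threshold computed below is (2 - 0.1) / 100 = 0.019; any GMM component
# whose weight does not exceed this value fails mw_filter_1 in fit_gmm() above.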
280 | allele_filter = (params.min_allele_reads - 0.1) / concat_samples.shape[0] 281 | 282 | for i in range(params.num_bootstrap): 283 | sample = concat_samples[i, :] 284 | 285 | g: object | None = _get_fitted_gmm(sample) 286 | if not g: 287 | # Could not fit any Gaussian mixture; skip this allele 288 | return None 289 | 290 | # Keep track of how many alleles were found for 291 | # noinspection PyUnresolvedReferences 292 | sample_peaks = np.append(sample_peaks, g.means_.shape[0]) 293 | 294 | # noinspection PyUnresolvedReferences 295 | means_and_weights = np.append(g.means_.transpose(), g.weights_.reshape(1, -1), axis=0) 296 | 297 | means = means_and_weights[0, :] 298 | weights = means_and_weights[1, :] 299 | # noinspection PyUnresolvedReferences 300 | stdevs = np.sqrt(g.covariances_) 301 | n_to_resample = n_alleles - means.shape[0] 302 | 303 | if n_to_resample: 304 | # Re-sample means if any are removed, based on weights (re-normalized), to match total # of alleles 305 | resampled_indices = rng.choice( 306 | np.arange(len(means)), 307 | size=n_to_resample, 308 | p=normalize(weights.reshape(1, -1), norm="l1").flatten()) 309 | resampled_means = np.append(means, means[resampled_indices]) 310 | resampled_weights = np.append(weights, weights[resampled_indices]) 311 | resampled_stdevs = np.append(stdevs, stdevs[resampled_indices]) 312 | else: 313 | resampled_means = means 314 | resampled_weights = weights 315 | resampled_stdevs = stdevs 316 | 317 | argsorted_means = np.argsort(resampled_means, axis=0, kind="stable") 318 | sorted_allele_estimates = resampled_means[argsorted_means].reshape(-1, 1) 319 | sorted_allele_weight_estimates = resampled_weights[argsorted_means].reshape(-1, 1) 320 | sorted_allele_stdev_estimates = resampled_stdevs[argsorted_means].reshape(-1, 1) 321 | 322 | allele_samples = np.append(allele_samples, sorted_allele_estimates, axis=1) 323 | allele_weight_samples = np.append(allele_weight_samples, sorted_allele_weight_estimates, axis=1) 324 | allele_stdev_samples = np.append(allele_stdev_samples, sorted_allele_stdev_estimates, axis=1) 325 | 326 | # Calculate 95% and 99% confidence intervals for each allele from the bootstrap distributions. 327 | allele_samples_argsort = allele_samples.argsort(axis=1, kind="stable") 328 | allele_samples = np.take_along_axis(allele_samples, allele_samples_argsort, axis=1) 329 | allele_cis_95 = _calculate_cis(allele_samples, ci="95") 330 | allele_cis_99 = _calculate_cis(allele_samples, ci="99") 331 | allele_weight_samples = np.take_along_axis(allele_weight_samples, allele_samples_argsort, axis=1) 332 | allele_stdev_samples = np.take_along_axis(allele_stdev_samples, allele_samples_argsort, axis=1) 333 | 334 | sample_peaks.sort(kind="stable") # To make mode consistent, given same set of peak #s 335 | 336 | # TODO: Calculate CIs based on Gaussians from allele samples instead? Ask someone... 337 | # - Could take median of 2.5 percentiles and 97.5 percentiles from Gaussians instead, median of means 338 | 339 | # Report the median estimates and the confidence intervals. 340 | # - we choose nearest for median rather than interpolating, so we can get real corresponding weights and stdevs. 
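# Added worked example of the "nearest" median (not in the original source):
# with num_bootstrap = 100, allele_samples holds 100 sorted bootstrap columns
# per allele, so median_idx = 100 // 2 = 50 below; the reported call for each
# allele is the estimate in sorted position 50, rounded to the nearest integer,
# and the reported peak weight/stdev are taken from that same column.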
341 | 342 | median_idx = allele_samples.shape[1] // 2 # 343 | medians_of_means = allele_samples[:, median_idx] 344 | medians_of_means_final = np.rint(medians_of_means).astype(np.int32) 345 | peak_weights = allele_weight_samples[:, median_idx].flatten() 346 | peak_stdevs = allele_stdev_samples[:, median_idx] 347 | modal_n_peaks: int = statistics.mode(sample_peaks).item() 348 | 349 | peak_weights /= peak_weights.sum() # re-normalize weights 350 | 351 | return { 352 | "call": medians_of_means_final.flatten(), 353 | "call_95_cis": allele_cis_95, 354 | "call_99_cis": allele_cis_99, 355 | 356 | "peaks": medians_of_means.flatten(), # Don't round, so we can recover original Gaussian model 357 | "peak_weights": peak_weights, 358 | "peak_stdevs": peak_stdevs.flatten(), 359 | # TODO: should be ok to use this, because resample gets put at end, vertically (3rd allele in a 3-ploid case) 360 | # so taking the first 2 alleles still works in terms of stdev/mean estimates? I think? 361 | # Not quite, cause it's sorted... 362 | # --> Only do the peak assignment with 1/2 peaks, which is the majority of human situations 363 | "modal_n_peaks": modal_n_peaks, 364 | } 365 | -------------------------------------------------------------------------------- /strkit/call/cigar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from strkit_rust_ext import get_aligned_pair_matches 5 | 6 | __all__ = [ 7 | "decode_cigar_np", 8 | "get_aligned_pair_matches", 9 | ] 10 | 11 | 12 | def decode_cigar_np(encoded_cigar: NDArray[np.uint32]) -> NDArray[np.uint32]: 13 | return np.stack((np.bitwise_and(encoded_cigar, 15), np.right_shift(encoded_cigar, 4)), axis=1) 14 | -------------------------------------------------------------------------------- /strkit/call/non_daemonic_pool.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | __all__ = [ 4 | "NonDaemonicPool", 5 | ] 6 | 7 | 8 | # Need a pool which itself can spawn realignment processes - see https://stackoverflow.com/a/53180921 9 | 10 | 11 | class NonDaemonicProcess(mp.Process): 12 | @property 13 | def daemon(self): 14 | return False 15 | 16 | @daemon.setter 17 | def daemon(self, value): 18 | pass 19 | 20 | 21 | class NonDaemonicContext(type(mp.get_context())): 22 | Process = NonDaemonicProcess 23 | 24 | 25 | class NonDaemonicPool(mp.pool.Pool): 26 | # noinspection PyArgumentList 27 | def __init__(self, *args, **kwargs): 28 | kwargs["context"] = NonDaemonicContext() 29 | super().__init__(*args, **kwargs) 30 | -------------------------------------------------------------------------------- /strkit/call/output/__init__.py: -------------------------------------------------------------------------------- 1 | from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer 2 | from .tsv import output_tsv 3 | from .vcf import build_vcf_header, output_contig_vcf_lines 4 | 5 | __all__ = [ 6 | "output_json_report_header", 7 | "output_json_report_results", 8 | "output_json_report_footer", 9 | "output_tsv", 10 | "build_vcf_header", 11 | "output_contig_vcf_lines", 12 | ] 13 | -------------------------------------------------------------------------------- /strkit/call/output/json_report.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Callable, Literal 3 | 4 | from strkit import __version__ 5 | from 
strkit.json import Serializable, dumps, dumps_indented 6 | 7 | from ..params import CallParams 8 | from ..types import LocusResult 9 | 10 | __all__ = [ 11 | "output_json_report_header", 12 | "output_json_report_results", 13 | "output_json_report_footer", 14 | ] 15 | 16 | 17 | def _get_dfn(indent_json: bool) -> Callable[[Serializable], bytes]: 18 | return dumps_indented if indent_json else dumps 19 | 20 | 21 | def _write_bytes(b: bytes, json_path: str, mode: Literal["wb", "ab"]): 22 | if json_path == "stdout": 23 | sys.stdout.buffer.write(b) 24 | sys.stdout.flush() 25 | else: 26 | with open(json_path, mode) as jf: 27 | # noinspection PyTypeChecker 28 | jf.write(b) 29 | 30 | 31 | def output_json_report_header(params: CallParams, contig_set: set[str], json_path: str, indent_json: bool): 32 | json_report_header = { 33 | "sample_id": params.sample_id, 34 | "caller": { 35 | "name": "strkit", 36 | "version": __version__, 37 | }, 38 | "parameters": params.to_dict(as_inputted=True), 39 | "contigs": tuple(contig_set), 40 | } 41 | 42 | dfn = _get_dfn(indent_json) 43 | header_serialized: bytes = dfn(json_report_header)[:(-2 if indent_json else -1)] # remove trailing ending brace 44 | 45 | # kludge: build up a portion of the JSON file, so we can output contig results as they come instead of storing them 46 | # in memory until the end of the run. 47 | header_serialized += b"," 48 | if indent_json: 49 | header_serialized += b'\n "results": [\n' 50 | else: 51 | header_serialized += b'"results":[' 52 | 53 | # write partial JSON 54 | _write_bytes(header_serialized, json_path, "wb") 55 | 56 | 57 | def output_json_report_results(results: tuple[LocusResult, ...], is_last: bool, json_path: str, indent_json: bool): 58 | dfn = _get_dfn(indent_json) 59 | results_bytes: bytes = dfn(results) 60 | 61 | if indent_json: 62 | results_bytes = results_bytes[2:-2] # remove opening and closing "[]" + trailing newline 63 | if not is_last: 64 | results_bytes += b",\n" 65 | else: 66 | results_bytes = results_bytes[1:-1] # remove opening and closing "[]" 67 | if not is_last: 68 | results_bytes += b"," 69 | 70 | # write results "rows" 71 | _write_bytes(results_bytes, json_path, "ab") 72 | 73 | 74 | def output_json_report_footer(time_taken: float, json_path: str, indent_json: bool): 75 | runtime_bytes = dumps(time_taken) 76 | if indent_json: 77 | footer_bytes = b'\n ],\n "runtime": ' + runtime_bytes + b'\n}\n' 78 | else: 79 | footer_bytes = b'],"runtime":' + runtime_bytes + b'}\n' 80 | 81 | # write partial JSON 82 | _write_bytes(footer_bytes, json_path, "ab") 83 | -------------------------------------------------------------------------------- /strkit/call/output/tsv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | __all__ = ["output_tsv"] 4 | 5 | 6 | def _cn_to_str(cn: int | float) -> str: 7 | return f"{cn:.1f}" if isinstance(cn, float) else str(cn) 8 | 9 | 10 | def output_tsv(results: tuple[dict, ...], has_snv_vcf: bool): 11 | for res in results: 12 | has_call = res["call"] is not None 13 | # n_peaks = res["peaks"]["modal_n"] 14 | 15 | ref_cn = res.get("ref_cn") 16 | reads = res.get("reads") 17 | 18 | sys.stdout.write("\t".join(( 19 | res["contig"], 20 | str(res["start"]), 21 | str(res["end"]), 22 | res["motif"], 23 | _cn_to_str(ref_cn) if ref_cn is not None else ".", 24 | ",".join(map(_cn_to_str, sorted(r["cn"] for r in reads.values()))) if reads else ".", 25 | "|".join(map(_cn_to_str, res["call"])) if has_call else ".", 26 | ("|".join("-".join(map(_cn_to_str, gc)) for 
gc in res["call_95_cis"]) if has_call else "."), 27 | # *((res["assign_method"] if has_call else ".",) if incorporate_snvs else ()), 28 | *((res["assign_method"] if has_call else ".",) if has_snv_vcf else ()), 29 | 30 | # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["means"][:n_peaks])) 31 | # if has_call and n_peaks <= 2 else "."), 32 | # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["weights"][:n_peaks])) 33 | # if has_call and n_peaks <= 2 else "."), 34 | # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["stdevs"][:n_peaks])) 35 | # if has_call and n_peaks <= 2 else "."), 36 | )) + "\n") 37 | -------------------------------------------------------------------------------- /strkit/call/output/vcf.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | 4 | from collections import Counter 5 | from os.path import commonprefix 6 | from pathlib import Path 7 | from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord 8 | from typing import Iterable 9 | 10 | from strkit.utils import cat_strs, is_none, idx_0_getter 11 | from ..allele import get_n_alleles 12 | from ..params import CallParams 13 | from ..utils import cn_getter 14 | 15 | __all__ = [ 16 | "build_vcf_header", 17 | "output_contig_vcf_lines", 18 | ] 19 | 20 | 21 | # VCF_ALLELE_CNV_TR = "" 22 | 23 | # VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = ( 24 | # ("SVLEN", "A", "Integer", "Length of the structural variant"), 25 | # ("CN", "A", "Float", "Copy number of allele"), 26 | # ("RN", "A", "Integer", "Total number of repeat sequences in this allele"), 27 | # ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"), 28 | # ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"), 29 | # ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"), 30 | # ("CIRUC", ".", "Float", "Confidence interval around RUC"), 31 | # ("CIRB", ".", "Integer", "Confidence interval around RB"), 32 | # ) 33 | 34 | VCF_INFO_VT = "VT" 35 | VCF_INFO_MOTIF = "MOTIF" 36 | VCF_INFO_REFMC = "REFMC" 37 | VCF_INFO_ANCH = "ANCH" 38 | 39 | VT_STR = "str" 40 | VT_SNV = "snv" 41 | 42 | 43 | def iter_to_upper(x: Iterable[str]) -> Iterable[str]: 44 | # noinspection PyTypeChecker 45 | return map(str.upper, x) 46 | 47 | 48 | def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader: 49 | vh = VariantHeader() # automatically sets VCF version to 4.2 50 | 51 | # Add an absolute path to the reference genome 52 | vh.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}") 53 | 54 | # Add all contigs from the reference genome file + lengths 55 | rf = FastaFile(reference_file) 56 | try: 57 | for contig in rf.references: 58 | vh.contigs.add(contig, length=rf.get_reference_length(contig)) 59 | finally: 60 | rf.close() 61 | 62 | # Add CNV:TR alt type (symbolic allele: tandem repeat) 63 | # vh.add_meta("ALT", "") 64 | 65 | # Set up basic VCF formats 66 | vh.formats.add("AD", ".", "Integer", "Read depth for each allele") 67 | vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence") 68 | vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)") 69 | vh.formats.add("DP", 1, "Integer", "Read depth") 70 | vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)") 71 | vh.formats.add("GT", 1, "String", "Genotype") 72 | vh.formats.add("MC", ".", "Integer", 
"Motif copy number for each allele") 73 | vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele") 74 | vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele") 75 | vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.") 76 | vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call") 77 | vh.formats.add("PS", 1, "Integer", "Phase set") 78 | vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") 79 | 80 | # Set up VCF info fields 81 | vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)") 82 | vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string") 83 | vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome") 84 | vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size") 85 | 86 | # Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO 87 | # for iv in VCF_TR_INFO_RECORDS: 88 | # vh.info.add(*iv) 89 | 90 | # Add the sample 91 | vh.add_sample(sample_id) 92 | 93 | return vh 94 | 95 | 96 | def _vr_pos_key(vr: VariantRecord) -> int: 97 | return vr.pos 98 | 99 | 100 | def _reversed_str(s: str) -> str: 101 | return cat_strs(reversed(s)) 102 | 103 | 104 | @functools.cache 105 | def _blank_entry(n_alleles: int) -> tuple[None, ...]: 106 | return tuple([None] * n_alleles) 107 | 108 | 109 | def output_contig_vcf_lines( 110 | params: CallParams, 111 | sample_id: str, 112 | variant_file: VariantFile, 113 | results: tuple[dict, ...], 114 | logger: logging.Logger, 115 | ) -> None: 116 | variant_records: list[VariantRecord] = [] 117 | 118 | # has_at_least_one_snv_set = next((r.get("snvs") is not None for r in results), None) is not None 119 | snvs_written: set[str] = set() 120 | 121 | for result_idx, result in enumerate(results, 1): 122 | contig = result["contig"] 123 | start = result["start"] 124 | 125 | if "ref_start_anchor" not in result: 126 | logger.debug(f"No ref anchor for {contig}:{start}; skipping VCF output for locus") 127 | continue 128 | 129 | ref_start_anchor = result["ref_start_anchor"].upper() 130 | ref_seq = result["ref_seq"].upper() 131 | 132 | n_alleles: int = get_n_alleles(2, params.sex_chroms, contig) or 2 133 | 134 | res_reads = result["reads"] 135 | res_peaks = result["peaks"] or {} 136 | 137 | peak_seqs_and_methods = {(seq.upper() if seq else seq): method for seq, method in res_peaks.get("seqs", [])} 138 | peak_seqs: tuple[str, ...] = tuple(peak_seqs_and_methods.keys()) 139 | peak_start_anchor_seqs: list[str] = list(map(idx_0_getter, res_peaks.get("start_anchor_seqs", []))) 140 | 141 | if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks 142 | logger.error(f"Encountered None in results[{result_idx}].peaks.seqs: {peak_seqs}") 143 | continue 144 | 145 | if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks 146 | logger.error(f"Encountered None in results[{result_idx}].peaks.start_anchor_seqs: {peak_start_anchor_seqs}") 147 | continue 148 | 149 | peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs)) 150 | common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper]) 151 | # anchor_offset = how many bases we can cut off from the front of the anchor 152 | # since they're shared between all alleles - yields a more compact representation. 153 | # - we need to leave one base as an anchor for VCF compliance though, thus the min(...) 
154 | anchor_offset = min(len(common_anchor_prefix), params.vcf_anchor_size - 1) 155 | 156 | ref_start_anchor = ref_start_anchor[anchor_offset:] 157 | ref_seq_with_anchor = ref_start_anchor + ref_seq 158 | 159 | seqs_with_anchors: list[tuple[str, str]] = list( 160 | zip(peak_seqs, map(lambda a: a[anchor_offset:], peak_start_anchor_seqs_upper)) 161 | ) 162 | 163 | if 0 < len(peak_seqs) < n_alleles: 164 | peak_seqs = tuple([peak_seqs[0]] * n_alleles) 165 | seqs_with_anchors = [seqs_with_anchors[0]] * n_alleles 166 | 167 | seq_alts = sorted( 168 | set(filter(lambda c: not (c[1] + c[0] == ref_seq_with_anchor), seqs_with_anchors)), 169 | key=lambda c: c[1] + c[0] 170 | ) 171 | 172 | call = result["call"] 173 | call_95_cis = result["call_95_cis"] 174 | 175 | seq_alleles_raw: tuple[str | None, ...] = ( 176 | ((ref_seq, ref_start_anchor), *(seq_alts or (None,))) 177 | if call is not None 178 | else () 179 | ) 180 | 181 | seq_alleles: list[str] = [ref_seq_with_anchor] 182 | 183 | if call is not None and seq_alts: 184 | # If we have a complete deletion, including the anchor, use a symbolic allele meaning "upstream deletion" 185 | for alt_tr_seq, alt_anchor in seq_alts: 186 | if not alt_tr_seq and not alt_anchor: 187 | seq_alleles.append("*") 188 | continue 189 | seq_alleles.append(alt_anchor + alt_tr_seq) 190 | else: 191 | seq_alleles.append(".") 192 | 193 | start = result.get("start_adj", start) - len(ref_start_anchor) 194 | 195 | vr: VariantRecord = variant_file.new_record( 196 | contig=contig, 197 | start=start, 198 | alleles=seq_alleles, 199 | ) 200 | 201 | vr.info[VCF_INFO_VT] = VT_STR 202 | vr.info[VCF_INFO_MOTIF] = result["motif"] 203 | vr.info[VCF_INFO_REFMC] = result["ref_cn"] 204 | vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset 205 | 206 | vr.samples[sample_id]["GT"] = ( 207 | tuple(map(seq_alleles_raw.index, seqs_with_anchors)) 208 | if call is not None and peak_seqs 209 | else _blank_entry(n_alleles) 210 | ) 211 | 212 | if am := result.get("assign_method"): 213 | vr.samples[sample_id]["PM"] = am 214 | 215 | str_snvs = result.get("snvs", ()) 216 | if str_snvs: 217 | # Record number of support SNVs for the locus 218 | vr.samples[sample_id]["NSNV"] = len(str_snvs) 219 | 220 | vr.samples[sample_id]["DP"] = len(res_reads) 221 | vr.samples[sample_id]["MMAS"] = result.get("mean_model_align_score") 222 | 223 | if call is not None and res_peaks: 224 | vr.samples[sample_id]["DPS"] = sum(res_peaks["n_reads"]) 225 | vr.samples[sample_id]["AD"] = tuple(res_peaks["n_reads"]) 226 | vr.samples[sample_id]["MC"] = tuple(map(int, call)) 227 | vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis) 228 | 229 | vr.samples[sample_id]["ANCL"] = tuple(len(ar[1]) for ar in seq_alleles_raw if ar is not None) 230 | 231 | # For each alt, mention which consensus method was used to obtain the sequence. 
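# Added illustrative example (not in the original source): for a heterozygous
# call with two distinct alt sequences, CONS might come out as ("poa", "single")
# - one method label per alt allele, drawn from the single/poa/best_rep values
# declared for the CONS format field in build_vcf_header() above.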
232 | cons = tuple( 233 | peak_seqs_and_methods[ar[0]] for ar in seq_alleles_raw[1:] if ar is not None 234 | ) 235 | vr.samples[sample_id]["CONS"] = cons if cons else (".",) 236 | 237 | # Produces a histogram-like format for read-level copy numbers 238 | # e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7x1|8x10|9x1,8x2|9x12 239 | vr.samples[sample_id]["MCRL"] = tuple( 240 | "|".join( 241 | map( 242 | lambda pair: "x".join(map(str, pair)), 243 | sorted( 244 | Counter( 245 | map(cn_getter, filter(lambda r: r.get("p") == pi, res_reads.values())) 246 | ).items() 247 | ) 248 | ) 249 | ) 250 | for pi in range(res_peaks["modal_n"]) 251 | ) 252 | 253 | ps = result["ps"] 254 | 255 | try: 256 | if ps is not None: # have phase set on call, so mark as phased 257 | vr.samples[sample_id].phased = True 258 | vr.samples[sample_id]["PS"] = ps 259 | except TypeError: 260 | vr.samples[sample_id].phased = False 261 | logger.error(f"Received bad PS value while writing VCF record at {contig}:{start} - {ps}") 262 | ps = None 263 | 264 | for snv in str_snvs: 265 | snv_id = snv["id"] 266 | if snv_id in snvs_written: 267 | continue 268 | snvs_written.add(snv_id) 269 | 270 | ref = snv["ref"] 271 | snv_alts = tuple(sorted(set(filter(lambda v: v != ref, snv["call"])))) 272 | snv_alleles = (ref, *snv_alts) 273 | snv_pos = snv["pos"] 274 | 275 | if len(snv_alleles) < 2: 276 | logger.error(f"Error while writing VCF: SNV ({snv_id}) at {contig}:{snv_pos+1} has no alts") 277 | continue 278 | 279 | snv_vr: VariantRecord = variant_file.new_record( 280 | contig=contig, 281 | id=snv_id, 282 | start=snv_pos, 283 | stop=snv_pos + 1, 284 | alleles=snv_alleles, 285 | ) 286 | 287 | snv_vr.info[VCF_INFO_VT] = VT_SNV 288 | 289 | snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"])) 290 | snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) 291 | snv_vr.samples[sample_id]["AD"] = snv["rcs"] 292 | 293 | if ps is not None: 294 | snv_vr.samples[sample_id].phased = True 295 | snv_vr.samples[sample_id]["PS"] = ps 296 | 297 | variant_records.append(snv_vr) 298 | 299 | variant_records.append(vr) 300 | 301 | # sort the variant records by position 302 | variant_records.sort(key=_vr_pos_key) 303 | 304 | # write them to the VCF 305 | for vrr in variant_records: 306 | variant_file.write(vrr) 307 | -------------------------------------------------------------------------------- /strkit/call/params.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | 4 | from pysam import AlignmentFile 5 | 6 | from ..logger import log_levels 7 | 8 | __all__ = ["CallParams"] 9 | 10 | 11 | class CallParams: 12 | def __init__( 13 | self, 14 | 15 | logger: logging.Logger, 16 | 17 | read_file: str, 18 | reference_file: str, 19 | loci_file: str, 20 | sample_id: str | None, 21 | min_reads: int = 4, 22 | min_allele_reads: int = 2, 23 | max_reads: int = 250, 24 | min_avg_phred: int = 13, 25 | min_read_align_score: float = 0.9, 26 | max_rcn_iters: int = 50, 27 | num_bootstrap: int = 100, 28 | flank_size: int = 70, 29 | skip_supplementary: bool = False, 30 | skip_secondary: bool = False, 31 | sex_chroms: str | None = None, 32 | realign: bool = False, 33 | hq: bool = False, 34 | use_hp: bool = False, 35 | snv_vcf: pathlib.Path | None = None, 36 | snv_min_base_qual: int = 20, 37 | targeted: bool = False, 38 | respect_ref: bool = False, 39 | count_kmers: str = "none", # "none" | "peak" | "read" 40 | consensus: bool = False, 41 | vcf_anchor_size: int = 5, 42 | # 
--- 43 | log_level: int = logging.WARNING, 44 | seed: int | None = None, 45 | processes: int = 1, 46 | ): 47 | self.read_file: str = read_file 48 | self.reference_file: str = reference_file 49 | self.loci_file: str = loci_file 50 | self.min_reads: int = min_reads 51 | self.min_allele_reads: int = min_allele_reads 52 | self.max_reads: int = max_reads 53 | self.min_avg_phred: int = min_avg_phred 54 | self.min_read_align_score: float = min_read_align_score 55 | self.max_rcn_iters: int = max_rcn_iters 56 | self.num_bootstrap: int = num_bootstrap 57 | self.flank_size: int = flank_size 58 | self.skip_supplementary: bool = skip_supplementary 59 | self.skip_secondary: bool = skip_secondary 60 | self.sex_chroms: str | None = sex_chroms 61 | self.realign: bool = realign 62 | self.hq: bool = hq 63 | self.use_hp: bool = use_hp 64 | self.snv_vcf: pathlib.Path | None = snv_vcf 65 | self.snv_min_base_qual: int = snv_min_base_qual 66 | self.targeted: bool = targeted 67 | self.respect_ref: bool = respect_ref 68 | self.count_kmers: str = count_kmers 69 | self.consensus: bool = consensus 70 | self.vcf_anchor_size: int = vcf_anchor_size 71 | # --- 72 | self.log_level: int = log_level 73 | self.seed: int | None = seed 74 | self.processes: int = processes 75 | 76 | bf = AlignmentFile(read_file, reference_filename=reference_file) 77 | 78 | # noinspection PyTypeChecker 79 | bfh = bf.header.to_dict() 80 | 81 | sns: set[str] = {e.get("SM") for e in bfh.get("RG", ()) if e.get("SM")} 82 | bam_sample_id: str | None = None 83 | 84 | if len(sns) > 1: 85 | # Error or warning or what? 86 | sns_str = "', '".join(sns) 87 | logger.warning(f"Found more than one sample ID in BAM file(s): '{sns_str}'") 88 | elif not sns: 89 | if not sample_id: 90 | logger.warning("Could not find sample ID in BAM file(s); sample ID can be set manually via --sample-id") 91 | else: 92 | bam_sample_id = sns.pop() 93 | 94 | self._sample_id_orig: str | None = sample_id 95 | self.sample_id = sample_id or bam_sample_id 96 | 97 | @classmethod 98 | def from_args(cls, logger: logging.Logger, p_args): 99 | return cls( 100 | logger, 101 | p_args.read_file, 102 | p_args.ref, 103 | p_args.loci, 104 | sample_id=p_args.sample_id, 105 | min_reads=p_args.min_reads, 106 | min_allele_reads=p_args.min_allele_reads, 107 | max_reads=p_args.max_reads, 108 | min_avg_phred=p_args.min_avg_phred, 109 | min_read_align_score=p_args.min_read_align_score, 110 | max_rcn_iters=p_args.max_rcn_iters, 111 | num_bootstrap=p_args.num_bootstrap, 112 | flank_size=p_args.flank_size, 113 | skip_supplementary=p_args.skip_supplementary, 114 | skip_secondary=p_args.skip_secondary, 115 | sex_chroms=p_args.sex_chr, 116 | realign=p_args.realign, 117 | hq=p_args.hq, 118 | use_hp=p_args.use_hp, 119 | snv_vcf=p_args.incorporate_snvs, 120 | snv_min_base_qual=p_args.snv_min_base_qual, 121 | targeted=p_args.targeted, 122 | respect_ref=p_args.respect_ref, 123 | count_kmers=p_args.count_kmers, 124 | consensus=p_args.consensus or not (not p_args.vcf), # Consensus calculation is required for VCF output. 
125 | vcf_anchor_size=min(max(p_args.vcf_anchor_size, 1), p_args.flank_size), 126 | # --- 127 | log_level=log_levels[p_args.log_level], 128 | seed=p_args.seed, 129 | processes=p_args.processes, 130 | ) 131 | 132 | def to_dict(self, as_inputted: bool = False): 133 | return { 134 | "read_file": self.read_file, 135 | "reference_file": self.reference_file, 136 | "min_reads": self.min_reads, 137 | "min_allele_reads": self.min_allele_reads, 138 | "max_reads": self.max_reads, 139 | "min_avg_phred": self.min_avg_phred, 140 | "min_read_align_score": self.min_read_align_score, 141 | "max_rcn_iters": self.max_rcn_iters, 142 | "num_bootstrap": self.num_bootstrap, 143 | "flank_size": self.flank_size, 144 | "skip_supplementary": self.skip_supplementary, 145 | "skip_secondary": self.skip_secondary, 146 | "sample_id": self._sample_id_orig if as_inputted else self.sample_id, 147 | "realign": self.realign, 148 | "hq": self.hq, 149 | "use_hp": self.use_hp, 150 | "snv_vcf": str(self.snv_vcf) if self.snv_vcf else None, 151 | "snv_min_base_qual": self.snv_min_base_qual, 152 | "targeted": self.targeted, 153 | "respect_ref": self.respect_ref, 154 | "count_kmers": self.count_kmers, 155 | "consensus": self.consensus, 156 | "vcf_anchor_size": self.vcf_anchor_size, 157 | "log_level": self.log_level, 158 | "seed": self.seed, 159 | "processes": self.processes, 160 | } 161 | -------------------------------------------------------------------------------- /strkit/call/realign.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import multiprocessing as mp 3 | import numpy as np 4 | import os 5 | import parasail 6 | import queue 7 | import time 8 | 9 | from numpy.typing import NDArray 10 | 11 | from .align_matrix import match_score, dna_matrix 12 | from .cigar import decode_cigar_np, get_aligned_pair_matches 13 | from .params import CallParams 14 | from .utils import calculate_seq_with_wildcards 15 | 16 | __all__ = [ 17 | "MatchedCoordPairListOrNone", 18 | "realign_read", 19 | "perform_realign", 20 | ] 21 | 22 | 23 | min_realign_score_ratio: float = 0.95 # TODO: parametrize 24 | realign_indel_open_penalty: int = 7 # TODO: parametrize 25 | max_ref_len_for_same_proc: int = 1200 # TODO: parametrize 26 | max_read_len_for_same_proc: int = 20000 # TODO: parametrize 27 | 28 | 29 | MatchedCoordPairList = tuple[NDArray[np.uint64], NDArray[np.uint64]] 30 | MatchedCoordPairListOrNone = MatchedCoordPairList | None 31 | 32 | 33 | def realign_read( 34 | ref_seq: str, 35 | query_seq: str, 36 | left_flank_coord: int, 37 | flank_size: int, 38 | rn: str, 39 | t_idx: int, 40 | always_realign: bool, 41 | q, # mp.Queue | None 42 | log_level: int = logging.WARNING, 43 | ) -> MatchedCoordPairListOrNone: 44 | # Have to re-attach logger in separate process I guess 45 | 46 | def ret_q(v: MatchedCoordPairListOrNone) -> MatchedCoordPairListOrNone: 47 | if q: 48 | q.put(v) 49 | q.close() 50 | return v 51 | 52 | from strkit.logger import create_process_logger 53 | lg = create_process_logger(os.getpid(), log_level) 54 | 55 | # flipped: 'ref sequence' as query here, since it should in general be shorter (!) 
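# Added note (not in the original source; my reading of parasail's naming
# convention, so treat this as an assumption): sg_dx_* is a semi-global
# alignment in which end gaps on the second ("database") sequence - here the
# read, given the flipped argument order below - are not penalized; "trace"
# keeps the traceback needed to recover a CIGAR, "scan" selects the vectorized
# scan implementation, and "16" is the integer width used for scoring.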
56 | pr = parasail.sg_dx_trace_scan_16( 57 | # fetch an extra base for the right flank coordinate check later (needs to be >= the exclusive coord) 58 | ref_seq, query_seq, realign_indel_open_penalty, 0, dna_matrix) 59 | 60 | if pr.score < (th := min_realign_score_ratio * (flank_size * 2 * match_score - realign_indel_open_penalty)): 61 | lg.debug(f"Realignment for {rn} scored below threshold ({pr.score} < {th:.2f})") 62 | return ret_q(None) 63 | 64 | lg.debug( 65 | f"Realigned {rn} in locus {t_idx}{' (due to soft clipping)' if not always_realign else ''}: scored {pr.score}; " 66 | f"Flipped CIGAR: {pr.cigar.decode.decode('ascii')}") 67 | 68 | matches = get_aligned_pair_matches(decode_cigar_np(pr.cigar.seq), left_flank_coord, 0) 69 | res: MatchedCoordPairList = (matches[1], matches[0]) 70 | return ret_q(res) 71 | 72 | 73 | def perform_realign( 74 | t_idx: int, 75 | left_flank_coord: int, 76 | ref_total_seq: str, 77 | rn: str, 78 | qs: str, 79 | fqqs: NDArray[np.uint8], 80 | # --- 81 | params: CallParams, 82 | realign_timeout: int, 83 | force_realign: bool, 84 | # --- 85 | logger_: logging.Logger, 86 | locus_log_str: str, 87 | ) -> MatchedCoordPairListOrNone: 88 | qs_wc = calculate_seq_with_wildcards(qs, fqqs) 89 | 90 | ref_seq_len = len(ref_total_seq) 91 | qs_len = len(qs_wc) 92 | 93 | if ref_seq_len <= max_ref_len_for_same_proc and qs_len <= max_read_len_for_same_proc: 94 | # Don't start process for short realigns, since then process startup dominates the total time taken 95 | # TODO: more robust solution; realign worker somehow? How to do timeout? 96 | return realign_read( 97 | ref_total_seq, qs_wc, left_flank_coord, params.flank_size, rn, t_idx, force_realign, None, params.log_level 98 | ) 99 | 100 | t = time.time() 101 | 102 | q: mp.Queue = mp.Queue() 103 | proc = mp.Process(target=realign_read, daemon=False, kwargs=dict( 104 | # fetch an extra base for the right flank coordinate check later (needs to be >= the exclusive coord) 105 | ref_seq=ref_total_seq, # TODO: with the plus 1, really? 106 | query_seq=qs_wc, 107 | left_flank_coord=left_flank_coord, 108 | flank_size=params.flank_size, 109 | rn=rn, 110 | t_idx=t_idx, 111 | always_realign=force_realign, 112 | q=q, 113 | log_level=params.log_level, 114 | )) 115 | proc.start() 116 | 117 | pairs_new = None 118 | try: 119 | pairs_new = q.get(timeout=realign_timeout) 120 | proc.join() 121 | except queue.Empty: 122 | logger_.warning( 123 | f"{locus_log_str} - experienced timeout while re-aligning read {rn}. Reverting to initial " 124 | f"alignment.") 125 | proc.terminate() 126 | time.sleep(0.1) # wait a little for the process to terminate 127 | finally: 128 | wait_count: int = 0 129 | while proc.is_alive(): 130 | logger_.warning(f"{locus_log_str} - realign job has still not exited. Waiting 0.5 seconds...") 131 | time.sleep(0.5) 132 | wait_count += 1 133 | if wait_count > 30: 134 | logger_.fatal(f"{locus_log_str} - realign job never exited. 
Terminating...") 135 | exit(1) 136 | proc.close() 137 | 138 | logger_.debug( 139 | f"{locus_log_str} - {rn}: long realign job completed in {time.time() - t:.4f}s ({ref_seq_len=}, {qs_len=})") 140 | 141 | return pairs_new 142 | -------------------------------------------------------------------------------- /strkit/call/repeats.py: -------------------------------------------------------------------------------- 1 | import parasail 2 | 3 | from functools import lru_cache 4 | from typing import Literal 5 | 6 | from strkit_rust_ext import get_repeat_count as _get_repeat_count 7 | from strkit.utils import idx_1_getter 8 | 9 | from .align_matrix import dna_matrix, indel_penalty 10 | 11 | __all__ = [ 12 | "get_repeat_count", 13 | "get_ref_repeat_count", 14 | ] 15 | 16 | 17 | DEFAULT_LOCAL_SEARCH_RANGE = 3 18 | 19 | 20 | def score_candidate_with_string(db_seq_profile: parasail.Profile, tr_seq: str) -> int: 21 | # TODO: sub-flank again, to avoid more errors in flanking region contributing to score? 22 | # Always assign parasail results to variables due to funky memory allocation behaviour 23 | # - switch 'db' and 'query' here so we can use the db sequence as the profile for a "database" search against 24 | # candidate sequences. order doesn't end up mattering, since we're using semi-global alignment. 25 | r = parasail.sg_striped_profile_sat(db_seq_profile, tr_seq, indel_penalty, indel_penalty) 26 | return r.score 27 | 28 | 29 | def score_candidate( 30 | db_seq_profile: parasail.Profile, 31 | motif: str, 32 | motif_count: int, 33 | flank_left_seq: str, 34 | flank_right_seq: str, 35 | ) -> int: 36 | return score_candidate_with_string(db_seq_profile, f"{flank_left_seq}{motif * motif_count}{flank_right_seq}") 37 | 38 | 39 | def score_ref_boundaries( 40 | db_seq_profile: parasail.Profile, 41 | db_seq_rev_profile: parasail.Profile, 42 | tr_candidate: str, 43 | flank_left_seq: str, 44 | flank_right_seq: str, 45 | ref_size: int, 46 | ) -> tuple[tuple[int, int], tuple[int, int]]: 47 | # Always assign parasail results to variables due to funky memory allocation behaviour 48 | ext_r_seq = f"{flank_left_seq}{tr_candidate}" 49 | r_fwd = parasail.sg_qe_scan_profile_sat(db_seq_profile, ext_r_seq, indel_penalty, indel_penalty) 50 | r_adj = r_fwd.end_query + 1 - len(flank_left_seq) - ref_size # Amount to tweak boundary on the right side by 51 | 52 | # Used to be flank_right_seq[max(r_adj, 0):] but I think that adjustment makes this score worse than it should and 53 | # wasn't valid, since what matters is the delta over the limit... 54 | ext_l_seq = f"{tr_candidate}{flank_right_seq}"[::-1] # reverse 55 | 56 | r_rev = parasail.sg_qe_scan_profile_sat(db_seq_rev_profile, ext_l_seq, indel_penalty, indel_penalty) 57 | l_adj = r_rev.end_query + 1 - len(flank_right_seq) - ref_size # Amount to tweak boundary on the left side by 58 | 59 | return (r_fwd.score, r_adj), (r_rev.score, l_adj) 60 | 61 | 62 | # TODO: instead of lru_cache, some more custom mechanism for sharing? 
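# Added note (not in the original source): functools.lru_cache memoizes within a
# single process only, so under multiprocessing each worker builds its own cache
# of up to 512 recent results keyed on the call arguments - presumably what the
# TODO above about a shared mechanism is referring to.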
63 | @lru_cache(maxsize=512) 64 | def get_repeat_count( 65 | start_count: int, 66 | tr_seq: str, 67 | flank_left_seq: str, 68 | flank_right_seq: str, 69 | motif: str, 70 | max_iters: int, 71 | local_search_range: int = DEFAULT_LOCAL_SEARCH_RANGE, # TODO: Parametrize for user 72 | step_size: int = 1, 73 | ) -> tuple[tuple[int, int], int, int]: 74 | return _get_repeat_count( 75 | start_count, tr_seq, flank_left_seq, flank_right_seq, motif, max_iters, local_search_range, step_size 76 | ) 77 | 78 | 79 | def get_ref_repeat_count( 80 | start_count: int, 81 | tr_seq: str, 82 | flank_left_seq: str, 83 | flank_right_seq: str, 84 | motif: str, 85 | ref_size: int, 86 | vcf_anchor_size: int, 87 | max_iters: int, 88 | respect_coords: bool = False, 89 | local_search_range: int = DEFAULT_LOCAL_SEARCH_RANGE, # TODO: Parametrize for user 90 | step_size: int = 1, 91 | ) -> tuple[tuple[int | float, int], int, int, tuple[int, int], tuple[str, str, str]]: 92 | l_offset: int = 0 93 | r_offset: int = 0 94 | 95 | db_seq: str = f"{flank_left_seq}{tr_seq}{flank_right_seq}" 96 | db_seq_profile: parasail.Profile = parasail.profile_create_sat(db_seq, dna_matrix) 97 | db_seq_rev_profile: parasail.Profile = parasail.profile_create_sat(db_seq[::-1], dna_matrix) 98 | 99 | motif_size = len(motif) 100 | 101 | n_offset_scores: int = 0 102 | 103 | if not respect_coords: # Extend out coordinates from initial definition 104 | to_explore: list[tuple[int, Literal[-1, 0, 1]]] = [ 105 | (start_count - step_size, -1), (start_count + step_size, 1), (start_count, 0)] 106 | 107 | fwd_sizes_scores_adj: dict[int | float, tuple[int, int]] = {} 108 | rev_sizes_scores_adj: dict[int | float, tuple[int, int]] = {} 109 | 110 | while to_explore and n_offset_scores < max_iters: 111 | size_to_explore, direction = to_explore.pop() 112 | if size_to_explore < 0: 113 | continue 114 | 115 | fwd_scores: list[tuple[float | int, tuple[int, int], int]] = [] # For right-side adjustment 116 | rev_scores: list[tuple[float | int, tuple[int, int], int]] = [] # For left-side adjustment 117 | 118 | start_size = max( 119 | size_to_explore - (local_search_range if (direction < 1 or step_size > local_search_range) else 0), 0) 120 | end_size = size_to_explore + (local_search_range if (direction > -1 or step_size > local_search_range) 121 | else 0) 122 | 123 | for i in range(start_size, end_size + 1): 124 | fwd_rs = fwd_sizes_scores_adj.get(i) 125 | rev_rs = rev_sizes_scores_adj.get(i) 126 | 127 | if fwd_rs is None or rev_rs is None: 128 | res = score_ref_boundaries( 129 | db_seq_profile, db_seq_rev_profile, motif * i, flank_left_seq, flank_right_seq, ref_size) 130 | 131 | fwd_sizes_scores_adj[i] = fwd_rs = res[0] 132 | rev_sizes_scores_adj[i] = rev_rs = res[1] 133 | 134 | n_offset_scores += 1 135 | 136 | fwd_scores.append((i, fwd_rs, i)) 137 | rev_scores.append((i, rev_rs, i)) 138 | 139 | mv: tuple[float | int, tuple[int, int], int] = max((*fwd_scores, *rev_scores), key=idx_1_getter) 140 | if mv[2] > size_to_explore and ( 141 | (new_rc := mv[2] + step_size) not in fwd_sizes_scores_adj or new_rc not in rev_sizes_scores_adj): 142 | if new_rc >= 0: 143 | to_explore.append((new_rc, 1)) 144 | if mv[2] < size_to_explore and ( 145 | (new_rc := mv[2] - step_size) not in fwd_sizes_scores_adj or new_rc not in rev_sizes_scores_adj): 146 | if new_rc >= 0: 147 | to_explore.append((new_rc, -1)) 148 | 149 | # noinspection PyTypeChecker 150 | fwd_top_res: tuple[int | float, tuple] = max(fwd_sizes_scores_adj.items(), key=lambda x: x[1][0]) 151 | # noinspection PyTypeChecker 152 | 
rev_top_res: tuple[int | float, tuple] = max(rev_sizes_scores_adj.items(), key=lambda x: x[1][0]) 153 | 154 | # Ignore negative differences (contractions vs TRF definition), but follow expansions 155 | # TODO: Should we incorporate contractions? How would that work? 156 | 157 | l_offset = rev_top_res[1][1] 158 | r_offset = fwd_top_res[1][1] 159 | 160 | if l_offset >= len(flank_left_seq) - vcf_anchor_size: 161 | # don't do anything weird if we're removing the entire flank sequence 162 | # TODO: this can be caused by NNNNNNN - see chr5:139453668-139454525 in GRCh38 163 | l_offset = 0 164 | if r_offset >= len(flank_right_seq): 165 | r_offset = 0 # same here 166 | 167 | if l_offset > 0: 168 | tr_seq = flank_left_seq[-1*l_offset:] + tr_seq # first, move a chunk of the left flank to the TR seq 169 | flank_left_seq = flank_left_seq[:-1*l_offset] # then, remove that chunk from the left flank 170 | if r_offset > 0: 171 | tr_seq = tr_seq + flank_right_seq[:r_offset] # same, but for the right flank 172 | flank_right_seq = flank_right_seq[r_offset:] 173 | 174 | # ------------------------------------------------------------------------------------------------------------------ 175 | 176 | final_res, n_iters_final_count, _ = get_repeat_count( 177 | # always start with int here: 178 | round(((start_count * motif_size) + (max(0, l_offset) + max(0, r_offset))) / motif_size), 179 | tr_seq, 180 | flank_left_seq, 181 | flank_right_seq, 182 | motif, 183 | max_iters=max_iters, 184 | step_size=step_size, 185 | ) 186 | 187 | return ( 188 | final_res, l_offset, r_offset, (n_offset_scores, n_iters_final_count), (flank_left_seq, tr_seq, flank_right_seq) 189 | ) 190 | -------------------------------------------------------------------------------- /strkit/call/snvs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import multiprocessing.managers as mmg 3 | 4 | from collections import Counter 5 | 6 | from strkit_rust_ext import get_read_snvs, process_read_snvs_for_locus_and_calculate_useful_snvs, CandidateSNVs 7 | from strkit.utils import idx_1_getter 8 | 9 | from .types import ReadDict, CalledSNV 10 | 11 | 12 | __all__ = [ 13 | "SNV_OUT_OF_RANGE_CHAR", 14 | "SNV_GAP_CHAR", 15 | "SNV_NA_CHARS", 16 | "get_read_snvs", 17 | "call_and_filter_useful_snvs", 18 | "process_read_snvs_for_locus_and_calculate_useful_snvs", 19 | ] 20 | 21 | SNV_OUT_OF_RANGE_CHAR = "-" 22 | SNV_GAP_CHAR = "_" 23 | SNV_NA_CHARS = (SNV_OUT_OF_RANGE_CHAR, SNV_GAP_CHAR) 24 | 25 | 26 | def call_and_filter_useful_snvs( 27 | contig: str, 28 | n_alleles: int, 29 | read_dict: dict[str, ReadDict], 30 | useful_snvs: list[tuple[int, int]], 31 | candidate_snvs: CandidateSNVs, 32 | # --- 33 | snv_quality_threshold: int, 34 | # --- 35 | snv_genotype_cache: mmg.DictProxy, 36 | # --- 37 | locus_log_str: str, 38 | logger_: logging.Logger, 39 | ) -> list[CalledSNV]: 40 | """ 41 | Call useful SNVs at a locus level from read-level SNV data. 42 | :param contig: The contig of the SNVs. Used for generating an ID if one does not exist. 43 | :param n_alleles: The number of alleles called for this locus. 44 | :param read_dict: Dictionary of read data. Must already have peaks assigned. 45 | :param useful_snvs: List of tuples representing useful SNVs: (SNV index, reference position) 46 | :param candidate_snvs: A dictionary of useful SNVs, indexed by reference position. Used to look up IDs. 47 | :param snv_quality_threshold: Minimum PHRED score needed to incorporate a read base into the genotype. 
48 | :param snv_genotype_cache: Cache for SNV genotype/phase set information. 49 | :param locus_log_str: Locus string representation for logging purposes. 50 | :param logger_: Python logger object. 51 | :return: List of called SNVs for the locus. 52 | """ 53 | 54 | # Since these have already been classified as 'useful' earlier in the pipeline, 55 | # we have some guarantees that these values should be fairly internally consistent 56 | # for a given peak... most of the time. 57 | 58 | allele_range = tuple(range(n_alleles)) 59 | peak_base_counts: dict[int, dict[int, Counter]] = { 60 | u_ref: {p: Counter() for p in allele_range} 61 | for _, u_ref in useful_snvs 62 | } 63 | 64 | for rn, read in read_dict.items(): 65 | p: int | None = read.get("p") 66 | if p is None: # No peak; read wasn't used to call peaks 67 | continue 68 | for u_idx, (_, u_ref) in enumerate(useful_snvs): 69 | su, su_q = read["snvu"][u_idx] 70 | 71 | if su == SNV_GAP_CHAR or su_q >= snv_quality_threshold: 72 | peak_base_counts[u_ref][p].update((su,)) 73 | 74 | called_snvs: list[dict] = [] 75 | skipped_snvs: set[int] = set() 76 | 77 | for u_idx, (u_ref, peak_counts) in enumerate(peak_base_counts.items()): 78 | call: list[str] = [] 79 | rs: list[int] = [] 80 | 81 | skipped: bool = False 82 | 83 | for a in allele_range: 84 | if skipped: 85 | break 86 | 87 | peak_counts_a = peak_counts[a] 88 | a_total = peak_counts[a].total() 89 | 90 | if a_total == 0: # probably due to quality filtering 91 | skipped = True 92 | logger_.warning(f"{locus_log_str} - for SNV {u_ref}, found a 0-total for allele {a} (a)") 93 | break 94 | 95 | mc = peak_counts_a.most_common(2) 96 | mcc = mc[0] 97 | 98 | try: 99 | if mcc[0] == SNV_OUT_OF_RANGE_CHAR: # Chose most common non-uncalled value 100 | mcc = mc[1] 101 | 102 | for b in allele_range: 103 | if b == a: 104 | continue 105 | 106 | peak_counts_b = peak_counts[b] 107 | b_total = peak_counts_b.total() 108 | 109 | if b_total == 0: # probably due to quality filtering 110 | skipped = True 111 | logger_.warning(f"{locus_log_str} - for SNV {u_ref}, found a 0-total for allele {b} (b)") 112 | break 113 | 114 | if (peak_counts_b[mcc[0]] / b_total) > (peak_counts_a[mcc[0]] / a_total / 2): # TODO: parametrize 115 | logger_.debug( 116 | f"{locus_log_str} - for SNV position {u_ref}: got uninformative peak counts (cross-talk) - " 117 | f"{peak_counts=}") 118 | skipped = True 119 | break 120 | 121 | except IndexError: # '-' is the only value, somehow 122 | logger_.debug( 123 | f"{locus_log_str} - for SNV {u_ref}, found only '{SNV_OUT_OF_RANGE_CHAR}' with {mcc[1]} reads") 124 | logger_.debug(f"{locus_log_str} - for SNV position {u_ref}: {mc=}, {peak_counts[a]=}") 125 | skipped = True 126 | break 127 | 128 | if not skipped: 129 | call.append(mcc[0]) 130 | rs.append(mcc[1]) 131 | 132 | snv_call_set = set(call) 133 | 134 | if not skipped and len(snv_call_set) == 1: 135 | logger_.warning( 136 | f"{locus_log_str} - for SNV position {u_ref}: got degenerate call {call} from {peak_counts=}") 137 | skipped = True 138 | 139 | snv_rec = candidate_snvs.get(u_ref) 140 | if snv_rec is not None: 141 | snv_id = snv_rec["id"] 142 | if snv_id == ".": 143 | snv_id = f"{contig}_{u_ref}" 144 | else: 145 | snv_id = f"{contig}_{u_ref}" 146 | 147 | if not skipped: 148 | cached_snv_genotype = snv_genotype_cache.get(snv_id) 149 | if cached_snv_genotype is not None and (cgt := set(cached_snv_genotype[0])) != snv_call_set: 150 | logger_.warning( 151 | f"{locus_log_str} - got mismatch for SNV {snv_id} (position {u_ref}); cache genotype set {cgt} 
!= " 152 | f"current genotype set {snv_call_set}") 153 | skipped = True 154 | 155 | if skipped: 156 | skipped_snvs.add(u_idx) # Skip this useful SNV, since it isn't actually useful 157 | continue 158 | 159 | called_snvs.append({ 160 | "id": snv_id, 161 | **({"ref": snv_rec["ref_base"]} if snv_rec is not None else {}), 162 | "pos": u_ref, 163 | "call": tuple(call), 164 | "rcs": rs, 165 | }) 166 | 167 | # If we've skipped any SNVs, filter them out of the read dict - MUTATION 168 | if skipped_snvs: 169 | for read in read_dict.values(): 170 | if "snvu" not in read: 171 | continue 172 | read["snvu"] = tuple(map(idx_1_getter, filter(lambda e: e[0] not in skipped_snvs, enumerate(read["snvu"])))) 173 | logger_.debug(f"{locus_log_str} - filtered out {len(skipped_snvs)} not-actually-useful SNVs") 174 | 175 | return called_snvs 176 | -------------------------------------------------------------------------------- /strkit/call/types.py: -------------------------------------------------------------------------------- 1 | # import pysam 2 | import numpy as np 3 | from typing import Literal, TypedDict 4 | from numpy.typing import NDArray 5 | 6 | 7 | __all__ = [ 8 | "VCFContigFormat", 9 | "AssignMethod", 10 | "AssignMethodWithHP", 11 | "ConsensusMethod", 12 | # --- 13 | "ReadDict", 14 | "ReadDictExtra", 15 | "CalledSNV", 16 | "LocusResult", 17 | ] 18 | 19 | # TODO: py3.10: new Required[] TypedDict structuring 20 | 21 | 22 | VCFContigFormat = Literal["chr", "num", "acc", ""] 23 | 24 | AssignMethod = Literal["dist", "snv", "snv+dist", "single"] 25 | AssignMethodWithHP = AssignMethod | Literal["hp"] 26 | 27 | ConsensusMethod = Literal["single", "poa", "best_rep"] 28 | 29 | 30 | class _ReadDictBase(TypedDict): 31 | s: Literal["-", "+"] # DNA strand alignment 32 | cn: int | float # Copy number 33 | w: float # Weight 34 | sc: float | None # Adjusted read model align score (None if TR is missing) 35 | 36 | 37 | class ReadDict(_ReadDictBase, total=False): 38 | # Whether the read was realigned by hand using a local alignment algorithm. 39 | realn: bool 40 | 41 | # Whether the read appears to be chimeric within the locus region, 42 | # i.e. aligned twice with different soft-clipping. 43 | chimeric_in_region: bool 44 | 45 | p: int # Peak (allele) 46 | 47 | kmers: dict[str, int] # Dictionary of {kmer: count} 48 | 49 | # Only added if HP tags from a haplotagged alignment file are being incorporated: 50 | hp: int 51 | ps: int 52 | 53 | # Only added if SNVs are being incorporated: 54 | # - After including only useful SNVs, this contains a tuple of bases for just those + corresponding qualities 55 | snvu: tuple[tuple[str, int], ...] 
56 | 57 | 58 | class ReadDictExtra(TypedDict, total=False): 59 | _ref_start: int # Read start in ref coordinates 60 | _ref_end: int # Read end in ref coordinates 61 | 62 | # BEGIN: only added if consensus is being calculated 63 | _start_anchor: str # Left anchor for calculated allele sequence (usually 1 base) 64 | _tr_seq: str # Tandem repeat sequence 65 | # END: only added if consensus is being calculated 66 | 67 | # Below are only added if SNVs are being incorporated: 68 | 69 | _qs: str # Query (read) sequence 70 | _fqqs: NDArray[np.uint8] # Query (read) base qualities 71 | 72 | sig_clip_left: bool # Significant amounts of clipping (5' of read) 73 | sig_clip_right: bool # Significant amounts of clipping (3' of read) 74 | 75 | snv: dict[int, str] # Intermediate result: dictionary of a bunch of SNVs for this read {position: base} 76 | # Intermediate result: tuple of bases/qualities for the set of SNVs across all reads 77 | snv_bases: tuple[tuple[str, int], ...] 78 | 79 | 80 | class _CalledSNVBase(TypedDict): 81 | id: str 82 | pos: int 83 | call: tuple[str, ...] 84 | rcs: list[int] 85 | 86 | 87 | class CalledSNV(_CalledSNVBase, total=False): 88 | ref: str 89 | 90 | 91 | class BasePeakData(TypedDict): 92 | means: NDArray[np.float32] 93 | weights: NDArray[np.float32] 94 | stdevs: NDArray[np.float32] 95 | modal_int: int 96 | n_reads: list[int] 97 | 98 | 99 | class PeakData(BasePeakData): 100 | kmers: dict[str, int] 101 | seqs: list[tuple[str, ConsensusMethod]] # really "list-tyup 102 | 103 | 104 | class BaseLocusResult(TypedDict): 105 | locus_index: int 106 | contig: str 107 | start: int 108 | end: int 109 | 110 | motif: str 111 | 112 | assign_method: AssignMethodWithHP | None 113 | call: list[int] | None 114 | call_95_cis: list[list[int]] | None 115 | call_99_cis: list[list[int]] | None 116 | 117 | # Mean model (candidate TR sequence) alignment score across reads. 
118 | mean_model_align_score: float | None 119 | 120 | 121 | class LocusResult(BaseLocusResult, total=False): 122 | start_adj: int 123 | end_adj: int 124 | 125 | ref_cn: int 126 | 127 | ps: int | None 128 | peaks: PeakData | None 129 | read_peaks_called: bool 130 | time: float 131 | 132 | # if we're in consensus mode: --- 133 | ref_start_anchor: str 134 | ref_seq: str 135 | # --- 136 | 137 | reads: dict[str, ReadDict] 138 | snvs: list[CalledSNV] 139 | -------------------------------------------------------------------------------- /strkit/call/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import operator 3 | 4 | from functools import cache, partial 5 | from numpy.typing import NDArray 6 | 7 | from ..utils import cat_strs 8 | 9 | __all__ = [ 10 | "cn_getter", 11 | "neq_blank", 12 | "find_pair_by_ref_pos", 13 | "normalize_contig", 14 | "round_to_base_pos", 15 | "get_new_seed", 16 | "calculate_seq_with_wildcards", 17 | ] 18 | 19 | 20 | # index/property getters and other partials 21 | cn_getter = operator.itemgetter("cn") 22 | neq_blank = partial(operator.ne, "") 23 | 24 | 25 | def find_pair_by_ref_pos(r_coords: NDArray[np.uint64], target: int, start_left: int = 0) -> tuple[int, bool]: 26 | n_pairs: int = len(r_coords) 27 | idx = start_left + np.searchsorted(r_coords[start_left:], target) 28 | return idx, idx < n_pairs and r_coords[idx] == target 29 | 30 | 31 | def normalize_contig(contig: str, has_chr: bool) -> str: 32 | return ("chr" if has_chr else "") + contig.replace("chr", "") 33 | 34 | 35 | def round_to_base_pos(x, motif_size: int) -> float: 36 | return round(float(x) * motif_size) / motif_size 37 | 38 | 39 | def get_new_seed(rng: np.random.Generator) -> int: 40 | return rng.integers(0, 4096, dtype=int) 41 | 42 | 43 | @cache # TODO: parametrize base_wildcard_threshold 44 | def _mask_low_q_base(base_and_qual: tuple[str, int], base_wildcard_threshold: int = 3) -> str: 45 | return base_and_qual[0] if base_and_qual[1] > base_wildcard_threshold else "X" 46 | 47 | 48 | def calculate_seq_with_wildcards(qs: str, quals: NDArray[np.uint8] | None) -> str: 49 | if quals is None: 50 | return qs # No quality information, so don't do anything 51 | return cat_strs(map(_mask_low_q_base, zip(qs, quals))) 52 | -------------------------------------------------------------------------------- /strkit/call/validation.py: -------------------------------------------------------------------------------- 1 | import re 2 | from logging import Logger 3 | 4 | __all__ = [ 5 | "LocusValidationError", 6 | "valid_motif", 7 | "validate_locus", 8 | ] 9 | 10 | # patterns 11 | RE_VALID_MOTIF = re.compile(r"^[ACGTRYSWKMBDHVN]+$") 12 | 13 | 14 | # exceptions 15 | 16 | class LocusValidationError(ValueError): 17 | def __init__(self, error_str: str, hint_msg: str): 18 | self._error_str = error_str 19 | self._hint_msg = hint_msg 20 | super().__init__(error_str) 21 | 22 | def log_error(self, logger: Logger) -> None: 23 | logger.critical(self._error_str) 24 | logger.critical(self._hint_msg) 25 | 26 | 27 | # functions 28 | 29 | def valid_motif(motif: str) -> bool: 30 | """ 31 | Determines whether a motif is valid, i.e., can be used by `strkit call`. Here, valid means "composed of IUPAC 32 | nucleotide codes and no other characters." 33 | :param motif: The motif to assess the validity of. 34 | :return: Whether the motif is valid or not. 
35 | """ 36 | return RE_VALID_MOTIF.match(motif) is not None 37 | 38 | 39 | def validate_locus(line: int, start: int, end: int, motif: str) -> None: 40 | """ 41 | Validate a locus definition for use by STRkit. 42 | :param line: Line number, for logging errors in a catalog BED file. 43 | :param start: Start coordinate; 0-based, inclusive. 44 | :param end: End coordinate; 0-based, exclusive. 45 | :param motif: Motif sequence (to be validated). 46 | """ 47 | 48 | if start >= end: 49 | raise LocusValidationError( 50 | f"BED catalog format error: invalid coordinates on line {line}: start ({start}) >= end ({end})", 51 | "BED catalog: coordinates must be 0-based, half-open - [start, end)", 52 | ) 53 | 54 | if not valid_motif(motif): 55 | raise LocusValidationError( 56 | f"BED catalog format error: invalid motif on line {line}: {motif}", 57 | "BED catalog: motifs must contain only valid IUPAC nucleotide codes.", 58 | ) 59 | -------------------------------------------------------------------------------- /strkit/catalog/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/catalog/__init__.py -------------------------------------------------------------------------------- /strkit/catalog/combine.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | from ..constants import CALLER_STRAGLR, CHROMOSOMES 5 | 6 | __all__ = [ 7 | "combine_catalogs", 8 | ] 9 | 10 | 11 | def combine_catalogs(caller: str, paths: list[str]) -> int: 12 | if caller != CALLER_STRAGLR: 13 | sys.stderr.write(f"Error: This command only supports caller '{CALLER_STRAGLR}'\n") 14 | return 1 15 | 16 | lines = set() 17 | 18 | for path in paths: 19 | if not path.endswith(".bed"): 20 | sys.stderr.write(f"Error: Please supply only .bed files from '{CALLER_STRAGLR}'\n") 21 | return 1 22 | 23 | with open(path, "r") as fh: 24 | for line in fh: 25 | if line.startswith("#"): 26 | continue 27 | 28 | raw_data = line.strip().split("\t") 29 | lines.add((raw_data[0], int(raw_data[1]), int(raw_data[2]), raw_data[3])) 30 | 31 | for line in sorted(lines, key=lambda x: (CHROMOSOMES.index(x[0]), x[1])): 32 | sys.stdout.write("\t".join(map(str, line)) + "\n") 33 | 34 | return 0 35 | -------------------------------------------------------------------------------- /strkit/constants.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "CALLER_EXPANSIONHUNTER", 3 | "CALLER_HIPSTR", 4 | "CALLER_GANGSTR", 5 | "CALLER_REPEATHMM", 6 | "CALLER_STRAGLR", 7 | "CALLER_TANDEM_GENOTYPES", 8 | 9 | "M_CHROMOSOME_NAMES", 10 | "X_CHROMOSOME_NAMES", 11 | "Y_CHROMOSOME_NAMES", 12 | "SEX_CHROMOSOMES", 13 | "AUTOSOMES", 14 | "CHROMOSOMES", 15 | 16 | "MI_CALLERS", 17 | ] 18 | 19 | CALLER_EXPANSIONHUNTER = "expansionhunter" 20 | CALLER_HIPSTR = "hipstr" 21 | CALLER_LONGTR = "longtr" 22 | CALLER_GANGSTR = "gangstr" 23 | CALLER_GENERIC_VCF = "generic-vcf" 24 | CALLER_REPEATHMM = "repeathmm" 25 | CALLER_STRDUST = "strdust" 26 | CALLER_STRAGLR = "straglr" 27 | CALLER_STRKIT = "strkit" 28 | CALLER_STRKIT_JSON = "strkit-json" 29 | CALLER_STRKIT_VCF = "strkit-vcf" 30 | CALLER_TANDEM_GENOTYPES = "tandem-genotypes" 31 | CALLER_TRGT = "trgt" 32 | 33 | M_CHROMOSOME_NAMES = ("chrM", "M") 34 | X_CHROMOSOME_NAMES = ("chrX", "X") 35 | Y_CHROMOSOME_NAMES = ("chrY", "Y") 36 | SEX_CHROMOSOMES = 
(*X_CHROMOSOME_NAMES, *Y_CHROMOSOME_NAMES) 37 | 38 | AUTOSOMES = ( 39 | *map(str, range(1, 23)), 40 | *(f"chr{i}" for i in range(1, 23)), 41 | ) 42 | 43 | CHROMOSOMES = ( 44 | *AUTOSOMES, 45 | *SEX_CHROMOSOMES, 46 | ) 47 | 48 | 49 | MI_CALLERS = ( 50 | CALLER_EXPANSIONHUNTER, 51 | CALLER_GANGSTR, 52 | CALLER_GENERIC_VCF, 53 | CALLER_LONGTR, 54 | CALLER_REPEATHMM, 55 | CALLER_STRDUST, 56 | CALLER_STRAGLR, 57 | CALLER_STRKIT, 58 | CALLER_STRKIT_JSON, 59 | CALLER_STRKIT_VCF, 60 | CALLER_TANDEM_GENOTYPES, 61 | CALLER_TRGT, 62 | ) 63 | -------------------------------------------------------------------------------- /strkit/convert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/convert/__init__.py -------------------------------------------------------------------------------- /strkit/convert/_bed_4.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from logging import Logger 3 | 4 | __all__ = [ 5 | "trf_to_bed_4", 6 | ] 7 | 8 | 9 | def trf_to_bed_4(trf_data: list, _logger: Logger): 10 | for item in trf_data: 11 | sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n") 12 | -------------------------------------------------------------------------------- /strkit/convert/constants.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "IN_FORMAT_TRF", 3 | "IN_FORMAT_TRGT", 4 | "CONVERTER_IN_FORMATS", 5 | ] 6 | 7 | IN_FORMAT_TRF = "trf" 8 | IN_FORMAT_TRGT = "trgt" 9 | 10 | CONVERTER_IN_FORMATS = ( 11 | IN_FORMAT_TRF, 12 | IN_FORMAT_TRGT, 13 | ) 14 | -------------------------------------------------------------------------------- /strkit/convert/converter.py: -------------------------------------------------------------------------------- 1 | from logging import Logger 2 | from typing import Callable 3 | 4 | from ._bed_4 import trf_to_bed_4 5 | from .constants import IN_FORMAT_TRF, IN_FORMAT_TRGT, CONVERTER_IN_FORMATS 6 | from .expansionhunter import trf_bed_to_eh 7 | from .hipstr import trf_bed_to_hipstr 8 | from .gangstr import trf_bed_to_gangstr 9 | from .trgt import trgt_bed_to_bed4, trf_or_strkit_bed_to_trgt 10 | 11 | import strkit.constants as c 12 | 13 | __all__ = [ 14 | "CONVERTER_OUTPUT_FORMATS", 15 | "convert", 16 | ] 17 | 18 | convert_formats: dict[tuple[str, str], Callable[[list, Logger], None]] = { 19 | # TRF converters: 20 | (IN_FORMAT_TRF, c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh, 21 | (IN_FORMAT_TRF, c.CALLER_HIPSTR): trf_bed_to_hipstr, 22 | (IN_FORMAT_TRF, c.CALLER_GANGSTR): trf_bed_to_gangstr, 23 | (IN_FORMAT_TRF, c.CALLER_REPEATHMM): lambda x: x, 24 | (IN_FORMAT_TRF, c.CALLER_STRAGLR): trf_to_bed_4, 25 | (IN_FORMAT_TRF, c.CALLER_STRKIT): trf_to_bed_4, # or can just leave -asis 26 | (IN_FORMAT_TRF, c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4, 27 | (IN_FORMAT_TRF, c.CALLER_TRGT): trf_or_strkit_bed_to_trgt, 28 | # TRGT converters: 29 | (IN_FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4, 30 | (IN_FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_bed4, 31 | (IN_FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4, 32 | } 33 | 34 | CONVERTER_OUTPUT_FORMATS: tuple[str, ...] 
= tuple(sorted(set(k[1] for k in convert_formats))) 35 | 36 | 37 | def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int: 38 | out_format = out_format.lower() 39 | 40 | if in_format == IN_FORMAT_TRF: 41 | if out_format == c.CALLER_REPEATHMM: 42 | logger.critical(f"No need to convert for '{out_format}'; TRF BED files are accepted as input") 43 | return 1 44 | elif out_format == c.CALLER_STRKIT: 45 | logger.info("STRkit can use TRF BED files as-is; will convert to a BED4 file") 46 | 47 | if in_format not in CONVERTER_IN_FORMATS: 48 | logger.critical(f"Unsupported input format: {in_format}") 49 | 50 | if (in_format, out_format) not in convert_formats: 51 | logger.critical(f"Unsupported conversion: {in_format} -> {out_format} (no converter defined)") 52 | return 1 53 | 54 | with open(in_file, "r") as tf: 55 | data = [line.strip().split("\t") for line in tf] 56 | 57 | convert_formats[(in_format, out_format)](data, logger) 58 | return 0 59 | -------------------------------------------------------------------------------- /strkit/convert/expansionhunter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from logging import Logger 4 | 5 | __all__ = [ 6 | "trf_bed_to_eh", 7 | ] 8 | 9 | 10 | def trf_bed_to_eh(trf_data: list, _logger: Logger): 11 | eh_formatted_loci = [] 12 | 13 | for i, item in enumerate(trf_data, 1): 14 | eh_formatted_loci.append({ 15 | "LocusId": f"Locus{i}", 16 | "LocusStructure": f"({item[-1]})*", 17 | "ReferenceRegion": f"{item[0]}:{item[1]}-{item[2]}", 18 | "VariantType": "Repeat", 19 | }) 20 | 21 | sys.stdout.write(json.dumps(eh_formatted_loci, indent=2)) 22 | -------------------------------------------------------------------------------- /strkit/convert/gangstr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from logging import Logger 3 | 4 | __all__ = [ 5 | "trf_bed_to_gangstr", 6 | ] 7 | 8 | 9 | def trf_bed_to_gangstr(trf_data: list, _logger: Logger): 10 | for i, item in enumerate(trf_data, 1): 11 | sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n") 12 | -------------------------------------------------------------------------------- /strkit/convert/hipstr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from logging import Logger 3 | 4 | __all__ = [ 5 | "trf_bed_to_hipstr", 6 | ] 7 | 8 | 9 | def trf_bed_to_hipstr(trf_data: list, _logger: Logger): 10 | for i, item in enumerate(trf_data, 1): 11 | sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n") 12 | -------------------------------------------------------------------------------- /strkit/convert/trgt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from logging import Logger 3 | 4 | __all__ = [ 5 | "trgt_bed_to_bed4", 6 | "trf_or_strkit_bed_to_trgt", 7 | ] 8 | 9 | from strkit.iupac import get_iupac_code_for_nt_set 10 | 11 | 12 | def trgt_bed_to_bed4(trgt_data: list, logger: Logger): 13 | """ 14 | Converts a TRGT repeat catalog to the STRkit/BED4 catalog format. 15 | :param trgt_data: The loaded TRGT catalog (split by tab). 16 | :param logger: A logger instance for issuing conversion failure warnings. 
17 | """ 18 | 19 | for line, data in enumerate(trgt_data, 1): 20 | structure_data = {j[0]: j[1] for j in (i.split("=") for i in data[3].split(";"))} 21 | motifs = structure_data["MOTIFS"].split(",") 22 | 23 | if len(motifs) > 1: 24 | # We can do some basic IUPAC code normalization here for simple compound STR structures in TRGT catalogs: 25 | if ( 26 | structure_data["STRUC"] in {"".join(f"({m})n" for m in motifs), f"<{structure_data['ID']}>"} 27 | and len({len(m) for m in motifs}) == 1 28 | ): 29 | failed: bool = False 30 | combined_motif_bases = [] 31 | for bases in zip(*motifs): 32 | bases_set = set(bases) 33 | if len(bases_set) == 1: # same base in all motifs 34 | combined_motif_bases.append(next(iter(bases_set))) 35 | elif iupac_code := get_iupac_code_for_nt_set(bases_set): 36 | # find IUPAC code representing consensus "base" and append it to the motif 37 | combined_motif_bases.append(iupac_code) 38 | else: # something went wrong (invalid base?) 39 | failed = True 40 | break 41 | 42 | if not failed: # found a consensus base for the multiple-motif STR, so we can convert it 43 | sys.stdout.write("\t".join((*data[:3], "".join(combined_motif_bases))) + "\n") 44 | continue 45 | 46 | data_str = "\t".join(data) 47 | logger.warning(f"Could not convert complex locus at line {line}: {data_str}") 48 | continue 49 | 50 | sys.stdout.write("\t".join((*data[:3], motifs[0])) + "\n") 51 | 52 | 53 | def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger): 54 | """ 55 | Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog. 56 | :param trf_data: The loaded BED catalog data. 57 | :param _logger: Logger instance (unused). 58 | """ 59 | 60 | for i, item in enumerate(trf_data): 61 | motif = trf_data[-1] 62 | sys.stdout.write("\t".join((*trf_data[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n") 63 | -------------------------------------------------------------------------------- /strkit/exceptions.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ParamError", 3 | "InputError", 4 | ] 5 | 6 | 7 | class ParamError(Exception): 8 | pass 9 | 10 | 11 | class InputError(Exception): 12 | pass 13 | -------------------------------------------------------------------------------- /strkit/iupac.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "IUPAC_NUCLEOTIDE_CODES", 3 | "IUPAC_NUCLEOTIDE_CODES_REVERSE", 4 | "get_iupac_code_for_nt_set", 5 | ] 6 | 7 | # IUPAC nucleotide codes representing >1 nucleotide (quasi-"wildcards"): 8 | # - It's important that the values remain sorted, so we can do a reverse-lookup (see below) 9 | IUPAC_NUCLEOTIDE_CODES: dict[str, tuple[str, ...]] = { 10 | "R": ("A", "G"), 11 | "Y": ("C", "T"), 12 | "S": ("C", "G"), 13 | "W": ("A", "T"), 14 | "K": ("G", "T"), 15 | "M": ("A", "C"), 16 | "B": ("C", "G", "T"), 17 | "D": ("A", "C", "T"), 18 | "H": ("A", "C", "T"), 19 | "V": ("A", "C", "G"), 20 | "N": ("A", "C", "G", "T"), 21 | } 22 | 23 | # Lookup table of {(sorted nucleotides): ""} 24 | IUPAC_NUCLEOTIDE_CODES_REVERSE: dict[tuple[str, ...], str] = { 25 | v: k for k, v in IUPAC_NUCLEOTIDE_CODES.items() 26 | } 27 | 28 | 29 | def get_iupac_code_for_nt_set(nt_set: set[str]) -> str | None: 30 | """ 31 | Given a set of standard nucleotides (ATGC), return an IUPAC code which represents the set. 32 | :param nt_set: A set of nucleotides (A, T, G, or C). Any other base will result in a None return. 
33 | :return: An IUPAC nucleotide code representing the set of nucleotides, or None given an invalid nucleotide set. 34 | """ 35 | return IUPAC_NUCLEOTIDE_CODES_REVERSE.get(tuple(sorted(nt_set))) 36 | -------------------------------------------------------------------------------- /strkit/json.py: -------------------------------------------------------------------------------- 1 | import orjson as json 2 | 3 | 4 | __all__ = [ 5 | "Serializable", 6 | "json", 7 | "dumps", 8 | "dumps_indented", 9 | ] 10 | 11 | 12 | Serializable = dict | list | tuple | str | int | float 13 | 14 | 15 | def dumps(v: Serializable) -> bytes: 16 | return json.dumps(v, option=json.OPT_NON_STR_KEYS | json.OPT_SERIALIZE_NUMPY) 17 | 18 | 19 | def dumps_indented(v: Serializable) -> bytes: 20 | return json.dumps(v, option=json.OPT_NON_STR_KEYS | json.OPT_INDENT_2 | json.OPT_SERIALIZE_NUMPY) 21 | -------------------------------------------------------------------------------- /strkit/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | __all__ = [ 5 | "get_main_logger", 6 | "attach_stream_handler", 7 | "create_process_logger", 8 | "log_levels", 9 | ] 10 | 11 | fmt = logging.Formatter(fmt="%(name)s:\t[%(levelname)s]\t%(message)s") 12 | 13 | 14 | def get_main_logger(level: int = logging.DEBUG): 15 | logger = logging.getLogger("strkit-main") 16 | logger.setLevel(level) 17 | return logger 18 | 19 | 20 | def attach_stream_handler(level: int, logger_=None): 21 | ch = logging.StreamHandler(sys.stderr) 22 | ch.setLevel(level) 23 | ch.setFormatter(fmt) 24 | logger_.addHandler(ch) 25 | 26 | 27 | def create_process_logger(pid: int, level: int): 28 | lg = logging.getLogger(f"strkit-{pid}") 29 | lg.setLevel(level) 30 | if not lg.handlers: 31 | attach_stream_handler(level, logger_=lg) 32 | return lg 33 | 34 | 35 | log_levels = { 36 | "debug": logging.DEBUG, 37 | "info": logging.INFO, 38 | "warning": logging.WARNING, 39 | "error": logging.ERROR, 40 | } 41 | -------------------------------------------------------------------------------- /strkit/mi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/mi/__init__.py -------------------------------------------------------------------------------- /strkit/mi/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import uuid 5 | from abc import ABC, abstractmethod 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from strkit.logger import get_main_logger 10 | from .intervals import ( 11 | LociDictOfDict, 12 | LociDictOfList, 13 | build_loci_dict_of_dict_from_file, 14 | build_loci_dict_of_list_from_file, 15 | overlapping_loci_dict_of_dict, 16 | overlapping_loci_dict_of_list, 17 | ) 18 | from .result import MIKind, MIContigResult, MIResult 19 | 20 | __all__ = [ 21 | "SEX_CHROMOSOMES", 22 | "BaseCalculator", 23 | ] 24 | 25 | 26 | SEX_CHROMOSOMES = {"chrX", "X", "chrY", "Y"} # TODO: proper parametrization 27 | 28 | 29 | # noinspection PyUnusedLocal 30 | class BaseCalculator(ABC): 31 | def __init__( 32 | self, 33 | child_call_file: Path, 34 | mother_call_file: Path, 35 | father_call_file: Path, 36 | 37 | child_id: str | None = None, 38 | mother_id: str | None = None, 39 | father_id: str | None = None, 40 | 41 | loci_file: str | None = None, 42 | exclude_file: str 
| None = None, 43 | one_based_loci: bool = False, 44 | 45 | widen: float = 0, 46 | 47 | mismatch_out_mi: MIKind = "pm1", 48 | test_to_perform: str = "none", # means mismatch_out_mi has no effect 49 | sig_level: float = 0.05, 50 | mt_corr: str = "none", 51 | only_phased: bool = False, 52 | 53 | debug: bool = False, 54 | logger: logging.Logger | None = None, 55 | ): 56 | self._debug: bool = debug 57 | self._logger: logging.Logger = logger or get_main_logger() 58 | 59 | self._child_call_file: Path = child_call_file 60 | self._mother_call_file: Path = mother_call_file 61 | self._father_call_file: Path = father_call_file 62 | 63 | self._child_id: str | None = child_id 64 | self._mother_id: str | None = mother_id 65 | self._father_id: str | None = father_id 66 | 67 | self._loci_file: str | None = loci_file 68 | self._loci_dict: LociDictOfDict = build_loci_dict_of_dict_from_file(loci_file, one_based_loci) 69 | self._loci_dict_cache_key: str = str(uuid.uuid4()) 70 | if self._loci_file is not None: 71 | self._logger.debug( 72 | "Built loci dict of size %d with contigs %s", 73 | sum(len(loc) for loc in self._loci_dict.values()), 74 | tuple(self._loci_dict.keys()), 75 | ) 76 | 77 | self._exclude_file: str | None = exclude_file 78 | self._exclude_dict: LociDictOfList = build_loci_dict_of_list_from_file(exclude_file, one_based_loci) 79 | if self._exclude_file is not None: 80 | self._logger.debug( 81 | "Built exclude dict of size %d with contigs %s", 82 | len(self._loci_dict), 83 | tuple(self._exclude_dict.keys()), 84 | ) 85 | 86 | self._decimal_threshold: float = 0.5 87 | self._widen: float = widen 88 | 89 | self._mismatch_out_mi: MIKind = mismatch_out_mi 90 | 91 | self._test_to_perform: str = test_to_perform 92 | self._sig_level: float = sig_level 93 | self._mt_corr: str = mt_corr 94 | self._only_phased: bool = only_phased 95 | 96 | self._cache: dict[str, Any] = {} 97 | 98 | @property 99 | def test_to_perform(self) -> str: 100 | return self._test_to_perform 101 | 102 | @property 103 | def sig_level(self) -> float: 104 | return self._sig_level 105 | 106 | @property 107 | def mt_corr(self) -> str: 108 | return self._mt_corr 109 | 110 | def get_loci_overlapping( 111 | self, contig: str, start: int, end: int, first_only: bool 112 | ) -> list[tuple[int, int, list[str]]]: 113 | return overlapping_loci_dict_of_dict( 114 | contig, start, end, self._loci_dict, first_only, dict_cache_key=self._loci_dict_cache_key 115 | ) 116 | 117 | def should_exclude_locus(self, contig: str, start: int, end: int) -> bool: 118 | return any(True for _ in overlapping_loci_dict_of_list(contig, start, end, self._exclude_dict, True)) 119 | 120 | def should_skip_locus( 121 | self, contig: str, start: int, end: int, cached_overlapping: list | None = None 122 | ) -> str | None: 123 | # Returns either a reason string (if yes) or None (=== no) 124 | 125 | # Check to make sure call is present in TRF BED file, if it is specified 126 | # Check to make sure the locus is not excluded via overlap with exclude BED 127 | 128 | if not self._loci_file or not self._loci_dict: 129 | return None 130 | 131 | if not (cached_overlapping or self.get_loci_overlapping(contig, start, end, True)): 132 | return "no overlapping loci" 133 | 134 | if self.should_exclude_locus(contig, start, end): 135 | return "should_exclude_locus returned True" 136 | 137 | return None 138 | 139 | @abstractmethod 140 | def _get_sample_contigs(self) -> tuple[set, set, set]: 141 | return set(), set(), set() 142 | 143 | def get_trio_contigs(self, include_sex_chromosomes: bool = 
False) -> set: 144 | mc, fc, cc = self._get_sample_contigs() 145 | 146 | contig_set = mc.intersection(fc).intersection(cc) 147 | 148 | if include_sex_chromosomes: # TODO: proper parametrization 149 | if "Y" in cc: 150 | contig_set = contig_set.union({"X", "Y"}) 151 | elif "chrY" in cc: 152 | contig_set = contig_set.union({"chrX", "chrY"}) 153 | elif "X" in cc: 154 | contig_set = contig_set.union({"X"}) 155 | elif "chrX" in cc: 156 | contig_set = contig_set.union({"chrX"}) 157 | else: 158 | contig_set = contig_set.difference(SEX_CHROMOSOMES) 159 | 160 | if self._loci_dict: 161 | # Limit contig set to only contigs which are in the locus dictionary if one is specified. 162 | contig_set = contig_set.intersection(self._loci_dict.keys()) 163 | 164 | self._logger.debug("Got %d intersection trio contigs", len(contig_set)) 165 | 166 | return contig_set 167 | 168 | @abstractmethod 169 | def calculate_contig(self, contig: str) -> MIContigResult: 170 | return MIContigResult(contig) 171 | 172 | @staticmethod 173 | def _updated_mi_res(res: float | None, v: int | float | None) -> float | None: 174 | return None if v is None else ((res or 0) + v) 175 | 176 | def calculate(self, included_contigs: set) -> MIResult | None: 177 | # copy number 178 | res: float = 0 179 | res_pm1: float = 0 180 | res_95_ci: float | None = None 181 | res_99_ci: float | None = None 182 | # sequence 183 | res_seq: float | None = None 184 | res_sl: float | None = None 185 | res_sl_pm1: float | None = None 186 | 187 | n_total: int = 0 188 | 189 | contig_results = [] 190 | output_loci = [] 191 | 192 | for contig in sorted(included_contigs): 193 | self._logger.info("Processing contig %s", contig) 194 | 195 | contig_result = self.calculate_contig(contig) 196 | contig_results.append(contig_result) 197 | 198 | r, nm = contig_result.process_loci( 199 | mismatch_out_mi=self._mismatch_out_mi, calculate_non_matching=self.test_to_perform == "none" 200 | ) 201 | 202 | value_95_ci = r["ci_95"] 203 | value_99_ci = r["ci_99"] 204 | value_seq = r["seq"] 205 | value_sl = r["sl"] 206 | value_sl_pm1 = r["sl_pm1"] 207 | 208 | res += r["strict"] 209 | res_pm1 += r["pm1"] 210 | res_95_ci = self._updated_mi_res(res_95_ci, value_95_ci) 211 | res_99_ci = self._updated_mi_res(res_99_ci, value_99_ci) 212 | res_seq = self._updated_mi_res(res_seq, value_seq) 213 | res_sl = self._updated_mi_res(res_sl, value_sl) 214 | res_sl_pm1 = self._updated_mi_res(res_sl_pm1, value_sl_pm1) 215 | 216 | n_total += len(contig_result) 217 | output_loci.extend(nm) 218 | 219 | logger_fmt = "Finished processing contig %s; n_total=%d. 
Current value: %.2f%%, ±1: %.2f%%" 220 | logger_args = [contig_result.contig, n_total, res / n_total * 100, res_pm1 / n_total * 100] 221 | 222 | extras = ( 223 | (res_95_ci, "95%% CI"), 224 | (res_99_ci, "99%% CI"), 225 | (res_seq, "seq"), 226 | (res_sl, "s.l."), 227 | (res_sl_pm1, "s.l.±1"), 228 | ) 229 | 230 | for val, fmt_txt in extras: 231 | if val is not None: 232 | logger_fmt += f", {fmt_txt}: %.2f%%" 233 | logger_args.append(val / n_total * 100) 234 | 235 | self._logger.info(logger_fmt, *logger_args) 236 | 237 | if n_total == 0: 238 | self._logger.warning("No common loci found") 239 | return None 240 | 241 | res /= n_total 242 | res_pm1 /= n_total 243 | res_95_ci = None if res_95_ci is None else (res_95_ci / n_total) 244 | res_99_ci = None if res_99_ci is None else (res_99_ci / n_total) 245 | res_seq = None if res_seq is None else (res_seq / n_total) 246 | res_sl = None if res_sl is None else (res_sl / n_total) 247 | res_sl_pm1 = None if res_sl is None else (res_sl_pm1 / n_total) 248 | 249 | mi_res = MIResult( 250 | { 251 | "strict": res, 252 | "pm1": res_pm1, 253 | "ci_95": res_95_ci, 254 | "ci_99": res_99_ci, 255 | "seq": res_seq, 256 | "sl": res_sl, 257 | "sl_pm1": res_sl_pm1, 258 | }, 259 | contig_results, 260 | output_loci, 261 | self._widen, 262 | self.test_to_perform, 263 | self.sig_level, 264 | self.mt_corr, 265 | logger=self._logger, 266 | ) 267 | 268 | if self.test_to_perform != "none": 269 | mi_res.correct_for_multiple_testing() # Also calculates new output loci 270 | 271 | return mi_res 272 | -------------------------------------------------------------------------------- /strkit/mi/expansionhunter.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pysam 4 | 5 | from .base import BaseCalculator 6 | from .result import MIContigResult, MILocusData 7 | from .vcf_utils import VCFCalculatorMixin 8 | from ..utils import parse_cis 9 | 10 | __all__ = ["ExpansionHunterCalculator"] 11 | 12 | 13 | def _parse_allele(a: int | str | None) -> int | None: 14 | if isinstance(a, str): 15 | if a == ".": 16 | return None 17 | return int(a) 18 | return a 19 | 20 | 21 | def _unzip_gt(vals) -> tuple[tuple[int | float | None, ...], tuple[int | float | None, ...]]: 22 | try: 23 | return (_parse_allele(vals[0][0]), _parse_allele(vals[1][0])), parse_cis((vals[0][1], vals[1][1])) 24 | except ValueError: 25 | return (None, None), (None, None) 26 | 27 | 28 | class ExpansionHunterCalculator(BaseCalculator, VCFCalculatorMixin): 29 | def _get_sample_contigs(self) -> tuple[set, set, set]: 30 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file) 31 | 32 | def calculate_contig(self, contig: str) -> MIContigResult: 33 | cr = MIContigResult(contig, includes_95_ci=True) 34 | 35 | mvf = pysam.VariantFile(str(self._mother_call_file)) 36 | fvf = pysam.VariantFile(str(self._father_call_file)) 37 | cvf = pysam.VariantFile(str(self._child_call_file)) 38 | 39 | # We want all common loci, so loop through the child and then look for the loci in the parent calls 40 | # TODO: What to do about filtering etc? !!!!!!!!!!!!!!!!!!!!!!!! 41 | # !!!!!!!!!!!!!!!! 
42 | # - Q score 43 | # - CIs are "proper" - not inverted or weird 44 | 45 | for cv in cvf.fetch(contig): 46 | mv = next(mvf.fetch(contig, cv.start, cv.stop), None) 47 | fv = next(fvf.fetch(contig, cv.start, cv.stop), None) 48 | 49 | # TODO: Handle sex chromosomes 50 | 51 | k = (contig, cv.start, cv.stop) 52 | 53 | if self.should_skip_locus(*k): 54 | continue 55 | 56 | cr.seen_locus(*k) 57 | 58 | if mv is None or fv is None: 59 | # Variant isn't found in at least one of the parents, so we can't do anything with it. 60 | # TODO: We need to actually check calls, and check with sample ID, not just assume 61 | continue 62 | 63 | # TODO: Handle missing samples gracefully 64 | # TODO: Handle wrong formatted VCFs gracefully 65 | 66 | cs = cv.samples[self._child_id or 0] 67 | ms = mv.samples[self._mother_id or 0] 68 | fs = fv.samples[self._father_id or 0] 69 | 70 | cs_reps = tuple(sorted(zip(cs["REPCN"].split("/"), cs["REPCI"].split("/")), key=lambda x: x[0])) 71 | ms_reps = tuple(sorted(zip(ms["REPCN"].split("/"), ms["REPCI"].split("/")), key=lambda x: x[0])) 72 | fs_reps = tuple(sorted(zip(fs["REPCN"].split("/"), fs["REPCI"].split("/")), key=lambda x: x[0])) 73 | 74 | c_gt, c_gt_95_ci = _unzip_gt(cs_reps) 75 | m_gt, m_gt_95_ci = _unzip_gt(ms_reps) 76 | f_gt, f_gt_95_ci = _unzip_gt(fs_reps) 77 | 78 | if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None: 79 | # None call in VCF, skip this call 80 | continue 81 | 82 | cr.append(MILocusData( 83 | contig=contig, 84 | start=cv.start, 85 | end=cv.stop, 86 | motif=cv.info["RU"], 87 | 88 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt, 89 | child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci, 90 | 91 | reference_copies=cv.info["REF"], 92 | )) 93 | 94 | return cr 95 | -------------------------------------------------------------------------------- /strkit/mi/gangstr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pysam 4 | 5 | from .base import BaseCalculator 6 | from .result import MIContigResult, MILocusData 7 | from .vcf_utils import VCFCalculatorMixin 8 | from ..utils import parse_cis 9 | 10 | __all__ = ["GangSTRCalculator"] 11 | 12 | 13 | class GangSTRCalculator(BaseCalculator, VCFCalculatorMixin): 14 | def _get_sample_contigs(self) -> tuple[set, set, set]: 15 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file) 16 | 17 | def calculate_contig(self, contig: str) -> MIContigResult: 18 | cr = MIContigResult(contig, includes_95_ci=True) 19 | 20 | mvf = pysam.VariantFile(str(self._mother_call_file)) 21 | fvf = pysam.VariantFile(str(self._father_call_file)) 22 | cvf = pysam.VariantFile(str(self._child_call_file)) 23 | 24 | # We want all common loci, so loop through the child and then look for the loci in the parent calls 25 | # TODO: What to do about filtering etc? !!!!!!!!!!!!!!!!!!!!!!!! 26 | # !!!!!!!!!!!!!!!! 
27 | # - Q score 28 | # - CIs are "proper" - not inverted or weird 29 | 30 | for cv in cvf.fetch(contig): 31 | mv = next(mvf.fetch(contig, cv.start, cv.stop), None) 32 | fv = next(fvf.fetch(contig, cv.start, cv.stop), None) 33 | 34 | # TODO: Handle sex chromosomes 35 | 36 | # Check to make sure call is present in TRF BED file, if it is specified 37 | k1 = (contig, cv.start, cv.stop) 38 | k2 = (contig, cv.start + 1, cv.stop + 1) 39 | 40 | if self.should_skip_locus(*k1) or self.should_skip_locus(*k2): 41 | continue 42 | 43 | cr.seen_locus(*k1) 44 | 45 | if mv is None or fv is None: 46 | # Variant isn't found in at least one of the parents, so we can't do anything with it. 47 | # TODO: We need to actually check calls, and check with sample ID, not just assume 48 | continue 49 | 50 | # TODO: Handle missing samples gracefully 51 | # TODO: Handle wrong formatted VCFs gracefully 52 | 53 | cs = cv.samples[self._child_id or 0] 54 | ms = mv.samples[self._mother_id or 0] 55 | fs = fv.samples[self._father_id or 0] 56 | 57 | c_gt = cs["REPCN"] 58 | m_gt = ms["REPCN"] 59 | f_gt = fs["REPCN"] 60 | 61 | try: 62 | c_gt_95_ci = parse_cis(cs["REPCI"]) 63 | m_gt_95_ci = parse_cis(ms["REPCI"]) 64 | f_gt_95_ci = parse_cis(fs["REPCI"]) 65 | except (ValueError, TypeError): 66 | # None call in VCF, skip this call 67 | continue 68 | 69 | if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None: 70 | # None call in VCF, skip this call 71 | continue 72 | 73 | cr.append(MILocusData( 74 | contig=contig, 75 | start=cv.start, 76 | end=cv.stop, 77 | motif=cv.info["RU"], 78 | 79 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt, 80 | child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci, 81 | 82 | reference_copies=cv.info["REF"], 83 | )) 84 | 85 | return cr 86 | -------------------------------------------------------------------------------- /strkit/mi/generic_vcf.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pysam 4 | 5 | from .base import BaseCalculator 6 | from .result import MIContigResult, MILocusData 7 | from .vcf_utils import VCFCalculatorMixin 8 | 9 | __all__ = ["GenericVCFLengthCalculator"] 10 | 11 | 12 | class GenericVCFLengthCalculator(BaseCalculator, VCFCalculatorMixin): 13 | def _get_sample_contigs(self) -> tuple[set, set, set]: 14 | contigs = self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file) 15 | self._logger.debug( 16 | "Got trio contigs - child: %d, mother: %d, father: %d", 17 | len(contigs[2]), len(contigs[0]), len(contigs[1]), 18 | ) 19 | return contigs 20 | 21 | def calculate_contig(self, contig: str) -> MIContigResult: 22 | cr = MIContigResult(contig, includes_seq=True) 23 | 24 | mvf = pysam.VariantFile(str(self._mother_call_file)) 25 | fvf = pysam.VariantFile(str(self._father_call_file)) 26 | cvf = pysam.VariantFile(str(self._child_call_file)) 27 | 28 | # We want all common loci, so loop through the child and then look for the loci in the parent calls 29 | 30 | for cv in cvf.fetch(contig): 31 | # child variant start/end, as determined by the reference allele sequence 32 | cv_start = cv.start 33 | cv_stop = cv.stop 34 | 35 | # hack for LongTR: if we override start/end in INFO, use those values as the true start/end in the context 36 | # of the locus boundaries 37 | if "START" in cv.info: 38 | cv_start = int(cv.info["START"]) - 1 39 | if "END" in cv.info: 40 | cv_stop = int(cv.info["END"]) 41 | 42 | mv = next(mvf.fetch(contig, cv_start, 
cv_stop), None) 43 | fv = next(fvf.fetch(contig, cv_start, cv_stop), None) 44 | 45 | # TODO: Handle sex chromosomes 46 | 47 | k = (contig, cv_start, cv_stop) 48 | 49 | overlapping = self.get_loci_overlapping(k[0], k[1], k[2], True) 50 | 51 | if r := self.should_skip_locus(k[0], k[1], k[2], cached_overlapping=overlapping): 52 | self._logger.debug(f"Skipping locus {k}: {r}") 53 | continue 54 | 55 | cr.seen_locus(*k) 56 | 57 | if mv is None or fv is None: 58 | # Variant isn't found in at least one of the parents, so we can't do anything with it. 59 | # TODO: We need to actually check calls, and check with sample ID, not just assume 60 | self._logger.debug(f"Skipping locus {k}: mv or fv is None") 61 | continue 62 | 63 | # TODO: Handle missing samples gracefully 64 | # TODO: Handle wrong formatted VCFs gracefully 65 | 66 | # Need to dig up original motif from the locus file - thus, the original locus file is required. 67 | motif: str = overlapping[0][-1][0] 68 | if not motif: 69 | self._logger.debug(f"Skipping locus {k}: motif is false-y") 70 | continue 71 | 72 | motif_len = len(motif) 73 | 74 | cs = cv.samples[self._child_id or 0] 75 | ms = mv.samples[self._mother_id or 0] 76 | fs = fv.samples[self._father_id or 0] 77 | 78 | c_seq_gt = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) if None not in cs["GT"] else None 79 | c_gt = tuple(round(len(a) / motif_len) for a in c_seq_gt) if c_seq_gt is not None else None 80 | m_seq_gt = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) if None not in ms["GT"] else None 81 | m_gt = tuple(round(len(a) / motif_len) for a in m_seq_gt) if m_seq_gt is not None else None 82 | f_seq_gt = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) if None not in fs["GT"] else None 83 | f_gt = tuple(round(len(a) / motif_len) for a in f_seq_gt) if f_seq_gt is not None else None 84 | 85 | if c_gt is None or m_gt is None or f_gt is None: 86 | # None call in VCF, skip this call 87 | continue 88 | 89 | cr.append(MILocusData( 90 | contig=contig, 91 | start=cv_start, 92 | end=cv_stop, 93 | motif=motif, 94 | 95 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt, 96 | 97 | # sequence may not line up with start/end if VCF record INFO START/END entries are used 98 | child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt, 99 | )) 100 | 101 | return cr 102 | -------------------------------------------------------------------------------- /strkit/mi/intervals.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | from pathlib import Path 3 | from typing import Iterable 4 | 5 | from strkit.utils import idx_0_getter, idx_1_getter 6 | 7 | 8 | def _line_filter_fn(s: str) -> bool: 9 | """ 10 | Filter function to skip blank lines and comments 11 | :param s: line of a file 12 | :return: whether the line is not blank and is not a comment 13 | """ 14 | return s and not s.startswith("#") 15 | 16 | 17 | # key: contig, value: dict of (key: coordinate interval, value: list of extra values) 18 | LociDictOfDict = dict[str, dict[tuple[int, int], list[str]]] 19 | 20 | # key: contig, value: list of coordinate intervals 21 | LociDictOfList = dict[str, list[tuple[int, int]]] 22 | 23 | 24 | def build_loci_dict_of_dict_from_file(loci_path: str | Path | None, one_based: bool) -> LociDictOfDict: 25 | # Assumes standard BED format - 0-based, half-open intervals, unless one_based=True, 26 | # in which case assume 1-based closed intervals and adjust to be 0-based half-closed. 
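# A worked example of the start adjustment applied below (coordinates invented): a 1-based,
# closed interval [1001, 1010] covers the same bases as the 0-based, half-open interval
# [1000, 1010), so only the start coordinate needs to shift by -1 when one_based is set.
_one_based_example = True
_start_adj_example = -1 * int(_one_based_example)  # -1 when converting; 0 for standard BED input
assert (1001 + _start_adj_example, 1010) == (1000, 1010)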
27 | 28 | if not loci_path: 29 | return {} 30 | 31 | start_adj = -1 * int(one_based) # -1 if converting from 1-based closed to 0-based half-open, otherwise do nothing. 32 | 33 | res: LociDictOfDict = {} 34 | 35 | with open(loci_path, "r") as lf: 36 | for line in filter(_line_filter_fn, map(str.strip, lf)): 37 | ls = line.split("\t") 38 | 39 | contig, ss, es = ls[:3] 40 | 41 | if contig not in res: 42 | res[contig] = {} 43 | 44 | res[contig][int(ss) + start_adj, int(es)] = ls[3:] 45 | 46 | return res 47 | 48 | 49 | def build_loci_dict_of_list_from_file(loci_path: str | Path | None, one_based: bool) -> LociDictOfList: 50 | # Assumes standard BED format - 0-based, half-open intervals, unless one_based=True, 51 | # in which case assume 1-based closed intervals and adjust to be 0-based half-closed. 52 | 53 | if not loci_path: 54 | return {} 55 | 56 | start_adj = -1 * int(one_based) # -1 if converting from 1-based closed to 0-based half-open, otherwise do nothing. 57 | 58 | res: dict[str, list[tuple[int, int]]] = {} 59 | 60 | with open(loci_path, "r") as lf: 61 | for line in filter(_line_filter_fn, map(str.strip, lf)): 62 | ls = line.split("\t") 63 | 64 | contig, ss, es = ls[:3] 65 | 66 | if contig not in res: 67 | res[contig] = [] 68 | 69 | res[contig].append((int(ss) + start_adj, int(es))) 70 | 71 | return res 72 | 73 | 74 | _overlapping_dict_cache = {} 75 | 76 | 77 | def overlapping_loci_dict_of_dict( 78 | contig: str, start: int, end: int, d: LociDictOfDict, first_only: bool = False, dict_cache_key: str | None = None 79 | ) -> list[tuple[int, int, list[str]]]: 80 | if contig not in d: 81 | return [] 82 | 83 | global _overlapping_dict_cache 84 | 85 | full_cache_key = f"{dict_cache_key}--{contig}" 86 | 87 | if full_cache_key in _overlapping_dict_cache: 88 | c_dict, c_keys, c_lhs = _overlapping_dict_cache[full_cache_key] 89 | else: 90 | c_dict = d[contig] 91 | c_keys = tuple(c_dict.keys()) 92 | c_lhs = tuple(map(lambda k: k[0], c_keys)) 93 | if full_cache_key is not None: 94 | _overlapping_dict_cache[full_cache_key] = c_dict, c_keys, c_lhs 95 | 96 | i = bisect.bisect_left(c_lhs, end) # use _left since end is exclusive 97 | 98 | # now sort by [1] (possible overlap end), which should be (almost!) sorted already. 99 | # then, we can get only entries where start < ov[1] via bisect (finding ov[1] <= start and skipping them). 
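# A self-contained toy version of this two-step bisect filter (interval values invented):
# with intervals sorted by start, bisecting on the starts keeps candidates that begin before the
# query end, then bisecting on the ends (after re-sorting by end) drops candidates that finish
# at or before the query start, leaving only true overlaps of the half-open query interval.
import bisect

_toy_intervals = [(0, 5), (4, 9), (10, 15), (20, 25)]  # (start, end), sorted by start
_q_start, _q_end = 8, 12

_toy_starts = [iv[0] for iv in _toy_intervals]
_cands = sorted(_toy_intervals[:bisect.bisect_left(_toy_starts, _q_end)], key=lambda iv: iv[1])
_toy_overlaps = _cands[bisect.bisect_right(_cands, _q_start, key=lambda iv: iv[1]):]
assert _toy_overlaps == [(4, 9), (10, 15)]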
100 | possible_overlaps = sorted(c_keys[:i], key=idx_1_getter) 101 | j = bisect.bisect_right(possible_overlaps, start, key=idx_1_getter) # bisect right because exclusive 102 | possible_overlaps = possible_overlaps[j:] 103 | 104 | acc: list[tuple[int, int, list[str]]] = [] 105 | 106 | for ov in possible_overlaps: 107 | acc.append((ov[0], ov[1], c_dict[ov])) 108 | if first_only: 109 | break 110 | 111 | return sorted(acc, key=idx_0_getter) 112 | 113 | 114 | def overlapping_loci_dict_of_list( 115 | contig: str, start: int, end: int, d: LociDictOfList, first_only: bool 116 | ) -> Iterable[tuple[int, int]]: 117 | if contig not in d: 118 | yield from () 119 | return 120 | 121 | c_ints = d[contig] 122 | c_lhs = tuple(map(lambda k: k[0], c_ints)) 123 | i = bisect.bisect_left(c_lhs, end) # use _left since end is exclusive 124 | 125 | for ov in c_ints[:i]: 126 | if start < ov[1]: 127 | yield ov[0], ov[1] 128 | if first_only: 129 | break 130 | -------------------------------------------------------------------------------- /strkit/mi/repeathmm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import BaseCalculator 4 | from .result import MIContigResult, MILocusData 5 | from ..utils import int_tuple 6 | 7 | __all__ = [ 8 | "RepeatHMMCalculator", 9 | ] 10 | 11 | 12 | class RepeatHMMCalculator(BaseCalculator): 13 | @staticmethod 14 | def get_contigs_from_fh(fh) -> set: 15 | return {ls[0] for ls in (line.split(":") for line in fh)} 16 | 17 | @staticmethod 18 | def make_calls_dict(ph, contig): 19 | return { 20 | tuple(k.split(":")): int_tuple(v.split("/")) 21 | for k, v in (pv.split() for pv in ph) 22 | if k.split(":")[0] == contig 23 | } 24 | 25 | def _get_sample_contigs(self) -> tuple[set, set, set]: 26 | with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \ 27 | open(self._child_call_file, "r") as cvf: 28 | 29 | mc = self.get_contigs_from_fh(mvf) 30 | fc = self.get_contigs_from_fh(fvf) 31 | cc = self.get_contigs_from_fh(cvf) 32 | 33 | return mc, fc, cc 34 | 35 | def calculate_contig(self, contig: str) -> MIContigResult: 36 | cr = MIContigResult(contig) 37 | 38 | with open(self._mother_call_file) as mh: 39 | mother_calls = self.make_calls_dict(mh, contig) 40 | 41 | with open(self._father_call_file) as fh: 42 | father_calls = self.make_calls_dict(fh, contig) 43 | 44 | with open(self._child_call_file) as ch: 45 | for cv in ch: 46 | locus_data, call = cv.strip().split(" ") 47 | lookup = tuple(locus_data.split(":")) 48 | 49 | if lookup[0] != contig: 50 | continue 51 | 52 | locus_start: int = int(lookup[1]) 53 | locus_end: int = int(lookup[2]) 54 | 55 | k = (contig, locus_start, locus_end) 56 | 57 | # Check to make sure call is present in TRF BED file, if it is specified 58 | if self.should_skip_locus(*k): 59 | continue 60 | 61 | cr.seen_locus(*k) 62 | 63 | # Check to make sure call is present in all trio individuals 64 | if lookup not in mother_calls or lookup not in father_calls: 65 | continue 66 | 67 | c_gt = int_tuple(call.split("/")) 68 | m_gt = mother_calls[lookup] 69 | f_gt = father_calls[lookup] 70 | 71 | # Failed calls from RepeatHMM seem to be represented as 0/0, so skip this 72 | # TODO… Need to decide if we actually want to include these? 73 | # or at least somehow record them 74 | if (0, 0) in (c_gt, m_gt, f_gt): 75 | continue 76 | 77 | # TODO: Include ref copies... should be in file somewhere? 
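# Sketch of the RepeatHMM call-line format parsed above (values invented): each line pairs a
# "contig:start:end:motif" locus key with an "a1/a2" genotype string, which becomes an integer tuple.
_example_line = "chr1:100:200:CAG 15/17"
_locus_part, _call_part = _example_line.split(" ")
_lookup_example = tuple(_locus_part.split(":"))              # ("chr1", "100", "200", "CAG")
_gt_example = tuple(int(x) for x in _call_part.split("/"))   # (15, 17)
assert _lookup_example[0] == "chr1" and _gt_example == (15, 17)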
78 | cr.append(MILocusData( 79 | lookup[0], 80 | locus_start, 81 | locus_end, 82 | lookup[3], 83 | 84 | child_gt=int_tuple(call.split("/")), 85 | mother_gt=mother_calls[lookup], 86 | father_gt=father_calls[lookup], 87 | 88 | logger=self._logger, 89 | )) 90 | 91 | return cr 92 | -------------------------------------------------------------------------------- /strkit/mi/straglr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import BaseCalculator 4 | from .result import MILocusData, MIContigResult 5 | 6 | __all__ = [ 7 | "StraglrCalculator", 8 | ] 9 | 10 | 11 | class StraglrCalculator(BaseCalculator): 12 | @staticmethod 13 | def get_contigs_from_fh(fh) -> set: 14 | return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))} 15 | 16 | def make_calls_dict(self, ph, contig, cr: MIContigResult | None = None): 17 | # For reference, dicts are ordered in Python 3.7+ (guaranteed) 18 | 19 | calls = {} 20 | 21 | for pv in ph: 22 | if pv.startswith("#"): 23 | continue 24 | 25 | line = pv.strip().split("\t") 26 | 27 | if line[0] != contig: 28 | if calls: 29 | # assume ordered BED; break after we've collected all calls for the contig 30 | break 31 | continue 32 | 33 | locus = tuple(line[:3]) 34 | 35 | k = (line[0], int(line[1]), int(line[2])) 36 | 37 | overlapping = self.get_loci_overlapping(k[0], k[1], k[2], True) 38 | 39 | if r := self.should_skip_locus(k[0], k[1], k[2], cached_overlapping=overlapping): 40 | self._logger.debug(f"Skipping locus {k}: {r}") 41 | continue 42 | 43 | if cr: 44 | cr.seen_locus(*k) 45 | 46 | orig_motif: str = overlapping[0][-1][0] 47 | if not orig_motif: # false-y/blank 48 | self._logger.debug(f"Skipping locus {k}: motif is false-y") 49 | continue 50 | 51 | # Transform the genotypes into something that is consistent across individuals, 52 | # using the file with the list of loci. 
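# Toy illustration of the rescaling performed below (numbers invented): if a call reports 10.0
# copies of a detected 6-bp motif but the catalog motif for the locus is 3 bp long, the same
# stretch of sequence corresponds to 10.0 * (6 / 3) = 20.0 catalog-motif copies.
_detected_motif_example, _catalog_motif_example = "AGCAGC", "AGC"
_rescaled_cn_example = 10.0 * (len(_detected_motif_example) / len(_catalog_motif_example))
assert _rescaled_cn_example == 20.0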
53 | gt_fact = len(line[3]) / len(orig_motif) 54 | 55 | gt = tuple(float(g.split("(")[0]) * gt_fact for g in line[4].split(";")) 56 | if len(gt) == 1: # If it's homozygous, expand it out to length 2 57 | gt = gt + gt 58 | 59 | calls[locus + (orig_motif,)] = gt 60 | 61 | return calls 62 | 63 | def _get_sample_contigs(self) -> tuple[set, set, set]: 64 | with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \ 65 | open(self._child_call_file, "r") as cvf: 66 | 67 | mc = self.get_contigs_from_fh(mvf) 68 | fc = self.get_contigs_from_fh(fvf) 69 | cc = self.get_contigs_from_fh(cvf) 70 | 71 | return mc, fc, cc 72 | 73 | def calculate_contig(self, contig: str): 74 | cr = MIContigResult(contig) 75 | 76 | with open(self._mother_call_file, "r") as mh: 77 | mother_calls = self.make_calls_dict(mh, contig) 78 | 79 | with open(self._father_call_file, "r") as fh: 80 | father_calls = self.make_calls_dict(fh, contig) 81 | 82 | with open(self._child_call_file, "r") as ch: 83 | child_calls = self.make_calls_dict(ch, contig, cr) 84 | 85 | for locus_data, c_gt in child_calls.items(): 86 | # Check to make sure call is present in all trio individuals 87 | if locus_data not in mother_calls or locus_data not in father_calls: 88 | continue 89 | 90 | cr.append(MILocusData( 91 | contig=locus_data[0], 92 | start=int(locus_data[1]), 93 | end=int(locus_data[2]), 94 | motif=locus_data[3], 95 | 96 | child_gt=c_gt, 97 | mother_gt=mother_calls[locus_data], 98 | father_gt=father_calls[locus_data], 99 | 100 | decimal=True, 101 | )) 102 | 103 | return cr 104 | -------------------------------------------------------------------------------- /strkit/mi/strkit.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | 5 | from pysam import VariantFile 6 | from pysam.libcbcf import VariantRecordSample 7 | 8 | from strkit.json import json 9 | 10 | from .base import BaseCalculator 11 | from .result import MIContigResult, MILocusData 12 | from .vcf_utils import VCFCalculatorMixin 13 | from ..utils import int_tuple, parse_cis 14 | 15 | __all__ = [ 16 | "StrKitCalculator", 17 | "StrKitJSONCalculator", 18 | "StrKitVCFCalculator", 19 | ] 20 | 21 | 22 | STRKIT_TSV_CALL_INDEX = 6 23 | STRKIT_TSV_CALL_95_CI_INDEX = 7 24 | 25 | 26 | class StrKitCalculator(BaseCalculator): 27 | @staticmethod 28 | def get_contigs_from_fh(fh) -> set[str]: 29 | return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))} 30 | 31 | def _get_sample_contigs(self) -> tuple[set, set, set]: 32 | with open(self._mother_call_file, "r") as mvf: 33 | mc = self.get_contigs_from_fh(mvf) 34 | with open(self._father_call_file, "r") as fvf: 35 | fc = self.get_contigs_from_fh(fvf) 36 | with open(self._child_call_file, "r") as cvf: 37 | cc = self.get_contigs_from_fh(cvf) 38 | return mc, fc, cc 39 | 40 | @staticmethod 41 | def make_calls_dict(ph, contig): 42 | return { 43 | tuple(line[:4]): ( 44 | int_tuple(line[STRKIT_TSV_CALL_INDEX].split("|")), 45 | parse_cis(line[STRKIT_TSV_CALL_95_CI_INDEX].split("|")), 46 | None # parse_cis(line[-1:].split("|")), 47 | ) 48 | for line in (pv.strip().split("\t") for pv in ph) 49 | if line[0] == contig and "." 
not in line[STRKIT_TSV_CALL_INDEX] 50 | } 51 | 52 | def calculate_contig(self, contig: str) -> MIContigResult: 53 | cr = MIContigResult(contig, includes_95_ci=True) 54 | 55 | with open(self._mother_call_file) as mh: 56 | mother_calls = self.make_calls_dict(mh, contig) 57 | 58 | self._logger.debug(f"loaded materal calls for {contig}") 59 | 60 | with open(self._father_call_file) as fh: 61 | father_calls = self.make_calls_dict(fh, contig) 62 | 63 | self._logger.debug(f"loaded paternal calls for {contig}") 64 | 65 | with open(self._child_call_file) as ch: 66 | for cv in ch: 67 | locus_data = cv.strip().split("\t") 68 | 69 | if locus_data[0] != contig: 70 | continue 71 | 72 | lookup = tuple(locus_data[:4]) 73 | 74 | start = int(locus_data[1]) 75 | end = int(locus_data[2]) 76 | 77 | if self.should_skip_locus(contig, start, end): 78 | continue 79 | 80 | # Check to make sure call is present in all trio individuals 81 | if lookup not in mother_calls or lookup not in father_calls: 82 | continue 83 | 84 | m_gt, m_gt_95_ci, _ = mother_calls[lookup] 85 | f_gt, f_gt_95_ci, _ = father_calls[lookup] 86 | 87 | calls = locus_data[STRKIT_TSV_CALL_INDEX].split("|") 88 | 89 | if "." in calls: 90 | # Failed call 91 | continue 92 | 93 | cr.append(MILocusData( 94 | contig=lookup[0], 95 | start=int(lookup[1]), 96 | end=int(lookup[2]), 97 | motif=lookup[3], 98 | 99 | child_gt=int_tuple(calls), 100 | mother_gt=m_gt, 101 | father_gt=f_gt, 102 | 103 | child_gt_95_ci=parse_cis(locus_data[STRKIT_TSV_CALL_95_CI_INDEX].split("|")), 104 | mother_gt_95_ci=m_gt_95_ci, 105 | father_gt_95_ci=f_gt_95_ci, 106 | 107 | # child_gt_99_ci=parse_cis(locus_data[-1:].split("|")), 108 | # mother_gt_99_ci=m_gt_99_ci, 109 | # father_gt_99_ci=f_gt_99_ci, 110 | 111 | reference_copies=int(locus_data[4]), 112 | 113 | decimal=False, 114 | )) 115 | 116 | return cr 117 | 118 | 119 | class StrKitJSONCalculator(BaseCalculator): 120 | def __init__(self, *args, **kwargs): 121 | super().__init__(*args, **kwargs) 122 | 123 | with open(self._mother_call_file, "r") as mvf: 124 | self._cache["mother_data"] = json.loads(mvf.read()) 125 | with open(self._father_call_file, "r") as fvf: 126 | self._cache["father_data"] = json.loads(fvf.read()) 127 | with open(self._child_call_file, "r") as cvf: 128 | self._cache["child_data"] = json.loads(cvf.read()) 129 | 130 | @staticmethod 131 | def get_contigs_from_data(report) -> set: 132 | if (report_contigs := report.get("contigs")) is not None: 133 | return set(report_contigs) 134 | return {res["contig"] for res in report["results"]} 135 | 136 | def _get_sample_contigs(self, include_sex_chromosomes: bool = False) -> tuple[set, set, set]: 137 | mc = self.get_contigs_from_data(self._cache["mother_data"]) 138 | fc = self.get_contigs_from_data(self._cache["father_data"]) 139 | cc = self.get_contigs_from_data(self._cache["child_data"]) 140 | return mc, fc, cc 141 | 142 | @staticmethod 143 | def get_read_counts(res: dict, dtype=int): 144 | # TODO: This only works with diploids... 145 | 146 | read_cns = [] 147 | read_peaks = [] 148 | 149 | for r in res["reads"].values(): 150 | if (peak := r.get("p")) is None: 151 | continue 152 | read_cns.append(r["cn"]) 153 | read_peaks.append(peak) 154 | 155 | n = res["peaks"]["modal_n"] 156 | 157 | if (n < 2 or len(set(res["call"]))) == 1 and res.get("assign_method", "dist") == "dist": 158 | # Split copy numbers evenly in two if we have a homozygous locus called only via distance. 
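# A minimal standalone sketch of the even split performed below (copy numbers invented): shuffle
# the per-read copy numbers, then hand half of the reads to each allele of the homozygous call.
# A seeded generator is used here purely so the sketch is reproducible.
import numpy as np

_rng_example = np.random.default_rng(0)
_cns_example = np.array([12, 12, 13, 12, 12, 13], dtype=int)
_rng_example.shuffle(_cns_example)
_half = _cns_example.shape[0] // 2
_allele_a, _allele_b = _cns_example[:_half].tolist(), _cns_example[_half:].tolist()
assert len(_allele_a) == len(_allele_b) == 3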
159 | rcs = np.array(read_cns, dtype=dtype) 160 | np.random.shuffle(rcs) # TODO: seed shuffle 161 | part = rcs.shape[0] // 2 162 | return tuple(rcs[:part].tolist()), tuple(rcs[part:].tolist()) 163 | 164 | rc = [] 165 | for _ in range(n): 166 | rc.append([]) 167 | for cn, pk in zip(read_cns, read_peaks): 168 | rc[pk].append(cn) 169 | return tuple(map(tuple, rc)) 170 | 171 | @staticmethod 172 | def make_calls_dict(report: dict, contig: str): 173 | return { 174 | (res["contig"], res["start"], res["end"], res["motif"]): ( 175 | int_tuple(res["call"]), 176 | tuple(map(lambda x: tuple(map(int, x)), res["call_95_cis"])), 177 | None, # Placeholder for 99% CI 178 | StrKitJSONCalculator.get_read_counts(res, dtype=int), 179 | ) 180 | for res in report["results"] 181 | if res["contig"] == contig and res["call"] is not None 182 | } 183 | 184 | def calculate_contig(self, contig: str) -> MIContigResult: 185 | c_report = self._cache["child_data"] 186 | 187 | cr = MIContigResult(contig, includes_95_ci=True) 188 | 189 | mother_data = self.make_calls_dict(self._cache["mother_data"], contig) 190 | self._logger.debug(f"loaded maternal calls for {contig}") 191 | 192 | father_data = self.make_calls_dict(self._cache["father_data"], contig) 193 | self._logger.debug(f"loaded paternal calls for {contig}") 194 | 195 | for res in c_report["results"]: 196 | if res["contig"] != contig: 197 | continue 198 | 199 | locus_start = res["start"] 200 | locus_end = res["end"] 201 | 202 | lookup = (contig, locus_start, locus_end, res["motif"]) 203 | 204 | k = (contig, int(locus_start), int(locus_end)) 205 | 206 | # Check to make sure call is present in TRF BED file, if it is specified 207 | if self.should_skip_locus(*k): 208 | continue 209 | 210 | cr.seen_locus(*k) 211 | 212 | # Check to make sure call is present in all trio individuals 213 | if lookup not in mother_data or lookup not in father_data: 214 | continue 215 | 216 | m_gt, m_gt_95_ci, _, m_rcs = mother_data[lookup] 217 | f_gt, f_gt_95_ci, _, f_rcs = father_data[lookup] 218 | 219 | if res["call"] is None: 220 | # Failed call 221 | continue 222 | 223 | call = int_tuple(res["call"]) 224 | 225 | cr.append(MILocusData( 226 | contig=lookup[0], 227 | start=locus_start, 228 | end=locus_end, 229 | motif=lookup[3], 230 | 231 | child_gt=int_tuple(call), 232 | mother_gt=m_gt, 233 | father_gt=f_gt, 234 | 235 | child_gt_95_ci=tuple(map(lambda x: tuple(map(int, x)), res["call_95_cis"])), 236 | mother_gt_95_ci=m_gt_95_ci, 237 | father_gt_95_ci=f_gt_95_ci, 238 | 239 | # child_gt_99_ci=parse_cis(locus_data[-1:].split("|")), 240 | # mother_gt_99_ci=m_gt_99_ci, 241 | # father_gt_99_ci=f_gt_99_ci, 242 | 243 | child_read_counts=StrKitJSONCalculator.get_read_counts(res, dtype=int), 244 | mother_read_counts=m_rcs, 245 | father_read_counts=f_rcs, 246 | 247 | reference_copies=int(res["ref_cn"]), 248 | 249 | decimal=False, 250 | 251 | test_to_perform=self.test_to_perform, 252 | sig_level=self.sig_level, 253 | )) 254 | 255 | return cr 256 | 257 | 258 | class StrKitVCFCalculator(BaseCalculator, VCFCalculatorMixin): 259 | def _get_sample_contigs(self, include_sex_chromosomes: bool = False) -> tuple[set, set, set]: 260 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file) 261 | 262 | @staticmethod 263 | def get_peak_cns_from_vcf_line(sample_record: VariantRecordSample): 264 | if "MCRL" not in sample_record: 265 | return None 266 | 267 | res = [] 268 | 269 | for enc_peak in sample_record["MCRL"]: 270 | peak = [] 271 | for cn_r in
enc_peak.split("|"): 272 | cn, cn_c = cn_r.split("x") 273 | peak.extend([int(cn)] * int(cn_c)) 274 | 275 | res.append(tuple(peak)) 276 | 277 | if len(res) == 1: 278 | # Split one peak into two, interleaving reads between the two peaks 279 | return res[0][::2], res[0][1::2] 280 | 281 | return tuple(res) 282 | 283 | def calculate_contig(self, contig: str) -> MIContigResult: 284 | cr = MIContigResult(contig, includes_95_ci=True, includes_seq=True) 285 | 286 | mvf = VariantFile(str(self._mother_call_file)) 287 | fvf = VariantFile(str(self._father_call_file)) 288 | cvf = VariantFile(str(self._child_call_file)) 289 | 290 | # We want all common loci, so loop through the child and then look for the loci in the parent calls 291 | 292 | for cv in cvf.fetch(contig): 293 | if cv.info["VT"] != "str": 294 | continue 295 | 296 | motif = cv.info["MOTIF"] 297 | k = (contig, cv.start, cv.stop) 298 | 299 | mv = next(filter(lambda v: v.info["VT"] == "str" and v.info["MOTIF"] == motif, mvf.fetch(*k)), None) 300 | fv = next(filter(lambda v: v.info["VT"] == "str" and v.info["MOTIF"] == motif, fvf.fetch(*k)), None) 301 | 302 | # TODO: Handle sex chromosomes 303 | 304 | # Check to make sure call is present in TRF BED file, if it is specified 305 | if self.should_skip_locus(*k): 306 | continue 307 | 308 | cr.seen_locus(*k) 309 | 310 | if mv is None or fv is None: 311 | # Variant isn't found in at least one of the parents, so we can't do anything with it. 312 | # TODO: We need to actually check calls, and check with sample ID, not just assume 313 | continue 314 | 315 | # TODO: Handle missing samples gracefully 316 | # TODO: Handle wrong formatted VCFs gracefully 317 | 318 | cs = cv.samples[self._child_id or 0] 319 | ms = mv.samples[self._mother_id or 0] 320 | fs = fv.samples[self._father_id or 0] 321 | 322 | try: 323 | c_gt = cs["MC"] 324 | m_gt = ms["MC"] 325 | f_gt = fs["MC"] 326 | except KeyError: 327 | # None call in VCF, skip this call 328 | continue 329 | 330 | try: 331 | c_gt_95_ci = parse_cis(cs["MCCI"]) 332 | m_gt_95_ci = parse_cis(ms["MCCI"]) 333 | f_gt_95_ci = parse_cis(fs["MCCI"]) 334 | except (ValueError, TypeError): 335 | # None call in VCF, skip this call 336 | continue 337 | 338 | if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None: 339 | # None call in VCF, skip this call 340 | continue 341 | 342 | if self._only_phased and ("PS" not in cs or "PS" not in ms or "PS" not in fs): 343 | # No phasing support across trio, and we're only looking at phased loci --> skip this call 344 | continue 345 | 346 | c_seq_gt = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) if None not in cs["GT"] else None 347 | m_seq_gt = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) if None not in ms["GT"] else None 348 | f_seq_gt = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) if None not in fs["GT"] else None 349 | 350 | cr.append(MILocusData( 351 | contig=contig, 352 | start=cv.start, 353 | end=cv.stop, 354 | motif=motif, 355 | 356 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt, 357 | child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci, 358 | child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt, 359 | 360 | reference_copies=cv.info["REFMC"], 361 | 362 | # ---- for de novo mutation detection (this function returns None if MCRL is not in the VCF FORMAT for 363 | # the samples; i.e., with older STRkit versions): 364 | 365 | child_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(cs), 366 | 
mother_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(ms), 367 | father_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(fs), 368 | 369 | test_to_perform=self.test_to_perform, 370 | sig_level=self.sig_level, 371 | )) 372 | 373 | return cr 374 | -------------------------------------------------------------------------------- /strkit/mi/tandem_genotypes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import BaseCalculator 4 | from .result import MIContigResult, MILocusData 5 | from ..utils import int_tuple 6 | 7 | __all__ = [ 8 | "TandemGenotypesCalculator", 9 | ] 10 | 11 | 12 | class TandemGenotypesCalculator(BaseCalculator): 13 | @staticmethod 14 | def get_contigs_from_fh(fh) -> set[str]: 15 | return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))} 16 | 17 | @staticmethod 18 | def make_calls_dict(ph, contig): 19 | return { 20 | tuple(line[:4]): int_tuple(line[6:8]) 21 | for line in (pv.strip().split("\t") for pv in ph if not pv.startswith("#")) 22 | if line[0] == contig and "." not in line[6:8] 23 | } 24 | 25 | def _get_sample_contigs(self) -> tuple[set, set, set]: 26 | with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \ 27 | open(self._child_call_file, "r") as cvf: 28 | 29 | mc = self.get_contigs_from_fh(mvf) 30 | fc = self.get_contigs_from_fh(fvf) 31 | cc = self.get_contigs_from_fh(cvf) 32 | 33 | return mc, fc, cc 34 | 35 | def calculate_contig(self, contig: str) -> MIContigResult: 36 | cr = MIContigResult(contig) 37 | 38 | with open(self._mother_call_file) as mh: 39 | mother_calls = self.make_calls_dict(mh, contig) 40 | 41 | with open(self._father_call_file) as fh: 42 | father_calls = self.make_calls_dict(fh, contig) 43 | 44 | with open(self._child_call_file) as ch: 45 | for cv in ch: 46 | locus_data = cv.strip().split("\t") 47 | lookup = tuple(locus_data[:4]) 48 | 49 | if locus_data[0] != contig: 50 | continue 51 | 52 | k = (contig, int(lookup[1]), int(lookup[2])) 53 | 54 | if self.should_skip_locus(*k): 55 | continue 56 | 57 | cr.seen_locus(*k) 58 | 59 | # Check to make sure call is present in all trio individuals 60 | if lookup not in mother_calls or lookup not in father_calls: 61 | continue 62 | 63 | child_calls = locus_data[6:8] 64 | 65 | if "." 
in child_calls: 66 | # Failed call 67 | continue 68 | 69 | cr.append(MILocusData( 70 | contig=contig, 71 | start=k[1], 72 | end=k[2], 73 | motif=lookup[3], 74 | 75 | child_gt=int_tuple(child_calls), 76 | mother_gt=mother_calls[lookup], 77 | father_gt=father_calls[lookup], 78 | )) 79 | 80 | return cr 81 | -------------------------------------------------------------------------------- /strkit/mi/trgt.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pysam 4 | 5 | from .base import BaseCalculator 6 | from .result import MIContigResult, MILocusData 7 | from .vcf_utils import VCFCalculatorMixin 8 | from ..utils import parse_ci 9 | 10 | __all__ = ["TRGTCalculator"] 11 | 12 | 13 | def _parse_allele(a: int | str | None) -> int | None: 14 | if isinstance(a, str): 15 | if a == ".": 16 | return None 17 | return int(a) 18 | return a 19 | 20 | 21 | def _unzip_gt( 22 | vals, motif_len: int 23 | ) -> tuple[tuple[int, ...], tuple[tuple[int, ...], tuple[int, ...]]] | tuple[tuple[None, None], tuple[None, None]]: 24 | try: 25 | return ( 26 | ( 27 | round(_parse_allele(vals[0][0]) / motif_len), 28 | round(_parse_allele(vals[1][0]) / motif_len), 29 | ), 30 | ( 31 | tuple(map(lambda x: round(x / motif_len), parse_ci(vals[0][1]))), 32 | tuple(map(lambda x: round(x / motif_len), parse_ci(vals[1][1]))), 33 | ), 34 | ) 35 | except (ValueError, TypeError): 36 | return (None, None), (None, None) 37 | 38 | 39 | class TRGTCalculator(BaseCalculator, VCFCalculatorMixin): 40 | def _get_sample_contigs(self) -> tuple[set, set, set]: 41 | return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file) 42 | 43 | def calculate_contig(self, contig: str) -> MIContigResult: 44 | cr = MIContigResult(contig, includes_95_ci=True, includes_seq=True) 45 | 46 | mvf = pysam.VariantFile(str(self._mother_call_file)) 47 | fvf = pysam.VariantFile(str(self._father_call_file)) 48 | cvf = pysam.VariantFile(str(self._child_call_file)) 49 | 50 | # We want all common loci, so loop through the child and then look for the loci in the parent calls 51 | 52 | for cv in cvf.fetch(contig): 53 | mv = next(mvf.fetch(contig, cv.start, cv.stop), None) 54 | fv = next(fvf.fetch(contig, cv.start, cv.stop), None) 55 | 56 | # TODO: Handle sex chromosomes 57 | 58 | k = (contig, cv.start, cv.stop) 59 | 60 | if self.should_skip_locus(*k): 61 | continue 62 | 63 | cr.seen_locus(*k) 64 | 65 | if mv is None or fv is None: 66 | # Variant isn't found in at least one of the parents, so we can't do anything with it. 
67 | # TODO: We need to actually check calls, and check with sample ID, not just assume 68 | continue 69 | 70 | # TODO: Handle missing samples gracefully 71 | # TODO: Handle wrong formatted VCFs gracefully 72 | 73 | motif = cv.info["MOTIFS"][0] 74 | 75 | cs = cv.samples[self._child_id or 0] 76 | ms = mv.samples[self._mother_id or 0] 77 | fs = fv.samples[self._father_id or 0] 78 | 79 | if None in cs["GT"] or None in ms["GT"] or None in fs["GT"]: 80 | # None call in VCF, skip this call 81 | continue 82 | 83 | c_gt = tuple(sorted(int(m.split("_")[0]) for m in cs["MC"])) 84 | m_gt = tuple(sorted(int(m.split("_")[0]) for m in ms["MC"])) 85 | f_gt = tuple(sorted(int(m.split("_")[0]) for m in fs["MC"])) 86 | 87 | # Uncomment to use allele length as motif copies: 88 | 89 | # cs_reps = tuple(sorted(zip(cs["AL"], cs["ALLR"]), key=lambda x: x[0])) 90 | # ms_reps = tuple(sorted(zip(ms["AL"], ms["ALLR"]), key=lambda x: x[0])) 91 | # fs_reps = tuple(sorted(zip(fs["AL"], fs["ALLR"]), key=lambda x: x[0])) 92 | # 93 | # c_gt, c_gt_95_ci = _unzip_gt(cs_reps, len(motif)) 94 | # m_gt, m_gt_95_ci = _unzip_gt(ms_reps, len(motif)) 95 | # f_gt, f_gt_95_ci = _unzip_gt(fs_reps, len(motif)) 96 | 97 | # noinspection PyTypeChecker 98 | c_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) 99 | # noinspection PyTypeChecker 100 | m_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) 101 | # noinspection PyTypeChecker 102 | f_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) 103 | 104 | cr.append(MILocusData( 105 | contig=contig, 106 | start=cv.start, 107 | end=cv.stop, 108 | motif=motif, 109 | 110 | child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt, 111 | # Uncomment to use allele length as motif copies 95% CI: 112 | # child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci, 113 | child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt, 114 | )) 115 | 116 | return cr 117 | -------------------------------------------------------------------------------- /strkit/mi/vcf_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pysam 4 | 5 | __all__ = ["VCFCalculatorMixin"] 6 | 7 | 8 | class VCFCalculatorMixin: 9 | @staticmethod 10 | def get_contigs_from_files(mother_call_file, father_call_file, child_call_file) -> tuple[set, set, set]: 11 | with pysam.VariantFile(str(mother_call_file)) as mvf: 12 | mc = set(mvf.header.contigs) 13 | 14 | with pysam.VariantFile(str(father_call_file)) as fvf: 15 | fc = set(fvf.header.contigs) 16 | 17 | with pysam.VariantFile(str(child_call_file)) as cvf: 18 | cc = set(cvf.header.contigs) 19 | 20 | return mc, fc, cc 21 | -------------------------------------------------------------------------------- /strkit/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import operator 5 | from functools import partial 6 | from typing import Any, Callable, Iterable 7 | 8 | __all__ = [ 9 | "cat_strs", 10 | "is_none", 11 | "idx_0_getter", 12 | "idx_1_getter", 13 | "apply_or_none", 14 | "int_tuple", 15 | "float_tuple", 16 | "parse_ci", 17 | "parse_cis", 18 | "cis_overlap", 19 | "sign", 20 | ] 21 | 22 | 23 | # index/property getters and other partials 24 | cat_strs = "".join 25 | is_none = partial(operator.is_, None) 26 | idx_0_getter = 
operator.itemgetter(0) 27 | idx_1_getter = operator.itemgetter(1) 28 | 29 | 30 | def apply_or_none(fn: Callable, x: Any) -> Any: 31 | # Python: add any type of monad functionality challenge [IMPOSSIBLE] 32 | return fn(x) if x is not None else None 33 | 34 | 35 | def int_tuple(x: Iterable) -> tuple[int, ...]: 36 | return tuple(map(int, x)) 37 | 38 | 39 | def float_tuple(x: Iterable) -> tuple[float, ...]: 40 | return tuple(map(float, x)) 41 | 42 | 43 | def parse_ci(ci: str, commas=False, dtype=int) -> tuple[int, int] | tuple[float, float]: 44 | ci_s = ci.split("," if commas else "-") 45 | return dtype(ci_s[0]), dtype(ci_s[1]) 46 | 47 | 48 | def parse_cis( 49 | cis: Iterable[str], commas=False, dtype=int 50 | ) -> tuple[tuple[int, ...], ...] | tuple[tuple[float, ...], ...]: 51 | return tuple(map(lambda ci: parse_ci(ci, commas, dtype), cis)) 52 | 53 | 54 | def cis_overlap(ci1, ci2) -> bool: 55 | epsilon = -0.0001 56 | 57 | # []: ci1 58 | # (): ci2 59 | # [ ( ] ) or [ ( ) ] or ( [ ) ] or ( [ ] ) 60 | # int logic: ci1[0] <= ci2[1] and ci2[0] <= ci1[1] 61 | # float logic: lets add some epsilon to prevent little issues 62 | return (ci2[1] - ci1[0]) > epsilon and (ci1[1] - ci2[0]) > epsilon 63 | 64 | 65 | def sign(x: int | float) -> int: 66 | return round(math.copysign(1, x)) 67 | -------------------------------------------------------------------------------- /strkit/viz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/viz/__init__.py -------------------------------------------------------------------------------- /strkit/viz/server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, send_file 2 | from werkzeug.exceptions import NotFound 3 | 4 | __all__ = [ 5 | "run_server", 6 | ] 7 | 8 | app = Flask(__name__) 9 | 10 | 11 | @app.route("/") 12 | def browser(): 13 | return render_template( 14 | "browser.html", 15 | **app.config["PARAMS"]) 16 | 17 | 18 | @app.route("/report-metadata") 19 | def get_report_metadata(): 20 | return {k: v for k, v in app.config["CALL_REPORT"].items() if k != "results"} 21 | 22 | 23 | @app.route("/params") 24 | def get_params(): 25 | return { 26 | "cmd": app.config["PARAMS"], 27 | "report": app.config["CALL_REPORT"]["parameters"], 28 | } 29 | 30 | 31 | @app.route("/loci") 32 | def get_loci(): 33 | cr = app.config["CALL_REPORT"] 34 | ecd = list(enumerate(cr["results"])) # TODO: cache 35 | 36 | q = request.args.get("q", "").strip() 37 | if q: 38 | res = list(filter(lambda x: q.lower() in f"{x[1]['contig']}:{x[1]['start']}-{x[1]['end']}", ecd)) # TODO 39 | else: 40 | # TODO: nicer priority 41 | res = ecd[:10] 42 | 43 | return { 44 | "results": list(map( 45 | lambda x: { 46 | "i": x[0], 47 | "contig": x[1]["contig"], 48 | "start": x[1]["start"], 49 | "end": x[1]["end"], 50 | "disabled": x[1]["call"] is None, 51 | }, 52 | res)), 53 | } 54 | 55 | 56 | @app.route("/call_data/") 57 | def get_call_data(i: int): 58 | cr = app.config["CALL_REPORT"] 59 | cr_res = cr["results"] 60 | if i < 0 or i > len(cr_res) - 1: 61 | raise NotFound() 62 | return cr_res[i] 63 | 64 | 65 | # @app.route("/ref") 66 | # def get_ref_file(): 67 | # return send_file(app.config["PARAMS"]["ref"], conditional=True) 68 | # 69 | # 70 | # @app.route("/ref_index") 71 | # def get_ref_index_file(): 72 | # return send_file(app.config["PARAMS"]["ref_index"], conditional=True) 73 | 74 | 
75 | @app.route("/align_file") 76 | def get_align_file(): 77 | return send_file(app.config["PARAMS"]["align_file"], conditional=True) 78 | 79 | 80 | @app.route("/align_index") 81 | def get_align_index_file(): 82 | return send_file(app.config["PARAMS"]["align_index"], conditional=True) 83 | 84 | 85 | def run_server(call_report, **kwargs): 86 | app.config.from_mapping(dict(CALL_REPORT=call_report, PARAMS=kwargs)) 87 | app.run(host="localhost", port=5011, debug=True) 88 | -------------------------------------------------------------------------------- /strkit/viz/static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/viz/static/logo.png -------------------------------------------------------------------------------- /tests/data/test_loci.bed: -------------------------------------------------------------------------------- 1 | chr1 200 300 ACAA 2 | chr1 300 400 GA 3 | chr1 350 450 GAGA 4 | chr2 100 200 CAG 5 | -------------------------------------------------------------------------------- /tests/test_caller_locus_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from strkit.call.validation import LocusValidationError, valid_motif, validate_locus 3 | 4 | 5 | @pytest.mark.parametrize("motif,valid", [ 6 | ("CAG", True), 7 | ("CAGN", True), 8 | ("CAGX", False), 9 | ("(CAG)n", False), 10 | ("XX", False), 11 | ]) 12 | def test_valid_motif(motif, valid): 13 | assert valid_motif(motif) == valid 14 | 15 | 16 | def test_validate_locus(): 17 | with pytest.raises(LocusValidationError): 18 | # start > end, invalid 19 | validate_locus(1, 1000, 500, "CAG") 20 | 21 | with pytest.raises(LocusValidationError): 22 | # start == end, invalid 23 | validate_locus(1, 1000, 1000, "CAG") 24 | 25 | with pytest.raises(LocusValidationError): 26 | # invalid motif 27 | validate_locus(1, 1000, 1200, "(CAG)n") 28 | -------------------------------------------------------------------------------- /tests/test_caller_utils.py: -------------------------------------------------------------------------------- 1 | from strkit.call.utils import find_pair_by_ref_pos, normalize_contig 2 | 3 | # A A T T C G C C C C A A A A A C 4 | PAIRS = [(0, 1000), (1, 1001), (2, 1003), (3, 1004), (4, 1005), (5, 1006), (6, 1008), (7, 1009)] 5 | SNVS = ((1003, "C"), (1009, "A")) 6 | PAIRS_Q = list(p[0] for p in PAIRS) 7 | PAIRS_R = list(p[1] for p in PAIRS) 8 | 9 | 10 | def test_find_pair_by_ref_pos(): 11 | assert find_pair_by_ref_pos(PAIRS_R, 1004) == (3, True) 12 | assert find_pair_by_ref_pos(PAIRS_R, 1007) == (6, False) 13 | 14 | 15 | def test_normalize_contig(): 16 | assert normalize_contig("chr5", True) == "chr5" 17 | assert normalize_contig("5", True) == "chr5" 18 | assert normalize_contig("X", True) == "chrX" 19 | assert normalize_contig("chr5", False) == "5" 20 | assert normalize_contig("chrX", False) == "X" 21 | -------------------------------------------------------------------------------- /tests/test_iupac.py: -------------------------------------------------------------------------------- 1 | from strkit.iupac import get_iupac_code_for_nt_set 2 | 3 | 4 | def test_get_iupac_code(): 5 | assert get_iupac_code_for_nt_set({"A", "T"}) == "W" 6 | assert get_iupac_code_for_nt_set({"A", "C", "G", "T"}) == "N" 7 | assert get_iupac_code_for_nt_set({"A", "T", "C", "G"}) == "N" 8 | assert get_iupac_code_for_nt_set({"A", "T", "C"}) == "H" 9 | 
assert get_iupac_code_for_nt_set({"A", "T", "C", "Z"}) is None 10 | assert get_iupac_code_for_nt_set({"A", "T", "C", ":)"}) is None 11 | assert get_iupac_code_for_nt_set({"A", "T", "C", ""}) is None 12 | -------------------------------------------------------------------------------- /tests/test_mi_intervals.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import pytest 3 | 4 | from strkit.mi.intervals import ( 5 | build_loci_dict_of_dict_from_file, 6 | overlapping_loci_dict_of_dict, 7 | build_loci_dict_of_list_from_file, 8 | overlapping_loci_dict_of_list, 9 | ) 10 | 11 | TEST_LOCI = pathlib.Path(__file__).parent / "data" / "test_loci.bed" 12 | 13 | BED_CASES = [ 14 | ("chr1", 50, 70, 0), 15 | ("chr1", 205, 210, 1), 16 | ("chr1", 50, 1000, 3), 17 | ("chr1", 320, 500, 2), 18 | ("chr1", 400, 450, 1), 19 | ("chr1", 1000, 1001, 0), 20 | ("chr2", 100, 101, 1), 21 | ("chr2", 100, 200, 1), 22 | ("asdf", 50, 1000, 0), 23 | ] 24 | 25 | 26 | @pytest.mark.parametrize("contig,start,end,nr", BED_CASES) 27 | def test_loci_dict_of_dict(contig: str, start: int, end: int, nr: int): 28 | d = build_loci_dict_of_dict_from_file(TEST_LOCI, False) 29 | assert len(overlapping_loci_dict_of_dict(contig, start, end, d)) == nr 30 | 31 | 32 | @pytest.mark.parametrize("contig,start,end,nr", BED_CASES) 33 | def test_loci_dict_of_list(contig: str, start: int, end: int, nr: int): 34 | d = build_loci_dict_of_list_from_file(TEST_LOCI, False) 35 | assert len(tuple(overlapping_loci_dict_of_list(contig, start, end, d, False))) == nr 36 | --------------------------------------------------------------------------------
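Illustrative note on the overlap semantics exercised by tests/test_mi_intervals.py above: the BED_CASES expectations follow half-open [start, end) coordinates, where a query interval overlaps a catalog locus exactly when query_start < locus_end and query_end > locus_start. The sketch below is not code from this repository; build_loci_by_contig and overlapping_loci are hypothetical stand-ins for build_loci_dict_of_list_from_file and overlapping_loci_dict_of_list, shown only to make the expected counts from tests/data/test_loci.bed concrete.

from collections import defaultdict

# Hypothetical, minimal restatement of the half-open overlap query that the
# BED_CASES expectations assume; not the repository's implementation.
LOCI = [
    ("chr1", 200, 300, "ACAA"),
    ("chr1", 300, 400, "GA"),
    ("chr1", 350, 450, "GAGA"),
    ("chr2", 100, 200, "CAG"),
]


def build_loci_by_contig(loci):
    # Group loci by contig so a query only scans that contig's list.
    by_contig = defaultdict(list)
    for contig, start, end, motif in loci:
        by_contig[contig].append((start, end, motif))
    return by_contig


def overlapping_loci(contig, start, end, by_contig):
    # Half-open overlap: [start, end) intersects [l_start, l_end) iff
    # start < l_end and end > l_start.
    return [
        (l_start, l_end, motif)
        for l_start, l_end, motif in by_contig.get(contig, [])
        if start < l_end and end > l_start
    ]


loci_index = build_loci_by_contig(LOCI)
assert len(overlapping_loci("chr1", 320, 500, loci_index)) == 2  # matches BED_CASES ("chr1", 320, 500, 2)
assert len(overlapping_loci("chr1", 400, 450, loci_index)) == 1
assert len(overlapping_loci("asdf", 50, 1000, loci_index)) == 0

For a catalog of this size a per-contig linear scan is enough to mirror the test expectations; a sorted or interval-tree index would be the usual choice for large locus catalogs.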