├── .github
    └── workflows
    │   ├── publish-docker.yml
    │   ├── release.yml
    │   └── test.yml
├── .gitignore
├── .idea
    ├── .gitignore
    ├── .name
    ├── csv-editor.xml
    ├── inspectionProfiles
    │   ├── Project_Default.xml
    │   └── profiles_settings.xml
    ├── jsLibraryMappings.xml
    ├── misc.xml
    ├── modules.xml
    ├── strkit.iml
    └── vcs.xml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── catalogs
    └── pathogenic_assoc.hg38.tsv
├── docs
    ├── caller_catalog.md
    ├── caller_usage.md
    ├── images
    │   ├── browser_hist.png
    │   ├── browser_igv.png
    │   ├── call_method_flow.png
    │   ├── strkit_logo_open_graph.png
    │   └── strkit_logo_small.png
    ├── output_formats.md
    └── trio_analyses.md
├── pyproject.toml
├── requirements.txt
├── setup.py
├── strkit
    ├── VERSION
    ├── __init__.py
    ├── call
    │   ├── __init__.py
    │   ├── align_matrix.py
    │   ├── allele.py
    │   ├── call_locus.py
    │   ├── call_sample.py
    │   ├── cigar.py
    │   ├── non_daemonic_pool.py
    │   ├── output
    │   │   ├── __init__.py
    │   │   ├── json_report.py
    │   │   ├── tsv.py
    │   │   └── vcf.py
    │   ├── params.py
    │   ├── realign.py
    │   ├── repeats.py
    │   ├── snvs.py
    │   ├── types.py
    │   ├── utils.py
    │   └── validation.py
    ├── catalog
    │   ├── __init__.py
    │   └── combine.py
    ├── constants.py
    ├── convert
    │   ├── __init__.py
    │   ├── _bed_4.py
    │   ├── constants.py
    │   ├── converter.py
    │   ├── expansionhunter.py
    │   ├── gangstr.py
    │   ├── hipstr.py
    │   └── trgt.py
    ├── entry.py
    ├── exceptions.py
    ├── iupac.py
    ├── json.py
    ├── logger.py
    ├── mi
    │   ├── __init__.py
    │   ├── base.py
    │   ├── expansionhunter.py
    │   ├── gangstr.py
    │   ├── generic_vcf.py
    │   ├── intervals.py
    │   ├── repeathmm.py
    │   ├── result.py
    │   ├── straglr.py
    │   ├── strkit.py
    │   ├── tandem_genotypes.py
    │   ├── trgt.py
    │   └── vcf_utils.py
    ├── utils.py
    └── viz
    │   ├── __init__.py
    │   ├── server.py
    │   ├── static
    │       └── logo.png
    │   └── templates
    │       └── browser.html
└── tests
    ├── data
        └── test_loci.bed
    ├── test_caller_locus_validation.py
    ├── test_caller_utils.py
    ├── test_iupac.py
    └── test_mi_intervals.py


/.github/workflows/publish-docker.yml:
--------------------------------------------------------------------------------
 1 | name: Publish STRkit Docker image
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | jobs:
 8 |   publish:
 9 |     runs-on: ubuntu-latest
10 | 
11 |     permissions:
12 |       packages: write
13 |       contents: read
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v4
17 | 
18 |       - name: Set up QEMU
19 |         uses: docker/setup-qemu-action@v3
20 |         with:
21 |           platforms: linux/amd64,linux/arm64
22 | 
23 |       - uses: docker/metadata-action@v5
24 |         id: meta
25 |         with:
26 |           images: ghcr.io/davidlougheed/strkit
27 |           tags: |
28 |             type=semver,pattern={{version}}
29 |             type=semver,pattern={{major}}.{{minor}}
30 | 
31 |       - uses: docker/setup-buildx-action@v3
32 | 
33 |       - uses: docker/login-action@v3
34 |         with:
35 |           registry: ghcr.io
36 |           username: ${{ github.actor }}
37 |           password: ${{ secrets.GITHUB_TOKEN }}
38 | 
39 |       - uses: docker/build-push-action@v5
40 |         with:
41 |           context: .
42 |           push: true
43 |           tags: ${{ steps.meta.outputs.tags }}
44 |           labels: ${{ steps.meta.outputs.labels }}
45 |           platforms: linux/amd64,linux/arm64
46 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Publish PyPI release
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 | 
 7 | jobs:
 8 |   publish:
 9 |     runs-on: ubuntu-latest
10 | 
11 |     permissions:
12 |       contents: read
13 |       id-token: write
14 | 
15 |     environment:
16 |       name: release
17 |       url: https://pypi.org/p/strkit
18 | 
19 |     steps:
20 |       - uses: actions/checkout@v4
21 | 
22 |       - uses: actions/setup-python@v5
23 |         with:
24 |           python-version: '3.10'
25 | 
26 |       - name: Install pypa/build
27 |         run: python -m pip install build --user
28 | 
29 |       - name: Build
30 |         run: python -m build --sdist --wheel --outdir dist/ .
31 | 
32 |       - name: Publish to PyPI
33 |         uses: pypa/gh-action-pypi-publish@release/v1
34 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 | 
 9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     strategy:
13 |       matrix:
14 |         python-version: [ "3.10", "3.11", "3.12" ]
15 | 
16 |     steps:
17 |       - uses: actions/checkout@v4
18 |       - uses: actions/setup-python@v5
19 |         name: Set up Python
20 |         with:
21 |           python-version: ${{ matrix.python-version }}
22 |       - name: Install dependencies
23 |         run: pip install -r requirements.txt
24 |       - name: Install STRkit
25 |         run: pip install .
26 |       - name: Test
27 |         run: pytest -svv --cov=strkit --cov-branch
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /env
 2 | /envp11
 3 | __pycache__
 4 | 
 5 | /build
 6 | /dist
 7 | /strkit.egg-info
 8 | # ignore WIP cohort code for now
 9 | /strkit/cohort
10 | 
11 | *.bam
12 | *.bai
13 | *.fa.gz
14 | *.fa.gz.fai
15 | *.fa.gz.gzi
16 | *.bed
17 | !tests/data/*.bed
18 | /*.json
19 | /*.tsv
20 | *.vcf.gz*
21 | *.vcf
22 | 
23 | *.token
24 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 


--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | strkit


--------------------------------------------------------------------------------
/.idea/csv-editor.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="CsvFileAttributes">
 4 |     <option name="attributeMap">
 5 |       <map>
 6 |         <entry key="/env/lib/python3.12/site-packages/statsmodels/tsa/statespace/tests/results/results_realgdpar_stata.csv">
 7 |           <value>
 8 |             <Attribute>
 9 |               <option name="separator" value="," />
10 |             </Attribute>
11 |           </value>
12 |         </entry>
13 |       </map>
14 |     </option>
15 |   </component>
16 | </project>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="ignoredPackages">
 6 |         <value>
 7 |           <list size="9">
 8 |             <item index="0" class="java.lang.String" itemvalue="django-extensions" />
 9 |             <item index="1" class="java.lang.String" itemvalue="pandas" />
10 |             <item index="2" class="java.lang.String" itemvalue="scipy" />
11 |             <item index="3" class="java.lang.String" itemvalue="six" />
12 |             <item index="4" class="java.lang.String" itemvalue="threadpoolctl" />
13 |             <item index="5" class="java.lang.String" itemvalue="matplotlib" />
14 |             <item index="6" class="java.lang.String" itemvalue="python-dateutil" />
15 |             <item index="7" class="java.lang.String" itemvalue="numpy" />
16 |             <item index="8" class="java.lang.String" itemvalue="Pillow" />
17 |           </list>
18 |         </value>
19 |       </option>
20 |     </inspection_tool>
21 |     <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
22 |       <option name="processCode" value="true" />
23 |       <option name="processLiterals" value="true" />
24 |       <option name="processComments" value="true" />
25 |     </inspection_tool>
26 |   </profile>
27 | </component>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="PROJECT_PROFILE" value="Default" />
4 |     <option name="USE_PROJECT_PROFILE" value="false" />
5 |     <version value="1.0" />
6 |   </settings>
7 | </component>


--------------------------------------------------------------------------------
/.idea/jsLibraryMappings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptLibraryMappings">
4 |     <file url="file://$PROJECT_DIR$" libraries="{@observablehq, d3.v7, igv}" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="Black">
 4 |     <option name="sdkName" value="Python 3.10 (stronger)" />
 5 |   </component>
 6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (stronger)" project-jdk-type="Python SDK" />
 7 |   <component name="PyCharmProfessionalAdvertiser">
 8 |     <option name="shown" value="true" />
 9 |   </component>
10 |   <component name="PythonCompatibilityInspectionAdvertiser">
11 |     <option name="version" value="3" />
12 |   </component>
13 | </project>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/strkit.iml" filepath="$PROJECT_DIR$/.idea/strkit.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/strkit.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$">
 5 |       <excludeFolder url="file://$MODULE_DIR$/.mypy_cache" />
 6 |       <excludeFolder url="file://$MODULE_DIR$/.pytest_cache" />
 7 |       <excludeFolder url="file://$MODULE_DIR$/build" />
 8 |       <excludeFolder url="file://$MODULE_DIR$/env" />
 9 |       <excludeFolder url="file://$MODULE_DIR$/envp11" />
10 |       <excludeFolder url="file://$MODULE_DIR$/strkit.egg-info" />
11 |     </content>
12 |     <orderEntry type="jdk" jdkName="Python 3.12 (stronger)" jdkType="Python SDK" />
13 |     <orderEntry type="sourceFolder" forTests="false" />
14 |     <orderEntry type="library" name="d3.v7" level="application" />
15 |     <orderEntry type="library" name="igv" level="application" />
16 |     <orderEntry type="library" name="@observablehq" level="application" />
17 |   </component>
18 | </module>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.12-bookworm
 2 | 
 3 | WORKDIR /strkit
 4 | 
 5 | COPY LICENSE .
 6 | COPY MANIFEST.in .
 7 | COPY pyproject.toml .
 8 | COPY README.md .
 9 | COPY setup.py .
10 | COPY strkit strkit
11 | 
12 | RUN curl https://sh.rustup.rs -sSf > rustup-init.sh
13 | RUN sh ./rustup-init.sh -y
14 | ENV PATH="/root/.cargo/bin:${PATH}"
15 | 
16 | RUN pip install -U pip
17 | RUN pip install --no-cache-dir -v .
18 | 
19 | CMD [ "strkit" ]
20 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include strkit/VERSION
2 | include strkit/viz/static/logo.png
3 | include strkit/viz/templates/*.html
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # STRkit - short tandem repeat genotyping with long reads
  2 | 
  3 | [![PyPI version](https://badge.fury.io/py/strkit.svg)](https://badge.fury.io/py/strkit)
  4 | [![BioRxiv DOI](https://img.shields.io/badge/bioRxiv-10.1101/2025.03.25.645269-B31B1B.svg)](https://doi.org/10.1101/2025.03.25.645269)
  5 | [![Zenodo DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12689906.svg)](https://doi.org/10.5281/zenodo.12689906)
  6 | 
  7 | STRkit is a short tandem repeat (STR) genotyping and analysis toolkit for long read sequencing data, especially 
  8 | PacBio HiFi data. The STRkit software package is written in Python and is available in the PyPI package registry or as
  9 | a Docker container.
 10 | 
 11 | If you use STRkit in published work, please cite our preprint:
 12 | 
 13 | > [STRkit: precise, read-level genotyping of short tandem repeats using long reads and single-nucleotide variation.](https://doi.org/10.1101/2025.03.25.645269)
 14 | > David R Lougheed, Tomi Pastinen, Guillaume Bourque. *BioRxiv&nbsp;preprint*.
 15 | > DOI:&nbsp;[10.1101/2025.03.25.645269](https://doi.org/10.1101/2025.03.25.645269)
 16 | 
 17 | <img src="./docs/images/strkit_logo_small.png" alt="" width="500" height="324" />
 18 | 
 19 | 
 20 | ## Table of Contents
 21 | 
 22 | * [Installation](#installation)
 23 |   * [Via PyPI](#via-pypi)
 24 |   * [As a Docker container](#as-a-docker-container)
 25 | * [Commands](#commands)
 26 |   * [`strkit call`: Genotype caller with bootstrapped confidence intervals](#strkit-call-genotype-caller-with-bootstrapped-confidence-intervals)
 27 |     * [Features](#features)
 28 |     * [Usage](#usage)
 29 |     * [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format)
 30 |   * [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer)
 31 |   * [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis)
 32 |     * [Usage](#usage-1)
 33 |     * [Further documentation](#further-documentation)
 34 |   * [`strkit convert`: STR catalog conversion](#strkit-convert-str-catalog-conversion)
 35 |     * [Usage](#usage-2)
 36 | * [Copyright and License](#copyright-and-license)
 37 |   * [Notice](#notice)
 38 |   * [Exceptions](#exceptions)
 39 | 
 40 | 
 41 | ## Installation
 42 | 
 43 | ### Via PyPI
 44 | 
 45 | STRkit requires Python 3.10+ and can be installed from PyPI via `pip` 
 46 | with the following command:
 47 | 
 48 | ```bash
 49 | python -m pip install strkit
 50 | ```
 51 | 
 52 | You may need to install the [Rust toolchain](https://www.rust-lang.org/tools/install)
 53 | and a C compiler (e.g., `gcc`, `clang`), as well as `cmake`, to compile the `strkit_rust_ext` wheel, 
 54 | although prebuilt wheels for this module are available for some platforms. Compiling the wheel may take quite
 55 | a long time (in the tens of minutes).
 56 | 
 57 | On Digital Research Alliance of Canada/Compute Canada clusters, this involves loading a few modules:
 58 | 
 59 | ```bash
 60 | module load rust/1.85.0 clang/18.1.8 python/3.11 scipy-stack/2025a parasail/2.6.2
 61 | python -m pip install strkit
 62 | ```
 63 | 
 64 | STRkit should then be available in your Python environment as a command-line tool:
 65 | 
 66 | ```bash
 67 | strkit --help
 68 | ```
 69 | 
 70 | ### As a Docker container
 71 | 
 72 | STRkit is also available as a [Docker container](https://github.com/davidlougheed/strkit/pkgs/container/strkit), stored 
 73 | in the GitHub Container Registry.
 74 | 
 75 | It can be pulled using the following command:
 76 | 
 77 | ```bash
 78 | docker pull ghcr.io/davidlougheed/strkit:latest
 79 | ```
 80 | 
 81 | Then, STRkit commands can be run mostly as normal using the Docker image:
 82 | 
 83 | ```bash
 84 | docker run -it ghcr.io/davidlougheed/strkit --help
 85 | ```
 86 | 
 87 | 
 88 | ## Commands
 89 | 
 90 | ### `strkit call`: Genotype caller with bootstrapped confidence intervals
 91 | 
 92 | A Gaussian mixture model tandem repeat genotype caller for long read data.
 93 | STRkit is tuned specifically for high-fidelity long reads, although other 
 94 | long read data should still work.
 95 | 
 96 | ![Calling approach flow chart](./docs/images/call_method_flow.png)
 97 | 
 98 | #### Features:
 99 | 
100 | * Performant, vectorized (thanks to [parasail](https://github.com/jeffdaily/parasail))
101 |   estimates of repeat counts from high-fidelity long reads and a supplied 
102 |   catalog of TR loci and motifs.
103 | * Re-weighting of longer reads, to compensate for their lower likelihood of observation.
104 |   * Whole-genome and targeted genotyping modes to adjust this re-weighting.
105 | * Incorporation of single-nucleotide variation (SNVs) for better and faster calling plus 
106 |   additional downstream analysis possibilities.
107 |   * Recommended for **HiFi data and ONT R10 data only**. In my testing, this worsens runtime and call quality for 
108 |     ONT ultra-long-read data, but speeds up the tool and improves call quality for HiFi/ONT R10 data. 
109 | * Parallelized for faster computing on clusters and for ad-hoc fast analysis of single samples.
110 | * 95% confidence intervals on calls via a user-configurable optional parametric bootstrapping process.
111 | 
112 | 
113 | #### Usage:
114 | 
115 | See all parameters and example usage with a Slurm cluster: 
116 | [Advanced caller usage and configuration](./docs/caller_usage.md)
117 | 
118 | ##### EXAMPLE USAGE
119 | 
120 | ```bash
121 | # For the dbSNP VCF used below for SNV incorporation, see https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/
122 | # (00-common_all.vcf.gz)
123 | #
124 | # "Accurate reads" here means HiFi / ONT R10 duplex reads, but in practice may also include ONT R10 simplex reads.
125 | 
126 | strkit call \
127 |   path/to/read/file.bam \  # [REQUIRED] One indexed read file (BAM/CRAM)
128 |   --hq \  # If using accurate reads, enable this to get better genotyping & more robust expansion detection
129 |   --realign \  # If using accurate reads, enable this to enable local realignment / read recovery. Good for detecting expansions, but slows down calling.
130 |   --ref path/to/reference.fa.gz \  # [REQUIRED] Indexed FASTA-formatted reference genome
131 |   --loci path/to/loci.bed \  # [REQUIRED] TRF-formatted (or 4-col, with motif as last column) sorted list of loci to genotype
132 |   --incorporate-snvs path/to/dbsnp/00-common_all.vcf.gz \   # If you want, specify a SNV catalogue to help phase STRs & speed up calling
133 |   --vcf my-calls.vcf \  # Calculate consensus sequences for alleles and output a .vcf (or .vcf.gz) with call data
134 |   --seed 183 \  # Fixed random number generator seed for replicability
135 |   --processes 10 \  # Number of parallel processes to use; DEFAULT: 1
136 |   --no-tsv  # If VCF output is enabled as above, we don't need TSV genotype output to stdout (which is the default)
137 | ```
138 | 
139 | ##### REGARDING ALIGNMENTS
140 | 
141 | Ideally, you should be using a read file aligned with parameters tuned for tandem repeats. 
142 | PacBio provides a 
143 | [recommended workflow](https://github.com/PacificBiosciences/apps-scripts/tree/master/RepeatAnalysisTools)
144 | for CCS alignment in this scenario. However, regular aligned readsets are fine and have been tested
145 | extensively.
146 | 
147 | If you're using accurate long reads (e.g., HiFi, ONT R10 duplex) as input, **use the `--hq` and 
148 | `--realign` options** to get better genotype calculation and a greater proportion of reads 
149 | incorporated into the computed genotypes, respectively. These should not add much performance 
150 | overhead. *In practice, these options may also aid calling with slightly-less-accurate reads.*
151 | 
152 | If you want to **incorporate haplotagging from an alignment file (`HP` tags)** into the 
153 | process, which should speed up runtime and potentially improve calling results, you must pass 
154 | the `--use-hp` flag.
155 | 
156 | ##### REGARDING SNV INCORPORATION
157 | 
158 | If you want to **incorporate SNV calling** into the process, which speeds up runtime and gives
159 | marginally better calling results, you must provide an indexed, `bgzip`-compressed SNV catalog 
160 | VCF which matches your reference genome. You can find dbSNP VCFs at
161 | [`https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/`](https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/).
162 | The file for GRCh38 is called `00-common_all.vcf.gz` as of time of writing.
163 | **Note that this does not need to be an SNV call file for your sample, specifically**; just one 
164 | which has positions, reference/alternate alleles, and the `ID` field populated.
165 | 
166 | ##### REGARDING OUTPUT
167 | 
168 | If you want to output a full call report, you can use the `--json output-file.json` argument to
169 | specify a path to output a more detailed JSON document to. This document contains 99% CIs, peak
170 | labels, and some other information that isn't included in the normal TSV file. If you want this
171 | file to be indented and human-readable, use the `--indent-json` flag in addition to `--json ...`.
172 | 
173 | If you want to output a VCF file (STRs and SNVs if called; currently not phased), use the
174 | `--vcf ...` argument. If you pass `--vcf stdout`, the VCF will be written to `stdout` instead of a 
175 | file.
176 | 
177 | For more information, see also documentation on the [Output formats](./docs/output_formats.md).
178 | 
179 | ##### REGARDING REFERENCE GENOMES
180 | 
181 | The reference genome provided must be BGZipped and indexed using `samtools faidx`:
182 | 
183 | ```bash
184 | # Starting from a .fa:
185 | bgzip my-reference.fa  # Replaces .fa with a .fa.gz file
186 | samtools faidx my-reference.fa.gz  # Generates a .fai index file
187 | ```
188 | 
189 | ##### OTHER PARAMETERS
190 | 
191 | See the '[Caller catalog format & choosing a catalog](./docs/caller_catalog.md)' page for more on
192 | how to format a locus catalog or choose from existing available catalogs.
193 | 
194 | 
195 | #### Further documentation on the STRkit caller, including output format:
196 | 
197 |   * [Advanced caller usage and configuration](./docs/caller_usage.md)
198 |   * [Caller catalog format & choosing a catalog](./docs/caller_catalog.md)
199 |   * [Output formats](./docs/output_formats.md)
200 | 
201 | 
202 | ### `strkit visualize`: Call visualizer
203 | 
204 | STRkit bundles a call visualization tool which takes as input a BAM file and
205 | a JSON call file from using the `--json` flag with `strkit call`.
206 | 
207 | It starts a web server on your local machine; the visualizations can be 
208 | interacted with in a web browser.
209 | 
210 | To use the tool, run the following command:
211 | 
212 | ```bash
213 | strkit visualize path/to/my-alignment.bam \ 
214 |   --ref hg38 \  # or hg19
215 |   --json path/to/my-calls.json \
216 |   -i 1  # 1-indexed offset in JSON file for locus of interest. Default is 1 if left out.
217 | ```
218 | 
219 | This will output something like the following:
220 | 
221 | ```
222 |  * Serving Flask app 'strkit.viz.server' (lazy loading)
223 |  * Environment: production
224 |    WARNING: This is a development server. Do not use it in a production deployment.
225 |    Use a production WSGI server instead.
226 |  * Debug mode: on
227 |  * Running on http://localhost:5011 (Press CTRL+C to quit)
228 | ...
229 | ```
230 | 
231 | You can then go to the URL listed, `http://localhost:5011`, on your local machine
232 | to see the visualization tool:
233 | 
234 | ![Browser Histogram](./docs/images/browser_hist.png)
235 | *STRkit browser histogram, showing an expansion in the HTT gene.*
236 | 
237 | ![igv.js Genome Browser](./docs/images/browser_igv.png)
238 | *The same expansion, shown in the igv.js browser. Note the insertions on
239 | the left-hand side in most reads, and the heterozygous copy number pattern.*
240 | 
241 | To exit the tool, press `Ctrl-C` in your command line window as mentioned in 
242 | the start-up instructions.
243 | 
244 | 
245 | 
246 | ### `strkit mi`: Mendelian inheritance analysis
247 | 
248 | Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered 
249 | by looking at inheritance patterns. This tool provides a few different ways to do this, via:
250 | 
251 | * Mendelian inheritance % (MI) calculations for many common TR genotyping tools for both long/short reads, 
252 |   including support for genotyping methods which report confidence intervals.
253 | * Reports of loci (potentially of interest) which do not respect MI.
254 | 
255 | #### Usage
256 | 
257 | For a basic JSON report on Mendelian inheritance with a trio of STRkit VCFs (compressed and indexed with BGZip), use 
258 | something like the following command:
259 | 
260 | ```bash
261 | # In addition to summary figures on Mendelian inheritance, this tool outputs loci which do not respect MI, which may be 
262 | # useful as candidate de novo mutations. The --mismatch-out-mi flag controls which form of MI metric is used for 
263 | # deciding which loci to output. Options for this flag are:
264 | #   strict (strict copy number MI),
265 | #   pm1 (copy number MI ± 1 repeat unit),
266 | #   ci_95 (copy number 95% confidence interval),
267 | #   ci_99 (copy number 99% confidence interval),
268 | #   seq ([allele] sequence MI),
269 | #   sl ([allele] sequence length MI),
270 | #   sl_pm1 ([allele] sequence length MI ± 1 base pair)
271 | strkit mi \
272 |   --caller strkit-vcf \
273 |   --json mi-report.json \
274 |   --mismatch-out-mi seq \
275 |   child-calls.vcf.gz \
276 |   mother-calls.vcf.gz \
277 |   father-calls.vcf.gz
278 | # This will also output a TSV report to stdout. If this is not desired, use --no-tsv to suppress TSV output.
279 | ```
280 | 
281 | For other options and what they do, run `strkit mi` (with no other arguments) or `strkit mi --help`.
282 | 
283 | #### Further documentation
284 | 
285 | **For more information on what kind of analyses can be done with this data**, see the
286 | [Trio analyses with STRkit](./docs/trio_analyses.md) page.
287 | 
288 | 
289 | ### `strkit convert`: STR catalog conversion
290 | 
291 | STRkit takes as input a four-or-more-column BED file, structured like:
292 | 
293 | ```
294 | contig  start end [0 or more extraneous columns] motif
295 | ```
296 | 
297 | Any extraneous columns are removed, (internally) leaving a four-column STR locus representation. 
298 | Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a four-column STR
299 | BED as locus catalog input. However, other formats representing a catalog of STRs exist:
300 | 
301 | * [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a TSV/BED with a lot 
302 |   of information. This can be used as-is with STRkit, but it's safer for other tools to convert to
303 |   a four-column BED format.
304 | * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md),
305 |   which can specify more advanced STR structures.
306 | 
307 | #### Usage
308 | 
309 | The `strkit convert` sub-command requires an input format (`trf` or `trgt`), an output format 
310 | (many, see `strkit convert --help`), and an input file. Output is written to `stdout`.
311 | 
312 | *Note:* Not all input/output format pairs have available converter functions; an error will be 
313 | printed to `stderr` if one does not exist.
314 | 
315 | For example, to convert from a TRF BED to a TRGT repeat definition BED file:
316 | 
317 | ```bash
318 | strkit convert --in-format trf --out-format trgt in_file.trf.bed > out_file.bed
319 | ```
320 | 
321 | To attempt a conversion from a TRGT repeat definition file to a STRkit/four-column motif BED:
322 | 
323 | ```bash
324 | strkit convert --in-format trgt --out-format strkit in_file.trgt.bed > out_file.bed
325 | ```
326 | 
327 | Note that TRGT can represent STRs with complex structure that STRkit cannot, so some of these loci
328 | may not be converted (these will be logged to `stderr`).
329 | 
330 | 
331 | ## Copyright and License
332 | 
333 | * 2021-2023: &copy; David Lougheed (DL) and McGill University 2021-2023 (versions up to and including `0.8.0a1`), 
334 |   created during graduate research by DL.
335 | * 2023+: (versions beyond `0.8.0a1`):
336 |   * Portions &copy; DL and McGill University 2021-2023
337 |   * Portions &copy; McGill University 2024-2025
338 |   * Portions &copy; DL 2024-2025
339 | 
340 | 
341 | ### Notice
342 | 
343 | This program is free software: you can redistribute it and/or modify
344 | it under the terms of the GNU General Public License as published by
345 | the Free Software Foundation, either version 3 of the License, or
346 | (at your option) any later version.
347 | 
348 | This program is distributed in the hope that it will be useful,
349 | but WITHOUT ANY WARRANTY; without even the implied warranty of
350 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
351 | GNU General Public License for more details.
352 | 
353 | You should have received a copy of the GNU General Public License
354 | along with this program.  If not, see <https://www.gnu.org/licenses/>.
355 | 
356 | ### Exceptions
357 | 
358 | **Some exclusions to this license apply; specifically portions of 
359 | [`strkit/viz/templates/browser.html`](strkit/viz/templates/browser.html) and
360 | the STRkit logo files ([./docs/images/strkit_logo_small.png](./docs/images/strkit_logo_small.png)
361 | and [./strkit/viz/static/logo.png](./strkit/viz/static/logo.png).)**
362 | 
363 | The STRkit logo is &copy; David Lougheed 2022, and was designed by Evelyn Lougheed. It is not licensed
364 | under the terms of the GPL 3.0; it is instead licensed under the terms of the 
365 | [CC BY-ND 4.0](https://creativecommons.org/licenses/by-nd/4.0/).
366 | 
367 | Portions of `viz/templates/browser.html` copyright (C) 2021-2022  Observable, Inc.
368 | Used under the terms of the ISC license.
369 | 


--------------------------------------------------------------------------------
/catalogs/pathogenic_assoc.hg38.tsv:
--------------------------------------------------------------------------------
 1 | # Citations mostly obtained from Gall-Duncan et al. 2022	.	.	.	.	.	.	.	.
 2 | # contig	start	end	disease	inheritance	gene	citation	pathogenic form	notes	motif
 3 | chr1	57367043	57367125	SCA37	AD	DAB1	Seixas et al. 2017			RAAAT
 4 | chr1	94418421	94418442	OPDM	AD	ABCD3	Cortese et al. 2024	CN>=118		GCC
 5 | chr1	149390802	149390842	NIID;EssentialTremor;ALS;OPDM3	AD;Assoc;Assoc;Familial	NOTCH2NLC	Tian et al. 2019;Sun et al. 2020;Yuan et al. 2020;Yu et al. 2021			GGC
 6 | chr2	96197066	96197124	BAFME2	AD	STARD7	Corbett et al. 2019			AAAWK
 7 | chr2	100104798	100104824	Developmental anomalies	PossibleAssoc	AFF3	Metsu et al. 2014			GCC
 8 | chr2	176093058	176093099	SPD1	AD	HOXD13	Gong et al. 2011	CN>=22		GCN
 9 | chr2	190880872	190880920	GD	AR	GLS	van Kuilenburg et al. 2019			GCA
10 | chr3	63912684	63912715	SCA7	Familial	ATXN7	Stevanin et al. 1998	CN>=37		GCA
11 | chr3	129172576	129172733	DM2	AD	CNBP	Liquori et al. 2001			CASR
12 | chr3	183712176	183712226	BAFME4	AD	YEATS2	Yeetong et al. 2019			ATTTY
13 | chr4	3074876	3074940	HD	AD	HTT	HDCRG 1993	CN>=36		CAG
14 | chr4	39348424	39348479	CANVAS	AR	RFC1	Cortese et al. 2019&Rafehi et al. 2019			AARRG
15 | chr4	41745975	41746022	CCHS	Familial	PHOX2B	Amiel et al. 2003			GCC
16 | chr5	10356338	10356411	BAFME3	AD	MARCHF6	Florian et al. 2019			TTTYA
17 | chr5	146878727	146878759	SCA12	AD	PPP2R2B	Holmes et al. 1999			GCT
18 | chr6	16327633	16327724	SCA1;ALS	AD;Assoc	ATXN1	Orr et al. 1993;Lattante et al. 2018,Tazelaar et al. 2020			TGC
19 | chr6	45422749	45422794	CCD	AD	RUNX2	Mundlos et al. 1997			GGC
20 | chr6	170561906	170562017	SCA17	AD	TBP	Koide et al. 1999	CN>=43		GCA
21 | chr7	27199679	27199732	HFGS	AD	HOXA13	Utsch et al. 2002			GCN
22 | chr8	118366812	118366918	BAFME1	AD	SAMD12	Ishiura et al. 2018			AARTA
23 | chr9	27573484	27573546	ALS/FTD	AD	C9orf72	Renton et al. 2011&DeJesus-Hernandez et al. 2011			GCCCCG
24 | chr10	79826380	79826404	OPDM	Assoc	NUTM2B-AS1	Gu et al. 2024			CGG
25 | chr11	119206289	119206323	JS	Assoc	CBL2	Michaelis et al. 1998			CGG
26 | chr12	6936716	6936775	DRPLA	AD	ATN1	Nagafuchi et al. 1994,Koide et al. 1994,Chaudhry et al. 2021			CAG
27 | chr12	50505001	50505024	ID;LGS	Assoc;Assoc	DIP2B	Winnepenninckx et al. 2007;Qaiser et al. 2021			GGC
28 | chr13	70139351	70139429	SCA8;LGS	AD;Assoc	ATXN8OS	Koob et al. 1999;Qaiser et al. 2021			TRC
29 | chr13	99985448	99985494	HPE5	AD	ZIC2	Brown et al. 1998			GCG
30 | chr14	23321464	23321543	OPMD	AD	PABP2	Brais et al. 1998			GCG
31 | chr15	22786671	22786703	ALS	Assoc	NIPA1	Blauw et al. 2012			GCG
32 | chr16	17470920	17470921	BSS	AR	XYLT1	LaCroix et al. 2019			GGC
33 | chr16	24613438	24613532	BAFME6	AD	TNRC6A	Ishiura et al. 2018			ATTTY
34 | chr16	66490398	66490466	SCA31	AD	ENSG00000260851	Sato et al. 2009			TRRAA
35 | chr18	55586153	55586229	FECD	Assoc	TCF4	Wieben et al. 2012			AGC
36 | chr19	13207858	13207898	SCA6	AD	CACNA1	Zhuchenko et al. 1997			CTG
37 | chr19	14496041	14496085	OPDM2	Familial	GIPC1	Deng et al. 2020			CCG
38 | chr19	18786027	18786050	PSACH	AD	COMP	Deere et al. 1999			CGT
39 | chr19	45770204	45770266	DM1	AD	DMPK	Many			CAG
40 | chr20	2652732	2652775	SCA36	AD	NOP56	Kobayashi et al. 2011			GGGCCT
41 | chr21	43776442	43776479	EPM1	AR	CSTB	Lalioti et al. 1998			GCGCGGGGCGGG
42 | chr22	45795354	45795424	SCA10	AD	SCA10	Matsuura et al. 2000,Matsuura et al. 2006	CN>=280	Variable penetrance in intermediate range ~280-800	ATTCT
43 | chrX	67545316	67545419	SBMA	X-linked	AR	La Spada et al. 1991,Fratta et al. 2014	CN>=38		GCA
44 | chrX	71453054	71453129	XDP	X-linked	TAF1	Bragg et al. 2017			GAGAGG
45 | chrX	147912036	147912111	FXS;FXPOI;FXTAS	X-linked	FMR1	Many	CN>=200;55<=CN<200;55<=CN<200		GGC
46 | chrX	148500604	148500753	FRAXE	X-linked	AFF2	Knight et al. 1993,Gu et al. 1996	CN>=200		GCC
47 | 


--------------------------------------------------------------------------------
/docs/caller_catalog.md:
--------------------------------------------------------------------------------
 1 | # Caller catalog format & choosing a catalog
 2 | 
 3 | ## Caller catalog format
 4 | 
 5 | For the `--loci` argument, `strkit call` takes a list of loci in a modified BED / TSV format,
 6 | similar to methods like Straglr/Tandem-genotypes/GangSTR.
 7 | 
 8 | The file must be structured with a row per locus, where each row looks like:
 9 | 
10 | ```
11 | chr#    10000    10101    [...]    AC
12 | ```
13 | 
14 | The important requirements here are:
15 | 
16 |   * The fields are tab-separated
17 |   * The rows are sorted by contig, and then by starting position
18 |   * Locus coordinates are 0-based and half-open (start is inclusive, end is exclusive)
19 |   * The locus motif must come **last** in the row, but *any number of fields* can separate
20 |     the end position and the motif.
21 | 
22 | As a result, STRkit can take myrid different TSV-type catalog formats as input, including
23 | those produced from the TRF UCSC browser track, or for GangSTR, or for Straglr.
24 | 
25 | Here are a few notes on catalogs:
26 | 
27 |   * Coordinates are used to locate the STR locus in the reference genome, but may be slightly 
28 |     expanded to better encompass the entire locus.
29 |   * Be wary of using Tandem Repeats Finder output directly as a catalog, as it can output multiple
30 |     rows for the same locus, or define motifs in a "compound" fashion, e.g., `ATATAT` instead of `AT`.
31 |   * Some disease expansions can contain multiple different motifs, 
32 |     which may be not present in the reference genome at all (for example: 
33 |     [CANVAS](https://pubmed.ncbi.nlm.nih.gov/31230722/), [BAFME2](https://www.nature.com/articles/s41467-019-12671-y)).
34 |     As such, we provide a mechanism to specify motifs using any 
35 |     [IUPAC code](https://www.bioinformatics.org/sms/iupac.html). 
36 |     Thus, the CANVAS and BAFME2 motifs can be represented as `AARRG` and `AAAWK`, respectively.
37 |     We also add in a non-IUPAC code, `X`, which behaves like `N` in that it represents any base, 
38 |     but instead of giving a reward of `+2` it neither penalizes nor rewards alignment, 
39 |     and penalizes a gap. We use this internally to represent low-confidence base calls.
40 |   * Related to the above, this can be important for diseases such as SCA37, where the motif composition 
41 |     (rather than the actual copy number) is associated with disease 
42 |     ([Seixas *et al.* 2017](https://doi.org/10.1016%2Fj.ajhg.2017.06.007)). Here, STRkit's motif-sized k-mer counting
43 |     function can be used during calling with the `--count-kmers` flag. See the 
44 |     [advanced usage](https://github.com/davidlougheed/strkit/blob/master/docs/caller_usage.md#all-optional-flags) page 
45 |     for more.
46 | 
47 | 
48 | ## Choosing an existing catalog
49 | 
50 | Other researchers have done extensive work in identifying and cataloguing loci for genotyping:
51 | 
52 |   * The Tandem Repeats Finder track for the UCSC browser, available as a 
53 |     [downloadable BED file](https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.trf.bed.gz),
54 |     with the caveat that this file includes **overlapping entries**, and TRs may not always be represented in 
55 |     their most 'essential' form (e.g., using the motif `TATATATA` instead of just `TA`). Thus, some work may be
56 |     required to create a desirable locus catalog.
57 |   * The researchers behind the [GangSTR](https://github.com/gymreklab/GangSTR) short-read STR genotyping method
58 |     have prepared [several extensive STR catalogs](https://github.com/gymreklab/GangSTR#gangstr-reference-files) 
59 |     for different human reference genomes, containing motifs up to 20bp in length. However, **these files use
60 |     1-based closed-interval coordinates**, and should be adjusted (subtracting 1 from all start coordinates) to 
61 |     transform them into the 0-based half-open interval coordinates when using them with STRkit.
62 |   * We have prepared a [catalog of disease-causing or disease-associated loci](../catalogs/pathogenic_assoc.hg38.tsv) 
63 |     for the `hg38` reference genome, partially based on the review research done by Gall-Duncan *et al.* (2022), as well
64 |     as entries from the [STRipy database](https://stripy.org/database) 
65 |     (DOI: [10.1002/humu.24382](https://doi.org/10.1002/humu.24382)) and our own reading of other articles.
66 | 


--------------------------------------------------------------------------------
/docs/caller_usage.md:
--------------------------------------------------------------------------------
  1 | # Advanced caller usage and configuration
  2 | 
  3 | 
  4 | ## All optional flags
  5 | 
  6 | * `--sample-id example_sample`: Set a sample ID, or override the alignment file sample ID. This will be included in JSON 
  7 |   output, but not TSV output.
  8 | * `--min-reads ##`: Minimum number of supporting reads needed to make a call. **Default:** 4
  9 | * `--min-allele-reads ##`: Minimum number of supporting reads needed to call a specific allele size. 
 10 |   **Default:** 2
 11 | * `--max-reads ##`: Maximum number of supporting reads to use for calling a locus. **Default:** 250
 12 | * `--min-avg-phred ##`: Minimum average PHRED score for relevant bases (flanking region + tandem repeat).
 13 |   Read segments with average PHRED scores below this (common with a threshold of ~13 and ONT Ultra Long reads, 
 14 |   for example) will be skipped. **Default:** 13
 15 | * `--min-read-align-score #.#`: Minimum normalized read alignment score (fractional; `0.0` to `1.0`) needed to include a 
 16 |   read in a call. A good value for pure tandem repeats is 0.9. A good value for much more lenient genotyping is anywhere
 17 |   from 0.0-0.4. **Default:** 0.9
 18 | * `--max-rcn-iters ##`: Maximum number of read copy-number counting iterations to perform. Loci which require a lot of
 19 |   iterations are probably impure tandem repeats, for which the resulting copy number will not be very accurate anyway.
 20 |   **Default:** 50
 21 | * `--flank-size ##`: Size of the flanking region to use on either side of a region to properly anchor reads. 
 22 |   **Default:** 70
 23 | * `--realign` or `-a`: Whether to perform local re-alignment to attempt recovery of soft-clipped reads. Some aligners
 24 |   may soft-clip around large insertions, e.g. with an expansion (I've noticed this with *pbmm2*/*minimap2*). 
 25 |   Currently recommended **for HiFi or ONT R10 only**, since this step aggressively filters out realignments with 
 26 |   many mismatches or small indels. Enabling this slows down calling, so it may not be suitable for a very large catalog 
 27 |   of tandem repeats.
 28 | * `--hq`: Whether to treat provided reads as "high quality", i.e., fairly close to the actual true sequence. Used when 
 29 |   detecting expansions, to skip a smoothing filter that may ignore disparate, rare expansion-like read counts.
 30 |   Use for CCS reads or similar data (e.g., R10 nanopore data) ONLY! **Default:** off
 31 | * `--use-hp`: Whether to incorporate `HP` tags from a haplotagged alignment file. This should speed up runtime and 
 32 |   will potentially improve calling results. **This flag is experimental, and has not been tested extensively.**
 33 | * `--skip-supplementary` or `--skip-supp`: Skip supplementary alignments. **Default:** off
 34 | * `--skip-secondary` or `--skip-sec`: Skip secondary alignments. **Default:** off
 35 | * `--incorporate-snvs [path]` or `--snv [path]`: A path to a VCF with SNVs to incorporate into the calling process and 
 36 |   final output. This file is just used as an SNV loci catalog; STRkit itself will perform the SNV calling. Empirically 
 37 |   improves calling quality a small amount, speeds up runtime, and gives nearby SNV calls for downstream analysis.
 38 |   You can find dbSNP VCFs at
 39 |   [`https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/`](https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/).
 40 |   The file for GRCh38 is called `00-common_all.vcf.gz` as of time of writing.
 41 | * `--snv-min-base-qual [int]` or `--min-sbq [int]`: Minimum PHRED quality score for bases of SNVs to use for phasing. 
 42 |   **Default:** 20
 43 | * `--targeted` or `-t`: Turn on targeted genotyping mode, which re-weights longer reads differently. Use this option if
 44 |   the alignment file contains targeted reads that do not reflect normal mapping patterns, e.g. from PacBio No-Amp 
 45 |   Targeted Sequencing. **Default:** off
 46 | * `--respect-ref` or `-e`: Turn off reference TR region 'coordinate extension' from what is specified in the catalog.
 47 |   TR boundaries can be blurry, so by default we give STRkit an opportunity to extend the provided region to improve
 48 |   mapped indel capturing and to be consistent with the approach we use to count repeat copies in non-reference samples.
 49 |   Turning this off should give results closer to other STR callers, at the cost of potentially missing variation.
 50 | * `--count-kmers` or `-k`: Turn on motif-sized k-mer counting at the allele level, with `-k peak`, or at the read 
 51 |   level, with `-k read`, or both with `-k both`. If the flag is provided with no value, it will default to `peak.`
 52 |   Note that k-mer counts will only be reported if a `--json` path is specified. This feature can be used to detect
 53 |   motif composition differences between alleles or samples. **Default:** `none`
 54 | * `--consensus` or `-c`: Turn on consensus calculation for alleles. This adds runtime, but gives a better idea of STR 
 55 |   structure and is useful for comparing alleles beyond copy number. If `--vcf` is set, this option is forced on. 
 56 |   **Default:** off
 57 | * `--vcf-anchor-size`: Number of bases upstream (5') of the tandem repeat to include in the VCF output. This can include
 58 |   small indels, and having a size above `1` may be beneficial or detrimental to the use case at hand, but is nice for 
 59 |   benchmarking and in case of slight misalignment. This is clamped to being in the range of `[1, flank_size]`.
 60 |   **Default:** 5
 61 | * `--num-bootstrap ###` or `-b`: Now many bootstrap re-samplings to perform. **Default:** 100
 62 | * `--sex-chr ??` or `-x`: Sex chromosome configuration. **Without this, loci in sex chromosomes will not be genotyped.**
 63 |   Can be any configuration of Xs and Ys; only count matters. **Default:** *none*
 64 | * `--json [path]` or `-j`: Path to output JSON call data to. JSON call data is more detailed than the `stdout` TSV 
 65 |   output. If the value passed is `stdout`, the JSON data will be written to `stdout` instead of a file. 
 66 |   **Default:** *none*
 67 | * `--indent-json` or `-i`: If passed alongside `--json [x]`, the JSON output will be indented to be more human-readable
 68 |   but less compact. **Default:** off
 69 | * `--vcf [path]`: Path to output VCF-formatted call data to. Setting this option forces the `--consensus` option as 
 70 |   well in order to output true REF/ALT values, which slows down runtime somewhat. If the value passed is `stdout`, the 
 71 |   VCF data will be written to `stdout` instead of a file. If a `.vcf.gz` path is specified, a bgzipped file will be 
 72 |   written automatically. **Default:** *none*
 73 | * `--no-tsv`: Suppresses TSV output to `stdout`. Without `--json` or `--vcf`, no output will be generated, which isn't 
 74 |   very helpful. **Default:** TSV output on
 75 | * `--seed`: Seed the random number generator used for all random sampling, Gaussian mixture modeling, etc. 
 76 |   Useful for replicability.
 77 | * `--log-level [level]`: Log level. Value must be of `error`, `warning`, `info`, and `debug`. Be careful with the 
 78 |   `debug` log level, as it can produce gigabytes of logs for a large run. **Default:** `info`.
 79 | 
 80 | 
 81 | ## Usage on HPC machines
 82 | 
 83 | We have tested STRkit on three different clusters associated with the 
 84 | Digital Research Alliance of Canada (formerly Compute Canada). 
 85 | 
 86 | Usage is pretty straightforward; for our use cases we set up a Python virtual environment
 87 | with the `strkit` package installed, and ran a SLURM batch job which looks something like:
 88 | 
 89 | ```bash
 90 | #!/bin/bash
 91 | #SBATCH --mem=16G
 92 | #SBATCH --ntasks=1
 93 | #SBATCH --cpus-per-task=10
 94 | #SBATCH --time=1-00
 95 | #SBATCH --account=rrg-xxxxx
 96 | 
 97 | 
 98 | module load StdEnv/2023
 99 | module load python/3.11 scipy-stack/2025a parasail/2.6.2
100 | 
101 | cd /home/xxxxx || exit
102 | source env/bin/activate
103 | 
104 | strkit call \
105 |   --loci /path/to/catalog \
106 |   --ref /path/to/ref.fa.gz \
107 |   --processes 10 \
108 |   --seed 342 \
109 |   --vcf sample.vcf \
110 |   --no-tsv \
111 |   path/to/sample.bam
112 | 
113 | deactivate
114 | 
115 | ```
116 | 


--------------------------------------------------------------------------------
/docs/images/browser_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/browser_hist.png


--------------------------------------------------------------------------------
/docs/images/browser_igv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/browser_igv.png


--------------------------------------------------------------------------------
/docs/images/call_method_flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/call_method_flow.png


--------------------------------------------------------------------------------
/docs/images/strkit_logo_open_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/strkit_logo_open_graph.png


--------------------------------------------------------------------------------
/docs/images/strkit_logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/docs/images/strkit_logo_small.png


--------------------------------------------------------------------------------
/docs/output_formats.md:
--------------------------------------------------------------------------------
  1 | # STRkit output formats
  2 | 
  3 | STRkit can output three different file formats, depending on the set of arguments used:
  4 | 
  5 | * [TSV](#tsv-standard-output): by default, printed to `stdout` when STRkit is run. Good as an overview, but less 
  6 |   informative/interoperable than other formats.
  7 | * [JSON](#json-report): a JSON report, containing the maximum amount of information possible. These files can be quite 
  8 |   large, especially if formatted to be human-readable and indented with the `--indent-json` flag.
  9 | * [VCF](#vcf): a [VCF 4.2](https://samtools.github.io/hts-specs/VCFv4.2.pdf) file, with STR and SNV genotypes, including
 10 |   consensus STR sequences.
 11 | 
 12 | **Note:** In general, the JSON format contains the most information about how STRkit was run, and each locus' called 
 13 | genotype.
 14 | 
 15 | 
 16 | ## TSV (standard output)
 17 | 
 18 | A tab-separated text file with the following columns:
 19 | 
 20 | * Chromosome
 21 | * Starting position (matching input BED file; real coordinates of region may be different if 
 22 |   `--respect-ref` is not used)
 23 | * Ending position (matching input BED file; real coordinates of region may be different if 
 24 |   `--respect-ref` is not used)
 25 | * Motif sequence (matching input BED file)
 26 | * Reference copy number
 27 | * Comma-delimited list of copy numbers for all reads successfully extracted for this locus.
 28 | * Copy number call, `|`-delimited (one call per allele)
 29 | * 95% confidence intervals for copy number calls, `|`-delimited (one `X-Y` 95% CI per allele)
 30 | * Calling approach used by STRkit: one of:
 31 |   * `dist` - clustering based on a copy number distance metric
 32 |   * `snv+dist` - clustering based on a copy number + nearby SNV genotype difference distance metric
 33 |   * `snv` - clustering solely based on nearby SNV genotypes
 34 | 
 35 | Here is an example line:
 36 | 
 37 | ```
 38 | chr4	5975495	5975530	TTTTG	7	6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8	6|7	6-6|7-7	snv
 39 | ```
 40 | 
 41 | Note that quite a bit of information is missing from the TSV, including per-sample copy numbers, read identities, 
 42 | SNV calls, and STR consensus sequences.
 43 | 
 44 | 
 45 | ## JSON report
 46 | 
 47 | Example report format:
 48 | 
 49 | ```javascript
 50 | {
 51 |   "sample_id": "HG002",
 52 |   "caller": {
 53 |     "name": "strkit",
 54 |     "version": "0.15.0"
 55 |   },
 56 |   "parameters": {
 57 |     "read_files": "HG002.SequelII.ccs.phased.40x.chr4.bam",
 58 |     "reference_file": "/Users/davidlougheed/git/gt-poc/hg38.analysisSet.fa.gz",
 59 |     "min_reads": 4,
 60 |     "min_allele_reads": 2,
 61 |     "min_avg_phred": 13,
 62 |     "num_bootstrap": 100,
 63 |     "flank_size": 70,
 64 |     "sample_id": "HG002",
 65 |     "realign": true,
 66 |     "hq": true,
 67 |     "snv_vcf": "00-common_all.vcf.gz",
 68 |     "snv_min_base_qual": 20,
 69 |     "targeted": false,
 70 |     "respect_ref": false,
 71 |     "count_kmers": "none",
 72 |     "consensus": true,
 73 |     "log_level": 10,
 74 |     "seed": 1234,
 75 |     "processes": 1
 76 |   },
 77 |   "runtime": 8.628772,
 78 |   "contigs": [
 79 |     "chr4"
 80 |   ],
 81 |   "results": [
 82 |     {
 83 |       "locus_index": 1,
 84 |       "contig": "chr4",
 85 |       "start": 96617,
 86 |       "end": 96648,
 87 |       "start_adj": 96617,
 88 |       "end_adj": 96648,
 89 |       "motif": "AC",
 90 |       "ref_cn": 16,
 91 |       "ref_start_anchor": "t",
 92 |       "ref_seq": "acacacacacacacacacacacacacacaca",
 93 |       "reads": {
 94 |         "m64011_190901_095311/50792740/ccs": {
 95 |           "s": "-",
 96 |           "sc": 2.0,
 97 |           "cn": 15,
 98 |           "w": 1.0217145751733625,
 99 |           "snvu": ["G"],
100 |           "p": 0
101 |         },
102 |         // ...
103 |         "m64012_190921_234837/4523939/ccs": {
104 |           "s": "+",
105 |           "sc": 2.0,
106 |           "cn": 15,
107 |           "w": 1.0217145751733625,
108 |           "snvu": ["A"],
109 |           "p": 1
110 |         },
111 |         // ...
112 |       },
113 |       "snvs": [
114 |         {
115 |           "id": "rs73213545",
116 |           "ref": "G",
117 |           "pos": 94593,
118 |           "call": ["G", "A"],
119 |           "rcs": [20, 23]
120 |         }
121 |       ],
122 |       "assign_method": "snv+dist",
123 |       "call": [15, 15],
124 |       "call_95_cis": [
125 |         [15, 15],
126 |         [15, 15]
127 |       ],
128 |       "call_99_cis": [
129 |         [15, 15],
130 |         [15, 15]
131 |       ],
132 |       "mean_model_align_score": 2.0,
133 |       "peaks": {
134 |         "means": [15, 15],
135 |         "weights": [0.5, 0.5],
136 |         "stdevs": [0.31622776601683794, 0.3585309239667531],
137 |         "modal_n": 2,
138 |         "n_reads": [20, 23],
139 |         "seqs": [
140 |           ["ACACACACACACACACACACACACACACA", "poa"],
141 |           ["ACACACACACACACACACACACACACACA", "poa"]
142 |         ]
143 |       },
144 |       "read_peaks_called": true,
145 |       "time": 0.1274
146 |     },
147 |     // ...
148 |   ]
149 | }
150 | ```
151 | 
152 | 
153 | ## VCF
154 | 
155 | VCF format fields (i.e., for each variant sample entry):
156 | 
157 | * `AD`: Read depth for each allele
158 | * `CONS`: Consensus methods used for each alt (`single`/`poa`/`best_rep`)
159 | * `DP`: Total read depth
160 | * `DPS`: Total read depth; only supporting reads (for calls with incorporated SNVs mainly; STR calls only)
161 | * `GT`: Genotype
162 | * `MC`: Motif copy number for each allele (STR calls only)
163 | * `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only)
164 | * `MCRL`: Read-level copy number histogram for each allele. Allele entries are comma-delimited, and copy numbers within 
165 |   an allele's read-set are pipe (`|`)-delimited and formatted as `[copy number]x[reads]`. For example, for two alleles 
166 |   with 8 and 9 copy-number respectively, we may get `7x1|8x10|9x1,8x2|9x12` — the first allele has one 7-copy read, ten 
167 |   8-copy reads, and one 9-copy read. The second allele has two 8-copy reads and twelve 9-copy reads.
168 | * `MMAS`: Mean model (candidate TR sequence) alignment score across reads, for this sample. This score, relative to the
169 |   other locis' scores, represents how well a pure tandem repeat stretch with the catalogued motif and the determined 
170 |   copy number (e.g., `CAGCAGCAG`) aligns to the true sequence.
171 | * `PS`: Phase set
172 | * `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only)
173 | 
174 | VCF info. fields (i.e., for each STR variant record; not present for SNV records):
175 | 
176 | * `VT`: Variant record type (`str` or `snv`)
177 | * `MOTIF`: Motif sequence
178 | * `REFMC`: Motif copy number in the reference genome
179 | 


--------------------------------------------------------------------------------
/docs/trio_analyses.md:
--------------------------------------------------------------------------------
 1 | # Trio analyses with STRkit
 2 | 
 3 | Trio datasets include genomic sequence data for a child, their mother, and their father (the "trio"). These data
 4 | can be used to discover de novo mutation (and incidental genotyping errors).
 5 | 
 6 | STRkit includes a Mendelian inheritance (MI) analysis tool, under the sub-command `strkit mi`.
 7 | After genotyping the trio with `strkit call`, this command can be used to discover loci which:
 8 | 
 9 | 1. Do not respect exact MI
10 | 2. Do not respect MI allowing for a ±1 repeat unit difference 
11 |    (Note: most true mutation occurs in 1-repeat-unit changes too! 
12 |    See [Ellegren, 2004](https://www.nature.com/articles/nrg1348).)
13 | 3. Do not respect MI under the 95% locus confidence intervals
14 | 4. Look like de novo mutation at a read count distribution level, via a Mann-Whitney *U* test (with tie correction).
15 |    The alternative hypothesis can be specified as either two-sided or looking for expansion in the offspring. 
16 |    *The requirements for this test are invalidated in cases of mosaicism.*
17 | 5. Look like de novo mutation at a read count distribution level, via a chi-squared independence test,
18 |    where the contingency table looks like the following:
19 | 
20 | | Read distribution \ Copy number | 11   | 12   | 13   |
21 | |---------------------------------|------|------|------|
22 | | Parent reads (best peak fit)    | 20   | 10   | 0    |
23 | | Child reads                     | 2    | 20   | 10   |
24 | 
25 | 
26 | ## Trio-level
27 | 
28 | At a trio level, the chi-squared test gives (optionally multiple testing-corrected) loci with a significant
29 | chance of containing a de novo mutation.
30 | 
31 | ## Cohort-level
32 | 
33 | At a cohort level, multiple downstream analyses are possible from a collection of trio mutation analyses,
34 | such as:
35 | 
36 |   1. Case-control analysis looking for frequency of de novo mutations in specific loci
37 |   2. Case-control analysis looking at the incidence rate of de novo mutation
38 | 
39 | Currently, tools to automatically perform these analyses are not available in STRkit.
40 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=57.4.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | blinker==1.7.0
 2 | click==8.1.7
 3 | coverage==7.8.0
 4 | Cython==3.0.12
 5 | exceptiongroup==1.2.0
 6 | Flask==3.0.3
 7 | importlib_metadata==7.1.0
 8 | iniconfig==2.0.0
 9 | itsdangerous==2.2.0
10 | Jinja2==3.1.4
11 | joblib==1.3.2
12 | MarkupSafe==2.1.5
13 | numpy==1.26.4
14 | orjson==3.10.16
15 | packaging==24.0
16 | pandas==2.2.3
17 | parasail==1.3.4
18 | patsy==0.5.6
19 | pluggy==1.4.0
20 | psutil==6.1.0
21 | pyparsing==3.1.2
22 | pysam==0.23.0
23 | pytest==7.4.4
24 | pytest-cov==4.1.0
25 | python-dateutil==2.8.2
26 | pytz==2025.2
27 | scikit-learn==1.4.2
28 | scipy==1.15.1
29 | six==1.16.0
30 | statsmodels==0.14.4
31 | strkit_rust_ext==0.20.2
32 | threadpoolctl==3.4.0
33 | tomli==2.0.1
34 | tzdata==2024.2
35 | Werkzeug==3.0.4
36 | zipp==3.20.2
37 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import setuptools
 3 | from setuptools import setup
 4 | 
 5 | with open("README.md", "r") as fh:
 6 |     long_description = fh.read()
 7 | 
 8 | with open("./strkit/VERSION", "r") as vf:
 9 |     version = vf.read().strip()
10 | 
11 | setup(
12 |     name="strkit",
13 |     version=version,
14 | 
15 |     python_requires="~=3.10",
16 |     install_requires=[
17 |         "Flask>=2.2.5,<3.1",
18 |         "orjson>=3.9.15,<4",
19 |         "pysam>=0.19,<0.24",
20 |         "numpy>=1.23.4,<1.27",
21 |         "parasail>=1.2.4,<1.4",
22 |         "scikit-learn>=1.2.1,<1.6",
23 |         "scipy>=1.10,<1.16",
24 |         "statsmodels>=0.14.0,<0.15",
25 |         "strkit_rust_ext==0.20.2",
26 |     ],
27 | 
28 |     description="A toolkit for analyzing variation in short(ish) tandem repeats.",
29 |     long_description=long_description,
30 |     long_description_content_type="text/markdown",
31 | 
32 |     url="https://github.com/davidlougheed/strkit",
33 |     license="GPLv3",
34 |     classifiers=[
35 |         "Programming Language :: Python :: 3.10",
36 |         "Programming Language :: Python :: 3.11",
37 |         "Programming Language :: Python :: 3.12",
38 |         "Programming Language :: Python :: 3 :: Only",
39 |         "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
40 |         "Operating System :: POSIX",
41 |     ],
42 | 
43 |     author="David Lougheed",
44 |     author_email="david.lougheed@gmail.com",
45 | 
46 |     packages=setuptools.find_namespace_packages(),
47 |     include_package_data=True,
48 | 
49 |     entry_points={
50 |         "console_scripts": ["strkit=strkit.entry:main"],
51 |     },
52 | )
53 | 


--------------------------------------------------------------------------------
/strkit/VERSION:
--------------------------------------------------------------------------------
1 | 0.23.0-dev
2 | 


--------------------------------------------------------------------------------
/strkit/__init__.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | __all__ = [
4 |     "__version__",
5 | ]
6 | 
7 | with open(Path(__file__).parent / "VERSION", "r") as vf:
8 |     __version__ = vf.read().strip()
9 | 


--------------------------------------------------------------------------------
/strkit/call/__init__.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from .allele import call_alleles
 4 | from .call_sample import call_sample
 5 | from .params import CallParams
 6 | 
 7 | __all__ = [
 8 |     "call_alleles",
 9 |     "call_sample",
10 |     "CallParams",
11 | ]
12 | 


--------------------------------------------------------------------------------
/strkit/call/align_matrix.py:
--------------------------------------------------------------------------------
 1 | import parasail
 2 | from ..iupac import IUPAC_NUCLEOTIDE_CODES
 3 | 
 4 | __all__ = [
 5 |     "dna_codes",
 6 |     "match_score",
 7 |     "mismatch_penalty",
 8 |     "indel_penalty",
 9 |     "dna_bases",
10 |     "dna_matrix",
11 | ]
12 | 
13 | 
14 | match_score: int = 2  # TODO: parametrize
15 | mismatch_penalty: int = 7  # TODO: parametrize
16 | indel_penalty: int = 5  # TODO: parametrize
17 | 
18 | 
19 | # TODO: Customize matrix based on error chances
20 | # Create a substitution matrix for alignment.
21 | # Include IUPAC wildcard bases to allow for motifs with multiple possible motifs.
22 | # Include a wildcard base 'X' for very low-confidence base calls, to prevent needlessly harsh penalties - this is
23 | # inserted into a read in place of bases with low PHRED scores.
24 | dna_bases_str: str = "ACGT" + "".join(IUPAC_NUCLEOTIDE_CODES.keys()) + "X"
25 | dna_bases: dict[str, int] = {b: i for i, b in enumerate(dna_bases_str)}
26 | dna_codes: dict[str, tuple[str, ...]] = {
27 |     **IUPAC_NUCLEOTIDE_CODES,
28 |     "X": ("A", "C", "G", "T"),  # Special character for matching low-quality bases
29 | }
30 | dna_matrix = parasail.matrix_create(dna_bases_str, match_score, -1 * mismatch_penalty)
31 | 
32 | for code, code_matches in dna_codes.items():
33 |     for cm in code_matches:
34 |         dna_matrix[dna_bases[code], dna_bases[cm]] = 2 if code != "X" else 0
35 |         dna_matrix[dna_bases[cm], dna_bases[code]] = 2 if code != "X" else 0
36 | 


--------------------------------------------------------------------------------
/strkit/call/allele.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | # Disable OpenMP/other multithreading since it adds enormous overhead when multiprocessing
  4 | import os
  5 | os.environ["OMP_NUM_THREADS"] = "1"
  6 | os.environ["OPENBLAS_NUM_THREADS"] = "1"
  7 | os.environ["MKL_NUM_THREADS"] = "1"
  8 | os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
  9 | os.environ["NUMEXPR_NUM_THREADS"] = "1"
 10 | 
 11 | # ----------------------------------------------------------------------------------------------------------------------
 12 | 
 13 | import logging  # For type hinting
 14 | import numpy as np
 15 | import statistics
 16 | 
 17 | from sklearn.exceptions import ConvergenceWarning
 18 | from sklearn.mixture import GaussianMixture
 19 | from sklearn.preprocessing import normalize
 20 | from warnings import simplefilter
 21 | 
 22 | from numpy.typing import NDArray
 23 | from typing import Iterable, Literal, TypedDict, Union
 24 | 
 25 | import strkit.constants as cc
 26 | 
 27 | from .params import CallParams
 28 | from .utils import get_new_seed
 29 | 
 30 | __all__ = [
 31 |     "RepeatCounts",
 32 |     "CallDict",
 33 |     "get_n_alleles",
 34 |     "call_alleles",
 35 | ]
 36 | 
 37 | RepeatCounts = list[int] | tuple[int, ...] | NDArray[np.int_]
 38 | 
 39 | 
 40 | # K-means convergence errors - we expect convergence to some extent with homozygous alleles
 41 | simplefilter("ignore", category=ConvergenceWarning)
 42 | 
 43 | # TODO: parameterize
 44 | small_allele_min = 8
 45 | expansion_ratio = 5
 46 | N_GM_INIT = 3
 47 | 
 48 | WEIGHT_1_0 = np.array([[1.0]])
 49 | FLOAT_32_EPSILON = np.finfo(np.float32).eps
 50 | 
 51 | CI_PERCENTILE_RANGES = {
 52 |     "95": (2.5, 97.5),
 53 |     "99": (0.5, 99.5),
 54 | }
 55 | 
 56 | 
 57 | def _array_as_int(n: NDArray[np.int_] | NDArray[np.float_]) -> NDArray[np.int32]:
 58 |     return np.rint(n).astype(np.int32)
 59 | 
 60 | 
 61 | def _calculate_cis(samples, ci: str = Literal["95", "99"]) -> NDArray[np.int32]:
 62 |     percentiles = np.percentile(
 63 |         samples, CI_PERCENTILE_RANGES[ci], axis=1, method="interpolated_inverted_cdf"
 64 |     ).transpose()
 65 |     return _array_as_int(percentiles)
 66 | 
 67 | 
 68 | def get_n_alleles(default_n_alleles: int, sample_sex_chroms: str | None, contig: str) -> int | None:
 69 |     if contig in cc.M_CHROMOSOME_NAMES:
 70 |         return 1
 71 | 
 72 |     if contig in cc.SEX_CHROMOSOMES:
 73 |         if sample_sex_chroms is None:
 74 |             return None
 75 |         if contig in cc.X_CHROMOSOME_NAMES:
 76 |             return sample_sex_chroms.count("X")
 77 |         if contig in cc.Y_CHROMOSOME_NAMES:
 78 |             return sample_sex_chroms.count("Y")
 79 | 
 80 |     return default_n_alleles
 81 | 
 82 | 
 83 | def na_length_list(n_alleles: int):
 84 |     return [list() for _ in range(n_alleles)]
 85 | 
 86 | 
 87 | GMMInitParamsMethod = Literal["kmeans", "k-means++"]
 88 | 
 89 | 
 90 | def make_fitted_gmm(n_components: int, sample_rs: NDArray, init_params: GMMInitParamsMethod, rng: np.random.Generator):
 91 |     return GaussianMixture(
 92 |         n_components=n_components,
 93 |         init_params=init_params,
 94 |         covariance_type="spherical",
 95 |         n_init=N_GM_INIT,
 96 |         random_state=get_new_seed(rng),
 97 |     ).fit(sample_rs)
 98 | 
 99 | 
100 | def fit_gmm(
101 |     rng: np.random.Generator,
102 |     sample: NDArray,
103 |     n_alleles: int,
104 |     allele_filter: float,
105 |     hq: bool,
106 |     gm_filter_factor: int,
107 |     init_params: GMMInitParamsMethod = "k-means++",  # TODO: parameterize outside
108 | ) -> object | None:
109 |     sample_rs = sample.reshape(-1, 1)
110 |     g: object | None = None
111 | 
112 |     n_components: int = n_alleles
113 |     while n_components > 0:
114 |         if n_components == 1:  # Don't need to do the full fit for a single peak, just calculate the parameters
115 |             # I've confirmed this gives an ~identical result to fitting a GMM with one parameter.
116 |             fake_g: object = type("", (), {})()
117 |             fake_g.means_ = np.array([[np.mean(sample_rs)]])
118 |             fake_g.weights_ = WEIGHT_1_0
119 |             fake_g.covariances_ = np.array([[np.var(sample_rs)]])
120 |             return fake_g
121 | 
122 |         g = make_fitted_gmm(n_components, sample_rs, init_params, rng)
123 | 
124 |         # noinspection PyUnresolvedReferences
125 |         means_and_weights = np.append(g.means_.transpose(), g.weights_.reshape(1, -1), axis=0)
126 | 
127 |         # Filter out peaks that aren't supported by ~min_allele_reads reads by probability, with some delta to
128 |         # allow for peaks supported by "most of a read".
129 |         mw_filter_1 = means_and_weights[1, :] > allele_filter
130 | 
131 |         # Filter out any peaks below some threshold using this magic constant filter factor
132 |         # - Exception: Large expansions can have very few supporting reads due to quirks of sequencing beyond
133 |         #   just chance/read length distribution; if we have 2 alleles and the large one is a lot bigger than
134 |         #   the small one, don't apply this filter
135 |         # - Discard anything below a specific weight threshold and resample means based on remaining weights
136 |         #   to fill in the gap. E.g. below 1 / (5 * num alleles) - i.e. 5 times less than we expect with equal
137 |         #   sharing in the worst case where it represents just one allele
138 |         if n_components > 2 or (n_components == 2 and (not hq or (
139 |                 means_and_weights[0, -1] < expansion_ratio * max(means_and_weights[0, 0], small_allele_min)))):
140 |             mw_filter_2 = means_and_weights[1, :] > (1 / (gm_filter_factor * n_components))
141 |         else:
142 |             mw_filter_2 = means_and_weights[1, :] > FLOAT_32_EPSILON
143 | 
144 |         mw_filter = mw_filter_1 & mw_filter_2
145 |         n_useless = np.size(mw_filter) - np.count_nonzero(mw_filter)
146 |         if not n_useless:
147 |             # No useless components left to remove, so return the GMM
148 |             return g
149 |         n_components -= n_useless
150 | 
151 |     return g
152 | 
153 | 
154 | class BaseCallDict(TypedDict):
155 |     call: Union[NDArray[np.int32], NDArray[np.float_]]
156 |     call_95_cis: Union[NDArray[np.int32], NDArray[np.float_]]  # 2D arrays
157 |     call_99_cis: Union[NDArray[np.int32], NDArray[np.float_]]  # 2D arrays
158 |     peaks: NDArray[np.float_]
159 |     peak_weights: NDArray[np.float_]
160 |     peak_stdevs: NDArray[np.float_]
161 |     modal_n_peaks: int
162 | 
163 | 
164 | class CallDict(BaseCallDict, total=False):
165 |     ps: int
166 | 
167 | 
168 | def make_read_weights(read_weights: Iterable[float] | None, num_reads: int) -> NDArray[np.float_]:
169 |     return np.array(
170 |         read_weights if read_weights is not None else np.array(([1/num_reads] * num_reads) if num_reads else []))
171 | 
172 | 
173 | def call_alleles(
174 |     repeats_fwd: NDArray[np.int32],
175 |     repeats_rev: NDArray[np.int32],
176 |     read_weights_fwd: Iterable[float] | None,
177 |     read_weights_rev: Iterable[float] | None,
178 |     params: CallParams,
179 |     min_reads: int,
180 |     n_alleles: int,
181 |     separate_strands: bool,
182 |     read_bias_corr_min: int,
183 |     gm_filter_factor: int,
184 |     seed: int | None,
185 |     logger_: logging.Logger,
186 |     debug_str: str,
187 | ) -> CallDict | None:
188 |     fwd_len = repeats_fwd.shape[0]
189 |     rev_len = repeats_rev.shape[0]
190 | 
191 |     fwd_strand_weights = make_read_weights(read_weights_fwd, fwd_len)
192 |     rev_strand_weights = make_read_weights(read_weights_rev, rev_len)
193 | 
194 |     assert repeats_fwd.shape == fwd_strand_weights.shape
195 |     assert repeats_rev.shape == rev_strand_weights.shape
196 | 
197 |     combined_reads = np.concatenate((repeats_fwd, repeats_rev), axis=None)
198 |     combined_weights = np.concatenate((fwd_strand_weights, rev_strand_weights), axis=None)
199 |     combined_len = combined_reads.shape[-1]
200 | 
201 |     if combined_len < min_reads:
202 |         return None
203 | 
204 |     # If the locus/allele only has one value, don't bother bootstrapping
205 |     if np.unique(combined_reads).shape[0] == 1:
206 |         logger_.debug(f"{debug_str} - skipping bootstrap / GMM fitting for allele(s) (single value)")
207 |         cn = combined_reads[0]
208 | 
209 |         call = _array_as_int(np.full(n_alleles, cn))
210 |         call_cis = _array_as_int(np.full((n_alleles, 2), cn))
211 | 
212 |         peaks: NDArray[np.float_] = call.astype(np.float_)
213 | 
214 |         return {
215 |             "call": call,
216 |             "call_95_cis": call_cis,
217 |             "call_99_cis": call_cis,
218 |             "peaks": peaks,
219 |             "peak_weights": np.full(n_alleles, 1.0 / n_alleles),
220 |             "peak_stdevs": np.full(n_alleles, 0.0),
221 |             "modal_n_peaks": 1,  # 1 peak, since we have 1 value
222 |         }
223 | 
224 |     nal = na_length_list(n_alleles)
225 |     allele_samples = np.array(nal, dtype=np.float32)
226 |     allele_weight_samples = np.array(nal, dtype=np.float32)
227 |     allele_stdev_samples = np.array(nal, dtype=np.float32)
228 |     sample_peaks = np.array([], dtype=np.int32)
229 | 
230 |     rng: np.random.Generator = np.random.default_rng(seed=seed)
231 | 
232 |     # Perform a number of bootstrap iterations to get a 95% CI and more accurate estimate of repeat counts / differences
233 | 
234 |     if separate_strands and fwd_len >= read_bias_corr_min and rev_len >= read_bias_corr_min:
235 |         target_length: int = max(fwd_len, rev_len)
236 | 
237 |         # Resample original sample, correcting for imbalances between
238 |         # forward and reverse-strand reads along the way
239 |         # (if we've passed the coverage threshold)
240 | 
241 |         fwd_strand_samples = rng.choice(
242 |             repeats_fwd,
243 |             size=(params.num_bootstrap, target_length),
244 |             replace=True,
245 |             p=fwd_strand_weights,
246 |         )
247 | 
248 |         rev_strand_samples = rng.choice(
249 |             repeats_rev,
250 |             size=(params.num_bootstrap, target_length),
251 |             replace=True,
252 |             p=rev_strand_weights,
253 |         )
254 | 
255 |         concat_samples = np.sort(
256 |             np.concatenate((fwd_strand_samples, rev_strand_samples), axis=1),
257 |             kind="stable")
258 | 
259 |     else:
260 |         concat_samples = np.sort(
261 |             rng.choice(
262 |                 combined_reads,
263 |                 size=(params.num_bootstrap, combined_len),
264 |                 replace=True,
265 |                 p=combined_weights,
266 |             ) if params.num_bootstrap > 1 else np.array([combined_reads]),
267 |             kind="stable")
268 | 
269 |     gmm_cache = {}
270 | 
271 |     def _get_fitted_gmm(s: NDArray[np.int_] | NDArray[np.float_]) -> object | None:
272 |         if (s_t := s.tobytes()) not in gmm_cache:
273 |             # Fit Gaussian mixture model to the resampled data
274 |             gmm_cache[s_t] = fit_gmm(rng, s, n_alleles, allele_filter, params.hq, gm_filter_factor)
275 | 
276 |         return gmm_cache[s_t]
277 | 
278 |     # Filter out peaks that aren't supported by ~min_allele_reads reads by probability, with some delta to
279 |     # allow for peaks supported by "most of a read".
280 |     allele_filter = (params.min_allele_reads - 0.1) / concat_samples.shape[0]
281 | 
282 |     for i in range(params.num_bootstrap):
283 |         sample = concat_samples[i, :]
284 | 
285 |         g: object | None = _get_fitted_gmm(sample)
286 |         if not g:
287 |             # Could not fit any Gaussian mixture; skip this allele
288 |             return None
289 | 
290 |         # Keep track of how many alleles were found for
291 |         # noinspection PyUnresolvedReferences
292 |         sample_peaks = np.append(sample_peaks, g.means_.shape[0])
293 | 
294 |         # noinspection PyUnresolvedReferences
295 |         means_and_weights = np.append(g.means_.transpose(), g.weights_.reshape(1, -1), axis=0)
296 | 
297 |         means = means_and_weights[0, :]
298 |         weights = means_and_weights[1, :]
299 |         # noinspection PyUnresolvedReferences
300 |         stdevs = np.sqrt(g.covariances_)
301 |         n_to_resample = n_alleles - means.shape[0]
302 | 
303 |         if n_to_resample:
304 |             # Re-sample means if any are removed, based on weights (re-normalized), to match total # of alleles
305 |             resampled_indices = rng.choice(
306 |                 np.arange(len(means)),
307 |                 size=n_to_resample,
308 |                 p=normalize(weights.reshape(1, -1), norm="l1").flatten())
309 |             resampled_means = np.append(means, means[resampled_indices])
310 |             resampled_weights = np.append(weights, weights[resampled_indices])
311 |             resampled_stdevs = np.append(stdevs, stdevs[resampled_indices])
312 |         else:
313 |             resampled_means = means
314 |             resampled_weights = weights
315 |             resampled_stdevs = stdevs
316 | 
317 |         argsorted_means = np.argsort(resampled_means, axis=0, kind="stable")
318 |         sorted_allele_estimates = resampled_means[argsorted_means].reshape(-1, 1)
319 |         sorted_allele_weight_estimates = resampled_weights[argsorted_means].reshape(-1, 1)
320 |         sorted_allele_stdev_estimates = resampled_stdevs[argsorted_means].reshape(-1, 1)
321 | 
322 |         allele_samples = np.append(allele_samples, sorted_allele_estimates, axis=1)
323 |         allele_weight_samples = np.append(allele_weight_samples, sorted_allele_weight_estimates, axis=1)
324 |         allele_stdev_samples = np.append(allele_stdev_samples, sorted_allele_stdev_estimates, axis=1)
325 | 
326 |     # Calculate 95% and 99% confidence intervals for each allele from the bootstrap distributions.
327 |     allele_samples_argsort = allele_samples.argsort(axis=1, kind="stable")
328 |     allele_samples = np.take_along_axis(allele_samples, allele_samples_argsort, axis=1)
329 |     allele_cis_95 = _calculate_cis(allele_samples, ci="95")
330 |     allele_cis_99 = _calculate_cis(allele_samples, ci="99")
331 |     allele_weight_samples = np.take_along_axis(allele_weight_samples, allele_samples_argsort, axis=1)
332 |     allele_stdev_samples = np.take_along_axis(allele_stdev_samples, allele_samples_argsort, axis=1)
333 | 
334 |     sample_peaks.sort(kind="stable")  # To make mode consistent, given same set of peak #s
335 | 
336 |     # TODO: Calculate CIs based on Gaussians from allele samples instead? Ask someone...
337 |     #  - Could take median of 2.5 percentiles and 97.5 percentiles from Gaussians instead, median of means
338 | 
339 |     # Report the median estimates and the confidence intervals.
340 |     #  - we choose nearest for median rather than interpolating, so we can get real corresponding weights and stdevs.
341 | 
342 |     median_idx = allele_samples.shape[1] // 2  #
343 |     medians_of_means = allele_samples[:, median_idx]
344 |     medians_of_means_final = np.rint(medians_of_means).astype(np.int32)
345 |     peak_weights = allele_weight_samples[:, median_idx].flatten()
346 |     peak_stdevs = allele_stdev_samples[:, median_idx]
347 |     modal_n_peaks: int = statistics.mode(sample_peaks).item()
348 | 
349 |     peak_weights /= peak_weights.sum()  # re-normalize weights
350 | 
351 |     return {
352 |         "call": medians_of_means_final.flatten(),
353 |         "call_95_cis": allele_cis_95,
354 |         "call_99_cis": allele_cis_99,
355 | 
356 |         "peaks": medians_of_means.flatten(),  # Don't round, so we can recover original Gaussian model
357 |         "peak_weights": peak_weights,
358 |         "peak_stdevs": peak_stdevs.flatten(),
359 |         # TODO: should be ok to use this, because resample gets put at end, vertically (3rd allele in a 3-ploid case)
360 |         #  so taking the first 2 alleles still works in terms of stdev/mean estimates? I think?
361 |         #  Not quite, cause it's sorted...
362 |         #  --> Only do the peak assignment with 1/2 peaks, which is the majority of human situations
363 |         "modal_n_peaks": modal_n_peaks,
364 |     }
365 | 


--------------------------------------------------------------------------------
/strkit/call/cigar.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from numpy.typing import NDArray
 3 | 
 4 | from strkit_rust_ext import get_aligned_pair_matches
 5 | 
 6 | __all__ = [
 7 |     "decode_cigar_np",
 8 |     "get_aligned_pair_matches",
 9 | ]
10 | 
11 | 
12 | def decode_cigar_np(encoded_cigar: NDArray[np.uint32]) -> NDArray[np.uint32]:
13 |     return np.stack((np.bitwise_and(encoded_cigar, 15), np.right_shift(encoded_cigar, 4)), axis=1)
14 | 


--------------------------------------------------------------------------------
/strkit/call/non_daemonic_pool.py:
--------------------------------------------------------------------------------
 1 | import multiprocessing as mp
 2 | 
 3 | __all__ = [
 4 |     "NonDaemonicPool",
 5 | ]
 6 | 
 7 | 
 8 | # Need a pool which itself can spawn realignment processes - see https://stackoverflow.com/a/53180921
 9 | 
10 | 
11 | class NonDaemonicProcess(mp.Process):
12 |     @property
13 |     def daemon(self):
14 |         return False
15 | 
16 |     @daemon.setter
17 |     def daemon(self, value):
18 |         pass
19 | 
20 | 
21 | class NonDaemonicContext(type(mp.get_context())):
22 |     Process = NonDaemonicProcess
23 | 
24 | 
25 | class NonDaemonicPool(mp.pool.Pool):
26 |     # noinspection PyArgumentList
27 |     def __init__(self, *args, **kwargs):
28 |         kwargs["context"] = NonDaemonicContext()
29 |         super().__init__(*args, **kwargs)
30 | 


--------------------------------------------------------------------------------
/strkit/call/output/__init__.py:
--------------------------------------------------------------------------------
 1 | from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer
 2 | from .tsv import output_tsv
 3 | from .vcf import build_vcf_header, output_contig_vcf_lines
 4 | 
 5 | __all__ = [
 6 |     "output_json_report_header",
 7 |     "output_json_report_results",
 8 |     "output_json_report_footer",
 9 |     "output_tsv",
10 |     "build_vcf_header",
11 |     "output_contig_vcf_lines",
12 | ]
13 | 


--------------------------------------------------------------------------------
/strkit/call/output/json_report.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from typing import Callable, Literal
 3 | 
 4 | from strkit import __version__
 5 | from strkit.json import Serializable, dumps, dumps_indented
 6 | 
 7 | from ..params import CallParams
 8 | from ..types import LocusResult
 9 | 
10 | __all__ = [
11 |     "output_json_report_header",
12 |     "output_json_report_results",
13 |     "output_json_report_footer",
14 | ]
15 | 
16 | 
17 | def _get_dfn(indent_json: bool) -> Callable[[Serializable], bytes]:
18 |     return dumps_indented if indent_json else dumps
19 | 
20 | 
21 | def _write_bytes(b: bytes, json_path: str, mode: Literal["wb", "ab"]):
22 |     if json_path == "stdout":
23 |         sys.stdout.buffer.write(b)
24 |         sys.stdout.flush()
25 |     else:
26 |         with open(json_path, mode) as jf:
27 |             # noinspection PyTypeChecker
28 |             jf.write(b)
29 | 
30 | 
31 | def output_json_report_header(params: CallParams, contig_set: set[str], json_path: str, indent_json: bool):
32 |     json_report_header = {
33 |         "sample_id": params.sample_id,
34 |         "caller": {
35 |             "name": "strkit",
36 |             "version": __version__,
37 |         },
38 |         "parameters": params.to_dict(as_inputted=True),
39 |         "contigs": tuple(contig_set),
40 |     }
41 | 
42 |     dfn = _get_dfn(indent_json)
43 |     header_serialized: bytes = dfn(json_report_header)[:(-2 if indent_json else -1)]  # remove trailing ending brace
44 | 
45 |     # kludge: build up a portion of the JSON file, so we can output contig results as they come instead of storing them
46 |     # in memory until the end of the run.
47 |     header_serialized += b","
48 |     if indent_json:
49 |         header_serialized += b'\n  "results": [\n'
50 |     else:
51 |         header_serialized += b'"results":['
52 | 
53 |     # write partial JSON
54 |     _write_bytes(header_serialized, json_path, "wb")
55 | 
56 | 
57 | def output_json_report_results(results: tuple[LocusResult, ...], is_last: bool, json_path: str, indent_json: bool):
58 |     dfn = _get_dfn(indent_json)
59 |     results_bytes: bytes = dfn(results)
60 | 
61 |     if indent_json:
62 |         results_bytes = results_bytes[2:-2]  # remove opening and closing "[]" + trailing newline
63 |         if not is_last:
64 |             results_bytes += b",\n"
65 |     else:
66 |         results_bytes = results_bytes[1:-1]  # remove opening and closing "[]"
67 |         if not is_last:
68 |             results_bytes += b","
69 | 
70 |     # write results "rows"
71 |     _write_bytes(results_bytes, json_path, "ab")
72 | 
73 | 
74 | def output_json_report_footer(time_taken: float, json_path: str, indent_json: bool):
75 |     runtime_bytes = dumps(time_taken)
76 |     if indent_json:
77 |         footer_bytes = b'\n  ],\n  "runtime": ' + runtime_bytes + b'\n}\n'
78 |     else:
79 |         footer_bytes = b'],"runtime":' + runtime_bytes + b'}\n'
80 | 
81 |     # write partial JSON
82 |     _write_bytes(footer_bytes, json_path, "ab")
83 | 


--------------------------------------------------------------------------------
/strkit/call/output/tsv.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | __all__ = ["output_tsv"]
 4 | 
 5 | 
 6 | def _cn_to_str(cn: int | float) -> str:
 7 |     return f"{cn:.1f}" if isinstance(cn, float) else str(cn)
 8 | 
 9 | 
10 | def output_tsv(results: tuple[dict, ...], has_snv_vcf: bool):
11 |     for res in results:
12 |         has_call = res["call"] is not None
13 |         # n_peaks = res["peaks"]["modal_n"]
14 | 
15 |         ref_cn = res.get("ref_cn")
16 |         reads = res.get("reads")
17 | 
18 |         sys.stdout.write("\t".join((
19 |             res["contig"],
20 |             str(res["start"]),
21 |             str(res["end"]),
22 |             res["motif"],
23 |             _cn_to_str(ref_cn) if ref_cn is not None else ".",
24 |             ",".join(map(_cn_to_str, sorted(r["cn"] for r in reads.values()))) if reads else ".",
25 |             "|".join(map(_cn_to_str, res["call"])) if has_call else ".",
26 |             ("|".join("-".join(map(_cn_to_str, gc)) for gc in res["call_95_cis"]) if has_call else "."),
27 |             # *((res["assign_method"] if has_call else ".",) if incorporate_snvs else ()),
28 |             *((res["assign_method"] if has_call else ".",) if has_snv_vcf else ()),
29 | 
30 |             # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["means"][:n_peaks]))
31 |             #  if has_call and n_peaks <= 2 else "."),
32 |             # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["weights"][:n_peaks]))
33 |             #  if has_call and n_peaks <= 2 else "."),
34 |             # ("|".join(map(lambda x: f"{x:.5f}", res["peaks"]["stdevs"][:n_peaks]))
35 |             #  if has_call and n_peaks <= 2 else "."),
36 |         )) + "\n")
37 | 


--------------------------------------------------------------------------------
/strkit/call/output/vcf.py:
--------------------------------------------------------------------------------
  1 | import functools
  2 | import logging
  3 | 
  4 | from collections import Counter
  5 | from os.path import commonprefix
  6 | from pathlib import Path
  7 | from pysam import FastaFile, VariantFile, VariantHeader, VariantRecord
  8 | from typing import Iterable
  9 | 
 10 | from strkit.utils import cat_strs, is_none, idx_0_getter
 11 | from ..allele import get_n_alleles
 12 | from ..params import CallParams
 13 | from ..utils import cn_getter
 14 | 
 15 | __all__ = [
 16 |     "build_vcf_header",
 17 |     "output_contig_vcf_lines",
 18 | ]
 19 | 
 20 | 
 21 | # VCF_ALLELE_CNV_TR = "<CNV:TR>"
 22 | 
 23 | # VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = (
 24 | #     ("SVLEN", "A", "Integer", "Length of the structural variant"),
 25 | #     ("CN", "A", "Float", "Copy number of allele"),
 26 | #     ("RN", "A", "Integer", "Total number of repeat sequences in this allele"),
 27 | #     ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"),
 28 | #     ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"),
 29 | #     ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"),
 30 | #     ("CIRUC", ".", "Float", "Confidence interval around RUC"),
 31 | #     ("CIRB", ".", "Integer", "Confidence interval around RB"),
 32 | # )
 33 | 
 34 | VCF_INFO_VT = "VT"
 35 | VCF_INFO_MOTIF = "MOTIF"
 36 | VCF_INFO_REFMC = "REFMC"
 37 | VCF_INFO_ANCH = "ANCH"
 38 | 
 39 | VT_STR = "str"
 40 | VT_SNV = "snv"
 41 | 
 42 | 
 43 | def iter_to_upper(x: Iterable[str]) -> Iterable[str]:
 44 |     # noinspection PyTypeChecker
 45 |     return map(str.upper, x)
 46 | 
 47 | 
 48 | def build_vcf_header(sample_id: str, reference_file: str) -> VariantHeader:
 49 |     vh = VariantHeader()  # automatically sets VCF version to 4.2
 50 | 
 51 |     # Add an absolute path to the reference genome
 52 |     vh.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}")
 53 | 
 54 |     # Add all contigs from the reference genome file + lengths
 55 |     rf = FastaFile(reference_file)
 56 |     try:
 57 |         for contig in rf.references:
 58 |             vh.contigs.add(contig, length=rf.get_reference_length(contig))
 59 |     finally:
 60 |         rf.close()
 61 | 
 62 |     # Add CNV:TR alt type (symbolic allele: tandem repeat)
 63 |     # vh.add_meta("ALT", "<ID=CNV:TR,Description=\"Tandem repeat\">")
 64 | 
 65 |     # Set up basic VCF formats
 66 |     vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
 67 |     vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence")
 68 |     vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)")
 69 |     vh.formats.add("DP", 1, "Integer", "Read depth")
 70 |     vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
 71 |     vh.formats.add("GT", 1, "String", "Genotype")
 72 |     vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
 73 |     vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
 74 |     vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele")
 75 |     vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.")
 76 |     vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call")
 77 |     vh.formats.add("PS", 1, "Integer", "Phase set")
 78 |     vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")
 79 | 
 80 |     # Set up VCF info fields
 81 |     vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)")
 82 |     vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string")
 83 |     vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome")
 84 |     vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size")
 85 | 
 86 |     # Add INFO records for tandem repeat copies - these are new to VCF4.4!  TODO
 87 |     # for iv in VCF_TR_INFO_RECORDS:
 88 |     #     vh.info.add(*iv)
 89 | 
 90 |     # Add the sample
 91 |     vh.add_sample(sample_id)
 92 | 
 93 |     return vh
 94 | 
 95 | 
 96 | def _vr_pos_key(vr: VariantRecord) -> int:
 97 |     return vr.pos
 98 | 
 99 | 
100 | def _reversed_str(s: str) -> str:
101 |     return cat_strs(reversed(s))
102 | 
103 | 
104 | @functools.cache
105 | def _blank_entry(n_alleles: int) -> tuple[None, ...]:
106 |     return tuple([None] * n_alleles)
107 | 
108 | 
109 | def output_contig_vcf_lines(
110 |     params: CallParams,
111 |     sample_id: str,
112 |     variant_file: VariantFile,
113 |     results: tuple[dict, ...],
114 |     logger: logging.Logger,
115 | ) -> None:
116 |     variant_records: list[VariantRecord] = []
117 | 
118 |     # has_at_least_one_snv_set = next((r.get("snvs") is not None for r in results), None) is not None
119 |     snvs_written: set[str] = set()
120 | 
121 |     for result_idx, result in enumerate(results, 1):
122 |         contig = result["contig"]
123 |         start = result["start"]
124 | 
125 |         if "ref_start_anchor" not in result:
126 |             logger.debug(f"No ref anchor for {contig}:{start}; skipping VCF output for locus")
127 |             continue
128 | 
129 |         ref_start_anchor = result["ref_start_anchor"].upper()
130 |         ref_seq = result["ref_seq"].upper()
131 | 
132 |         n_alleles: int = get_n_alleles(2, params.sex_chroms, contig) or 2
133 | 
134 |         res_reads = result["reads"]
135 |         res_peaks = result["peaks"] or {}
136 | 
137 |         peak_seqs_and_methods = {(seq.upper() if seq else seq): method for seq, method in res_peaks.get("seqs", [])}
138 |         peak_seqs: tuple[str, ...] = tuple(peak_seqs_and_methods.keys())
139 |         peak_start_anchor_seqs: list[str] = list(map(idx_0_getter, res_peaks.get("start_anchor_seqs", [])))
140 | 
141 |         if any(map(is_none, peak_seqs)):  # Occurs when no consensus for one of the peaks
142 |             logger.error(f"Encountered None in results[{result_idx}].peaks.seqs: {peak_seqs}")
143 |             continue
144 | 
145 |         if any(map(is_none, peak_start_anchor_seqs)):  # Occurs when no consensus for one of the peaks
146 |             logger.error(f"Encountered None in results[{result_idx}].peaks.start_anchor_seqs: {peak_start_anchor_seqs}")
147 |             continue
148 | 
149 |         peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs))
150 |         common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper])
151 |         # anchor_offset = how many bases we can cut off from the front of the anchor
152 |         # since they're shared between all alleles - yields a more compact representation.
153 |         #  - we need to leave one base as an anchor for VCF compliance though, thus the min(...)
154 |         anchor_offset = min(len(common_anchor_prefix), params.vcf_anchor_size - 1)
155 | 
156 |         ref_start_anchor = ref_start_anchor[anchor_offset:]
157 |         ref_seq_with_anchor = ref_start_anchor + ref_seq
158 | 
159 |         seqs_with_anchors: list[tuple[str, str]] = list(
160 |             zip(peak_seqs, map(lambda a: a[anchor_offset:], peak_start_anchor_seqs_upper))
161 |         )
162 | 
163 |         if 0 < len(peak_seqs) < n_alleles:
164 |             peak_seqs = tuple([peak_seqs[0]] * n_alleles)
165 |             seqs_with_anchors = [seqs_with_anchors[0]] * n_alleles
166 | 
167 |         seq_alts = sorted(
168 |             set(filter(lambda c: not (c[1] + c[0] == ref_seq_with_anchor), seqs_with_anchors)),
169 |             key=lambda c: c[1] + c[0]
170 |         )
171 | 
172 |         call = result["call"]
173 |         call_95_cis = result["call_95_cis"]
174 | 
175 |         seq_alleles_raw: tuple[str | None, ...] = (
176 |             ((ref_seq, ref_start_anchor), *(seq_alts or (None,)))
177 |             if call is not None
178 |             else ()
179 |         )
180 | 
181 |         seq_alleles: list[str] = [ref_seq_with_anchor]
182 | 
183 |         if call is not None and seq_alts:
184 |             # If we have a complete deletion, including the anchor, use a symbolic allele meaning "upstream deletion"
185 |             for alt_tr_seq, alt_anchor in seq_alts:
186 |                 if not alt_tr_seq and not alt_anchor:
187 |                     seq_alleles.append("*")
188 |                     continue
189 |                 seq_alleles.append(alt_anchor + alt_tr_seq)
190 |         else:
191 |             seq_alleles.append(".")
192 | 
193 |         start = result.get("start_adj", start) - len(ref_start_anchor)
194 | 
195 |         vr: VariantRecord = variant_file.new_record(
196 |             contig=contig,
197 |             start=start,
198 |             alleles=seq_alleles,
199 |         )
200 | 
201 |         vr.info[VCF_INFO_VT] = VT_STR
202 |         vr.info[VCF_INFO_MOTIF] = result["motif"]
203 |         vr.info[VCF_INFO_REFMC] = result["ref_cn"]
204 |         vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset
205 | 
206 |         vr.samples[sample_id]["GT"] = (
207 |             tuple(map(seq_alleles_raw.index, seqs_with_anchors))
208 |             if call is not None and peak_seqs
209 |             else _blank_entry(n_alleles)
210 |         )
211 | 
212 |         if am := result.get("assign_method"):
213 |             vr.samples[sample_id]["PM"] = am
214 | 
215 |         str_snvs = result.get("snvs", ())
216 |         if str_snvs:
217 |             # Record number of support SNVs for the locus
218 |             vr.samples[sample_id]["NSNV"] = len(str_snvs)
219 | 
220 |         vr.samples[sample_id]["DP"] = len(res_reads)
221 |         vr.samples[sample_id]["MMAS"] = result.get("mean_model_align_score")
222 | 
223 |         if call is not None and res_peaks:
224 |             vr.samples[sample_id]["DPS"] = sum(res_peaks["n_reads"])
225 |             vr.samples[sample_id]["AD"] = tuple(res_peaks["n_reads"])
226 |             vr.samples[sample_id]["MC"] = tuple(map(int, call))
227 |             vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)
228 | 
229 |             vr.samples[sample_id]["ANCL"] = tuple(len(ar[1]) for ar in seq_alleles_raw if ar is not None)
230 | 
231 |             # For each alt, mention which consensus method was used to obtain the sequence.
232 |             cons = tuple(
233 |                 peak_seqs_and_methods[ar[0]] for ar in seq_alleles_raw[1:] if ar is not None
234 |             )
235 |             vr.samples[sample_id]["CONS"] = cons if cons else (".",)
236 | 
237 |             # Produces a histogram-like format for read-level copy numbers
238 |             # e.g., for two alleles with 8 and 9 copy-number respectively, we may get: 7x1|8x10|9x1,8x2|9x12
239 |             vr.samples[sample_id]["MCRL"] = tuple(
240 |                 "|".join(
241 |                     map(
242 |                         lambda pair: "x".join(map(str, pair)),
243 |                         sorted(
244 |                             Counter(
245 |                                 map(cn_getter, filter(lambda r: r.get("p") == pi, res_reads.values()))
246 |                             ).items()
247 |                         )
248 |                     )
249 |                 )
250 |                 for pi in range(res_peaks["modal_n"])
251 |             )
252 | 
253 |             ps = result["ps"]
254 | 
255 |             try:
256 |                 if ps is not None:  # have phase set on call, so mark as phased
257 |                     vr.samples[sample_id].phased = True
258 |                     vr.samples[sample_id]["PS"] = ps
259 |             except TypeError:
260 |                 vr.samples[sample_id].phased = False
261 |                 logger.error(f"Received bad PS value while writing VCF record at {contig}:{start} - {ps}")
262 |                 ps = None
263 | 
264 |             for snv in str_snvs:
265 |                 snv_id = snv["id"]
266 |                 if snv_id in snvs_written:
267 |                     continue
268 |                 snvs_written.add(snv_id)
269 | 
270 |                 ref = snv["ref"]
271 |                 snv_alts = tuple(sorted(set(filter(lambda v: v != ref, snv["call"]))))
272 |                 snv_alleles = (ref, *snv_alts)
273 |                 snv_pos = snv["pos"]
274 | 
275 |                 if len(snv_alleles) < 2:
276 |                     logger.error(f"Error while writing VCF: SNV ({snv_id}) at {contig}:{snv_pos+1} has no alts")
277 |                     continue
278 | 
279 |                 snv_vr: VariantRecord = variant_file.new_record(
280 |                     contig=contig,
281 |                     id=snv_id,
282 |                     start=snv_pos,
283 |                     stop=snv_pos + 1,
284 |                     alleles=snv_alleles,
285 |                 )
286 | 
287 |                 snv_vr.info[VCF_INFO_VT] = VT_SNV
288 | 
289 |                 snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"]))
290 |                 snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"])
291 |                 snv_vr.samples[sample_id]["AD"] = snv["rcs"]
292 | 
293 |                 if ps is not None:
294 |                     snv_vr.samples[sample_id].phased = True
295 |                     snv_vr.samples[sample_id]["PS"] = ps
296 | 
297 |                 variant_records.append(snv_vr)
298 | 
299 |         variant_records.append(vr)
300 | 
301 |     # sort the variant records by position
302 |     variant_records.sort(key=_vr_pos_key)
303 | 
304 |     # write them to the VCF
305 |     for vrr in variant_records:
306 |         variant_file.write(vrr)
307 | 


--------------------------------------------------------------------------------
/strkit/call/params.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import pathlib
  3 | 
  4 | from pysam import AlignmentFile
  5 | 
  6 | from ..logger import log_levels
  7 | 
  8 | __all__ = ["CallParams"]
  9 | 
 10 | 
 11 | class CallParams:
 12 |     def __init__(
 13 |         self,
 14 | 
 15 |         logger: logging.Logger,
 16 | 
 17 |         read_file: str,
 18 |         reference_file: str,
 19 |         loci_file: str,
 20 |         sample_id: str | None,
 21 |         min_reads: int = 4,
 22 |         min_allele_reads: int = 2,
 23 |         max_reads: int = 250,
 24 |         min_avg_phred: int = 13,
 25 |         min_read_align_score: float = 0.9,
 26 |         max_rcn_iters: int = 50,
 27 |         num_bootstrap: int = 100,
 28 |         flank_size: int = 70,
 29 |         skip_supplementary: bool = False,
 30 |         skip_secondary: bool = False,
 31 |         sex_chroms: str | None = None,
 32 |         realign: bool = False,
 33 |         hq: bool = False,
 34 |         use_hp: bool = False,
 35 |         snv_vcf: pathlib.Path | None = None,
 36 |         snv_min_base_qual: int = 20,
 37 |         targeted: bool = False,
 38 |         respect_ref: bool = False,
 39 |         count_kmers: str = "none",  # "none" | "peak" | "read"
 40 |         consensus: bool = False,
 41 |         vcf_anchor_size: int = 5,
 42 |         # ---
 43 |         log_level: int = logging.WARNING,
 44 |         seed: int | None = None,
 45 |         processes: int = 1,
 46 |     ):
 47 |         self.read_file: str = read_file
 48 |         self.reference_file: str = reference_file
 49 |         self.loci_file: str = loci_file
 50 |         self.min_reads: int = min_reads
 51 |         self.min_allele_reads: int = min_allele_reads
 52 |         self.max_reads: int = max_reads
 53 |         self.min_avg_phred: int = min_avg_phred
 54 |         self.min_read_align_score: float = min_read_align_score
 55 |         self.max_rcn_iters: int = max_rcn_iters
 56 |         self.num_bootstrap: int = num_bootstrap
 57 |         self.flank_size: int = flank_size
 58 |         self.skip_supplementary: bool = skip_supplementary
 59 |         self.skip_secondary: bool = skip_secondary
 60 |         self.sex_chroms: str | None = sex_chroms
 61 |         self.realign: bool = realign
 62 |         self.hq: bool = hq
 63 |         self.use_hp: bool = use_hp
 64 |         self.snv_vcf: pathlib.Path | None = snv_vcf
 65 |         self.snv_min_base_qual: int = snv_min_base_qual
 66 |         self.targeted: bool = targeted
 67 |         self.respect_ref: bool = respect_ref
 68 |         self.count_kmers: str = count_kmers
 69 |         self.consensus: bool = consensus
 70 |         self.vcf_anchor_size: int = vcf_anchor_size
 71 |         # ---
 72 |         self.log_level: int = log_level
 73 |         self.seed: int | None = seed
 74 |         self.processes: int = processes
 75 | 
 76 |         bf = AlignmentFile(read_file, reference_filename=reference_file)
 77 | 
 78 |         # noinspection PyTypeChecker
 79 |         bfh = bf.header.to_dict()
 80 | 
 81 |         sns: set[str] = {e.get("SM") for e in bfh.get("RG", ()) if e.get("SM")}
 82 |         bam_sample_id: str | None = None
 83 | 
 84 |         if len(sns) > 1:
 85 |             # Error or warning or what?
 86 |             sns_str = "', '".join(sns)
 87 |             logger.warning(f"Found more than one sample ID in BAM file(s): '{sns_str}'")
 88 |         elif not sns:
 89 |             if not sample_id:
 90 |                 logger.warning("Could not find sample ID in BAM file(s); sample ID can be set manually via --sample-id")
 91 |         else:
 92 |             bam_sample_id = sns.pop()
 93 | 
 94 |         self._sample_id_orig: str | None = sample_id
 95 |         self.sample_id = sample_id or bam_sample_id
 96 | 
 97 |     @classmethod
 98 |     def from_args(cls, logger: logging.Logger, p_args):
 99 |         return cls(
100 |             logger,
101 |             p_args.read_file,
102 |             p_args.ref,
103 |             p_args.loci,
104 |             sample_id=p_args.sample_id,
105 |             min_reads=p_args.min_reads,
106 |             min_allele_reads=p_args.min_allele_reads,
107 |             max_reads=p_args.max_reads,
108 |             min_avg_phred=p_args.min_avg_phred,
109 |             min_read_align_score=p_args.min_read_align_score,
110 |             max_rcn_iters=p_args.max_rcn_iters,
111 |             num_bootstrap=p_args.num_bootstrap,
112 |             flank_size=p_args.flank_size,
113 |             skip_supplementary=p_args.skip_supplementary,
114 |             skip_secondary=p_args.skip_secondary,
115 |             sex_chroms=p_args.sex_chr,
116 |             realign=p_args.realign,
117 |             hq=p_args.hq,
118 |             use_hp=p_args.use_hp,
119 |             snv_vcf=p_args.incorporate_snvs,
120 |             snv_min_base_qual=p_args.snv_min_base_qual,
121 |             targeted=p_args.targeted,
122 |             respect_ref=p_args.respect_ref,
123 |             count_kmers=p_args.count_kmers,
124 |             consensus=p_args.consensus or not (not p_args.vcf),  # Consensus calculation is required for VCF output.
125 |             vcf_anchor_size=min(max(p_args.vcf_anchor_size, 1), p_args.flank_size),
126 |             # ---
127 |             log_level=log_levels[p_args.log_level],
128 |             seed=p_args.seed,
129 |             processes=p_args.processes,
130 |         )
131 | 
132 |     def to_dict(self, as_inputted: bool = False):
133 |         return {
134 |             "read_file": self.read_file,
135 |             "reference_file": self.reference_file,
136 |             "min_reads": self.min_reads,
137 |             "min_allele_reads": self.min_allele_reads,
138 |             "max_reads": self.max_reads,
139 |             "min_avg_phred": self.min_avg_phred,
140 |             "min_read_align_score": self.min_read_align_score,
141 |             "max_rcn_iters": self.max_rcn_iters,
142 |             "num_bootstrap": self.num_bootstrap,
143 |             "flank_size": self.flank_size,
144 |             "skip_supplementary": self.skip_supplementary,
145 |             "skip_secondary": self.skip_secondary,
146 |             "sample_id": self._sample_id_orig if as_inputted else self.sample_id,
147 |             "realign": self.realign,
148 |             "hq": self.hq,
149 |             "use_hp": self.use_hp,
150 |             "snv_vcf": str(self.snv_vcf) if self.snv_vcf else None,
151 |             "snv_min_base_qual": self.snv_min_base_qual,
152 |             "targeted": self.targeted,
153 |             "respect_ref": self.respect_ref,
154 |             "count_kmers": self.count_kmers,
155 |             "consensus": self.consensus,
156 |             "vcf_anchor_size": self.vcf_anchor_size,
157 |             "log_level": self.log_level,
158 |             "seed": self.seed,
159 |             "processes": self.processes,
160 |         }
161 | 


--------------------------------------------------------------------------------
/strkit/call/realign.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import multiprocessing as mp
  3 | import numpy as np
  4 | import os
  5 | import parasail
  6 | import queue
  7 | import time
  8 | 
  9 | from numpy.typing import NDArray
 10 | 
 11 | from .align_matrix import match_score, dna_matrix
 12 | from .cigar import decode_cigar_np, get_aligned_pair_matches
 13 | from .params import CallParams
 14 | from .utils import calculate_seq_with_wildcards
 15 | 
 16 | __all__ = [
 17 |     "MatchedCoordPairListOrNone",
 18 |     "realign_read",
 19 |     "perform_realign",
 20 | ]
 21 | 
 22 | 
 23 | min_realign_score_ratio: float = 0.95  # TODO: parametrize
 24 | realign_indel_open_penalty: int = 7  # TODO: parametrize
 25 | max_ref_len_for_same_proc: int = 1200  # TODO: parametrize
 26 | max_read_len_for_same_proc: int = 20000  # TODO: parametrize
 27 | 
 28 | 
 29 | MatchedCoordPairList = tuple[NDArray[np.uint64], NDArray[np.uint64]]
 30 | MatchedCoordPairListOrNone = MatchedCoordPairList | None
 31 | 
 32 | 
 33 | def realign_read(
 34 |     ref_seq: str,
 35 |     query_seq: str,
 36 |     left_flank_coord: int,
 37 |     flank_size: int,
 38 |     rn: str,
 39 |     t_idx: int,
 40 |     always_realign: bool,
 41 |     q,  # mp.Queue | None
 42 |     log_level: int = logging.WARNING,
 43 | ) -> MatchedCoordPairListOrNone:
 44 |     # Have to re-attach logger in separate process I guess
 45 | 
 46 |     def ret_q(v: MatchedCoordPairListOrNone) -> MatchedCoordPairListOrNone:
 47 |         if q:
 48 |             q.put(v)
 49 |             q.close()
 50 |         return v
 51 | 
 52 |     from strkit.logger import create_process_logger
 53 |     lg = create_process_logger(os.getpid(), log_level)
 54 | 
 55 |     # flipped: 'ref sequence' as query here, since it should in general be shorter (!)
 56 |     pr = parasail.sg_dx_trace_scan_16(
 57 |         # fetch an extra base for the right flank coordinate check later (needs to be >= the exclusive coord)
 58 |         ref_seq, query_seq, realign_indel_open_penalty, 0, dna_matrix)
 59 | 
 60 |     if pr.score < (th := min_realign_score_ratio * (flank_size * 2 * match_score - realign_indel_open_penalty)):
 61 |         lg.debug(f"Realignment for {rn} scored below threshold ({pr.score} < {th:.2f})")
 62 |         return ret_q(None)
 63 | 
 64 |     lg.debug(
 65 |         f"Realigned {rn} in locus {t_idx}{' (due to soft clipping)' if not always_realign else ''}: scored {pr.score}; "
 66 |         f"Flipped CIGAR: {pr.cigar.decode.decode('ascii')}")
 67 | 
 68 |     matches = get_aligned_pair_matches(decode_cigar_np(pr.cigar.seq), left_flank_coord, 0)
 69 |     res: MatchedCoordPairList = (matches[1], matches[0])
 70 |     return ret_q(res)
 71 | 
 72 | 
 73 | def perform_realign(
 74 |     t_idx: int,
 75 |     left_flank_coord: int,
 76 |     ref_total_seq: str,
 77 |     rn: str,
 78 |     qs: str,
 79 |     fqqs: NDArray[np.uint8],
 80 |     # ---
 81 |     params: CallParams,
 82 |     realign_timeout: int,
 83 |     force_realign: bool,
 84 |     # ---
 85 |     logger_: logging.Logger,
 86 |     locus_log_str: str,
 87 | ) -> MatchedCoordPairListOrNone:
 88 |     qs_wc = calculate_seq_with_wildcards(qs, fqqs)
 89 | 
 90 |     ref_seq_len = len(ref_total_seq)
 91 |     qs_len = len(qs_wc)
 92 | 
 93 |     if ref_seq_len <= max_ref_len_for_same_proc and qs_len <= max_read_len_for_same_proc:
 94 |         # Don't start process for short realigns, since then process startup dominates the total time taken
 95 |         # TODO: more robust solution; realign worker somehow? How to do timeout?
 96 |         return realign_read(
 97 |             ref_total_seq, qs_wc, left_flank_coord, params.flank_size, rn, t_idx, force_realign, None, params.log_level
 98 |         )
 99 | 
100 |     t = time.time()
101 | 
102 |     q: mp.Queue = mp.Queue()
103 |     proc = mp.Process(target=realign_read, daemon=False, kwargs=dict(
104 |         # fetch an extra base for the right flank coordinate check later (needs to be >= the exclusive coord)
105 |         ref_seq=ref_total_seq,  # TODO: with the plus 1, really?
106 |         query_seq=qs_wc,
107 |         left_flank_coord=left_flank_coord,
108 |         flank_size=params.flank_size,
109 |         rn=rn,
110 |         t_idx=t_idx,
111 |         always_realign=force_realign,
112 |         q=q,
113 |         log_level=params.log_level,
114 |     ))
115 |     proc.start()
116 | 
117 |     pairs_new = None
118 |     try:
119 |         pairs_new = q.get(timeout=realign_timeout)
120 |         proc.join()
121 |     except queue.Empty:
122 |         logger_.warning(
123 |             f"{locus_log_str} - experienced timeout while re-aligning read {rn}. Reverting to initial "
124 |             f"alignment.")
125 |         proc.terminate()
126 |         time.sleep(0.1)  # wait a little for the process to terminate
127 |     finally:
128 |         wait_count: int = 0
129 |         while proc.is_alive():
130 |             logger_.warning(f"{locus_log_str} - realign job has still not exited. Waiting 0.5 seconds...")
131 |             time.sleep(0.5)
132 |             wait_count += 1
133 |             if wait_count > 30:
134 |                 logger_.fatal(f"{locus_log_str} - realign job never exited. Terminating...")
135 |                 exit(1)
136 |         proc.close()
137 | 
138 |     logger_.debug(
139 |         f"{locus_log_str} - {rn}: long realign job completed in {time.time() - t:.4f}s ({ref_seq_len=}, {qs_len=})")
140 | 
141 |     return pairs_new
142 | 


--------------------------------------------------------------------------------
/strkit/call/repeats.py:
--------------------------------------------------------------------------------
  1 | import parasail
  2 | 
  3 | from functools import lru_cache
  4 | from typing import Literal
  5 | 
  6 | from strkit_rust_ext import get_repeat_count as _get_repeat_count
  7 | from strkit.utils import idx_1_getter
  8 | 
  9 | from .align_matrix import dna_matrix, indel_penalty
 10 | 
 11 | __all__ = [
 12 |     "get_repeat_count",
 13 |     "get_ref_repeat_count",
 14 | ]
 15 | 
 16 | 
 17 | DEFAULT_LOCAL_SEARCH_RANGE = 3
 18 | 
 19 | 
 20 | def score_candidate_with_string(db_seq_profile: parasail.Profile, tr_seq: str) -> int:
 21 |     # TODO: sub-flank again, to avoid more errors in flanking region contributing to score?
 22 |     # Always assign parasail results to variables due to funky memory allocation behaviour
 23 |     #  - switch 'db' and 'query' here so we can use the db sequence as the profile for a "database" search against
 24 |     #    candidate sequences. order doesn't end up mattering, since we're using semi-global alignment.
 25 |     r = parasail.sg_striped_profile_sat(db_seq_profile, tr_seq, indel_penalty, indel_penalty)
 26 |     return r.score
 27 | 
 28 | 
 29 | def score_candidate(
 30 |     db_seq_profile: parasail.Profile,
 31 |     motif: str,
 32 |     motif_count: int,
 33 |     flank_left_seq: str,
 34 |     flank_right_seq: str,
 35 | ) -> int:
 36 |     return score_candidate_with_string(db_seq_profile, f"{flank_left_seq}{motif * motif_count}{flank_right_seq}")
 37 | 
 38 | 
 39 | def score_ref_boundaries(
 40 |     db_seq_profile: parasail.Profile,
 41 |     db_seq_rev_profile: parasail.Profile,
 42 |     tr_candidate: str,
 43 |     flank_left_seq: str,
 44 |     flank_right_seq: str,
 45 |     ref_size: int,
 46 | ) -> tuple[tuple[int, int], tuple[int, int]]:
 47 |     # Always assign parasail results to variables due to funky memory allocation behaviour
 48 |     ext_r_seq = f"{flank_left_seq}{tr_candidate}"
 49 |     r_fwd = parasail.sg_qe_scan_profile_sat(db_seq_profile, ext_r_seq, indel_penalty, indel_penalty)
 50 |     r_adj = r_fwd.end_query + 1 - len(flank_left_seq) - ref_size  # Amount to tweak boundary on the right side by
 51 | 
 52 |     # Used to be flank_right_seq[max(r_adj, 0):] but I think that adjustment makes this score worse than it should and
 53 |     # wasn't valid, since what matters is the delta over the limit...
 54 |     ext_l_seq = f"{tr_candidate}{flank_right_seq}"[::-1]  # reverse
 55 | 
 56 |     r_rev = parasail.sg_qe_scan_profile_sat(db_seq_rev_profile, ext_l_seq, indel_penalty, indel_penalty)
 57 |     l_adj = r_rev.end_query + 1 - len(flank_right_seq) - ref_size  # Amount to tweak boundary on the left side by
 58 | 
 59 |     return (r_fwd.score, r_adj), (r_rev.score, l_adj)
 60 | 
 61 | 
 62 | # TODO: instead of lru_cache, some more custom mechanism for sharing?
 63 | @lru_cache(maxsize=512)
 64 | def get_repeat_count(
 65 |     start_count: int,
 66 |     tr_seq: str,
 67 |     flank_left_seq: str,
 68 |     flank_right_seq: str,
 69 |     motif: str,
 70 |     max_iters: int,
 71 |     local_search_range: int = DEFAULT_LOCAL_SEARCH_RANGE,  # TODO: Parametrize for user
 72 |     step_size: int = 1,
 73 | ) -> tuple[tuple[int, int], int, int]:
 74 |     return _get_repeat_count(
 75 |         start_count, tr_seq, flank_left_seq, flank_right_seq, motif, max_iters, local_search_range, step_size
 76 |     )
 77 | 
 78 | 
 79 | def get_ref_repeat_count(
 80 |     start_count: int,
 81 |     tr_seq: str,
 82 |     flank_left_seq: str,
 83 |     flank_right_seq: str,
 84 |     motif: str,
 85 |     ref_size: int,
 86 |     vcf_anchor_size: int,
 87 |     max_iters: int,
 88 |     respect_coords: bool = False,
 89 |     local_search_range: int = DEFAULT_LOCAL_SEARCH_RANGE,  # TODO: Parametrize for user
 90 |     step_size: int = 1,
 91 | ) -> tuple[tuple[int | float, int], int, int, tuple[int, int], tuple[str, str, str]]:
 92 |     l_offset: int = 0
 93 |     r_offset: int = 0
 94 | 
 95 |     db_seq: str = f"{flank_left_seq}{tr_seq}{flank_right_seq}"
 96 |     db_seq_profile: parasail.Profile = parasail.profile_create_sat(db_seq, dna_matrix)
 97 |     db_seq_rev_profile: parasail.Profile = parasail.profile_create_sat(db_seq[::-1], dna_matrix)
 98 | 
 99 |     motif_size = len(motif)
100 | 
101 |     n_offset_scores: int = 0
102 | 
103 |     if not respect_coords:  # Extend out coordinates from initial definition
104 |         to_explore: list[tuple[int, Literal[-1, 0, 1]]] = [
105 |             (start_count - step_size, -1), (start_count + step_size, 1), (start_count, 0)]
106 | 
107 |         fwd_sizes_scores_adj: dict[int | float, tuple[int, int]] = {}
108 |         rev_sizes_scores_adj: dict[int | float, tuple[int, int]] = {}
109 | 
110 |         while to_explore and n_offset_scores < max_iters:
111 |             size_to_explore, direction = to_explore.pop()
112 |             if size_to_explore < 0:
113 |                 continue
114 | 
115 |             fwd_scores: list[tuple[float | int, tuple[int, int], int]] = []  # For right-side adjustment
116 |             rev_scores: list[tuple[float | int, tuple[int, int], int]] = []  # For left-side adjustment
117 | 
118 |             start_size = max(
119 |                 size_to_explore - (local_search_range if (direction < 1 or step_size > local_search_range) else 0), 0)
120 |             end_size = size_to_explore + (local_search_range if (direction > -1 or step_size > local_search_range)
121 |                                           else 0)
122 | 
123 |             for i in range(start_size, end_size + 1):
124 |                 fwd_rs = fwd_sizes_scores_adj.get(i)
125 |                 rev_rs = rev_sizes_scores_adj.get(i)
126 | 
127 |                 if fwd_rs is None or rev_rs is None:
128 |                     res = score_ref_boundaries(
129 |                         db_seq_profile, db_seq_rev_profile, motif * i, flank_left_seq, flank_right_seq, ref_size)
130 | 
131 |                     fwd_sizes_scores_adj[i] = fwd_rs = res[0]
132 |                     rev_sizes_scores_adj[i] = rev_rs = res[1]
133 | 
134 |                     n_offset_scores += 1
135 | 
136 |                 fwd_scores.append((i, fwd_rs, i))
137 |                 rev_scores.append((i, rev_rs, i))
138 | 
139 |             mv: tuple[float | int, tuple[int, int], int] = max((*fwd_scores, *rev_scores), key=idx_1_getter)
140 |             if mv[2] > size_to_explore and (
141 |                     (new_rc := mv[2] + step_size) not in fwd_sizes_scores_adj or new_rc not in rev_sizes_scores_adj):
142 |                 if new_rc >= 0:
143 |                     to_explore.append((new_rc, 1))
144 |             if mv[2] < size_to_explore and (
145 |                     (new_rc := mv[2] - step_size) not in fwd_sizes_scores_adj or new_rc not in rev_sizes_scores_adj):
146 |                 if new_rc >= 0:
147 |                     to_explore.append((new_rc, -1))
148 | 
149 |         # noinspection PyTypeChecker
150 |         fwd_top_res: tuple[int | float, tuple] = max(fwd_sizes_scores_adj.items(), key=lambda x: x[1][0])
151 |         # noinspection PyTypeChecker
152 |         rev_top_res: tuple[int | float, tuple] = max(rev_sizes_scores_adj.items(), key=lambda x: x[1][0])
153 | 
154 |         # Ignore negative differences (contractions vs TRF definition), but follow expansions
155 |         # TODO: Should we incorporate contractions? How would that work?
156 | 
157 |         l_offset = rev_top_res[1][1]
158 |         r_offset = fwd_top_res[1][1]
159 | 
160 |         if l_offset >= len(flank_left_seq) - vcf_anchor_size:
161 |             # don't do anything weird if we're removing the entire flank sequence
162 |             # TODO: this can be caused by NNNNNNN - see chr5:139453668-139454525 in GRCh38
163 |             l_offset = 0
164 |         if r_offset >= len(flank_right_seq):
165 |             r_offset = 0  # same here
166 | 
167 |         if l_offset > 0:
168 |             tr_seq = flank_left_seq[-1*l_offset:] + tr_seq  # first, move a chunk of the left flank to the TR seq
169 |             flank_left_seq = flank_left_seq[:-1*l_offset]  # then, remove that chunk from the left flank
170 |         if r_offset > 0:
171 |             tr_seq = tr_seq + flank_right_seq[:r_offset]  # same, but for the right flank
172 |             flank_right_seq = flank_right_seq[r_offset:]
173 | 
174 |     # ------------------------------------------------------------------------------------------------------------------
175 | 
176 |     final_res, n_iters_final_count, _ = get_repeat_count(
177 |         # always start with int here:
178 |         round(((start_count * motif_size) + (max(0, l_offset) + max(0, r_offset))) / motif_size),
179 |         tr_seq,
180 |         flank_left_seq,
181 |         flank_right_seq,
182 |         motif,
183 |         max_iters=max_iters,
184 |         step_size=step_size,
185 |     )
186 | 
187 |     return (
188 |         final_res, l_offset, r_offset, (n_offset_scores, n_iters_final_count), (flank_left_seq, tr_seq, flank_right_seq)
189 |     )
190 | 


--------------------------------------------------------------------------------
/strkit/call/snvs.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import multiprocessing.managers as mmg
  3 | 
  4 | from collections import Counter
  5 | 
  6 | from strkit_rust_ext import get_read_snvs, process_read_snvs_for_locus_and_calculate_useful_snvs, CandidateSNVs
  7 | from strkit.utils import idx_1_getter
  8 | 
  9 | from .types import ReadDict, CalledSNV
 10 | 
 11 | 
 12 | __all__ = [
 13 |     "SNV_OUT_OF_RANGE_CHAR",
 14 |     "SNV_GAP_CHAR",
 15 |     "SNV_NA_CHARS",
 16 |     "get_read_snvs",
 17 |     "call_and_filter_useful_snvs",
 18 |     "process_read_snvs_for_locus_and_calculate_useful_snvs",
 19 | ]
 20 | 
 21 | SNV_OUT_OF_RANGE_CHAR = "-"
 22 | SNV_GAP_CHAR = "_"
 23 | SNV_NA_CHARS = (SNV_OUT_OF_RANGE_CHAR, SNV_GAP_CHAR)
 24 | 
 25 | 
 26 | def call_and_filter_useful_snvs(
 27 |     contig: str,
 28 |     n_alleles: int,
 29 |     read_dict: dict[str, ReadDict],
 30 |     useful_snvs: list[tuple[int, int]],
 31 |     candidate_snvs: CandidateSNVs,
 32 |     # ---
 33 |     snv_quality_threshold: int,
 34 |     # ---
 35 |     snv_genotype_cache: mmg.DictProxy,
 36 |     # ---
 37 |     locus_log_str: str,
 38 |     logger_: logging.Logger,
 39 | ) -> list[CalledSNV]:
 40 |     """
 41 |     Call useful SNVs at a locus level from read-level SNV data.
 42 |     :param contig: The contig of the SNVs. Used for generating an ID if one does not exist.
 43 |     :param n_alleles: The number of alleles called for this locus.
 44 |     :param read_dict: Dictionary of read data. Must already have peaks assigned.
 45 |     :param useful_snvs: List of tuples representing useful SNVs: (SNV index, reference position)
 46 |     :param candidate_snvs: A dictionary of useful SNVs, indexed by reference position. Used to look up IDs.
 47 |     :param snv_quality_threshold: Minimum PHRED score needed to incorporate a read base into the genotype.
 48 |     :param snv_genotype_cache: Cache for SNV genotype/phase set information.
 49 |     :param locus_log_str: Locus string representation for logging purposes.
 50 |     :param logger_: Python logger object.
 51 |     :return: List of called SNVs for the locus.
 52 |     """
 53 | 
 54 |     # Since these have already been classified as 'useful' earlier in the pipeline,
 55 |     # we have some guarantees that these values should be fairly internally consistent
 56 |     # for a given peak... most of the time.
 57 | 
 58 |     allele_range = tuple(range(n_alleles))
 59 |     peak_base_counts: dict[int, dict[int, Counter]] = {
 60 |         u_ref: {p: Counter() for p in allele_range}
 61 |         for _, u_ref in useful_snvs
 62 |     }
 63 | 
 64 |     for rn, read in read_dict.items():
 65 |         p: int | None = read.get("p")
 66 |         if p is None:  # No peak; read wasn't used to call peaks
 67 |             continue
 68 |         for u_idx, (_, u_ref) in enumerate(useful_snvs):
 69 |             su, su_q = read["snvu"][u_idx]
 70 | 
 71 |             if su == SNV_GAP_CHAR or su_q >= snv_quality_threshold:
 72 |                 peak_base_counts[u_ref][p].update((su,))
 73 | 
 74 |     called_snvs: list[dict] = []
 75 |     skipped_snvs: set[int] = set()
 76 | 
 77 |     for u_idx, (u_ref, peak_counts) in enumerate(peak_base_counts.items()):
 78 |         call: list[str] = []
 79 |         rs: list[int] = []
 80 | 
 81 |         skipped: bool = False
 82 | 
 83 |         for a in allele_range:
 84 |             if skipped:
 85 |                 break
 86 | 
 87 |             peak_counts_a = peak_counts[a]
 88 |             a_total = peak_counts[a].total()
 89 | 
 90 |             if a_total == 0:  # probably due to quality filtering
 91 |                 skipped = True
 92 |                 logger_.warning(f"{locus_log_str} - for SNV {u_ref}, found a 0-total for allele {a} (a)")
 93 |                 break
 94 | 
 95 |             mc = peak_counts_a.most_common(2)
 96 |             mcc = mc[0]
 97 | 
 98 |             try:
 99 |                 if mcc[0] == SNV_OUT_OF_RANGE_CHAR:  # Chose most common non-uncalled value
100 |                     mcc = mc[1]
101 | 
102 |                 for b in allele_range:
103 |                     if b == a:
104 |                         continue
105 | 
106 |                     peak_counts_b = peak_counts[b]
107 |                     b_total = peak_counts_b.total()
108 | 
109 |                     if b_total == 0:  # probably due to quality filtering
110 |                         skipped = True
111 |                         logger_.warning(f"{locus_log_str} - for SNV {u_ref}, found a 0-total for allele {b} (b)")
112 |                         break
113 | 
114 |                     if (peak_counts_b[mcc[0]] / b_total) > (peak_counts_a[mcc[0]] / a_total / 2):  # TODO: parametrize
115 |                         logger_.debug(
116 |                             f"{locus_log_str} - for SNV position {u_ref}: got uninformative peak counts (cross-talk) - "
117 |                             f"{peak_counts=}")
118 |                         skipped = True
119 |                         break
120 | 
121 |             except IndexError:  # '-' is the only value, somehow
122 |                 logger_.debug(
123 |                     f"{locus_log_str} - for SNV {u_ref}, found only '{SNV_OUT_OF_RANGE_CHAR}' with {mcc[1]} reads")
124 |                 logger_.debug(f"{locus_log_str} - for SNV position {u_ref}: {mc=}, {peak_counts[a]=}")
125 |                 skipped = True
126 |                 break
127 | 
128 |             if not skipped:
129 |                 call.append(mcc[0])
130 |                 rs.append(mcc[1])
131 | 
132 |         snv_call_set = set(call)
133 | 
134 |         if not skipped and len(snv_call_set) == 1:
135 |             logger_.warning(
136 |                 f"{locus_log_str} - for SNV position {u_ref}: got degenerate call {call} from {peak_counts=}")
137 |             skipped = True
138 | 
139 |         snv_rec = candidate_snvs.get(u_ref)
140 |         if snv_rec is not None:
141 |             snv_id = snv_rec["id"]
142 |             if snv_id == ".":
143 |                 snv_id = f"{contig}_{u_ref}"
144 |         else:
145 |             snv_id = f"{contig}_{u_ref}"
146 | 
147 |         if not skipped:
148 |             cached_snv_genotype = snv_genotype_cache.get(snv_id)
149 |             if cached_snv_genotype is not None and (cgt := set(cached_snv_genotype[0])) != snv_call_set:
150 |                 logger_.warning(
151 |                     f"{locus_log_str} - got mismatch for SNV {snv_id} (position {u_ref}); cache genotype set {cgt} != "
152 |                     f"current genotype set {snv_call_set}")
153 |                 skipped = True
154 | 
155 |         if skipped:
156 |             skipped_snvs.add(u_idx)  # Skip this useful SNV, since it isn't actually useful
157 |             continue
158 | 
159 |         called_snvs.append({
160 |             "id": snv_id,
161 |             **({"ref": snv_rec["ref_base"]} if snv_rec is not None else {}),
162 |             "pos": u_ref,
163 |             "call": tuple(call),
164 |             "rcs": rs,
165 |         })
166 | 
167 |     # If we've skipped any SNVs, filter them out of the read dict - MUTATION
168 |     if skipped_snvs:
169 |         for read in read_dict.values():
170 |             if "snvu" not in read:
171 |                 continue
172 |             read["snvu"] = tuple(map(idx_1_getter, filter(lambda e: e[0] not in skipped_snvs, enumerate(read["snvu"]))))
173 |         logger_.debug(f"{locus_log_str} - filtered out {len(skipped_snvs)} not-actually-useful SNVs")
174 | 
175 |     return called_snvs
176 | 


--------------------------------------------------------------------------------
/strkit/call/types.py:
--------------------------------------------------------------------------------
  1 | # import pysam
  2 | import numpy as np
  3 | from typing import Literal, TypedDict
  4 | from numpy.typing import NDArray
  5 | 
  6 | 
  7 | __all__ = [
  8 |     "VCFContigFormat",
  9 |     "AssignMethod",
 10 |     "AssignMethodWithHP",
 11 |     "ConsensusMethod",
 12 |     # ---
 13 |     "ReadDict",
 14 |     "ReadDictExtra",
 15 |     "CalledSNV",
 16 |     "LocusResult",
 17 | ]
 18 | 
 19 | # TODO: py3.10: new Required[] TypedDict structuring
 20 | 
 21 | 
 22 | VCFContigFormat = Literal["chr", "num", "acc", ""]
 23 | 
 24 | AssignMethod = Literal["dist", "snv", "snv+dist", "single"]
 25 | AssignMethodWithHP = AssignMethod | Literal["hp"]
 26 | 
 27 | ConsensusMethod = Literal["single", "poa", "best_rep"]
 28 | 
 29 | 
 30 | class _ReadDictBase(TypedDict):
 31 |     s: Literal["-", "+"]  # DNA strand alignment
 32 |     cn: int | float  # Copy number
 33 |     w: float  # Weight
 34 |     sc: float | None  # Adjusted read model align score (None if TR is missing)
 35 | 
 36 | 
 37 | class ReadDict(_ReadDictBase, total=False):
 38 |     # Whether the read was realigned by hand using a local alignment algorithm.
 39 |     realn: bool
 40 | 
 41 |     # Whether the read appears to be chimeric within the locus region,
 42 |     # i.e. aligned twice with different soft-clipping.
 43 |     chimeric_in_region: bool
 44 | 
 45 |     p: int  # Peak (allele)
 46 | 
 47 |     kmers: dict[str, int]  # Dictionary of {kmer: count}
 48 | 
 49 |     # Only added if HP tags from a haplotagged alignment file are being incorporated:
 50 |     hp: int
 51 |     ps: int
 52 | 
 53 |     # Only added if SNVs are being incorporated:
 54 |     #  - After including only useful SNVs, this contains a tuple of bases for just those + corresponding qualities
 55 |     snvu: tuple[tuple[str, int], ...]
 56 | 
 57 | 
 58 | class ReadDictExtra(TypedDict, total=False):
 59 |     _ref_start: int  # Read start in ref coordinates
 60 |     _ref_end: int  # Read end in ref coordinates
 61 | 
 62 |     # BEGIN: only added if consensus is being calculated
 63 |     _start_anchor: str  # Left anchor for calculated allele sequence (usually 1 base)
 64 |     _tr_seq: str  # Tandem repeat sequence
 65 |     # END: only added if consensus is being calculated
 66 | 
 67 |     # Below are only added if SNVs are being incorporated:
 68 | 
 69 |     _qs: str  # Query (read) sequence
 70 |     _fqqs: NDArray[np.uint8]  # Query (read) base qualities
 71 | 
 72 |     sig_clip_left: bool  # Significant amounts of clipping (5' of read)
 73 |     sig_clip_right: bool  # Significant amounts of clipping (3' of read)
 74 | 
 75 |     snv: dict[int, str]  # Intermediate result: dictionary of a bunch of SNVs for this read {position: base}
 76 |     # Intermediate result: tuple of bases/qualities for the set of SNVs across all reads
 77 |     snv_bases: tuple[tuple[str, int], ...]
 78 | 
 79 | 
 80 | class _CalledSNVBase(TypedDict):
 81 |     id: str
 82 |     pos: int
 83 |     call: tuple[str, ...]
 84 |     rcs: list[int]
 85 | 
 86 | 
 87 | class CalledSNV(_CalledSNVBase, total=False):
 88 |     ref: str
 89 | 
 90 | 
 91 | class BasePeakData(TypedDict):
 92 |     means: NDArray[np.float32]
 93 |     weights: NDArray[np.float32]
 94 |     stdevs: NDArray[np.float32]
 95 |     modal_int: int
 96 |     n_reads: list[int]
 97 | 
 98 | 
 99 | class PeakData(BasePeakData):
100 |     kmers: dict[str, int]
101 |     seqs: list[tuple[str, ConsensusMethod]]  # really "list-tyup
102 | 
103 | 
104 | class BaseLocusResult(TypedDict):
105 |     locus_index: int
106 |     contig: str
107 |     start: int
108 |     end: int
109 | 
110 |     motif: str
111 | 
112 |     assign_method: AssignMethodWithHP | None
113 |     call: list[int] | None
114 |     call_95_cis: list[list[int]] | None
115 |     call_99_cis: list[list[int]] | None
116 | 
117 |     # Mean model (candidate TR sequence) alignment score across reads.
118 |     mean_model_align_score: float | None
119 | 
120 | 
121 | class LocusResult(BaseLocusResult, total=False):
122 |     start_adj: int
123 |     end_adj: int
124 | 
125 |     ref_cn: int
126 | 
127 |     ps: int | None
128 |     peaks: PeakData | None
129 |     read_peaks_called: bool
130 |     time: float
131 | 
132 |     # if we're in consensus mode: ---
133 |     ref_start_anchor: str
134 |     ref_seq: str
135 |     # ---
136 | 
137 |     reads: dict[str, ReadDict]
138 |     snvs: list[CalledSNV]
139 | 


--------------------------------------------------------------------------------
/strkit/call/utils.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import operator
 3 | 
 4 | from functools import cache, partial
 5 | from numpy.typing import NDArray
 6 | 
 7 | from ..utils import cat_strs
 8 | 
 9 | __all__ = [
10 |     "cn_getter",
11 |     "neq_blank",
12 |     "find_pair_by_ref_pos",
13 |     "normalize_contig",
14 |     "round_to_base_pos",
15 |     "get_new_seed",
16 |     "calculate_seq_with_wildcards",
17 | ]
18 | 
19 | 
20 | # index/property getters and other partials
21 | cn_getter = operator.itemgetter("cn")
22 | neq_blank = partial(operator.ne, "")
23 | 
24 | 
25 | def find_pair_by_ref_pos(r_coords: NDArray[np.uint64], target: int, start_left: int = 0) -> tuple[int, bool]:
26 |     n_pairs: int = len(r_coords)
27 |     idx = start_left + np.searchsorted(r_coords[start_left:], target)
28 |     return idx, idx < n_pairs and r_coords[idx] == target
29 | 
30 | 
31 | def normalize_contig(contig: str, has_chr: bool) -> str:
32 |     return ("chr" if has_chr else "") + contig.replace("chr", "")
33 | 
34 | 
35 | def round_to_base_pos(x, motif_size: int) -> float:
36 |     return round(float(x) * motif_size) / motif_size
37 | 
38 | 
39 | def get_new_seed(rng: np.random.Generator) -> int:
40 |     return rng.integers(0, 4096, dtype=int)
41 | 
42 | 
43 | @cache  # TODO: parametrize base_wildcard_threshold
44 | def _mask_low_q_base(base_and_qual: tuple[str, int], base_wildcard_threshold: int = 3) -> str:
45 |     return base_and_qual[0] if base_and_qual[1] > base_wildcard_threshold else "X"
46 | 
47 | 
48 | def calculate_seq_with_wildcards(qs: str, quals: NDArray[np.uint8] | None) -> str:
49 |     if quals is None:
50 |         return qs  # No quality information, so don't do anything
51 |     return cat_strs(map(_mask_low_q_base, zip(qs, quals)))
52 | 


--------------------------------------------------------------------------------
/strkit/call/validation.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from logging import Logger
 3 | 
 4 | __all__ = [
 5 |     "LocusValidationError",
 6 |     "valid_motif",
 7 |     "validate_locus",
 8 | ]
 9 | 
10 | # patterns
11 | RE_VALID_MOTIF = re.compile(r"^[ACGTRYSWKMBDHVN]+$")
12 | 
13 | 
14 | # exceptions
15 | 
16 | class LocusValidationError(ValueError):
17 |     def __init__(self, error_str: str, hint_msg: str):
18 |         self._error_str = error_str
19 |         self._hint_msg = hint_msg
20 |         super().__init__(error_str)
21 | 
22 |     def log_error(self, logger: Logger) -> None:
23 |         logger.critical(self._error_str)
24 |         logger.critical(self._hint_msg)
25 | 
26 | 
27 | # functions
28 | 
29 | def valid_motif(motif: str) -> bool:
30 |     """
31 |     Determines whether a motif is valid, i.e., can be used by `strkit call`. Here, valid means "composed of IUPAC
32 |     nucleotide codes and no other characters."
33 |     :param motif: The motif to assess the validity of.
34 |     :return: Whether the motif is valid or not.
35 |     """
36 |     return RE_VALID_MOTIF.match(motif) is not None
37 | 
38 | 
39 | def validate_locus(line: int, start: int, end: int, motif: str) -> None:
40 |     """
41 |     Validate a locus definition for use by STRkit.
42 |     :param line: Line number, for logging errors in a catalog BED file.
43 |     :param start: Start coordinate; 0-based, inclusive.
44 |     :param end: End coordinate; 0-based, exclusive.
45 |     :param motif: Motif sequence (to be validated).
46 |     """
47 | 
48 |     if start >= end:
49 |         raise LocusValidationError(
50 |             f"BED catalog format error: invalid coordinates on line {line}: start ({start}) >= end ({end})",
51 |             "BED catalog: coordinates must be 0-based, half-open - [start, end)",
52 |         )
53 | 
54 |     if not valid_motif(motif):
55 |         raise LocusValidationError(
56 |             f"BED catalog format error: invalid motif on line {line}: {motif}",
57 |             "BED catalog: motifs must contain only valid IUPAC nucleotide codes.",
58 |         )
59 | 


--------------------------------------------------------------------------------
/strkit/catalog/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/catalog/__init__.py


--------------------------------------------------------------------------------
/strkit/catalog/combine.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import sys
 4 | from ..constants import CALLER_STRAGLR, CHROMOSOMES
 5 | 
 6 | __all__ = [
 7 |     "combine_catalogs",
 8 | ]
 9 | 
10 | 
11 | def combine_catalogs(caller: str, paths: list[str]) -> int:
12 |     if caller != CALLER_STRAGLR:
13 |         sys.stderr.write(f"Error: This command only supports caller '{CALLER_STRAGLR}'\n")
14 |         return 1
15 | 
16 |     lines = set()
17 | 
18 |     for path in paths:
19 |         if not path.endswith(".bed"):
20 |             sys.stderr.write(f"Error: Please supply only .bed files from '{CALLER_STRAGLR}'\n")
21 |             return 1
22 | 
23 |         with open(path, "r") as fh:
24 |             for line in fh:
25 |                 if line.startswith("#"):
26 |                     continue
27 | 
28 |                 raw_data = line.strip().split("\t")
29 |                 lines.add((raw_data[0], int(raw_data[1]), int(raw_data[2]), raw_data[3]))
30 | 
31 |     for line in sorted(lines, key=lambda x: (CHROMOSOMES.index(x[0]), x[1])):
32 |         sys.stdout.write("\t".join(map(str, line)) + "\n")
33 | 
34 |     return 0
35 | 


--------------------------------------------------------------------------------
/strkit/constants.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "CALLER_EXPANSIONHUNTER",
 3 |     "CALLER_HIPSTR",
 4 |     "CALLER_GANGSTR",
 5 |     "CALLER_REPEATHMM",
 6 |     "CALLER_STRAGLR",
 7 |     "CALLER_TANDEM_GENOTYPES",
 8 | 
 9 |     "M_CHROMOSOME_NAMES",
10 |     "X_CHROMOSOME_NAMES",
11 |     "Y_CHROMOSOME_NAMES",
12 |     "SEX_CHROMOSOMES",
13 |     "AUTOSOMES",
14 |     "CHROMOSOMES",
15 | 
16 |     "MI_CALLERS",
17 | ]
18 | 
19 | CALLER_EXPANSIONHUNTER = "expansionhunter"
20 | CALLER_HIPSTR = "hipstr"
21 | CALLER_LONGTR = "longtr"
22 | CALLER_GANGSTR = "gangstr"
23 | CALLER_GENERIC_VCF = "generic-vcf"
24 | CALLER_REPEATHMM = "repeathmm"
25 | CALLER_STRDUST = "strdust"
26 | CALLER_STRAGLR = "straglr"
27 | CALLER_STRKIT = "strkit"
28 | CALLER_STRKIT_JSON = "strkit-json"
29 | CALLER_STRKIT_VCF = "strkit-vcf"
30 | CALLER_TANDEM_GENOTYPES = "tandem-genotypes"
31 | CALLER_TRGT = "trgt"
32 | 
33 | M_CHROMOSOME_NAMES = ("chrM", "M")
34 | X_CHROMOSOME_NAMES = ("chrX", "X")
35 | Y_CHROMOSOME_NAMES = ("chrY", "Y")
36 | SEX_CHROMOSOMES = (*X_CHROMOSOME_NAMES, *Y_CHROMOSOME_NAMES)
37 | 
38 | AUTOSOMES = (
39 |     *map(str, range(1, 23)),
40 |     *(f"chr{i}" for i in range(1, 23)),
41 | )
42 | 
43 | CHROMOSOMES = (
44 |     *AUTOSOMES,
45 |     *SEX_CHROMOSOMES,
46 | )
47 | 
48 | 
49 | MI_CALLERS = (
50 |     CALLER_EXPANSIONHUNTER,
51 |     CALLER_GANGSTR,
52 |     CALLER_GENERIC_VCF,
53 |     CALLER_LONGTR,
54 |     CALLER_REPEATHMM,
55 |     CALLER_STRDUST,
56 |     CALLER_STRAGLR,
57 |     CALLER_STRKIT,
58 |     CALLER_STRKIT_JSON,
59 |     CALLER_STRKIT_VCF,
60 |     CALLER_TANDEM_GENOTYPES,
61 |     CALLER_TRGT,
62 | )
63 | 


--------------------------------------------------------------------------------
/strkit/convert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/convert/__init__.py


--------------------------------------------------------------------------------
/strkit/convert/_bed_4.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from logging import Logger
 3 | 
 4 | __all__ = [
 5 |     "trf_to_bed_4",
 6 | ]
 7 | 
 8 | 
 9 | def trf_to_bed_4(trf_data: list, _logger: Logger):
10 |     for item in trf_data:
11 |         sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n")
12 | 


--------------------------------------------------------------------------------
/strkit/convert/constants.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "IN_FORMAT_TRF",
 3 |     "IN_FORMAT_TRGT",
 4 |     "CONVERTER_IN_FORMATS",
 5 | ]
 6 | 
 7 | IN_FORMAT_TRF = "trf"
 8 | IN_FORMAT_TRGT = "trgt"
 9 | 
10 | CONVERTER_IN_FORMATS = (
11 |     IN_FORMAT_TRF,
12 |     IN_FORMAT_TRGT,
13 | )
14 | 


--------------------------------------------------------------------------------
/strkit/convert/converter.py:
--------------------------------------------------------------------------------
 1 | from logging import Logger
 2 | from typing import Callable
 3 | 
 4 | from ._bed_4 import trf_to_bed_4
 5 | from .constants import IN_FORMAT_TRF, IN_FORMAT_TRGT, CONVERTER_IN_FORMATS
 6 | from .expansionhunter import trf_bed_to_eh
 7 | from .hipstr import trf_bed_to_hipstr
 8 | from .gangstr import trf_bed_to_gangstr
 9 | from .trgt import trgt_bed_to_bed4, trf_or_strkit_bed_to_trgt
10 | 
11 | import strkit.constants as c
12 | 
13 | __all__ = [
14 |     "CONVERTER_OUTPUT_FORMATS",
15 |     "convert",
16 | ]
17 | 
18 | convert_formats: dict[tuple[str, str], Callable[[list, Logger], None]] = {
19 |     # TRF converters:
20 |     (IN_FORMAT_TRF, c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh,
21 |     (IN_FORMAT_TRF, c.CALLER_HIPSTR): trf_bed_to_hipstr,
22 |     (IN_FORMAT_TRF, c.CALLER_GANGSTR): trf_bed_to_gangstr,
23 |     (IN_FORMAT_TRF, c.CALLER_REPEATHMM): lambda x: x,
24 |     (IN_FORMAT_TRF, c.CALLER_STRAGLR): trf_to_bed_4,
25 |     (IN_FORMAT_TRF, c.CALLER_STRKIT): trf_to_bed_4,  # or can just leave -asis
26 |     (IN_FORMAT_TRF, c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4,
27 |     (IN_FORMAT_TRF, c.CALLER_TRGT): trf_or_strkit_bed_to_trgt,
28 |     # TRGT converters:
29 |     (IN_FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4,
30 |     (IN_FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_bed4,
31 |     (IN_FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4,
32 | }
33 | 
34 | CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats)))
35 | 
36 | 
37 | def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int:
38 |     out_format = out_format.lower()
39 | 
40 |     if in_format == IN_FORMAT_TRF:
41 |         if out_format == c.CALLER_REPEATHMM:
42 |             logger.critical(f"No need to convert for '{out_format}'; TRF BED files are accepted as input")
43 |             return 1
44 |         elif out_format == c.CALLER_STRKIT:
45 |             logger.info("STRkit can use TRF BED files as-is; will convert to a BED4 file")
46 | 
47 |     if in_format not in CONVERTER_IN_FORMATS:
48 |         logger.critical(f"Unsupported input format: {in_format}")
49 | 
50 |     if (in_format, out_format) not in convert_formats:
51 |         logger.critical(f"Unsupported conversion: {in_format} -> {out_format} (no converter defined)")
52 |         return 1
53 | 
54 |     with open(in_file, "r") as tf:
55 |         data = [line.strip().split("\t") for line in tf]
56 | 
57 |     convert_formats[(in_format, out_format)](data, logger)
58 |     return 0
59 | 


--------------------------------------------------------------------------------
/strkit/convert/expansionhunter.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | from logging import Logger
 4 | 
 5 | __all__ = [
 6 |     "trf_bed_to_eh",
 7 | ]
 8 | 
 9 | 
10 | def trf_bed_to_eh(trf_data: list, _logger: Logger):
11 |     eh_formatted_loci = []
12 | 
13 |     for i, item in enumerate(trf_data, 1):
14 |         eh_formatted_loci.append({
15 |             "LocusId": f"Locus{i}",
16 |             "LocusStructure": f"({item[-1]})*",
17 |             "ReferenceRegion": f"{item[0]}:{item[1]}-{item[2]}",
18 |             "VariantType": "Repeat",
19 |         })
20 | 
21 |     sys.stdout.write(json.dumps(eh_formatted_loci, indent=2))
22 | 


--------------------------------------------------------------------------------
/strkit/convert/gangstr.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from logging import Logger
 3 | 
 4 | __all__ = [
 5 |     "trf_bed_to_gangstr",
 6 | ]
 7 | 
 8 | 
 9 | def trf_bed_to_gangstr(trf_data: list, _logger: Logger):
10 |     for i, item in enumerate(trf_data, 1):
11 |         sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n")
12 | 


--------------------------------------------------------------------------------
/strkit/convert/hipstr.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from logging import Logger
 3 | 
 4 | __all__ = [
 5 |     "trf_bed_to_hipstr",
 6 | ]
 7 | 
 8 | 
 9 | def trf_bed_to_hipstr(trf_data: list, _logger: Logger):
10 |     for i, item in enumerate(trf_data, 1):
11 |         sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n")
12 | 


--------------------------------------------------------------------------------
/strkit/convert/trgt.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from logging import Logger
 3 | 
 4 | __all__ = [
 5 |     "trgt_bed_to_bed4",
 6 |     "trf_or_strkit_bed_to_trgt",
 7 | ]
 8 | 
 9 | from strkit.iupac import get_iupac_code_for_nt_set
10 | 
11 | 
12 | def trgt_bed_to_bed4(trgt_data: list, logger: Logger):
13 |     """
14 |     Converts a TRGT repeat catalog to the STRkit/BED4 catalog format.
15 |     :param trgt_data: The loaded TRGT catalog (split by tab).
16 |     :param logger: A logger instance for issuing conversion failure warnings.
17 |     """
18 | 
19 |     for line, data in enumerate(trgt_data, 1):
20 |         structure_data = {j[0]: j[1] for j in (i.split("=") for i in data[3].split(";"))}
21 |         motifs = structure_data["MOTIFS"].split(",")
22 | 
23 |         if len(motifs) > 1:
24 |             # We can do some basic IUPAC code normalization here for simple compound STR structures in TRGT catalogs:
25 |             if (
26 |                 structure_data["STRUC"] in {"".join(f"({m})n" for m in motifs), f"<{structure_data['ID']}>"}
27 |                 and len({len(m) for m in motifs}) == 1
28 |             ):
29 |                 failed: bool = False
30 |                 combined_motif_bases = []
31 |                 for bases in zip(*motifs):
32 |                     bases_set = set(bases)
33 |                     if len(bases_set) == 1:  # same base in all motifs
34 |                         combined_motif_bases.append(next(iter(bases_set)))
35 |                     elif iupac_code := get_iupac_code_for_nt_set(bases_set):
36 |                         # find IUPAC code representing consensus "base" and append it to the motif
37 |                         combined_motif_bases.append(iupac_code)
38 |                     else:  # something went wrong (invalid base?)
39 |                         failed = True
40 |                         break
41 | 
42 |                 if not failed:  # found a consensus base for the multiple-motif STR, so we can convert it
43 |                     sys.stdout.write("\t".join((*data[:3], "".join(combined_motif_bases))) + "\n")
44 |                     continue
45 | 
46 |             data_str = "\t".join(data)
47 |             logger.warning(f"Could not convert complex locus at line {line}: {data_str}")
48 |             continue
49 | 
50 |         sys.stdout.write("\t".join((*data[:3], motifs[0])) + "\n")
51 | 
52 | 
53 | def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger):
54 |     """
55 |     Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog.
56 |     :param trf_data: The loaded BED catalog data.
57 |     :param _logger: Logger instance (unused).
58 |     """
59 | 
60 |     for i, item in enumerate(trf_data):
61 |         motif = trf_data[-1]
62 |         sys.stdout.write("\t".join((*trf_data[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")
63 | 


--------------------------------------------------------------------------------
/strkit/exceptions.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "ParamError",
 3 |     "InputError",
 4 | ]
 5 | 
 6 | 
 7 | class ParamError(Exception):
 8 |     pass
 9 | 
10 | 
11 | class InputError(Exception):
12 |     pass
13 | 


--------------------------------------------------------------------------------
/strkit/iupac.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "IUPAC_NUCLEOTIDE_CODES",
 3 |     "IUPAC_NUCLEOTIDE_CODES_REVERSE",
 4 |     "get_iupac_code_for_nt_set",
 5 | ]
 6 | 
 7 | # IUPAC nucleotide codes representing >1 nucleotide (quasi-"wildcards"):
 8 | #  - It's important that the values remain sorted, so we can do a reverse-lookup (see below)
 9 | IUPAC_NUCLEOTIDE_CODES: dict[str, tuple[str, ...]] = {
10 |     "R": ("A", "G"),
11 |     "Y": ("C", "T"),
12 |     "S": ("C", "G"),
13 |     "W": ("A", "T"),
14 |     "K": ("G", "T"),
15 |     "M": ("A", "C"),
16 |     "B": ("C", "G", "T"),
17 |     "D": ("A", "C", "T"),
18 |     "H": ("A", "C", "T"),
19 |     "V": ("A", "C", "G"),
20 |     "N": ("A", "C", "G", "T"),
21 | }
22 | 
23 | # Lookup table of {(sorted nucleotides): "<IUPAC code>"}
24 | IUPAC_NUCLEOTIDE_CODES_REVERSE: dict[tuple[str, ...], str] = {
25 |     v: k for k, v in IUPAC_NUCLEOTIDE_CODES.items()
26 | }
27 | 
28 | 
29 | def get_iupac_code_for_nt_set(nt_set: set[str]) -> str | None:
30 |     """
31 |     Given a set of standard nucleotides (ATGC), return an IUPAC code which represents the set.
32 |     :param nt_set: A set of nucleotides (A, T, G, or C). Any other base will result in a None return.
33 |     :return: An IUPAC nucleotide code representing the set of nucleotides, or None given an invalid nucleotide set.
34 |     """
35 |     return IUPAC_NUCLEOTIDE_CODES_REVERSE.get(tuple(sorted(nt_set)))
36 | 


--------------------------------------------------------------------------------
/strkit/json.py:
--------------------------------------------------------------------------------
 1 | import orjson as json
 2 | 
 3 | 
 4 | __all__ = [
 5 |     "Serializable",
 6 |     "json",
 7 |     "dumps",
 8 |     "dumps_indented",
 9 | ]
10 | 
11 | 
12 | Serializable = dict | list | tuple | str | int | float
13 | 
14 | 
15 | def dumps(v: Serializable) -> bytes:
16 |     return json.dumps(v, option=json.OPT_NON_STR_KEYS | json.OPT_SERIALIZE_NUMPY)
17 | 
18 | 
19 | def dumps_indented(v: Serializable) -> bytes:
20 |     return json.dumps(v, option=json.OPT_NON_STR_KEYS | json.OPT_INDENT_2 | json.OPT_SERIALIZE_NUMPY)
21 | 


--------------------------------------------------------------------------------
/strkit/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import sys
 3 | 
 4 | __all__ = [
 5 |     "get_main_logger",
 6 |     "attach_stream_handler",
 7 |     "create_process_logger",
 8 |     "log_levels",
 9 | ]
10 | 
11 | fmt = logging.Formatter(fmt="%(name)s:\t[%(levelname)s]\t%(message)s")
12 | 
13 | 
14 | def get_main_logger(level: int = logging.DEBUG):
15 |     logger = logging.getLogger("strkit-main")
16 |     logger.setLevel(level)
17 |     return logger
18 | 
19 | 
20 | def attach_stream_handler(level: int, logger_=None):
21 |     ch = logging.StreamHandler(sys.stderr)
22 |     ch.setLevel(level)
23 |     ch.setFormatter(fmt)
24 |     logger_.addHandler(ch)
25 | 
26 | 
27 | def create_process_logger(pid: int, level: int):
28 |     lg = logging.getLogger(f"strkit-{pid}")
29 |     lg.setLevel(level)
30 |     if not lg.handlers:
31 |         attach_stream_handler(level, logger_=lg)
32 |     return lg
33 | 
34 | 
35 | log_levels = {
36 |     "debug": logging.DEBUG,
37 |     "info": logging.INFO,
38 |     "warning": logging.WARNING,
39 |     "error": logging.ERROR,
40 | }
41 | 


--------------------------------------------------------------------------------
/strkit/mi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/mi/__init__.py


--------------------------------------------------------------------------------
/strkit/mi/base.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import logging
  4 | import uuid
  5 | from abc import ABC, abstractmethod
  6 | from pathlib import Path
  7 | from typing import Any
  8 | 
  9 | from strkit.logger import get_main_logger
 10 | from .intervals import (
 11 |     LociDictOfDict,
 12 |     LociDictOfList,
 13 |     build_loci_dict_of_dict_from_file,
 14 |     build_loci_dict_of_list_from_file,
 15 |     overlapping_loci_dict_of_dict,
 16 |     overlapping_loci_dict_of_list,
 17 | )
 18 | from .result import MIKind, MIContigResult, MIResult
 19 | 
 20 | __all__ = [
 21 |     "SEX_CHROMOSOMES",
 22 |     "BaseCalculator",
 23 | ]
 24 | 
 25 | 
 26 | SEX_CHROMOSOMES = {"chrX", "X", "chrY", "Y"}  # TODO: proper parametrization
 27 | 
 28 | 
 29 | # noinspection PyUnusedLocal
 30 | class BaseCalculator(ABC):
 31 |     def __init__(
 32 |         self,
 33 |         child_call_file: Path,
 34 |         mother_call_file: Path,
 35 |         father_call_file: Path,
 36 | 
 37 |         child_id: str | None = None,
 38 |         mother_id: str | None = None,
 39 |         father_id: str | None = None,
 40 | 
 41 |         loci_file: str | None = None,
 42 |         exclude_file: str | None = None,
 43 |         one_based_loci: bool = False,
 44 | 
 45 |         widen: float = 0,
 46 | 
 47 |         mismatch_out_mi: MIKind = "pm1",
 48 |         test_to_perform: str = "none",  # means mismatch_out_mi has no effect
 49 |         sig_level: float = 0.05,
 50 |         mt_corr: str = "none",
 51 |         only_phased: bool = False,
 52 | 
 53 |         debug: bool = False,
 54 |         logger: logging.Logger | None = None,
 55 |     ):
 56 |         self._debug: bool = debug
 57 |         self._logger: logging.Logger = logger or get_main_logger()
 58 | 
 59 |         self._child_call_file: Path = child_call_file
 60 |         self._mother_call_file: Path = mother_call_file
 61 |         self._father_call_file: Path = father_call_file
 62 | 
 63 |         self._child_id: str | None = child_id
 64 |         self._mother_id: str | None = mother_id
 65 |         self._father_id: str | None = father_id
 66 | 
 67 |         self._loci_file: str | None = loci_file
 68 |         self._loci_dict: LociDictOfDict = build_loci_dict_of_dict_from_file(loci_file, one_based_loci)
 69 |         self._loci_dict_cache_key: str = str(uuid.uuid4())
 70 |         if self._loci_file is not None:
 71 |             self._logger.debug(
 72 |                 "Built loci dict of size %d with contigs %s",
 73 |                 sum(len(loc) for loc in self._loci_dict.values()),
 74 |                 tuple(self._loci_dict.keys()),
 75 |             )
 76 | 
 77 |         self._exclude_file: str | None = exclude_file
 78 |         self._exclude_dict: LociDictOfList = build_loci_dict_of_list_from_file(exclude_file, one_based_loci)
 79 |         if self._exclude_file is not None:
 80 |             self._logger.debug(
 81 |                 "Built exclude dict of size %d with contigs %s",
 82 |                 len(self._loci_dict),
 83 |                 tuple(self._exclude_dict.keys()),
 84 |             )
 85 | 
 86 |         self._decimal_threshold: float = 0.5
 87 |         self._widen: float = widen
 88 | 
 89 |         self._mismatch_out_mi: MIKind = mismatch_out_mi
 90 | 
 91 |         self._test_to_perform: str = test_to_perform
 92 |         self._sig_level: float = sig_level
 93 |         self._mt_corr: str = mt_corr
 94 |         self._only_phased: bool = only_phased
 95 | 
 96 |         self._cache: dict[str, Any] = {}
 97 | 
 98 |     @property
 99 |     def test_to_perform(self) -> str:
100 |         return self._test_to_perform
101 | 
102 |     @property
103 |     def sig_level(self) -> float:
104 |         return self._sig_level
105 | 
106 |     @property
107 |     def mt_corr(self) -> str:
108 |         return self._mt_corr
109 | 
110 |     def get_loci_overlapping(
111 |         self, contig: str, start: int, end: int, first_only: bool
112 |     ) -> list[tuple[int, int, list[str]]]:
113 |         return overlapping_loci_dict_of_dict(
114 |             contig, start, end, self._loci_dict, first_only, dict_cache_key=self._loci_dict_cache_key
115 |         )
116 | 
117 |     def should_exclude_locus(self, contig: str, start: int, end: int) -> bool:
118 |         return any(True for _ in overlapping_loci_dict_of_list(contig, start, end, self._exclude_dict, True))
119 | 
120 |     def should_skip_locus(
121 |         self, contig: str, start: int, end: int, cached_overlapping: list | None = None
122 |     ) -> str | None:
123 |         # Returns either a reason string (if yes) or None (=== no)
124 | 
125 |         # Check to make sure call is present in TRF BED file, if it is specified
126 |         # Check to make sure the locus is not excluded via overlap with exclude BED
127 | 
128 |         if not self._loci_file or not self._loci_dict:
129 |             return None
130 | 
131 |         if not (cached_overlapping or self.get_loci_overlapping(contig, start, end, True)):
132 |             return "no overlapping loci"
133 | 
134 |         if self.should_exclude_locus(contig, start, end):
135 |             return "should_exclude_locus returned True"
136 | 
137 |         return None
138 | 
139 |     @abstractmethod
140 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
141 |         return set(), set(), set()
142 | 
143 |     def get_trio_contigs(self, include_sex_chromosomes: bool = False) -> set:
144 |         mc, fc, cc = self._get_sample_contigs()
145 | 
146 |         contig_set = mc.intersection(fc).intersection(cc)
147 | 
148 |         if include_sex_chromosomes:  # TODO: proper parametrization
149 |             if "Y" in cc:
150 |                 contig_set = contig_set.union({"X", "Y"})
151 |             elif "chrY" in cc:
152 |                 contig_set = contig_set.union({"chrX", "chrY"})
153 |             elif "X" in cc:
154 |                 contig_set = contig_set.union({"X"})
155 |             elif "chrX" in cc:
156 |                 contig_set = contig_set.union({"chrX"})
157 |         else:
158 |             contig_set = contig_set.difference(SEX_CHROMOSOMES)
159 | 
160 |         if self._loci_dict:
161 |             # Limit contig set to only contigs which are in the locus dictionary if one is specified.
162 |             contig_set = contig_set.intersection(self._loci_dict.keys())
163 | 
164 |         self._logger.debug("Got %d intersection trio contigs", len(contig_set))
165 | 
166 |         return contig_set
167 | 
168 |     @abstractmethod
169 |     def calculate_contig(self, contig: str) -> MIContigResult:
170 |         return MIContigResult(contig)
171 | 
172 |     @staticmethod
173 |     def _updated_mi_res(res: float | None, v: int | float | None) -> float | None:
174 |         return None if v is None else ((res or 0) + v)
175 | 
176 |     def calculate(self, included_contigs: set) -> MIResult | None:
177 |         # copy number
178 |         res: float = 0
179 |         res_pm1: float = 0
180 |         res_95_ci: float | None = None
181 |         res_99_ci: float | None = None
182 |         # sequence
183 |         res_seq: float | None = None
184 |         res_sl: float | None = None
185 |         res_sl_pm1: float | None = None
186 | 
187 |         n_total: int = 0
188 | 
189 |         contig_results = []
190 |         output_loci = []
191 | 
192 |         for contig in sorted(included_contigs):
193 |             self._logger.info("Processing contig %s", contig)
194 | 
195 |             contig_result = self.calculate_contig(contig)
196 |             contig_results.append(contig_result)
197 | 
198 |             r, nm = contig_result.process_loci(
199 |                 mismatch_out_mi=self._mismatch_out_mi, calculate_non_matching=self.test_to_perform == "none"
200 |             )
201 | 
202 |             value_95_ci = r["ci_95"]
203 |             value_99_ci = r["ci_99"]
204 |             value_seq = r["seq"]
205 |             value_sl = r["sl"]
206 |             value_sl_pm1 = r["sl_pm1"]
207 | 
208 |             res += r["strict"]
209 |             res_pm1 += r["pm1"]
210 |             res_95_ci = self._updated_mi_res(res_95_ci, value_95_ci)
211 |             res_99_ci = self._updated_mi_res(res_99_ci, value_99_ci)
212 |             res_seq = self._updated_mi_res(res_seq, value_seq)
213 |             res_sl = self._updated_mi_res(res_sl, value_sl)
214 |             res_sl_pm1 = self._updated_mi_res(res_sl_pm1, value_sl_pm1)
215 | 
216 |             n_total += len(contig_result)
217 |             output_loci.extend(nm)
218 | 
219 |             logger_fmt = "Finished processing contig %s; n_total=%d. Current value: %.2f%%, ±1: %.2f%%"
220 |             logger_args = [contig_result.contig, n_total, res / n_total * 100, res_pm1 / n_total * 100]
221 | 
222 |             extras = (
223 |                 (res_95_ci, "95%% CI"),
224 |                 (res_99_ci, "99%% CI"),
225 |                 (res_seq, "seq"),
226 |                 (res_sl, "s.l."),
227 |                 (res_sl_pm1, "s.l.±1"),
228 |             )
229 | 
230 |             for val, fmt_txt in extras:
231 |                 if val is not None:
232 |                     logger_fmt += f", {fmt_txt}: %.2f%%"
233 |                     logger_args.append(val / n_total * 100)
234 | 
235 |             self._logger.info(logger_fmt, *logger_args)
236 | 
237 |         if n_total == 0:
238 |             self._logger.warning("No common loci found")
239 |             return None
240 | 
241 |         res /= n_total
242 |         res_pm1 /= n_total
243 |         res_95_ci = None if res_95_ci is None else (res_95_ci / n_total)
244 |         res_99_ci = None if res_99_ci is None else (res_99_ci / n_total)
245 |         res_seq = None if res_seq is None else (res_seq / n_total)
246 |         res_sl = None if res_sl is None else (res_sl / n_total)
247 |         res_sl_pm1 = None if res_sl is None else (res_sl_pm1 / n_total)
248 | 
249 |         mi_res = MIResult(
250 |             {
251 |                 "strict": res,
252 |                 "pm1": res_pm1,
253 |                 "ci_95": res_95_ci,
254 |                 "ci_99": res_99_ci,
255 |                 "seq": res_seq,
256 |                 "sl": res_sl,
257 |                 "sl_pm1": res_sl_pm1,
258 |             },
259 |             contig_results,
260 |             output_loci,
261 |             self._widen,
262 |             self.test_to_perform,
263 |             self.sig_level,
264 |             self.mt_corr,
265 |             logger=self._logger,
266 |         )
267 | 
268 |         if self.test_to_perform != "none":
269 |             mi_res.correct_for_multiple_testing()  # Also calculates new output loci
270 | 
271 |         return mi_res
272 | 


--------------------------------------------------------------------------------
/strkit/mi/expansionhunter.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import pysam
 4 | 
 5 | from .base import BaseCalculator
 6 | from .result import MIContigResult, MILocusData
 7 | from .vcf_utils import VCFCalculatorMixin
 8 | from ..utils import parse_cis
 9 | 
10 | __all__ = ["ExpansionHunterCalculator"]
11 | 
12 | 
13 | def _parse_allele(a: int | str | None) -> int | None:
14 |     if isinstance(a, str):
15 |         if a == ".":
16 |             return None
17 |         return int(a)
18 |     return a
19 | 
20 | 
21 | def _unzip_gt(vals) -> tuple[tuple[int | float | None, ...], tuple[int | float | None, ...]]:
22 |     try:
23 |         return (_parse_allele(vals[0][0]), _parse_allele(vals[1][0])), parse_cis((vals[0][1], vals[1][1]))
24 |     except ValueError:
25 |         return (None, None), (None, None)
26 | 
27 | 
28 | class ExpansionHunterCalculator(BaseCalculator, VCFCalculatorMixin):
29 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
30 |         return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
31 | 
32 |     def calculate_contig(self, contig: str) -> MIContigResult:
33 |         cr = MIContigResult(contig, includes_95_ci=True)
34 | 
35 |         mvf = pysam.VariantFile(str(self._mother_call_file))
36 |         fvf = pysam.VariantFile(str(self._father_call_file))
37 |         cvf = pysam.VariantFile(str(self._child_call_file))
38 | 
39 |         # We want all common loci, so loop through the child and then look for the loci in the parent calls
40 |         # TODO: What to do about filtering etc? !!!!!!!!!!!!!!!!!!!!!!!!
41 |         #  !!!!!!!!!!!!!!!!
42 |         #  - Q score
43 |         #  - CIs are "proper" - not inverted or weird
44 | 
45 |         for cv in cvf.fetch(contig):
46 |             mv = next(mvf.fetch(contig, cv.start, cv.stop), None)
47 |             fv = next(fvf.fetch(contig, cv.start, cv.stop), None)
48 | 
49 |             # TODO: Handle sex chromosomes
50 | 
51 |             k = (contig, cv.start, cv.stop)
52 | 
53 |             if self.should_skip_locus(*k):
54 |                 continue
55 | 
56 |             cr.seen_locus(*k)
57 | 
58 |             if mv is None or fv is None:
59 |                 # Variant isn't found in at least one of the parents, so we can't do anything with it.
60 |                 # TODO: We need to actually check calls, and check with sample ID, not just assume
61 |                 continue
62 | 
63 |             # TODO: Handle missing samples gracefully
64 |             # TODO: Handle wrong formatted VCFs gracefully
65 | 
66 |             cs = cv.samples[self._child_id or 0]
67 |             ms = mv.samples[self._mother_id or 0]
68 |             fs = fv.samples[self._father_id or 0]
69 | 
70 |             cs_reps = tuple(sorted(zip(cs["REPCN"].split("/"), cs["REPCI"].split("/")), key=lambda x: x[0]))
71 |             ms_reps = tuple(sorted(zip(ms["REPCN"].split("/"), ms["REPCI"].split("/")), key=lambda x: x[0]))
72 |             fs_reps = tuple(sorted(zip(fs["REPCN"].split("/"), fs["REPCI"].split("/")), key=lambda x: x[0]))
73 | 
74 |             c_gt, c_gt_95_ci = _unzip_gt(cs_reps)
75 |             m_gt, m_gt_95_ci = _unzip_gt(ms_reps)
76 |             f_gt, f_gt_95_ci = _unzip_gt(fs_reps)
77 | 
78 |             if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None:
79 |                 # None call in VCF, skip this call
80 |                 continue
81 | 
82 |             cr.append(MILocusData(
83 |                 contig=contig,
84 |                 start=cv.start,
85 |                 end=cv.stop,
86 |                 motif=cv.info["RU"],
87 | 
88 |                 child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
89 |                 child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
90 | 
91 |                 reference_copies=cv.info["REF"],
92 |             ))
93 | 
94 |         return cr
95 | 


--------------------------------------------------------------------------------
/strkit/mi/gangstr.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import pysam
 4 | 
 5 | from .base import BaseCalculator
 6 | from .result import MIContigResult, MILocusData
 7 | from .vcf_utils import VCFCalculatorMixin
 8 | from ..utils import parse_cis
 9 | 
10 | __all__ = ["GangSTRCalculator"]
11 | 
12 | 
13 | class GangSTRCalculator(BaseCalculator, VCFCalculatorMixin):
14 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
15 |         return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
16 | 
17 |     def calculate_contig(self, contig: str) -> MIContigResult:
18 |         cr = MIContigResult(contig, includes_95_ci=True)
19 | 
20 |         mvf = pysam.VariantFile(str(self._mother_call_file))
21 |         fvf = pysam.VariantFile(str(self._father_call_file))
22 |         cvf = pysam.VariantFile(str(self._child_call_file))
23 | 
24 |         # We want all common loci, so loop through the child and then look for the loci in the parent calls
25 |         # TODO: What to do about filtering etc? !!!!!!!!!!!!!!!!!!!!!!!!
26 |         #  !!!!!!!!!!!!!!!!
27 |         #  - Q score
28 |         #  - CIs are "proper" - not inverted or weird
29 | 
30 |         for cv in cvf.fetch(contig):
31 |             mv = next(mvf.fetch(contig, cv.start, cv.stop), None)
32 |             fv = next(fvf.fetch(contig, cv.start, cv.stop), None)
33 | 
34 |             # TODO: Handle sex chromosomes
35 | 
36 |             # Check to make sure call is present in TRF BED file, if it is specified
37 |             k1 = (contig, cv.start, cv.stop)
38 |             k2 = (contig, cv.start + 1, cv.stop + 1)
39 | 
40 |             if self.should_skip_locus(*k1) or self.should_skip_locus(*k2):
41 |                 continue
42 | 
43 |             cr.seen_locus(*k1)
44 | 
45 |             if mv is None or fv is None:
46 |                 # Variant isn't found in at least one of the parents, so we can't do anything with it.
47 |                 # TODO: We need to actually check calls, and check with sample ID, not just assume
48 |                 continue
49 | 
50 |             # TODO: Handle missing samples gracefully
51 |             # TODO: Handle wrong formatted VCFs gracefully
52 | 
53 |             cs = cv.samples[self._child_id or 0]
54 |             ms = mv.samples[self._mother_id or 0]
55 |             fs = fv.samples[self._father_id or 0]
56 | 
57 |             c_gt = cs["REPCN"]
58 |             m_gt = ms["REPCN"]
59 |             f_gt = fs["REPCN"]
60 | 
61 |             try:
62 |                 c_gt_95_ci = parse_cis(cs["REPCI"])
63 |                 m_gt_95_ci = parse_cis(ms["REPCI"])
64 |                 f_gt_95_ci = parse_cis(fs["REPCI"])
65 |             except (ValueError, TypeError):
66 |                 # None call in VCF, skip this call
67 |                 continue
68 | 
69 |             if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None:
70 |                 # None call in VCF, skip this call
71 |                 continue
72 | 
73 |             cr.append(MILocusData(
74 |                 contig=contig,
75 |                 start=cv.start,
76 |                 end=cv.stop,
77 |                 motif=cv.info["RU"],
78 | 
79 |                 child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
80 |                 child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
81 | 
82 |                 reference_copies=cv.info["REF"],
83 |             ))
84 | 
85 |         return cr
86 | 


--------------------------------------------------------------------------------
/strkit/mi/generic_vcf.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import pysam
  4 | 
  5 | from .base import BaseCalculator
  6 | from .result import MIContigResult, MILocusData
  7 | from .vcf_utils import VCFCalculatorMixin
  8 | 
  9 | __all__ = ["GenericVCFLengthCalculator"]
 10 | 
 11 | 
 12 | class GenericVCFLengthCalculator(BaseCalculator, VCFCalculatorMixin):
 13 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
 14 |         contigs = self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
 15 |         self._logger.debug(
 16 |             "Got trio contigs - child: %d, mother: %d, father: %d",
 17 |             len(contigs[2]), len(contigs[0]), len(contigs[1]),
 18 |         )
 19 |         return contigs
 20 | 
 21 |     def calculate_contig(self, contig: str) -> MIContigResult:
 22 |         cr = MIContigResult(contig, includes_seq=True)
 23 | 
 24 |         mvf = pysam.VariantFile(str(self._mother_call_file))
 25 |         fvf = pysam.VariantFile(str(self._father_call_file))
 26 |         cvf = pysam.VariantFile(str(self._child_call_file))
 27 | 
 28 |         # We want all common loci, so loop through the child and then look for the loci in the parent calls
 29 | 
 30 |         for cv in cvf.fetch(contig):
 31 |             # child variant start/end, as determined by the reference allele sequence
 32 |             cv_start = cv.start
 33 |             cv_stop = cv.stop
 34 | 
 35 |             # hack for LongTR: if we override start/end in INFO, use those values as the true start/end in the context
 36 |             # of the locus boundaries
 37 |             if "START" in cv.info:
 38 |                 cv_start = int(cv.info["START"]) - 1
 39 |                 if "END" in cv.info:
 40 |                     cv_stop = int(cv.info["END"])
 41 | 
 42 |             mv = next(mvf.fetch(contig, cv_start, cv_stop), None)
 43 |             fv = next(fvf.fetch(contig, cv_start, cv_stop), None)
 44 | 
 45 |             # TODO: Handle sex chromosomes
 46 | 
 47 |             k = (contig, cv_start, cv_stop)
 48 | 
 49 |             overlapping = self.get_loci_overlapping(k[0], k[1], k[2], True)
 50 | 
 51 |             if r := self.should_skip_locus(k[0], k[1], k[2], cached_overlapping=overlapping):
 52 |                 self._logger.debug(f"Skipping locus {k}: {r}")
 53 |                 continue
 54 | 
 55 |             cr.seen_locus(*k)
 56 | 
 57 |             if mv is None or fv is None:
 58 |                 # Variant isn't found in at least one of the parents, so we can't do anything with it.
 59 |                 # TODO: We need to actually check calls, and check with sample ID, not just assume
 60 |                 self._logger.debug(f"Skipping locus {k}: mv or fv is None")
 61 |                 continue
 62 | 
 63 |             # TODO: Handle missing samples gracefully
 64 |             # TODO: Handle wrong formatted VCFs gracefully
 65 | 
 66 |             # Need to dig up original motif from the locus file - thus, the original locus file is required.
 67 |             motif: str = overlapping[0][-1][0]
 68 |             if not motif:
 69 |                 self._logger.debug(f"Skipping locus {k}: motif is false-y")
 70 |                 continue
 71 | 
 72 |             motif_len = len(motif)
 73 | 
 74 |             cs = cv.samples[self._child_id or 0]
 75 |             ms = mv.samples[self._mother_id or 0]
 76 |             fs = fv.samples[self._father_id or 0]
 77 | 
 78 |             c_seq_gt = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) if None not in cs["GT"] else None
 79 |             c_gt = tuple(round(len(a) / motif_len) for a in c_seq_gt) if c_seq_gt is not None else None
 80 |             m_seq_gt = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) if None not in ms["GT"] else None
 81 |             m_gt = tuple(round(len(a) / motif_len) for a in m_seq_gt) if m_seq_gt is not None else None
 82 |             f_seq_gt = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) if None not in fs["GT"] else None
 83 |             f_gt = tuple(round(len(a) / motif_len) for a in f_seq_gt) if f_seq_gt is not None else None
 84 | 
 85 |             if c_gt is None or m_gt is None or f_gt is None:
 86 |                 # None call in VCF, skip this call
 87 |                 continue
 88 | 
 89 |             cr.append(MILocusData(
 90 |                 contig=contig,
 91 |                 start=cv_start,
 92 |                 end=cv_stop,
 93 |                 motif=motif,
 94 | 
 95 |                 child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
 96 | 
 97 |                 # sequence may not line up with start/end if VCF record INFO START/END entries are used
 98 |                 child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt,
 99 |             ))
100 | 
101 |         return cr
102 | 


--------------------------------------------------------------------------------
/strkit/mi/intervals.py:
--------------------------------------------------------------------------------
  1 | import bisect
  2 | from pathlib import Path
  3 | from typing import Iterable
  4 | 
  5 | from strkit.utils import idx_0_getter, idx_1_getter
  6 | 
  7 | 
  8 | def _line_filter_fn(s: str) -> bool:
  9 |     """
 10 |     Filter function to skip blank lines and comments
 11 |     :param s: line of a file
 12 |     :return: whether the line is not blank and is not a comment
 13 |     """
 14 |     return s and not s.startswith("#")
 15 | 
 16 | 
 17 | # key: contig, value: dict of (key: coordinate interval, value: list of extra values)
 18 | LociDictOfDict = dict[str, dict[tuple[int, int], list[str]]]
 19 | 
 20 | # key: contig, value: list of coordinate intervals
 21 | LociDictOfList = dict[str, list[tuple[int, int]]]
 22 | 
 23 | 
 24 | def build_loci_dict_of_dict_from_file(loci_path: str | Path | None, one_based: bool) -> LociDictOfDict:
 25 |     # Assumes standard BED format - 0-based, half-open intervals, unless one_based=True,
 26 |     # in which case assume 1-based closed intervals and adjust to be 0-based half-closed.
 27 | 
 28 |     if not loci_path:
 29 |         return {}
 30 | 
 31 |     start_adj = -1 * int(one_based)  # -1 if converting from 1-based closed to 0-based half-open, otherwise do nothing.
 32 | 
 33 |     res: LociDictOfDict = {}
 34 | 
 35 |     with open(loci_path, "r") as lf:
 36 |         for line in filter(_line_filter_fn, map(str.strip, lf)):
 37 |             ls = line.split("\t")
 38 | 
 39 |             contig, ss, es = ls[:3]
 40 | 
 41 |             if contig not in res:
 42 |                 res[contig] = {}
 43 | 
 44 |             res[contig][int(ss) + start_adj, int(es)] = ls[3:]
 45 | 
 46 |     return res
 47 | 
 48 | 
 49 | def build_loci_dict_of_list_from_file(loci_path: str | Path | None, one_based: bool) -> LociDictOfList:
 50 |     # Assumes standard BED format - 0-based, half-open intervals, unless one_based=True,
 51 |     # in which case assume 1-based closed intervals and adjust to be 0-based half-closed.
 52 | 
 53 |     if not loci_path:
 54 |         return {}
 55 | 
 56 |     start_adj = -1 * int(one_based)  # -1 if converting from 1-based closed to 0-based half-open, otherwise do nothing.
 57 | 
 58 |     res: dict[str, list[tuple[int, int]]] = {}
 59 | 
 60 |     with open(loci_path, "r") as lf:
 61 |         for line in filter(_line_filter_fn, map(str.strip, lf)):
 62 |             ls = line.split("\t")
 63 | 
 64 |             contig, ss, es = ls[:3]
 65 | 
 66 |             if contig not in res:
 67 |                 res[contig] = []
 68 | 
 69 |             res[contig].append((int(ss) + start_adj, int(es)))
 70 | 
 71 |     return res
 72 | 
 73 | 
 74 | _overlapping_dict_cache = {}
 75 | 
 76 | 
 77 | def overlapping_loci_dict_of_dict(
 78 |     contig: str, start: int, end: int, d: LociDictOfDict, first_only: bool = False, dict_cache_key: str | None = None
 79 | ) -> list[tuple[int, int, list[str]]]:
 80 |     if contig not in d:
 81 |         return []
 82 | 
 83 |     global _overlapping_dict_cache
 84 | 
 85 |     full_cache_key = f"{dict_cache_key}--{contig}"
 86 | 
 87 |     if full_cache_key in _overlapping_dict_cache:
 88 |         c_dict, c_keys, c_lhs = _overlapping_dict_cache[full_cache_key]
 89 |     else:
 90 |         c_dict = d[contig]
 91 |         c_keys = tuple(c_dict.keys())
 92 |         c_lhs = tuple(map(lambda k: k[0], c_keys))
 93 |         if full_cache_key is not None:
 94 |             _overlapping_dict_cache[full_cache_key] = c_dict, c_keys, c_lhs
 95 | 
 96 |     i = bisect.bisect_left(c_lhs, end)  # use _left since end is exclusive
 97 | 
 98 |     # now sort by [1] (possible overlap end), which should be (almost!) sorted already.
 99 |     # then, we can get only entries where start < ov[1] via bisect (finding ov[1] <= start and skipping them).
100 |     possible_overlaps = sorted(c_keys[:i], key=idx_1_getter)
101 |     j = bisect.bisect_right(possible_overlaps, start, key=idx_1_getter)  # bisect right because exclusive
102 |     possible_overlaps = possible_overlaps[j:]
103 | 
104 |     acc: list[tuple[int, int, list[str]]] = []
105 | 
106 |     for ov in possible_overlaps:
107 |         acc.append((ov[0], ov[1], c_dict[ov]))
108 |         if first_only:
109 |             break
110 | 
111 |     return sorted(acc, key=idx_0_getter)
112 | 
113 | 
114 | def overlapping_loci_dict_of_list(
115 |     contig: str, start: int, end: int, d: LociDictOfList, first_only: bool
116 | ) -> Iterable[tuple[int, int]]:
117 |     if contig not in d:
118 |         yield from ()
119 |         return
120 | 
121 |     c_ints = d[contig]
122 |     c_lhs = tuple(map(lambda k: k[0], c_ints))
123 |     i = bisect.bisect_left(c_lhs, end)  # use _left since end is exclusive
124 | 
125 |     for ov in c_ints[:i]:
126 |         if start < ov[1]:
127 |             yield ov[0], ov[1]
128 |             if first_only:
129 |                 break
130 | 


--------------------------------------------------------------------------------
/strkit/mi/repeathmm.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from .base import BaseCalculator
 4 | from .result import MIContigResult, MILocusData
 5 | from ..utils import int_tuple
 6 | 
 7 | __all__ = [
 8 |     "RepeatHMMCalculator",
 9 | ]
10 | 
11 | 
12 | class RepeatHMMCalculator(BaseCalculator):
13 |     @staticmethod
14 |     def get_contigs_from_fh(fh) -> set:
15 |         return {ls[0] for ls in (line.split(":") for line in fh)}
16 | 
17 |     @staticmethod
18 |     def make_calls_dict(ph, contig):
19 |         return {
20 |             tuple(k.split(":")): int_tuple(v.split("/"))
21 |             for k, v in (pv.split() for pv in ph)
22 |             if k.split(":")[0] == contig
23 |         }
24 | 
25 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
26 |         with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \
27 |                 open(self._child_call_file, "r") as cvf:
28 | 
29 |             mc = self.get_contigs_from_fh(mvf)
30 |             fc = self.get_contigs_from_fh(fvf)
31 |             cc = self.get_contigs_from_fh(cvf)
32 | 
33 |             return mc, fc, cc
34 | 
35 |     def calculate_contig(self, contig: str) -> MIContigResult:
36 |         cr = MIContigResult(contig)
37 | 
38 |         with open(self._mother_call_file) as mh:
39 |             mother_calls = self.make_calls_dict(mh, contig)
40 | 
41 |         with open(self._father_call_file) as fh:
42 |             father_calls = self.make_calls_dict(fh, contig)
43 | 
44 |         with open(self._child_call_file) as ch:
45 |             for cv in ch:
46 |                 locus_data, call = cv.strip().split(" ")
47 |                 lookup = tuple(locus_data.split(":"))
48 | 
49 |                 if lookup[0] != contig:
50 |                     continue
51 | 
52 |                 locus_start: int = int(lookup[1])
53 |                 locus_end: int = int(lookup[2])
54 | 
55 |                 k = (contig, locus_start, locus_end)
56 | 
57 |                 # Check to make sure call is present in TRF BED file, if it is specified
58 |                 if self.should_skip_locus(*k):
59 |                     continue
60 | 
61 |                 cr.seen_locus(*k)
62 | 
63 |                 # Check to make sure call is present in all trio individuals
64 |                 if lookup not in mother_calls or lookup not in father_calls:
65 |                     continue
66 | 
67 |                 c_gt = int_tuple(call.split("/"))
68 |                 m_gt = mother_calls[lookup]
69 |                 f_gt = father_calls[lookup]
70 | 
71 |                 # Failed calls from RepeatHMM seem to be represented as 0/0, so skip this
72 |                 # TODO… Need to decide if we actually want to include these?
73 |                 #  or at least somehow record them
74 |                 if (0, 0) in (c_gt, m_gt, f_gt):
75 |                     continue
76 | 
77 |                 # TODO: Include ref copies... should be in file somewhere?
78 |                 cr.append(MILocusData(
79 |                     lookup[0],
80 |                     locus_start,
81 |                     locus_end,
82 |                     lookup[3],
83 | 
84 |                     child_gt=int_tuple(call.split("/")),
85 |                     mother_gt=mother_calls[lookup],
86 |                     father_gt=father_calls[lookup],
87 | 
88 |                     logger=self._logger,
89 |                 ))
90 | 
91 |         return cr
92 | 


--------------------------------------------------------------------------------
/strkit/mi/straglr.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from .base import BaseCalculator
  4 | from .result import MILocusData, MIContigResult
  5 | 
  6 | __all__ = [
  7 |     "StraglrCalculator",
  8 | ]
  9 | 
 10 | 
 11 | class StraglrCalculator(BaseCalculator):
 12 |     @staticmethod
 13 |     def get_contigs_from_fh(fh) -> set:
 14 |         return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))}
 15 | 
 16 |     def make_calls_dict(self, ph, contig, cr: MIContigResult | None = None):
 17 |         # For reference, dicts are ordered in Python 3.7+ (guaranteed)
 18 | 
 19 |         calls = {}
 20 | 
 21 |         for pv in ph:
 22 |             if pv.startswith("#"):
 23 |                 continue
 24 | 
 25 |             line = pv.strip().split("\t")
 26 | 
 27 |             if line[0] != contig:
 28 |                 if calls:
 29 |                     # assume ordered BED; break after we've collected all calls for the contig
 30 |                     break
 31 |                 continue
 32 | 
 33 |             locus = tuple(line[:3])
 34 | 
 35 |             k = (line[0], int(line[1]), int(line[2]))
 36 | 
 37 |             overlapping = self.get_loci_overlapping(k[0], k[1], k[2], True)
 38 | 
 39 |             if r := self.should_skip_locus(k[0], k[1], k[2], cached_overlapping=overlapping):
 40 |                 self._logger.debug(f"Skipping locus {k}: {r}")
 41 |                 continue
 42 | 
 43 |             if cr:
 44 |                 cr.seen_locus(*k)
 45 | 
 46 |             orig_motif: str = overlapping[0][-1][0]
 47 |             if not orig_motif:  # false-y/blank
 48 |                 self._logger.debug(f"Skipping locus {k}: motif is false-y")
 49 |                 continue
 50 | 
 51 |             # Transform the genotypes into something that is consistent across individuals,
 52 |             # using the file with the list of loci.
 53 |             gt_fact = len(line[3]) / len(orig_motif)
 54 | 
 55 |             gt = tuple(float(g.split("(")[0]) * gt_fact for g in line[4].split(";"))
 56 |             if len(gt) == 1:  # If it's homozygous, expand it out to length 2
 57 |                 gt = gt + gt
 58 | 
 59 |             calls[locus + (orig_motif,)] = gt
 60 | 
 61 |         return calls
 62 | 
 63 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
 64 |         with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \
 65 |                 open(self._child_call_file, "r") as cvf:
 66 | 
 67 |             mc = self.get_contigs_from_fh(mvf)
 68 |             fc = self.get_contigs_from_fh(fvf)
 69 |             cc = self.get_contigs_from_fh(cvf)
 70 | 
 71 |             return mc, fc, cc
 72 | 
 73 |     def calculate_contig(self, contig: str):
 74 |         cr = MIContigResult(contig)
 75 | 
 76 |         with open(self._mother_call_file, "r") as mh:
 77 |             mother_calls = self.make_calls_dict(mh, contig)
 78 | 
 79 |         with open(self._father_call_file, "r") as fh:
 80 |             father_calls = self.make_calls_dict(fh, contig)
 81 | 
 82 |         with open(self._child_call_file, "r") as ch:
 83 |             child_calls = self.make_calls_dict(ch, contig, cr)
 84 | 
 85 |         for locus_data, c_gt in child_calls.items():
 86 |             # Check to make sure call is present in all trio individuals
 87 |             if locus_data not in mother_calls or locus_data not in father_calls:
 88 |                 continue
 89 | 
 90 |             cr.append(MILocusData(
 91 |                 contig=locus_data[0],
 92 |                 start=int(locus_data[1]),
 93 |                 end=int(locus_data[2]),
 94 |                 motif=locus_data[3],
 95 | 
 96 |                 child_gt=c_gt,
 97 |                 mother_gt=mother_calls[locus_data],
 98 |                 father_gt=father_calls[locus_data],
 99 | 
100 |                 decimal=True,
101 |             ))
102 | 
103 |         return cr
104 | 


--------------------------------------------------------------------------------
/strkit/mi/strkit.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import numpy as np
  4 | 
  5 | from pysam import VariantFile
  6 | from pysam.libcbcf import VariantRecordSample
  7 | 
  8 | from strkit.json import json
  9 | 
 10 | from .base import BaseCalculator
 11 | from .result import MIContigResult, MILocusData
 12 | from .vcf_utils import VCFCalculatorMixin
 13 | from ..utils import int_tuple, parse_cis
 14 | 
 15 | __all__ = [
 16 |     "StrKitCalculator",
 17 |     "StrKitJSONCalculator",
 18 |     "StrKitVCFCalculator",
 19 | ]
 20 | 
 21 | 
 22 | STRKIT_TSV_CALL_INDEX = 6
 23 | STRKIT_TSV_CALL_95_CI_INDEX = 7
 24 | 
 25 | 
 26 | class StrKitCalculator(BaseCalculator):
 27 |     @staticmethod
 28 |     def get_contigs_from_fh(fh) -> set[str]:
 29 |         return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))}
 30 | 
 31 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
 32 |         with open(self._mother_call_file, "r") as mvf:
 33 |             mc = self.get_contigs_from_fh(mvf)
 34 |         with open(self._father_call_file, "r") as fvf:
 35 |             fc = self.get_contigs_from_fh(fvf)
 36 |         with open(self._child_call_file, "r") as cvf:
 37 |             cc = self.get_contigs_from_fh(cvf)
 38 |         return mc, fc, cc
 39 | 
 40 |     @staticmethod
 41 |     def make_calls_dict(ph, contig):
 42 |         return {
 43 |             tuple(line[:4]): (
 44 |                 int_tuple(line[STRKIT_TSV_CALL_INDEX].split("|")),
 45 |                 parse_cis(line[STRKIT_TSV_CALL_95_CI_INDEX].split("|")),
 46 |                 None  # parse_cis(line[-1:].split("|")),
 47 |             )
 48 |             for line in (pv.strip().split("\t") for pv in ph)
 49 |             if line[0] == contig and "." not in line[STRKIT_TSV_CALL_INDEX]
 50 |         }
 51 | 
 52 |     def calculate_contig(self, contig: str) -> MIContigResult:
 53 |         cr = MIContigResult(contig, includes_95_ci=True)
 54 | 
 55 |         with open(self._mother_call_file) as mh:
 56 |             mother_calls = self.make_calls_dict(mh, contig)
 57 | 
 58 |         self._logger.debug(f"loaded materal calls for {contig}")
 59 | 
 60 |         with open(self._father_call_file) as fh:
 61 |             father_calls = self.make_calls_dict(fh, contig)
 62 | 
 63 |         self._logger.debug(f"loaded paternal calls for {contig}")
 64 | 
 65 |         with open(self._child_call_file) as ch:
 66 |             for cv in ch:
 67 |                 locus_data = cv.strip().split("\t")
 68 | 
 69 |                 if locus_data[0] != contig:
 70 |                     continue
 71 | 
 72 |                 lookup = tuple(locus_data[:4])
 73 | 
 74 |                 start = int(locus_data[1])
 75 |                 end = int(locus_data[2])
 76 | 
 77 |                 if self.should_skip_locus(contig, start, end):
 78 |                     continue
 79 | 
 80 |                 # Check to make sure call is present in all trio individuals
 81 |                 if lookup not in mother_calls or lookup not in father_calls:
 82 |                     continue
 83 | 
 84 |                 m_gt, m_gt_95_ci, _ = mother_calls[lookup]
 85 |                 f_gt, f_gt_95_ci, _ = father_calls[lookup]
 86 | 
 87 |                 calls = locus_data[STRKIT_TSV_CALL_INDEX].split("|")
 88 | 
 89 |                 if "." in calls:
 90 |                     # Failed call
 91 |                     continue
 92 | 
 93 |                 cr.append(MILocusData(
 94 |                     contig=lookup[0],
 95 |                     start=int(lookup[1]),
 96 |                     end=int(lookup[2]),
 97 |                     motif=lookup[3],
 98 | 
 99 |                     child_gt=int_tuple(calls),
100 |                     mother_gt=m_gt,
101 |                     father_gt=f_gt,
102 | 
103 |                     child_gt_95_ci=parse_cis(locus_data[STRKIT_TSV_CALL_95_CI_INDEX].split("|")),
104 |                     mother_gt_95_ci=m_gt_95_ci,
105 |                     father_gt_95_ci=f_gt_95_ci,
106 | 
107 |                     # child_gt_99_ci=parse_cis(locus_data[-1:].split("|")),
108 |                     # mother_gt_99_ci=m_gt_99_ci,
109 |                     # father_gt_99_ci=f_gt_99_ci,
110 | 
111 |                     reference_copies=int(locus_data[4]),
112 | 
113 |                     decimal=False,
114 |                 ))
115 | 
116 |         return cr
117 | 
118 | 
119 | class StrKitJSONCalculator(BaseCalculator):
120 |     def __init__(self, *args, **kwargs):
121 |         super().__init__(*args, **kwargs)
122 | 
123 |         with open(self._mother_call_file, "r") as mvf:
124 |             self._cache["mother_data"] = json.loads(mvf.read())
125 |         with open(self._father_call_file, "r") as fvf:
126 |             self._cache["father_data"] = json.loads(fvf.read())
127 |         with open(self._child_call_file, "r") as cvf:
128 |             self._cache["child_data"] = json.loads(cvf.read())
129 | 
130 |     @staticmethod
131 |     def get_contigs_from_data(report) -> set:
132 |         if (report_contigs := report.get("contigs")) is not None:
133 |             return set(report_contigs)
134 |         return {res["contig"] for res in report["results"]}
135 | 
136 |     def _get_sample_contigs(self, include_sex_chromosomes: bool = False) -> tuple[set, set, set]:
137 |         mc = self.get_contigs_from_data(self._cache["mother_data"])
138 |         fc = self.get_contigs_from_data(self._cache["father_data"])
139 |         cc = self.get_contigs_from_data(self._cache["child_data"])
140 |         return mc, fc, cc
141 | 
142 |     @staticmethod
143 |     def get_read_counts(res: dict, dtype=int):
144 |         # TODO: This only works with diploids...
145 | 
146 |         read_cns = []
147 |         read_peaks = []
148 | 
149 |         for r in res["reads"].values():
150 |             if (peak := r.get("p")) is None:
151 |                 continue
152 |             read_cns.append(r["cn"])
153 |             read_peaks.append(peak)
154 | 
155 |         n = res["peaks"]["modal_n"]
156 | 
157 |         if (n < 2 or len(set(res["call"]))) == 1 and res.get("assign_method", "dist") == "dist":
158 |             # Split copy numbers evenly in two if we have a homozygous locus called only via distance.
159 |             rcs = np.array(read_cns, dtype=dtype)
160 |             np.random.shuffle(rcs)  # TODO: seed shuffle
161 |             part = rcs.shape[0] // 2
162 |             return tuple(rcs[:part].tolist()), tuple(rcs[part:].tolist())
163 | 
164 |         rc = []
165 |         for _ in range(n):
166 |             rc.append([])
167 |         for cn, pk in zip(read_cns, read_peaks):
168 |             rc[pk].append(cn)
169 |         return tuple(map(tuple, rc))
170 | 
171 |     @staticmethod
172 |     def make_calls_dict(report: dict, contig: str):
173 |         return {
174 |             (res["contig"], res["start"], res["end"], res["motif"]): (
175 |                 int_tuple(res["call"]),
176 |                 tuple(map(lambda x: tuple(map(int, x)), res["call_95_cis"])),
177 |                 None,  # Placeholder for 99% CI
178 |                 StrKitJSONCalculator.get_read_counts(res, dtype=int),
179 |             )
180 |             for res in report["results"]
181 |             if res["contig"] == contig and res["call"] is not None
182 |         }
183 | 
184 |     def calculate_contig(self, contig: str) -> MIContigResult:
185 |         c_report = self._cache["child_data"]
186 | 
187 |         cr = MIContigResult(contig, includes_95_ci=True)
188 | 
189 |         mother_data = self.make_calls_dict(self._cache["mother_data"], contig)
190 |         self._logger.debug(f"loaded materal calls for {contig}")
191 | 
192 |         father_data = self.make_calls_dict(self._cache["father_data"], contig)
193 |         self._logger.debug(f"loaded paternal calls for {contig}")
194 | 
195 |         for res in c_report["results"]:
196 |             if res["contig"] != contig:
197 |                 continue
198 | 
199 |             locus_start = res["start"]
200 |             locus_end = res["end"]
201 | 
202 |             lookup = (contig, locus_start, locus_end, res["motif"])
203 | 
204 |             k = (contig, int(locus_start), int(locus_end))
205 | 
206 |             # Check to make sure call is present in TRF BED file, if it is specified
207 |             if self.should_skip_locus(*k):
208 |                 continue
209 | 
210 |             cr.seen_locus(*k)
211 | 
212 |             # Check to make sure call is present in all trio individuals
213 |             if lookup not in mother_data or lookup not in father_data:
214 |                 continue
215 | 
216 |             m_gt, m_gt_95_ci, _, m_rcs = mother_data[lookup]
217 |             f_gt, f_gt_95_ci, _, f_rcs = father_data[lookup]
218 | 
219 |             if res["call"] is None:
220 |                 # Failed call
221 |                 continue
222 | 
223 |             call = int_tuple(res["call"])
224 | 
225 |             cr.append(MILocusData(
226 |                 contig=lookup[0],
227 |                 start=locus_start,
228 |                 end=locus_end,
229 |                 motif=lookup[3],
230 | 
231 |                 child_gt=int_tuple(call),
232 |                 mother_gt=m_gt,
233 |                 father_gt=f_gt,
234 | 
235 |                 child_gt_95_ci=tuple(map(lambda x: tuple(map(int, x)), res["call_95_cis"])),
236 |                 mother_gt_95_ci=m_gt_95_ci,
237 |                 father_gt_95_ci=f_gt_95_ci,
238 | 
239 |                 # child_gt_99_ci=parse_cis(locus_data[-1:].split("|")),
240 |                 # mother_gt_99_ci=m_gt_99_ci,
241 |                 # father_gt_99_ci=f_gt_99_ci,
242 | 
243 |                 child_read_counts=StrKitJSONCalculator.get_read_counts(res, dtype=int),
244 |                 mother_read_counts=m_rcs,
245 |                 father_read_counts=f_rcs,
246 | 
247 |                 reference_copies=int(res["ref_cn"]),
248 | 
249 |                 decimal=False,
250 | 
251 |                 test_to_perform=self.test_to_perform,
252 |                 sig_level=self.sig_level,
253 |             ))
254 | 
255 |         return cr
256 | 
257 | 
258 | class StrKitVCFCalculator(BaseCalculator, VCFCalculatorMixin):
259 |     def _get_sample_contigs(self, include_sex_chromosomes: bool = False) -> tuple[set, set, set]:
260 |         return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
261 | 
262 |     @staticmethod
263 |     def get_peak_cns_from_vcf_line(sample_record: VariantRecordSample):
264 |         if "MCRL" not in sample_record:
265 |             return None
266 | 
267 |         res = []
268 | 
269 |         for enc_peak in sample_record["MCRL"]:
270 |             peak = []
271 |             for cn_r in enc_peak.split("|"):
272 |                 cn, cn_c = cn_r.split("x")
273 |                 peak.extend([int(cn)] * int(cn_c))
274 | 
275 |             res.append(tuple(peak))
276 | 
277 |         if len(res) == 1:
278 |             # Split one peak into two, interleaving reads between the two peaks
279 |             return res[0][::2], res[0][1::2]
280 | 
281 |         return tuple(res)
282 | 
283 |     def calculate_contig(self, contig: str) -> MIContigResult:
284 |         cr = MIContigResult(contig, includes_95_ci=True, includes_seq=True)
285 | 
286 |         mvf = VariantFile(str(self._mother_call_file))
287 |         fvf = VariantFile(str(self._father_call_file))
288 |         cvf = VariantFile(str(self._child_call_file))
289 | 
290 |         # We want all common loci, so loop through the child and then look for the loci in the parent calls
291 | 
292 |         for cv in cvf.fetch(contig):
293 |             if cv.info["VT"] != "str":
294 |                 continue
295 | 
296 |             motif = cv.info["MOTIF"]
297 |             k = (contig, cv.start, cv.stop)
298 | 
299 |             mv = next(filter(lambda v: v.info["VT"] == "str" and v.info["MOTIF"] == motif, mvf.fetch(*k)), None)
300 |             fv = next(filter(lambda v: v.info["VT"] == "str" and v.info["MOTIF"] == motif, fvf.fetch(*k)), None)
301 | 
302 |             # TODO: Handle sex chromosomes
303 | 
304 |             # Check to make sure call is present in TRF BED file, if it is specified
305 |             if self.should_skip_locus(*k):
306 |                 continue
307 | 
308 |             cr.seen_locus(*k)
309 | 
310 |             if mv is None or fv is None:
311 |                 # Variant isn't found in at least one of the parents, so we can't do anything with it.
312 |                 # TODO: We need to actually check calls, and check with sample ID, not just assume
313 |                 continue
314 | 
315 |             # TODO: Handle missing samples gracefully
316 |             # TODO: Handle wrong formatted VCFs gracefully
317 | 
318 |             cs = cv.samples[self._child_id or 0]
319 |             ms = mv.samples[self._mother_id or 0]
320 |             fs = fv.samples[self._father_id or 0]
321 | 
322 |             try:
323 |                 c_gt = cs["MC"]
324 |                 m_gt = ms["MC"]
325 |                 f_gt = fs["MC"]
326 |             except KeyError:
327 |                 # None call in VCF, skip this call
328 |                 continue
329 | 
330 |             try:
331 |                 c_gt_95_ci = parse_cis(cs["MCCI"])
332 |                 m_gt_95_ci = parse_cis(ms["MCCI"])
333 |                 f_gt_95_ci = parse_cis(fs["MCCI"])
334 |             except (ValueError, TypeError):
335 |                 # None call in VCF, skip this call
336 |                 continue
337 | 
338 |             if c_gt[0] is None or m_gt[0] is None or f_gt[0] is None:
339 |                 # None call in VCF, skip this call
340 |                 continue
341 | 
342 |             if self._only_phased and ("PS" not in cs or "PS" not in ms or "PS" not in fs):
343 |                 # No phasing support across trio, and we're only looking at phased loci --> skip this call
344 |                 continue
345 | 
346 |             c_seq_gt = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len)) if None not in cs["GT"] else None
347 |             m_seq_gt = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len)) if None not in ms["GT"] else None
348 |             f_seq_gt = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len)) if None not in fs["GT"] else None
349 | 
350 |             cr.append(MILocusData(
351 |                 contig=contig,
352 |                 start=cv.start,
353 |                 end=cv.stop,
354 |                 motif=motif,
355 | 
356 |                 child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
357 |                 child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
358 |                 child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt,
359 | 
360 |                 reference_copies=cv.info["REFMC"],
361 | 
362 |                 # ---- for de novo mutation detection (this function returns None if MCRL is not in the VCF FORMAT for
363 |                 #      the samples; i.e., with older STRkit versions):
364 | 
365 |                 child_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(cs),
366 |                 mother_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(ms),
367 |                 father_read_counts=StrKitVCFCalculator.get_peak_cns_from_vcf_line(fs),
368 | 
369 |                 test_to_perform=self.test_to_perform,
370 |                 sig_level=self.sig_level,
371 |             ))
372 | 
373 |         return cr
374 | 


--------------------------------------------------------------------------------
/strkit/mi/tandem_genotypes.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from .base import BaseCalculator
 4 | from .result import MIContigResult, MILocusData
 5 | from ..utils import int_tuple
 6 | 
 7 | __all__ = [
 8 |     "TandemGenotypesCalculator",
 9 | ]
10 | 
11 | 
12 | class TandemGenotypesCalculator(BaseCalculator):
13 |     @staticmethod
14 |     def get_contigs_from_fh(fh) -> set[str]:
15 |         return {ls[0] for ls in (line.split("\t") for line in fh if not line.startswith("#"))}
16 | 
17 |     @staticmethod
18 |     def make_calls_dict(ph, contig):
19 |         return {
20 |             tuple(line[:4]): int_tuple(line[6:8])
21 |             for line in (pv.strip().split("\t") for pv in ph if not pv.startswith("#"))
22 |             if line[0] == contig and "." not in line[6:8]
23 |         }
24 | 
25 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
26 |         with open(self._mother_call_file, "r") as mvf, open(self._father_call_file, "r") as fvf, \
27 |                 open(self._child_call_file, "r") as cvf:
28 | 
29 |             mc = self.get_contigs_from_fh(mvf)
30 |             fc = self.get_contigs_from_fh(fvf)
31 |             cc = self.get_contigs_from_fh(cvf)
32 | 
33 |             return mc, fc, cc
34 | 
35 |     def calculate_contig(self, contig: str) -> MIContigResult:
36 |         cr = MIContigResult(contig)
37 | 
38 |         with open(self._mother_call_file) as mh:
39 |             mother_calls = self.make_calls_dict(mh, contig)
40 | 
41 |         with open(self._father_call_file) as fh:
42 |             father_calls = self.make_calls_dict(fh, contig)
43 | 
44 |         with open(self._child_call_file) as ch:
45 |             for cv in ch:
46 |                 locus_data = cv.strip().split("\t")
47 |                 lookup = tuple(locus_data[:4])
48 | 
49 |                 if locus_data[0] != contig:
50 |                     continue
51 | 
52 |                 k = (contig, int(lookup[1]), int(lookup[2]))
53 | 
54 |                 if self.should_skip_locus(*k):
55 |                     continue
56 | 
57 |                 cr.seen_locus(*k)
58 | 
59 |                 # Check to make sure call is present in all trio individuals
60 |                 if lookup not in mother_calls or lookup not in father_calls:
61 |                     continue
62 | 
63 |                 child_calls = locus_data[6:8]
64 | 
65 |                 if "." in child_calls:
66 |                     # Failed call
67 |                     continue
68 | 
69 |                 cr.append(MILocusData(
70 |                     contig=contig,
71 |                     start=k[1],
72 |                     end=k[2],
73 |                     motif=lookup[3],
74 | 
75 |                     child_gt=int_tuple(child_calls),
76 |                     mother_gt=mother_calls[lookup],
77 |                     father_gt=father_calls[lookup],
78 |                 ))
79 | 
80 |         return cr
81 | 


--------------------------------------------------------------------------------
/strkit/mi/trgt.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import pysam
  4 | 
  5 | from .base import BaseCalculator
  6 | from .result import MIContigResult, MILocusData
  7 | from .vcf_utils import VCFCalculatorMixin
  8 | from ..utils import parse_ci
  9 | 
 10 | __all__ = ["TRGTCalculator"]
 11 | 
 12 | 
 13 | def _parse_allele(a: int | str | None) -> int | None:
 14 |     if isinstance(a, str):
 15 |         if a == ".":
 16 |             return None
 17 |         return int(a)
 18 |     return a
 19 | 
 20 | 
 21 | def _unzip_gt(
 22 |     vals, motif_len: int
 23 | ) -> tuple[tuple[int, ...], tuple[tuple[int, ...], tuple[int, ...]]] | tuple[tuple[None, None], tuple[None, None]]:
 24 |     try:
 25 |         return (
 26 |             (
 27 |                 round(_parse_allele(vals[0][0]) / motif_len),
 28 |                 round(_parse_allele(vals[1][0]) / motif_len),
 29 |             ),
 30 |             (
 31 |                 tuple(map(lambda x: round(x / motif_len), parse_ci(vals[0][1]))),
 32 |                 tuple(map(lambda x: round(x / motif_len), parse_ci(vals[1][1]))),
 33 |             ),
 34 |         )
 35 |     except (ValueError, TypeError):
 36 |         return (None, None), (None, None)
 37 | 
 38 | 
 39 | class TRGTCalculator(BaseCalculator, VCFCalculatorMixin):
 40 |     def _get_sample_contigs(self) -> tuple[set, set, set]:
 41 |         return self.get_contigs_from_files(self._mother_call_file, self._father_call_file, self._child_call_file)
 42 | 
 43 |     def calculate_contig(self, contig: str) -> MIContigResult:
 44 |         cr = MIContigResult(contig, includes_95_ci=True, includes_seq=True)
 45 | 
 46 |         mvf = pysam.VariantFile(str(self._mother_call_file))
 47 |         fvf = pysam.VariantFile(str(self._father_call_file))
 48 |         cvf = pysam.VariantFile(str(self._child_call_file))
 49 | 
 50 |         # We want all common loci, so loop through the child and then look for the loci in the parent calls
 51 | 
 52 |         for cv in cvf.fetch(contig):
 53 |             mv = next(mvf.fetch(contig, cv.start, cv.stop), None)
 54 |             fv = next(fvf.fetch(contig, cv.start, cv.stop), None)
 55 | 
 56 |             # TODO: Handle sex chromosomes
 57 | 
 58 |             k = (contig, cv.start, cv.stop)
 59 | 
 60 |             if self.should_skip_locus(*k):
 61 |                 continue
 62 | 
 63 |             cr.seen_locus(*k)
 64 | 
 65 |             if mv is None or fv is None:
 66 |                 # Variant isn't found in at least one of the parents, so we can't do anything with it.
 67 |                 # TODO: We need to actually check calls, and check with sample ID, not just assume
 68 |                 continue
 69 | 
 70 |             # TODO: Handle missing samples gracefully
 71 |             # TODO: Handle wrong formatted VCFs gracefully
 72 | 
 73 |             motif = cv.info["MOTIFS"][0]
 74 | 
 75 |             cs = cv.samples[self._child_id or 0]
 76 |             ms = mv.samples[self._mother_id or 0]
 77 |             fs = fv.samples[self._father_id or 0]
 78 | 
 79 |             if None in cs["GT"] or None in ms["GT"] or None in fs["GT"]:
 80 |                 # None call in VCF, skip this call
 81 |                 continue
 82 | 
 83 |             c_gt = tuple(sorted(int(m.split("_")[0]) for m in cs["MC"]))
 84 |             m_gt = tuple(sorted(int(m.split("_")[0]) for m in ms["MC"]))
 85 |             f_gt = tuple(sorted(int(m.split("_")[0]) for m in fs["MC"]))
 86 | 
 87 |             # Uncomment to use allele length as motif copies:
 88 | 
 89 |             # cs_reps = tuple(sorted(zip(cs["AL"], cs["ALLR"]), key=lambda x: x[0]))
 90 |             # ms_reps = tuple(sorted(zip(ms["AL"], ms["ALLR"]), key=lambda x: x[0]))
 91 |             # fs_reps = tuple(sorted(zip(fs["AL"], fs["ALLR"]), key=lambda x: x[0]))
 92 |             #
 93 |             # c_gt, c_gt_95_ci = _unzip_gt(cs_reps, len(motif))
 94 |             # m_gt, m_gt_95_ci = _unzip_gt(ms_reps, len(motif))
 95 |             # f_gt, f_gt_95_ci = _unzip_gt(fs_reps, len(motif))
 96 | 
 97 |             # noinspection PyTypeChecker
 98 |             c_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((cv.alleles[g] for g in cs["GT"]), key=len))
 99 |             # noinspection PyTypeChecker
100 |             m_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((mv.alleles[g] for g in ms["GT"]), key=len))
101 |             # noinspection PyTypeChecker
102 |             f_seq_gt: tuple[str] | tuple[str, str] = tuple(sorted((fv.alleles[g] for g in fs["GT"]), key=len))
103 | 
104 |             cr.append(MILocusData(
105 |                 contig=contig,
106 |                 start=cv.start,
107 |                 end=cv.stop,
108 |                 motif=motif,
109 | 
110 |                 child_gt=c_gt, mother_gt=m_gt, father_gt=f_gt,
111 |                 # Uncomment to use allele length as motif copies 95% CI:
112 |                 # child_gt_95_ci=c_gt_95_ci, mother_gt_95_ci=m_gt_95_ci, father_gt_95_ci=f_gt_95_ci,
113 |                 child_seq_gt=c_seq_gt, mother_seq_gt=m_seq_gt, father_seq_gt=f_seq_gt,
114 |             ))
115 | 
116 |         return cr
117 | 


--------------------------------------------------------------------------------
/strkit/mi/vcf_utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import pysam
 4 | 
 5 | __all__ = ["VCFCalculatorMixin"]
 6 | 
 7 | 
 8 | class VCFCalculatorMixin:
 9 |     @staticmethod
10 |     def get_contigs_from_files(mother_call_file, father_call_file, child_call_file) -> tuple[set, set, set]:
11 |         with pysam.VariantFile(str(mother_call_file)) as mvf:
12 |             mc = set(mvf.header.contigs)
13 | 
14 |         with pysam.VariantFile(str(father_call_file)) as fvf:
15 |             fc = set(fvf.header.contigs)
16 | 
17 |         with pysam.VariantFile(str(child_call_file)) as cvf:
18 |             cc = set(cvf.header.contigs)
19 | 
20 |         return mc, fc, cc
21 | 


--------------------------------------------------------------------------------
/strkit/utils.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import math
 4 | import operator
 5 | from functools import partial
 6 | from typing import Any, Callable, Iterable
 7 | 
 8 | __all__ = [
 9 |     "cat_strs",
10 |     "is_none",
11 |     "idx_0_getter",
12 |     "idx_1_getter",
13 |     "apply_or_none",
14 |     "int_tuple",
15 |     "float_tuple",
16 |     "parse_ci",
17 |     "parse_cis",
18 |     "cis_overlap",
19 |     "sign",
20 | ]
21 | 
22 | 
23 | # index/property getters and other partials
24 | cat_strs = "".join
25 | is_none = partial(operator.is_, None)
26 | idx_0_getter = operator.itemgetter(0)
27 | idx_1_getter = operator.itemgetter(1)
28 | 
29 | 
30 | def apply_or_none(fn: Callable, x: Any) -> Any:
31 |     # Python: add any type of monad functionality challenge [IMPOSSIBLE]
32 |     return fn(x) if x is not None else None
33 | 
34 | 
35 | def int_tuple(x: Iterable) -> tuple[int, ...]:
36 |     return tuple(map(int, x))
37 | 
38 | 
39 | def float_tuple(x: Iterable) -> tuple[float, ...]:
40 |     return tuple(map(float, x))
41 | 
42 | 
43 | def parse_ci(ci: str, commas=False, dtype=int) -> tuple[int, int] | tuple[float, float]:
44 |     ci_s = ci.split("," if commas else "-")
45 |     return dtype(ci_s[0]), dtype(ci_s[1])
46 | 
47 | 
48 | def parse_cis(
49 |     cis: Iterable[str], commas=False, dtype=int
50 | ) -> tuple[tuple[int, ...], ...] | tuple[tuple[float, ...], ...]:
51 |     return tuple(map(lambda ci: parse_ci(ci, commas, dtype), cis))
52 | 
53 | 
54 | def cis_overlap(ci1, ci2) -> bool:
55 |     epsilon = -0.0001
56 | 
57 |     # []: ci1
58 |     # (): ci2
59 |     # [   (    ]   )  or  [   (    )   ]  or  (   [    )   ]  or  (   [    ]   )
60 |     # int logic: ci1[0] <= ci2[1] and ci2[0] <= ci1[1]
61 |     # float logic: lets add some epsilon to prevent little issues
62 |     return (ci2[1] - ci1[0]) > epsilon and (ci1[1] - ci2[0]) > epsilon
63 | 
64 | 
65 | def sign(x: int | float) -> int:
66 |     return round(math.copysign(1, x))
67 | 


--------------------------------------------------------------------------------
/strkit/viz/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/viz/__init__.py


--------------------------------------------------------------------------------
/strkit/viz/server.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, render_template, request, send_file
 2 | from werkzeug.exceptions import NotFound
 3 | 
 4 | __all__ = [
 5 |     "run_server",
 6 | ]
 7 | 
 8 | app = Flask(__name__)
 9 | 
10 | 
11 | @app.route("/")
12 | def browser():
13 |     return render_template(
14 |         "browser.html",
15 |         **app.config["PARAMS"])
16 | 
17 | 
18 | @app.route("/report-metadata")
19 | def get_report_metadata():
20 |     return {k: v for k, v in app.config["CALL_REPORT"].items() if k != "results"}
21 | 
22 | 
23 | @app.route("/params")
24 | def get_params():
25 |     return {
26 |         "cmd": app.config["PARAMS"],
27 |         "report": app.config["CALL_REPORT"]["parameters"],
28 |     }
29 | 
30 | 
31 | @app.route("/loci")
32 | def get_loci():
33 |     cr = app.config["CALL_REPORT"]
34 |     ecd = list(enumerate(cr["results"]))  # TODO: cache
35 | 
36 |     q = request.args.get("q", "").strip()
37 |     if q:
38 |         res = list(filter(lambda x: q.lower() in f"{x[1]['contig']}:{x[1]['start']}-{x[1]['end']}", ecd))  # TODO
39 |     else:
40 |         # TODO: nicer priority
41 |         res = ecd[:10]
42 | 
43 |     return {
44 |         "results": list(map(
45 |             lambda x: {
46 |                 "i": x[0],
47 |                 "contig": x[1]["contig"],
48 |                 "start": x[1]["start"],
49 |                 "end": x[1]["end"],
50 |                 "disabled": x[1]["call"] is None,
51 |             },
52 |             res)),
53 |     }
54 | 
55 | 
56 | @app.route("/call_data/<int:i>")
57 | def get_call_data(i: int):
58 |     cr = app.config["CALL_REPORT"]
59 |     cr_res = cr["results"]
60 |     if i < 0 or i > len(cr_res) - 1:
61 |         raise NotFound()
62 |     return cr_res[i]
63 | 
64 | 
65 | # @app.route("/ref")
66 | # def get_ref_file():
67 | #     return send_file(app.config["PARAMS"]["ref"], conditional=True)
68 | #
69 | #
70 | # @app.route("/ref_index")
71 | # def get_ref_index_file():
72 | #     return send_file(app.config["PARAMS"]["ref_index"], conditional=True)
73 | 
74 | 
75 | @app.route("/align_file")
76 | def get_align_file():
77 |     return send_file(app.config["PARAMS"]["align_file"], conditional=True)
78 | 
79 | 
80 | @app.route("/align_index")
81 | def get_align_index_file():
82 |     return send_file(app.config["PARAMS"]["align_index"], conditional=True)
83 | 
84 | 
85 | def run_server(call_report, **kwargs):
86 |     app.config.from_mapping(dict(CALL_REPORT=call_report, PARAMS=kwargs))
87 |     app.run(host="localhost", port=5011, debug=True)
88 | 


--------------------------------------------------------------------------------
/strkit/viz/static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davidlougheed/strkit/90f07d4bb8e27b67a1a1c9afafa4f4b1412b171a/strkit/viz/static/logo.png


--------------------------------------------------------------------------------
/tests/data/test_loci.bed:
--------------------------------------------------------------------------------
1 | chr1	200	300	ACAA
2 | chr1	300	400	GA
3 | chr1	350	450	GAGA
4 | chr2	100	200	CAG
5 | 


--------------------------------------------------------------------------------
/tests/test_caller_locus_validation.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from strkit.call.validation import LocusValidationError, valid_motif, validate_locus
 3 | 
 4 | 
 5 | @pytest.mark.parametrize("motif,valid", [
 6 |     ("CAG", True),
 7 |     ("CAGN", True),
 8 |     ("CAGX", False),
 9 |     ("(CAG)n", False),
10 |     ("XX", False),
11 | ])
12 | def test_valid_motif(motif, valid):
13 |     assert valid_motif(motif) == valid
14 | 
15 | 
16 | def test_validate_locus():
17 |     with pytest.raises(LocusValidationError):
18 |         # start > end, invalid
19 |         validate_locus(1, 1000, 500, "CAG")
20 | 
21 |     with pytest.raises(LocusValidationError):
22 |         # start == end, invalid
23 |         validate_locus(1, 1000, 1000, "CAG")
24 | 
25 |     with pytest.raises(LocusValidationError):
26 |         # invalid motif
27 |         validate_locus(1, 1000, 1200, "(CAG)n")
28 | 


--------------------------------------------------------------------------------
/tests/test_caller_utils.py:
--------------------------------------------------------------------------------
 1 | from strkit.call.utils import find_pair_by_ref_pos, normalize_contig
 2 | 
 3 | #         A  A       T  T       C  G       C  C       C  C       A  A       A  A       A  C
 4 | PAIRS = [(0, 1000), (1, 1001), (2, 1003), (3, 1004), (4, 1005), (5, 1006), (6, 1008), (7, 1009)]
 5 | SNVS = ((1003, "C"), (1009, "A"))
 6 | PAIRS_Q = list(p[0] for p in PAIRS)
 7 | PAIRS_R = list(p[1] for p in PAIRS)
 8 | 
 9 | 
10 | def test_find_pair_by_ref_pos():
11 |     assert find_pair_by_ref_pos(PAIRS_R, 1004) == (3, True)
12 |     assert find_pair_by_ref_pos(PAIRS_R, 1007) == (6, False)
13 | 
14 | 
15 | def test_normalize_contig():
16 |     assert normalize_contig("chr5", True) == "chr5"
17 |     assert normalize_contig("5", True) == "chr5"
18 |     assert normalize_contig("X", True) == "chrX"
19 |     assert normalize_contig("chr5", False) == "5"
20 |     assert normalize_contig("chrX", False) == "X"
21 | 


--------------------------------------------------------------------------------
/tests/test_iupac.py:
--------------------------------------------------------------------------------
 1 | from strkit.iupac import get_iupac_code_for_nt_set
 2 | 
 3 | 
 4 | def test_get_iupac_code():
 5 |     assert get_iupac_code_for_nt_set({"A", "T"}) == "W"
 6 |     assert get_iupac_code_for_nt_set({"A", "C", "G", "T"}) == "N"
 7 |     assert get_iupac_code_for_nt_set({"A", "T", "C", "G"}) == "N"
 8 |     assert get_iupac_code_for_nt_set({"A", "T", "C"}) == "H"
 9 |     assert get_iupac_code_for_nt_set({"A", "T", "C", "Z"}) is None
10 |     assert get_iupac_code_for_nt_set({"A", "T", "C", ":)"}) is None
11 |     assert get_iupac_code_for_nt_set({"A", "T", "C", ""}) is None
12 | 


--------------------------------------------------------------------------------
/tests/test_mi_intervals.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import pytest
 3 | 
 4 | from strkit.mi.intervals import (
 5 |     build_loci_dict_of_dict_from_file,
 6 |     overlapping_loci_dict_of_dict,
 7 |     build_loci_dict_of_list_from_file,
 8 |     overlapping_loci_dict_of_list,
 9 | )
10 | 
11 | TEST_LOCI = pathlib.Path(__file__).parent / "data" / "test_loci.bed"
12 | 
13 | BED_CASES = [
14 |     ("chr1", 50, 70, 0),
15 |     ("chr1", 205, 210, 1),
16 |     ("chr1", 50, 1000, 3),
17 |     ("chr1", 320, 500, 2),
18 |     ("chr1", 400, 450, 1),
19 |     ("chr1", 1000, 1001, 0),
20 |     ("chr2", 100, 101, 1),
21 |     ("chr2", 100, 200, 1),
22 |     ("asdf", 50, 1000, 0),
23 | ]
24 | 
25 | 
26 | @pytest.mark.parametrize("contig,start,end,nr", BED_CASES)
27 | def test_loci_dict_of_dict(contig: str, start: int, end: int, nr: int):
28 |     d = build_loci_dict_of_dict_from_file(TEST_LOCI, False)
29 |     assert len(overlapping_loci_dict_of_dict(contig, start, end, d)) == nr
30 | 
31 | 
32 | @pytest.mark.parametrize("contig,start,end,nr", BED_CASES)
33 | def test_loci_dict_of_list(contig: str, start: int, end: int, nr: int):
34 |     d = build_loci_dict_of_list_from_file(TEST_LOCI, False)
35 |     assert len(tuple(overlapping_loci_dict_of_list(contig, start, end, d, False))) == nr
36 | 


--------------------------------------------------------------------------------