├── MANIFEST.in ├── tests ├── data │ ├── txt │ │ └── samples.txt │ └── vcf │ │ ├── chr22.vcf.gz │ │ ├── sample.vcf.gz │ │ ├── chr22.vcf.gz.csi │ │ ├── sample.vcf.gz.csi │ │ ├── 1kg_2020_chrM.vcf.gz │ │ ├── msprime_diploid.vcf.gz │ │ ├── 1kg_2020_chrM.vcf.gz.csi │ │ ├── field_type_combos.vcf.gz │ │ ├── msprime_diploid.vcf.gz.csi │ │ ├── field_type_combos.vcf.gz.csi │ │ ├── 1kg_2020_chr20_annotations.bcf │ │ └── 1kg_2020_chr20_annotations.bcf.csi ├── __init__.py ├── test_regions.py ├── test_calculate.py ├── test_vcf_roundtrip.py ├── test_stats.py ├── test_utils.py ├── test_plink_validation.py ├── test_retrieval.py ├── test_plink.py ├── test_cli.py ├── test_query.py ├── utils.py ├── test_tskit_data.py ├── test_bcftools_validation.py ├── test_vcf_writer.py └── test_filter.py ├── performance ├── data │ ├── .gitignore │ ├── requirements.txt │ └── Makefile └── compare.py ├── vcztools ├── __init__.py ├── __main__.py ├── provenance.py ├── constants.py ├── calculate.py ├── stats.py ├── samples.py ├── utils.py ├── plink.py ├── regions.py ├── retrieval.py ├── cli.py ├── query.py └── vcf_writer.py ├── .github └── workflows │ ├── docker │ ├── shared.env │ └── buildwheel.sh │ ├── cd.yml │ └── ci.yml ├── Makefile ├── setup.py ├── .pre-commit-config.yaml ├── .clang-format ├── lib ├── meson.build └── vcf_encoder.h ├── CHANGELOG.md ├── README.md ├── pyproject.toml ├── .gitignore ├── dev.py └── LICENSE /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include lib/*.h 2 | -------------------------------------------------------------------------------- /tests/data/txt/samples.txt: -------------------------------------------------------------------------------- 1 | NA00001 2 | NA00003 3 | -------------------------------------------------------------------------------- /performance/data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !Makefile 3 | !requirements.txt 4 | 
-------------------------------------------------------------------------------- /performance/data/requirements.txt: -------------------------------------------------------------------------------- 1 | stdpopsim 2 | tskit 3 | bio2zarr 4 | -------------------------------------------------------------------------------- /vcztools/__init__.py: -------------------------------------------------------------------------------- 1 | from .provenance import __version__ # noqa F401 2 | -------------------------------------------------------------------------------- /tests/data/vcf/chr22.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/chr22.vcf.gz -------------------------------------------------------------------------------- /vcztools/__main__.py: -------------------------------------------------------------------------------- 1 | from . import cli 2 | 3 | if __name__ == "__main__": 4 | cli.vcztools_main() 5 | -------------------------------------------------------------------------------- /tests/data/vcf/sample.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/sample.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/chr22.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/chr22.vcf.gz.csi -------------------------------------------------------------------------------- /tests/data/vcf/sample.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/sample.vcf.gz.csi -------------------------------------------------------------------------------- 
/tests/data/vcf/1kg_2020_chrM.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chrM.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/msprime_diploid.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/msprime_diploid.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/1kg_2020_chrM.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chrM.vcf.gz.csi -------------------------------------------------------------------------------- /tests/data/vcf/field_type_combos.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/field_type_combos.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/msprime_diploid.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/msprime_diploid.vcf.gz.csi -------------------------------------------------------------------------------- /.github/workflows/docker/shared.env: -------------------------------------------------------------------------------- 1 | PYTHON_VERSIONS=( 2 | cp39-cp39 3 | cp310-cp310 4 | cp311-cp311 5 | cp312-cp312 6 | ) 7 | -------------------------------------------------------------------------------- /tests/data/vcf/field_type_combos.vcf.gz.csi: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/field_type_combos.vcf.gz.csi -------------------------------------------------------------------------------- /tests/data/vcf/1kg_2020_chr20_annotations.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chr20_annotations.bcf -------------------------------------------------------------------------------- /tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # rewrite asserts in assert_vcfs_close to give better failure messages 4 | pytest.register_assert_rewrite("tests.utils") 5 | -------------------------------------------------------------------------------- /vcztools/provenance.py: -------------------------------------------------------------------------------- 1 | __version__ = "undefined" 2 | try: 3 | from . 
import _version 4 | 5 | __version__ = _version.version 6 | except ImportError: # pragma: nocover 7 | pass 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: ext 3 | 4 | ext: vcztools/_vcztoolsmodule.c 5 | CFLAGS="-std=c99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-cast-function-type" \ 6 | python3 setup.py build_ext --inplace 7 | 8 | clean: 9 | rm -f vcztools/*.so 10 | rm -fR build 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from setuptools import Extension, setup 3 | 4 | _vcztools_module = Extension( 5 | "vcztools._vcztools", 6 | sources=["vcztools/_vcztoolsmodule.c", "lib/vcf_encoder.c"], 7 | extra_compile_args=["-std=c99"], 8 | include_dirs=["lib", numpy.get_include()], 9 | ) 10 | 11 | setup( 12 | name="vcztools", 13 | ext_modules=[_vcztools_module], 14 | ) 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: debug-statements 7 | - id: mixed-line-ending 8 | - id: check-case-conflict 9 | - id: check-yaml 10 | - repo: https://github.com/astral-sh/ruff-pre-commit 11 | rev: v0.4.2 12 | hooks: 13 | - id: ruff 14 | args: [ --fix ] 15 | - id: ruff-format -------------------------------------------------------------------------------- /tests/test_regions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vcztools.regions import parse_region_string 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("targets", "expected"), 8 | [ 9 | ("chr1", 
("chr1", None, None)), 10 | ("chr1:12", ("chr1", 12, 12)), 11 | ("chr1:12-", ("chr1", 12, None)), 12 | ("chr1:12-103", ("chr1", 12, 103)), 13 | ], 14 | ) 15 | def test_parse_region_string( 16 | targets: str, expected: tuple[str, int | None, int | None] 17 | ): 18 | assert parse_region_string(targets) == expected 19 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: GNU 3 | SortIncludes: false 4 | AllowShortIfStatementsOnASingleLine: false 5 | BreakBeforeBraces: Linux 6 | TabWidth: 4 7 | IndentWidth: 4 8 | ColumnLimit: 89 9 | SpaceBeforeParens: 10 | ControlStatements 11 | SpacesInCStyleCastParentheses: false 12 | SpaceAfterCStyleCast: true 13 | IndentCaseLabels: true 14 | AlignAfterOpenBracket: DontAlign 15 | BinPackArguments: true 16 | BinPackParameters: true 17 | AlwaysBreakAfterReturnType: AllDefinitions 18 | 19 | # These are disabled for version 6 compatibility 20 | # StatementMacros: ["PyObject_HEAD"] 21 | # AlignConsecutiveMacros: true 22 | -------------------------------------------------------------------------------- /tests/test_calculate.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vcztools.calculate import REF, SNP, UNCLASSIFIED, get_variant_type 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("ref", "alt", "expected_type"), 8 | [ 9 | ("A", "T", SNP), 10 | ("A", "A", REF), 11 | ("A", "", REF), 12 | ("A", "<*>", REF), 13 | ("A", "", REF), 14 | ("A", "AA", UNCLASSIFIED), 15 | # these are all SNPs since they differ in one base 16 | ("AC", "TC", SNP), 17 | ("CA", "CT", SNP), 18 | ("CAGG", "CTGG", SNP), 19 | ], 20 | ) 21 | def test_get_variant_type(ref, alt, expected_type): 22 | assert get_variant_type(ref, alt) == expected_type 23 | -------------------------------------------------------------------------------- 
/tests/test_vcf_roundtrip.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | from tests.utils import vcz_path_cache 6 | from vcztools.vcf_writer import write_vcf 7 | 8 | from .utils import assert_vcfs_close 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "vcf_file", 13 | [ 14 | "sample.vcf.gz", 15 | "1kg_2020_chr20_annotations.bcf", 16 | "1kg_2020_chrM.vcf.gz", 17 | "field_type_combos.vcf.gz", 18 | ], 19 | ) 20 | def test_vcf_to_zarr_to_vcf__real_files(tmp_path, vcf_file): 21 | original = pathlib.Path("tests/data/vcf") / vcf_file 22 | vcz = vcz_path_cache(original) 23 | generated = tmp_path.joinpath("output.vcf") 24 | write_vcf(vcz, generated, no_version=True) 25 | assert_vcfs_close(original, generated) 26 | -------------------------------------------------------------------------------- /lib/meson.build: -------------------------------------------------------------------------------- 1 | project('vcf_encoder', ['c'], 2 | default_options: ['c_std=c99'] 3 | ) 4 | 5 | cc = meson.get_compiler('c') 6 | m_dep = cc.find_library('m', required: false) 7 | 8 | extra_c_args = [ 9 | '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W', 10 | '-Wmissing-prototypes', '-Wstrict-prototypes', 11 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 12 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 13 | '-fshort-enums', '-fno-common'] 14 | 15 | lib_sources = ['vcf_encoder.c'] 16 | lib_headers = ['vcf_encoder.h'] 17 | 18 | cunit_dep = dependency('cunit') 19 | 20 | tests = executable('tests', 21 | sources: ['tests.c', 'vcf_encoder.c'], 22 | dependencies: [cunit_dep, m_dep], 23 | c_args: extra_c_args, 24 | ) 25 | test('tests', tests) 26 | -------------------------------------------------------------------------------- /vcztools/constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | INT_MISSING, INT_FILL = -1, -2 4 | 5 | 
FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view( 6 | np.float32 7 | ) 8 | FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array( 9 | [0x7F800001, 0x7F800002], dtype=np.int32 10 | ) 11 | 12 | # From VCF fixed fields 13 | RESERVED_VARIABLE_NAMES = [ 14 | "variant_contig", 15 | "variant_position", 16 | "variant_length", 17 | "variant_id", 18 | "variant_id_mask", 19 | "variant_allele", 20 | "variant_quality", 21 | "variant_filter", 22 | ] 23 | 24 | RESERVED_VCF_FIELDS = { 25 | "CHROM": "variant_contig", 26 | "POS": "variant_position", 27 | "ID": "variant_id", 28 | "REF": "variant_allele", 29 | "ALT": "variant_allele", 30 | "QUAL": "variant_quality", 31 | "FILTER": "variant_filter", 32 | } 33 | -------------------------------------------------------------------------------- /.github/workflows/docker/buildwheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DOCKER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | source "$DOCKER_DIR/shared.env" 4 | 5 | set -e -x 6 | 7 | ARCH=`uname -p` 8 | echo "arch=$ARCH" 9 | 10 | # We're running as root in the docker container so git commands issued by 11 | # setuptools_scm will fail without this: 12 | git config --global --add safe.directory /project 13 | # Fetch the full history as we'll be missing tags otherwise. 
14 | git fetch --unshallow 15 | for V in "${PYTHON_VERSIONS[@]}"; do 16 | git reset --hard 17 | git clean -fd 18 | PYBIN=/opt/python/$V/bin 19 | rm -rf build/ # Avoid lib build by one Python is used by another 20 | $PYBIN/python -m venv env 21 | source env/bin/activate 22 | $PYBIN/python -m pip install --upgrade build 23 | SETUPTOOLS_SCM_DEBUG=1 $PYBIN/python -m build 24 | done 25 | 26 | cd dist 27 | for whl in *.whl; do 28 | auditwheel -v repair "$whl" 29 | rm "$whl" 30 | done 31 | -------------------------------------------------------------------------------- /vcztools/calculate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Variant types 4 | REF = -1 # missing value 5 | SNP = 1 << 0 6 | UNCLASSIFIED = 1 << 8 7 | 8 | 9 | def get_variant_type(ref: str, alt: str) -> int: 10 | """Return the variant type int for the given REF, ALT combination.""" 11 | if len(alt) == 0: 12 | return REF 13 | elif len(ref) == 1 and len(alt) == 1 and alt != "*": 14 | if ref == alt: 15 | return REF 16 | else: 17 | return SNP 18 | elif alt == "<*>" or alt == "": 19 | return REF 20 | elif ( 21 | len(ref) > 1 22 | and len(ref) == len(alt) 23 | and sum([r != a for r, a in zip(ref, alt)]) == 1 # one base differs 24 | ): 25 | return SNP 26 | else: 27 | return UNCLASSIFIED 28 | 29 | 30 | def calculate_variant_type(variant_allele: np.ndarray) -> np.ndarray: 31 | """Calculate the variant type array from the variant_allele array.""" 32 | ref = variant_allele[:, 0] 33 | alt = variant_allele[:, 1:] 34 | 35 | variant_type = np.zeros(alt.shape, dtype=np.int16) 36 | 37 | for i in range(alt.shape[0]): 38 | for j in range(alt.shape[1]): 39 | variant_type[i, j] = get_variant_type(ref[i], alt[i, j]) 40 | return variant_type 41 | -------------------------------------------------------------------------------- /tests/test_stats.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from 
io import StringIO 3 | 4 | import pytest 5 | import zarr 6 | from bio2zarr import vcf 7 | 8 | from vcztools.stats import nrecords, stats 9 | 10 | from .utils import vcz_path_cache 11 | 12 | 13 | def test_nrecords(): 14 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 15 | vcz = vcz_path_cache(original) 16 | 17 | output_str = StringIO() 18 | nrecords(vcz, output_str) 19 | assert output_str.getvalue() == "9\n" 20 | 21 | 22 | def test_stats(): 23 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 24 | vcz = vcz_path_cache(original) 25 | 26 | output_str = StringIO() 27 | stats(vcz, output_str) 28 | 29 | assert ( 30 | output_str.getvalue() 31 | == """19 . 2 32 | 20 . 6 33 | X . 1 34 | """ 35 | ) 36 | 37 | 38 | def test_stats__no_index(tmp_path): 39 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 40 | # don't use cache here since we want to make sure vcz is not indexed 41 | vcz = tmp_path.joinpath("intermediate.vcz") 42 | vcf.convert([original], vcz, worker_processes=0, local_alleles=False) 43 | 44 | # delete the index created by vcf2zarr 45 | root = zarr.open(vcz, mode="a") 46 | del root["region_index"] 47 | 48 | with pytest.raises(ValueError, match="Could not load 'region_index' variable."): 49 | stats(vcz, StringIO()) 50 | -------------------------------------------------------------------------------- /vcztools/stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import zarr 3 | 4 | from vcztools.utils import open_file_like 5 | 6 | 7 | def nrecords(vcz, output): 8 | root = zarr.open(vcz, mode="r") 9 | 10 | with open_file_like(output) as output: 11 | num_variants = root["variant_position"].shape[0] 12 | print(num_variants, file=output) 13 | 14 | 15 | def stats(vcz, output): 16 | root = zarr.open(vcz, mode="r") 17 | 18 | if "region_index" not in root: 19 | raise ValueError( 20 | "Could not load 'region_index' variable. " 21 | "Use 'vcz2zarr' to create an index." 
22 | ) 23 | 24 | with open_file_like(output) as output: 25 | contigs = root["contig_id"][:].astype("U").tolist() 26 | if "contig_length" in root: 27 | contig_lengths = root["contig_length"][:] 28 | else: 29 | contig_lengths = ["."] * len(contigs) 30 | 31 | region_index = root["region_index"][:] 32 | 33 | contig_indexes = region_index[:, 1] 34 | num_records = region_index[:, 5] 35 | 36 | num_records_per_contig = np.bincount( 37 | contig_indexes, weights=num_records 38 | ).astype(np.int64) 39 | 40 | for contig, contig_length, nr in zip( 41 | contigs, contig_lengths, num_records_per_contig 42 | ): 43 | if nr > 0: 44 | print(f"{contig}\t{contig_length}\t{nr}", file=output) 45 | -------------------------------------------------------------------------------- /performance/data/Makefile: -------------------------------------------------------------------------------- 1 | # The make recipes require bcftools and bgzip. 2 | 3 | # https://samtools.github.io/bcftools/howtos/install.html 4 | # https://www.htslib.org/doc/bgzip.html 5 | 6 | # On macOS, there are Homebrew formulas for bcftools and htslib, 7 | # which contains bgzip. 
8 | 9 | # The Python requirements are listed in requirements.txt: 10 | # pip install -r requirements.txt 11 | 12 | # Flags / commandline arguments: 13 | CHROMOSOME ?= 22 14 | WGS ?= 1 15 | 16 | ifeq ($(WGS), 1) 17 | TGP_URL = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/CCDG_13607_B01_GRM_WGS_2019-02-19_chr$(CHROMOSOME).recalibrated_variants.vcf.gz" 18 | else 19 | # Use URL for genotyping data: 20 | TGP_URL = "http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/ALL.chr$(CHROMOSOME).phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" 21 | endif 22 | 23 | .PHONY: all simulated real clean 24 | 25 | all: simulated real 26 | 27 | simulated: sim_10k.vcz 28 | 29 | real: chr22.vcz 30 | 31 | sim_10k.ts: 32 | stdpopsim HomSap -c chr22 -o sim_10k.ts pop_0:10000 33 | 34 | chr22.vcf.gz: 35 | bcftools view $(TGP_URL) | head -n 25000 | bcftools view -O z -o chr22.vcf.gz 36 | 37 | %.vcf.gz: %.ts 38 | tskit vcf $< | bgzip > $@ 39 | 40 | %.vcf.gz.csi: %.vcf.gz 41 | bcftools index $< 42 | 43 | %.vcz: %.vcf.gz %.vcf.gz.csi 44 | vcf2zarr convert $< $@ 45 | 46 | clean: 47 | rm -rf sim_10k.* 48 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.1.0] - 2025-05-29 4 | 5 | Improvements: 6 | 7 | - Support filtering by FILTER (#217), CHROM (#223) and general string values (#220) 8 | - Support regions (-r/-t), filter expressions (-i/-e) and samples (-s) in query command (#205) 9 | - Various improvements to support VCZ datasets produced from tskit and plink files by bio2zarr. 10 | - Use a fully dynamically generated header via ``vcf_meta_information`` attributes 11 | (#208). Requires vcf-zarr version >= 0.4 (bio2zarr >= 0.1.6) to fully recover the original 12 | header. 
13 | - Add --version (#197) 14 | 15 | Breaking: 16 | 17 | - Update minimum Click version to 8.2.0 (#206) 18 | 19 | ## [0.0.2] - 2025-04-04 20 | 21 | Important bugfixes for filtering language and sample subsetting. 22 | 23 | - Clarify the implementation status of the filtering mini-lanuage in 24 | view/query. Version 0.0.1 contained several data-corrupting bugs, 25 | including incorrect missing data handling (#163), incorrect 26 | matching on FILTER (#164) and CHROM (#178) columns, and 27 | incorrect per-sample filtering in query (#179). These issues 28 | have been resolved by raising informative errors on aspects 29 | of the query language that are not implemented correctly. 30 | 31 | - The filtering mini-language now consists of arbitrary arithmetic 32 | expressions on 1-dimensional fields. 33 | 34 | - Add support for specifying samples via -s/-S options 35 | 36 | ## [0.0.1] - 2025-02-05 37 | 38 | Initial release of vcztools 39 | -------------------------------------------------------------------------------- /performance/compare.py: -------------------------------------------------------------------------------- 1 | # This script requires pv. 2 | 3 | # https://www.ivarch.com/programs/pv.shtml 4 | 5 | # There is a Homebrew formula to install pv on macOS. 
6 | 7 | # This script also depends on the simulation data: 8 | # make -C data 9 | 10 | import subprocess 11 | import sys 12 | 13 | 14 | def run_time_pv(command: str): 15 | print(command) 16 | subprocess.run(f"time {command} | pv > /dev/null", shell=True) 17 | print() 18 | 19 | 20 | def run_bcftools(command: str, dataset_name: str): 21 | run_time_pv(f"bcftools {command} data/{dataset_name}.vcf.gz") 22 | 23 | 24 | def run_vcztools(command: str, dataset_name: str): 25 | run_time_pv(f"vcztools {command} data/{dataset_name}.vcz") 26 | 27 | 28 | if __name__ == "__main__": 29 | commands = [ 30 | ("view", "sim_10k"), 31 | ("view", "chr22"), 32 | ("view -s tsk_7068,tsk_8769,tsk_8820", "sim_10k"), 33 | (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sim_10k"), 34 | (r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", "sim_10k"), 35 | ("view -s '' --force-samples", "sim_10k"), 36 | ("view -i 'FMT/DP>10 & FMT/GQ>10'", "chr22"), 37 | ("view -i 'QUAL>10 || FMT/GQ>10'", "chr22"), 38 | (r"query -f 'GQ:[ %GQ] \t GT:[ %GT]\n'", "chr22"), 39 | ] 40 | 41 | if len(sys.argv) == 2 and sys.argv[1].isnumeric(): 42 | index = int(sys.argv[1]) 43 | command, dataset = commands[index] 44 | run_bcftools(command, dataset) 45 | run_vcztools(command, dataset) 46 | else: 47 | for command, dataset in commands: 48 | run_bcftools(command, dataset) 49 | run_vcztools(command, dataset) 50 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_array_equal 3 | 4 | from vcztools.utils import search, vcf_name_to_vcz_names 5 | 6 | 7 | @pytest.mark.parametrize( 8 | ("a", "v", "expected_ind"), 9 | [ 10 | (["a", "b", "c", "d"], ["b", "a", "c"], [1, 0, 2]), 11 | (["a", "c", "d", "b"], ["b", "a", "c"], [3, 0, 1]), 12 | (["a", "c", "d", "b"], ["b", "a", "a", "c"], [3, 0, 0, 1]), 13 | (["a", "c", "d", "b"], [], []), 14 | ], 15 | 
) 16 | def test_search(a, v, expected_ind): 17 | assert_array_equal(search(a, v), expected_ind) 18 | 19 | 20 | @pytest.mark.parametrize( 21 | ("vczs", "vcf", "expected_vcz_names"), 22 | [ 23 | ({"call_genotype"}, "GT", ["call_genotype"]), 24 | ({"call_genotype"}, "FMT/GT", ["call_genotype"]), 25 | ({"call_genotype"}, "FORMAT/GT", ["call_genotype"]), 26 | ({"call_DP"}, "DP", ["call_DP"]), 27 | ({"variant_DP"}, "DP", ["variant_DP"]), 28 | ({"call_DP", "variant_DP"}, "DP", ["call_DP", "variant_DP"]), 29 | ({"call_DP", "variant_DP"}, "FORMAT/DP", ["call_DP"]), 30 | ({"call_DP", "variant_DP"}, "INFO/DP", ["variant_DP"]), 31 | ({"variant_DP"}, "FORMAT/DP", []), 32 | ({"call_DP"}, "INFO/DP", []), 33 | (set(), "CHROM", ["variant_contig"]), 34 | (set(), "POS", ["variant_position"]), 35 | (set(), "ID", ["variant_id"]), 36 | (set(), "REF", ["variant_allele"]), 37 | (set(), "ALT", ["variant_allele"]), 38 | (set(), "QUAL", ["variant_quality"]), 39 | (set(), "FILTER", ["variant_filter"]), 40 | ], 41 | ) 42 | def test_vcf_name_to_vcz_names(vczs, vcf, expected_vcz_names): 43 | assert vcf_name_to_vcz_names(vczs, vcf) == expected_vcz_names 44 | -------------------------------------------------------------------------------- /tests/test_plink_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import subprocess 4 | 5 | import click.testing as ct 6 | import pytest 7 | 8 | import vcztools.cli as cli 9 | 10 | from . import utils 11 | 12 | 13 | def assert_files_identical(path1, path2): 14 | """ 15 | Asserts the files are byte-for-byte identical. 
16 | """ 17 | with open(path1, "rb") as f: 18 | b1 = f.read() 19 | with open(path2, "rb") as f: 20 | b2 = f.read() 21 | assert b1 == b2 22 | 23 | 24 | @pytest.mark.skip("Removing plink from CLI for bugfix release") 25 | # fmt: off 26 | @pytest.mark.parametrize( 27 | ("args", "vcf_file"), 28 | [ 29 | ("", "sample.vcf.gz"), 30 | ("", "chr22.vcf.gz"), 31 | ("", "1kg_2020_chrM.vcf.gz"), 32 | # FIXME this needs some extra args to deal with sample ID format 33 | # ("", "msprime_diploid.vcf.gz"), 34 | ], 35 | ) 36 | # fmt: on 37 | def test_conversion_identical(tmp_path, args, vcf_file): 38 | original = pathlib.Path("tests/data/vcf") / vcf_file 39 | vcz = utils.vcz_path_cache(original) 40 | 41 | plink_workdir = tmp_path / "plink1.9" 42 | plink_workdir.mkdir() 43 | plink_bin = os.environ.get("PLINK_BIN", "plink") 44 | cmd = f"{plink_bin} --vcf {original.absolute()} {args}" 45 | result = subprocess.run(cmd, shell=True, cwd=plink_workdir, capture_output=True) 46 | assert result.returncode == 0 47 | 48 | cmd = f"view-plink1 {vcz.absolute()} {args}" 49 | runner = ct.CliRunner() 50 | with runner.isolated_filesystem(tmp_path) as working_dir: 51 | vcz_workdir = pathlib.Path(working_dir) 52 | result = runner.invoke(cli.vcztools_main, cmd, catch_exceptions=False) 53 | for filename in ["plink.fam", "plink.bim", "plink.bed"]: 54 | assert_files_identical(vcz_workdir / filename, plink_workdir / filename) 55 | -------------------------------------------------------------------------------- /vcztools/samples.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | from vcztools.utils import search 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def parse_samples( 11 | samples: list[str] | str | None, 12 | all_samples: np.ndarray, 13 | *, 14 | force_samples: bool = True, 15 | ) -> tuple[np.ndarray, np.ndarray | None]: 16 | """Parse a bcftools-style samples string, or a list of sample IDs. 
17 | 18 | Returns an array of the sample IDs, and an array indicating the selection 19 | from all samples. 20 | """ 21 | 22 | if samples is None: 23 | return all_samples, None 24 | elif isinstance(samples, list): 25 | exclude_samples = False 26 | sample_ids = np.array(samples) 27 | else: 28 | exclude_samples = samples.startswith("^") 29 | samples = samples.lstrip("^") 30 | sample_ids = np.array(samples.split(",")) 31 | 32 | if np.all(sample_ids == np.array("")): 33 | sample_ids = np.empty((0,)) 34 | 35 | unknown_samples = np.setdiff1d(sample_ids, all_samples) 36 | if len(unknown_samples) > 0: 37 | if force_samples: 38 | # remove unknown samples from sample_ids 39 | logger.warning( 40 | "subset called for sample(s) not in header: " 41 | f'{",".join(unknown_samples)}.' 42 | ) 43 | sample_ids = np.delete(sample_ids, search(sample_ids, unknown_samples)) 44 | else: 45 | raise ValueError( 46 | "subset called for sample(s) not in header: " 47 | f'{",".join(unknown_samples)}. ' 48 | 'Use "--force-samples" to ignore this error.' 49 | ) 50 | 51 | samples_selection = search(all_samples, sample_ids) 52 | if exclude_samples: 53 | samples_selection = np.setdiff1d(np.arange(all_samples.size), samples_selection) 54 | sample_ids = all_samples[samples_selection] 55 | return sample_ids, samples_selection 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CI](https://github.com/sgkit-dev/vcztools/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/vcztools/actions/workflows/ci.yml) 2 | [![PyPI Downloads](https://static.pepy.tech/badge/vcztools)](https://pepy.tech/projects/vcztools) 3 | 4 | # vcztools 5 | Partial reimplementation of bcftools for [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) 6 | 7 | Please see the [preprint](https://www.biorxiv.org/content/10.1101/2024.06.11.598241) for more information. 
8 | 9 | 10 | ## Installation 11 | 12 | ``` 13 | python3 -m pip install vcztools 14 | ``` 15 | 16 | ## Usage 17 | 18 | ``` 19 | vcztools view 20 | ``` 21 | or 22 | ``` 23 | python -m vcztools view 24 | ``` 25 | should be equivalent to running 26 | ``` 27 | bcftools view 28 | ``` 29 | 30 | See the [bio2zarr](https://sgkit-dev.github.io/bio2zarr/) project for help in 31 | converting VCF files to Zarr. 32 | 33 | ## Goals 34 | 35 | Vcztools aims to be a drop-in replacement for a subset of bcftools functionality. 36 | Currently supported are the ``view``, ``query`` and ``index -s/-n`` commands. 37 | 38 | We aim for 100% compatibility so if you notice a difference between the output of 39 | vcztools and bcftools please do open an issue. 40 | 41 | ## Cloud stores 42 | 43 | Vcztools can read vcz files from cloud stores using [fsspec](https://filesystem-spec.readthedocs.io/en/latest/). 44 | 45 | For example, to read from Amazon S3, first install the `s3fs` fsspec library: 46 | 47 | ``` 48 | python3 -m pip install s3fs 49 | ``` 50 | 51 | Then provide your AWS credentials as described in the [`s3fs` documentation](https://s3fs.readthedocs.io/en/latest/#credentials), for example by setting environment variables: 52 | 53 | ``` 54 | export AWS_ACCESS_KEY_ID=... 55 | export AWS_SECRET_ACCESS_KEY=... 56 | ``` 57 | 58 | You can then run vcztools using an `s3://` URL: 59 | 60 | ``` 61 | python -m vcztools view s3:///path/to.vcz 62 | ``` 63 | 64 | ## Development 65 | 66 | Vcztools is under active development and contributions are warmly welcomed. Please 67 | see the project on [GitHub](https://github.com/sgkit-dev/vcztools). 
68 | 69 | -------------------------------------------------------------------------------- /vcztools/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import ExitStack, contextmanager 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | 6 | from vcztools.constants import RESERVED_VCF_FIELDS 7 | 8 | 9 | def search(a, v): 10 | """ 11 | Finds the indices into an array a corresponding to the elements in v. 12 | The behaviour is undefined if any elements in v are not in a. 13 | """ 14 | sorter = np.argsort(a) 15 | rank = np.searchsorted(a, v, sorter=sorter) 16 | return sorter[rank] 17 | 18 | 19 | @contextmanager 20 | def open_file_like(file): 21 | """A context manager for opening a file path or string (and closing on exit), 22 | or passing a file-like object through.""" 23 | with ExitStack() as stack: 24 | if isinstance(file, (str, Path)): 25 | file = stack.enter_context(open(file, mode="w")) 26 | yield file 27 | 28 | 29 | def vcf_name_to_vcz_names(vcz_names: set[str], vcf_name: str) -> list[str]: 30 | """ 31 | Convert the name of a VCF field to the names of corresponding VCF Zarr arrays. 32 | 33 | :param set[str] vcz_names: A set of allowed VCF Zarr field names 34 | :param str vcf_name: The name of the VCF field 35 | :return: The names of corresponding VCF Zarr arrays, with call (FORMAT) fields 36 | before variant (INFO) fields, if both are possible matches, or an empty list 37 | if there are no matches. 
38 | :rtype: list[str] 39 | """ 40 | 41 | candidates = [] 42 | split = vcf_name.split("/") 43 | assert 1 <= len(split) <= 2 44 | 45 | if split[-1] == "GT": 46 | candidates.append("call_genotype") 47 | elif len(split) > 1: 48 | if split[0] in {"FORMAT", "FMT"}: 49 | candidates.append(f"call_{split[-1]}") 50 | elif split[0] in {"INFO"}: 51 | candidates.append(f"variant_{split[-1]}") 52 | else: 53 | candidates.append(f"call_{split[-1]}") 54 | candidates.append(f"variant_{split[-1]}") 55 | 56 | matches = [candidate for candidate in candidates if candidate in vcz_names] 57 | 58 | if vcf_name in RESERVED_VCF_FIELDS: 59 | matches.append(RESERVED_VCF_FIELDS[vcf_name]) 60 | 61 | return matches 62 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=45", 4 | "wheel", 5 | "numpy>=2", 6 | "setuptools_scm" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | [project] 11 | name = "vcztools" 12 | description = "Implementation of bcftools for VCF Zarr" 13 | readme = "README.md" 14 | license = {file = "LICENSE"} 15 | authors = [ 16 | {name = "sgkit Developers", email = "project@sgkit.dev"}, 17 | ] 18 | dependencies = [ 19 | "numpy>=1.23.5", 20 | "zarr>=2.17,<3", 21 | "click>=8.2.0", 22 | "pyranges!=0.1.3", 23 | "pyparsing>=3" 24 | ] 25 | requires-python = ">=3.10" 26 | classifiers = [ 27 | "Development Status :: 4 - Beta", 28 | "License :: OSI Approved :: Apache Software License", 29 | "Operating System :: POSIX", 30 | "Operating System :: POSIX :: Linux", 31 | "Operating System :: MacOS", 32 | "Operating System :: MacOS :: MacOS X", 33 | "Intended Audience :: Science/Research", 34 | "Programming Language :: Python", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Python :: 3.11", 38 | "Programming Language :: Python :: 
3.12", 39 | "Topic :: Scientific/Engineering" 40 | ] 41 | dynamic = ["version"] 42 | 43 | [project.urls] 44 | repository = "https://github.com/sgkit-dev/vcztools" 45 | 46 | [project.scripts] 47 | vcztools = "vcztools.cli:vcztools_main" 48 | 49 | [project.optional-dependencies] 50 | dev = [ 51 | "bio2zarr", 52 | "cyvcf2", 53 | "pytest", 54 | "pytest-cov", 55 | "msprime", 56 | "sgkit", 57 | ] 58 | 59 | [tool.setuptools] 60 | packages = ["vcztools"] 61 | 62 | [tool.pytest.ini_options] 63 | testpaths = ["tests"] 64 | addopts = "--cov=vcztools --cov-report=term-missing" 65 | 66 | [tool.setuptools_scm] 67 | write_to = "vcztools/_version.py" 68 | 69 | [tool.ruff] 70 | # Assume Python 3.10 71 | target-version = "py310" 72 | 73 | # Same as Black. 74 | line-length = 88 75 | indent-width = 4 76 | 77 | [tool.ruff.lint] 78 | select = ["E", "F", "B", "W", "I", "N", "UP", "A", "PT"] 79 | #Allow uppercase names for e.g. call_AD 80 | #Don't add strict=False to zips (B905) 81 | ignore = ["N806", "N802", "N803", "A001", "A002", "B905", "RUF", "UP038"] 82 | 83 | fixable = ["ALL"] 84 | unfixable = [] 85 | 86 | [tool.ruff.lint.isort] 87 | known-third-party = [ 88 | "bio2zarr", 89 | "click", 90 | "cyvcf2", 91 | "numcodecs", 92 | "numpy", 93 | "pandas", 94 | "pyranges", 95 | "pytest", 96 | "setuptools", 97 | "zarr" 98 | ] 99 | -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | merge_group: 5 | pull_request: 6 | push: 7 | branches: 8 | - main 9 | tags: 10 | - '*' 11 | release: 12 | types: [published] 13 | 14 | jobs: 15 | packaging: 16 | if: github.repository_owner == 'sgkit-dev' 17 | name: Packaging 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.11' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install 
--upgrade pip build validate-pyproject[all] 27 | python -m build --sdist 28 | - name: Upload sdist 29 | uses: actions/upload-artifact@v4 30 | with: 31 | name: sdist 32 | path: dist 33 | - name: Build wheels 34 | run: | 35 | validate-pyproject pyproject.toml 36 | docker run --rm -v `pwd`:/project -w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh 37 | - name: Check vcztools CLI 38 | run: | 39 | pip install numpy "zarr>=2.17,<3" "click>=8.2.0" "pyranges!=0.1.3" pyparsing 40 | pip install vcztools --no-index --only-binary vcztools -f dist/wheelhouse 41 | vcztools --help 42 | # Make sure we don't have ``vcztools`` in the CWD 43 | cd tests 44 | python -m vcztools --help 45 | - name: Store the distribution packages 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: linux-wheels 49 | path: dist/wheelhouse 50 | 51 | publish-to-pypi: 52 | if: github.repository_owner == 'sgkit-dev' && github.event_name == 'release' 53 | needs: 54 | - packaging 55 | runs-on: ubuntu-latest 56 | 57 | environment: 58 | name: pypi 59 | url: https://pypi.org/p/vcztools 60 | permissions: 61 | id-token: write # IMPORTANT: mandatory for trusted publishing 62 | 63 | steps: 64 | - name: Download all 65 | uses: actions/download-artifact@v4.1.8 66 | - name: Move to dist 67 | run: | 68 | mkdir dist 69 | cp */*.{whl,gz} dist/. 70 | ls dist 71 | - uses: pypa/gh-action-pypi-publish@release/v1 72 | 73 | 74 | publish-to-testpypi: 75 | if: github.repository_owner == 'sgkit-dev' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 76 | needs: 77 | - packaging 78 | runs-on: ubuntu-latest 79 | 80 | environment: 81 | name: testpypi 82 | url: https://test.pypi.org/p/vcztools 83 | 84 | permissions: 85 | id-token: write # IMPORTANT: mandatory for trusted publishing 86 | 87 | steps: 88 | - name: Download all 89 | uses: actions/download-artifact@v4.1.8 90 | - name: Move to dist 91 | run: | 92 | mkdir dist 93 | cp */*.{whl,gz} dist/. 
94 | ls dist 95 | - uses: pypa/gh-action-pypi-publish@release/v1 96 | with: 97 | verbose: true 98 | repository-url: https://test.pypi.org/legacy/ 99 | -------------------------------------------------------------------------------- /tests/test_retrieval.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy.testing as nt 4 | import pytest 5 | import zarr 6 | 7 | from vcztools.retrieval import variant_chunk_iter, variant_iter 8 | from vcztools.samples import parse_samples 9 | 10 | from .utils import vcz_path_cache 11 | 12 | 13 | def test_variant_chunk_iter(): 14 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 15 | vcz = vcz_path_cache(original) 16 | root = zarr.open(vcz, mode="r") 17 | 18 | _, samples_selection = parse_samples("NA00002,NA00003", root["sample_id"][:]) 19 | chunk_data = next( 20 | variant_chunk_iter( 21 | root, 22 | fields=["variant_contig", "variant_position", "call_DP", "call_GQ"], 23 | regions="20:1230236-", 24 | include="FMT/DP>3", 25 | samples_selection=samples_selection, 26 | ) 27 | ) 28 | nt.assert_array_equal(chunk_data["variant_contig"], [1, 1]) 29 | nt.assert_array_equal(chunk_data["variant_position"], [1230237, 1234567]) 30 | nt.assert_array_equal(chunk_data["call_DP"], [[4, 2], [2, 3]]) 31 | nt.assert_array_equal(chunk_data["call_GQ"], [[48, 61], [17, 40]]) 32 | # note second site (at pos 1234567) is included even though both samples in mask 33 | # are False (NA00002 and NA00003), since sample NA00001 matched filter criteria, 34 | # but was then removed by samples_selection 35 | nt.assert_array_equal(chunk_data["call_mask"], [[True, False], [False, False]]) 36 | 37 | 38 | def test_variant_chunk_iter_empty_fields(): 39 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 40 | vcz = vcz_path_cache(original) 41 | root = zarr.open(vcz, mode="r") 42 | 43 | with pytest.raises(StopIteration): 44 | print(next(variant_chunk_iter(root, fields=[]))) 45 | 46 | 
47 | @pytest.mark.parametrize( 48 | ("regions", "samples"), 49 | [("20:1230236-", "NA00002,NA00003"), (["20:1230236-"], ["NA00002", "NA00003"])], 50 | ) 51 | def test_variant_iter(regions, samples): 52 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 53 | vcz = vcz_path_cache(original) 54 | 55 | iter = variant_iter( 56 | vcz, 57 | fields=["variant_contig", "variant_position", "call_DP", "call_GQ"], 58 | regions=regions, 59 | include="FMT/DP>3", 60 | samples=samples, 61 | ) 62 | 63 | variant1 = next(iter) 64 | assert variant1["variant_contig"] == 1 65 | assert variant1["variant_position"] == 1230237 66 | nt.assert_array_equal(variant1["call_DP"], [4, 2]) 67 | nt.assert_array_equal(variant1["call_GQ"], [48, 61]) 68 | nt.assert_array_equal(variant1["call_mask"], [True, False]) 69 | 70 | variant2 = next(iter) 71 | assert variant2["variant_contig"] == 1 72 | assert variant2["variant_position"] == 1234567 73 | nt.assert_array_equal(variant2["call_DP"], [2, 3]) 74 | nt.assert_array_equal(variant2["call_GQ"], [17, 40]) 75 | nt.assert_array_equal(variant2["call_mask"], [False, False]) 76 | 77 | with pytest.raises(StopIteration): 78 | next(iter) 79 | 80 | 81 | def test_variant_iter_empty_fields(): 82 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 83 | vcz = vcz_path_cache(original) 84 | 85 | with pytest.raises(StopIteration): 86 | next(variant_iter(vcz, fields=[])) 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # auto generated by setuptools_scm and configured in pyproject.toml 2 | vcztools/_version.py 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 
| wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 
104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 113 | .pdm.toml 114 | .pdm-python 115 | .pdm-build/ 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | .idea/ 166 | 167 | .vscode 168 | vcz_test_cache/ 169 | **/.DS_Store 170 | -------------------------------------------------------------------------------- /lib/vcf_encoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #ifdef __GNUC__ 6 | #define VCZ_UNUSED(x) VCZ_UNUSED_##x __attribute__((__unused__)) 7 | #else 8 | #define VCZ_UNUSED(x) VCZ_UNUSED_##x 9 | /* Don't bother with restrict for MSVC */ 10 | #define restrict 11 | #endif 12 | 13 | /* /1* We assume CHAR_BIT == 8 when loading strings from 8-bit byte arrays *1/ */ 14 | /* #if CHAR_BIT != 8 */ 15 | /* #error CHAR_BIT MUST EQUAL 8 */ 16 | /* #endif */ 17 | 18 | #define VCZ_INT_MISSING -1 19 | #define VCZ_INT_FILL -2 20 | #define VCZ_STRING_MISSING '.' 21 | #define VCZ_STRING_FILL '\0' 22 | #define VCZ_FLOAT32_MISSING_AS_INT32 0x7F800001 23 | #define VCZ_FLOAT32_FILL_AS_INT32 0x7F800002 24 | 25 | #define VCZ_NUM_FIXED_FIELDS 6 26 | 27 | #define VCZ_TYPE_INT 1 28 | #define VCZ_TYPE_FLOAT 2 29 | #define VCZ_TYPE_STRING 3 30 | #define VCZ_TYPE_BOOL 4 31 | 32 | // arbitrary - we can increase if needs be 33 | #define VCZ_MAX_FIELD_NAME_LEN 255 34 | #define VCZ_INT32_BUF_SIZE 12 // 10 digits, leading '-' and terminating NULL 35 | // Safe limit, no point in trying to make it too tight as it's easy to represent 36 | // certain very large numbers of floating point. 
37 | #define VCZ_FLOAT32_BUF_SIZE 256 38 | 39 | #define VCZ_ERR_NO_MEMORY (-100) 40 | #define VCZ_ERR_BUFFER_OVERFLOW (-101) 41 | #define VCZ_ERR_VARIANT_OUT_OF_BOUNDS (-102) 42 | 43 | /* Built-in-limitations */ 44 | #define VCZ_ERR_FIELD_NAME_TOO_LONG (-201) 45 | #define VCZ_ERR_FIELD_UNSUPPORTED_TYPE (-202) 46 | #define VCZ_ERR_FIELD_UNSUPPORTED_ITEM_SIZE (-203) 47 | #define VCZ_ERR_FIELD_UNSUPPORTED_NUM_COLUMNS (-204) 48 | 49 | typedef struct { 50 | // maximum length + 1 for NULL byte 51 | char name[VCZ_MAX_FIELD_NAME_LEN + 1]; 52 | size_t name_length; 53 | int type; 54 | size_t item_size; 55 | size_t num_columns; 56 | const char *data; 57 | } vcz_field_t; 58 | 59 | int vcz_field_init(vcz_field_t *self, const char *name, int type, size_t item_size, 60 | size_t num_columns, const void *data); 61 | int64_t vcz_field_write_1d( 62 | const vcz_field_t *self, size_t row, char *buf, int64_t buflen, int64_t offset); 63 | void vcz_field_print_state(const vcz_field_t *self, FILE *out); 64 | 65 | typedef struct { 66 | size_t num_variants; 67 | size_t num_samples; 68 | vcz_field_t fixed_fields[VCZ_NUM_FIXED_FIELDS]; 69 | vcz_field_t filter_id; 70 | const int8_t *filter_data; 71 | vcz_field_t gt; 72 | const int8_t *gt_phased_data; 73 | size_t num_info_fields; 74 | size_t max_info_fields; 75 | vcz_field_t *info_fields; 76 | size_t num_format_fields; 77 | size_t max_format_fields; 78 | size_t field_array_size_increment; 79 | vcz_field_t *format_fields; 80 | } vcz_variant_encoder_t; 81 | 82 | int vcz_variant_encoder_init( 83 | vcz_variant_encoder_t *self, size_t num_variants, size_t num_samples); 84 | void vcz_variant_encoder_free(vcz_variant_encoder_t *self); 85 | void vcz_variant_encoder_print_state(const vcz_variant_encoder_t *self, FILE *out); 86 | 87 | int vcz_variant_encoder_add_chrom_field( 88 | vcz_variant_encoder_t *self, size_t item_size, const char *data); 89 | int vcz_variant_encoder_add_pos_field(vcz_variant_encoder_t *self, const int32_t *data); 90 | int 
vcz_variant_encoder_add_qual_field(vcz_variant_encoder_t *self, const float *data); 91 | int vcz_variant_encoder_add_ref_field( 92 | vcz_variant_encoder_t *self, size_t item_size, const char *data); 93 | int vcz_variant_encoder_add_id_field( 94 | vcz_variant_encoder_t *self, size_t item_size, size_t num_columns, const char *data); 95 | int vcz_variant_encoder_add_alt_field( 96 | vcz_variant_encoder_t *self, size_t item_size, size_t num_columns, const char *data); 97 | int vcz_variant_encoder_add_filter_field(vcz_variant_encoder_t *self, 98 | size_t id_item_size, size_t id_num_columns, const char *id_data, 99 | const int8_t *filter_data); 100 | int vcz_variant_encoder_add_gt_field(vcz_variant_encoder_t *self, size_t item_size, 101 | size_t num_columns, const void *data, const int8_t *phased_data); 102 | int vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, const char *name, 103 | int type, size_t item_size, size_t num_columns, const void *data); 104 | int vcz_variant_encoder_add_format_field(vcz_variant_encoder_t *self, const char *name, 105 | int type, size_t item_size, size_t num_columns, const void *data); 106 | 107 | int64_t vcz_variant_encoder_encode( 108 | const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen); 109 | 110 | int vcz_itoa(char *buf, int64_t v); 111 | int vcz_ftoa(char *buf, float v); 112 | 113 | 114 | #define VCZ_PLINK_HOM_A1 0x0 /* 00 */ 115 | #define VCZ_PLINK_HOM_A2 0x3 /* 11 */ 116 | #define VCZ_PLINK_HET 0x2 /* 10 */ 117 | #define VCZ_PLINK_MISSING 0x1 /* 01 */ 118 | int vcz_encode_plink(size_t num_variants, size_t num_samples, const int8_t *genotypes, 119 | const int8_t *a12_allele, char *buf); 120 | -------------------------------------------------------------------------------- /tests/test_plink.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from vcztools import plink 5 | 6 | 7 | def _encode_genotypes_row(g, allele_1, 
allele_2): 8 | # Missing genotype: 01 in PLINK format 9 | # Homozygous allele 1: 00 in PLINK format 10 | # Homozygous allele 2: 11 in PLINK format 11 | # Heterozygous: 10 in PLINK format 12 | HOM_A1 = 0b00 13 | HOM_A2 = 0b11 14 | HET = 0b10 15 | MISSING = 0b01 16 | 17 | num_samples = g.shape[0] 18 | assert g.shape[1] == 2 19 | bytes_per_variant = (num_samples + 3) // 4 20 | buff = bytearray(bytes_per_variant) 21 | for j in range(num_samples): 22 | byte_idx = j // 4 23 | bit_pos = (j % 4) * 2 24 | code = MISSING 25 | a, b = g[j] 26 | if b == -2: 27 | # Treated as a haploid call by plink 28 | if a == allele_1: 29 | code = HOM_A1 30 | elif a == allele_2: 31 | code = HOM_A2 32 | else: 33 | if a == allele_1: 34 | if b == allele_1: 35 | code = HOM_A1 36 | elif b == allele_2: 37 | code = HET 38 | elif a == allele_2: 39 | if b == allele_2: 40 | code = HOM_A2 41 | elif b == allele_1: 42 | code = HET 43 | if allele_1 == -1 and (code == HOM_A1 or code == HET): 44 | code = MISSING 45 | # print("\t", a, b, code) 46 | mask = ~(0b11 << bit_pos) 47 | buff[byte_idx] = (buff[byte_idx] & mask) | (code << bit_pos) 48 | return buff 49 | 50 | 51 | def encode_genotypes(G, a12_allele=None): 52 | G = np.array(G, dtype=np.int8) 53 | if a12_allele is None: 54 | a12_allele = np.zeros((G.shape[0], 2), dtype=G.dtype) 55 | a12_allele[:, 0] = 1 56 | assert G.shape[0] == a12_allele.shape[0] 57 | assert G.shape[2] == 2 58 | buff = bytearray() 59 | for j in range(len(G)): 60 | buff.extend(_encode_genotypes_row(G[j], *a12_allele[j])) 61 | return bytes(buff) 62 | 63 | 64 | class TestEncodeGenotypes: 65 | @pytest.mark.parametrize( 66 | "genotypes", 67 | [ 68 | [ 69 | [[0, 0], [0, 1], [0, 0]], 70 | ], 71 | [ 72 | [[0, 0], [0, 1], [0, 0]], 73 | [[1, 0], [1, 1], [0, -2]], 74 | [[1, 1], [0, 1], [-1, -1]], 75 | ], 76 | [ 77 | [[0, 0], [0, 1], [0, 0], [0, 1]], 78 | [[0, 0], [0, 1], [0, 0], [0, 1]], 79 | ], 80 | [ 81 | [[0, 0], [0, 1], [0, 0], [0, 1], [1, 1]], 82 | [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -2]], 
83 | [[0, 0], [0, 1], [0, 0], [0, 1], [1, 1]], 84 | [[1, 0], [-3, 1], [0, 0], [0, 1], [-1, -2]], 85 | [[0, 1], [0, 1], [1, 2], [0, 1], [1, 1]], 86 | [[0, 0], [0, -2], [0, 3], [-2, 1], [-1, -2]], 87 | ], 88 | ], 89 | ) 90 | def test_examples_01_alleles(self, genotypes): 91 | b1 = encode_genotypes(genotypes) 92 | b2 = plink.encode_genotypes(genotypes) 93 | assert b1 == b2 94 | 95 | @pytest.mark.parametrize( 96 | ("num_variants", "num_samples"), 97 | [ 98 | (0, 0), 99 | (1, 0), 100 | (0, 1), 101 | (1, 1), 102 | (1, 10), 103 | (1, 4), 104 | (1, 16), 105 | (1, 100), 106 | (1, 101), 107 | (10, 1), 108 | (100, 1), 109 | (10, 2), 110 | (10, 3), 111 | (10, 4), 112 | (10, 5), 113 | (10, 6), 114 | (10, 7), 115 | (10, 8), 116 | (10, 9), 117 | ], 118 | ) 119 | @pytest.mark.parametrize("value", [-1, 0, 1, 2]) 120 | def test_shapes_01_alleles(self, value, num_variants, num_samples): 121 | g = np.zeros((num_variants, num_samples, 2), dtype=np.int8) + value 122 | b1 = encode_genotypes(g) 123 | b2 = plink.encode_genotypes(g) 124 | # assert len(b1) == len(b2) 125 | assert b1 == b2 126 | 127 | @pytest.mark.parametrize( 128 | ("num_variants", "num_samples"), 129 | [ 130 | (1, 4), 131 | (1, 8), 132 | (1, 16), 133 | (1, 32), 134 | (1, 100), 135 | (33, 4), 136 | (33, 8), 137 | (33, 16), 138 | (33, 32), 139 | (33, 100), 140 | ], 141 | ) 142 | def test_all_zeros_div_4(self, num_variants, num_samples): 143 | assert num_samples % 4 == 0 144 | g = np.zeros((num_variants, num_samples, 2), dtype=np.int8) 145 | b1 = encode_genotypes(g) 146 | b2 = plink.encode_genotypes(g) 147 | assert b1 == b2 148 | assert b1 == bytearray(0xFF for _ in range(num_variants * num_samples // 4)) 149 | 150 | @pytest.mark.parametrize( 151 | ("num_variants", "num_samples"), 152 | [ 153 | (1, 33), 154 | (10, 1000), 155 | ], 156 | ) 157 | def test_nonsensical_data(self, num_variants, num_samples): 158 | g = np.arange((num_variants * num_samples * 2), dtype=np.int8).reshape( 159 | (num_variants, num_samples, 2) 160 | ) 161 
| a12 = np.arange(num_variants * 2, dtype=np.int8).reshape((num_variants, 2)) 162 | b1 = encode_genotypes(g, a12) 163 | b2 = plink.encode_genotypes(g, a12) 164 | assert b1 == b2 165 | -------------------------------------------------------------------------------- /dev.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import _vcztools 4 | import numpy as np 5 | import zarr 6 | 7 | # From VCF fixed fields 8 | RESERVED_VARIABLE_NAMES = [ 9 | "variant_contig", 10 | "variant_position", 11 | "variant_id", 12 | "variant_id_mask", 13 | "variant_allele", 14 | "variant_quality", 15 | "variant_filter", 16 | ] 17 | 18 | 19 | def copy_to_memory(group): 20 | mem_group = zarr.group() 21 | for name, array in group.items(): 22 | if array.dtype == "O": 23 | copy = mem_group.empty_like(name, array, compressor={}) 24 | # FIXME: this is the best I've been able to come up with here. 25 | # Something very weird with object arrays in v2 26 | for j, row in enumerate(array): 27 | copy[j] = row 28 | else: 29 | copy = mem_group.empty_like(name, array, compressor={}) 30 | copy[:] = array[:] 31 | # print("copy = ", copy[:]) 32 | return mem_group 33 | 34 | 35 | def main(root): 36 | v_chunk = 0 37 | contigs = root["contig_id"][:].astype("S") 38 | # filters = root["filter_id"][:].astype("S") 39 | # print("contigs = ", contigs) 40 | # print("filters = ", contigs) 41 | 42 | chrom = contigs[root.variant_contig.blocks[v_chunk]] 43 | pos = root.variant_position.blocks[v_chunk] 44 | id = root.variant_id.blocks[v_chunk].astype("S") 45 | alleles = root.variant_allele.blocks[v_chunk] 46 | ref = alleles[:, 0].astype("S") 47 | alt = alleles[:, 1:].astype("S") 48 | # qual = root.variant_quality.blocks[v_chunk] 49 | # filter_ = filters[root.variant_filter.blocks[v_chunk]] 50 | 51 | num_variants = len(pos) 52 | if len(id.shape) == 1: 53 | id = id.reshape((num_variants, 1)) 54 | 55 | # TODO gathering fields and doing IO will be done separately later so 
that 56 | # we avoid retrieving stuff we don't need. 57 | format_fields = {} 58 | info_fields = {} 59 | for name, array in root.arrays(): 60 | if name.startswith("call_") and not name.startswith("call_genotype"): 61 | vcf_name = name[len("call_") :] 62 | format_fields[vcf_name] = array.blocks[v_chunk] 63 | elif name.startswith("variant_") and name not in RESERVED_VARIABLE_NAMES: 64 | vcf_name = name[len("variant_") :] 65 | info_fields[vcf_name] = array.blocks[v_chunk] 66 | 67 | gt = None 68 | gt_phased = None 69 | if "call_genotype" in root: 70 | array = root["call_genotype"] 71 | gt = array.blocks[v_chunk] 72 | if "call_genotype_phased" in root: 73 | array = root["call_genotype_phased"] 74 | gt_phased = array.blocks[v_chunk] 75 | else: 76 | gt_phased = np.zeros_like(gt, dtype=bool) 77 | 78 | # print(gt, gt_phased) 79 | # print(list(format_fields.keys())) 80 | # print(list(info_fields.keys())) 81 | 82 | # print(contigs[chrom]) 83 | # print(bytes(contigs[chrom])) 84 | # print(pos) 85 | # print(alleles) 86 | # print(alleles.dtype) 87 | # print(chrom) 88 | # print(pos) 89 | # print(id) 90 | # print(ref) 91 | # print(alt) 92 | 93 | num_samples = 0 94 | if gt is not None: 95 | num_samples = gt.shape[1] 96 | 97 | encoder = _vcztools.VcfEncoder( 98 | num_variants, num_samples, chrom=chrom, pos=pos, id=id, alt=alt, ref=ref 99 | ) 100 | print(gt.shape) 101 | print(gt_phased.shape) 102 | encoder.add_gt_field(gt.astype("int32"), gt_phased) 103 | # # print(encoder.arrays) 104 | # # print(encoder) 105 | for name, array in info_fields.items(): 106 | if array.dtype.kind == "O": 107 | array = array.astype("S") 108 | if len(array.shape) == 1: 109 | array = array.reshape((num_variants, 1)) 110 | if array.dtype.kind == "i": 111 | array = array.astype("int32") # tmp 112 | if array.dtype.kind == "f": 113 | continue # tmp 114 | if array.dtype.kind == "b": 115 | continue # tmp 116 | # array = array.astype("int32") # tmp 117 | 118 | print(name, array.dtype, array.dtype.kind) 119 | 
encoder.add_info_field(name, array) 120 | 121 | for name, array in format_fields.items(): 122 | if array.dtype.kind == "O": 123 | array = array.astype("S") 124 | if len(array.shape) == 2: 125 | array = array.reshape((num_variants, num_samples, 1)) 126 | if array.dtype.kind == "i": 127 | array = array.astype("int32") # tmp 128 | if array.dtype.kind == "f": 129 | continue # tmp 130 | # array = array.astype("int32") # tmp 131 | 132 | print(name, array.dtype, array.dtype.kind) 133 | encoder.add_format_field(name, array) 134 | 135 | # d = encoder.arrays 136 | # pos = encoder.arrays["POS"] 137 | # print(pos) 138 | # # print(d) 139 | # pos[0] = 123457 140 | # print(pos.flags) 141 | # pos.resize(0, refcheck=False) 142 | # print(pos) 143 | 144 | encoder.print_state(sys.stdout) 145 | for k, v in encoder.arrays.items(): 146 | print(k, "\t", v.shape) 147 | for j in range(num_variants): 148 | line = encoder.encode_row(j, 2**30) 149 | print(line) 150 | 151 | 152 | if __name__ == "__main__": 153 | root = zarr.open(sys.argv[1], mode="r") 154 | # root = copy_to_memory(root) 155 | # print("pos = ", root["variant_position"].info) 156 | # print(root.tree()) 157 | main(root) 158 | # for _ in range(10000): 159 | # import tqdm 160 | 161 | # for _ in tqdm.tqdm(range(10000)): 162 | # main(root) 163 | -------------------------------------------------------------------------------- /vcztools/plink.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert VCZ to plink 1 binary format. 3 | """ 4 | 5 | import pathlib 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import zarr 10 | 11 | from . import _vcztools, retrieval 12 | 13 | 14 | def encode_genotypes(genotypes, a12_allele=None): 15 | G = np.asarray(genotypes, dtype=np.int8) 16 | if a12_allele is None: 17 | a12_allele = np.zeros((G.shape[0], 2), dtype=G.dtype) 18 | a12_allele[:, 0] = 1 19 | a12_allele = np.asarray(a12_allele, dtype=G.dtype) 20 | # TODO: not sure if this is taking a copy. 
See the point about 21 | # allocating a numpy array in the C code. 22 | return bytes(_vcztools.encode_plink(G, a12_allele).data) 23 | 24 | 25 | def generate_fam(root): 26 | # TODO generate an error if sample_id contains a space 27 | sample_id = root["sample_id"][:].astype(str) 28 | zeros = np.zeros(sample_id.shape, dtype=int) 29 | df = pd.DataFrame( 30 | { 31 | "FamilyID": sample_id, 32 | "IndividualID": sample_id, 33 | "FatherID": zeros, 34 | "MotherId": zeros, 35 | "Sex": zeros, 36 | "Phenotype": np.full_like(zeros, -9), 37 | } 38 | ) 39 | return df.to_csv(sep="\t", header=False, index=False) 40 | 41 | 42 | def generate_bim(root, a12_allele): 43 | select = a12_allele[:, 1] != -1 44 | contig_id = root["contig_id"][:].astype(str) 45 | alleles = root["variant_allele"][:].astype(str)[select] 46 | a12_allele = a12_allele[select] 47 | num_variants = np.sum(select) 48 | allele_1 = alleles[np.arange(num_variants), a12_allele[:, 0]] 49 | single_allele_sites = np.where(a12_allele[:, 0] == -1) 50 | allele_1[single_allele_sites] = "0" 51 | 52 | num_variants = np.sum(select) 53 | if "variant_id" in root: 54 | variant_id = root["variant_id"][:][select] 55 | else: 56 | variant_id = np.array(["."] * num_variants, dtype="S") 57 | 58 | df = pd.DataFrame( 59 | { 60 | "Chrom": contig_id[root["variant_contig"][:][select]], 61 | "VariantId": variant_id, 62 | "GeneticPosition": np.zeros(np.sum(select), dtype=int), 63 | "Position": root["variant_position"][:][select], 64 | "Allele1": allele_1, 65 | "Allele2": alleles[np.arange(num_variants), a12_allele[:, 1]], 66 | } 67 | ) 68 | return df.to_csv(header=False, sep="\t", index=False) 69 | 70 | 71 | class Writer: 72 | def __init__( 73 | self, vcz_path, bed_path, fam_path, bim_path, include=None, exclude=None 74 | ): 75 | self.root = zarr.open(vcz_path, mode="r") 76 | 77 | self.bim_path = bim_path 78 | self.fam_path = fam_path 79 | self.bed_path = bed_path 80 | 81 | def _compute_alleles(self, G, alleles): 82 | """ 83 | Returns the a12 
alleles for the specified chunk of data. 84 | """ 85 | max_alleles = alleles.shape[1] 86 | if max_alleles != 2: 87 | raise ValueError( 88 | "Only biallelic VCFs supported currently: " 89 | "please comment on https://github.com/sgkit-dev/vcztools/issues/224 " 90 | "if this limitation affects you" 91 | ) 92 | num_variants = G.shape[0] 93 | num_samples = G.shape[1] 94 | a12_allele = np.zeros((num_variants, 2), dtype=int) - 1 95 | for j, g in enumerate(G): 96 | g = g.reshape(num_samples * 2) 97 | assert np.all(g >= -2) 98 | count = np.bincount(g + 2, minlength=max_alleles + 2) 99 | # [dimension pad, missing data, reference, allele 1, ...] 100 | count = count[2:] 101 | argsort = np.argsort(count) 102 | a12_allele[j, 1] = 0 103 | if argsort[-1] == 0: 104 | # print("Ref allele most frequent") 105 | # Ref allele is most frequent - chose lowest allele from next most 106 | # frequent class 107 | f = count[argsort[-2]] 108 | else: 109 | # print("Ref allele not most frequent") 110 | f = count[argsort[-1]] 111 | a = 1 112 | while count[a] != f: 113 | a += 1 114 | a12_allele[j, 0] = a 115 | assert a12_allele[j, 0] != a12_allele[j, 1] 116 | if alleles[j][1] == "": 117 | a12_allele[j, 0] = -1 118 | # print( 119 | # self.root["variant_contig"][j], 120 | # self.root["variant_position"][j], 121 | # [j], 122 | # self.root["variant_allele"][j], 123 | # count, 124 | # argsort, 125 | # a12_allele[j], 126 | # ) 127 | return a12_allele 128 | 129 | def _write_genotypes(self): 130 | ci = retrieval.variant_chunk_iter( 131 | self.root, fields=["call_genotype", "variant_allele"] 132 | ) 133 | call_genotype = self.root["call_genotype"] 134 | a12_allele = zarr.zeros( 135 | (call_genotype.shape[0], 2), chunks=call_genotype.chunks[0], dtype=int 136 | ) 137 | with open(self.bed_path, "wb") as bed_file: 138 | bed_file.write(bytes([0x6C, 0x1B, 0x01])) 139 | 140 | for j, chunk in enumerate(ci): 141 | G = chunk["call_genotype"] 142 | a12 = self._compute_alleles(G, chunk["variant_allele"]) 143 | buff = 
encode_genotypes(G, a12) 144 | bed_file.write(buff) 145 | a12_allele.blocks[j] = a12 146 | return a12_allele[:] 147 | 148 | def run(self): 149 | a12_allele = self._write_genotypes() 150 | 151 | with open(self.bim_path, "w") as f: 152 | f.write(generate_bim(self.root, a12_allele)) 153 | 154 | with open(self.fam_path, "w") as f: 155 | f.write(generate_fam(self.root)) 156 | 157 | 158 | def write_plink(vcz_path, out, include=None, exclude=None): 159 | out_prefix = pathlib.Path(out) 160 | # out_prefix.mkdir(exist_ok=True) 161 | writer = Writer( 162 | vcz_path, 163 | bed_path=out_prefix.with_suffix(".bed"), 164 | fam_path=out_prefix.with_suffix(".fam"), 165 | bim_path=out_prefix.with_suffix(".bim"), 166 | include=include, 167 | exclude=exclude, 168 | ) 169 | writer.run() 170 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | merge_group: 5 | pull_request: 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | pre-commit: 12 | name: Lint 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.11' 19 | - uses: pre-commit/action@v3.0.1 20 | python_test: 21 | name: Python tests 22 | runs-on: ${{ matrix.os }} 23 | defaults: 24 | run: 25 | shell: bash -el {0} 26 | strategy: 27 | matrix: 28 | # Use macos-13 because pip binary packages for ARM aren't 29 | # available for many dependencies 30 | os: [macos-13, macos-14, ubuntu-latest] 31 | python-version: ["3.10", "3.11", "3.12"] 32 | exclude: 33 | # Just run macos tests on one Python version 34 | - os: macos-13 35 | python-version: "3.10" 36 | - os: macos-13 37 | python-version: "3.11" 38 | - os: macos-13 39 | python-version: "3.12" 40 | - os: macos-14 41 | python-version: "3.10" 42 | - os: macos-14 43 | python-version: "3.12" 44 | steps: 45 | - uses: actions/checkout@v4 46 | 
- name: Set up Miniconda with Python ${{ matrix.python-version }} 47 | uses: conda-incubator/setup-miniconda@v3 48 | with: 49 | auto-update-conda: true 50 | python-version: ${{ matrix.python-version }} 51 | channels: conda-forge,bioconda 52 | - name: Install dependencies 53 | run: | 54 | conda install bcftools plink 55 | python -m pip install --upgrade pip 56 | python -m pip install '.[dev]' 57 | # Build the extension module in-place so pytest can find it 58 | python3 setup.py build_ext --inplace 59 | - name: Run tests 60 | run: | 61 | pytest 62 | 63 | c_python_test: 64 | name: CPython interface tests 65 | runs-on: ubuntu-latest 66 | steps: 67 | - uses: actions/checkout@v4 68 | - name: Set up Python 69 | uses: actions/setup-python@v5 70 | with: 71 | python-version: '3.11' 72 | - name: Install system dependencies 73 | run: | 74 | sudo apt install -y gcovr 75 | - name: Install python dependencies 76 | run: | 77 | python -m pip install --upgrade pip 78 | python -m pip install numpy pytest pytest_cov 79 | - name: Build module with coverage 80 | run: | 81 | # Build the extension module in-place so pytest can find it 82 | CFLAGS="--coverage" python3 setup.py build_ext --inplace 83 | - name: Run tests 84 | run: | 85 | pytest -vs tests/test_cpython_interface.py 86 | - name: Show coverage 87 | run: | 88 | gcovr --filter vcztools 89 | 90 | c_test: 91 | name: C tests 92 | runs-on: ubuntu-latest 93 | steps: 94 | - uses: actions/checkout@v4 95 | - name: Install dependencies 96 | run: | 97 | sudo apt install -y ninja-build libcunit1-dev valgrind meson gcovr 98 | - name: Build 99 | working-directory: ./lib 100 | run: | 101 | meson setup -Db_coverage=true build 102 | - name: Tests 103 | working-directory: ./lib 104 | run: | 105 | ninja -C build test 106 | - name: Show coverage 107 | working-directory: ./lib 108 | run: | 109 | ninja -C build coverage-text 110 | cat build/meson-logs/coverage.txt 111 | - name: Valgrind 112 | working-directory: ./lib 113 | run: | 114 | valgrind 
--leak-check=full --error-exitcode=1 ./build/tests 115 | 116 | packaging: 117 | name: Packaging 118 | runs-on: ubuntu-latest 119 | steps: 120 | - uses: actions/checkout@v4 121 | - uses: actions/setup-python@v5 122 | with: 123 | python-version: '3.11' 124 | - name: Install dependencies 125 | run: | 126 | python -m pip install --upgrade pip 127 | python -m pip install build twine validate-pyproject[all] 128 | - name: Check and install package 129 | run: | 130 | validate-pyproject pyproject.toml 131 | python -m build 132 | python -m twine check --strict dist/* 133 | python -m pip install dist/*.whl 134 | - name: Check vcztools CLI 135 | run: | 136 | vcztools --help 137 | # Make sure we don't have ``vcztools`` in the CWD 138 | cd tests 139 | python -m vcztools --help 140 | 141 | test-numpy-version: 142 | name: Test numpy versions 143 | runs-on: ubuntu-latest 144 | defaults: 145 | run: 146 | shell: bash -el {0} 147 | strategy: 148 | matrix: 149 | numpy: ["==1.26", ">=2"] 150 | steps: 151 | - uses: actions/checkout@v4 152 | - name: Set up Miniconda 153 | uses: conda-incubator/setup-miniconda@v3 154 | with: 155 | auto-update-conda: true 156 | python-version: '3.11' 157 | channels: conda-forge,bioconda 158 | - name: Install dependencies 159 | run: | 160 | conda install bcftools plink 161 | python -m pip install --upgrade pip 162 | python -m pip install '.[dev]' 163 | # Build the extension module in-place so pytest can find it 164 | python3 setup.py build_ext --inplace 165 | - name: Install numpy${{ matrix.numpy }} 166 | run: | 167 | python -m pip install 'numpy${{ matrix.numpy }}' 168 | - name: Run tests 169 | run: | 170 | pytest 171 | 172 | test-zarr-version: 173 | name: Test Zarr versions 174 | runs-on: ubuntu-latest 175 | defaults: 176 | run: 177 | shell: bash -el {0} 178 | strategy: 179 | matrix: 180 | zarr: ["==2.18.3", ">=3,!=3.0.5"] 181 | steps: 182 | - uses: actions/checkout@v4 183 | - name: Set up Miniconda 184 | uses: conda-incubator/setup-miniconda@v3 185 | 
with: 186 | auto-update-conda: true 187 | python-version: '3.11' 188 | channels: conda-forge,bioconda 189 | - name: Install dependencies 190 | run: | 191 | conda install bcftools plink 192 | python -m pip install --upgrade pip 193 | python -m pip install '.[dev]' 194 | # Build the extension module in-place so pytest can find it 195 | python3 setup.py build_ext --inplace 196 | - name: Install zarr${{ matrix.zarr }} 197 | run: | 198 | python -m pip install 'zarr${{ matrix.zarr }}' 199 | - name: Run tests 200 | run: | 201 | pytest 202 | -------------------------------------------------------------------------------- /vcztools/regions.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyranges import PyRanges 7 | 8 | 9 | def parse_region_string(region: str) -> tuple[str, int | None, int | None]: 10 | """Return the contig, start position and end position from a region string.""" 11 | if re.search(r":\d+-\d*$", region): 12 | contig, start_end = region.rsplit(":", 1) 13 | start, end = start_end.split("-") 14 | return contig, int(start), int(end) if len(end) > 0 else None 15 | elif re.search(r":\d+$", region): 16 | contig, start = region.rsplit(":", 1) 17 | return contig, int(start), int(start) 18 | else: 19 | contig = region 20 | return contig, None, None 21 | 22 | 23 | def regions_to_pyranges( 24 | regions: list[tuple[str, int | None, int | None]], all_contigs: list[str] 25 | ) -> PyRanges: 26 | """Convert region tuples to a PyRanges object.""" 27 | 28 | chromosomes = [] 29 | starts = [] 30 | ends = [] 31 | for contig, start, end in regions: 32 | if start is None: 33 | start = 0 34 | else: 35 | start -= 1 36 | 37 | if end is None: 38 | end = np.iinfo(np.int64).max 39 | 40 | chromosomes.append(all_contigs.index(contig)) 41 | starts.append(start) 42 | ends.append(end) 43 | 44 | return PyRanges(chromosomes=chromosomes, starts=starts, 
ends=ends)


def parse_regions(
    regions: list[str] | str | None, all_contigs: list[str]
) -> PyRanges | None:
    """Return a PyRanges object from a comma-separated set of region strings,
    or a list of region strings."""
    if regions is None:
        return None
    elif isinstance(regions, list):
        regions_list = regions
    else:
        regions_list = regions.split(",")
    return regions_to_pyranges(
        [parse_region_string(region) for region in regions_list], all_contigs
    )


def parse_targets(
    targets: list[str] | str | None, all_contigs: list[str]
) -> tuple[PyRanges | None, bool]:
    """Return a PyRanges object from a comma-separated set of region strings,
    optionally preceded by a ^ character to indicate complement,
    or a list of region strings."""
    if targets is None:
        return None, False
    elif isinstance(targets, list):
        # A list of targets is never treated as complemented; the ^ prefix
        # is only recognised on the comma-separated string form.
        targets_list = targets
        complement = False
    else:
        complement = targets.startswith("^")
        targets_list = (targets[1:] if complement else targets).split(",")
    return (
        parse_regions(targets_list, all_contigs),
        complement,
    )


def regions_to_chunk_indexes(
    regions: PyRanges | None,
    targets: PyRanges | None,
    complement: bool,
    regions_index: Any,
):
    """Return chunk indexes that overlap the given regions or targets.

    If both regions and targets are specified then only regions are used
    to find overlapping chunks (since targets are used later to refine).

    If only targets are specified then they are used to find overlapping chunks,
    taking into account the complement flag.
    """

    # Create pyranges for chunks using the region index.
99 | # For regions use max end position, for targets just end position 100 | chunk_index = regions_index[:, 0] 101 | contig_id = regions_index[:, 1] 102 | start_position = regions_index[:, 2] 103 | end_position = regions_index[:, 3] 104 | max_end_position = regions_index[:, 4] 105 | df = pd.DataFrame( 106 | { 107 | "chunk_index": chunk_index, 108 | "Chromosome": contig_id, 109 | "Start": start_position, 110 | "End": max_end_position if regions is not None else end_position, 111 | } 112 | ) 113 | chunk_regions = PyRanges(df) 114 | 115 | if regions is not None: 116 | overlap = chunk_regions.overlap(regions) 117 | elif complement: 118 | overlap = chunk_regions.subtract(targets) 119 | else: 120 | overlap = chunk_regions.overlap(targets) 121 | if overlap.empty: 122 | return np.empty((0,), dtype=np.int64) 123 | chunk_indexes = overlap.df["chunk_index"].to_numpy() 124 | chunk_indexes = np.unique(chunk_indexes) 125 | return chunk_indexes 126 | 127 | 128 | def regions_to_selection( 129 | regions: PyRanges | None, 130 | targets: PyRanges | None, 131 | complement: bool, 132 | variant_contig: Any, 133 | variant_position: Any, 134 | variant_length: Any, 135 | ): 136 | """Return a variant selection that corresponds to the given regions and targets. 137 | 138 | If both regions and targets are specified then they are both used to find 139 | overlapping variants. 
140 | """ 141 | 142 | # subtract 1 from start coordinate to convert intervals 143 | # from VCF (1-based, fully-closed) to Python (0-based, half-open) 144 | variant_start = variant_position - 1 145 | 146 | if regions is not None: 147 | variant_end = variant_start + variant_length 148 | df = pd.DataFrame( 149 | {"Chromosome": variant_contig, "Start": variant_start, "End": variant_end} 150 | ) 151 | # save original index as column so we can retrieve it after finding overlap 152 | df["index"] = df.index 153 | variant_regions = PyRanges(df) 154 | else: 155 | variant_regions = None 156 | 157 | if targets is not None: 158 | targets_variant_end = variant_position # length 1 159 | df = pd.DataFrame( 160 | { 161 | "Chromosome": variant_contig, 162 | "Start": variant_start, 163 | "End": targets_variant_end, 164 | } 165 | ) 166 | # save original index as column so we can retrieve it after finding overlap 167 | df["index"] = df.index 168 | variant_targets = PyRanges(df) 169 | else: 170 | variant_targets = None 171 | 172 | if variant_regions is not None: 173 | regions_overlap = variant_regions.overlap(regions) 174 | else: 175 | regions_overlap = None 176 | 177 | if variant_targets is not None: 178 | if complement: 179 | targets_overlap = variant_targets.subtract(targets) 180 | else: 181 | targets_overlap = variant_targets.overlap(targets) 182 | else: 183 | targets_overlap = None 184 | 185 | if regions_overlap is not None and targets_overlap is not None: 186 | overlap = regions_overlap.overlap(targets_overlap) 187 | elif regions_overlap is not None: 188 | overlap = regions_overlap 189 | else: 190 | overlap = targets_overlap 191 | 192 | if overlap.empty: 193 | return np.empty((0,), dtype=np.int64) 194 | return overlap.df["index"].to_numpy() 195 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from unittest import mock 3 | 4 | 
import click.testing as ct 5 | import pytest 6 | 7 | import vcztools.cli as cli 8 | from tests.test_bcftools_validation import run_vcztools 9 | from tests.utils import vcz_path_cache 10 | from vcztools import provenance 11 | 12 | 13 | @pytest.fixture() 14 | def vcz_path(): 15 | vcf_path = pathlib.Path("tests/data/vcf/sample.vcf.gz") 16 | return vcz_path_cache(vcf_path) 17 | 18 | 19 | def test_version_header(vcz_path): 20 | output, _ = run_vcztools(f"view {vcz_path}") 21 | assert output.find("##vcztools_viewCommand=") >= 0 22 | assert output.find("Date=") >= 0 23 | 24 | 25 | class TestOutput: 26 | def test_view_unsupported_output(self, tmp_path, vcz_path): 27 | bad_output = tmp_path / "output.vcf.gz" 28 | 29 | _, vcztools_error = run_vcztools( 30 | f"view --no-version {vcz_path} -o {bad_output}", expect_error=True 31 | ) 32 | assert ( 33 | "Only uncompressed VCF output supported, suffix .gz not allowed" 34 | in vcztools_error 35 | ) 36 | 37 | @pytest.mark.parametrize("suffix", ["gz", "bgz", "bcf"]) 38 | def test_view_unsupported_output_suffix(self, tmp_path, vcz_path, suffix): 39 | bad_output = tmp_path / f"output.vcf.{suffix}" 40 | 41 | _, vcztools_error = run_vcztools( 42 | f"view --no-version {vcz_path} -o {bad_output}", expect_error=True 43 | ) 44 | assert f".{suffix} not allowed" in vcztools_error 45 | 46 | def test_view_good_path(self, tmp_path, vcz_path): 47 | output_path = tmp_path / "tmp.vcf" 48 | runner = ct.CliRunner() 49 | result = runner.invoke( 50 | cli.vcztools_main, 51 | f"view --no-version {vcz_path} -o {output_path}", 52 | catch_exceptions=False, 53 | ) 54 | assert result.exit_code == 0 55 | assert len(result.stdout) == 0 56 | assert output_path.exists() 57 | 58 | def test_view_write_directory(self, tmp_path, vcz_path): 59 | runner = ct.CliRunner() 60 | result = runner.invoke( 61 | cli.vcztools_main, 62 | f"view --no-version {vcz_path} -o {tmp_path}", 63 | catch_exceptions=False, 64 | ) 65 | assert result.exit_code == 1 66 | assert 
len(result.stdout) == 0
        assert "Is a directory" in result.stderr

    # NOTE(review): this test is a verbatim copy of test_view_write_directory
    # above and does not exercise writing to a pipe at all — looks like a
    # copy-paste placeholder. TODO: rewrite it to write to an actual pipe
    # (or remove it) once the intended behaviour is confirmed.
    def test_view_write_pipe(self, tmp_path, vcz_path):
        runner = ct.CliRunner()
        result = runner.invoke(
            cli.vcztools_main,
            f"view --no-version {vcz_path} -o {tmp_path}",
            catch_exceptions=False,
        )
        assert result.exit_code == 1
        assert len(result.stdout) == 0
        assert "Is a directory" in result.stderr


def test_excluding_and_including_samples(vcz_path):
    # Combining -s (sample list) with -S (samples file) must be rejected,
    # whichever of the two carries the ^ (complement) prefix.
    samples_file_path = pathlib.Path("tests/data/txt/samples.txt")
    error_message = "vcztools does not support combining -s and -S"

    _, vcztools_error = run_vcztools(
        f"view {vcz_path} -s NA00001 -S ^{samples_file_path}", expect_error=True
    )
    assert error_message in vcztools_error
    _, vcztools_error = run_vcztools(
        f"view {vcz_path} -s ^NA00001 -S {samples_file_path}", expect_error=True
    )
    assert error_message in vcztools_error


@mock.patch("sys.exit")
@mock.patch("os.dup2")
def test_broken_pipe(mocked_dup2, mocked_exit, tmp_path):
    # handle_broken_pipe should swallow the BrokenPipeError, redirect the
    # dangling fd (os.dup2) and exit with status 1.
    with open(tmp_path / "tmp.txt", "w") as output:
        with cli.handle_broken_pipe(output):
            raise BrokenPipeError()
    mocked_dup2.assert_called_once()
    mocked_exit.assert_called_once_with(1)


class TestQuery:
    def test_format_required(self, vcz_path):
        runner = ct.CliRunner()
        result = runner.invoke(
            cli.vcztools_main,
            f"query {vcz_path} ",
            catch_exceptions=False,
        )
        assert result.exit_code != 0
        assert len(result.stdout) == 0
        assert len(result.stderr) > 0

    def test_path_required(self):
        runner = ct.CliRunner()
        result = runner.invoke(
            cli.vcztools_main,
            "query --format=POS ",
            catch_exceptions=False,
        )
        assert result.exit_code != 0
        assert len(result.stdout) == 0
        assert len(result.stderr) > 0

    def test_list(self, vcz_path):
result, _ = run_vcztools(f"query -l {vcz_path}") 130 | assert list(result.splitlines()) == ["NA00001", "NA00002", "NA00003"] 131 | 132 | def test_list_ignores_output(self, vcz_path, tmp_path): 133 | output = tmp_path / "tmp.txt" 134 | result, _ = run_vcztools(f"query -l {vcz_path} -o {output}") 135 | assert list(result.splitlines()) == ["NA00001", "NA00002", "NA00003"] 136 | assert not output.exists() 137 | 138 | def test_output(self, vcz_path, tmp_path): 139 | output = tmp_path / "tmp.txt" 140 | result, _ = run_vcztools(f"query -f '%POS\n' {vcz_path} -o {output}") 141 | assert list(result.splitlines()) == [] 142 | assert output.exists() 143 | 144 | 145 | class TestIndex: 146 | def test_stats(self, vcz_path): 147 | result, _ = run_vcztools(f"index -s {vcz_path}") 148 | assert list(result.splitlines()) == ["19\t.\t2", "20\t.\t6", "X\t.\t1"] 149 | 150 | def test_nrecords(self, vcz_path): 151 | result, _ = run_vcztools(f"index -n {vcz_path}") 152 | assert list(result.splitlines()) == ["9"] 153 | 154 | def test_stats_and_nrecords(self, vcz_path): 155 | runner = ct.CliRunner() 156 | result = runner.invoke( 157 | cli.vcztools_main, 158 | f"index -ns {vcz_path}", 159 | catch_exceptions=False, 160 | ) 161 | assert result.exit_code != 0 162 | assert len(result.stdout) == 0 163 | assert len(result.stderr) > 0 164 | assert "Expected only one of --stats or --nrecords options" in result.stderr 165 | 166 | def test_no_stats_or_nrecords(self, vcz_path): 167 | runner = ct.CliRunner() 168 | result = runner.invoke( 169 | cli.vcztools_main, 170 | f"index {vcz_path}", 171 | catch_exceptions=False, 172 | ) 173 | assert result.exit_code != 0 174 | assert len(result.stdout) == 0 175 | assert len(result.stderr) > 0 176 | assert "Error: Building region indexes is not supported" in result.stderr 177 | 178 | 179 | def test_top_level(): 180 | runner = ct.CliRunner() 181 | result = runner.invoke( 182 | cli.vcztools_main, 183 | catch_exceptions=False, 184 | ) 185 | assert result.exit_code != 0 
186 | assert len(result.stdout) == 0 187 | assert len(result.stderr) > 0 188 | 189 | 190 | def test_version(): 191 | runner = ct.CliRunner() 192 | result = runner.invoke(cli.vcztools_main, ["--version"], catch_exceptions=False) 193 | s = f"version {provenance.__version__}\n" 194 | assert result.stdout.endswith(s) 195 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | from io import StringIO 4 | 5 | import numpy as np 6 | import pyparsing as pp 7 | import pytest 8 | import zarr 9 | 10 | from tests.utils import vcz_path_cache 11 | from vcztools.query import ( 12 | QueryFormatGenerator, 13 | QueryFormatParser, 14 | list_samples, 15 | write_query, 16 | ) 17 | from vcztools.retrieval import variant_chunk_iter 18 | 19 | 20 | def test_list_samples(tmp_path): 21 | vcf_path = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 22 | vcz_path = vcz_path_cache(vcf_path) 23 | expected_output = "NA00001\nNA00002\nNA00003\n" 24 | 25 | with StringIO() as output: 26 | list_samples(vcz_path, output) 27 | assert output.getvalue() == expected_output 28 | 29 | 30 | class TestQueryFormatParser: 31 | @pytest.fixture() 32 | def parser(self): 33 | return QueryFormatParser() 34 | 35 | @pytest.mark.parametrize( 36 | ("expression", "expected_result"), 37 | [ 38 | ("%CHROM", ["%CHROM"]), 39 | (r"\n", ["\n"]), 40 | (r"\t", ["\t"]), 41 | (r"%CHROM\n", ["%CHROM", "\n"]), 42 | ("%CHROM %POS %REF", ["%CHROM", " ", "%POS", " ", "%REF"]), 43 | (r"%CHROM %POS0 %REF\n", ["%CHROM", " ", "%POS0", " ", "%REF", "\n"]), 44 | ( 45 | r"%CHROM\t%POS\t%REF\t%ALT{0}\n", 46 | ["%CHROM", "\t", "%POS", "\t", "%REF", "\t", ["%ALT", 0], "\n"], 47 | ), 48 | ( 49 | r"%CHROM\t%POS0\t%END\t%ID\n", 50 | ["%CHROM", "\t", "%POS0", "\t", "%END", "\t", "%ID", "\n"], 51 | ), 52 | (r"%CHROM:%POS\n", ["%CHROM", ":", "%POS", "\n"]), 53 | (r"%AC{1}\n", [["%AC", 1], 
"\n"]), 54 | ( 55 | r"Read depth: %INFO/DP\n", 56 | ["Read", " ", "depth:", " ", "%INFO/DP", "\n"], 57 | ), 58 | ( 59 | r"%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n", 60 | [ 61 | "%CHROM", 62 | "\t", 63 | "%POS", 64 | "\t", 65 | "%REF", 66 | "\t", 67 | "%ALT", 68 | ["\t", "%SAMPLE", "=", "%GT"], 69 | "\n", 70 | ], 71 | ), 72 | ( 73 | r"%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT{0}]\n", 74 | [ 75 | "%CHROM", 76 | "\t", 77 | "%POS", 78 | "\t", 79 | "%REF", 80 | "\t", 81 | "%ALT", 82 | ["\t", "%SAMPLE", "=", ["%GT", 0]], 83 | "\n", 84 | ], 85 | ), 86 | ( 87 | r"GQ:[ %GQ] \t GT:[ %GT]\n", 88 | ["GQ:", [" ", "%GQ"], " ", "\t", " ", "GT:", [" ", "%GT"], "\n"], 89 | ), 90 | ( 91 | r"[%SAMPLE %GT %DP\n]", 92 | [["%SAMPLE", " ", "%GT", " ", "%DP", "\n"]], 93 | ), 94 | ], 95 | ) 96 | def test_valid_expressions(self, parser, expression, expected_result): 97 | assert parser(expression).as_list() == expected_result 98 | 99 | @pytest.mark.parametrize( 100 | "expression", 101 | [ 102 | "%ac", 103 | "%AC {1}", 104 | "% CHROM", 105 | ], 106 | ) 107 | def test_invalid_expressions(self, parser, expression): 108 | with pytest.raises(pp.ParseException): 109 | parser(expression) 110 | 111 | 112 | class TestQueryFormatEvaluator: 113 | @pytest.fixture() 114 | def root(self): 115 | vcf_path = pathlib.Path("tests/data/vcf/sample.vcf.gz") 116 | vcz_path = vcz_path_cache(vcf_path) 117 | return zarr.open(vcz_path, mode="r") 118 | 119 | @pytest.mark.parametrize( 120 | ("query_format", "expected_result"), 121 | [ 122 | (r"A\t", "A\t" * 9), 123 | (r"CHROM", "CHROM" * 9), 124 | ( 125 | r"%CHROM:%POS\n", 126 | "19:111\n19:112\n20:14370\n20:17330\n20:1110696\n20:1230237\n20:1234567\n20:1235237\nX:10\n", 127 | ), 128 | (r"%INFO/DP\n", ".\n.\n14\n11\n10\n13\n9\n.\n.\n"), 129 | (r"%AC\n", ".\n.\n.\n.\n.\n.\n1,1\n.\n.\n"), 130 | (r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n1\n.\n.\n"), 131 | ], 132 | ) 133 | def test(self, root, query_format, expected_result): 134 | generator = QueryFormatGenerator( 135 | query_format, 
136 | root["sample_id"][:], 137 | root["contig_id"][:], 138 | root["filter_id"][:], 139 | ) 140 | chunk_data = next(variant_chunk_iter(root)) 141 | result = "".join(generator(chunk_data)) 142 | assert result == expected_result 143 | 144 | # fmt: off 145 | @pytest.mark.parametrize( 146 | ("query_format", "call_mask", "expected_result"), 147 | [ 148 | ( 149 | r"[%DP ]\n", 150 | None, 151 | ". . . \n. . . \n1 8 5 \n3 5 3 \n6 0 4 \n. 4 2 \n4 2 3 \n. . . \n. . . \n", # noqa: E501 152 | ), 153 | ( 154 | r"[%DP ]\n", 155 | np.array( 156 | [ 157 | [1, 1, 1,], 158 | [1, 1, 1,], 159 | [1, 0, 1,], 160 | [1, 1, 1,], 161 | [1, 1, 1,], 162 | [1, 1, 1,], 163 | [1, 1, 1,], 164 | [1, 1, 1,], 165 | [1, 1, 1,], 166 | ] 167 | ), 168 | ". . . \n. . . \n1 5 \n3 5 3 \n6 0 4 \n. 4 2 \n4 2 3 \n. . . \n. . . \n", # noqa: E501 169 | ), 170 | ], 171 | ) 172 | # fmt: on 173 | def test_call_mask(self, root, query_format, call_mask, expected_result): 174 | generator = QueryFormatGenerator( 175 | query_format, 176 | root["sample_id"][:], 177 | root["contig_id"][:], 178 | root["filter_id"][:], 179 | ) 180 | chunk_data = next(variant_chunk_iter(root)) 181 | if call_mask is not None: 182 | chunk_data["call_mask"] = call_mask 183 | result = "".join(generator(chunk_data)) 184 | assert result == expected_result 185 | 186 | @pytest.mark.parametrize( 187 | ("query_format", "expected_result"), 188 | [(r"%QUAL\n", "9.6\n10\n29\n3\n67\n47\n50\n.\n10\n")], 189 | ) 190 | def test_with_parse_results(self, root, query_format, expected_result): 191 | parser = QueryFormatParser() 192 | parse_results = parser(query_format) 193 | generator = QueryFormatGenerator( 194 | parse_results, 195 | root["sample_id"][:], 196 | root["contig_id"][:], 197 | root["filter_id"][:], 198 | ) 199 | chunk_data = next(variant_chunk_iter(root)) 200 | result = "".join(generator(chunk_data)) 201 | assert result == expected_result 202 | 203 | 204 | def test_write_query__include_exclude(tmp_path): 205 | original = 
pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 206 | vcz = vcz_path_cache(original) 207 | output = tmp_path.joinpath("output.vcf") 208 | 209 | query_format = r"%POS\n" 210 | variant_site_filter = "POS > 1" 211 | 212 | with pytest.raises( 213 | ValueError, 214 | match=re.escape( 215 | "Cannot handle both an include expression and an exclude expression." 216 | ), 217 | ): 218 | write_query( 219 | vcz, 220 | output, 221 | query_format=query_format, 222 | include=variant_site_filter, 223 | exclude=variant_site_filter, 224 | ) 225 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from collections.abc import Iterator 3 | from contextlib import contextmanager 4 | from itertools import zip_longest 5 | 6 | import cyvcf2 7 | import numpy as np 8 | from bio2zarr import vcf 9 | 10 | 11 | @contextmanager 12 | def open_vcf(path) -> Iterator[cyvcf2.VCF]: 13 | """A context manager for opening a VCF file.""" 14 | vcf = cyvcf2.VCF(path) 15 | try: 16 | yield vcf 17 | finally: 18 | vcf.close() 19 | 20 | 21 | def normalise_info_missingness(info_dict, key): 22 | value = info_dict.get(key, None) 23 | if isinstance(value, tuple): 24 | if all(x is None for x in value): 25 | value = None 26 | elif isinstance(value, str): 27 | if all(x == "." 
for x in value.split(",")): 28 | value = None 29 | return value 30 | 31 | 32 | def _get_header_field_dicts(vcf, header_type): 33 | def to_dict(header_field): 34 | d = header_field.info(extra=True) 35 | del d[b"IDX"] # remove IDX since we don't care about ordering 36 | 37 | # cyvcf2 duplicates some keys as strings and bytes, so remove the bytes one 38 | for k in list(d.keys()): 39 | if isinstance(k, bytes) and k.decode("utf-8") in d: 40 | del d[k] 41 | return d 42 | 43 | return { 44 | field["ID"]: to_dict(field) 45 | for field in vcf.header_iter() 46 | if field["HeaderType"] == header_type 47 | } 48 | 49 | 50 | def _assert_header_field_dicts_equivalent(field_dicts1, field_dicts2): 51 | assert len(field_dicts1) == len(field_dicts2) 52 | 53 | for id in field_dicts1.keys(): 54 | assert id in field_dicts2 55 | field_dict1 = field_dicts1[id] 56 | field_dict2 = field_dicts2[id] 57 | 58 | assert len(field_dict1) == len(field_dict2) 59 | # all fields should be the same, except Number="." which can match any value 60 | for k in field_dict1.keys(): 61 | assert k in field_dict2 62 | v1 = field_dict1[k] 63 | v2 = field_dict2[k] 64 | if k == "Number" and (v1 == "." 
or v2 == "."): 65 | continue 66 | assert v1 == v2, f"Failed in field {id} with key {k}" 67 | 68 | 69 | def _assert_vcf_headers_equivalent(vcf1, vcf2): 70 | # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order 71 | # Other fields are ignored 72 | 73 | info1 = _get_header_field_dicts(vcf1, "INFO") 74 | info2 = _get_header_field_dicts(vcf2, "INFO") 75 | _assert_header_field_dicts_equivalent(info1, info2) 76 | 77 | format1 = _get_header_field_dicts(vcf1, "FORMAT") 78 | format2 = _get_header_field_dicts(vcf2, "FORMAT") 79 | _assert_header_field_dicts_equivalent(format1, format2) 80 | 81 | filter1 = _get_header_field_dicts(vcf1, "FILTER") 82 | filter2 = _get_header_field_dicts(vcf2, "FILTER") 83 | _assert_header_field_dicts_equivalent(filter1, filter2) 84 | 85 | contig1 = _get_header_field_dicts(vcf1, "CONTIG") 86 | contig2 = _get_header_field_dicts(vcf2, "CONTIG") 87 | _assert_header_field_dicts_equivalent(contig1, contig2) 88 | 89 | 90 | def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False): 91 | """Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files. 92 | 93 | Raises an `AssertionError` if two VCF files are not equal to one another. 94 | Float values in QUAL, INFO, or FORMAT fields are compared up to the 95 | desired tolerance. All other values must match exactly. 96 | 97 | Parameters 98 | ---------- 99 | f1 100 | Path to first VCF to compare. 101 | f2 102 | Path to second VCF to compare. 103 | rtol 104 | Relative tolerance. 105 | atol 106 | Absolute tolerance. 
107 | """ 108 | with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2: 109 | _assert_vcf_headers_equivalent(vcf1, vcf2) 110 | assert vcf1.samples == vcf2.samples 111 | 112 | count = 0 113 | for v1, v2 in zip_longest(vcf1, vcf2): 114 | if v1 is None and v2 is not None: 115 | raise AssertionError(f"Right contains extra variant: {v2}") 116 | if v1 is not None and v2 is None: 117 | raise AssertionError(f"Left contains extra variant: {v1}") 118 | 119 | count += 1 120 | 121 | assert v1.CHROM == v2.CHROM, f"CHROM not equal for variants\n{v1}{v2}" 122 | assert v1.POS == v2.POS, f"POS not equal for variants\n{v1}{v2}" 123 | assert v1.ID == v2.ID, f"ID not equal for variants\n{v1}{v2}" 124 | assert v1.REF == v2.REF, f"REF not equal for variants\n{v1}{v2}" 125 | assert v1.ALT == v2.ALT, f"ALT not equal for variants\n{v1}{v2}" 126 | np.testing.assert_allclose( 127 | np.array(v1.QUAL, dtype=np.float32), 128 | np.array(v2.QUAL, dtype=np.float32), 129 | rtol=rtol, 130 | atol=atol, 131 | err_msg=f"QUAL not equal for variants\n{v1}{v2}", 132 | ) 133 | assert set(v1.FILTERS) == set( 134 | v2.FILTERS 135 | ), f"FILTER not equal for variants\n{v1}{v2}" 136 | 137 | v1_info = dict(v1.INFO) 138 | v2_info = dict(v2.INFO) 139 | all_keys = set(v1_info.keys()) | set(v2_info.keys()) 140 | for k in all_keys: 141 | val1 = normalise_info_missingness(v1_info, k) 142 | val2 = normalise_info_missingness(v2_info, k) 143 | # values are python objects (not np arrays) 144 | if isinstance(val1, float) or ( 145 | isinstance(val1, tuple) and any(isinstance(v, float) for v in val1) 146 | ): 147 | np.testing.assert_allclose( 148 | np.array(val1, dtype=np.float32), 149 | np.array(val2, dtype=np.float32), 150 | rtol=rtol, 151 | atol=atol, 152 | err_msg=f"INFO {k} not equal for variants\n{v1}{v2}", 153 | ) 154 | else: 155 | assert val1 == val2, f"INFO {k} not equal for variants\n{v1}{v2}" 156 | 157 | # NOTE skipping this because it requires items to be in the same order. 
158 | # assert v1.FORMAT == v2.FORMAT, f"FORMAT not equal for variants\n{v1}{v2}" 159 | for field in v1.FORMAT: 160 | if field == "GT": 161 | assert ( 162 | v1.genotypes == v2.genotypes 163 | ), f"GT not equal for variants\n{v1}{v2}" 164 | else: 165 | val1 = v1.format(field) 166 | val2 = v2.format(field) 167 | if val2 is None: 168 | # FIXME this is a quick hack to workaround missing support for 169 | # dealing with the field missing vs all-elements-in-field 170 | # missing issue. 171 | # https://github.com/jeromekelleher/vcztools/issues/14 172 | assert [str(x) == "." for x in val1] 173 | else: 174 | if val1.dtype.kind == "f": 175 | np.testing.assert_allclose( 176 | val1, 177 | val2, 178 | rtol=rtol, 179 | atol=atol, 180 | err_msg=f"FORMAT {field} not equal for " 181 | f"variants\n{v1}{v2}", 182 | ) 183 | else: 184 | np.testing.assert_array_equal( 185 | val1, 186 | val2, 187 | err_msg=f"FORMAT {field} not equal for " 188 | f"variants\n{v1}{v2}", 189 | ) 190 | 191 | if not allow_zero_variants: 192 | assert count > 0, "No variants in file" 193 | 194 | 195 | def vcz_path_cache(vcf_path): 196 | """ 197 | Store converted files in a cache to speed up tests. We're not testing 198 | vcf2zarr here, so no point in running over and over again. 
199 | """ 200 | cache_path = pathlib.Path("vcz_test_cache") 201 | if not cache_path.exists(): 202 | cache_path.mkdir() 203 | cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz") 204 | if not cached_vcz_path.exists(): 205 | if vcf_path.name.startswith("chr22"): 206 | vcf.convert( 207 | [vcf_path], 208 | cached_vcz_path, 209 | worker_processes=0, 210 | variants_chunk_size=10, 211 | samples_chunk_size=10, 212 | ) 213 | else: 214 | vcf.convert( 215 | [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False 216 | ) 217 | return cached_vcz_path 218 | -------------------------------------------------------------------------------- /vcztools/retrieval.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | import numpy as np 4 | import zarr 5 | 6 | from vcztools import filter as filter_mod 7 | from vcztools.regions import ( 8 | parse_regions, 9 | parse_targets, 10 | regions_to_chunk_indexes, 11 | regions_to_selection, 12 | ) 13 | from vcztools.samples import parse_samples 14 | 15 | 16 | # NOTE: this class is just a skeleton for now. The idea is that this 17 | # will provide readahead, caching etc, and will be the central location 18 | # for fetching bulk Zarr data. 19 | class VariantChunkReader(collections.abc.Sequence): 20 | """ 21 | Retrieve data from a Zarr store and return chunk-by-chunk in the 22 | variants dimension. 
23 | """ 24 | 25 | def __init__(self, root, *, fields=None): 26 | self.root = root 27 | if fields is None: 28 | fields = [ 29 | key 30 | for key in root.keys() 31 | if key.startswith("variant_") or key.startswith("call_") 32 | ] 33 | self.arrays = {key: self.root[key] for key in fields} 34 | # TODO validate the arrays have the correct shapes setc 35 | self.num_chunks = next(iter(self.arrays.values())).cdata_shape[0] 36 | 37 | def __len__(self): 38 | return self.num_chunks 39 | 40 | def __getitem__(self, chunk): 41 | return {key: array.blocks[chunk] for key, array in self.arrays.items()} 42 | 43 | def get_chunk_data(self, chunk, mask, samples_selection=None): 44 | num_samples = len(samples_selection) if samples_selection is not None else 0 45 | return { 46 | key: get_vchunk_array( 47 | array, 48 | chunk, 49 | mask, 50 | samples_selection 51 | if (key.startswith("call_") and num_samples > 0) 52 | else None, 53 | ) 54 | for key, array in self.arrays.items() 55 | } 56 | 57 | 58 | def get_vchunk_array(zarray, v_chunk, mask, samples_selection=None): 59 | v_chunksize = zarray.chunks[0] 60 | start = v_chunksize * v_chunk 61 | end = v_chunksize * (v_chunk + 1) 62 | if samples_selection is None: 63 | result = zarray[start:end] 64 | else: 65 | result = zarray.oindex[start:end, samples_selection] 66 | if mask is not None: 67 | result = result[mask] 68 | return result 69 | 70 | 71 | def variant_chunk_index_iter(root, regions=None, targets=None): 72 | """Iterate over variant chunk indexes that overlap the given regions or targets. 73 | 74 | Returns tuples of variant chunk indexes and (optional) variant masks. 75 | 76 | A variant mask of None indicates that all the variants in the chunk are included. 
77 | """ 78 | 79 | pos = root["variant_position"] 80 | 81 | if regions is None and targets is None: 82 | num_chunks = pos.cdata_shape[0] 83 | # no regions or targets selected 84 | for v_chunk in range(num_chunks): 85 | v_mask_chunk = None 86 | yield v_chunk, v_mask_chunk 87 | 88 | else: 89 | contigs_u = root["contig_id"][:].astype("U").tolist() 90 | regions_pyranges = parse_regions(regions, contigs_u) 91 | targets_pyranges, complement = parse_targets(targets, contigs_u) 92 | 93 | # Use the region index to find the chunks that overlap specfied regions or 94 | # targets 95 | region_index = root["region_index"][:] 96 | chunk_indexes = regions_to_chunk_indexes( 97 | regions_pyranges, 98 | targets_pyranges, 99 | complement, 100 | region_index, 101 | ) 102 | 103 | # Then use only load required variant_contig/position chunks 104 | if len(chunk_indexes) == 0: 105 | # no chunks - no variants to write 106 | return 107 | elif len(chunk_indexes) == 1: 108 | # single chunk 109 | block_sel = chunk_indexes[0] 110 | else: 111 | # zarr.blocks doesn't support int array indexing - use that when it does 112 | block_sel = slice(chunk_indexes[0], chunk_indexes[-1] + 1) 113 | 114 | region_variant_contig = root["variant_contig"].blocks[block_sel][:] 115 | region_variant_position = root["variant_position"].blocks[block_sel][:] 116 | region_variant_length = root["variant_length"].blocks[block_sel][:] 117 | 118 | # Find the final variant selection 119 | variant_selection = regions_to_selection( 120 | regions_pyranges, 121 | targets_pyranges, 122 | complement, 123 | region_variant_contig, 124 | region_variant_position, 125 | region_variant_length, 126 | ) 127 | variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool) 128 | variant_mask[variant_selection] = 1 129 | # Use zarr arrays to get mask chunks aligned with the main data 130 | # for convenience. 
131 | z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0]) 132 | 133 | for i, v_chunk in enumerate(chunk_indexes): 134 | v_mask_chunk = z_variant_mask.blocks[i] 135 | yield v_chunk, v_mask_chunk 136 | 137 | 138 | def variant_chunk_index_iter_with_filtering( 139 | root, 140 | *, 141 | regions=None, 142 | targets=None, 143 | include: str | None = None, 144 | exclude: str | None = None, 145 | ): 146 | """Iterate over variant chunk indexes that overlap the given regions or targets 147 | and which match the include/exclude filter expression. 148 | 149 | Returns tuples of variant chunk indexes and (optional) variant masks. 150 | 151 | A variant mask of None indicates that all the variants in the chunk are included. 152 | """ 153 | 154 | filter_expr = filter_mod.FilterExpression( 155 | field_names=set(root), include=include, exclude=exclude 156 | ) 157 | if filter_expr.parse_result is None: 158 | filter_expr = None 159 | else: 160 | filter_fields = list(filter_expr.referenced_fields) 161 | filter_fields_reader = VariantChunkReader(root, fields=filter_fields) 162 | 163 | for v_chunk, v_mask_chunk in variant_chunk_index_iter(root, regions, targets): 164 | if filter_expr is not None: 165 | chunk_data = filter_fields_reader[v_chunk] 166 | v_mask_chunk_filter = filter_expr.evaluate(chunk_data) 167 | if v_mask_chunk is None: 168 | v_mask_chunk = v_mask_chunk_filter 169 | else: 170 | if v_mask_chunk_filter.ndim == 2: 171 | v_mask_chunk = np.expand_dims(v_mask_chunk, axis=1) 172 | v_mask_chunk = np.logical_and(v_mask_chunk, v_mask_chunk_filter) 173 | if v_mask_chunk is None or np.any(v_mask_chunk): 174 | yield v_chunk, v_mask_chunk 175 | 176 | 177 | def variant_chunk_iter( 178 | root, 179 | *, 180 | fields: list[str] | None = None, 181 | regions=None, 182 | targets=None, 183 | include: str | None = None, 184 | exclude: str | None = None, 185 | samples_selection=None, 186 | ): 187 | if fields is not None and len(fields) == 0: 188 | return # empty iterator 189 | 
    query_fields_reader = VariantChunkReader(root, fields=fields)
    for v_chunk, v_mask_chunk in variant_chunk_index_iter_with_filtering(
        root,
        regions=regions,
        targets=targets,
        include=include,
        exclude=exclude,
    ):
        # The variants_selection is used to subset variant chunks along
        # the variants dimension.
        # The call_mask is returned to the client to indicate which samples
        # matched (for each variant) in the case of per-sample filtering.
        if v_mask_chunk is None or v_mask_chunk.ndim == 1:
            # Site-level mask (or no mask): no per-sample information.
            variants_selection = v_mask_chunk
            call_mask = None
        else:
            # 2D (variants, samples) mask: keep variants where any sample
            # matched, and retain the per-sample mask for those variants.
            variants_selection = np.any(v_mask_chunk, axis=1)
            call_mask = v_mask_chunk[variants_selection]
            if samples_selection is not None:
                call_mask = call_mask[:, samples_selection]
        chunk_data = query_fields_reader.get_chunk_data(
            v_chunk, variants_selection, samples_selection=samples_selection
        )
        if call_mask is not None:
            chunk_data["call_mask"] = call_mask
        yield chunk_data


def variant_iter(
    vcz,
    *,
    fields: list[str] | None = None,
    regions: str | None = None,
    targets: str | None = None,
    include: str | None = None,
    exclude: str | None = None,
    samples: list[str] | str | None = None,
):
    """Iterate over variants that overlap the given regions or targets
    and which match the include/exclude filter expression.

    Only values for the samples specified are returned.

    Returns dicts containing the specified fields keyed by VCF Zarr name.

    By default all fields for all variants and samples are returned.
235 | """ 236 | root = zarr.open(vcz, mode="r") 237 | all_samples = root["sample_id"][:] 238 | _, samples_selection = parse_samples(samples, all_samples) 239 | 240 | for chunk_data in variant_chunk_iter( 241 | root, 242 | fields=fields, 243 | regions=regions, 244 | targets=targets, 245 | include=include, 246 | exclude=exclude, 247 | samples_selection=samples_selection, 248 | ): 249 | # get first field in chunk_data to find number of variants 250 | field = next(iter(chunk_data.values())) 251 | num_variants = len(field) 252 | for i in range(num_variants): 253 | yield {name: chunk_data[name][i] for name in chunk_data.keys()} 254 | -------------------------------------------------------------------------------- /vcztools/cli.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import sys 4 | from functools import wraps 5 | 6 | import click 7 | 8 | from . import plink, provenance, vcf_writer 9 | from . import query as query_module 10 | from . import stats as stats_module 11 | 12 | 13 | @contextlib.contextmanager 14 | def handle_broken_pipe(output): 15 | """ 16 | Handle sigpipe following official advice: 17 | https://docs.python.org/3/library/signal.html#note-on-sigpipe 18 | """ 19 | try: 20 | yield 21 | # flush output here to force SIGPIPE to be triggered 22 | # while inside this try block. 23 | output.flush() 24 | except BrokenPipeError: 25 | # Python flushes standard streams on exit; redirect remaining output 26 | # to devnull to avoid another BrokenPipeError at shutdown 27 | devnull = os.open(os.devnull, os.O_WRONLY) 28 | os.dup2(devnull, sys.stdout.fileno()) 29 | sys.exit(1) # Python exits with error code 1 on EPIPE 30 | 31 | 32 | def handle_exception(func): 33 | """ 34 | Handle known application exceptions (ValueError) by converting to 35 | a ClickException, so the message is written to stderr and a non-zero exit 36 | code is set. 
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ValueError as e:
            # Convert to ClickException: message goes to stderr, exit code 1.
            raise click.ClickException(e) from e

    return wrapper


# Reusable click option decorators shared by several subcommands.
include = click.option(
    "-i", "--include", type=str, help="Filter expression to include variant sites."
)
exclude = click.option(
    "-e", "--exclude", type=str, help="Filter expression to exclude variant sites."
)
force_samples = click.option(
    "--force-samples", is_flag=True, help="Only warn about unknown sample subsets."
)
output = click.option(
    "-o",
    "--output",
    type=click.File("w"),
    default="-",
    help="File path to write output to (defaults to stdout '-').",
)
regions = click.option(
    "-r",
    "--regions",
    type=str,
    default=None,
    help="Regions to include.",
)
samples = click.option(
    "-s",
    "--samples",
    type=str,
    default=None,
    help="Samples to include.",
)
targets = click.option(
    "-t",
    "--targets",
    type=str,
    default=None,
    help="Target regions to include.",
)
version = click.version_option(version=f"{provenance.__version__}")


class NaturalOrderGroup(click.Group):
    """
    List commands in the order they are provided in the help text.
    """

    def list_commands(self, ctx):
        return self.commands.keys()


@click.command
@click.argument("path", type=click.Path())
@click.option(
    "-n",
    "--nrecords",
    is_flag=True,
    help="Print the number of records (variants).",
)
@click.option(
    "-s",
    "--stats",
    is_flag=True,
    help="Print per contig stats.",
)
@handle_exception
def index(path, nrecords, stats):
    """
    Query the number of records in a VCZ dataset. This subcommand only
    implements the --nrecords and --stats options and does not build any
    indexes.
    """
    if nrecords and stats:
        raise click.UsageError("Expected only one of --stats or --nrecords options")
    if nrecords:
        stats_module.nrecords(path, sys.stdout)
    elif stats:
        stats_module.stats(path, sys.stdout)
    else:
        # bcftools index builds indexes by default; vcztools does not.
        raise click.UsageError("Building region indexes is not supported")


@click.command
@click.argument("path", type=click.Path())
@output
@click.option(
    "-l",
    "--list-samples",
    is_flag=True,
    help="List the sample IDs and exit.",
)
@click.option(
    "-f",
    "--format",
    type=str,
    help="The format of the output.",
    default=None,
)
@regions
@force_samples
@samples
@targets
@include
@exclude
@handle_exception
def query(
    path,
    output,
    list_samples,
    format,
    regions,
    targets,
    force_samples,
    samples,
    include,
    exclude,
):
    """
    Transform VCZ into user-defined formats with efficient subsetting and
    filtering. Intended as a drop-in replacement for bcftools query, where we
    replace the VCF file path with a VCZ dataset URL.
168 | 169 | This is an early version and not feature complete: if you are missing a 170 | particular piece of functionality please open an issue at 171 | https://github.com/sgkit-dev/vcztools/issues 172 | """ 173 | if list_samples: 174 | # bcftools query -l ignores the --output option and always writes to stdout 175 | output = sys.stdout 176 | with handle_broken_pipe(output): 177 | query_module.list_samples(path, output) 178 | return 179 | 180 | if format is None: 181 | raise click.UsageError("Missing option -f / --format") 182 | with handle_broken_pipe(output): 183 | query_module.write_query( 184 | path, 185 | output, 186 | query_format=format, 187 | regions=regions, 188 | targets=targets, 189 | samples=samples, 190 | force_samples=force_samples, 191 | include=include, 192 | exclude=exclude, 193 | ) 194 | 195 | 196 | @click.command 197 | @click.argument("path", type=click.Path()) 198 | @output 199 | @click.option( 200 | "-h", 201 | "--header-only", 202 | is_flag=True, 203 | help="Output the VCF header only.", 204 | ) 205 | @click.option( 206 | "-H", 207 | "--no-header", 208 | is_flag=True, 209 | help="Suppress the header in VCF output.", 210 | ) 211 | @click.option( 212 | "--no-version", 213 | is_flag=True, 214 | help="Do not append version and command line information to the output VCF header.", 215 | ) 216 | @regions 217 | @force_samples 218 | @click.option( 219 | "-I", 220 | "--no-update", 221 | is_flag=True, 222 | help="Do not recalculate INFO fields for the sample subset.", 223 | ) 224 | @samples 225 | @click.option( 226 | "-S", 227 | "--samples-file", 228 | type=str, 229 | default=None, 230 | help="File of sample names to include.", 231 | ) 232 | @click.option( 233 | "-G", 234 | "--drop-genotypes", 235 | is_flag=True, 236 | help="Drop genotypes.", 237 | ) 238 | @targets 239 | @include 240 | @exclude 241 | @handle_exception 242 | def view( 243 | path, 244 | output, 245 | header_only, 246 | no_header, 247 | no_version, 248 | regions, 249 | targets, 250 | 
force_samples, 251 | no_update, 252 | samples, 253 | samples_file, 254 | drop_genotypes, 255 | include, 256 | exclude, 257 | ): 258 | """ 259 | Convert VCZ dataset to VCF with efficient subsetting and filtering. 260 | Intended as a drop-in replacement for bcftools view, where 261 | we replace the VCF file path with a VCZ dataset URL. 262 | 263 | This is an early version and not feature complete: if you are missing a 264 | particular piece of functionality please open an issue at 265 | https://github.com/sgkit-dev/vcztools/issues 266 | """ 267 | suffix = output.name.split(".")[-1] 268 | # Exclude suffixes which require bgzipped or BCF output: 269 | # https://github.com/samtools/htslib/blob/329e7943b7ba3f0af15b0eaa00a367a1ac15bd83/vcf.c#L3815 270 | if suffix in ["gz", "bcf", "bgz"]: 271 | raise ValueError( 272 | f"Only uncompressed VCF output supported, suffix .{suffix} not allowed" 273 | ) 274 | 275 | if samples_file: 276 | if samples is not None: 277 | raise ValueError("vcztools does not support combining -s and -S") 278 | 279 | samples = "" 280 | exclude_samples_file = samples_file.startswith("^") 281 | samples_file = samples_file.lstrip("^") 282 | 283 | with open(samples_file) as file: 284 | if exclude_samples_file: 285 | samples = "^" + samples 286 | samples += ",".join(line.strip() for line in file.readlines()) 287 | 288 | with handle_broken_pipe(output): 289 | vcf_writer.write_vcf( 290 | path, 291 | output, 292 | header_only=header_only, 293 | no_header=no_header, 294 | no_version=no_version, 295 | regions=regions, 296 | targets=targets, 297 | no_update=no_update, 298 | samples=samples, 299 | force_samples=force_samples, 300 | drop_genotypes=drop_genotypes, 301 | include=include, 302 | exclude=exclude, 303 | ) 304 | 305 | 306 | @click.command 307 | @click.argument("path", type=click.Path()) 308 | @include 309 | @exclude 310 | @click.option("--out", default="plink") 311 | def view_plink1(path, include, exclude, out): 312 | """ 313 | Generate a plink1 binary 
fileset compatible with plink1.9 --vcf. 314 | This command is equivalent to running ``vcztools view [filtering options] 315 | -o intermediate.vcf && plink 1.9 --vcf intermediate.vcf [plink options]`` 316 | without generating the intermediate VCF. 317 | """ 318 | plink.write_plink(path, out, include=include, exclude=exclude) 319 | 320 | 321 | @version 322 | @click.group(cls=NaturalOrderGroup, name="vcztools") 323 | def vcztools_main(): 324 | pass 325 | 326 | 327 | vcztools_main.add_command(index) 328 | vcztools_main.add_command(query) 329 | vcztools_main.add_command(view) 330 | # vcztools_main.add_command(view_plink1) 331 | -------------------------------------------------------------------------------- /tests/test_tskit_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for data originating from tskit format for compatibility 3 | with various outputs. 4 | """ 5 | 6 | import bio2zarr.plink as p2z 7 | import bio2zarr.tskit as ts2z 8 | import bio2zarr.vcf as v2z 9 | import msprime 10 | import numpy as np 11 | import numpy.testing as nt 12 | import pytest 13 | import sgkit as sg 14 | import tskit 15 | import xarray.testing as xt 16 | 17 | from vcztools.plink import write_plink 18 | from vcztools.vcf_writer import write_vcf 19 | 20 | 21 | def add_mutations(ts): 22 | # Add some mutation to the tree sequence. This guarantees that 23 | # we have variation at all sites > 0. 
24 | tables = ts.dump_tables() 25 | samples = ts.samples() 26 | states = "ACGT" 27 | for j in range(1, int(ts.sequence_length) - 1): 28 | site = tables.sites.add_row(j, ancestral_state=states[j % 4]) 29 | tables.mutations.add_row( 30 | site=site, 31 | derived_state=states[(j + 1) % 4], 32 | node=samples[j % ts.num_samples], 33 | ) 34 | return tables.tree_sequence() 35 | 36 | 37 | @pytest.fixture() 38 | def fx_diploid_msprime_sim(tmp_path): 39 | seed = 1234 40 | ts = msprime.sim_ancestry(5, sequence_length=10_000, random_seed=seed) 41 | ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=seed) 42 | assert ts.num_mutations > 0 43 | assert ts.num_mutations == ts.num_sites # make sure we have biallelic sites 44 | zarr_path = tmp_path / "sim.vcz" 45 | ts2z.convert(ts, zarr_path) 46 | return zarr_path 47 | 48 | 49 | @pytest.fixture() 50 | def fx_haploid_missing_data(tmp_path): 51 | # 2.00┊ 4 ┊ 52 | # ┊ ┏━┻┓ ┊ 53 | # 1.00┊ ┃ 3 ┊ 54 | # ┊ ┃ ┏┻┓ ┊ 55 | # 0.00┊ 0 1 2 5 ┊ 56 | # 0 10 57 | # | | 58 | # pos 2 9 59 | # anc A T 60 | ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence 61 | tables = ts.dump_tables() 62 | tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) 63 | tables.sites.add_row(2, ancestral_state="A") 64 | tables.sites.add_row(9, ancestral_state="T") 65 | tables.mutations.add_row(site=0, node=0, derived_state="G") 66 | tables.mutations.add_row(site=1, node=3, derived_state="C") 67 | zarr_path = tmp_path / "sim.vcz" 68 | ts2z.convert(tables.tree_sequence(), zarr_path, isolated_as_missing=True) 69 | return zarr_path 70 | 71 | 72 | def test_haploid_missing_data(fx_haploid_missing_data): 73 | ds = sg.load_dataset(fx_haploid_missing_data) 74 | nt.assert_array_equal( 75 | ds.call_genotype.values, 76 | [ 77 | [[1], [0], [0], [-1]], 78 | [[0], [1], [1], [-1]], 79 | ], 80 | ) 81 | 82 | 83 | @pytest.fixture() 84 | def fx_diploid_missing_data(tmp_path): 85 | # 2.00┊ 6 ┊ 86 | # ┊ ┏━┻━┓ ┊ 87 | # 1.00┊ 4 5 ┊ 88 | # ┊ ┏┻┓ ┏┻┓ ┊ 89 | # 0.00┊ 0 1 2 3 7 8┊ 90 | # 0 
10 91 | # | | 92 | # pos 2 9 93 | # anc A T 94 | ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence 95 | tables = ts.dump_tables() 96 | tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) 97 | u = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) 98 | assert u == 8 99 | tables.sites.add_row(2, ancestral_state="A") 100 | tables.sites.add_row(9, ancestral_state="T") 101 | tables.mutations.add_row(site=0, node=0, derived_state="G") 102 | tables.mutations.add_row(site=1, node=5, derived_state="C") 103 | zarr_path = tmp_path / "sim.vcz" 104 | ts = tables.tree_sequence() 105 | model_map = ts.map_to_vcf_model(ploidy=2) 106 | ts2z.convert(ts, zarr_path, model_mapping=model_map, isolated_as_missing=True) 107 | return zarr_path 108 | 109 | 110 | def test_diploid_missing_data(fx_diploid_missing_data): 111 | ds = sg.load_dataset(fx_diploid_missing_data) 112 | nt.assert_array_equal( 113 | ds.call_genotype.values, 114 | [ 115 | [[1, 0], [0, 0], [-1, -1]], 116 | [[0, 0], [1, 1], [-1, -1]], 117 | ], 118 | ) 119 | 120 | 121 | @pytest.fixture() 122 | def fx_diploid_multi_allelic(tmp_path): 123 | # 2.00┊ 6 ┊ 124 | # ┊ ┏━┻━┓ ┊ 125 | # 1.00┊ 4 5 ┊ 126 | # ┊ ┏┻┓ ┏┻┓ ┊ 127 | # 0.00┊ 0 1 2 3 ┊ 128 | # 0 10 129 | # | | 130 | # pos 2 9 131 | # anc A T 132 | ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence 133 | tables = ts.dump_tables() 134 | tables.sites.add_row(2, ancestral_state="A") 135 | tables.sites.add_row(9, ancestral_state="T") 136 | tables.mutations.add_row(site=0, node=0, derived_state="G") 137 | tables.mutations.add_row(site=1, node=1, derived_state="G") 138 | tables.mutations.add_row(site=1, node=5, derived_state="C") 139 | zarr_path = tmp_path / "sim.vcz" 140 | ts = tables.tree_sequence() 141 | model_map = ts.map_to_vcf_model(ploidy=2) 142 | ts2z.convert(ts, zarr_path, model_mapping=model_map) 143 | return zarr_path 144 | 145 | 146 | def test_diploid_multi_allelic(fx_diploid_multi_allelic): 147 | ds = sg.load_dataset(fx_diploid_multi_allelic) 
148 | # NOTE this example is constructed so that the rarest allele is in the middle 149 | # of the alleles array 150 | nt.assert_array_equal(ds.variant_allele.values, [["A", "G", ""], ["T", "G", "C"]]) 151 | nt.assert_array_equal( 152 | ds.call_genotype.values, 153 | [ 154 | [[1, 0], [0, 0]], 155 | [[0, 1], [2, 2]], 156 | ], 157 | ) 158 | 159 | 160 | @pytest.fixture() 161 | def fx_haploid_msprime_sim(tmp_path): 162 | seed = 12345 163 | ts = msprime.sim_ancestry(5, ploidy=1, sequence_length=100, random_seed=seed) 164 | ts = msprime.sim_mutations(ts, rate=0.5, random_seed=seed) 165 | assert ts.num_mutations > 0 166 | zarr_path = tmp_path / "sim.vcz" 167 | ts2z.convert(ts, zarr_path) 168 | return zarr_path 169 | 170 | 171 | def simple_ts_tables(): 172 | tables = tskit.TableCollection(sequence_length=100) 173 | for _ in range(4): 174 | ind = -1 175 | ind = tables.individuals.add_row() 176 | tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=ind) 177 | tables.nodes.add_row(flags=0, time=1) # MRCA for 0,1 178 | tables.nodes.add_row(flags=0, time=1) # MRCA for 2,3 179 | tables.edges.add_row(left=0, right=100, parent=4, child=0) 180 | tables.edges.add_row(left=0, right=100, parent=4, child=1) 181 | tables.edges.add_row(left=0, right=100, parent=5, child=2) 182 | tables.edges.add_row(left=0, right=100, parent=5, child=3) 183 | site_id = tables.sites.add_row(position=10, ancestral_state="A") 184 | tables.mutations.add_row(site=site_id, node=4, derived_state="TTTT") 185 | site_id = tables.sites.add_row(position=20, ancestral_state="CCC") 186 | tables.mutations.add_row(site=site_id, node=5, derived_state="G") 187 | site_id = tables.sites.add_row(position=30, ancestral_state="G") 188 | tables.mutations.add_row(site=site_id, node=0, derived_state="AA") 189 | 190 | tables.sort() 191 | return tables 192 | 193 | 194 | @pytest.fixture() 195 | def fx_simple_ts(tmp_path): 196 | ts = simple_ts_tables().tree_sequence() 197 | zarr_path = tmp_path / "sim.vcz" 198 | 
ts2z.convert(ts, zarr_path) 199 | return zarr_path 200 | 201 | 202 | # TODO add other fixtures here like stuff with odd mixtures of ploidy, 203 | # and zero variants (need to address 204 | # https://github.com/sgkit-dev/bio2zarr/issues/342 before zero variants 205 | # handled) 206 | 207 | 208 | class TestVcfRoundTrip: 209 | def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): 210 | vcf_path = tmp_path / "out.vcf" 211 | write_vcf(tskit_vcz, vcf_path) 212 | rt_vcz_path = tmp_path / "rt.vcz" 213 | v2z.convert([vcf_path], rt_vcz_path) 214 | ds1 = sg.load_dataset(tskit_vcz) 215 | ds2 = sg.load_dataset(rt_vcz_path) 216 | drop_fields = [ 217 | "variant_id", 218 | "variant_id_mask", 219 | "filter_id", 220 | "filter_description", 221 | "variant_filter", 222 | "variant_quality", 223 | ] 224 | xt.assert_equal(ds1, ds2.drop_vars(drop_fields)) 225 | num_variants = ds2.sizes["variants"] 226 | assert np.all(np.isnan(ds2["variant_quality"].values)) 227 | nt.assert_array_equal( 228 | ds2["variant_filter"], np.ones((num_variants, 1), dtype=bool) 229 | ) 230 | assert list(ds2["filter_id"].values) == ["PASS"] 231 | 232 | def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim): 233 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_msprime_sim) 234 | 235 | def test_haploid_msprime_sim(self, tmp_path, fx_haploid_msprime_sim): 236 | self.assert_bio2zarr_rt(tmp_path, fx_haploid_msprime_sim) 237 | 238 | def test_simple_ts(self, tmp_path, fx_simple_ts): 239 | self.assert_bio2zarr_rt(tmp_path, fx_simple_ts) 240 | 241 | def test_haploid_missing_data(self, tmp_path, fx_haploid_missing_data): 242 | self.assert_bio2zarr_rt(tmp_path, fx_haploid_missing_data) 243 | 244 | def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data): 245 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data) 246 | 247 | def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic): 248 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic) 249 | 250 | 251 | def 
recode_plink_hets(G): 252 | """ 253 | Returns a copy of the specified genotype matrix in which hets are all 254 | in the canonical unphased plink orientation, [0, 1] 255 | """ 256 | G = G.copy() 257 | for j in range(G.shape[0]): 258 | for k in range(G.shape[1]): 259 | if G[j, k, 0] == 1 and G[j, k, 1] == 0: 260 | G[j, k, 0] = 0 261 | G[j, k, 1] = 1 262 | return G 263 | 264 | 265 | class TestPlinkRoundTrip: 266 | def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): 267 | # import pathlib 268 | # tmp_path = pathlib.Path("tmp/plink") 269 | plink_path = tmp_path / "plink" 270 | write_plink(tskit_vcz, plink_path) 271 | rt_vcz_path = tmp_path / "rt.vcz" 272 | p2z.convert(plink_path, rt_vcz_path) 273 | ds1 = sg.load_dataset(tskit_vcz) 274 | ds2 = sg.load_dataset(rt_vcz_path) 275 | 276 | assert np.all(ds1["call_genotype_phased"]) 277 | assert np.all(~ds2["call_genotype_phased"]) 278 | 279 | nt.assert_array_equal( 280 | recode_plink_hets(ds1["call_genotype"].values), ds2["call_genotype"] 281 | ) 282 | 283 | drop_fields = [ 284 | "variant_id", 285 | "variant_id_mask", 286 | "call_genotype", 287 | "call_genotype_phased", 288 | ] 289 | xt.assert_equal( 290 | ds1.drop_vars(["call_genotype", "call_genotype_phased"]), 291 | ds2.drop_vars(drop_fields), 292 | ) 293 | 294 | def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim): 295 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_msprime_sim) 296 | 297 | def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data): 298 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data) 299 | 300 | def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic): 301 | with pytest.raises(ValueError, match="Only biallelic VCFs supported"): 302 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic) 303 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 
2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /vcztools/query.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import itertools 3 | import math 4 | from collections.abc import Callable 5 | 6 | import numpy as np 7 | import pyparsing as pp 8 | import zarr 9 | 10 | from vcztools import constants, retrieval 11 | from vcztools.samples import parse_samples 12 | from vcztools.utils import vcf_name_to_vcz_names 13 | 14 | 15 | def list_samples(vcz_path, output): 16 | root = zarr.open(vcz_path, mode="r") 17 | 18 | sample_ids = root["sample_id"][:] 19 | print("\n".join(sample_ids), file=output) 20 | 21 | 22 | class QueryFormatParser: 23 | def __init__(self): 24 | info_tag_pattern = pp.Combine( 25 | pp.Literal("%INFO/") + pp.Word(pp.srange("[A-Z]")) 26 | ) 27 | tag_pattern = info_tag_pattern | pp.Combine( 28 | pp.Literal("%") + pp.Regex(r"[A-Z]+\d?") 29 | ) 30 | subfield_pattern = pp.Group( 31 | tag_pattern 32 | + pp.Literal("{").suppress() 33 | + pp.common.integer 34 | + pp.Literal("}").suppress() 35 | ).set_results_name("subfield") 36 | 
newline_pattern = pp.Literal("\\n").set_parse_action(pp.replace_with("\n")) 37 | tab_pattern = pp.Literal("\\t").set_parse_action(pp.replace_with("\t")) 38 | format_pattern = pp.Forward() 39 | sample_loop_pattern = pp.Group( 40 | pp.Literal("[").suppress() + format_pattern + pp.Literal("]").suppress() 41 | ).set_results_name("sample loop") 42 | format_pattern <<= pp.ZeroOrMore( 43 | sample_loop_pattern 44 | | subfield_pattern 45 | | tag_pattern 46 | | newline_pattern 47 | | tab_pattern 48 | | pp.White() 49 | | pp.Word(pp.printables, exclude_chars=r"\{}[]%") 50 | ).leave_whitespace() 51 | 52 | self._parser = functools.partial(format_pattern.parse_string, parse_all=True) 53 | 54 | def __call__(self, *args, **kwargs): 55 | assert len(args) == 1 56 | assert not kwargs 57 | 58 | return self._parser(args[0]) 59 | 60 | 61 | class QueryFormatGenerator: 62 | def __init__(self, query_format, sample_ids, contigs, filters): 63 | self.sample_ids = sample_ids 64 | self.sample_count = len(self.sample_ids) 65 | self.contig_ids = contigs 66 | self.filter_ids = filters 67 | if isinstance(query_format, str): 68 | parser = QueryFormatParser() 69 | parse_results = parser(query_format) 70 | else: 71 | assert isinstance(query_format, pp.ParseResults) 72 | parse_results = query_format 73 | 74 | self._generator = self._compose_generator(parse_results) 75 | 76 | def __call__(self, *args, **kwargs): 77 | assert len(args) == 1 78 | assert not kwargs 79 | 80 | yield from self._generator(args[0]) 81 | 82 | def _compose_gt_generator(self) -> Callable: 83 | def generate(chunk_data): 84 | gt_array = chunk_data["call_genotype"] 85 | 86 | if "call_genotype_phased" in chunk_data: 87 | phase_array = chunk_data["call_genotype_phased"] 88 | assert gt_array.shape[:2] == phase_array.shape 89 | 90 | for gt_row, phase in zip(gt_array, phase_array): 91 | 92 | def stringify(gt_and_phase: tuple): 93 | gt, phase = gt_and_phase 94 | gt = [ 95 | str(allele) if allele != constants.INT_MISSING else "." 
96 | for allele in gt 97 | if allele != constants.INT_FILL 98 | ] 99 | separator = "|" if phase else "/" 100 | return separator.join(gt) 101 | 102 | gt_row = gt_row.tolist() 103 | yield map(stringify, zip(gt_row, phase)) 104 | else: 105 | # TODO: Support datasets without the phasing data 106 | raise NotImplementedError 107 | 108 | return generate 109 | 110 | def _compose_sample_ids_generator(self) -> Callable: 111 | def generate(chunk_data): 112 | variant_count = chunk_data["variant_position"].shape[0] 113 | yield from itertools.repeat(self.sample_ids, variant_count) 114 | 115 | return generate 116 | 117 | def _compose_tag_generator( 118 | self, tag: str, *, subfield=False, sample_loop=False 119 | ) -> Callable: 120 | assert tag.startswith("%") 121 | tag = tag[1:] 122 | 123 | if tag == "GT": 124 | if not sample_loop: 125 | raise ValueError( 126 | "no such tag defined: INFO/GT. " 127 | 'FORMAT fields must be enclosed in square brackets, e.g. "[ %GT]"' 128 | ) 129 | return self._compose_gt_generator() 130 | 131 | if tag == "SAMPLE": 132 | if not sample_loop: 133 | raise ValueError("no such tag defined: INFO/SAMPLE") 134 | return self._compose_sample_ids_generator() 135 | 136 | def generate(chunk_data): 137 | vcz_names = set(chunk_data.keys()) 138 | vcz_name_matches = vcf_name_to_vcz_names(vcz_names, tag) 139 | if len(vcz_name_matches) == 0: 140 | raise ValueError(f"No mapping found for '{tag}'") 141 | if sample_loop: 142 | # FORMAT fields have precedence over INFO fields 143 | vcz_name = vcz_name_matches[0] 144 | else: 145 | # FORMAT fields are not allowed 146 | vcz_name = vcz_name_matches[-1] 147 | if vcz_name.startswith("call_"): 148 | raise ValueError( 149 | f"no such tag defined: INFO/{tag}. " 150 | "FORMAT fields must be enclosed in square brackets, " 151 | f'e.g. 
"[ %{tag}]"' 152 | ) 153 | array = chunk_data[vcz_name] 154 | for row in array: 155 | is_missing = np.any(row == -1) 156 | sep = "," 157 | 158 | if tag == "CHROM": 159 | row = self.contig_ids[row] 160 | if tag == "REF": 161 | row = row[0] 162 | if tag == "ALT": 163 | row = [allele for allele in row[1:] if allele] or "." 164 | if tag == "FILTER": 165 | if np.any(row): 166 | row = self.filter_ids[row] 167 | else: 168 | row = "." 169 | sep = ";" 170 | if tag == "QUAL": 171 | if math.isnan(row): 172 | row = "." 173 | else: 174 | row = f"{row:g}" 175 | if ( 176 | not subfield 177 | and not sample_loop 178 | and (isinstance(row, np.ndarray) or isinstance(row, list)) 179 | ): 180 | row = sep.join(map(str, row)) 181 | 182 | if sample_loop: 183 | if isinstance(row, np.ndarray): 184 | row = row.tolist() 185 | row = [ 186 | (str(element) if element != constants.INT_MISSING else ".") 187 | for element in row 188 | if element != constants.INT_FILL 189 | ] 190 | yield row 191 | else: 192 | yield itertools.repeat(str(row), self.sample_count) 193 | else: 194 | yield row if not is_missing else "." 195 | 196 | return generate 197 | 198 | def _compose_subfield_generator(self, parse_results: pp.ParseResults) -> Callable: 199 | assert len(parse_results) == 2 200 | 201 | tag, subfield_index = parse_results 202 | tag_generator = self._compose_tag_generator(tag, subfield=True) 203 | 204 | def generate(chunk_data): 205 | for tag in tag_generator(chunk_data): 206 | if isinstance(tag, str): 207 | assert tag == "." 208 | yield "." 209 | else: 210 | if subfield_index < len(tag): 211 | yield tag[subfield_index] 212 | else: 213 | yield "." 
214 | 
215 |         return generate
216 | 
217 |     def _compose_sample_loop_generator(
218 |         self, parse_results: pp.ParseResults
219 |     ) -> Callable:
          # Builds the generator for a "[...]" sample-loop section of the format
          # string: the enclosed elements are expanded once per sample and the
          # per-sample strings are concatenated into one string per variant.
220 |         generators = map(
221 |             functools.partial(self._compose_element_generator, sample_loop=True),
222 |             parse_results,
223 |         )
224 | 
225 |         def generate(chunk_data):
                  # Each element generator yields, per variant, an iterable of
                  # per-sample strings. zip(*) groups them variant-wise, the
                  # inner zip(*) regroups sample-wise, and "".join flattens each
                  # variant's (sample x element) strings into one output string.
226 |             iterables = (generator(chunk_data) for generator in generators)
227 |             zipped = zip(*iterables)
228 |             zipped_zipped = (zip(*element) for element in zipped)
229 |             if "call_mask" not in chunk_data:
230 |                 flattened_zipped_zipped = (
231 |                     (
232 |                         subsubelement
233 |                         for subelement in element  # sample-wise
234 |                         for subsubelement in subelement
235 |                     )
236 |                     for element in zipped_zipped  # variant-wise
237 |                 )
238 |             else:
                      # call_mask[i, j] selects which samples are emitted for
                      # variant i (e.g. after per-sample filtering).
239 |                 call_mask = chunk_data["call_mask"]
240 |                 flattened_zipped_zipped = (
241 |                     (
242 |                         subsubelement
243 |                         for j, subelement in enumerate(element)  # sample-wise
244 |                         if call_mask[i, j]
245 |                         for subsubelement in subelement
246 |                     )
247 |                     for i, element in enumerate(zipped_zipped)  # variant-wise
248 |                 )
249 |             yield from map("".join, flattened_zipped_zipped)
250 | 
251 |         return generate
252 | 
253 |     def _compose_element_generator(
254 |         self, element: str | pp.ParseResults, *, sample_loop=False
255 |     ) -> Callable:
          # Dispatch on the parsed element kind: named sub-results map to the
          # subfield/sample-loop composers; "%"-prefixed strings are tags; any
          # other string is literal text repeated once per variant (or once per
          # sample inside a sample loop).
256 |         if isinstance(element, pp.ParseResults):
257 |             if element.get_name() == "subfield":
258 |                 return self._compose_subfield_generator(element)
259 |             elif element.get_name() == "sample loop":
260 |                 return self._compose_sample_loop_generator(element)
261 | 
262 |         assert isinstance(element, str)
263 | 
264 |         if element.startswith("%"):
265 |             return self._compose_tag_generator(element, sample_loop=sample_loop)
266 |         else:
267 | 
268 |             def generate(chunk_data):
269 |                 nonlocal element
270 |                 variant_count = chunk_data["variant_position"].shape[0]
271 |                 if sample_loop:
272 |                     for _ in range(variant_count):
273 |                         yield itertools.repeat(element, self.sample_count)
274 |                 else:
275 |                     yield from itertools.repeat(element, variant_count)
276 | 
277 | 
return generate 278 | 279 | def _compose_generator( 280 | self, 281 | parse_results, 282 | ) -> Callable: 283 | generators = ( 284 | self._compose_element_generator(element) for element in parse_results 285 | ) 286 | 287 | def generate(chunk_data) -> str: 288 | iterables = (generator(chunk_data) for generator in generators) 289 | for results in zip(*iterables): 290 | results = map(str, results) 291 | yield "".join(results) 292 | 293 | return generate 294 | 295 | 296 | def write_query( 297 | vcz, 298 | output, 299 | *, 300 | query_format: str, 301 | regions=None, 302 | targets=None, 303 | samples=None, 304 | force_samples: bool = False, 305 | include: str | None = None, 306 | exclude: str | None = None, 307 | ): 308 | root = zarr.open(vcz, mode="r") 309 | 310 | all_samples = root["sample_id"][:] 311 | sample_ids, samples_selection = parse_samples( 312 | samples, all_samples, force_samples=force_samples 313 | ) 314 | contigs = root["contig_id"][:] 315 | filters = root["filter_id"][:] 316 | 317 | generator = QueryFormatGenerator(query_format, sample_ids, contigs, filters) 318 | 319 | for chunk_data in retrieval.variant_chunk_iter( 320 | root, 321 | regions=regions, 322 | targets=targets, 323 | include=include, 324 | exclude=exclude, 325 | samples_selection=samples_selection, 326 | ): 327 | for result in generator(chunk_data): 328 | print(result, sep="", end="", file=output) 329 | -------------------------------------------------------------------------------- /tests/test_bcftools_validation.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import subprocess 3 | 4 | import click.testing as ct 5 | import pytest 6 | 7 | import vcztools.cli as cli 8 | 9 | from .utils import assert_vcfs_close, vcz_path_cache 10 | 11 | 12 | def run_bcftools(args: str, expect_error=False) -> tuple[str, str]: 13 | """ 14 | Run bcftools (which must be on the PATH) and return stdout and stderr 15 | as a pair of strings. 
16 | """ 17 | completed = subprocess.run( 18 | f"bcftools {args}", capture_output=True, check=False, shell=True 19 | ) 20 | if expect_error: 21 | assert completed.returncode != 0 22 | else: 23 | assert completed.returncode == 0 24 | return completed.stdout.decode("utf-8"), completed.stderr.decode("utf-8") 25 | 26 | 27 | def run_vcztools(args: str, expect_error=False) -> tuple[str, str]: 28 | """Run run_vcztools and return stdout and stderr as a pair of strings.""" 29 | runner = ct.CliRunner() 30 | result = runner.invoke( 31 | cli.vcztools_main, 32 | args, 33 | catch_exceptions=False, 34 | ) 35 | if expect_error: 36 | assert result.exit_code != 0 37 | else: 38 | assert result.exit_code == 0 39 | return result.stdout, result.stderr 40 | 41 | 42 | # fmt: off 43 | @pytest.mark.parametrize( 44 | ("args", "vcf_file"), 45 | [ 46 | ("view --no-version", "sample.vcf.gz"), 47 | ("view --no-version", "chr22.vcf.gz"), 48 | ("view --no-version", "msprime_diploid.vcf.gz"), 49 | ("view --no-version -i 'CHROM == \"20\"'", "sample.vcf.gz"), 50 | ("view --no-version -i 'CHROM != \"Z\"'", "sample.vcf.gz"), 51 | ("view --no-version -i 'ID == \"rs6054257\"'", "sample.vcf.gz"), 52 | ("view --no-version -i 'DB=0'", "sample.vcf.gz"), 53 | ("view --no-version -i 'DB=1'", "sample.vcf.gz"), 54 | ("view --no-version -i 'FILTER=\"PASS\"'", "sample.vcf.gz"), 55 | ("view --no-version -i 'INFO/DP > 10'", "sample.vcf.gz"), 56 | ("view --no-version -i 'FMT/DP >= 5'", "sample.vcf.gz"), 57 | ("view --no-version -i 'FMT/DP >= 5 && FMT/GQ > 10'", "sample.vcf.gz"), 58 | ("view --no-version -i 'FMT/DP >= 5 & FMT/GQ>10'", "sample.vcf.gz"), 59 | ("view --no-version -i 'FMT/DP>5 && FMT/GQ<45'", "sample.vcf.gz"), 60 | ("view --no-version -i 'FMT/DP>5 & FMT/GQ<45'", "sample.vcf.gz"), 61 | ( 62 | "view --no-version -i '(QUAL > 10 || FMT/GQ>10) && POS > 100000'", 63 | "sample.vcf.gz" 64 | ), 65 | ( 66 | "view --no-version -i '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'", 67 | "sample.vcf.gz" 68 | ), 69 | ( 70 | 
"view --no-version -e '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'", 71 | "sample.vcf.gz" 72 | ), 73 | ("view --no-version -i 'TYPE=\"ref\"'", "sample.vcf.gz"), 74 | ("view --no-version -i 'TYPE!=\"ref\"'", "sample.vcf.gz"), 75 | ("view --no-version -i 'TYPE=\"snp\"'", "sample.vcf.gz"), 76 | ("view --no-version -i 'TYPE!=\"snp\"'", "sample.vcf.gz"), 77 | # All alleles are SNPs, 14 rows 78 | ("view --no-version -i 'TYPE=\"snp\"'", "1kg_2020_chrM.vcf.gz"), 79 | # Any allele is a SNP, 22 rows 80 | ("view --no-version -i 'TYPE~\"snp\"'", "1kg_2020_chrM.vcf.gz"), 81 | # No allele is a SNP, 1 row 82 | ("view --no-version -i 'TYPE!~\"snp\"'", "1kg_2020_chrM.vcf.gz"), 83 | # Any allele is not a SNP, 9 rows 84 | ("view --no-version -i 'TYPE!=\"snp\"'", "1kg_2020_chrM.vcf.gz"), 85 | ("view --no-version -G", "sample.vcf.gz"), 86 | ( 87 | "view --no-update --no-version --samples-file " 88 | "tests/data/txt/samples.txt", 89 | "sample.vcf.gz"), 90 | ("view -I --no-version -S tests/data/txt/samples.txt", "sample.vcf.gz"), 91 | ("view --no-version -s NA00001", "sample.vcf.gz"), 92 | ("view --no-version -s NA00001,NA00003", "sample.vcf.gz"), 93 | ("view --no-version -s HG00096", "1kg_2020_chrM.vcf.gz"), 94 | ("view --no-version -s tsk_0,tsk_1", "msprime_diploid.vcf.gz"), 95 | ("view --no-version -s tsk_0,tsk_1,tsk_2", "msprime_diploid.vcf.gz"), 96 | ("view --no-version -s ^tsk_0,tsk_1,tsk_2", "msprime_diploid.vcf.gz"), 97 | ("view --no-version -s '' --force-samples", "sample.vcf.gz"), 98 | ("view --no-version -s 'NO_SAMPLE' --force-samples", "sample.vcf.gz"), 99 | ("view --no-version -s 'NO_SAMPLE,NA00001' --force-samples", "sample.vcf.gz"), 100 | ("view --no-version -s ^NA00001", "sample.vcf.gz"), 101 | ("view --no-version -s ^NA00003,NA00002", "sample.vcf.gz"), 102 | ("view --no-version -s ^NA00003,NA00002,NA00003", "sample.vcf.gz"), 103 | ("view --no-version -S ^tests/data/txt/samples.txt", "sample.vcf.gz"), 104 | ( 105 | "view --no-version -r '20:1230236-' -i 'FMT/DP>3' -s 
'NA00002,NA00003'", 106 | "sample.vcf.gz" 107 | ), 108 | ( 109 | "view --no-version -i 'FILTER=\"VQSRTrancheSNP99.80to100.00\"'", 110 | "1kg_2020_chrM.vcf.gz" 111 | ), 112 | ( 113 | "view --no-version -i 'FILTER!=\"VQSRTrancheSNP99.80to100.00\"'", 114 | "1kg_2020_chrM.vcf.gz" 115 | ), 116 | ( 117 | "view --no-version -i 'FILTER~\"VQSRTrancheINDEL99.00to100.00\"'", 118 | "1kg_2020_chrM.vcf.gz" 119 | ), 120 | ("view --no-version -i 'INFO/AC>2'", "chr22.vcf.gz") 121 | ], 122 | # This is necessary when trying to run individual tests, as the arguments above 123 | # make for unworkable command lines 124 | # ids=range(36), 125 | ) 126 | # fmt: on 127 | def test_vcf_output(tmp_path, args, vcf_file): 128 | # print("args:", args) 129 | original = pathlib.Path("tests/data/vcf") / vcf_file 130 | vcz = vcz_path_cache(original) 131 | 132 | bcftools_out, _ = run_bcftools(f"{args} {original}") 133 | bcftools_out_file = tmp_path.joinpath("bcftools_out.vcf") 134 | with open(bcftools_out_file, "w") as f: 135 | f.write(bcftools_out) 136 | 137 | vcztools_out, _ = run_vcztools(f"{args} {vcz}") 138 | vcztools_out_file = tmp_path.joinpath("vcztools_out.vcf") 139 | with open(vcztools_out_file, "w") as f: 140 | f.write(vcztools_out) 141 | 142 | assert_vcfs_close(bcftools_out_file, vcztools_out_file) 143 | 144 | 145 | @pytest.mark.parametrize( 146 | ("args", "vcf_file"), 147 | [("view --no-version", "sample.vcf.gz")], 148 | ) 149 | def test_vcf_output_with_output_option(tmp_path, args, vcf_file): 150 | vcf_path = pathlib.Path("tests/data/vcf") / vcf_file 151 | vcz_path = vcz_path_cache(vcf_path) 152 | 153 | bcftools_out_file = tmp_path.joinpath("bcftools_out.vcf") 154 | vcztools_out_file = tmp_path.joinpath("vcztools_out.vcf") 155 | 156 | bcftools_args = f"{args} -o {bcftools_out_file}" 157 | vcztools_args = f"{args} -o {vcztools_out_file}" 158 | 159 | run_bcftools(f"{bcftools_args} {vcf_path}") 160 | run_vcztools(f"{vcztools_args} {vcz_path}") 161 | 162 | 
assert_vcfs_close(bcftools_out_file, vcztools_out_file) 163 | 164 | 165 | @pytest.mark.parametrize( 166 | ("args", "vcf_name"), 167 | [ 168 | ("index -n", "sample.vcf.gz"), 169 | ("index --nrecords", "1kg_2020_chrM.vcf.gz"), 170 | ("index -s", "sample.vcf.gz"), 171 | ("index --stats", "1kg_2020_chrM.vcf.gz"), 172 | ("query -l", "sample.vcf.gz"), 173 | ("query --list-samples", "1kg_2020_chrM.vcf.gz"), 174 | (r"query -f 'A\n'", "sample.vcf.gz"), 175 | (r"query -f '%CHROM:%POS\n'", "sample.vcf.gz"), 176 | (r"query -f '[%CHROM %POS %GT\n]'", "sample.vcf.gz"), 177 | (r"query -f '%INFO/DP\n'", "sample.vcf.gz"), 178 | (r"query -f '%DP\n'", "sample.vcf.gz"), 179 | (r"query -f '%AC{0}\n'", "sample.vcf.gz"), 180 | (r"query -f '%REF\t%ALT\n'", "sample.vcf.gz"), 181 | (r"query -f '%ALT{1}\n'", "sample.vcf.gz"), 182 | (r"query -f '%ID\n'", "sample.vcf.gz"), 183 | (r"query -f '%QUAL\n'", "sample.vcf.gz"), 184 | (r"query -f '%FILTER\n'", "sample.vcf.gz"), 185 | (r"query --format '%FILTER\n'", "1kg_2020_chrM.vcf.gz"), 186 | (r"query -f '%POS\n' -i 'POS=112'", "sample.vcf.gz"), 187 | (r"query -f '%POS\n' -e 'POS=112'", "sample.vcf.gz"), 188 | (r"query -f '[%CHROM\t]\n'", "sample.vcf.gz"), 189 | (r"query -f '[%CHROM\t]\n' -i 'POS=112'", "sample.vcf.gz"), 190 | (r"query -f '[%CHROM:%POS %SAMPLE %GT\n]'", "sample.vcf.gz"), 191 | (r"query -f '[%SAMPLE %GT %DP\n]'", "sample.vcf.gz"), 192 | ( 193 | r"query -f '[%POS %SAMPLE %GT %DP %GQ\n]' -i 'INFO/DP >= 5'", 194 | "sample.vcf.gz", 195 | ), 196 | ( 197 | r"query -f '[%POS %QUAL\n]' -i'(QUAL > 10 && POS > 100000)'", 198 | "sample.vcf.gz", 199 | ), 200 | # Examples from bcftools query documentation 201 | (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sample.vcf.gz"), 202 | (r"query -f '%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n'", "sample.vcf.gz"), 203 | (r"query -f 'GQ:[ %GQ] \t GT:[ %GT]\n'", "sample.vcf.gz"), 204 | # POS0 not supported 205 | # (r"query -f '%CHROM\t%POS0\t%END\t%ID\n'", "sample.vcf.gz"), 206 | # Filtering on GT not supported 
207 | # (r"query -f [%CHROM:%POS %SAMPLE %GT\n]' -i'GT=\"alt\"'", "sample.vcf.gz"), 208 | # Indexing not supported in filtering 209 | # (r"query -f '%AC{1}\n' -i 'AC[1]>10' ", "sample.vcf.gz"), 210 | # TODO fill-out more of these when supported for more stuff is available 211 | # in filtering 212 | ("query -f '%CHROM %POS %FILTER\n' -i 'FILTER=\"PASS\"'", "sample.vcf.gz"), 213 | # Per-sample query tests 214 | ( 215 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3'", 216 | "sample.vcf.gz", 217 | ), 218 | ( 219 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/GQ>30'", 220 | "sample.vcf.gz", 221 | ), 222 | ( 223 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3 & FMT/GQ>30'", 224 | "sample.vcf.gz", 225 | ), 226 | ( 227 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3 && FMT/GQ>30'", # noqa: E501 228 | "sample.vcf.gz", 229 | ), 230 | ( 231 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -r '20:1230236-' -i 'FMT/DP>3' -s 'NA00002,NA00003'", # noqa: E501 232 | "sample.vcf.gz", 233 | ), 234 | ], 235 | ) 236 | def test_output(tmp_path, args, vcf_name): 237 | vcf_path = pathlib.Path("tests/data/vcf") / vcf_name 238 | vcz_path = vcz_path_cache(vcf_path) 239 | 240 | bcftools_output, _ = run_bcftools(f"{args} {vcf_path}") 241 | vcztools_output, _ = run_vcztools(f"{args} {vcz_path}") 242 | 243 | assert vcztools_output == bcftools_output 244 | 245 | 246 | @pytest.mark.parametrize( 247 | "expr", 248 | [ 249 | # Check arithmetic evaluation in filter queries. All these should 250 | # result to POS=112, which exists. 
251 |         "POS=(111 + 1)",
252 |         "POS =(224 / 2)",
253 |         "POS= (112 * 3) / 3",
254 |         "POS=(112 * 3 / 3 )",
255 |         "POS=25 * 4 + 24 / 2",
256 |         "POS=112 * -1 * -1",
257 |         "-POS=-112",
258 |         "POS=112.25 - 1 / 4",
259 |         "POS=112.25e3 * 1e-3 - 0.25",
260 |     ],
261 | )
      # NOTE(review): function name has a typo — "arithmethic" should be
      # "arithmetic". Left unchanged here since pytest discovers it by prefix.
262 | def test_query_arithmethic(tmp_path, expr):
263 | 
          # Each parametrized expression evaluates to POS=112, which exists in
          # sample.vcf.gz, so both tools should emit exactly one "112" line.
264 |     args = r"query -f '%POS\n'" + f" -i '{expr}'"
265 |     vcf_name = "sample.vcf.gz"
266 |     vcf_path = pathlib.Path("tests/data/vcf") / vcf_name
267 |     vcz_path = vcz_path_cache(vcf_path)
268 | 
269 |     bcftools_output, _ = run_bcftools(f"{args} {vcf_path}")
270 |     vcztools_output, _ = run_vcztools(f"{args} {vcz_path}")
271 | 
272 |     assert vcztools_output == bcftools_output
273 |     assert vcztools_output == "112\n"
274 | 
275 | 
276 | @pytest.mark.parametrize(
277 |     ("expr", "expected"),
278 |     [
279 |         # Check boolean logic evaluation. Will evaluate this with
280 |         # POS=112, so POS=112 is True and POS!=112 is False
281 |         ("POS==112 || POS!=112", True),
282 |         ("POS==112 && POS!=112", False),
283 |         ("POS==112 || POS!=112 && POS!= 112", True),
284 |         ("(POS==112 || POS!=112) && POS!= 112", False),
285 |     ],
286 | )
      # NOTE(review): function name has a typo — "precendence" should be
      # "precedence". Left unchanged here since pytest discovers it by prefix.
287 | def test_query_logic_precendence(tmp_path, expr, expected):
288 | 
          # Guard with POS=112 so output is either one line ("112\n") when the
          # inner expression is True, or empty when it is False.
289 |     args = r"query -f '%POS\n'" + f" -i 'POS=112 && ({expr})'"
290 |     vcf_name = "sample.vcf.gz"
291 |     vcf_path = pathlib.Path("tests/data/vcf") / vcf_name
292 |     vcz_path = vcz_path_cache(vcf_path)
293 | 
294 |     bcftools_output, _ = run_bcftools(f"{args} {vcf_path}")
295 |     vcztools_output, _ = run_vcztools(f"{args} {vcz_path}")
296 | 
297 |     assert vcztools_output == bcftools_output
          # int(True) == 1, int(False) == 0: expected line count of the output.
298 |     num_lines = len(list(vcztools_output.splitlines()))
299 |     assert num_lines == int(expected)
300 | 
301 | 
302 | # fmt: off
303 | @pytest.mark.parametrize(
304 |     ("args", "vcf_name", "bcftools_error_string"),
305 |     [
306 |         ("index -ns", "sample.vcf.gz", True),
307 |         ("query -f '%POS\n' -i 'INFO/DP > 10' -e 'INFO/DP < 50'", "sample.vcf.gz", True),  # noqa: E501
308 |         ("query -f '%GT'", "sample.vcf.gz", True),
309 
| ("query -f '%HQ'", "sample.vcf.gz", True),
310 |         ("query -f '%SAMPLE'", "sample.vcf.gz", True),
311 |         ("view -i 'INFO/DP > 10' -e 'INFO/DP < 50'", "sample.vcf.gz", True),
312 |         ("view -i 'DP > 10'", "sample.vcf.gz", True),
313 |         # bcftools output does not start with "Error"
314 |         ("view -i 'FILTER=\"F\"'", "sample.vcf.gz", False),
315 |     ],
316 | )
317 | # fmt: on
318 | def test_error(tmp_path, args, vcf_name, bcftools_error_string):
          # Invalid command lines must fail in BOTH tools. When
          # bcftools_error_string is True we additionally check that bcftools'
          # stderr uses one of its standard error prefixes; vcztools' stderr is
          # always required to contain "Error:".
319 |     vcf_path = pathlib.Path("tests/data/vcf") / vcf_name
320 |     vcz_path = vcz_path_cache(vcf_path)
321 | 
322 |     _, bcftools_error = run_bcftools(f"{args} {vcf_path}", expect_error=True)
323 |     if bcftools_error_string:
324 |         assert bcftools_error.startswith("Error:") or bcftools_error.startswith("[E::")
325 | 
326 |     _, vcztools_error = run_vcztools(f"{args} {vcz_path}", expect_error=True)
327 |     assert "Error:" in vcztools_error
328 | 
--------------------------------------------------------------------------------
/vcztools/vcf_writer.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import logging
 3 | import sys
 4 | from datetime import datetime
 5 | 
 6 | import numpy as np
 7 | import zarr
 8 | 
 9 | from vcztools.samples import parse_samples
10 | from vcztools.utils import (
11 |     open_file_like,
12 | )
13 | 
14 | from . import _vcztools, constants, retrieval
15 | from . 
import filter as filter_mod 16 | from .constants import FLOAT32_MISSING, RESERVED_VARIABLE_NAMES 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | # references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf 21 | 22 | # [Table 1: Reserved INFO keys] 23 | RESERVED_INFO_KEY_DESCRIPTIONS = { 24 | "AA": "Ancestral allele", 25 | "AC": "Allele count in genotypes", 26 | "AD": "Total read depth for each allele", 27 | "ADF": "Read depth for each allele on the forward strand", 28 | "ADR": "Read depth for each allele on the reverse strand", 29 | "AF": "Allele frequency for each ALT allele in the same order as listed", 30 | "AN": "Total number of alleles in called genotypes", 31 | "BQ": "RMS base quality", 32 | "CIGAR": "Cigar string describing how to align an alternate allele to the reference" 33 | "allele", 34 | "DB": "dbSNP membership", 35 | "DP": "Combined depth across samples", 36 | "END": "End position on CHROM", 37 | "H2": "HapMap2 membership", 38 | "H3": "HapMap3 membership", 39 | "MQ": "RMS mapping quality", 40 | "MQ0": "Number of MAPQ == 0 reads", 41 | "NS": "Number of samples with data", 42 | "SB": "Strand bias", 43 | "SOMATIC": "Somatic mutation", 44 | "VALIDATED": "Validated by follow-up experiment", 45 | "1000G": "1000 Genomes membership", 46 | } 47 | 48 | # [Table 2: Reserved genotype keys] 49 | RESERVED_FORMAT_KEY_DESCRIPTIONS = { 50 | "AD": "Read depth for each allele", 51 | "ADF": "Read depth for each allele on the forward strand", 52 | "ADR": "Read depth for each allele on the reverse strand", 53 | "DP": "Read depth", 54 | "EC": "Expected alternate allele counts", 55 | "FT": 'Filter indicating if this genotype was "called"', 56 | "GL": "Genotype likelihoods", 57 | "GP": "Genotype posterior probabilities", 58 | "GQ": "Conditional genotype quality", 59 | "GT": "Genotype", 60 | "HQ": "Haplotype quality", 61 | "MQ": "RMS mapping quality", 62 | "PL": "Phred-scaled genotype likelihoods rounded to the closest integer", 63 | "PP": 
"Phred-scaled genotype posterior probabilities rounded to the closest " 64 | "integer", 65 | "PQ": "Phasing quality", 66 | "PS": "Phase set", 67 | } 68 | 69 | 70 | def dims(arr): 71 | return arr.attrs["_ARRAY_DIMENSIONS"] 72 | 73 | 74 | def write_vcf( 75 | vcz, 76 | output, 77 | *, 78 | header_only: bool = False, 79 | no_header: bool = False, 80 | no_version: bool = False, 81 | regions=None, 82 | targets=None, 83 | no_update=None, 84 | samples=None, 85 | force_samples: bool = False, 86 | drop_genotypes: bool = False, 87 | include: str | None = None, 88 | exclude: str | None = None, 89 | ) -> None: 90 | root = zarr.open(vcz, mode="r") 91 | 92 | with open_file_like(output) as output: 93 | if samples and drop_genotypes: 94 | raise ValueError("Cannot select samples and drop genotypes.") 95 | elif drop_genotypes: 96 | sample_ids = [] 97 | samples_selection = np.array([]) 98 | else: 99 | all_samples = root["sample_id"][:] 100 | sample_ids, samples_selection = parse_samples( 101 | samples, all_samples, force_samples=force_samples 102 | ) 103 | 104 | # Need to try parsing filter expressions before writing header 105 | filter_mod.FilterExpression( 106 | field_names=set(root), include=include, exclude=exclude 107 | ) 108 | 109 | if not no_header: 110 | force_ac_an_header = not drop_genotypes and samples_selection is not None 111 | vcf_header = _generate_header( 112 | root, 113 | sample_ids, 114 | no_version=no_version, 115 | force_ac_an=force_ac_an_header, 116 | ) 117 | print(vcf_header, end="", file=output) 118 | 119 | if header_only: 120 | return 121 | 122 | contigs = root["contig_id"][:].astype("S") 123 | filters = get_filter_ids(root) 124 | 125 | for chunk_data in retrieval.variant_chunk_iter( 126 | root, 127 | regions=regions, 128 | targets=targets, 129 | include=include, 130 | exclude=exclude, 131 | samples_selection=samples_selection, 132 | ): 133 | c_chunk_to_vcf( 134 | chunk_data, 135 | samples_selection, 136 | contigs, 137 | filters, 138 | output, 139 | 
drop_genotypes=drop_genotypes, 140 | no_update=no_update, 141 | ) 142 | 143 | 144 | def c_chunk_to_vcf( 145 | chunk_data, 146 | samples_selection, 147 | contigs, 148 | filters, 149 | output, 150 | *, 151 | drop_genotypes, 152 | no_update, 153 | ): 154 | format_fields = {} 155 | info_fields = {} 156 | num_samples = len(samples_selection) if samples_selection is not None else None 157 | 158 | # TODO check we don't truncate silently by doing this 159 | pos = chunk_data["variant_position"].astype(np.int32) 160 | num_variants = len(pos) 161 | if num_variants == 0: 162 | return "" 163 | # Required fields 164 | chrom = contigs[chunk_data["variant_contig"]] 165 | alleles = chunk_data["variant_allele"] 166 | 167 | # Optional fields which we fill in with "all missing" defaults 168 | if "variant_id" in chunk_data: 169 | id = chunk_data["variant_id"].astype("S") 170 | else: 171 | id = np.array(["."] * num_variants, dtype="S") 172 | if "variant_quality" in chunk_data: 173 | qual = chunk_data["variant_quality"] 174 | else: 175 | qual = np.full(num_variants, FLOAT32_MISSING, dtype=np.float32) 176 | 177 | # Filter defaults to "PASS" if not present 178 | if "variant_filter" in chunk_data: 179 | filter_ = chunk_data["variant_filter"] 180 | else: 181 | filter_ = np.ones((num_variants, 1), dtype=bool) 182 | 183 | gt = None 184 | gt_phased = None 185 | 186 | if "call_genotype" in chunk_data and not drop_genotypes: 187 | gt = chunk_data["call_genotype"] 188 | 189 | if ( 190 | "call_genotype_phased" in chunk_data 191 | and not drop_genotypes 192 | and (samples_selection is None or num_samples != 0) 193 | ): 194 | gt_phased = chunk_data["call_genotype_phased"] 195 | else: 196 | # Default to unphased if call_genotype_phased not present 197 | gt_phased = np.zeros(gt.shape[:2], dtype=bool) 198 | 199 | for name, array in chunk_data.items(): 200 | if ( 201 | name.startswith("call_") 202 | and not name == "call_mask" 203 | and not name.startswith("call_genotype") 204 | and num_samples != 0 205 
| ): 206 | vcf_name = name[len("call_") :] 207 | format_fields[vcf_name] = array 208 | if num_samples is None: 209 | num_samples = array.shape[1] 210 | elif name.startswith("variant_") and name not in RESERVED_VARIABLE_NAMES: 211 | vcf_name = name[len("variant_") :] 212 | info_fields[vcf_name] = array 213 | 214 | ref = alleles[:, 0].astype("S") 215 | alt = alleles[:, 1:].astype("S") 216 | 217 | if len(id.shape) == 1: 218 | id = id.reshape((-1, 1)) 219 | if ( 220 | not no_update 221 | and samples_selection is not None 222 | and "call_genotype" in chunk_data 223 | and not drop_genotypes 224 | ): 225 | # Recompute INFO/AC and INFO/AN 226 | info_fields |= _compute_info_fields(gt, alt) 227 | if num_samples == 0: 228 | gt = None 229 | if gt is not None and num_samples is None: 230 | num_samples = gt.shape[1] 231 | 232 | encoder = _vcztools.VcfEncoder( 233 | num_variants, 234 | num_samples if num_samples is not None else 0, 235 | chrom=chrom, 236 | pos=pos, 237 | id=id, 238 | alt=alt, 239 | ref=ref, 240 | qual=qual, 241 | filter_ids=filters, 242 | filter=filter_, 243 | ) 244 | # print(encoder.arrays) 245 | if gt is not None: 246 | encoder.add_gt_field(gt, gt_phased) 247 | for name, zarray in info_fields.items(): 248 | # print(array.dtype.kind) 249 | if zarray.dtype.kind in ("O", "U"): 250 | zarray = zarray.astype("S") 251 | if len(zarray.shape) == 1: 252 | zarray = zarray.reshape((num_variants, 1)) 253 | encoder.add_info_field(name, zarray) 254 | 255 | if num_samples != 0: 256 | for name, zarray in format_fields.items(): 257 | if zarray.dtype.kind in ("O", "U"): 258 | zarray = zarray.astype("S") 259 | if len(zarray.shape) == 2: 260 | zarray = zarray.reshape((num_variants, num_samples, 1)) 261 | encoder.add_format_field(name, zarray) 262 | 263 | # TODO: (1) make a guess at this based on number of fields and samples, 264 | # and (2) log a DEBUG message when we have to double. 
265 | buflen = 1024 266 | for j in range(num_variants): 267 | failed = True 268 | while failed: 269 | try: 270 | line = encoder.encode(j, buflen) 271 | failed = False 272 | except _vcztools.VczBufferTooSmall: 273 | buflen *= 2 274 | # print("Bumping buflen to", buflen) 275 | print(line, file=output) 276 | 277 | 278 | def get_filter_ids(root): 279 | """ 280 | Returns the filter IDs from the specified Zarr store. If the array 281 | does not exist, return a single filter "PASS" by default. 282 | """ 283 | if "filter_id" in root: 284 | filters = root["filter_id"][:].astype("S") 285 | else: 286 | filters = np.array(["PASS"], dtype="S") 287 | return filters 288 | 289 | 290 | def _generate_header( 291 | ds, 292 | sample_ids, 293 | *, 294 | no_version: bool = False, 295 | force_ac_an: bool = False, 296 | ): 297 | output = io.StringIO() 298 | 299 | contigs = list(ds["contig_id"][:]) 300 | filters = list(get_filter_ids(ds).astype("U")) 301 | info_fields = [] 302 | format_fields = [] 303 | 304 | if "call_genotype" in ds and len(sample_ids) > 0: 305 | # GT must be the first field if present, per the spec (section 1.6.2) 306 | format_fields.append("GT") 307 | 308 | for var in sorted(ds.keys()): 309 | arr = ds[var] 310 | if ( 311 | var.startswith("variant_") 312 | and not var.endswith("_fill") 313 | and not var.endswith("_mask") 314 | and var not in RESERVED_VARIABLE_NAMES 315 | and dims(arr)[0] == "variants" 316 | ): 317 | key = var[len("variant_") :] 318 | info_fields.append(key) 319 | elif ( 320 | len(sample_ids) > 0 321 | and var.startswith("call_") 322 | and not var.endswith("_fill") 323 | and not var.endswith("_mask") 324 | and dims(arr)[0] == "variants" 325 | and dims(arr)[1] == "samples" 326 | ): 327 | key = var[len("call_") :] 328 | if key in ("genotype", "genotype_phased"): 329 | continue 330 | format_fields.append(key) 331 | 332 | # [1.4.1 File format] 333 | print("##fileformat=VCFv4.3", file=output) 334 | 335 | if "source" in ds.attrs: 336 | 
print(f'##source={ds.attrs["source"]}', file=output) 337 | 338 | # [1.4.2 Information field format] 339 | for key in info_fields: 340 | arr = ds[f"variant_{key}"] 341 | category = "INFO" 342 | vcf_number = _array_to_vcf_number(category, key, arr) 343 | vcf_type = _array_to_vcf_type(arr) 344 | vcf_description = arr.attrs.get( 345 | "description", RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "") 346 | ) 347 | print( 348 | f'##INFO=', 349 | file=output, 350 | ) 351 | 352 | if force_ac_an: 353 | # bcftools always recomputes the AC and AN fields when samples are specified, 354 | # even if these fields don't exist before 355 | for key, number in [("AC", "A"), ("AN", "1")]: 356 | if key not in info_fields: 357 | print( 358 | f"##INFO=', 360 | file=output, 361 | ) 362 | 363 | # [1.4.3 Filter field format] 364 | filter_descriptions = ( 365 | ds["filter_description"] if "filter_description" in ds else None 366 | ) 367 | for i, filter in enumerate(filters): 368 | filter_description = ( 369 | "" if filter_descriptions is None else filter_descriptions[i] 370 | ) 371 | print( 372 | f'##FILTER=', 373 | file=output, 374 | ) 375 | 376 | # [1.4.4 Individual format field format] 377 | for key in format_fields: 378 | if key == "GT": 379 | print( 380 | '##FORMAT=', 381 | file=output, 382 | ) 383 | else: 384 | arr = ds[f"call_{key}"] 385 | category = "FORMAT" 386 | vcf_number = _array_to_vcf_number(category, key, arr) 387 | vcf_type = _array_to_vcf_type(arr) 388 | vcf_description = arr.attrs.get( 389 | "description", RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "") 390 | ) 391 | print( 392 | f'##FORMAT=', 393 | file=output, 394 | ) 395 | 396 | # [1.4.7 Contig field format] 397 | contig_lengths = ds["contig_length"] if "contig_length" in ds else None 398 | for i, contig in enumerate(contigs): 399 | if contig_lengths is None: 400 | print(f"##contig=", file=output) 401 | else: 402 | print(f"##contig=", file=output) 403 | 404 | if not no_version: 405 | print( 406 | f"##vcztools_viewCommand={' 
'.join(sys.argv[1:])}; Date={datetime.now()}", 407 | file=output, 408 | ) 409 | 410 | # Other meta information lines not covered above 411 | if "vcf_meta_information" in ds.attrs: 412 | for key, value in ds.attrs["vcf_meta_information"]: 413 | if key not in ("fileformat", "source"): 414 | print(f"##{key}={value}", file=output) 415 | 416 | # [1.5 Header line syntax] 417 | print( 418 | "#CHROM", 419 | "POS", 420 | "ID", 421 | "REF", 422 | "ALT", 423 | "QUAL", 424 | "FILTER", 425 | "INFO", 426 | sep="\t", 427 | end="", 428 | file=output, 429 | ) 430 | 431 | if len(sample_ids) > 0: 432 | print(end="\t", file=output) 433 | print("FORMAT", *sample_ids, sep="\t", file=output) 434 | else: 435 | print(file=output) 436 | 437 | return output.getvalue() 438 | 439 | 440 | def _array_to_vcf_number(category, key, a): 441 | # reverse of vcf_number_to_dimension_and_size 442 | if a.dtype == bool: 443 | return 0 444 | elif category == "INFO" and len(dims(a)) == 1: 445 | return 1 446 | elif category == "FORMAT" and len(dims(a)) == 2: 447 | return 1 448 | 449 | last_dim = dims(a)[-1] 450 | if last_dim == "alt_alleles": 451 | return "A" 452 | elif last_dim == "alleles": 453 | return "R" 454 | elif last_dim == "genotypes": 455 | return "G" 456 | elif last_dim == f"{category}_{key}_dim": 457 | return a.shape[-1] 458 | else: 459 | raise ValueError( 460 | f"Cannot determine VCF Number for dimension name '{last_dim}' in {a}" 461 | ) 462 | 463 | 464 | def _array_to_vcf_type(a): 465 | if a.dtype == bool: 466 | return "Flag" 467 | elif np.issubdtype(a.dtype, np.integer): 468 | return "Integer" 469 | elif np.issubdtype(a.dtype, np.float32): 470 | return "Float" 471 | elif a.dtype.str[1:] in ("S1", "U1"): 472 | return "Character" 473 | elif a.dtype.kind in ("O", "S", "U"): 474 | return "String" 475 | else: 476 | raise ValueError(f"Unsupported dtype: {a.dtype}") 477 | 478 | 479 | def _compute_info_fields(gt: np.ndarray, alt: np.ndarray): 480 | flatter_gt = gt.reshape((gt.shape[0], -1)) 481 | 
allele_count = alt.shape[1] + 1 482 | 483 | def filter_and_bincount(values: np.ndarray): 484 | positive = values[values > 0] 485 | return np.bincount(positive, minlength=allele_count)[1:] 486 | 487 | computed_ac = np.apply_along_axis(filter_and_bincount, 1, flatter_gt).astype( 488 | np.int32 489 | ) 490 | computed_ac[alt == b""] = constants.INT_FILL 491 | computed_an = np.sum(flatter_gt >= 0, axis=1, dtype=np.int32) 492 | 493 | return { 494 | "AC": computed_ac, 495 | "AN": computed_an, 496 | } 497 | -------------------------------------------------------------------------------- /tests/test_vcf_writer.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | import sys 4 | from io import StringIO 5 | 6 | import numpy as np 7 | import pytest 8 | import zarr 9 | from cyvcf2 import VCF 10 | from numpy.testing import assert_array_equal 11 | 12 | from vcztools.constants import INT_FILL, INT_MISSING 13 | from vcztools.vcf_writer import _compute_info_fields, c_chunk_to_vcf, write_vcf 14 | 15 | from .utils import assert_vcfs_close, vcz_path_cache 16 | 17 | 18 | @pytest.mark.parametrize("output_is_path", [True, False]) 19 | def test_write_vcf(tmp_path, output_is_path): 20 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 21 | vcz = vcz_path_cache(original) 22 | output = tmp_path.joinpath("output.vcf") 23 | 24 | if output_is_path: 25 | write_vcf(vcz, output, no_version=True) 26 | else: 27 | output_str = StringIO() 28 | write_vcf(vcz, output_str, no_version=True) 29 | with open(output, "w") as f: 30 | f.write(output_str.getvalue()) 31 | 32 | v = VCF(output) 33 | 34 | assert v.samples == ["NA00001", "NA00002", "NA00003"] 35 | 36 | variant = next(v) 37 | 38 | assert variant.CHROM == "19" 39 | assert variant.POS == 111 40 | assert variant.ID is None 41 | assert variant.REF == "A" 42 | assert variant.ALT == ["C"] 43 | assert variant.QUAL == pytest.approx(9.6) 44 | assert variant.FILTER is None 45 | 46 | 
assert variant.genotypes == [[0, 0, True], [0, 0, True], [0, 1, False]] 47 | 48 | assert_array_equal( 49 | variant.format("HQ"), 50 | [[10, 15], [10, 10], [3, 3]], 51 | ) 52 | 53 | # check headers are the same 54 | assert_vcfs_close(original, output) 55 | 56 | 57 | @pytest.mark.parametrize( 58 | ("include", "exclude", "expected_chrom_pos"), 59 | [ 60 | ("POS < 1000", None, [("19", 111), ("19", 112), ("X", 10)]), 61 | ( 62 | None, 63 | "POS < 1000", 64 | [ 65 | ("20", 14370), 66 | ("20", 17330), 67 | ("20", 1110696), 68 | ("20", 1230237), 69 | ("20", 1234567), 70 | ("20", 1235237), 71 | ], 72 | ), 73 | ], 74 | ) 75 | def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos): 76 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 77 | vcz = vcz_path_cache(original) 78 | output = tmp_path.joinpath("output.vcf") 79 | 80 | write_vcf(vcz, output, include=include, exclude=exclude) 81 | 82 | v = VCF(str(output)) 83 | variants = list(v) 84 | 85 | assert len(variants) == len(expected_chrom_pos) 86 | assert v.samples == ["NA00001", "NA00002", "NA00003"] 87 | 88 | for variant, chrom_pos in zip(variants, expected_chrom_pos): 89 | chrom, pos = chrom_pos 90 | assert variant.CHROM == chrom 91 | assert variant.POS == pos 92 | 93 | 94 | # fmt: off 95 | @pytest.mark.parametrize( 96 | ("regions", "targets", "expected_chrom_pos"), 97 | [ 98 | # regions only 99 | ("19", None, [("19", 111), ("19", 112)]), 100 | ("19:112", None, [("19", 112)]), 101 | ("20:1230236-", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 102 | ("20:1230237-", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 103 | ("20:1230238-", None, [("20", 1234567), ("20", 1235237)]), 104 | ("20:1230237-1235236", None, [("20", 1230237), ("20", 1234567)]), 105 | ("20:1230237-1235237", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 106 | ("20:1230237-1235238", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 107 | ("19,X", 
None, [("19", 111), ("19", 112), ("X", 10)]), 108 | ("X:11", None, [("X", 10)]), # note differs from targets 109 | 110 | # targets only 111 | (None, "19", [("19", 111), ("19", 112)]), 112 | (None, "19:112", [("19", 112)]), 113 | (None, "20:1230236-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 114 | (None, "20:1230237-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 115 | (None, "20:1230238-", [("20", 1234567), ("20", 1235237)]), 116 | (None, "20:1230237-1235236", [("20", 1230237), ("20", 1234567)]), 117 | (None, "20:1230237-1235237", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 118 | (None, "20:1230237-1235238", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 119 | (None, "19,X", [("19", 111), ("19", 112), ("X", 10)]), 120 | (None, "X:11", []), 121 | (None, "^19,20:1-1234567", [("20", 1235237), ("X", 10)]), # complement 122 | 123 | # regions and targets 124 | ("20", "^20:1110696-", [("20", 14370), ("20", 17330)]) 125 | ] 126 | ) 127 | # fmt: on 128 | def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos): 129 | 130 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 131 | vcz = vcz_path_cache(original) 132 | output = tmp_path.joinpath("output.vcf") 133 | 134 | write_vcf(vcz, output, regions=regions, targets=targets) 135 | 136 | v = VCF(output) 137 | variants = list(v) 138 | assert len(variants) == len(expected_chrom_pos) 139 | 140 | assert v.samples == ["NA00001", "NA00002", "NA00003"] 141 | 142 | for variant, chrom_pos in zip(variants, expected_chrom_pos): 143 | chrom, pos = chrom_pos 144 | assert variant.CHROM == chrom 145 | assert variant.POS == pos 146 | 147 | 148 | @pytest.mark.parametrize( 149 | ("samples", "force_samples", "expected_samples", "expected_genotypes"), 150 | [ 151 | ("NA00001", False, ["NA00001"], [[0, 0, True]]), 152 | ( 153 | "NA00001,NA00003", 154 | False, 155 | ["NA00001", "NA00003"], 156 | [[0, 0, True], [0, 1, False]], 157 | ), 158 | ( 159 | 
"NA00003,NA00001", 160 | False, 161 | ["NA00003", "NA00001"], 162 | [[0, 1, False], [0, 0, True]], 163 | ), 164 | ("^NA00002", False, ["NA00001", "NA00003"], [[0, 0, True], [0, 1, False]]), 165 | ("^NA00003,NA00002", False, ["NA00001"], [[0, 0, True]]), 166 | ("^NA00003,NA00002,NA00003", False, ["NA00001"], [[0, 0, True]]), 167 | ("NO_SAMPLE", True, [], None), 168 | ], 169 | ) 170 | def test_write_vcf__samples( 171 | tmp_path, samples, force_samples, expected_samples, expected_genotypes 172 | ): 173 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 174 | vcz = vcz_path_cache(original) 175 | output = tmp_path.joinpath("output.vcf") 176 | 177 | write_vcf(vcz, output, samples=samples, force_samples=force_samples) 178 | 179 | v = VCF(output) 180 | 181 | assert v.samples == expected_samples 182 | 183 | variant = next(v) 184 | 185 | assert variant.CHROM == "19" 186 | assert variant.POS == 111 187 | assert variant.ID is None 188 | assert variant.REF == "A" 189 | assert variant.ALT == ["C"] 190 | assert variant.QUAL == pytest.approx(9.6) 191 | assert variant.FILTER is None 192 | 193 | assert variant.genotypes == expected_genotypes 194 | 195 | 196 | def test_write_vcf__non_existent_sample(tmp_path): 197 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 198 | vcz = vcz_path_cache(original) 199 | output = tmp_path.joinpath("output.vcf") 200 | 201 | with pytest.raises( 202 | ValueError, 203 | match=re.escape( 204 | "subset called for sample(s) not in header: NO_SAMPLE. " 205 | 'Use "--force-samples" to ignore this error.' 
206 | ), 207 | ): 208 | write_vcf(vcz, output, samples="NO_SAMPLE") 209 | 210 | 211 | def test_write_vcf__no_samples(tmp_path): 212 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 213 | vcz = vcz_path_cache(original) 214 | output = tmp_path.joinpath("output.vcf") 215 | 216 | write_vcf(vcz, output, drop_genotypes=True) 217 | 218 | v = VCF(output) 219 | 220 | assert v.samples == [] 221 | 222 | 223 | @pytest.mark.parametrize( 224 | ("regions", "targets", "samples", "include", "expected_chrom_pos"), 225 | [ 226 | # Test that sample filtering takes place after include filtering. 227 | ("20", None, "NA00001", "FMT/GQ > 60", [("20", 1230237)]), 228 | # Test that region filtering and include expression are combined. 229 | ("19", None, "NA00001", "POS > 200", []), 230 | # Test that target filtering and include expression are combined. 231 | (None, "19", "NA00001", "POS > 200", []), 232 | # Test that empty output in the no-regions cases works 233 | (None, None, "NA00001", "POS < 1", []), 234 | # Test that empty output in the no-regions cases works 235 | (None, None, None, "POS < 1", []), 236 | ], 237 | ) 238 | def test_write_vcf__regions_samples_filtering( 239 | tmp_path, regions, targets, samples, include, expected_chrom_pos 240 | ): 241 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 242 | vcz = vcz_path_cache(original) 243 | output = tmp_path.joinpath("output.vcf") 244 | 245 | write_vcf( 246 | vcz, 247 | output, 248 | regions=regions, 249 | targets=targets, 250 | samples=samples, 251 | include=include, 252 | ) 253 | 254 | v = VCF(str(output)) 255 | variants = list(v) 256 | 257 | assert len(variants) == len(expected_chrom_pos) 258 | if samples is not None: 259 | assert v.samples == [samples] 260 | 261 | for variant, chrom_pos in zip(variants, expected_chrom_pos): 262 | chrom, pos = chrom_pos 263 | assert variant.CHROM == chrom 264 | assert variant.POS == pos 265 | 266 | 267 | def test_write_vcf__include_exclude(tmp_path): 268 | original = 
pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 269 | vcz = vcz_path_cache(original) 270 | output = tmp_path.joinpath("output.vcf") 271 | 272 | variant_site_filter = "POS > 1" 273 | 274 | with pytest.raises( 275 | ValueError, 276 | match=re.escape( 277 | "Cannot handle both an include expression and an exclude expression." 278 | ), 279 | ): 280 | write_vcf(vcz, output, include=variant_site_filter, exclude=variant_site_filter) 281 | 282 | 283 | def test_write_vcf__header_flags(tmp_path): 284 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 285 | vcz = vcz_path_cache(original) 286 | output = tmp_path.joinpath("output.vcf") 287 | 288 | output_header = StringIO() 289 | write_vcf(vcz, output_header, header_only=True, no_version=True) 290 | 291 | output_no_header = StringIO() 292 | write_vcf(vcz, output_no_header, no_header=True, no_version=True) 293 | assert not output_no_header.getvalue().startswith("#") 294 | 295 | # combine outputs and check VCFs match 296 | output_str = output_header.getvalue() + output_no_header.getvalue() 297 | with open(output, "w") as f: 298 | f.write(output_str) 299 | assert_vcfs_close(original, output) 300 | 301 | 302 | def test_write_vcf__generate_header(): 303 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 304 | vcz = vcz_path_cache(original) 305 | 306 | output_header = StringIO() 307 | write_vcf(vcz, output_header, header_only=True, no_version=True) 308 | 309 | expected_vcf_header = """##fileformat=VCFv4.3 310 | ##source={} 311 | ##INFO= 312 | ##INFO= 313 | ##INFO= 314 | ##INFO= 315 | ##INFO= 316 | ##INFO= 317 | ##INFO= 318 | ##INFO= 319 | ##FILTER= 320 | ##FILTER= 321 | ##FILTER= 322 | ##FORMAT= 323 | ##FORMAT= 324 | ##FORMAT= 325 | ##FORMAT= 326 | ##contig= 327 | ##contig= 328 | ##contig= 329 | ##fileDate=20090805 330 | ##reference=1000GenomesPilot-NCBI36 331 | ##phasing=partial 332 | ##ALT= 333 | ##ALT= 334 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 335 | """ # noqa: E501 336 | 
def test_compute_info_fields():
    """_compute_info_fields recomputes INFO/AC and INFO/AN from genotypes."""
    gt = np.array(
        [
            [[0, 0], [0, 1], [1, 1]],
            [[0, 0], [0, 2], [2, 2]],
            [[0, 1], [1, 2], [2, 2]],
            [
                [INT_MISSING, INT_MISSING],
                [INT_MISSING, INT_MISSING],
                [INT_FILL, INT_FILL],
            ],
            [[INT_MISSING, INT_MISSING], [0, 3], [INT_FILL, INT_FILL]],
        ]
    )
    alt = np.array(
        [
            [b"A", b"B", b""],
            [b"A", b"B", b"C"],
            [b"A", b"B", b"C"],
            [b"", b"", b""],
            [b"A", b"B", b"C"],
        ]
    )
    expected_result = {
        # AC is INT_FILL wherever the corresponding ALT allele is empty.
        "AC": np.array(
            [
                [3, 0, INT_FILL],
                [0, 3, 0],
                [2, 3, 0],
                [INT_FILL, INT_FILL, INT_FILL],
                [0, 0, 1],
            ]
        ),
        # AN counts all non-missing, non-fill allele calls per variant.
        "AN": np.array([6, 6, 6, 0, 2]),
    }

    computed_info_fields = _compute_info_fields(gt, alt)

    assert expected_result.keys() == computed_info_fields.keys()

    for key in expected_result.keys():
        np.testing.assert_array_equal(expected_result[key], computed_info_fields[key])


class TestApiErrors:
    """Error paths of the write_vcf Python API."""

    @pytest.fixture()
    def vcz(self):
        original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
        return vcz_path_cache(original)

    def test_samples_and_drop_genotypes(self, vcz):
        with pytest.raises(
            ValueError, match="Cannot select samples and drop genotypes"
        ):
            write_vcf(vcz, sys.stdout, samples=["NA00001"], drop_genotypes=True)

    def test_no_output_filter_parse_error(self, vcz):
        # A filter parse failure must be raised before any output is written.
        output = StringIO()
        with pytest.raises(ValueError, match='the tag "Not" is not defined'):
            write_vcf(vcz, output, include="Not a valid expression")
        assert output.getvalue() == ""


def minimal_vcf_chunk(num_variants, num_samples, ploidy=2):
    """Return the smallest chunk dict accepted by c_chunk_to_vcf."""
    return {
        "variant_position": 1 + np.arange(num_variants, dtype=np.int32),
        "variant_contig": np.zeros(num_variants, dtype=np.int32),
        # "variant_id": np.array(["."] * num_variants, dtype="S1"),
        "variant_id": np.array(["."] * num_variants, dtype="S").reshape(
            (num_variants, 1)
        ),
        "variant_allele": np.array([("A", "T")] * num_variants),
        "variant_quality": np.zeros(num_variants, dtype=np.float32),
        "variant_filter": np.ones(num_variants, dtype=bool).reshape((num_variants, 1)),
        "call_genotype": np.zeros((num_variants, num_samples, ploidy), dtype=np.int8),
    }


def chunk_to_vcf(chunk):
    """Encode a chunk dict to VCF record text (no header lines)."""
    filters = np.array([b"PASS"])
    contigs = np.array([b"chr1"])
    output = StringIO()
    c_chunk_to_vcf(
        chunk,
        samples_selection=None,
        contigs=contigs,
        filters=filters,
        output=output,
        drop_genotypes=False,
        no_update=False,
    )
    return output.getvalue()


def chunk_to_vcf_file(chunk):
    """
    Simple function just to get the data out to a minimal file for
    testing and evaluation
    """
    num_samples = chunk["call_genotype"].shape[1]

    output = StringIO()
    print("##fileformat=VCFv4.3", file=output)
    # NOTE(review): the two meta-line literals below lost their
    # angle-bracket payloads in the archived copy of this file; they are
    # restored to the standard VCFv4.3 forms — confirm against upstream.
    print("##contig=<ID=chr1>", file=output)
    print(
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        file=output,
    )
    print(
        "#CHROM",
        "POS",
        "ID",
        "REF",
        "ALT",
        "QUAL",
        "FILTER",
        "INFO",
        sep="\t",
        end="",
        file=output,
    )
    print(end="\t", file=output)
    sample_ids = [f"x{j}" for j in range(num_samples)]
    print("FORMAT", *sample_ids, sep="\t", file=output)
    return output.getvalue() + chunk_to_vcf(chunk)


class TestEncoding:
    """Record-level encoding behaviour of c_chunk_to_vcf."""

    def test_basic_example(self):
        chunk = minimal_vcf_chunk(1, 2)
        out = chunk_to_vcf(chunk)
        line = "\t".join(
            ["chr1", "1", ".", "A", "T", "0", "PASS", ".", "GT", "0/0", "0/0"]
        )
        assert out == line + "\n"

    def test_mixed_ploidy(self):
        # -2 is the end-of-ploidy fill: 0/-2 should render as a haploid "0".
        chunk = minimal_vcf_chunk(2, 2)
        chunk["call_genotype"][0, 0, 1] = -2
        chunk["call_genotype"][1, 1, 1] = -2
        out = chunk_to_vcf(chunk)
        lines = [
            ["chr1", "1", ".", "A", "T", "0", "PASS", ".", "GT", "0", "0/0"],
            ["chr1", "2", ".", "A", "T", "0", "PASS", ".", "GT", "0/0", "0"],
        ]
        lines = "\n".join("\t".join(line) for line in lines)
        assert out == lines + "\n"

    def test_zero_ploidy(self):
        # An all-fill genotype renders as an empty GT value.
        chunk = minimal_vcf_chunk(2, 2)
        chunk["call_genotype"][0, 0] = -2
        chunk["call_genotype"][1, 1] = -2
        out = chunk_to_vcf(chunk)
        lines = [
            ["chr1", "1", ".", "A", "T", "0", "PASS", ".", "GT", "", "0/0"],
            ["chr1", "2", ".", "A", "T", "0", "PASS", ".", "GT", "0/0", ""],
        ]
        lines = "\n".join("\t".join(line) for line in lines)
        assert out == lines + "\n"

        # NOTE bcftools/htslib doesn't like this
        # [E::vcf_parse_format] Couldn't read GT data:
        # value not a number or '.' at chr1:1
at chr1:1 509 | 510 | # with open("zero-ploidy.vcf", "w") as f: 511 | # print(chunk_to_vcf_file(chunk), file=f, end="") 512 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy as np 4 | import numpy.testing as nt 5 | import pyparsing as pp 6 | import pytest 7 | import zarr 8 | 9 | from tests.utils import vcz_path_cache 10 | from vcztools import filter as filter_mod 11 | 12 | 13 | class TestFilterExpressionParser: 14 | @pytest.fixture() 15 | def parser(self): 16 | return filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False) 17 | 18 | @pytest.mark.parametrize( 19 | "expression", 20 | [ 21 | "", 22 | "| |", 23 | "a +", 24 | '"stri + 2', 25 | ], 26 | ) 27 | def test_invalid_expressions(self, parser, expression): 28 | with pytest.raises(pp.ParseException): 29 | parser.parse_string(expression, parse_all=True) 30 | 31 | @pytest.mark.parametrize( 32 | ("expression", "exception_class"), 33 | [ 34 | # NOTE: using an integer here so that we don't trigger the 35 | # generic string issue. 
class TestFilterExpressionSample:
    """End-to-end filter evaluation against the converted sample.vcf.gz dataset.

    Expected results are per-variant masks (1-D) for site-level expressions and
    per-variant, per-sample masks (2-D) for FORMAT-level expressions.
    """

    @pytest.mark.parametrize(
        ("expression", "expected_result"),
        [
            ('CHROM = "20"', [0, 0, 1, 1, 1, 1, 1, 1, 0]),
            ("POS < 1000", [1, 1, 0, 0, 0, 0, 0, 0, 1]),
            ("INFO/DP > 10", [0, 0, 1, 1, 0, 1, 0, 0, 0]),
            (
                "FMT/GQ > 20",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 0, 1],
                    [1, 0, 1],
                    [1, 1, 1],
                    [0, 0, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "FMT/DP >= 5 && FMT/GQ > 10",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 0, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                # Single "&" requires both conditions within the same sample,
                # matching bcftools semantics (distinct from "&&").
                "FMT/DP >= 5 & FMT/GQ > 10",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 1, 1],
                    [0, 0, 0],
                    [1, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "QUAL > 10 || FMT/GQ > 10",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "(QUAL > 10 || FMT/GQ > 10) && POS > 100000",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "(FMT/DP >= 8 | FMT/GQ > 40) && POS > 100000",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
        ],
    )
    def test(self, expression, expected_result):
        """Check include=expression and exclude=expression are complements."""
        original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
        vcz = vcz_path_cache(original)
        root = zarr.open(vcz, mode="r")
        data = {field: root[field][:] for field in root.keys()}
        filter_expr = filter_mod.FilterExpression(
            field_names=set(root), include=expression
        )
        result = filter_expr.evaluate(data)
        nt.assert_array_equal(result, expected_result)

        # The same expression used as an exclusion must select exactly the
        # logical complement of the inclusion mask.
        filter_expr = filter_mod.FilterExpression(
            field_names=set(root), exclude=expression
        )
        result = filter_expr.evaluate(data)
        nt.assert_array_equal(result, np.logical_not(expected_result))


def numpify_values(data):
    """Convert each value of *data* to a numpy array, keeping keys unchanged."""
    return {k: np.array(v) for k, v in data.items()}


class TestFilterExpression:
    """Unit tests for FilterExpression evaluation on small in-memory data."""

    @pytest.mark.parametrize(
        ("expression", "data", "expected"),
        [
            ("POS<5", {"variant_position": [1, 5, 6, 10]}, [1, 0, 0, 0]),
            ("INFO/XX>=10", {"variant_XX": [1, 5, 6, 10]}, [0, 0, 0, 1]),
            ("INFO/XX / 2 >=5", {"variant_XX": [1, 5, 6, 10]}, [0, 0, 0, 1]),
            ("POS<5 | POS>8", {"variant_position": [1, 5, 6, 10]}, [1, 0, 0, 1]),
            (
                "POS<0 & POS<1 & POS<2 & POS<3 & POS<4",
                {"variant_position": range(10)},
                np.zeros(10, dtype=bool),
            ),
        ],
    )
    def test_evaluate(self, expression, data, expected):
        fee = filter_mod.FilterExpression(field_names=data.keys(), include=expression)
        result = fee.evaluate(numpify_values(data))
        nt.assert_array_equal(result, expected)

    @pytest.mark.parametrize(
        ("expression", "expected"),
        [
            # FILTER comparisons: "=" matches the exact filter set, "~" matches
            # a subset, and "!"-forms are the respective negations.
            ('FILTER="PASS"', [False, True, False, False, False, False]),
            ('FILTER="."', [True, False, False, False, False, False]),
            ('FILTER="A"', [False, False, True, False, False, False]),
            ('FILTER!="A"', [True, True, False, True, True, True]),
            ('FILTER~"A"', [False, False, True, False, True, True]),
            ('FILTER="A;B"', [False, False, False, False, True, False]),
            ('FILTER="B;A"', [False, False, False, False, True, False]),
            ('FILTER!="A;B"', [True, True, True, True, False, True]),
            ('FILTER~"A;B"', [False, False, False, False, True, True]),
            ('FILTER~"B;A"', [False, False, False, False, True, True]),
            ('FILTER!~"A;B"', [True, True, True, True, False, False]),
        ],
    )
    def test_evaluate_filter_comparison(self, expression, expected):
        # variant_filter rows are boolean indicators over filter_id columns;
        # row 0 has no filters set, which matches FILTER=".".
        data = {
            "variant_filter": [
                [False, False, False, False],
                [True, False, False, False],
                [False, True, False, False],
                [False, False, True, False],
                [False, True, True, False],
                [False, True, True, True],
            ],
            "filter_id": ["PASS", "A", "B", "C"],
        }
        fee = filter_mod.FilterExpression(include=expression)
        result = fee.evaluate(numpify_values(data))
        nt.assert_array_equal(result, expected)

    @pytest.mark.parametrize(
        ("expression", "expected"),
        [
            # TYPE is derived from REF/ALT alleles: "=" requires all ALTs to be
            # of that type, "~" requires at least one.
            ('TYPE="ref"', [True, False, False, False, False, False]),
            ('TYPE=="ref"', [True, False, False, False, False, False]),
            ('TYPE!="ref"', [False, True, True, True, True, True]),
            ('TYPE~"ref"', [True, False, False, False, False, False]),
            ('TYPE!~"ref"', [False, True, True, True, True, True]),
            ('TYPE="snp"', [False, True, False, False, False, True]),
            ('TYPE=="snp"', [False, True, False, False, False, True]),
            ('TYPE!="snp"', [True, False, True, True, True, False]),
            ('TYPE~"snp"', [False, True, False, False, True, True]),
            ('TYPE!~"snp"', [True, False, True, True, False, False]),
        ],
    )
    def test_evaluate_type_operation(self, expression, expected):
        data = {
            "variant_allele": [
                ["A", "", "", ""],
                ["A", "T", "", ""],
                ["A", "AT", "", ""],
                ["A", "CT", "", ""],
                ["A", "T", "CT", ""],
                ["A", "T", "G", "C"],
            ],
        }
        fee = filter_mod.FilterExpression(include=expression)
        result = fee.evaluate(numpify_values(data))
        nt.assert_array_equal(result, expected)

    @pytest.mark.parametrize(
        ("expr", "expected"),
        [
            ("a == b", {"variant_a", "variant_b"}),
            ("a == b + c", {"variant_a", "variant_b", "variant_c"}),
            ("(a + 1) < (b + c) - d / a", {f"variant_{x}" for x in "abcd"}),
            ("-(a + b)", {f"variant_{x}" for x in "ab"}),
        ],
    )
    def test_referenced_fields(self, expr, expected):
        """Every VCF identifier in the expression maps to a variant_* field."""
        fe = filter_mod.FilterExpression(
            field_names={f"variant_{x}" for x in "abcd"}, include=expr
        )
        assert fe.referenced_fields == expected

    @pytest.mark.parametrize(
        ("expr", "expected"),
        [
            ("a == b", "(variant_a)==(variant_b)"),
            ("a + 1", "(variant_a)+(1)"),
            ("-a + 1", "(-(variant_a))+(1)"),
            ("a + 1 + 2", "(variant_a)+(1)+(2)"),
            ("a + (1 + 2)", "(variant_a)+((1)+(2))"),
            ("POS<10", "(variant_position)<(10)"),
            ('ID=="rs6054257"', "(variant_id)==('rs6054257')"),
        ],
    )
    def test_repr(self, expr, expected):
        """The parse tree repr shows fully parenthesised, mapped identifiers."""
        fe = filter_mod.FilterExpression(
            field_names={"variant_a", "variant_b"}, include=expr
        )
        assert repr(fe.parse_result[0]) == expected


class TestBcftoolsParser:
    """Tests for the low-level bcftools expression parser/evaluator.

    Where possible the parser's result is cross-checked against Python's own
    eval of the same expression text.
    """

    @pytest.mark.parametrize(
        "expr",
        [
            "2",
            "2 + 2",
            "(2 + 3) / 2",
            "2 / (2 + 3)",
            "1 + 1 + 1 + 1 + 1",
            "5 * (2 / 3)",
            "5 * 2 / 3",
            "1 + 2 - 3 / 4 * 5 + 6 * 7 / 8",
            "5 / (1 + 2 - 4) / (4 * 5 + 6 * 7 / 8)",
            "5 < 2",
            "5 > 2",
            "0 == 0",
            "0 != 0",
            "(1 + 2) == 0",
            "1 + 2 == 0",
            "1 + 2 == 1 + 2 + 3",
            "(1 + 2) == (1 + 2 + 3)",
            "(1 == 1) != (2 == 2)",
            "-1 == 1 + 2 - 4",
            '("x" == "x")',
            '"x"',
            '"INFO/STRING"',
        ],
    )
    def test_python_arithmetic_expressions(self, expr):
        """Constant expressions must agree with Python's eval of the same text."""
        parser = filter_mod.make_bcftools_filter_parser()
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval({})
        assert result == eval(expr)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ('("x" == "x")', {}),
            ('"x"', {}),
            ('"INFO/STRING"', {}),
            ('a == "string"', {"a": "string"}),
        ],
    )
    def test_python_string_expressions_data(self, expr, data):
        # map_vcf_identifiers=False so bare names resolve directly in *data*.
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval(data)
        assert result == eval(expr, data)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ("a", {"a": 1}),
            ("a + a", {"a": 1}),
            ("a + 2 * a - 1", {"a": 7}),
            ("a - b < a + b", {"a": 7, "b": 6}),
            ("(a - b) < (a + b)", {"a": 7, "b": 6}),
            ("(a - b) < (a + b)", {"a": 7.0, "b": 6.666}),
            ("a == a", {"a": 1}),
            ("-a == -a", {"a": 1}),
            ("-a == b", {"a": 1, "b": -1}),
        ],
    )
    def test_python_arithmetic_expressions_data(self, expr, data):
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval(data)
        assert result == eval(expr, data)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ("a", {"a": [1, 2, 3]}),
            ("a + a", {"a": [1, 2, 3]}),
            ("1 + a + a", {"a": [1, 2, 3]}),
            ("a + b", {"a": [1, 2, 3], "b": [5, 6, 7]}),
            ("(a + b) < c", {"a": [1, 2, 3], "b": [5, 6, 7], "c": [5, 10, 15]}),
        ],
    )
    def test_numpy_arithmetic_expressions_data(self, expr, data):
        """Vector operands must evaluate elementwise, matching numpy broadcasting."""
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        npdata = numpify_values(data)
        result = parsed[0].eval(npdata)
        evaled = eval(expr, npdata)
        nt.assert_array_equal(result, evaled)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ("call_a", {"call_a": [[[1]], [[2]], [[3]]]}),
        ],
    )
    def test_numpy_higher_dimension_arithmetic_expressions_data(self, expr, data):
        # FORMAT fields with more than two dimensions are not supported yet.
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        npdata = numpify_values(data)
        with pytest.raises(filter_mod.UnsupportedHigherDimensionalFormatFieldsError):
            parsed[0].eval(npdata)

    @pytest.mark.parametrize(
        ("expr", "expected"),
        [
            ("1 & 1", True),
            ("0 & 1", False),
            ("1 & 0", False),
            ("0 & 0", False),
            ("1 | 1", True),
            ("0 | 1", True),
            ("1 | 0", True),
            ("0 | 0", False),
            ("(1 < 2) | 0", True),
            ("(1 < 2) & 0", False),
        ],
    )
    def test_boolean_operator_expressions(self, expr, expected):
        parser = filter_mod.make_bcftools_filter_parser()
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval({})
        assert result == expected

    @pytest.mark.parametrize(
        ("expr", "data", "expected"),
        [
            ("a == b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
            ("a = b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
            ("a & b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
            ("a | b", {"a": [0, 1], "b": [1, 1]}, [True, True]),
            ("(a < 2) & (b > 1)", {"a": [0, 1], "b": [1, 2]}, [False, True]),
            # AND has precedence over OR
            ("t | f & f", {"t": [1], "f": [0]}, [True or False and False]),
            ("(t | f) & f", {"t": [1], "f": [0]}, [(True or False) and False]),
            (
                "call_a && call_b",
                {
                    "call_a": [
                        [0, 0, 0, 0],
                        [0, 0, 1, 1],
                        [0, 0, 0, 0],
                    ],
                    "call_b": [
                        [0, 0, 0, 0],
                        [0, 1, 0, 1],
                        [1, 1, 1, 1],
                    ],
                },
                [
                    [False, False, False, False],
                    [False, True, True, True],
                    # all False since condition a is not met (all 0)
                    [False, False, False, False],
                ],
            ),
            (
                "call_a || call_b",
                {
                    "call_a": [
                        [0, 0, 0, 0],
                        [0, 0, 1, 1],
                        [0, 0, 0, 0],
                    ],
                    "call_b": [
                        [0, 0, 0, 0],
                        [0, 1, 0, 1],
                        [1, 1, 1, 1],
                    ],
                },
                [
                    [False, False, False, False],
                    # all True since variant site is included
                    [True, True, True, True],
                    [True, True, True, True],
                ],
            ),
        ],
    )
    def test_boolean_operator_expressions_data(self, expr, data, expected):
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval(numpify_values(data))
        nt.assert_array_equal(result, expected)


class TestAPIErrors:
    """Error handling for invalid FilterExpression constructor arguments."""

    def test_include_and_exclude(self):
        with pytest.raises(ValueError, match="Cannot handle both an include "):
            filter_mod.FilterExpression(include="x", exclude="y")