├── MANIFEST.in ├── tests ├── data │ ├── txt │ │ └── samples.txt │ └── vcf │ │ ├── chr22.vcf.gz │ │ ├── sample.vcf.gz │ │ ├── chr22.vcf.gz.csi │ │ ├── sample.vcf.gz.csi │ │ ├── 1kg_2020_chrM.vcf.gz │ │ ├── msprime_diploid.vcf.gz │ │ ├── 1kg_2020_chrM.vcf.gz.csi │ │ ├── field_type_combos.vcf.gz │ │ ├── msprime_diploid.vcf.gz.csi │ │ ├── field_type_combos.vcf.gz.csi │ │ ├── 1kg_2020_chr20_annotations.bcf │ │ └── 1kg_2020_chr20_annotations.bcf.csi ├── __init__.py ├── test_regions.py ├── test_calculate.py ├── test_vcf_roundtrip.py ├── test_stats.py ├── test_utils.py ├── test_plink_validation.py ├── test_retrieval.py ├── test_plink.py ├── test_cli.py ├── test_query.py ├── utils.py ├── test_tskit_data.py ├── test_bcftools_validation.py ├── test_vcf_writer.py └── test_filter.py ├── performance ├── data │ ├── .gitignore │ ├── requirements.txt │ └── Makefile └── compare.py ├── vcztools ├── __init__.py ├── __main__.py ├── provenance.py ├── constants.py ├── calculate.py ├── stats.py ├── samples.py ├── utils.py ├── plink.py ├── regions.py ├── retrieval.py ├── cli.py ├── query.py └── vcf_writer.py ├── .github └── workflows │ ├── docker │ ├── shared.env │ └── buildwheel.sh │ ├── cd.yml │ └── ci.yml ├── Makefile ├── setup.py ├── .pre-commit-config.yaml ├── .clang-format ├── lib ├── meson.build └── vcf_encoder.h ├── CHANGELOG.md ├── README.md ├── pyproject.toml ├── .gitignore ├── dev.py └── LICENSE /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include lib/*.h 2 | -------------------------------------------------------------------------------- /tests/data/txt/samples.txt: -------------------------------------------------------------------------------- 1 | NA00001 2 | NA00003 3 | -------------------------------------------------------------------------------- /performance/data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !Makefile 3 | !requirements.txt 4 | 
-------------------------------------------------------------------------------- /performance/data/requirements.txt: -------------------------------------------------------------------------------- 1 | stdpopsim 2 | tskit 3 | bio2zarr 4 | -------------------------------------------------------------------------------- /vcztools/__init__.py: -------------------------------------------------------------------------------- 1 | from .provenance import __version__ # noqa F401 2 | -------------------------------------------------------------------------------- /tests/data/vcf/chr22.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/chr22.vcf.gz -------------------------------------------------------------------------------- /vcztools/__main__.py: -------------------------------------------------------------------------------- 1 | from . import cli 2 | 3 | if __name__ == "__main__": 4 | cli.vcztools_main() 5 | -------------------------------------------------------------------------------- /tests/data/vcf/sample.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/sample.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/chr22.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/chr22.vcf.gz.csi -------------------------------------------------------------------------------- /tests/data/vcf/sample.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/sample.vcf.gz.csi -------------------------------------------------------------------------------- 
/tests/data/vcf/1kg_2020_chrM.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chrM.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/msprime_diploid.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/msprime_diploid.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/1kg_2020_chrM.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chrM.vcf.gz.csi -------------------------------------------------------------------------------- /tests/data/vcf/field_type_combos.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/field_type_combos.vcf.gz -------------------------------------------------------------------------------- /tests/data/vcf/msprime_diploid.vcf.gz.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/msprime_diploid.vcf.gz.csi -------------------------------------------------------------------------------- /.github/workflows/docker/shared.env: -------------------------------------------------------------------------------- 1 | PYTHON_VERSIONS=( 2 | cp39-cp39 3 | cp310-cp310 4 | cp311-cp311 5 | cp312-cp312 6 | ) 7 | -------------------------------------------------------------------------------- /tests/data/vcf/field_type_combos.vcf.gz.csi: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/field_type_combos.vcf.gz.csi -------------------------------------------------------------------------------- /tests/data/vcf/1kg_2020_chr20_annotations.bcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chr20_annotations.bcf -------------------------------------------------------------------------------- /tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/vcztools/HEAD/tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # rewrite asserts in assert_vcfs_close to give better failure messages 4 | pytest.register_assert_rewrite("tests.utils") 5 | -------------------------------------------------------------------------------- /vcztools/provenance.py: -------------------------------------------------------------------------------- 1 | __version__ = "undefined" 2 | try: 3 | from . 
import _version 4 | 5 | __version__ = _version.version 6 | except ImportError: # pragma: nocover 7 | pass 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: ext 3 | 4 | ext: vcztools/_vcztoolsmodule.c 5 | CFLAGS="-std=c99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-cast-function-type" \ 6 | python3 setup.py build_ext --inplace 7 | 8 | clean: 9 | rm -f vcztools/*.so 10 | rm -fR build 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from setuptools import Extension, setup 3 | 4 | _vcztools_module = Extension( 5 | "vcztools._vcztools", 6 | sources=["vcztools/_vcztoolsmodule.c", "lib/vcf_encoder.c"], 7 | extra_compile_args=["-std=c99"], 8 | include_dirs=["lib", numpy.get_include()], 9 | ) 10 | 11 | setup( 12 | name="vcztools", 13 | ext_modules=[_vcztools_module], 14 | ) 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: debug-statements 7 | - id: mixed-line-ending 8 | - id: check-case-conflict 9 | - id: check-yaml 10 | - repo: https://github.com/astral-sh/ruff-pre-commit 11 | rev: v0.4.2 12 | hooks: 13 | - id: ruff 14 | args: [ --fix ] 15 | - id: ruff-format -------------------------------------------------------------------------------- /tests/test_regions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vcztools.regions import parse_region_string 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("targets", "expected"), 8 | [ 9 | ("chr1", 
("chr1", None, None)), 10 | ("chr1:12", ("chr1", 12, 12)), 11 | ("chr1:12-", ("chr1", 12, None)), 12 | ("chr1:12-103", ("chr1", 12, 103)), 13 | ], 14 | ) 15 | def test_parse_region_string( 16 | targets: str, expected: tuple[str, int | None, int | None] 17 | ): 18 | assert parse_region_string(targets) == expected 19 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: GNU 3 | SortIncludes: false 4 | AllowShortIfStatementsOnASingleLine: false 5 | BreakBeforeBraces: Linux 6 | TabWidth: 4 7 | IndentWidth: 4 8 | ColumnLimit: 89 9 | SpaceBeforeParens: 10 | ControlStatements 11 | SpacesInCStyleCastParentheses: false 12 | SpaceAfterCStyleCast: true 13 | IndentCaseLabels: true 14 | AlignAfterOpenBracket: DontAlign 15 | BinPackArguments: true 16 | BinPackParameters: true 17 | AlwaysBreakAfterReturnType: AllDefinitions 18 | 19 | # These are disabled for version 6 compatibility 20 | # StatementMacros: ["PyObject_HEAD"] 21 | # AlignConsecutiveMacros: true 22 | -------------------------------------------------------------------------------- /tests/test_calculate.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vcztools.calculate import REF, SNP, UNCLASSIFIED, get_variant_type 4 | 5 | 6 | @pytest.mark.parametrize( 7 | ("ref", "alt", "expected_type"), 8 | [ 9 | ("A", "T", SNP), 10 | ("A", "A", REF), 11 | ("A", "", REF), 12 | ("A", "<*>", REF), 13 | ("A", "", REF), 14 | ("A", "AA", UNCLASSIFIED), 15 | # these are all SNPs since they differ in one base 16 | ("AC", "TC", SNP), 17 | ("CA", "CT", SNP), 18 | ("CAGG", "CTGG", SNP), 19 | ], 20 | ) 21 | def test_get_variant_type(ref, alt, expected_type): 22 | assert get_variant_type(ref, alt) == expected_type 23 | -------------------------------------------------------------------------------- 
/tests/test_vcf_roundtrip.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import pytest 4 | 5 | from tests.utils import vcz_path_cache 6 | from vcztools.vcf_writer import write_vcf 7 | 8 | from .utils import assert_vcfs_close 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "vcf_file", 13 | [ 14 | "sample.vcf.gz", 15 | "1kg_2020_chr20_annotations.bcf", 16 | "1kg_2020_chrM.vcf.gz", 17 | "field_type_combos.vcf.gz", 18 | ], 19 | ) 20 | def test_vcf_to_zarr_to_vcf__real_files(tmp_path, vcf_file): 21 | original = pathlib.Path("tests/data/vcf") / vcf_file 22 | vcz = vcz_path_cache(original) 23 | generated = tmp_path.joinpath("output.vcf") 24 | write_vcf(vcz, generated, no_version=True) 25 | assert_vcfs_close(original, generated) 26 | -------------------------------------------------------------------------------- /lib/meson.build: -------------------------------------------------------------------------------- 1 | project('vcf_encoder', ['c'], 2 | default_options: ['c_std=c99'] 3 | ) 4 | 5 | cc = meson.get_compiler('c') 6 | m_dep = cc.find_library('m', required: false) 7 | 8 | extra_c_args = [ 9 | '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W', 10 | '-Wmissing-prototypes', '-Wstrict-prototypes', 11 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 12 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 13 | '-fshort-enums', '-fno-common'] 14 | 15 | lib_sources = ['vcf_encoder.c'] 16 | lib_headers = ['vcf_encoder.h'] 17 | 18 | cunit_dep = dependency('cunit') 19 | 20 | tests = executable('tests', 21 | sources: ['tests.c', 'vcf_encoder.c'], 22 | dependencies: [cunit_dep, m_dep], 23 | c_args: extra_c_args, 24 | ) 25 | test('tests', tests) 26 | -------------------------------------------------------------------------------- /vcztools/constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | INT_MISSING, INT_FILL = -1, -2 4 | 5 | 
FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view( 6 | np.float32 7 | ) 8 | FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array( 9 | [0x7F800001, 0x7F800002], dtype=np.int32 10 | ) 11 | 12 | # From VCF fixed fields 13 | RESERVED_VARIABLE_NAMES = [ 14 | "variant_contig", 15 | "variant_position", 16 | "variant_length", 17 | "variant_id", 18 | "variant_id_mask", 19 | "variant_allele", 20 | "variant_quality", 21 | "variant_filter", 22 | ] 23 | 24 | RESERVED_VCF_FIELDS = { 25 | "CHROM": "variant_contig", 26 | "POS": "variant_position", 27 | "ID": "variant_id", 28 | "REF": "variant_allele", 29 | "ALT": "variant_allele", 30 | "QUAL": "variant_quality", 31 | "FILTER": "variant_filter", 32 | } 33 | -------------------------------------------------------------------------------- /.github/workflows/docker/buildwheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DOCKER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | source "$DOCKER_DIR/shared.env" 4 | 5 | set -e -x 6 | 7 | ARCH=`uname -p` 8 | echo "arch=$ARCH" 9 | 10 | # We're running as root in the docker container so git commands issued by 11 | # setuptools_scm will fail without this: 12 | git config --global --add safe.directory /project 13 | # Fetch the full history as we'll be missing tags otherwise. 
14 | git fetch --unshallow 15 | for V in "${PYTHON_VERSIONS[@]}"; do 16 | git reset --hard 17 | git clean -fd 18 | PYBIN=/opt/python/$V/bin 19 | rm -rf build/ # Avoid lib build by one Python is used by another 20 | $PYBIN/python -m venv env 21 | source env/bin/activate 22 | $PYBIN/python -m pip install --upgrade build 23 | SETUPTOOLS_SCM_DEBUG=1 $PYBIN/python -m build 24 | done 25 | 26 | cd dist 27 | for whl in *.whl; do 28 | auditwheel -v repair "$whl" 29 | rm "$whl" 30 | done 31 | -------------------------------------------------------------------------------- /vcztools/calculate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Variant types 4 | REF = -1 # missing value 5 | SNP = 1 << 0 6 | UNCLASSIFIED = 1 << 8 7 | 8 | 9 | def get_variant_type(ref: str, alt: str) -> int: 10 | """Return the variant type int for the given REF, ALT combination.""" 11 | if len(alt) == 0: 12 | return REF 13 | elif len(ref) == 1 and len(alt) == 1 and alt != "*": 14 | if ref == alt: 15 | return REF 16 | else: 17 | return SNP 18 | elif alt == "<*>" or alt == "": 19 | return REF 20 | elif ( 21 | len(ref) > 1 22 | and len(ref) == len(alt) 23 | and sum([r != a for r, a in zip(ref, alt)]) == 1 # one base differs 24 | ): 25 | return SNP 26 | else: 27 | return UNCLASSIFIED 28 | 29 | 30 | def calculate_variant_type(variant_allele: np.ndarray) -> np.ndarray: 31 | """Calculate the variant type array from the variant_allele array.""" 32 | ref = variant_allele[:, 0] 33 | alt = variant_allele[:, 1:] 34 | 35 | variant_type = np.zeros(alt.shape, dtype=np.int16) 36 | 37 | for i in range(alt.shape[0]): 38 | for j in range(alt.shape[1]): 39 | variant_type[i, j] = get_variant_type(ref[i], alt[i, j]) 40 | return variant_type 41 | -------------------------------------------------------------------------------- /tests/test_stats.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from 
io import StringIO 3 | 4 | import pytest 5 | import zarr 6 | from bio2zarr import vcf 7 | 8 | from vcztools.stats import nrecords, stats 9 | 10 | from .utils import vcz_path_cache 11 | 12 | 13 | def test_nrecords(): 14 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 15 | vcz = vcz_path_cache(original) 16 | 17 | output_str = StringIO() 18 | nrecords(vcz, output_str) 19 | assert output_str.getvalue() == "9\n" 20 | 21 | 22 | def test_stats(): 23 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 24 | vcz = vcz_path_cache(original) 25 | 26 | output_str = StringIO() 27 | stats(vcz, output_str) 28 | 29 | assert ( 30 | output_str.getvalue() 31 | == """19 . 2 32 | 20 . 6 33 | X . 1 34 | """ 35 | ) 36 | 37 | 38 | def test_stats__no_index(tmp_path): 39 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 40 | # don't use cache here since we want to make sure vcz is not indexed 41 | vcz = tmp_path.joinpath("intermediate.vcz") 42 | vcf.convert([original], vcz, worker_processes=0, local_alleles=False) 43 | 44 | # delete the index created by vcf2zarr 45 | root = zarr.open(vcz, mode="a") 46 | del root["region_index"] 47 | 48 | with pytest.raises(ValueError, match="Could not load 'region_index' variable."): 49 | stats(vcz, StringIO()) 50 | -------------------------------------------------------------------------------- /vcztools/stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import zarr 3 | 4 | from vcztools.utils import open_file_like 5 | 6 | 7 | def nrecords(vcz, output): 8 | root = zarr.open(vcz, mode="r") 9 | 10 | with open_file_like(output) as output: 11 | num_variants = root["variant_position"].shape[0] 12 | print(num_variants, file=output) 13 | 14 | 15 | def stats(vcz, output): 16 | root = zarr.open(vcz, mode="r") 17 | 18 | if "region_index" not in root: 19 | raise ValueError( 20 | "Could not load 'region_index' variable. " 21 | "Use 'vcz2zarr' to create an index." 
22 | ) 23 | 24 | with open_file_like(output) as output: 25 | contigs = root["contig_id"][:].astype("U").tolist() 26 | if "contig_length" in root: 27 | contig_lengths = root["contig_length"][:] 28 | else: 29 | contig_lengths = ["."] * len(contigs) 30 | 31 | region_index = root["region_index"][:] 32 | 33 | contig_indexes = region_index[:, 1] 34 | num_records = region_index[:, 5] 35 | 36 | num_records_per_contig = np.bincount( 37 | contig_indexes, weights=num_records 38 | ).astype(np.int64) 39 | 40 | for contig, contig_length, nr in zip( 41 | contigs, contig_lengths, num_records_per_contig 42 | ): 43 | if nr > 0: 44 | print(f"{contig}\t{contig_length}\t{nr}", file=output) 45 | -------------------------------------------------------------------------------- /performance/data/Makefile: -------------------------------------------------------------------------------- 1 | # The make recipes require bcftools and bgzip. 2 | 3 | # https://samtools.github.io/bcftools/howtos/install.html 4 | # https://www.htslib.org/doc/bgzip.html 5 | 6 | # On macOS, there are Homebrew formulas for bcftools and htslib, 7 | # which contains bgzip. 
8 | 9 | # The Python requirements are listed in requirements.txt: 10 | # pip install -r requirements.txt 11 | 12 | # Flags / commandline arguments: 13 | CHROMOSOME ?= 22 14 | WGS ?= 1 15 | 16 | ifeq ($(WGS), 1) 17 | TGP_URL = "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20190425_NYGC_GATK/CCDG_13607_B01_GRM_WGS_2019-02-19_chr$(CHROMOSOME).recalibrated_variants.vcf.gz" 18 | else 19 | # Use URL for genotyping data: 20 | TGP_URL = "http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/ALL.chr$(CHROMOSOME).phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz" 21 | endif 22 | 23 | .PHONY: all simulated real clean 24 | 25 | all: simulated real 26 | 27 | simulated: sim_10k.vcz 28 | 29 | real: chr22.vcz 30 | 31 | sim_10k.ts: 32 | stdpopsim HomSap -c chr22 -o sim_10k.ts pop_0:10000 33 | 34 | chr22.vcf.gz: 35 | bcftools view $(TGP_URL) | head -n 25000 | bcftools view -O z -o chr22.vcf.gz 36 | 37 | %.vcf.gz: %.ts 38 | tskit vcf $< | bgzip > $@ 39 | 40 | %.vcf.gz.csi: %.vcf.gz 41 | bcftools index $< 42 | 43 | %.vcz: %.vcf.gz %.vcf.gz.csi 44 | vcf2zarr convert $< $@ 45 | 46 | clean: 47 | rm -rf sim_10k.* 48 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.1.0] - 2025-05-29 4 | 5 | Improvements: 6 | 7 | - Support filtering by FILTER (#217), CHROM (#223) and general string values (#220) 8 | - Support regions (-r/-t), filter expressions (-i/-e) and samples (-s) in query command (#205) 9 | - Various improvements to support VCZ datasets produced from tskit and plink files by bio2zarr. 10 | - Use a fully dynamically generated header via ``vcf_meta_information`` attributes 11 | (#208). Requires vcf-zarr version >= 0.4 (bio2zarr >= 0.1.6) to fully recover the original 12 | header. 
13 | - Add --version (#197) 14 | 15 | Breaking: 16 | 17 | - Update minimum Click version to 8.2.0 (#206) 18 | 19 | ## [0.0.2] - 2025-04-04 20 | 21 | Important bugfixes for filtering language and sample subsetting. 22 | 23 | - Clarify the implementation status of the filtering mini-lanuage in 24 | view/query. Version 0.0.1 contained several data-corrupting bugs, 25 | including incorrect missing data handling (#163), incorrect 26 | matching on FILTER (#164) and CHROM (#178) columns, and 27 | incorrect per-sample filtering in query (#179). These issues 28 | have been resolved by raising informative errors on aspects 29 | of the query language that are not implemented correctly. 30 | 31 | - The filtering mini-language now consists of arbitrary arithmetic 32 | expressions on 1-dimensional fields. 33 | 34 | - Add support for specifying samples via -s/-S options 35 | 36 | ## [0.0.1] - 2025-02-05 37 | 38 | Initial release of vcztools 39 | -------------------------------------------------------------------------------- /performance/compare.py: -------------------------------------------------------------------------------- 1 | # This script requires pv. 2 | 3 | # https://www.ivarch.com/programs/pv.shtml 4 | 5 | # There is a Homebrew formula to install pv on macOS. 
6 | 7 | # This script also depends on the simulation data: 8 | # make -C data 9 | 10 | import subprocess 11 | import sys 12 | 13 | 14 | def run_time_pv(command: str): 15 | print(command) 16 | subprocess.run(f"time {command} | pv > /dev/null", shell=True) 17 | print() 18 | 19 | 20 | def run_bcftools(command: str, dataset_name: str): 21 | run_time_pv(f"bcftools {command} data/{dataset_name}.vcf.gz") 22 | 23 | 24 | def run_vcztools(command: str, dataset_name: str): 25 | run_time_pv(f"vcztools {command} data/{dataset_name}.vcz") 26 | 27 | 28 | if __name__ == "__main__": 29 | commands = [ 30 | ("view", "sim_10k"), 31 | ("view", "chr22"), 32 | ("view -s tsk_7068,tsk_8769,tsk_8820", "sim_10k"), 33 | (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sim_10k"), 34 | (r"query -f '%CHROM:%POS\n' -i 'POS=49887394 | POS=50816415'", "sim_10k"), 35 | ("view -s '' --force-samples", "sim_10k"), 36 | ("view -i 'FMT/DP>10 & FMT/GQ>10'", "chr22"), 37 | ("view -i 'QUAL>10 || FMT/GQ>10'", "chr22"), 38 | (r"query -f 'GQ:[ %GQ] \t GT:[ %GT]\n'", "chr22"), 39 | ] 40 | 41 | if len(sys.argv) == 2 and sys.argv[1].isnumeric(): 42 | index = int(sys.argv[1]) 43 | command, dataset = commands[index] 44 | run_bcftools(command, dataset) 45 | run_vcztools(command, dataset) 46 | else: 47 | for command, dataset in commands: 48 | run_bcftools(command, dataset) 49 | run_vcztools(command, dataset) 50 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_array_equal 3 | 4 | from vcztools.utils import search, vcf_name_to_vcz_names 5 | 6 | 7 | @pytest.mark.parametrize( 8 | ("a", "v", "expected_ind"), 9 | [ 10 | (["a", "b", "c", "d"], ["b", "a", "c"], [1, 0, 2]), 11 | (["a", "c", "d", "b"], ["b", "a", "c"], [3, 0, 1]), 12 | (["a", "c", "d", "b"], ["b", "a", "a", "c"], [3, 0, 0, 1]), 13 | (["a", "c", "d", "b"], [], []), 14 | ], 15 | 
) 16 | def test_search(a, v, expected_ind): 17 | assert_array_equal(search(a, v), expected_ind) 18 | 19 | 20 | @pytest.mark.parametrize( 21 | ("vczs", "vcf", "expected_vcz_names"), 22 | [ 23 | ({"call_genotype"}, "GT", ["call_genotype"]), 24 | ({"call_genotype"}, "FMT/GT", ["call_genotype"]), 25 | ({"call_genotype"}, "FORMAT/GT", ["call_genotype"]), 26 | ({"call_DP"}, "DP", ["call_DP"]), 27 | ({"variant_DP"}, "DP", ["variant_DP"]), 28 | ({"call_DP", "variant_DP"}, "DP", ["call_DP", "variant_DP"]), 29 | ({"call_DP", "variant_DP"}, "FORMAT/DP", ["call_DP"]), 30 | ({"call_DP", "variant_DP"}, "INFO/DP", ["variant_DP"]), 31 | ({"variant_DP"}, "FORMAT/DP", []), 32 | ({"call_DP"}, "INFO/DP", []), 33 | (set(), "CHROM", ["variant_contig"]), 34 | (set(), "POS", ["variant_position"]), 35 | (set(), "ID", ["variant_id"]), 36 | (set(), "REF", ["variant_allele"]), 37 | (set(), "ALT", ["variant_allele"]), 38 | (set(), "QUAL", ["variant_quality"]), 39 | (set(), "FILTER", ["variant_filter"]), 40 | ], 41 | ) 42 | def test_vcf_name_to_vcz_names(vczs, vcf, expected_vcz_names): 43 | assert vcf_name_to_vcz_names(vczs, vcf) == expected_vcz_names 44 | -------------------------------------------------------------------------------- /tests/test_plink_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import subprocess 4 | 5 | import click.testing as ct 6 | import pytest 7 | 8 | import vcztools.cli as cli 9 | 10 | from . import utils 11 | 12 | 13 | def assert_files_identical(path1, path2): 14 | """ 15 | Asserts the files are byte-for-byte identical. 
16 | """ 17 | with open(path1, "rb") as f: 18 | b1 = f.read() 19 | with open(path2, "rb") as f: 20 | b2 = f.read() 21 | assert b1 == b2 22 | 23 | 24 | @pytest.mark.skip("Removing plink from CLI for bugfix release") 25 | # fmt: off 26 | @pytest.mark.parametrize( 27 | ("args", "vcf_file"), 28 | [ 29 | ("", "sample.vcf.gz"), 30 | ("", "chr22.vcf.gz"), 31 | ("", "1kg_2020_chrM.vcf.gz"), 32 | # FIXME this needs some extra args to deal with sample ID format 33 | # ("", "msprime_diploid.vcf.gz"), 34 | ], 35 | ) 36 | # fmt: on 37 | def test_conversion_identical(tmp_path, args, vcf_file): 38 | original = pathlib.Path("tests/data/vcf") / vcf_file 39 | vcz = utils.vcz_path_cache(original) 40 | 41 | plink_workdir = tmp_path / "plink1.9" 42 | plink_workdir.mkdir() 43 | plink_bin = os.environ.get("PLINK_BIN", "plink") 44 | cmd = f"{plink_bin} --vcf {original.absolute()} {args}" 45 | result = subprocess.run(cmd, shell=True, cwd=plink_workdir, capture_output=True) 46 | assert result.returncode == 0 47 | 48 | cmd = f"view-plink1 {vcz.absolute()} {args}" 49 | runner = ct.CliRunner() 50 | with runner.isolated_filesystem(tmp_path) as working_dir: 51 | vcz_workdir = pathlib.Path(working_dir) 52 | result = runner.invoke(cli.vcztools_main, cmd, catch_exceptions=False) 53 | for filename in ["plink.fam", "plink.bim", "plink.bed"]: 54 | assert_files_identical(vcz_workdir / filename, plink_workdir / filename) 55 | -------------------------------------------------------------------------------- /vcztools/samples.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | 5 | from vcztools.utils import search 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def parse_samples( 11 | samples: list[str] | str | None, 12 | all_samples: np.ndarray, 13 | *, 14 | force_samples: bool = True, 15 | ) -> tuple[np.ndarray, np.ndarray | None]: 16 | """Parse a bcftools-style samples string, or a list of sample IDs. 
17 | 18 | Returns an array of the sample IDs, and an array indicating the selection 19 | from all samples. 20 | """ 21 | 22 | if samples is None: 23 | return all_samples, None 24 | elif isinstance(samples, list): 25 | exclude_samples = False 26 | sample_ids = np.array(samples) 27 | else: 28 | exclude_samples = samples.startswith("^") 29 | samples = samples.lstrip("^") 30 | sample_ids = np.array(samples.split(",")) 31 | 32 | if np.all(sample_ids == np.array("")): 33 | sample_ids = np.empty((0,)) 34 | 35 | unknown_samples = np.setdiff1d(sample_ids, all_samples) 36 | if len(unknown_samples) > 0: 37 | if force_samples: 38 | # remove unknown samples from sample_ids 39 | logger.warning( 40 | "subset called for sample(s) not in header: " 41 | f'{",".join(unknown_samples)}.' 42 | ) 43 | sample_ids = np.delete(sample_ids, search(sample_ids, unknown_samples)) 44 | else: 45 | raise ValueError( 46 | "subset called for sample(s) not in header: " 47 | f'{",".join(unknown_samples)}. ' 48 | 'Use "--force-samples" to ignore this error.' 49 | ) 50 | 51 | samples_selection = search(all_samples, sample_ids) 52 | if exclude_samples: 53 | samples_selection = np.setdiff1d(np.arange(all_samples.size), samples_selection) 54 | sample_ids = all_samples[samples_selection] 55 | return sample_ids, samples_selection 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CI](https://github.com/sgkit-dev/vcztools/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/vcztools/actions/workflows/ci.yml) 2 | [![PyPI Downloads](https://static.pepy.tech/badge/vcztools)](https://pepy.tech/projects/vcztools) 3 | 4 | # vcztools 5 | Partial reimplementation of bcftools for [VCF Zarr](https://github.com/sgkit-dev/vcf-zarr-spec/) 6 | 7 | Please see the [preprint](https://www.biorxiv.org/content/10.1101/2024.06.11.598241) for more information. 
8 | 9 | 10 | ## Installation 11 | 12 | ``` 13 | python3 -m pip install vcztools 14 | ``` 15 | 16 | ## Usage 17 | 18 | ``` 19 | vcztools view 20 | ``` 21 | or 22 | ``` 23 | python -m vcztools view 24 | ``` 25 | should be equivalent to running 26 | ``` 27 | bcftools view 28 | ``` 29 | 30 | See the [bio2zarr](https://sgkit-dev.github.io/bio2zarr/) project for help in 31 | converting VCF files to Zarr. 32 | 33 | ## Goals 34 | 35 | Vcztools aims to be a drop-in replacement for a subset of bcftools functionality. 36 | Currently supported are the ``view``, ``query`` and ``index -s/-n`` commands. 37 | 38 | We aim for 100% compatibility so if you notice a difference between the output of 39 | vcztools and bcftools please do open an issue. 40 | 41 | ## Cloud stores 42 | 43 | Vcztools can read vcz files from cloud stores using [fsspec](https://filesystem-spec.readthedocs.io/en/latest/). 44 | 45 | For example, to read from Amazon S3, first install the `s3fs` fsspec library: 46 | 47 | ``` 48 | python3 -m pip install s3fs 49 | ``` 50 | 51 | Then provide your AWS credentials as described in the [`s3fs` documentation](https://s3fs.readthedocs.io/en/latest/#credentials), for example by setting environment variables: 52 | 53 | ``` 54 | export AWS_ACCESS_KEY_ID=... 55 | export AWS_SECRET_ACCESS_KEY=... 56 | ``` 57 | 58 | You can then run vcztools using an `s3://` URL: 59 | 60 | ``` 61 | python -m vcztools view s3:///path/to.vcz 62 | ``` 63 | 64 | ## Development 65 | 66 | Vcztools is under active development and contributions are warmly welcomed. Please 67 | see the project on [GitHub](https://github.com/sgkit-dev/vcztools). 
68 | 69 | -------------------------------------------------------------------------------- /vcztools/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import ExitStack, contextmanager 2 | from pathlib import Path 3 | 4 | import numpy as np 5 | 6 | from vcztools.constants import RESERVED_VCF_FIELDS 7 | 8 | 9 | def search(a, v): 10 | """ 11 | Finds the indices into an array a corresponding to the elements in v. 12 | The behaviour is undefined if any elements in v are not in a. 13 | """ 14 | sorter = np.argsort(a) 15 | rank = np.searchsorted(a, v, sorter=sorter) 16 | return sorter[rank] 17 | 18 | 19 | @contextmanager 20 | def open_file_like(file): 21 | """A context manager for opening a file path or string (and closing on exit), 22 | or passing a file-like object through.""" 23 | with ExitStack() as stack: 24 | if isinstance(file, (str, Path)): 25 | file = stack.enter_context(open(file, mode="w")) 26 | yield file 27 | 28 | 29 | def vcf_name_to_vcz_names(vcz_names: set[str], vcf_name: str) -> list[str]: 30 | """ 31 | Convert the name of a VCF field to the names of corresponding VCF Zarr arrays. 32 | 33 | :param set[str] vcz_names: A set of allowed VCF Zarr field names 34 | :param str vcf_name: The name of the VCF field 35 | :return: The names of corresponding VCF Zarr arrays, with call (FORMAT) fields 36 | before variant (INFO) fields, if both are possible matches, or an empty list 37 | if there are no matches. 
38 | :rtype: list[str] 39 | """ 40 | 41 | candidates = [] 42 | split = vcf_name.split("/") 43 | assert 1 <= len(split) <= 2 44 | 45 | if split[-1] == "GT": 46 | candidates.append("call_genotype") 47 | elif len(split) > 1: 48 | if split[0] in {"FORMAT", "FMT"}: 49 | candidates.append(f"call_{split[-1]}") 50 | elif split[0] in {"INFO"}: 51 | candidates.append(f"variant_{split[-1]}") 52 | else: 53 | candidates.append(f"call_{split[-1]}") 54 | candidates.append(f"variant_{split[-1]}") 55 | 56 | matches = [candidate for candidate in candidates if candidate in vcz_names] 57 | 58 | if vcf_name in RESERVED_VCF_FIELDS: 59 | matches.append(RESERVED_VCF_FIELDS[vcf_name]) 60 | 61 | return matches 62 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=45", 4 | "wheel", 5 | "numpy>=2", 6 | "setuptools_scm" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | [project] 11 | name = "vcztools" 12 | description = "Implementation of bcftools for VCF Zarr" 13 | readme = "README.md" 14 | license = {file = "LICENSE"} 15 | authors = [ 16 | {name = "sgkit Developers", email = "project@sgkit.dev"}, 17 | ] 18 | dependencies = [ 19 | "numpy>=1.23.5", 20 | "zarr>=2.17,<3", 21 | "click>=8.2.0", 22 | "pyranges!=0.1.3", 23 | "pyparsing>=3" 24 | ] 25 | requires-python = ">=3.10" 26 | classifiers = [ 27 | "Development Status :: 4 - Beta", 28 | "License :: OSI Approved :: Apache Software License", 29 | "Operating System :: POSIX", 30 | "Operating System :: POSIX :: Linux", 31 | "Operating System :: MacOS", 32 | "Operating System :: MacOS :: MacOS X", 33 | "Intended Audience :: Science/Research", 34 | "Programming Language :: Python", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.10", 37 | "Programming Language :: Python :: 3.11", 38 | "Programming Language :: Python :: 
3.12", 39 | "Topic :: Scientific/Engineering" 40 | ] 41 | dynamic = ["version"] 42 | 43 | [project.urls] 44 | repository = "https://github.com/sgkit-dev/vcztools" 45 | 46 | [project.scripts] 47 | vcztools = "vcztools.cli:vcztools_main" 48 | 49 | [project.optional-dependencies] 50 | dev = [ 51 | "bio2zarr", 52 | "cyvcf2", 53 | "pytest", 54 | "pytest-cov", 55 | "msprime", 56 | "sgkit", 57 | ] 58 | 59 | [tool.setuptools] 60 | packages = ["vcztools"] 61 | 62 | [tool.pytest.ini_options] 63 | testpaths = ["tests"] 64 | addopts = "--cov=vcztools --cov-report=term-missing" 65 | 66 | [tool.setuptools_scm] 67 | write_to = "vcztools/_version.py" 68 | 69 | [tool.ruff] 70 | # Assume Python 3.10 71 | target-version = "py310" 72 | 73 | # Same as Black. 74 | line-length = 88 75 | indent-width = 4 76 | 77 | [tool.ruff.lint] 78 | select = ["E", "F", "B", "W", "I", "N", "UP", "A", "PT"] 79 | #Allow uppercase names for e.g. call_AD 80 | #Don't add strict=False to zips (B905) 81 | ignore = ["N806", "N802", "N803", "A001", "A002", "B905", "RUF", "UP038"] 82 | 83 | fixable = ["ALL"] 84 | unfixable = [] 85 | 86 | [tool.ruff.lint.isort] 87 | known-third-party = [ 88 | "bio2zarr", 89 | "click", 90 | "cyvcf2", 91 | "numcodecs", 92 | "numpy", 93 | "pandas", 94 | "pyranges", 95 | "pytest", 96 | "setuptools", 97 | "zarr" 98 | ] 99 | -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | merge_group: 5 | pull_request: 6 | push: 7 | branches: 8 | - main 9 | tags: 10 | - '*' 11 | release: 12 | types: [published] 13 | 14 | jobs: 15 | packaging: 16 | if: github.repository_owner == 'sgkit-dev' 17 | name: Packaging 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.11' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install 
--upgrade pip build validate-pyproject[all] 27 | python -m build --sdist 28 | - name: Upload sdist 29 | uses: actions/upload-artifact@v4 30 | with: 31 | name: sdist 32 | path: dist 33 | - name: Build wheels 34 | run: | 35 | validate-pyproject pyproject.toml 36 | docker run --rm -v `pwd`:/project -w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh 37 | - name: Check vcztools CLI 38 | run: | 39 | pip install numpy "zarr>=2.17,<3" "click>=8.2.0" "pyranges!=0.1.3" pyparsing 40 | pip install vcztools --no-index --only-binary vcztools -f dist/wheelhouse 41 | vcztools --help 42 | # Make sure we don't have ``vcztools`` in the CWD 43 | cd tests 44 | python -m vcztools --help 45 | - name: Store the distribution packages 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: linux-wheels 49 | path: dist/wheelhouse 50 | 51 | publish-to-pypi: 52 | if: github.repository_owner == 'sgkit-dev' && github.event_name == 'release' 53 | needs: 54 | - packaging 55 | runs-on: ubuntu-latest 56 | 57 | environment: 58 | name: pypi 59 | url: https://pypi.org/p/vcztools 60 | permissions: 61 | id-token: write # IMPORTANT: mandatory for trusted publishing 62 | 63 | steps: 64 | - name: Download all 65 | uses: actions/download-artifact@v4.1.8 66 | - name: Move to dist 67 | run: | 68 | mkdir dist 69 | cp */*.{whl,gz} dist/. 70 | ls dist 71 | - uses: pypa/gh-action-pypi-publish@release/v1 72 | 73 | 74 | publish-to-testpypi: 75 | if: github.repository_owner == 'sgkit-dev' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 76 | needs: 77 | - packaging 78 | runs-on: ubuntu-latest 79 | 80 | environment: 81 | name: testpypi 82 | url: https://test.pypi.org/p/vcztools 83 | 84 | permissions: 85 | id-token: write # IMPORTANT: mandatory for trusted publishing 86 | 87 | steps: 88 | - name: Download all 89 | uses: actions/download-artifact@v4.1.8 90 | - name: Move to dist 91 | run: | 92 | mkdir dist 93 | cp */*.{whl,gz} dist/. 
94 | ls dist 95 | - uses: pypa/gh-action-pypi-publish@release/v1 96 | with: 97 | verbose: true 98 | repository-url: https://test.pypi.org/legacy/ 99 | -------------------------------------------------------------------------------- /tests/test_retrieval.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy.testing as nt 4 | import pytest 5 | import zarr 6 | 7 | from vcztools.retrieval import variant_chunk_iter, variant_iter 8 | from vcztools.samples import parse_samples 9 | 10 | from .utils import vcz_path_cache 11 | 12 | 13 | def test_variant_chunk_iter(): 14 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 15 | vcz = vcz_path_cache(original) 16 | root = zarr.open(vcz, mode="r") 17 | 18 | _, samples_selection = parse_samples("NA00002,NA00003", root["sample_id"][:]) 19 | chunk_data = next( 20 | variant_chunk_iter( 21 | root, 22 | fields=["variant_contig", "variant_position", "call_DP", "call_GQ"], 23 | regions="20:1230236-", 24 | include="FMT/DP>3", 25 | samples_selection=samples_selection, 26 | ) 27 | ) 28 | nt.assert_array_equal(chunk_data["variant_contig"], [1, 1]) 29 | nt.assert_array_equal(chunk_data["variant_position"], [1230237, 1234567]) 30 | nt.assert_array_equal(chunk_data["call_DP"], [[4, 2], [2, 3]]) 31 | nt.assert_array_equal(chunk_data["call_GQ"], [[48, 61], [17, 40]]) 32 | # note second site (at pos 1234567) is included even though both samples in mask 33 | # are False (NA00002 and NA00003), since sample NA00001 matched filter criteria, 34 | # but was then removed by samples_selection 35 | nt.assert_array_equal(chunk_data["call_mask"], [[True, False], [False, False]]) 36 | 37 | 38 | def test_variant_chunk_iter_empty_fields(): 39 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 40 | vcz = vcz_path_cache(original) 41 | root = zarr.open(vcz, mode="r") 42 | 43 | with pytest.raises(StopIteration): 44 | print(next(variant_chunk_iter(root, fields=[]))) 45 | 46 | 
47 | @pytest.mark.parametrize( 48 | ("regions", "samples"), 49 | [("20:1230236-", "NA00002,NA00003"), (["20:1230236-"], ["NA00002", "NA00003"])], 50 | ) 51 | def test_variant_iter(regions, samples): 52 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 53 | vcz = vcz_path_cache(original) 54 | 55 | iter = variant_iter( 56 | vcz, 57 | fields=["variant_contig", "variant_position", "call_DP", "call_GQ"], 58 | regions=regions, 59 | include="FMT/DP>3", 60 | samples=samples, 61 | ) 62 | 63 | variant1 = next(iter) 64 | assert variant1["variant_contig"] == 1 65 | assert variant1["variant_position"] == 1230237 66 | nt.assert_array_equal(variant1["call_DP"], [4, 2]) 67 | nt.assert_array_equal(variant1["call_GQ"], [48, 61]) 68 | nt.assert_array_equal(variant1["call_mask"], [True, False]) 69 | 70 | variant2 = next(iter) 71 | assert variant2["variant_contig"] == 1 72 | assert variant2["variant_position"] == 1234567 73 | nt.assert_array_equal(variant2["call_DP"], [2, 3]) 74 | nt.assert_array_equal(variant2["call_GQ"], [17, 40]) 75 | nt.assert_array_equal(variant2["call_mask"], [False, False]) 76 | 77 | with pytest.raises(StopIteration): 78 | next(iter) 79 | 80 | 81 | def test_variant_iter_empty_fields(): 82 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 83 | vcz = vcz_path_cache(original) 84 | 85 | with pytest.raises(StopIteration): 86 | next(variant_iter(vcz, fields=[])) 87 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # auto generated by setuptools_scm and configured in pyproject.toml 2 | vcztools/_version.py 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 
| wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 
104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 113 | .pdm.toml 114 | .pdm-python 115 | .pdm-build/ 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | .idea/ 166 | 167 | .vscode 168 | vcz_test_cache/ 169 | **/.DS_Store 170 | -------------------------------------------------------------------------------- /lib/vcf_encoder.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #ifdef __GNUC__ 6 | #define VCZ_UNUSED(x) VCZ_UNUSED_##x __attribute__((__unused__)) 7 | #else 8 | #define VCZ_UNUSED(x) VCZ_UNUSED_##x 9 | /* Don't bother with restrict for MSVC */ 10 | #define restrict 11 | #endif 12 | 13 | /* /1* We assume CHAR_BIT == 8 when loading strings from 8-bit byte arrays *1/ */ 14 | /* #if CHAR_BIT != 8 */ 15 | /* #error CHAR_BIT MUST EQUAL 8 */ 16 | /* #endif */ 17 | 18 | #define VCZ_INT_MISSING -1 19 | #define VCZ_INT_FILL -2 20 | #define VCZ_STRING_MISSING '.' 21 | #define VCZ_STRING_FILL '\0' 22 | #define VCZ_FLOAT32_MISSING_AS_INT32 0x7F800001 23 | #define VCZ_FLOAT32_FILL_AS_INT32 0x7F800002 24 | 25 | #define VCZ_NUM_FIXED_FIELDS 6 26 | 27 | #define VCZ_TYPE_INT 1 28 | #define VCZ_TYPE_FLOAT 2 29 | #define VCZ_TYPE_STRING 3 30 | #define VCZ_TYPE_BOOL 4 31 | 32 | // arbitrary - we can increase if needs be 33 | #define VCZ_MAX_FIELD_NAME_LEN 255 34 | #define VCZ_INT32_BUF_SIZE 12 // 10 digits, leading '-' and terminating NULL 35 | // Safe limit, no point in trying to make it too tight as it's easy to represent 36 | // certain very large numbers of floating point. 
37 | #define VCZ_FLOAT32_BUF_SIZE 256 38 | 39 | #define VCZ_ERR_NO_MEMORY (-100) 40 | #define VCZ_ERR_BUFFER_OVERFLOW (-101) 41 | #define VCZ_ERR_VARIANT_OUT_OF_BOUNDS (-102) 42 | 43 | /* Built-in-limitations */ 44 | #define VCZ_ERR_FIELD_NAME_TOO_LONG (-201) 45 | #define VCZ_ERR_FIELD_UNSUPPORTED_TYPE (-202) 46 | #define VCZ_ERR_FIELD_UNSUPPORTED_ITEM_SIZE (-203) 47 | #define VCZ_ERR_FIELD_UNSUPPORTED_NUM_COLUMNS (-204) 48 | 49 | typedef struct { 50 | // maximum length + 1 for NULL byte 51 | char name[VCZ_MAX_FIELD_NAME_LEN + 1]; 52 | size_t name_length; 53 | int type; 54 | size_t item_size; 55 | size_t num_columns; 56 | const char *data; 57 | } vcz_field_t; 58 | 59 | int vcz_field_init(vcz_field_t *self, const char *name, int type, size_t item_size, 60 | size_t num_columns, const void *data); 61 | int64_t vcz_field_write_1d( 62 | const vcz_field_t *self, size_t row, char *buf, int64_t buflen, int64_t offset); 63 | void vcz_field_print_state(const vcz_field_t *self, FILE *out); 64 | 65 | typedef struct { 66 | size_t num_variants; 67 | size_t num_samples; 68 | vcz_field_t fixed_fields[VCZ_NUM_FIXED_FIELDS]; 69 | vcz_field_t filter_id; 70 | const int8_t *filter_data; 71 | vcz_field_t gt; 72 | const int8_t *gt_phased_data; 73 | size_t num_info_fields; 74 | size_t max_info_fields; 75 | vcz_field_t *info_fields; 76 | size_t num_format_fields; 77 | size_t max_format_fields; 78 | size_t field_array_size_increment; 79 | vcz_field_t *format_fields; 80 | } vcz_variant_encoder_t; 81 | 82 | int vcz_variant_encoder_init( 83 | vcz_variant_encoder_t *self, size_t num_variants, size_t num_samples); 84 | void vcz_variant_encoder_free(vcz_variant_encoder_t *self); 85 | void vcz_variant_encoder_print_state(const vcz_variant_encoder_t *self, FILE *out); 86 | 87 | int vcz_variant_encoder_add_chrom_field( 88 | vcz_variant_encoder_t *self, size_t item_size, const char *data); 89 | int vcz_variant_encoder_add_pos_field(vcz_variant_encoder_t *self, const int32_t *data); 90 | int 
vcz_variant_encoder_add_qual_field(vcz_variant_encoder_t *self, const float *data); 91 | int vcz_variant_encoder_add_ref_field( 92 | vcz_variant_encoder_t *self, size_t item_size, const char *data); 93 | int vcz_variant_encoder_add_id_field( 94 | vcz_variant_encoder_t *self, size_t item_size, size_t num_columns, const char *data); 95 | int vcz_variant_encoder_add_alt_field( 96 | vcz_variant_encoder_t *self, size_t item_size, size_t num_columns, const char *data); 97 | int vcz_variant_encoder_add_filter_field(vcz_variant_encoder_t *self, 98 | size_t id_item_size, size_t id_num_columns, const char *id_data, 99 | const int8_t *filter_data); 100 | int vcz_variant_encoder_add_gt_field(vcz_variant_encoder_t *self, size_t item_size, 101 | size_t num_columns, const void *data, const int8_t *phased_data); 102 | int vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, const char *name, 103 | int type, size_t item_size, size_t num_columns, const void *data); 104 | int vcz_variant_encoder_add_format_field(vcz_variant_encoder_t *self, const char *name, 105 | int type, size_t item_size, size_t num_columns, const void *data); 106 | 107 | int64_t vcz_variant_encoder_encode( 108 | const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen); 109 | 110 | int vcz_itoa(char *buf, int64_t v); 111 | int vcz_ftoa(char *buf, float v); 112 | 113 | 114 | #define VCZ_PLINK_HOM_A1 0x0 /* 00 */ 115 | #define VCZ_PLINK_HOM_A2 0x3 /* 11 */ 116 | #define VCZ_PLINK_HET 0x2 /* 10 */ 117 | #define VCZ_PLINK_MISSING 0x1 /* 01 */ 118 | int vcz_encode_plink(size_t num_variants, size_t num_samples, const int8_t *genotypes, 119 | const int8_t *a12_allele, char *buf); 120 | -------------------------------------------------------------------------------- /tests/test_plink.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from vcztools import plink 5 | 6 | 7 | def _encode_genotypes_row(g, allele_1, 
allele_2): 8 | # Missing genotype: 01 in PLINK format 9 | # Homozygous allele 1: 00 in PLINK format 10 | # Homozygous allele 2: 11 in PLINK format 11 | # Heterozygous: 10 in PLINK format 12 | HOM_A1 = 0b00 13 | HOM_A2 = 0b11 14 | HET = 0b10 15 | MISSING = 0b01 16 | 17 | num_samples = g.shape[0] 18 | assert g.shape[1] == 2 19 | bytes_per_variant = (num_samples + 3) // 4 20 | buff = bytearray(bytes_per_variant) 21 | for j in range(num_samples): 22 | byte_idx = j // 4 23 | bit_pos = (j % 4) * 2 24 | code = MISSING 25 | a, b = g[j] 26 | if b == -2: 27 | # Treated as a haploid call by plink 28 | if a == allele_1: 29 | code = HOM_A1 30 | elif a == allele_2: 31 | code = HOM_A2 32 | else: 33 | if a == allele_1: 34 | if b == allele_1: 35 | code = HOM_A1 36 | elif b == allele_2: 37 | code = HET 38 | elif a == allele_2: 39 | if b == allele_2: 40 | code = HOM_A2 41 | elif b == allele_1: 42 | code = HET 43 | if allele_1 == -1 and (code == HOM_A1 or code == HET): 44 | code = MISSING 45 | # print("\t", a, b, code) 46 | mask = ~(0b11 << bit_pos) 47 | buff[byte_idx] = (buff[byte_idx] & mask) | (code << bit_pos) 48 | return buff 49 | 50 | 51 | def encode_genotypes(G, a12_allele=None): 52 | G = np.array(G, dtype=np.int8) 53 | if a12_allele is None: 54 | a12_allele = np.zeros((G.shape[0], 2), dtype=G.dtype) 55 | a12_allele[:, 0] = 1 56 | assert G.shape[0] == a12_allele.shape[0] 57 | assert G.shape[2] == 2 58 | buff = bytearray() 59 | for j in range(len(G)): 60 | buff.extend(_encode_genotypes_row(G[j], *a12_allele[j])) 61 | return bytes(buff) 62 | 63 | 64 | class TestEncodeGenotypes: 65 | @pytest.mark.parametrize( 66 | "genotypes", 67 | [ 68 | [ 69 | [[0, 0], [0, 1], [0, 0]], 70 | ], 71 | [ 72 | [[0, 0], [0, 1], [0, 0]], 73 | [[1, 0], [1, 1], [0, -2]], 74 | [[1, 1], [0, 1], [-1, -1]], 75 | ], 76 | [ 77 | [[0, 0], [0, 1], [0, 0], [0, 1]], 78 | [[0, 0], [0, 1], [0, 0], [0, 1]], 79 | ], 80 | [ 81 | [[0, 0], [0, 1], [0, 0], [0, 1], [1, 1]], 82 | [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -2]], 
83 | [[0, 0], [0, 1], [0, 0], [0, 1], [1, 1]], 84 | [[1, 0], [-3, 1], [0, 0], [0, 1], [-1, -2]], 85 | [[0, 1], [0, 1], [1, 2], [0, 1], [1, 1]], 86 | [[0, 0], [0, -2], [0, 3], [-2, 1], [-1, -2]], 87 | ], 88 | ], 89 | ) 90 | def test_examples_01_alleles(self, genotypes): 91 | b1 = encode_genotypes(genotypes) 92 | b2 = plink.encode_genotypes(genotypes) 93 | assert b1 == b2 94 | 95 | @pytest.mark.parametrize( 96 | ("num_variants", "num_samples"), 97 | [ 98 | (0, 0), 99 | (1, 0), 100 | (0, 1), 101 | (1, 1), 102 | (1, 10), 103 | (1, 4), 104 | (1, 16), 105 | (1, 100), 106 | (1, 101), 107 | (10, 1), 108 | (100, 1), 109 | (10, 2), 110 | (10, 3), 111 | (10, 4), 112 | (10, 5), 113 | (10, 6), 114 | (10, 7), 115 | (10, 8), 116 | (10, 9), 117 | ], 118 | ) 119 | @pytest.mark.parametrize("value", [-1, 0, 1, 2]) 120 | def test_shapes_01_alleles(self, value, num_variants, num_samples): 121 | g = np.zeros((num_variants, num_samples, 2), dtype=np.int8) + value 122 | b1 = encode_genotypes(g) 123 | b2 = plink.encode_genotypes(g) 124 | # assert len(b1) == len(b2) 125 | assert b1 == b2 126 | 127 | @pytest.mark.parametrize( 128 | ("num_variants", "num_samples"), 129 | [ 130 | (1, 4), 131 | (1, 8), 132 | (1, 16), 133 | (1, 32), 134 | (1, 100), 135 | (33, 4), 136 | (33, 8), 137 | (33, 16), 138 | (33, 32), 139 | (33, 100), 140 | ], 141 | ) 142 | def test_all_zeros_div_4(self, num_variants, num_samples): 143 | assert num_samples % 4 == 0 144 | g = np.zeros((num_variants, num_samples, 2), dtype=np.int8) 145 | b1 = encode_genotypes(g) 146 | b2 = plink.encode_genotypes(g) 147 | assert b1 == b2 148 | assert b1 == bytearray(0xFF for _ in range(num_variants * num_samples // 4)) 149 | 150 | @pytest.mark.parametrize( 151 | ("num_variants", "num_samples"), 152 | [ 153 | (1, 33), 154 | (10, 1000), 155 | ], 156 | ) 157 | def test_nonsensical_data(self, num_variants, num_samples): 158 | g = np.arange((num_variants * num_samples * 2), dtype=np.int8).reshape( 159 | (num_variants, num_samples, 2) 160 | ) 161 
| a12 = np.arange(num_variants * 2, dtype=np.int8).reshape((num_variants, 2)) 162 | b1 = encode_genotypes(g, a12) 163 | b2 = plink.encode_genotypes(g, a12) 164 | assert b1 == b2 165 | -------------------------------------------------------------------------------- /dev.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import _vcztools 4 | import numpy as np 5 | import zarr 6 | 7 | # From VCF fixed fields 8 | RESERVED_VARIABLE_NAMES = [ 9 | "variant_contig", 10 | "variant_position", 11 | "variant_id", 12 | "variant_id_mask", 13 | "variant_allele", 14 | "variant_quality", 15 | "variant_filter", 16 | ] 17 | 18 | 19 | def copy_to_memory(group): 20 | mem_group = zarr.group() 21 | for name, array in group.items(): 22 | if array.dtype == "O": 23 | copy = mem_group.empty_like(name, array, compressor={}) 24 | # FIXME: this is the best I've been able to come up with here. 25 | # Something very weird with object arrays in v2 26 | for j, row in enumerate(array): 27 | copy[j] = row 28 | else: 29 | copy = mem_group.empty_like(name, array, compressor={}) 30 | copy[:] = array[:] 31 | # print("copy = ", copy[:]) 32 | return mem_group 33 | 34 | 35 | def main(root): 36 | v_chunk = 0 37 | contigs = root["contig_id"][:].astype("S") 38 | # filters = root["filter_id"][:].astype("S") 39 | # print("contigs = ", contigs) 40 | # print("filters = ", contigs) 41 | 42 | chrom = contigs[root.variant_contig.blocks[v_chunk]] 43 | pos = root.variant_position.blocks[v_chunk] 44 | id = root.variant_id.blocks[v_chunk].astype("S") 45 | alleles = root.variant_allele.blocks[v_chunk] 46 | ref = alleles[:, 0].astype("S") 47 | alt = alleles[:, 1:].astype("S") 48 | # qual = root.variant_quality.blocks[v_chunk] 49 | # filter_ = filters[root.variant_filter.blocks[v_chunk]] 50 | 51 | num_variants = len(pos) 52 | if len(id.shape) == 1: 53 | id = id.reshape((num_variants, 1)) 54 | 55 | # TODO gathering fields and doing IO will be done separately later so 
that 56 | # we avoid retrieving stuff we don't need. 57 | format_fields = {} 58 | info_fields = {} 59 | for name, array in root.arrays(): 60 | if name.startswith("call_") and not name.startswith("call_genotype"): 61 | vcf_name = name[len("call_") :] 62 | format_fields[vcf_name] = array.blocks[v_chunk] 63 | elif name.startswith("variant_") and name not in RESERVED_VARIABLE_NAMES: 64 | vcf_name = name[len("variant_") :] 65 | info_fields[vcf_name] = array.blocks[v_chunk] 66 | 67 | gt = None 68 | gt_phased = None 69 | if "call_genotype" in root: 70 | array = root["call_genotype"] 71 | gt = array.blocks[v_chunk] 72 | if "call_genotype_phased" in root: 73 | array = root["call_genotype_phased"] 74 | gt_phased = array.blocks[v_chunk] 75 | else: 76 | gt_phased = np.zeros_like(gt, dtype=bool) 77 | 78 | # print(gt, gt_phased) 79 | # print(list(format_fields.keys())) 80 | # print(list(info_fields.keys())) 81 | 82 | # print(contigs[chrom]) 83 | # print(bytes(contigs[chrom])) 84 | # print(pos) 85 | # print(alleles) 86 | # print(alleles.dtype) 87 | # print(chrom) 88 | # print(pos) 89 | # print(id) 90 | # print(ref) 91 | # print(alt) 92 | 93 | num_samples = 0 94 | if gt is not None: 95 | num_samples = gt.shape[1] 96 | 97 | encoder = _vcztools.VcfEncoder( 98 | num_variants, num_samples, chrom=chrom, pos=pos, id=id, alt=alt, ref=ref 99 | ) 100 | print(gt.shape) 101 | print(gt_phased.shape) 102 | encoder.add_gt_field(gt.astype("int32"), gt_phased) 103 | # # print(encoder.arrays) 104 | # # print(encoder) 105 | for name, array in info_fields.items(): 106 | if array.dtype.kind == "O": 107 | array = array.astype("S") 108 | if len(array.shape) == 1: 109 | array = array.reshape((num_variants, 1)) 110 | if array.dtype.kind == "i": 111 | array = array.astype("int32") # tmp 112 | if array.dtype.kind == "f": 113 | continue # tmp 114 | if array.dtype.kind == "b": 115 | continue # tmp 116 | # array = array.astype("int32") # tmp 117 | 118 | print(name, array.dtype, array.dtype.kind) 119 | 
encoder.add_info_field(name, array) 120 | 121 | for name, array in format_fields.items(): 122 | if array.dtype.kind == "O": 123 | array = array.astype("S") 124 | if len(array.shape) == 2: 125 | array = array.reshape((num_variants, num_samples, 1)) 126 | if array.dtype.kind == "i": 127 | array = array.astype("int32") # tmp 128 | if array.dtype.kind == "f": 129 | continue # tmp 130 | # array = array.astype("int32") # tmp 131 | 132 | print(name, array.dtype, array.dtype.kind) 133 | encoder.add_format_field(name, array) 134 | 135 | # d = encoder.arrays 136 | # pos = encoder.arrays["POS"] 137 | # print(pos) 138 | # # print(d) 139 | # pos[0] = 123457 140 | # print(pos.flags) 141 | # pos.resize(0, refcheck=False) 142 | # print(pos) 143 | 144 | encoder.print_state(sys.stdout) 145 | for k, v in encoder.arrays.items(): 146 | print(k, "\t", v.shape) 147 | for j in range(num_variants): 148 | line = encoder.encode_row(j, 2**30) 149 | print(line) 150 | 151 | 152 | if __name__ == "__main__": 153 | root = zarr.open(sys.argv[1], mode="r") 154 | # root = copy_to_memory(root) 155 | # print("pos = ", root["variant_position"].info) 156 | # print(root.tree()) 157 | main(root) 158 | # for _ in range(10000): 159 | # import tqdm 160 | 161 | # for _ in tqdm.tqdm(range(10000)): 162 | # main(root) 163 | -------------------------------------------------------------------------------- /vcztools/plink.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert VCZ to plink 1 binary format. 3 | """ 4 | 5 | import pathlib 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import zarr 10 | 11 | from . import _vcztools, retrieval 12 | 13 | 14 | def encode_genotypes(genotypes, a12_allele=None): 15 | G = np.asarray(genotypes, dtype=np.int8) 16 | if a12_allele is None: 17 | a12_allele = np.zeros((G.shape[0], 2), dtype=G.dtype) 18 | a12_allele[:, 0] = 1 19 | a12_allele = np.asarray(a12_allele, dtype=G.dtype) 20 | # TODO: not sure if this is taking a copy. 
See the point about 21 | # allocating a numpy array in the C code. 22 | return bytes(_vcztools.encode_plink(G, a12_allele).data) 23 | 24 | 25 | def generate_fam(root): 26 | # TODO generate an error if sample_id contains a space 27 | sample_id = root["sample_id"][:].astype(str) 28 | zeros = np.zeros(sample_id.shape, dtype=int) 29 | df = pd.DataFrame( 30 | { 31 | "FamilyID": sample_id, 32 | "IndividualID": sample_id, 33 | "FatherID": zeros, 34 | "MotherId": zeros, 35 | "Sex": zeros, 36 | "Phenotype": np.full_like(zeros, -9), 37 | } 38 | ) 39 | return df.to_csv(sep="\t", header=False, index=False) 40 | 41 | 42 | def generate_bim(root, a12_allele): 43 | select = a12_allele[:, 1] != -1 44 | contig_id = root["contig_id"][:].astype(str) 45 | alleles = root["variant_allele"][:].astype(str)[select] 46 | a12_allele = a12_allele[select] 47 | num_variants = np.sum(select) 48 | allele_1 = alleles[np.arange(num_variants), a12_allele[:, 0]] 49 | single_allele_sites = np.where(a12_allele[:, 0] == -1) 50 | allele_1[single_allele_sites] = "0" 51 | 52 | num_variants = np.sum(select) 53 | if "variant_id" in root: 54 | variant_id = root["variant_id"][:][select] 55 | else: 56 | variant_id = np.array(["."] * num_variants, dtype="S") 57 | 58 | df = pd.DataFrame( 59 | { 60 | "Chrom": contig_id[root["variant_contig"][:][select]], 61 | "VariantId": variant_id, 62 | "GeneticPosition": np.zeros(np.sum(select), dtype=int), 63 | "Position": root["variant_position"][:][select], 64 | "Allele1": allele_1, 65 | "Allele2": alleles[np.arange(num_variants), a12_allele[:, 1]], 66 | } 67 | ) 68 | return df.to_csv(header=False, sep="\t", index=False) 69 | 70 | 71 | class Writer: 72 | def __init__( 73 | self, vcz_path, bed_path, fam_path, bim_path, include=None, exclude=None 74 | ): 75 | self.root = zarr.open(vcz_path, mode="r") 76 | 77 | self.bim_path = bim_path 78 | self.fam_path = fam_path 79 | self.bed_path = bed_path 80 | 81 | def _compute_alleles(self, G, alleles): 82 | """ 83 | Returns the a12 
alleles for the specified chunk of data. 84 | """ 85 | max_alleles = alleles.shape[1] 86 | if max_alleles != 2: 87 | raise ValueError( 88 | "Only biallelic VCFs supported currently: " 89 | "please comment on https://github.com/sgkit-dev/vcztools/issues/224 " 90 | "if this limitation affects you" 91 | ) 92 | num_variants = G.shape[0] 93 | num_samples = G.shape[1] 94 | a12_allele = np.zeros((num_variants, 2), dtype=int) - 1 95 | for j, g in enumerate(G): 96 | g = g.reshape(num_samples * 2) 97 | assert np.all(g >= -2) 98 | count = np.bincount(g + 2, minlength=max_alleles + 2) 99 | # [dimension pad, missing data, reference, allele 1, ...] 100 | count = count[2:] 101 | argsort = np.argsort(count) 102 | a12_allele[j, 1] = 0 103 | if argsort[-1] == 0: 104 | # print("Ref allele most frequent") 105 | # Ref allele is most frequent - chose lowest allele from next most 106 | # frequent class 107 | f = count[argsort[-2]] 108 | else: 109 | # print("Ref allele not most frequent") 110 | f = count[argsort[-1]] 111 | a = 1 112 | while count[a] != f: 113 | a += 1 114 | a12_allele[j, 0] = a 115 | assert a12_allele[j, 0] != a12_allele[j, 1] 116 | if alleles[j][1] == "": 117 | a12_allele[j, 0] = -1 118 | # print( 119 | # self.root["variant_contig"][j], 120 | # self.root["variant_position"][j], 121 | # [j], 122 | # self.root["variant_allele"][j], 123 | # count, 124 | # argsort, 125 | # a12_allele[j], 126 | # ) 127 | return a12_allele 128 | 129 | def _write_genotypes(self): 130 | ci = retrieval.variant_chunk_iter( 131 | self.root, fields=["call_genotype", "variant_allele"] 132 | ) 133 | call_genotype = self.root["call_genotype"] 134 | a12_allele = zarr.zeros( 135 | (call_genotype.shape[0], 2), chunks=call_genotype.chunks[0], dtype=int 136 | ) 137 | with open(self.bed_path, "wb") as bed_file: 138 | bed_file.write(bytes([0x6C, 0x1B, 0x01])) 139 | 140 | for j, chunk in enumerate(ci): 141 | G = chunk["call_genotype"] 142 | a12 = self._compute_alleles(G, chunk["variant_allele"]) 143 | buff = 
encode_genotypes(G, a12) 144 | bed_file.write(buff) 145 | a12_allele.blocks[j] = a12 146 | return a12_allele[:] 147 | 148 | def run(self): 149 | a12_allele = self._write_genotypes() 150 | 151 | with open(self.bim_path, "w") as f: 152 | f.write(generate_bim(self.root, a12_allele)) 153 | 154 | with open(self.fam_path, "w") as f: 155 | f.write(generate_fam(self.root)) 156 | 157 | 158 | def write_plink(vcz_path, out, include=None, exclude=None): 159 | out_prefix = pathlib.Path(out) 160 | # out_prefix.mkdir(exist_ok=True) 161 | writer = Writer( 162 | vcz_path, 163 | bed_path=out_prefix.with_suffix(".bed"), 164 | fam_path=out_prefix.with_suffix(".fam"), 165 | bim_path=out_prefix.with_suffix(".bim"), 166 | include=include, 167 | exclude=exclude, 168 | ) 169 | writer.run() 170 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | merge_group: 5 | pull_request: 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | pre-commit: 12 | name: Lint 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.11' 19 | - uses: pre-commit/action@v3.0.1 20 | python_test: 21 | name: Python tests 22 | runs-on: ${{ matrix.os }} 23 | defaults: 24 | run: 25 | shell: bash -el {0} 26 | strategy: 27 | matrix: 28 | # Use macos-13 because pip binary packages for ARM aren't 29 | # available for many dependencies 30 | os: [macos-13, macos-14, ubuntu-latest] 31 | python-version: ["3.10", "3.11", "3.12"] 32 | exclude: 33 | # Just run macos tests on one Python version 34 | - os: macos-13 35 | python-version: "3.10" 36 | - os: macos-13 37 | python-version: "3.11" 38 | - os: macos-13 39 | python-version: "3.12" 40 | - os: macos-14 41 | python-version: "3.10" 42 | - os: macos-14 43 | python-version: "3.12" 44 | steps: 45 | - uses: actions/checkout@v4 46 | 
- name: Set up Miniconda with Python ${{ matrix.python-version }} 47 | uses: conda-incubator/setup-miniconda@v3 48 | with: 49 | auto-update-conda: true 50 | python-version: ${{ matrix.python-version }} 51 | channels: conda-forge,bioconda 52 | - name: Install dependencies 53 | run: | 54 | conda install bcftools plink 55 | python -m pip install --upgrade pip 56 | python -m pip install '.[dev]' 57 | # Build the extension module in-place so pytest can find it 58 | python3 setup.py build_ext --inplace 59 | - name: Run tests 60 | run: | 61 | pytest 62 | 63 | c_python_test: 64 | name: CPython interface tests 65 | runs-on: ubuntu-latest 66 | steps: 67 | - uses: actions/checkout@v4 68 | - name: Set up Python 69 | uses: actions/setup-python@v5 70 | with: 71 | python-version: '3.11' 72 | - name: Install system dependencies 73 | run: | 74 | sudo apt install -y gcovr 75 | - name: Install python dependencies 76 | run: | 77 | python -m pip install --upgrade pip 78 | python -m pip install numpy pytest pytest_cov 79 | - name: Build module with coverage 80 | run: | 81 | # Build the extension module in-place so pytest can find it 82 | CFLAGS="--coverage" python3 setup.py build_ext --inplace 83 | - name: Run tests 84 | run: | 85 | pytest -vs tests/test_cpython_interface.py 86 | - name: Show coverage 87 | run: | 88 | gcovr --filter vcztools 89 | 90 | c_test: 91 | name: C tests 92 | runs-on: ubuntu-latest 93 | steps: 94 | - uses: actions/checkout@v4 95 | - name: Install dependencies 96 | run: | 97 | sudo apt install -y ninja-build libcunit1-dev valgrind meson gcovr 98 | - name: Build 99 | working-directory: ./lib 100 | run: | 101 | meson setup -Db_coverage=true build 102 | - name: Tests 103 | working-directory: ./lib 104 | run: | 105 | ninja -C build test 106 | - name: Show coverage 107 | working-directory: ./lib 108 | run: | 109 | ninja -C build coverage-text 110 | cat build/meson-logs/coverage.txt 111 | - name: Valgrind 112 | working-directory: ./lib 113 | run: | 114 | valgrind 
--leak-check=full --error-exitcode=1 ./build/tests 115 | 116 | packaging: 117 | name: Packaging 118 | runs-on: ubuntu-latest 119 | steps: 120 | - uses: actions/checkout@v4 121 | - uses: actions/setup-python@v5 122 | with: 123 | python-version: '3.11' 124 | - name: Install dependencies 125 | run: | 126 | python -m pip install --upgrade pip 127 | python -m pip install build twine validate-pyproject[all] 128 | - name: Check and install package 129 | run: | 130 | validate-pyproject pyproject.toml 131 | python -m build 132 | python -m twine check --strict dist/* 133 | python -m pip install dist/*.whl 134 | - name: Check vcztools CLI 135 | run: | 136 | vcztools --help 137 | # Make sure we don't have ``vcztools`` in the CWD 138 | cd tests 139 | python -m vcztools --help 140 | 141 | test-numpy-version: 142 | name: Test numpy versions 143 | runs-on: ubuntu-latest 144 | defaults: 145 | run: 146 | shell: bash -el {0} 147 | strategy: 148 | matrix: 149 | numpy: ["==1.26", ">=2"] 150 | steps: 151 | - uses: actions/checkout@v4 152 | - name: Set up Miniconda 153 | uses: conda-incubator/setup-miniconda@v3 154 | with: 155 | auto-update-conda: true 156 | python-version: '3.11' 157 | channels: conda-forge,bioconda 158 | - name: Install dependencies 159 | run: | 160 | conda install bcftools plink 161 | python -m pip install --upgrade pip 162 | python -m pip install '.[dev]' 163 | # Build the extension module in-place so pytest can find it 164 | python3 setup.py build_ext --inplace 165 | - name: Install numpy${{ matrix.numpy }} 166 | run: | 167 | python -m pip install 'numpy${{ matrix.numpy }}' 168 | - name: Run tests 169 | run: | 170 | pytest 171 | 172 | test-zarr-version: 173 | name: Test Zarr versions 174 | runs-on: ubuntu-latest 175 | defaults: 176 | run: 177 | shell: bash -el {0} 178 | strategy: 179 | matrix: 180 | zarr: ["==2.18.3", ">=3,!=3.0.5"] 181 | steps: 182 | - uses: actions/checkout@v4 183 | - name: Set up Miniconda 184 | uses: conda-incubator/setup-miniconda@v3 185 | 
with: 186 | auto-update-conda: true 187 | python-version: '3.11' 188 | channels: conda-forge,bioconda 189 | - name: Install dependencies 190 | run: | 191 | conda install bcftools plink 192 | python -m pip install --upgrade pip 193 | python -m pip install '.[dev]' 194 | # Build the extension module in-place so pytest can find it 195 | python3 setup.py build_ext --inplace 196 | - name: Install zarr${{ matrix.zarr }} 197 | run: | 198 | python -m pip install 'zarr${{ matrix.zarr }}' 199 | - name: Run tests 200 | run: | 201 | pytest 202 | -------------------------------------------------------------------------------- /vcztools/regions.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyranges import PyRanges 7 | 8 | 9 | def parse_region_string(region: str) -> tuple[str, int | None, int | None]: 10 | """Return the contig, start position and end position from a region string.""" 11 | if re.search(r":\d+-\d*$", region): 12 | contig, start_end = region.rsplit(":", 1) 13 | start, end = start_end.split("-") 14 | return contig, int(start), int(end) if len(end) > 0 else None 15 | elif re.search(r":\d+$", region): 16 | contig, start = region.rsplit(":", 1) 17 | return contig, int(start), int(start) 18 | else: 19 | contig = region 20 | return contig, None, None 21 | 22 | 23 | def regions_to_pyranges( 24 | regions: list[tuple[str, int | None, int | None]], all_contigs: list[str] 25 | ) -> PyRanges: 26 | """Convert region tuples to a PyRanges object.""" 27 | 28 | chromosomes = [] 29 | starts = [] 30 | ends = [] 31 | for contig, start, end in regions: 32 | if start is None: 33 | start = 0 34 | else: 35 | start -= 1 36 | 37 | if end is None: 38 | end = np.iinfo(np.int64).max 39 | 40 | chromosomes.append(all_contigs.index(contig)) 41 | starts.append(start) 42 | ends.append(end) 43 | 44 | return PyRanges(chromosomes=chromosomes, starts=starts, 
ends=ends)


def parse_regions(
    regions: list[str] | str | None, all_contigs: list[str]
) -> PyRanges | None:
    """Return a PyRanges object from a comma-separated set of region strings,
    or a list of region strings."""
    if regions is None:
        return None
    elif isinstance(regions, list):
        regions_list = regions
    else:
        regions_list = regions.split(",")
    return regions_to_pyranges(
        [parse_region_string(region) for region in regions_list], all_contigs
    )


def parse_targets(
    targets: list[str] | str | None, all_contigs: list[str]
) -> tuple[PyRanges | None, bool]:
    """Return a PyRanges object from a comma-separated set of region strings,
    optionally preceded by a ^ character to indicate complement,
    or a list of region strings."""
    if targets is None:
        return None, False
    elif isinstance(targets, list):
        # A list of targets is never treated as complemented; the ^ prefix
        # is only recognised on the comma-separated string form.
        targets_list = targets
        complement = False
    else:
        complement = targets.startswith("^")
        targets_list = (targets[1:] if complement else targets).split(",")
    return (
        parse_regions(targets_list, all_contigs),
        complement,
    )


def regions_to_chunk_indexes(
    regions: PyRanges | None,
    targets: PyRanges | None,
    complement: bool,
    regions_index: Any,
):
    """Return chunk indexes that overlap the given regions or targets.

    If both regions and targets are specified then only regions are used
    to find overlapping chunks (since targets are used later to refine).

    If only targets are specified then they are used to find overlapping chunks,
    taking into account the complement flag.
    """

    # Create pyranges for chunks using the region index.
99 | # For regions use max end position, for targets just end position 100 | chunk_index = regions_index[:, 0] 101 | contig_id = regions_index[:, 1] 102 | start_position = regions_index[:, 2] 103 | end_position = regions_index[:, 3] 104 | max_end_position = regions_index[:, 4] 105 | df = pd.DataFrame( 106 | { 107 | "chunk_index": chunk_index, 108 | "Chromosome": contig_id, 109 | "Start": start_position, 110 | "End": max_end_position if regions is not None else end_position, 111 | } 112 | ) 113 | chunk_regions = PyRanges(df) 114 | 115 | if regions is not None: 116 | overlap = chunk_regions.overlap(regions) 117 | elif complement: 118 | overlap = chunk_regions.subtract(targets) 119 | else: 120 | overlap = chunk_regions.overlap(targets) 121 | if overlap.empty: 122 | return np.empty((0,), dtype=np.int64) 123 | chunk_indexes = overlap.df["chunk_index"].to_numpy() 124 | chunk_indexes = np.unique(chunk_indexes) 125 | return chunk_indexes 126 | 127 | 128 | def regions_to_selection( 129 | regions: PyRanges | None, 130 | targets: PyRanges | None, 131 | complement: bool, 132 | variant_contig: Any, 133 | variant_position: Any, 134 | variant_length: Any, 135 | ): 136 | """Return a variant selection that corresponds to the given regions and targets. 137 | 138 | If both regions and targets are specified then they are both used to find 139 | overlapping variants. 
140 | """ 141 | 142 | # subtract 1 from start coordinate to convert intervals 143 | # from VCF (1-based, fully-closed) to Python (0-based, half-open) 144 | variant_start = variant_position - 1 145 | 146 | if regions is not None: 147 | variant_end = variant_start + variant_length 148 | df = pd.DataFrame( 149 | {"Chromosome": variant_contig, "Start": variant_start, "End": variant_end} 150 | ) 151 | # save original index as column so we can retrieve it after finding overlap 152 | df["index"] = df.index 153 | variant_regions = PyRanges(df) 154 | else: 155 | variant_regions = None 156 | 157 | if targets is not None: 158 | targets_variant_end = variant_position # length 1 159 | df = pd.DataFrame( 160 | { 161 | "Chromosome": variant_contig, 162 | "Start": variant_start, 163 | "End": targets_variant_end, 164 | } 165 | ) 166 | # save original index as column so we can retrieve it after finding overlap 167 | df["index"] = df.index 168 | variant_targets = PyRanges(df) 169 | else: 170 | variant_targets = None 171 | 172 | if variant_regions is not None: 173 | regions_overlap = variant_regions.overlap(regions) 174 | else: 175 | regions_overlap = None 176 | 177 | if variant_targets is not None: 178 | if complement: 179 | targets_overlap = variant_targets.subtract(targets) 180 | else: 181 | targets_overlap = variant_targets.overlap(targets) 182 | else: 183 | targets_overlap = None 184 | 185 | if regions_overlap is not None and targets_overlap is not None: 186 | overlap = regions_overlap.overlap(targets_overlap) 187 | elif regions_overlap is not None: 188 | overlap = regions_overlap 189 | else: 190 | overlap = targets_overlap 191 | 192 | if overlap.empty: 193 | return np.empty((0,), dtype=np.int64) 194 | return overlap.df["index"].to_numpy() 195 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from unittest import mock 3 | 4 | 
import click.testing as ct 5 | import pytest 6 | 7 | import vcztools.cli as cli 8 | from tests.test_bcftools_validation import run_vcztools 9 | from tests.utils import vcz_path_cache 10 | from vcztools import provenance 11 | 12 | 13 | @pytest.fixture() 14 | def vcz_path(): 15 | vcf_path = pathlib.Path("tests/data/vcf/sample.vcf.gz") 16 | return vcz_path_cache(vcf_path) 17 | 18 | 19 | def test_version_header(vcz_path): 20 | output, _ = run_vcztools(f"view {vcz_path}") 21 | assert output.find("##vcztools_viewCommand=") >= 0 22 | assert output.find("Date=") >= 0 23 | 24 | 25 | class TestOutput: 26 | def test_view_unsupported_output(self, tmp_path, vcz_path): 27 | bad_output = tmp_path / "output.vcf.gz" 28 | 29 | _, vcztools_error = run_vcztools( 30 | f"view --no-version {vcz_path} -o {bad_output}", expect_error=True 31 | ) 32 | assert ( 33 | "Only uncompressed VCF output supported, suffix .gz not allowed" 34 | in vcztools_error 35 | ) 36 | 37 | @pytest.mark.parametrize("suffix", ["gz", "bgz", "bcf"]) 38 | def test_view_unsupported_output_suffix(self, tmp_path, vcz_path, suffix): 39 | bad_output = tmp_path / f"output.vcf.{suffix}" 40 | 41 | _, vcztools_error = run_vcztools( 42 | f"view --no-version {vcz_path} -o {bad_output}", expect_error=True 43 | ) 44 | assert f".{suffix} not allowed" in vcztools_error 45 | 46 | def test_view_good_path(self, tmp_path, vcz_path): 47 | output_path = tmp_path / "tmp.vcf" 48 | runner = ct.CliRunner() 49 | result = runner.invoke( 50 | cli.vcztools_main, 51 | f"view --no-version {vcz_path} -o {output_path}", 52 | catch_exceptions=False, 53 | ) 54 | assert result.exit_code == 0 55 | assert len(result.stdout) == 0 56 | assert output_path.exists() 57 | 58 | def test_view_write_directory(self, tmp_path, vcz_path): 59 | runner = ct.CliRunner() 60 | result = runner.invoke( 61 | cli.vcztools_main, 62 | f"view --no-version {vcz_path} -o {tmp_path}", 63 | catch_exceptions=False, 64 | ) 65 | assert result.exit_code == 1 66 | assert 
len(result.stdout) == 0
        assert "Is a directory" in result.stderr

    # NOTE(review): this test is a verbatim copy of test_view_write_directory
    # above and does not exercise writing to a pipe at all — looks like a
    # copy-paste placeholder. TODO: rewrite it to write to an actual pipe
    # (or remove it) once the intended behaviour is confirmed.
    def test_view_write_pipe(self, tmp_path, vcz_path):
        runner = ct.CliRunner()
        result = runner.invoke(
            cli.vcztools_main,
            f"view --no-version {vcz_path} -o {tmp_path}",
            catch_exceptions=False,
        )
        assert result.exit_code == 1
        assert len(result.stdout) == 0
        assert "Is a directory" in result.stderr


def test_excluding_and_including_samples(vcz_path):
    # Combining -s (sample list) with -S (samples file) must be rejected,
    # whichever of the two carries the ^ (complement) prefix.
    samples_file_path = pathlib.Path("tests/data/txt/samples.txt")
    error_message = "vcztools does not support combining -s and -S"

    _, vcztools_error = run_vcztools(
        f"view {vcz_path} -s NA00001 -S ^{samples_file_path}", expect_error=True
    )
    assert error_message in vcztools_error
    _, vcztools_error = run_vcztools(
        f"view {vcz_path} -s ^NA00001 -S {samples_file_path}", expect_error=True
    )
    assert error_message in vcztools_error


@mock.patch("sys.exit")
@mock.patch("os.dup2")
def test_broken_pipe(mocked_dup2, mocked_exit, tmp_path):
    # handle_broken_pipe should swallow the BrokenPipeError, redirect the
    # dangling fd (os.dup2) and exit with status 1.
    with open(tmp_path / "tmp.txt", "w") as output:
        with cli.handle_broken_pipe(output):
            raise BrokenPipeError()
    mocked_dup2.assert_called_once()
    mocked_exit.assert_called_once_with(1)


class TestQuery:
    def test_format_required(self, vcz_path):
        runner = ct.CliRunner()
        result = runner.invoke(
            cli.vcztools_main,
            f"query {vcz_path} ",
            catch_exceptions=False,
        )
        assert result.exit_code != 0
        assert len(result.stdout) == 0
        assert len(result.stderr) > 0

    def test_path_required(self):
        runner = ct.CliRunner()
        result = runner.invoke(
            cli.vcztools_main,
            "query --format=POS ",
            catch_exceptions=False,
        )
        assert result.exit_code != 0
        assert len(result.stdout) == 0
        assert len(result.stderr) > 0

    def test_list(self, vcz_path):
result, _ = run_vcztools(f"query -l {vcz_path}") 130 | assert list(result.splitlines()) == ["NA00001", "NA00002", "NA00003"] 131 | 132 | def test_list_ignores_output(self, vcz_path, tmp_path): 133 | output = tmp_path / "tmp.txt" 134 | result, _ = run_vcztools(f"query -l {vcz_path} -o {output}") 135 | assert list(result.splitlines()) == ["NA00001", "NA00002", "NA00003"] 136 | assert not output.exists() 137 | 138 | def test_output(self, vcz_path, tmp_path): 139 | output = tmp_path / "tmp.txt" 140 | result, _ = run_vcztools(f"query -f '%POS\n' {vcz_path} -o {output}") 141 | assert list(result.splitlines()) == [] 142 | assert output.exists() 143 | 144 | 145 | class TestIndex: 146 | def test_stats(self, vcz_path): 147 | result, _ = run_vcztools(f"index -s {vcz_path}") 148 | assert list(result.splitlines()) == ["19\t.\t2", "20\t.\t6", "X\t.\t1"] 149 | 150 | def test_nrecords(self, vcz_path): 151 | result, _ = run_vcztools(f"index -n {vcz_path}") 152 | assert list(result.splitlines()) == ["9"] 153 | 154 | def test_stats_and_nrecords(self, vcz_path): 155 | runner = ct.CliRunner() 156 | result = runner.invoke( 157 | cli.vcztools_main, 158 | f"index -ns {vcz_path}", 159 | catch_exceptions=False, 160 | ) 161 | assert result.exit_code != 0 162 | assert len(result.stdout) == 0 163 | assert len(result.stderr) > 0 164 | assert "Expected only one of --stats or --nrecords options" in result.stderr 165 | 166 | def test_no_stats_or_nrecords(self, vcz_path): 167 | runner = ct.CliRunner() 168 | result = runner.invoke( 169 | cli.vcztools_main, 170 | f"index {vcz_path}", 171 | catch_exceptions=False, 172 | ) 173 | assert result.exit_code != 0 174 | assert len(result.stdout) == 0 175 | assert len(result.stderr) > 0 176 | assert "Error: Building region indexes is not supported" in result.stderr 177 | 178 | 179 | def test_top_level(): 180 | runner = ct.CliRunner() 181 | result = runner.invoke( 182 | cli.vcztools_main, 183 | catch_exceptions=False, 184 | ) 185 | assert result.exit_code != 0 
186 | assert len(result.stdout) == 0 187 | assert len(result.stderr) > 0 188 | 189 | 190 | def test_version(): 191 | runner = ct.CliRunner() 192 | result = runner.invoke(cli.vcztools_main, ["--version"], catch_exceptions=False) 193 | s = f"version {provenance.__version__}\n" 194 | assert result.stdout.endswith(s) 195 | -------------------------------------------------------------------------------- /tests/test_query.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | from io import StringIO 4 | 5 | import numpy as np 6 | import pyparsing as pp 7 | import pytest 8 | import zarr 9 | 10 | from tests.utils import vcz_path_cache 11 | from vcztools.query import ( 12 | QueryFormatGenerator, 13 | QueryFormatParser, 14 | list_samples, 15 | write_query, 16 | ) 17 | from vcztools.retrieval import variant_chunk_iter 18 | 19 | 20 | def test_list_samples(tmp_path): 21 | vcf_path = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 22 | vcz_path = vcz_path_cache(vcf_path) 23 | expected_output = "NA00001\nNA00002\nNA00003\n" 24 | 25 | with StringIO() as output: 26 | list_samples(vcz_path, output) 27 | assert output.getvalue() == expected_output 28 | 29 | 30 | class TestQueryFormatParser: 31 | @pytest.fixture() 32 | def parser(self): 33 | return QueryFormatParser() 34 | 35 | @pytest.mark.parametrize( 36 | ("expression", "expected_result"), 37 | [ 38 | ("%CHROM", ["%CHROM"]), 39 | (r"\n", ["\n"]), 40 | (r"\t", ["\t"]), 41 | (r"%CHROM\n", ["%CHROM", "\n"]), 42 | ("%CHROM %POS %REF", ["%CHROM", " ", "%POS", " ", "%REF"]), 43 | (r"%CHROM %POS0 %REF\n", ["%CHROM", " ", "%POS0", " ", "%REF", "\n"]), 44 | ( 45 | r"%CHROM\t%POS\t%REF\t%ALT{0}\n", 46 | ["%CHROM", "\t", "%POS", "\t", "%REF", "\t", ["%ALT", 0], "\n"], 47 | ), 48 | ( 49 | r"%CHROM\t%POS0\t%END\t%ID\n", 50 | ["%CHROM", "\t", "%POS0", "\t", "%END", "\t", "%ID", "\n"], 51 | ), 52 | (r"%CHROM:%POS\n", ["%CHROM", ":", "%POS", "\n"]), 53 | (r"%AC{1}\n", [["%AC", 1], 
"\n"]), 54 | ( 55 | r"Read depth: %INFO/DP\n", 56 | ["Read", " ", "depth:", " ", "%INFO/DP", "\n"], 57 | ), 58 | ( 59 | r"%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n", 60 | [ 61 | "%CHROM", 62 | "\t", 63 | "%POS", 64 | "\t", 65 | "%REF", 66 | "\t", 67 | "%ALT", 68 | ["\t", "%SAMPLE", "=", "%GT"], 69 | "\n", 70 | ], 71 | ), 72 | ( 73 | r"%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT{0}]\n", 74 | [ 75 | "%CHROM", 76 | "\t", 77 | "%POS", 78 | "\t", 79 | "%REF", 80 | "\t", 81 | "%ALT", 82 | ["\t", "%SAMPLE", "=", ["%GT", 0]], 83 | "\n", 84 | ], 85 | ), 86 | ( 87 | r"GQ:[ %GQ] \t GT:[ %GT]\n", 88 | ["GQ:", [" ", "%GQ"], " ", "\t", " ", "GT:", [" ", "%GT"], "\n"], 89 | ), 90 | ( 91 | r"[%SAMPLE %GT %DP\n]", 92 | [["%SAMPLE", " ", "%GT", " ", "%DP", "\n"]], 93 | ), 94 | ], 95 | ) 96 | def test_valid_expressions(self, parser, expression, expected_result): 97 | assert parser(expression).as_list() == expected_result 98 | 99 | @pytest.mark.parametrize( 100 | "expression", 101 | [ 102 | "%ac", 103 | "%AC {1}", 104 | "% CHROM", 105 | ], 106 | ) 107 | def test_invalid_expressions(self, parser, expression): 108 | with pytest.raises(pp.ParseException): 109 | parser(expression) 110 | 111 | 112 | class TestQueryFormatEvaluator: 113 | @pytest.fixture() 114 | def root(self): 115 | vcf_path = pathlib.Path("tests/data/vcf/sample.vcf.gz") 116 | vcz_path = vcz_path_cache(vcf_path) 117 | return zarr.open(vcz_path, mode="r") 118 | 119 | @pytest.mark.parametrize( 120 | ("query_format", "expected_result"), 121 | [ 122 | (r"A\t", "A\t" * 9), 123 | (r"CHROM", "CHROM" * 9), 124 | ( 125 | r"%CHROM:%POS\n", 126 | "19:111\n19:112\n20:14370\n20:17330\n20:1110696\n20:1230237\n20:1234567\n20:1235237\nX:10\n", 127 | ), 128 | (r"%INFO/DP\n", ".\n.\n14\n11\n10\n13\n9\n.\n.\n"), 129 | (r"%AC\n", ".\n.\n.\n.\n.\n.\n1,1\n.\n.\n"), 130 | (r"%AC{0}\n", ".\n.\n.\n.\n.\n.\n1\n.\n.\n"), 131 | ], 132 | ) 133 | def test(self, root, query_format, expected_result): 134 | generator = QueryFormatGenerator( 135 | query_format, 
136 | root["sample_id"][:], 137 | root["contig_id"][:], 138 | root["filter_id"][:], 139 | ) 140 | chunk_data = next(variant_chunk_iter(root)) 141 | result = "".join(generator(chunk_data)) 142 | assert result == expected_result 143 | 144 | # fmt: off 145 | @pytest.mark.parametrize( 146 | ("query_format", "call_mask", "expected_result"), 147 | [ 148 | ( 149 | r"[%DP ]\n", 150 | None, 151 | ". . . \n. . . \n1 8 5 \n3 5 3 \n6 0 4 \n. 4 2 \n4 2 3 \n. . . \n. . . \n", # noqa: E501 152 | ), 153 | ( 154 | r"[%DP ]\n", 155 | np.array( 156 | [ 157 | [1, 1, 1,], 158 | [1, 1, 1,], 159 | [1, 0, 1,], 160 | [1, 1, 1,], 161 | [1, 1, 1,], 162 | [1, 1, 1,], 163 | [1, 1, 1,], 164 | [1, 1, 1,], 165 | [1, 1, 1,], 166 | ] 167 | ), 168 | ". . . \n. . . \n1 5 \n3 5 3 \n6 0 4 \n. 4 2 \n4 2 3 \n. . . \n. . . \n", # noqa: E501 169 | ), 170 | ], 171 | ) 172 | # fmt: on 173 | def test_call_mask(self, root, query_format, call_mask, expected_result): 174 | generator = QueryFormatGenerator( 175 | query_format, 176 | root["sample_id"][:], 177 | root["contig_id"][:], 178 | root["filter_id"][:], 179 | ) 180 | chunk_data = next(variant_chunk_iter(root)) 181 | if call_mask is not None: 182 | chunk_data["call_mask"] = call_mask 183 | result = "".join(generator(chunk_data)) 184 | assert result == expected_result 185 | 186 | @pytest.mark.parametrize( 187 | ("query_format", "expected_result"), 188 | [(r"%QUAL\n", "9.6\n10\n29\n3\n67\n47\n50\n.\n10\n")], 189 | ) 190 | def test_with_parse_results(self, root, query_format, expected_result): 191 | parser = QueryFormatParser() 192 | parse_results = parser(query_format) 193 | generator = QueryFormatGenerator( 194 | parse_results, 195 | root["sample_id"][:], 196 | root["contig_id"][:], 197 | root["filter_id"][:], 198 | ) 199 | chunk_data = next(variant_chunk_iter(root)) 200 | result = "".join(generator(chunk_data)) 201 | assert result == expected_result 202 | 203 | 204 | def test_write_query__include_exclude(tmp_path): 205 | original = 
pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 206 | vcz = vcz_path_cache(original) 207 | output = tmp_path.joinpath("output.vcf") 208 | 209 | query_format = r"%POS\n" 210 | variant_site_filter = "POS > 1" 211 | 212 | with pytest.raises( 213 | ValueError, 214 | match=re.escape( 215 | "Cannot handle both an include expression and an exclude expression." 216 | ), 217 | ): 218 | write_query( 219 | vcz, 220 | output, 221 | query_format=query_format, 222 | include=variant_site_filter, 223 | exclude=variant_site_filter, 224 | ) 225 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from collections.abc import Iterator 3 | from contextlib import contextmanager 4 | from itertools import zip_longest 5 | 6 | import cyvcf2 7 | import numpy as np 8 | from bio2zarr import vcf 9 | 10 | 11 | @contextmanager 12 | def open_vcf(path) -> Iterator[cyvcf2.VCF]: 13 | """A context manager for opening a VCF file.""" 14 | vcf = cyvcf2.VCF(path) 15 | try: 16 | yield vcf 17 | finally: 18 | vcf.close() 19 | 20 | 21 | def normalise_info_missingness(info_dict, key): 22 | value = info_dict.get(key, None) 23 | if isinstance(value, tuple): 24 | if all(x is None for x in value): 25 | value = None 26 | elif isinstance(value, str): 27 | if all(x == "." 
for x in value.split(",")): 28 | value = None 29 | return value 30 | 31 | 32 | def _get_header_field_dicts(vcf, header_type): 33 | def to_dict(header_field): 34 | d = header_field.info(extra=True) 35 | del d[b"IDX"] # remove IDX since we don't care about ordering 36 | 37 | # cyvcf2 duplicates some keys as strings and bytes, so remove the bytes one 38 | for k in list(d.keys()): 39 | if isinstance(k, bytes) and k.decode("utf-8") in d: 40 | del d[k] 41 | return d 42 | 43 | return { 44 | field["ID"]: to_dict(field) 45 | for field in vcf.header_iter() 46 | if field["HeaderType"] == header_type 47 | } 48 | 49 | 50 | def _assert_header_field_dicts_equivalent(field_dicts1, field_dicts2): 51 | assert len(field_dicts1) == len(field_dicts2) 52 | 53 | for id in field_dicts1.keys(): 54 | assert id in field_dicts2 55 | field_dict1 = field_dicts1[id] 56 | field_dict2 = field_dicts2[id] 57 | 58 | assert len(field_dict1) == len(field_dict2) 59 | # all fields should be the same, except Number="." which can match any value 60 | for k in field_dict1.keys(): 61 | assert k in field_dict2 62 | v1 = field_dict1[k] 63 | v2 = field_dict2[k] 64 | if k == "Number" and (v1 == "." 
or v2 == "."): 65 | continue 66 | assert v1 == v2, f"Failed in field {id} with key {k}" 67 | 68 | 69 | def _assert_vcf_headers_equivalent(vcf1, vcf2): 70 | # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order 71 | # Other fields are ignored 72 | 73 | info1 = _get_header_field_dicts(vcf1, "INFO") 74 | info2 = _get_header_field_dicts(vcf2, "INFO") 75 | _assert_header_field_dicts_equivalent(info1, info2) 76 | 77 | format1 = _get_header_field_dicts(vcf1, "FORMAT") 78 | format2 = _get_header_field_dicts(vcf2, "FORMAT") 79 | _assert_header_field_dicts_equivalent(format1, format2) 80 | 81 | filter1 = _get_header_field_dicts(vcf1, "FILTER") 82 | filter2 = _get_header_field_dicts(vcf2, "FILTER") 83 | _assert_header_field_dicts_equivalent(filter1, filter2) 84 | 85 | contig1 = _get_header_field_dicts(vcf1, "CONTIG") 86 | contig2 = _get_header_field_dicts(vcf2, "CONTIG") 87 | _assert_header_field_dicts_equivalent(contig1, contig2) 88 | 89 | 90 | def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False): 91 | """Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files. 92 | 93 | Raises an `AssertionError` if two VCF files are not equal to one another. 94 | Float values in QUAL, INFO, or FORMAT fields are compared up to the 95 | desired tolerance. All other values must match exactly. 96 | 97 | Parameters 98 | ---------- 99 | f1 100 | Path to first VCF to compare. 101 | f2 102 | Path to second VCF to compare. 103 | rtol 104 | Relative tolerance. 105 | atol 106 | Absolute tolerance. 
107 | """ 108 | with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2: 109 | _assert_vcf_headers_equivalent(vcf1, vcf2) 110 | assert vcf1.samples == vcf2.samples 111 | 112 | count = 0 113 | for v1, v2 in zip_longest(vcf1, vcf2): 114 | if v1 is None and v2 is not None: 115 | raise AssertionError(f"Right contains extra variant: {v2}") 116 | if v1 is not None and v2 is None: 117 | raise AssertionError(f"Left contains extra variant: {v1}") 118 | 119 | count += 1 120 | 121 | assert v1.CHROM == v2.CHROM, f"CHROM not equal for variants\n{v1}{v2}" 122 | assert v1.POS == v2.POS, f"POS not equal for variants\n{v1}{v2}" 123 | assert v1.ID == v2.ID, f"ID not equal for variants\n{v1}{v2}" 124 | assert v1.REF == v2.REF, f"REF not equal for variants\n{v1}{v2}" 125 | assert v1.ALT == v2.ALT, f"ALT not equal for variants\n{v1}{v2}" 126 | np.testing.assert_allclose( 127 | np.array(v1.QUAL, dtype=np.float32), 128 | np.array(v2.QUAL, dtype=np.float32), 129 | rtol=rtol, 130 | atol=atol, 131 | err_msg=f"QUAL not equal for variants\n{v1}{v2}", 132 | ) 133 | assert set(v1.FILTERS) == set( 134 | v2.FILTERS 135 | ), f"FILTER not equal for variants\n{v1}{v2}" 136 | 137 | v1_info = dict(v1.INFO) 138 | v2_info = dict(v2.INFO) 139 | all_keys = set(v1_info.keys()) | set(v2_info.keys()) 140 | for k in all_keys: 141 | val1 = normalise_info_missingness(v1_info, k) 142 | val2 = normalise_info_missingness(v2_info, k) 143 | # values are python objects (not np arrays) 144 | if isinstance(val1, float) or ( 145 | isinstance(val1, tuple) and any(isinstance(v, float) for v in val1) 146 | ): 147 | np.testing.assert_allclose( 148 | np.array(val1, dtype=np.float32), 149 | np.array(val2, dtype=np.float32), 150 | rtol=rtol, 151 | atol=atol, 152 | err_msg=f"INFO {k} not equal for variants\n{v1}{v2}", 153 | ) 154 | else: 155 | assert val1 == val2, f"INFO {k} not equal for variants\n{v1}{v2}" 156 | 157 | # NOTE skipping this because it requires items to be in the same order. 
158 | # assert v1.FORMAT == v2.FORMAT, f"FORMAT not equal for variants\n{v1}{v2}" 159 | for field in v1.FORMAT: 160 | if field == "GT": 161 | assert ( 162 | v1.genotypes == v2.genotypes 163 | ), f"GT not equal for variants\n{v1}{v2}" 164 | else: 165 | val1 = v1.format(field) 166 | val2 = v2.format(field) 167 | if val2 is None: 168 | # FIXME this is a quick hack to workaround missing support for 169 | # dealing with the field missing vs all-elements-in-field 170 | # missing issue. 171 | # https://github.com/jeromekelleher/vcztools/issues/14 172 | assert [str(x) == "." for x in val1] 173 | else: 174 | if val1.dtype.kind == "f": 175 | np.testing.assert_allclose( 176 | val1, 177 | val2, 178 | rtol=rtol, 179 | atol=atol, 180 | err_msg=f"FORMAT {field} not equal for " 181 | f"variants\n{v1}{v2}", 182 | ) 183 | else: 184 | np.testing.assert_array_equal( 185 | val1, 186 | val2, 187 | err_msg=f"FORMAT {field} not equal for " 188 | f"variants\n{v1}{v2}", 189 | ) 190 | 191 | if not allow_zero_variants: 192 | assert count > 0, "No variants in file" 193 | 194 | 195 | def vcz_path_cache(vcf_path): 196 | """ 197 | Store converted files in a cache to speed up tests. We're not testing 198 | vcf2zarr here, so no point in running over and over again. 
199 | """ 200 | cache_path = pathlib.Path("vcz_test_cache") 201 | if not cache_path.exists(): 202 | cache_path.mkdir() 203 | cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz") 204 | if not cached_vcz_path.exists(): 205 | if vcf_path.name.startswith("chr22"): 206 | vcf.convert( 207 | [vcf_path], 208 | cached_vcz_path, 209 | worker_processes=0, 210 | variants_chunk_size=10, 211 | samples_chunk_size=10, 212 | ) 213 | else: 214 | vcf.convert( 215 | [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False 216 | ) 217 | return cached_vcz_path 218 | -------------------------------------------------------------------------------- /vcztools/retrieval.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | import numpy as np 4 | import zarr 5 | 6 | from vcztools import filter as filter_mod 7 | from vcztools.regions import ( 8 | parse_regions, 9 | parse_targets, 10 | regions_to_chunk_indexes, 11 | regions_to_selection, 12 | ) 13 | from vcztools.samples import parse_samples 14 | 15 | 16 | # NOTE: this class is just a skeleton for now. The idea is that this 17 | # will provide readahead, caching etc, and will be the central location 18 | # for fetching bulk Zarr data. 19 | class VariantChunkReader(collections.abc.Sequence): 20 | """ 21 | Retrieve data from a Zarr store and return chunk-by-chunk in the 22 | variants dimension. 
23 | """ 24 | 25 | def __init__(self, root, *, fields=None): 26 | self.root = root 27 | if fields is None: 28 | fields = [ 29 | key 30 | for key in root.keys() 31 | if key.startswith("variant_") or key.startswith("call_") 32 | ] 33 | self.arrays = {key: self.root[key] for key in fields} 34 | # TODO validate the arrays have the correct shapes setc 35 | self.num_chunks = next(iter(self.arrays.values())).cdata_shape[0] 36 | 37 | def __len__(self): 38 | return self.num_chunks 39 | 40 | def __getitem__(self, chunk): 41 | return {key: array.blocks[chunk] for key, array in self.arrays.items()} 42 | 43 | def get_chunk_data(self, chunk, mask, samples_selection=None): 44 | num_samples = len(samples_selection) if samples_selection is not None else 0 45 | return { 46 | key: get_vchunk_array( 47 | array, 48 | chunk, 49 | mask, 50 | samples_selection 51 | if (key.startswith("call_") and num_samples > 0) 52 | else None, 53 | ) 54 | for key, array in self.arrays.items() 55 | } 56 | 57 | 58 | def get_vchunk_array(zarray, v_chunk, mask, samples_selection=None): 59 | v_chunksize = zarray.chunks[0] 60 | start = v_chunksize * v_chunk 61 | end = v_chunksize * (v_chunk + 1) 62 | if samples_selection is None: 63 | result = zarray[start:end] 64 | else: 65 | result = zarray.oindex[start:end, samples_selection] 66 | if mask is not None: 67 | result = result[mask] 68 | return result 69 | 70 | 71 | def variant_chunk_index_iter(root, regions=None, targets=None): 72 | """Iterate over variant chunk indexes that overlap the given regions or targets. 73 | 74 | Returns tuples of variant chunk indexes and (optional) variant masks. 75 | 76 | A variant mask of None indicates that all the variants in the chunk are included. 
77 | """ 78 | 79 | pos = root["variant_position"] 80 | 81 | if regions is None and targets is None: 82 | num_chunks = pos.cdata_shape[0] 83 | # no regions or targets selected 84 | for v_chunk in range(num_chunks): 85 | v_mask_chunk = None 86 | yield v_chunk, v_mask_chunk 87 | 88 | else: 89 | contigs_u = root["contig_id"][:].astype("U").tolist() 90 | regions_pyranges = parse_regions(regions, contigs_u) 91 | targets_pyranges, complement = parse_targets(targets, contigs_u) 92 | 93 | # Use the region index to find the chunks that overlap specfied regions or 94 | # targets 95 | region_index = root["region_index"][:] 96 | chunk_indexes = regions_to_chunk_indexes( 97 | regions_pyranges, 98 | targets_pyranges, 99 | complement, 100 | region_index, 101 | ) 102 | 103 | # Then use only load required variant_contig/position chunks 104 | if len(chunk_indexes) == 0: 105 | # no chunks - no variants to write 106 | return 107 | elif len(chunk_indexes) == 1: 108 | # single chunk 109 | block_sel = chunk_indexes[0] 110 | else: 111 | # zarr.blocks doesn't support int array indexing - use that when it does 112 | block_sel = slice(chunk_indexes[0], chunk_indexes[-1] + 1) 113 | 114 | region_variant_contig = root["variant_contig"].blocks[block_sel][:] 115 | region_variant_position = root["variant_position"].blocks[block_sel][:] 116 | region_variant_length = root["variant_length"].blocks[block_sel][:] 117 | 118 | # Find the final variant selection 119 | variant_selection = regions_to_selection( 120 | regions_pyranges, 121 | targets_pyranges, 122 | complement, 123 | region_variant_contig, 124 | region_variant_position, 125 | region_variant_length, 126 | ) 127 | variant_mask = np.zeros(region_variant_position.shape[0], dtype=bool) 128 | variant_mask[variant_selection] = 1 129 | # Use zarr arrays to get mask chunks aligned with the main data 130 | # for convenience. 
131 | z_variant_mask = zarr.array(variant_mask, chunks=pos.chunks[0]) 132 | 133 | for i, v_chunk in enumerate(chunk_indexes): 134 | v_mask_chunk = z_variant_mask.blocks[i] 135 | yield v_chunk, v_mask_chunk 136 | 137 | 138 | def variant_chunk_index_iter_with_filtering( 139 | root, 140 | *, 141 | regions=None, 142 | targets=None, 143 | include: str | None = None, 144 | exclude: str | None = None, 145 | ): 146 | """Iterate over variant chunk indexes that overlap the given regions or targets 147 | and which match the include/exclude filter expression. 148 | 149 | Returns tuples of variant chunk indexes and (optional) variant masks. 150 | 151 | A variant mask of None indicates that all the variants in the chunk are included. 152 | """ 153 | 154 | filter_expr = filter_mod.FilterExpression( 155 | field_names=set(root), include=include, exclude=exclude 156 | ) 157 | if filter_expr.parse_result is None: 158 | filter_expr = None 159 | else: 160 | filter_fields = list(filter_expr.referenced_fields) 161 | filter_fields_reader = VariantChunkReader(root, fields=filter_fields) 162 | 163 | for v_chunk, v_mask_chunk in variant_chunk_index_iter(root, regions, targets): 164 | if filter_expr is not None: 165 | chunk_data = filter_fields_reader[v_chunk] 166 | v_mask_chunk_filter = filter_expr.evaluate(chunk_data) 167 | if v_mask_chunk is None: 168 | v_mask_chunk = v_mask_chunk_filter 169 | else: 170 | if v_mask_chunk_filter.ndim == 2: 171 | v_mask_chunk = np.expand_dims(v_mask_chunk, axis=1) 172 | v_mask_chunk = np.logical_and(v_mask_chunk, v_mask_chunk_filter) 173 | if v_mask_chunk is None or np.any(v_mask_chunk): 174 | yield v_chunk, v_mask_chunk 175 | 176 | 177 | def variant_chunk_iter( 178 | root, 179 | *, 180 | fields: list[str] | None = None, 181 | regions=None, 182 | targets=None, 183 | include: str | None = None, 184 | exclude: str | None = None, 185 | samples_selection=None, 186 | ): 187 | if fields is not None and len(fields) == 0: 188 | return # empty iterator 189 | 
    query_fields_reader = VariantChunkReader(root, fields=fields)
    for v_chunk, v_mask_chunk in variant_chunk_index_iter_with_filtering(
        root,
        regions=regions,
        targets=targets,
        include=include,
        exclude=exclude,
    ):
        # The variants_selection is used to subset variant chunks along
        # the variants dimension.
        # The call_mask is returned to the client to indicate which samples
        # matched (for each variant) in the case of per-sample filtering.
        if v_mask_chunk is None or v_mask_chunk.ndim == 1:
            # Site-level mask (or no mask): no per-sample information.
            variants_selection = v_mask_chunk
            call_mask = None
        else:
            # 2D (variants, samples) mask: keep variants where any sample
            # matched, and retain the per-sample mask for those variants.
            variants_selection = np.any(v_mask_chunk, axis=1)
            call_mask = v_mask_chunk[variants_selection]
            if samples_selection is not None:
                call_mask = call_mask[:, samples_selection]
        chunk_data = query_fields_reader.get_chunk_data(
            v_chunk, variants_selection, samples_selection=samples_selection
        )
        if call_mask is not None:
            chunk_data["call_mask"] = call_mask
        yield chunk_data


def variant_iter(
    vcz,
    *,
    fields: list[str] | None = None,
    regions: str | None = None,
    targets: str | None = None,
    include: str | None = None,
    exclude: str | None = None,
    samples: list[str] | str | None = None,
):
    """Iterate over variants that overlap the given regions or targets
    and which match the include/exclude filter expression.

    Only values for the samples specified are returned.

    Returns dicts containing the specified fields keyed by VCF Zarr name.

    By default all fields for all variants and samples are returned.
235 | """ 236 | root = zarr.open(vcz, mode="r") 237 | all_samples = root["sample_id"][:] 238 | _, samples_selection = parse_samples(samples, all_samples) 239 | 240 | for chunk_data in variant_chunk_iter( 241 | root, 242 | fields=fields, 243 | regions=regions, 244 | targets=targets, 245 | include=include, 246 | exclude=exclude, 247 | samples_selection=samples_selection, 248 | ): 249 | # get first field in chunk_data to find number of variants 250 | field = next(iter(chunk_data.values())) 251 | num_variants = len(field) 252 | for i in range(num_variants): 253 | yield {name: chunk_data[name][i] for name in chunk_data.keys()} 254 | -------------------------------------------------------------------------------- /vcztools/cli.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | import sys 4 | from functools import wraps 5 | 6 | import click 7 | 8 | from . import plink, provenance, vcf_writer 9 | from . import query as query_module 10 | from . import stats as stats_module 11 | 12 | 13 | @contextlib.contextmanager 14 | def handle_broken_pipe(output): 15 | """ 16 | Handle sigpipe following official advice: 17 | https://docs.python.org/3/library/signal.html#note-on-sigpipe 18 | """ 19 | try: 20 | yield 21 | # flush output here to force SIGPIPE to be triggered 22 | # while inside this try block. 23 | output.flush() 24 | except BrokenPipeError: 25 | # Python flushes standard streams on exit; redirect remaining output 26 | # to devnull to avoid another BrokenPipeError at shutdown 27 | devnull = os.open(os.devnull, os.O_WRONLY) 28 | os.dup2(devnull, sys.stdout.fileno()) 29 | sys.exit(1) # Python exits with error code 1 on EPIPE 30 | 31 | 32 | def handle_exception(func): 33 | """ 34 | Handle known application exceptions (ValueError) by converting to 35 | a ClickException, so the message is written to stderr and a non-zero exit 36 | code is set. 
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ValueError as e:
            # Convert to ClickException: message goes to stderr, exit code 1.
            raise click.ClickException(e) from e

    return wrapper


# Reusable click option decorators shared by several subcommands.
include = click.option(
    "-i", "--include", type=str, help="Filter expression to include variant sites."
)
exclude = click.option(
    "-e", "--exclude", type=str, help="Filter expression to exclude variant sites."
)
force_samples = click.option(
    "--force-samples", is_flag=True, help="Only warn about unknown sample subsets."
)
output = click.option(
    "-o",
    "--output",
    type=click.File("w"),
    default="-",
    help="File path to write output to (defaults to stdout '-').",
)
regions = click.option(
    "-r",
    "--regions",
    type=str,
    default=None,
    help="Regions to include.",
)
samples = click.option(
    "-s",
    "--samples",
    type=str,
    default=None,
    help="Samples to include.",
)
targets = click.option(
    "-t",
    "--targets",
    type=str,
    default=None,
    help="Target regions to include.",
)
version = click.version_option(version=f"{provenance.__version__}")


class NaturalOrderGroup(click.Group):
    """
    List commands in the order they are provided in the help text.
    """

    def list_commands(self, ctx):
        return self.commands.keys()


@click.command
@click.argument("path", type=click.Path())
@click.option(
    "-n",
    "--nrecords",
    is_flag=True,
    help="Print the number of records (variants).",
)
@click.option(
    "-s",
    "--stats",
    is_flag=True,
    help="Print per contig stats.",
)
@handle_exception
def index(path, nrecords, stats):
    """
    Query the number of records in a VCZ dataset. This subcommand only
    implements the --nrecords and --stats options and does not build any
    indexes.
    """
    if nrecords and stats:
        raise click.UsageError("Expected only one of --stats or --nrecords options")
    if nrecords:
        stats_module.nrecords(path, sys.stdout)
    elif stats:
        stats_module.stats(path, sys.stdout)
    else:
        # bcftools index builds indexes by default; vcztools does not.
        raise click.UsageError("Building region indexes is not supported")


@click.command
@click.argument("path", type=click.Path())
@output
@click.option(
    "-l",
    "--list-samples",
    is_flag=True,
    help="List the sample IDs and exit.",
)
@click.option(
    "-f",
    "--format",
    type=str,
    help="The format of the output.",
    default=None,
)
@regions
@force_samples
@samples
@targets
@include
@exclude
@handle_exception
def query(
    path,
    output,
    list_samples,
    format,
    regions,
    targets,
    force_samples,
    samples,
    include,
    exclude,
):
    """
    Transform VCZ into user-defined formats with efficient subsetting and
    filtering. Intended as a drop-in replacement for bcftools query, where we
    replace the VCF file path with a VCZ dataset URL.
168 | 169 | This is an early version and not feature complete: if you are missing a 170 | particular piece of functionality please open an issue at 171 | https://github.com/sgkit-dev/vcztools/issues 172 | """ 173 | if list_samples: 174 | # bcftools query -l ignores the --output option and always writes to stdout 175 | output = sys.stdout 176 | with handle_broken_pipe(output): 177 | query_module.list_samples(path, output) 178 | return 179 | 180 | if format is None: 181 | raise click.UsageError("Missing option -f / --format") 182 | with handle_broken_pipe(output): 183 | query_module.write_query( 184 | path, 185 | output, 186 | query_format=format, 187 | regions=regions, 188 | targets=targets, 189 | samples=samples, 190 | force_samples=force_samples, 191 | include=include, 192 | exclude=exclude, 193 | ) 194 | 195 | 196 | @click.command 197 | @click.argument("path", type=click.Path()) 198 | @output 199 | @click.option( 200 | "-h", 201 | "--header-only", 202 | is_flag=True, 203 | help="Output the VCF header only.", 204 | ) 205 | @click.option( 206 | "-H", 207 | "--no-header", 208 | is_flag=True, 209 | help="Suppress the header in VCF output.", 210 | ) 211 | @click.option( 212 | "--no-version", 213 | is_flag=True, 214 | help="Do not append version and command line information to the output VCF header.", 215 | ) 216 | @regions 217 | @force_samples 218 | @click.option( 219 | "-I", 220 | "--no-update", 221 | is_flag=True, 222 | help="Do not recalculate INFO fields for the sample subset.", 223 | ) 224 | @samples 225 | @click.option( 226 | "-S", 227 | "--samples-file", 228 | type=str, 229 | default=None, 230 | help="File of sample names to include.", 231 | ) 232 | @click.option( 233 | "-G", 234 | "--drop-genotypes", 235 | is_flag=True, 236 | help="Drop genotypes.", 237 | ) 238 | @targets 239 | @include 240 | @exclude 241 | @handle_exception 242 | def view( 243 | path, 244 | output, 245 | header_only, 246 | no_header, 247 | no_version, 248 | regions, 249 | targets, 250 | 
force_samples, 251 | no_update, 252 | samples, 253 | samples_file, 254 | drop_genotypes, 255 | include, 256 | exclude, 257 | ): 258 | """ 259 | Convert VCZ dataset to VCF with efficient subsetting and filtering. 260 | Intended as a drop-in replacement for bcftools view, where 261 | we replace the VCF file path with a VCZ dataset URL. 262 | 263 | This is an early version and not feature complete: if you are missing a 264 | particular piece of functionality please open an issue at 265 | https://github.com/sgkit-dev/vcztools/issues 266 | """ 267 | suffix = output.name.split(".")[-1] 268 | # Exclude suffixes which require bgzipped or BCF output: 269 | # https://github.com/samtools/htslib/blob/329e7943b7ba3f0af15b0eaa00a367a1ac15bd83/vcf.c#L3815 270 | if suffix in ["gz", "bcf", "bgz"]: 271 | raise ValueError( 272 | f"Only uncompressed VCF output supported, suffix .{suffix} not allowed" 273 | ) 274 | 275 | if samples_file: 276 | if samples is not None: 277 | raise ValueError("vcztools does not support combining -s and -S") 278 | 279 | samples = "" 280 | exclude_samples_file = samples_file.startswith("^") 281 | samples_file = samples_file.lstrip("^") 282 | 283 | with open(samples_file) as file: 284 | if exclude_samples_file: 285 | samples = "^" + samples 286 | samples += ",".join(line.strip() for line in file.readlines()) 287 | 288 | with handle_broken_pipe(output): 289 | vcf_writer.write_vcf( 290 | path, 291 | output, 292 | header_only=header_only, 293 | no_header=no_header, 294 | no_version=no_version, 295 | regions=regions, 296 | targets=targets, 297 | no_update=no_update, 298 | samples=samples, 299 | force_samples=force_samples, 300 | drop_genotypes=drop_genotypes, 301 | include=include, 302 | exclude=exclude, 303 | ) 304 | 305 | 306 | @click.command 307 | @click.argument("path", type=click.Path()) 308 | @include 309 | @exclude 310 | @click.option("--out", default="plink") 311 | def view_plink1(path, include, exclude, out): 312 | """ 313 | Generate a plink1 binary 
fileset compatible with plink1.9 --vcf. 314 | This command is equivalent to running ``vcztools view [filtering options] 315 | -o intermediate.vcf && plink 1.9 --vcf intermediate.vcf [plink options]`` 316 | without generating the intermediate VCF. 317 | """ 318 | plink.write_plink(path, out, include=include, exclude=exclude) 319 | 320 | 321 | @version 322 | @click.group(cls=NaturalOrderGroup, name="vcztools") 323 | def vcztools_main(): 324 | pass 325 | 326 | 327 | vcztools_main.add_command(index) 328 | vcztools_main.add_command(query) 329 | vcztools_main.add_command(view) 330 | # vcztools_main.add_command(view_plink1) 331 | -------------------------------------------------------------------------------- /tests/test_tskit_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for data originating from tskit format for compatibility 3 | with various outputs. 4 | """ 5 | 6 | import bio2zarr.plink as p2z 7 | import bio2zarr.tskit as ts2z 8 | import bio2zarr.vcf as v2z 9 | import msprime 10 | import numpy as np 11 | import numpy.testing as nt 12 | import pytest 13 | import sgkit as sg 14 | import tskit 15 | import xarray.testing as xt 16 | 17 | from vcztools.plink import write_plink 18 | from vcztools.vcf_writer import write_vcf 19 | 20 | 21 | def add_mutations(ts): 22 | # Add some mutation to the tree sequence. This guarantees that 23 | # we have variation at all sites > 0. 
24 | tables = ts.dump_tables() 25 | samples = ts.samples() 26 | states = "ACGT" 27 | for j in range(1, int(ts.sequence_length) - 1): 28 | site = tables.sites.add_row(j, ancestral_state=states[j % 4]) 29 | tables.mutations.add_row( 30 | site=site, 31 | derived_state=states[(j + 1) % 4], 32 | node=samples[j % ts.num_samples], 33 | ) 34 | return tables.tree_sequence() 35 | 36 | 37 | @pytest.fixture() 38 | def fx_diploid_msprime_sim(tmp_path): 39 | seed = 1234 40 | ts = msprime.sim_ancestry(5, sequence_length=10_000, random_seed=seed) 41 | ts = msprime.sim_mutations(ts, rate=1e-4, random_seed=seed) 42 | assert ts.num_mutations > 0 43 | assert ts.num_mutations == ts.num_sites # make sure we have biallelic sites 44 | zarr_path = tmp_path / "sim.vcz" 45 | ts2z.convert(ts, zarr_path) 46 | return zarr_path 47 | 48 | 49 | @pytest.fixture() 50 | def fx_haploid_missing_data(tmp_path): 51 | # 2.00┊ 4 ┊ 52 | # ┊ ┏━┻┓ ┊ 53 | # 1.00┊ ┃ 3 ┊ 54 | # ┊ ┃ ┏┻┓ ┊ 55 | # 0.00┊ 0 1 2 5 ┊ 56 | # 0 10 57 | # | | 58 | # pos 2 9 59 | # anc A T 60 | ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence 61 | tables = ts.dump_tables() 62 | tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) 63 | tables.sites.add_row(2, ancestral_state="A") 64 | tables.sites.add_row(9, ancestral_state="T") 65 | tables.mutations.add_row(site=0, node=0, derived_state="G") 66 | tables.mutations.add_row(site=1, node=3, derived_state="C") 67 | zarr_path = tmp_path / "sim.vcz" 68 | ts2z.convert(tables.tree_sequence(), zarr_path, isolated_as_missing=True) 69 | return zarr_path 70 | 71 | 72 | def test_haploid_missing_data(fx_haploid_missing_data): 73 | ds = sg.load_dataset(fx_haploid_missing_data) 74 | nt.assert_array_equal( 75 | ds.call_genotype.values, 76 | [ 77 | [[1], [0], [0], [-1]], 78 | [[0], [1], [1], [-1]], 79 | ], 80 | ) 81 | 82 | 83 | @pytest.fixture() 84 | def fx_diploid_missing_data(tmp_path): 85 | # 2.00┊ 6 ┊ 86 | # ┊ ┏━┻━┓ ┊ 87 | # 1.00┊ 4 5 ┊ 88 | # ┊ ┏┻┓ ┏┻┓ ┊ 89 | # 0.00┊ 0 1 2 3 7 8┊ 90 | # 0 
10 91 | # | | 92 | # pos 2 9 93 | # anc A T 94 | ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence 95 | tables = ts.dump_tables() 96 | tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) 97 | u = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) 98 | assert u == 8 99 | tables.sites.add_row(2, ancestral_state="A") 100 | tables.sites.add_row(9, ancestral_state="T") 101 | tables.mutations.add_row(site=0, node=0, derived_state="G") 102 | tables.mutations.add_row(site=1, node=5, derived_state="C") 103 | zarr_path = tmp_path / "sim.vcz" 104 | ts = tables.tree_sequence() 105 | model_map = ts.map_to_vcf_model(ploidy=2) 106 | ts2z.convert(ts, zarr_path, model_mapping=model_map, isolated_as_missing=True) 107 | return zarr_path 108 | 109 | 110 | def test_diploid_missing_data(fx_diploid_missing_data): 111 | ds = sg.load_dataset(fx_diploid_missing_data) 112 | nt.assert_array_equal( 113 | ds.call_genotype.values, 114 | [ 115 | [[1, 0], [0, 0], [-1, -1]], 116 | [[0, 0], [1, 1], [-1, -1]], 117 | ], 118 | ) 119 | 120 | 121 | @pytest.fixture() 122 | def fx_diploid_multi_allelic(tmp_path): 123 | # 2.00┊ 6 ┊ 124 | # ┊ ┏━┻━┓ ┊ 125 | # 1.00┊ 4 5 ┊ 126 | # ┊ ┏┻┓ ┏┻┓ ┊ 127 | # 0.00┊ 0 1 2 3 ┊ 128 | # 0 10 129 | # | | 130 | # pos 2 9 131 | # anc A T 132 | ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence 133 | tables = ts.dump_tables() 134 | tables.sites.add_row(2, ancestral_state="A") 135 | tables.sites.add_row(9, ancestral_state="T") 136 | tables.mutations.add_row(site=0, node=0, derived_state="G") 137 | tables.mutations.add_row(site=1, node=1, derived_state="G") 138 | tables.mutations.add_row(site=1, node=5, derived_state="C") 139 | zarr_path = tmp_path / "sim.vcz" 140 | ts = tables.tree_sequence() 141 | model_map = ts.map_to_vcf_model(ploidy=2) 142 | ts2z.convert(ts, zarr_path, model_mapping=model_map) 143 | return zarr_path 144 | 145 | 146 | def test_diploid_multi_allelic(fx_diploid_multi_allelic): 147 | ds = sg.load_dataset(fx_diploid_multi_allelic) 
148 | # NOTE this example is constructed so that the rarest allele is in the middle 149 | # of the alleles array 150 | nt.assert_array_equal(ds.variant_allele.values, [["A", "G", ""], ["T", "G", "C"]]) 151 | nt.assert_array_equal( 152 | ds.call_genotype.values, 153 | [ 154 | [[1, 0], [0, 0]], 155 | [[0, 1], [2, 2]], 156 | ], 157 | ) 158 | 159 | 160 | @pytest.fixture() 161 | def fx_haploid_msprime_sim(tmp_path): 162 | seed = 12345 163 | ts = msprime.sim_ancestry(5, ploidy=1, sequence_length=100, random_seed=seed) 164 | ts = msprime.sim_mutations(ts, rate=0.5, random_seed=seed) 165 | assert ts.num_mutations > 0 166 | zarr_path = tmp_path / "sim.vcz" 167 | ts2z.convert(ts, zarr_path) 168 | return zarr_path 169 | 170 | 171 | def simple_ts_tables(): 172 | tables = tskit.TableCollection(sequence_length=100) 173 | for _ in range(4): 174 | ind = -1 175 | ind = tables.individuals.add_row() 176 | tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=ind) 177 | tables.nodes.add_row(flags=0, time=1) # MRCA for 0,1 178 | tables.nodes.add_row(flags=0, time=1) # MRCA for 2,3 179 | tables.edges.add_row(left=0, right=100, parent=4, child=0) 180 | tables.edges.add_row(left=0, right=100, parent=4, child=1) 181 | tables.edges.add_row(left=0, right=100, parent=5, child=2) 182 | tables.edges.add_row(left=0, right=100, parent=5, child=3) 183 | site_id = tables.sites.add_row(position=10, ancestral_state="A") 184 | tables.mutations.add_row(site=site_id, node=4, derived_state="TTTT") 185 | site_id = tables.sites.add_row(position=20, ancestral_state="CCC") 186 | tables.mutations.add_row(site=site_id, node=5, derived_state="G") 187 | site_id = tables.sites.add_row(position=30, ancestral_state="G") 188 | tables.mutations.add_row(site=site_id, node=0, derived_state="AA") 189 | 190 | tables.sort() 191 | return tables 192 | 193 | 194 | @pytest.fixture() 195 | def fx_simple_ts(tmp_path): 196 | ts = simple_ts_tables().tree_sequence() 197 | zarr_path = tmp_path / "sim.vcz" 198 | 
ts2z.convert(ts, zarr_path) 199 | return zarr_path 200 | 201 | 202 | # TODO add other fixtures here like stuff with odd mixtures of ploidy, 203 | # and zero variants (need to address 204 | # https://github.com/sgkit-dev/bio2zarr/issues/342 before zero variants 205 | # handled) 206 | 207 | 208 | class TestVcfRoundTrip: 209 | def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): 210 | vcf_path = tmp_path / "out.vcf" 211 | write_vcf(tskit_vcz, vcf_path) 212 | rt_vcz_path = tmp_path / "rt.vcz" 213 | v2z.convert([vcf_path], rt_vcz_path) 214 | ds1 = sg.load_dataset(tskit_vcz) 215 | ds2 = sg.load_dataset(rt_vcz_path) 216 | drop_fields = [ 217 | "variant_id", 218 | "variant_id_mask", 219 | "filter_id", 220 | "filter_description", 221 | "variant_filter", 222 | "variant_quality", 223 | ] 224 | xt.assert_equal(ds1, ds2.drop_vars(drop_fields)) 225 | num_variants = ds2.sizes["variants"] 226 | assert np.all(np.isnan(ds2["variant_quality"].values)) 227 | nt.assert_array_equal( 228 | ds2["variant_filter"], np.ones((num_variants, 1), dtype=bool) 229 | ) 230 | assert list(ds2["filter_id"].values) == ["PASS"] 231 | 232 | def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim): 233 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_msprime_sim) 234 | 235 | def test_haploid_msprime_sim(self, tmp_path, fx_haploid_msprime_sim): 236 | self.assert_bio2zarr_rt(tmp_path, fx_haploid_msprime_sim) 237 | 238 | def test_simple_ts(self, tmp_path, fx_simple_ts): 239 | self.assert_bio2zarr_rt(tmp_path, fx_simple_ts) 240 | 241 | def test_haploid_missing_data(self, tmp_path, fx_haploid_missing_data): 242 | self.assert_bio2zarr_rt(tmp_path, fx_haploid_missing_data) 243 | 244 | def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data): 245 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data) 246 | 247 | def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic): 248 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic) 249 | 250 | 251 | def 
recode_plink_hets(G): 252 | """ 253 | Returns a copy of the specified genotype matrix in which hets are all 254 | in the canonical unphased plink orientation, [0, 1] 255 | """ 256 | G = G.copy() 257 | for j in range(G.shape[0]): 258 | for k in range(G.shape[1]): 259 | if G[j, k, 0] == 1 and G[j, k, 1] == 0: 260 | G[j, k, 0] = 0 261 | G[j, k, 1] = 1 262 | return G 263 | 264 | 265 | class TestPlinkRoundTrip: 266 | def assert_bio2zarr_rt(self, tmp_path, tskit_vcz): 267 | # import pathlib 268 | # tmp_path = pathlib.Path("tmp/plink") 269 | plink_path = tmp_path / "plink" 270 | write_plink(tskit_vcz, plink_path) 271 | rt_vcz_path = tmp_path / "rt.vcz" 272 | p2z.convert(plink_path, rt_vcz_path) 273 | ds1 = sg.load_dataset(tskit_vcz) 274 | ds2 = sg.load_dataset(rt_vcz_path) 275 | 276 | assert np.all(ds1["call_genotype_phased"]) 277 | assert np.all(~ds2["call_genotype_phased"]) 278 | 279 | nt.assert_array_equal( 280 | recode_plink_hets(ds1["call_genotype"].values), ds2["call_genotype"] 281 | ) 282 | 283 | drop_fields = [ 284 | "variant_id", 285 | "variant_id_mask", 286 | "call_genotype", 287 | "call_genotype_phased", 288 | ] 289 | xt.assert_equal( 290 | ds1.drop_vars(["call_genotype", "call_genotype_phased"]), 291 | ds2.drop_vars(drop_fields), 292 | ) 293 | 294 | def test_diploid_msprime_sim(self, tmp_path, fx_diploid_msprime_sim): 295 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_msprime_sim) 296 | 297 | def test_diploid_missing_data(self, tmp_path, fx_diploid_missing_data): 298 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_missing_data) 299 | 300 | def test_diploid_multi_allelic(self, tmp_path, fx_diploid_multi_allelic): 301 | with pytest.raises(ValueError, match="Only biallelic VCFs supported"): 302 | self.assert_bio2zarr_rt(tmp_path, fx_diploid_multi_allelic) 303 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 
2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /vcztools/query.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import itertools 3 | import math 4 | from collections.abc import Callable 5 | 6 | import numpy as np 7 | import pyparsing as pp 8 | import zarr 9 | 10 | from vcztools import constants, retrieval 11 | from vcztools.samples import parse_samples 12 | from vcztools.utils import vcf_name_to_vcz_names 13 | 14 | 15 | def list_samples(vcz_path, output): 16 | root = zarr.open(vcz_path, mode="r") 17 | 18 | sample_ids = root["sample_id"][:] 19 | print("\n".join(sample_ids), file=output) 20 | 21 | 22 | class QueryFormatParser: 23 | def __init__(self): 24 | info_tag_pattern = pp.Combine( 25 | pp.Literal("%INFO/") + pp.Word(pp.srange("[A-Z]")) 26 | ) 27 | tag_pattern = info_tag_pattern | pp.Combine( 28 | pp.Literal("%") + pp.Regex(r"[A-Z]+\d?") 29 | ) 30 | subfield_pattern = pp.Group( 31 | tag_pattern 32 | + pp.Literal("{").suppress() 33 | + pp.common.integer 34 | + pp.Literal("}").suppress() 35 | ).set_results_name("subfield") 36 | 
newline_pattern = pp.Literal("\\n").set_parse_action(pp.replace_with("\n")) 37 | tab_pattern = pp.Literal("\\t").set_parse_action(pp.replace_with("\t")) 38 | format_pattern = pp.Forward() 39 | sample_loop_pattern = pp.Group( 40 | pp.Literal("[").suppress() + format_pattern + pp.Literal("]").suppress() 41 | ).set_results_name("sample loop") 42 | format_pattern <<= pp.ZeroOrMore( 43 | sample_loop_pattern 44 | | subfield_pattern 45 | | tag_pattern 46 | | newline_pattern 47 | | tab_pattern 48 | | pp.White() 49 | | pp.Word(pp.printables, exclude_chars=r"\{}[]%") 50 | ).leave_whitespace() 51 | 52 | self._parser = functools.partial(format_pattern.parse_string, parse_all=True) 53 | 54 | def __call__(self, *args, **kwargs): 55 | assert len(args) == 1 56 | assert not kwargs 57 | 58 | return self._parser(args[0]) 59 | 60 | 61 | class QueryFormatGenerator: 62 | def __init__(self, query_format, sample_ids, contigs, filters): 63 | self.sample_ids = sample_ids 64 | self.sample_count = len(self.sample_ids) 65 | self.contig_ids = contigs 66 | self.filter_ids = filters 67 | if isinstance(query_format, str): 68 | parser = QueryFormatParser() 69 | parse_results = parser(query_format) 70 | else: 71 | assert isinstance(query_format, pp.ParseResults) 72 | parse_results = query_format 73 | 74 | self._generator = self._compose_generator(parse_results) 75 | 76 | def __call__(self, *args, **kwargs): 77 | assert len(args) == 1 78 | assert not kwargs 79 | 80 | yield from self._generator(args[0]) 81 | 82 | def _compose_gt_generator(self) -> Callable: 83 | def generate(chunk_data): 84 | gt_array = chunk_data["call_genotype"] 85 | 86 | if "call_genotype_phased" in chunk_data: 87 | phase_array = chunk_data["call_genotype_phased"] 88 | assert gt_array.shape[:2] == phase_array.shape 89 | 90 | for gt_row, phase in zip(gt_array, phase_array): 91 | 92 | def stringify(gt_and_phase: tuple): 93 | gt, phase = gt_and_phase 94 | gt = [ 95 | str(allele) if allele != constants.INT_MISSING else "." 
96 | for allele in gt 97 | if allele != constants.INT_FILL 98 | ] 99 | separator = "|" if phase else "/" 100 | return separator.join(gt) 101 | 102 | gt_row = gt_row.tolist() 103 | yield map(stringify, zip(gt_row, phase)) 104 | else: 105 | # TODO: Support datasets without the phasing data 106 | raise NotImplementedError 107 | 108 | return generate 109 | 110 | def _compose_sample_ids_generator(self) -> Callable: 111 | def generate(chunk_data): 112 | variant_count = chunk_data["variant_position"].shape[0] 113 | yield from itertools.repeat(self.sample_ids, variant_count) 114 | 115 | return generate 116 | 117 | def _compose_tag_generator( 118 | self, tag: str, *, subfield=False, sample_loop=False 119 | ) -> Callable: 120 | assert tag.startswith("%") 121 | tag = tag[1:] 122 | 123 | if tag == "GT": 124 | if not sample_loop: 125 | raise ValueError( 126 | "no such tag defined: INFO/GT. " 127 | 'FORMAT fields must be enclosed in square brackets, e.g. "[ %GT]"' 128 | ) 129 | return self._compose_gt_generator() 130 | 131 | if tag == "SAMPLE": 132 | if not sample_loop: 133 | raise ValueError("no such tag defined: INFO/SAMPLE") 134 | return self._compose_sample_ids_generator() 135 | 136 | def generate(chunk_data): 137 | vcz_names = set(chunk_data.keys()) 138 | vcz_name_matches = vcf_name_to_vcz_names(vcz_names, tag) 139 | if len(vcz_name_matches) == 0: 140 | raise ValueError(f"No mapping found for '{tag}'") 141 | if sample_loop: 142 | # FORMAT fields have precedence over INFO fields 143 | vcz_name = vcz_name_matches[0] 144 | else: 145 | # FORMAT fields are not allowed 146 | vcz_name = vcz_name_matches[-1] 147 | if vcz_name.startswith("call_"): 148 | raise ValueError( 149 | f"no such tag defined: INFO/{tag}. " 150 | "FORMAT fields must be enclosed in square brackets, " 151 | f'e.g. 
"[ %{tag}]"' 152 | ) 153 | array = chunk_data[vcz_name] 154 | for row in array: 155 | is_missing = np.any(row == -1) 156 | sep = "," 157 | 158 | if tag == "CHROM": 159 | row = self.contig_ids[row] 160 | if tag == "REF": 161 | row = row[0] 162 | if tag == "ALT": 163 | row = [allele for allele in row[1:] if allele] or "." 164 | if tag == "FILTER": 165 | if np.any(row): 166 | row = self.filter_ids[row] 167 | else: 168 | row = "." 169 | sep = ";" 170 | if tag == "QUAL": 171 | if math.isnan(row): 172 | row = "." 173 | else: 174 | row = f"{row:g}" 175 | if ( 176 | not subfield 177 | and not sample_loop 178 | and (isinstance(row, np.ndarray) or isinstance(row, list)) 179 | ): 180 | row = sep.join(map(str, row)) 181 | 182 | if sample_loop: 183 | if isinstance(row, np.ndarray): 184 | row = row.tolist() 185 | row = [ 186 | (str(element) if element != constants.INT_MISSING else ".") 187 | for element in row 188 | if element != constants.INT_FILL 189 | ] 190 | yield row 191 | else: 192 | yield itertools.repeat(str(row), self.sample_count) 193 | else: 194 | yield row if not is_missing else "." 195 | 196 | return generate 197 | 198 | def _compose_subfield_generator(self, parse_results: pp.ParseResults) -> Callable: 199 | assert len(parse_results) == 2 200 | 201 | tag, subfield_index = parse_results 202 | tag_generator = self._compose_tag_generator(tag, subfield=True) 203 | 204 | def generate(chunk_data): 205 | for tag in tag_generator(chunk_data): 206 | if isinstance(tag, str): 207 | assert tag == "." 208 | yield "." 209 | else: 210 | if subfield_index < len(tag): 211 | yield tag[subfield_index] 212 | else: 213 | yield "." 
214 | 
215 |         return generate
216 | 
217 |     def _compose_sample_loop_generator(
218 |         self, parse_results: pp.ParseResults
219 |     ) -> Callable:
          # Builds the generator for a "[...]" sample-loop section of the format
          # string: the enclosed elements are expanded once per sample and the
          # per-sample strings are concatenated into one string per variant.
220 |         generators = map(
221 |             functools.partial(self._compose_element_generator, sample_loop=True),
222 |             parse_results,
223 |         )
224 | 
225 |         def generate(chunk_data):
                  # Each element generator yields, per variant, an iterable of
                  # per-sample strings. zip(*) groups them variant-wise, the
                  # inner zip(*) regroups sample-wise, and "".join flattens each
                  # variant's (sample x element) strings into one output string.
226 |             iterables = (generator(chunk_data) for generator in generators)
227 |             zipped = zip(*iterables)
228 |             zipped_zipped = (zip(*element) for element in zipped)
229 |             if "call_mask" not in chunk_data:
230 |                 flattened_zipped_zipped = (
231 |                     (
232 |                         subsubelement
233 |                         for subelement in element  # sample-wise
234 |                         for subsubelement in subelement
235 |                     )
236 |                     for element in zipped_zipped  # variant-wise
237 |                 )
238 |             else:
                      # call_mask[i, j] selects which samples are emitted for
                      # variant i (e.g. after per-sample filtering).
239 |                 call_mask = chunk_data["call_mask"]
240 |                 flattened_zipped_zipped = (
241 |                     (
242 |                         subsubelement
243 |                         for j, subelement in enumerate(element)  # sample-wise
244 |                         if call_mask[i, j]
245 |                         for subsubelement in subelement
246 |                     )
247 |                     for i, element in enumerate(zipped_zipped)  # variant-wise
248 |                 )
249 |             yield from map("".join, flattened_zipped_zipped)
250 | 
251 |         return generate
252 | 
253 |     def _compose_element_generator(
254 |         self, element: str | pp.ParseResults, *, sample_loop=False
255 |     ) -> Callable:
          # Dispatch on the parsed element kind: named sub-results map to the
          # subfield/sample-loop composers; "%"-prefixed strings are tags; any
          # other string is literal text repeated once per variant (or once per
          # sample inside a sample loop).
256 |         if isinstance(element, pp.ParseResults):
257 |             if element.get_name() == "subfield":
258 |                 return self._compose_subfield_generator(element)
259 |             elif element.get_name() == "sample loop":
260 |                 return self._compose_sample_loop_generator(element)
261 | 
262 |         assert isinstance(element, str)
263 | 
264 |         if element.startswith("%"):
265 |             return self._compose_tag_generator(element, sample_loop=sample_loop)
266 |         else:
267 | 
268 |             def generate(chunk_data):
269 |                 nonlocal element
270 |                 variant_count = chunk_data["variant_position"].shape[0]
271 |                 if sample_loop:
272 |                     for _ in range(variant_count):
273 |                         yield itertools.repeat(element, self.sample_count)
274 |                 else:
275 |                     yield from itertools.repeat(element, variant_count)
276 | 
277 | 
return generate 278 | 279 | def _compose_generator( 280 | self, 281 | parse_results, 282 | ) -> Callable: 283 | generators = ( 284 | self._compose_element_generator(element) for element in parse_results 285 | ) 286 | 287 | def generate(chunk_data) -> str: 288 | iterables = (generator(chunk_data) for generator in generators) 289 | for results in zip(*iterables): 290 | results = map(str, results) 291 | yield "".join(results) 292 | 293 | return generate 294 | 295 | 296 | def write_query( 297 | vcz, 298 | output, 299 | *, 300 | query_format: str, 301 | regions=None, 302 | targets=None, 303 | samples=None, 304 | force_samples: bool = False, 305 | include: str | None = None, 306 | exclude: str | None = None, 307 | ): 308 | root = zarr.open(vcz, mode="r") 309 | 310 | all_samples = root["sample_id"][:] 311 | sample_ids, samples_selection = parse_samples( 312 | samples, all_samples, force_samples=force_samples 313 | ) 314 | contigs = root["contig_id"][:] 315 | filters = root["filter_id"][:] 316 | 317 | generator = QueryFormatGenerator(query_format, sample_ids, contigs, filters) 318 | 319 | for chunk_data in retrieval.variant_chunk_iter( 320 | root, 321 | regions=regions, 322 | targets=targets, 323 | include=include, 324 | exclude=exclude, 325 | samples_selection=samples_selection, 326 | ): 327 | for result in generator(chunk_data): 328 | print(result, sep="", end="", file=output) 329 | -------------------------------------------------------------------------------- /tests/test_bcftools_validation.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import subprocess 3 | 4 | import click.testing as ct 5 | import pytest 6 | 7 | import vcztools.cli as cli 8 | 9 | from .utils import assert_vcfs_close, vcz_path_cache 10 | 11 | 12 | def run_bcftools(args: str, expect_error=False) -> tuple[str, str]: 13 | """ 14 | Run bcftools (which must be on the PATH) and return stdout and stderr 15 | as a pair of strings. 
16 | """ 17 | completed = subprocess.run( 18 | f"bcftools {args}", capture_output=True, check=False, shell=True 19 | ) 20 | if expect_error: 21 | assert completed.returncode != 0 22 | else: 23 | assert completed.returncode == 0 24 | return completed.stdout.decode("utf-8"), completed.stderr.decode("utf-8") 25 | 26 | 27 | def run_vcztools(args: str, expect_error=False) -> tuple[str, str]: 28 | """Run run_vcztools and return stdout and stderr as a pair of strings.""" 29 | runner = ct.CliRunner() 30 | result = runner.invoke( 31 | cli.vcztools_main, 32 | args, 33 | catch_exceptions=False, 34 | ) 35 | if expect_error: 36 | assert result.exit_code != 0 37 | else: 38 | assert result.exit_code == 0 39 | return result.stdout, result.stderr 40 | 41 | 42 | # fmt: off 43 | @pytest.mark.parametrize( 44 | ("args", "vcf_file"), 45 | [ 46 | ("view --no-version", "sample.vcf.gz"), 47 | ("view --no-version", "chr22.vcf.gz"), 48 | ("view --no-version", "msprime_diploid.vcf.gz"), 49 | ("view --no-version -i 'CHROM == \"20\"'", "sample.vcf.gz"), 50 | ("view --no-version -i 'CHROM != \"Z\"'", "sample.vcf.gz"), 51 | ("view --no-version -i 'ID == \"rs6054257\"'", "sample.vcf.gz"), 52 | ("view --no-version -i 'DB=0'", "sample.vcf.gz"), 53 | ("view --no-version -i 'DB=1'", "sample.vcf.gz"), 54 | ("view --no-version -i 'FILTER=\"PASS\"'", "sample.vcf.gz"), 55 | ("view --no-version -i 'INFO/DP > 10'", "sample.vcf.gz"), 56 | ("view --no-version -i 'FMT/DP >= 5'", "sample.vcf.gz"), 57 | ("view --no-version -i 'FMT/DP >= 5 && FMT/GQ > 10'", "sample.vcf.gz"), 58 | ("view --no-version -i 'FMT/DP >= 5 & FMT/GQ>10'", "sample.vcf.gz"), 59 | ("view --no-version -i 'FMT/DP>5 && FMT/GQ<45'", "sample.vcf.gz"), 60 | ("view --no-version -i 'FMT/DP>5 & FMT/GQ<45'", "sample.vcf.gz"), 61 | ( 62 | "view --no-version -i '(QUAL > 10 || FMT/GQ>10) && POS > 100000'", 63 | "sample.vcf.gz" 64 | ), 65 | ( 66 | "view --no-version -i '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'", 67 | "sample.vcf.gz" 68 | ), 69 | ( 70 | 
"view --no-version -e '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'", 71 | "sample.vcf.gz" 72 | ), 73 | ("view --no-version -i 'TYPE=\"ref\"'", "sample.vcf.gz"), 74 | ("view --no-version -i 'TYPE!=\"ref\"'", "sample.vcf.gz"), 75 | ("view --no-version -i 'TYPE=\"snp\"'", "sample.vcf.gz"), 76 | ("view --no-version -i 'TYPE!=\"snp\"'", "sample.vcf.gz"), 77 | # All alleles are SNPs, 14 rows 78 | ("view --no-version -i 'TYPE=\"snp\"'", "1kg_2020_chrM.vcf.gz"), 79 | # Any allele is a SNP, 22 rows 80 | ("view --no-version -i 'TYPE~\"snp\"'", "1kg_2020_chrM.vcf.gz"), 81 | # No allele is a SNP, 1 row 82 | ("view --no-version -i 'TYPE!~\"snp\"'", "1kg_2020_chrM.vcf.gz"), 83 | # Any allele is not a SNP, 9 rows 84 | ("view --no-version -i 'TYPE!=\"snp\"'", "1kg_2020_chrM.vcf.gz"), 85 | ("view --no-version -G", "sample.vcf.gz"), 86 | ( 87 | "view --no-update --no-version --samples-file " 88 | "tests/data/txt/samples.txt", 89 | "sample.vcf.gz"), 90 | ("view -I --no-version -S tests/data/txt/samples.txt", "sample.vcf.gz"), 91 | ("view --no-version -s NA00001", "sample.vcf.gz"), 92 | ("view --no-version -s NA00001,NA00003", "sample.vcf.gz"), 93 | ("view --no-version -s HG00096", "1kg_2020_chrM.vcf.gz"), 94 | ("view --no-version -s tsk_0,tsk_1", "msprime_diploid.vcf.gz"), 95 | ("view --no-version -s tsk_0,tsk_1,tsk_2", "msprime_diploid.vcf.gz"), 96 | ("view --no-version -s ^tsk_0,tsk_1,tsk_2", "msprime_diploid.vcf.gz"), 97 | ("view --no-version -s '' --force-samples", "sample.vcf.gz"), 98 | ("view --no-version -s 'NO_SAMPLE' --force-samples", "sample.vcf.gz"), 99 | ("view --no-version -s 'NO_SAMPLE,NA00001' --force-samples", "sample.vcf.gz"), 100 | ("view --no-version -s ^NA00001", "sample.vcf.gz"), 101 | ("view --no-version -s ^NA00003,NA00002", "sample.vcf.gz"), 102 | ("view --no-version -s ^NA00003,NA00002,NA00003", "sample.vcf.gz"), 103 | ("view --no-version -S ^tests/data/txt/samples.txt", "sample.vcf.gz"), 104 | ( 105 | "view --no-version -r '20:1230236-' -i 'FMT/DP>3' -s 
'NA00002,NA00003'", 106 | "sample.vcf.gz" 107 | ), 108 | ( 109 | "view --no-version -i 'FILTER=\"VQSRTrancheSNP99.80to100.00\"'", 110 | "1kg_2020_chrM.vcf.gz" 111 | ), 112 | ( 113 | "view --no-version -i 'FILTER!=\"VQSRTrancheSNP99.80to100.00\"'", 114 | "1kg_2020_chrM.vcf.gz" 115 | ), 116 | ( 117 | "view --no-version -i 'FILTER~\"VQSRTrancheINDEL99.00to100.00\"'", 118 | "1kg_2020_chrM.vcf.gz" 119 | ), 120 | ("view --no-version -i 'INFO/AC>2'", "chr22.vcf.gz") 121 | ], 122 | # This is necessary when trying to run individual tests, as the arguments above 123 | # make for unworkable command lines 124 | # ids=range(36), 125 | ) 126 | # fmt: on 127 | def test_vcf_output(tmp_path, args, vcf_file): 128 | # print("args:", args) 129 | original = pathlib.Path("tests/data/vcf") / vcf_file 130 | vcz = vcz_path_cache(original) 131 | 132 | bcftools_out, _ = run_bcftools(f"{args} {original}") 133 | bcftools_out_file = tmp_path.joinpath("bcftools_out.vcf") 134 | with open(bcftools_out_file, "w") as f: 135 | f.write(bcftools_out) 136 | 137 | vcztools_out, _ = run_vcztools(f"{args} {vcz}") 138 | vcztools_out_file = tmp_path.joinpath("vcztools_out.vcf") 139 | with open(vcztools_out_file, "w") as f: 140 | f.write(vcztools_out) 141 | 142 | assert_vcfs_close(bcftools_out_file, vcztools_out_file) 143 | 144 | 145 | @pytest.mark.parametrize( 146 | ("args", "vcf_file"), 147 | [("view --no-version", "sample.vcf.gz")], 148 | ) 149 | def test_vcf_output_with_output_option(tmp_path, args, vcf_file): 150 | vcf_path = pathlib.Path("tests/data/vcf") / vcf_file 151 | vcz_path = vcz_path_cache(vcf_path) 152 | 153 | bcftools_out_file = tmp_path.joinpath("bcftools_out.vcf") 154 | vcztools_out_file = tmp_path.joinpath("vcztools_out.vcf") 155 | 156 | bcftools_args = f"{args} -o {bcftools_out_file}" 157 | vcztools_args = f"{args} -o {vcztools_out_file}" 158 | 159 | run_bcftools(f"{bcftools_args} {vcf_path}") 160 | run_vcztools(f"{vcztools_args} {vcz_path}") 161 | 162 | 
assert_vcfs_close(bcftools_out_file, vcztools_out_file) 163 | 164 | 165 | @pytest.mark.parametrize( 166 | ("args", "vcf_name"), 167 | [ 168 | ("index -n", "sample.vcf.gz"), 169 | ("index --nrecords", "1kg_2020_chrM.vcf.gz"), 170 | ("index -s", "sample.vcf.gz"), 171 | ("index --stats", "1kg_2020_chrM.vcf.gz"), 172 | ("query -l", "sample.vcf.gz"), 173 | ("query --list-samples", "1kg_2020_chrM.vcf.gz"), 174 | (r"query -f 'A\n'", "sample.vcf.gz"), 175 | (r"query -f '%CHROM:%POS\n'", "sample.vcf.gz"), 176 | (r"query -f '[%CHROM %POS %GT\n]'", "sample.vcf.gz"), 177 | (r"query -f '%INFO/DP\n'", "sample.vcf.gz"), 178 | (r"query -f '%DP\n'", "sample.vcf.gz"), 179 | (r"query -f '%AC{0}\n'", "sample.vcf.gz"), 180 | (r"query -f '%REF\t%ALT\n'", "sample.vcf.gz"), 181 | (r"query -f '%ALT{1}\n'", "sample.vcf.gz"), 182 | (r"query -f '%ID\n'", "sample.vcf.gz"), 183 | (r"query -f '%QUAL\n'", "sample.vcf.gz"), 184 | (r"query -f '%FILTER\n'", "sample.vcf.gz"), 185 | (r"query --format '%FILTER\n'", "1kg_2020_chrM.vcf.gz"), 186 | (r"query -f '%POS\n' -i 'POS=112'", "sample.vcf.gz"), 187 | (r"query -f '%POS\n' -e 'POS=112'", "sample.vcf.gz"), 188 | (r"query -f '[%CHROM\t]\n'", "sample.vcf.gz"), 189 | (r"query -f '[%CHROM\t]\n' -i 'POS=112'", "sample.vcf.gz"), 190 | (r"query -f '[%CHROM:%POS %SAMPLE %GT\n]'", "sample.vcf.gz"), 191 | (r"query -f '[%SAMPLE %GT %DP\n]'", "sample.vcf.gz"), 192 | ( 193 | r"query -f '[%POS %SAMPLE %GT %DP %GQ\n]' -i 'INFO/DP >= 5'", 194 | "sample.vcf.gz", 195 | ), 196 | ( 197 | r"query -f '[%POS %QUAL\n]' -i'(QUAL > 10 && POS > 100000)'", 198 | "sample.vcf.gz", 199 | ), 200 | # Examples from bcftools query documentation 201 | (r"query -f '%CHROM %POS %REF %ALT{0}\n'", "sample.vcf.gz"), 202 | (r"query -f '%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n'", "sample.vcf.gz"), 203 | (r"query -f 'GQ:[ %GQ] \t GT:[ %GT]\n'", "sample.vcf.gz"), 204 | # POS0 not supported 205 | # (r"query -f '%CHROM\t%POS0\t%END\t%ID\n'", "sample.vcf.gz"), 206 | # Filtering on GT not supported 
207 | # (r"query -f [%CHROM:%POS %SAMPLE %GT\n]' -i'GT=\"alt\"'", "sample.vcf.gz"), 208 | # Indexing not supported in filtering 209 | # (r"query -f '%AC{1}\n' -i 'AC[1]>10' ", "sample.vcf.gz"), 210 | # TODO fill-out more of these when supported for more stuff is available 211 | # in filtering 212 | ("query -f '%CHROM %POS %FILTER\n' -i 'FILTER=\"PASS\"'", "sample.vcf.gz"), 213 | # Per-sample query tests 214 | ( 215 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3'", 216 | "sample.vcf.gz", 217 | ), 218 | ( 219 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/GQ>30'", 220 | "sample.vcf.gz", 221 | ), 222 | ( 223 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3 & FMT/GQ>30'", 224 | "sample.vcf.gz", 225 | ), 226 | ( 227 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3 && FMT/GQ>30'", # noqa: E501 228 | "sample.vcf.gz", 229 | ), 230 | ( 231 | r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -r '20:1230236-' -i 'FMT/DP>3' -s 'NA00002,NA00003'", # noqa: E501 232 | "sample.vcf.gz", 233 | ), 234 | ], 235 | ) 236 | def test_output(tmp_path, args, vcf_name): 237 | vcf_path = pathlib.Path("tests/data/vcf") / vcf_name 238 | vcz_path = vcz_path_cache(vcf_path) 239 | 240 | bcftools_output, _ = run_bcftools(f"{args} {vcf_path}") 241 | vcztools_output, _ = run_vcztools(f"{args} {vcz_path}") 242 | 243 | assert vcztools_output == bcftools_output 244 | 245 | 246 | @pytest.mark.parametrize( 247 | "expr", 248 | [ 249 | # Check arithmetic evaluation in filter queries. All these should 250 | # result to POS=112, which exists. 
251 |         "POS=(111 + 1)",
252 |         "POS =(224 / 2)",
253 |         "POS= (112 * 3) / 3",
254 |         "POS=(112 * 3 / 3 )",
255 |         "POS=25 * 4 + 24 / 2",
256 |         "POS=112 * -1 * -1",
257 |         "-POS=-112",
258 |         "POS=112.25 - 1 / 4",
259 |         "POS=112.25e3 * 1e-3 - 0.25",
260 |     ],
261 | )
      # NOTE(review): function name has a typo — "arithmethic" should be
      # "arithmetic". Left unchanged here since pytest discovers it by prefix.
262 | def test_query_arithmethic(tmp_path, expr):
263 | 
          # Each parametrized expression evaluates to POS=112, which exists in
          # sample.vcf.gz, so both tools should emit exactly one "112" line.
264 |     args = r"query -f '%POS\n'" + f" -i '{expr}'"
265 |     vcf_name = "sample.vcf.gz"
266 |     vcf_path = pathlib.Path("tests/data/vcf") / vcf_name
267 |     vcz_path = vcz_path_cache(vcf_path)
268 | 
269 |     bcftools_output, _ = run_bcftools(f"{args} {vcf_path}")
270 |     vcztools_output, _ = run_vcztools(f"{args} {vcz_path}")
271 | 
272 |     assert vcztools_output == bcftools_output
273 |     assert vcztools_output == "112\n"
274 | 
275 | 
276 | @pytest.mark.parametrize(
277 |     ("expr", "expected"),
278 |     [
279 |         # Check boolean logic evaluation. Will evaluate this with
280 |         # POS=112, so POS=112 is True and POS!=112 is False
281 |         ("POS==112 || POS!=112", True),
282 |         ("POS==112 && POS!=112", False),
283 |         ("POS==112 || POS!=112 && POS!= 112", True),
284 |         ("(POS==112 || POS!=112) && POS!= 112", False),
285 |     ],
286 | )
      # NOTE(review): function name has a typo — "precendence" should be
      # "precedence". Left unchanged here since pytest discovers it by prefix.
287 | def test_query_logic_precendence(tmp_path, expr, expected):
288 | 
          # Guard with POS=112 so output is either one line ("112\n") when the
          # inner expression is True, or empty when it is False.
289 |     args = r"query -f '%POS\n'" + f" -i 'POS=112 && ({expr})'"
290 |     vcf_name = "sample.vcf.gz"
291 |     vcf_path = pathlib.Path("tests/data/vcf") / vcf_name
292 |     vcz_path = vcz_path_cache(vcf_path)
293 | 
294 |     bcftools_output, _ = run_bcftools(f"{args} {vcf_path}")
295 |     vcztools_output, _ = run_vcztools(f"{args} {vcz_path}")
296 | 
297 |     assert vcztools_output == bcftools_output
          # int(True) == 1, int(False) == 0: expected line count of the output.
298 |     num_lines = len(list(vcztools_output.splitlines()))
299 |     assert num_lines == int(expected)
300 | 
301 | 
302 | # fmt: off
303 | @pytest.mark.parametrize(
304 |     ("args", "vcf_name", "bcftools_error_string"),
305 |     [
306 |         ("index -ns", "sample.vcf.gz", True),
307 |         ("query -f '%POS\n' -i 'INFO/DP > 10' -e 'INFO/DP < 50'", "sample.vcf.gz", True),  # noqa: E501
308 |         ("query -f '%GT'", "sample.vcf.gz", True),
309 
| ("query -f '%HQ'", "sample.vcf.gz", True),
310 |         ("query -f '%SAMPLE'", "sample.vcf.gz", True),
311 |         ("view -i 'INFO/DP > 10' -e 'INFO/DP < 50'", "sample.vcf.gz", True),
312 |         ("view -i 'DP > 10'", "sample.vcf.gz", True),
313 |         # bcftools output does not start with "Error"
314 |         ("view -i 'FILTER=\"F\"'", "sample.vcf.gz", False),
315 |     ],
316 | )
317 | # fmt: on
318 | def test_error(tmp_path, args, vcf_name, bcftools_error_string):
          # Invalid command lines must fail in BOTH tools. When
          # bcftools_error_string is True we additionally check that bcftools'
          # stderr uses one of its standard error prefixes; vcztools' stderr is
          # always required to contain "Error:".
319 |     vcf_path = pathlib.Path("tests/data/vcf") / vcf_name
320 |     vcz_path = vcz_path_cache(vcf_path)
321 | 
322 |     _, bcftools_error = run_bcftools(f"{args} {vcf_path}", expect_error=True)
323 |     if bcftools_error_string:
324 |         assert bcftools_error.startswith("Error:") or bcftools_error.startswith("[E::")
325 | 
326 |     _, vcztools_error = run_vcztools(f"{args} {vcz_path}", expect_error=True)
327 |     assert "Error:" in vcztools_error
328 | 
--------------------------------------------------------------------------------
/vcztools/vcf_writer.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import logging
 3 | import sys
 4 | from datetime import datetime
 5 | 
 6 | import numpy as np
 7 | import zarr
 8 | 
 9 | from vcztools.samples import parse_samples
10 | from vcztools.utils import (
11 |     open_file_like,
12 | )
13 | 
14 | from . import _vcztools, constants, retrieval
15 | from . 
import filter as filter_mod 16 | from .constants import FLOAT32_MISSING, RESERVED_VARIABLE_NAMES 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | # references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf 21 | 22 | # [Table 1: Reserved INFO keys] 23 | RESERVED_INFO_KEY_DESCRIPTIONS = { 24 | "AA": "Ancestral allele", 25 | "AC": "Allele count in genotypes", 26 | "AD": "Total read depth for each allele", 27 | "ADF": "Read depth for each allele on the forward strand", 28 | "ADR": "Read depth for each allele on the reverse strand", 29 | "AF": "Allele frequency for each ALT allele in the same order as listed", 30 | "AN": "Total number of alleles in called genotypes", 31 | "BQ": "RMS base quality", 32 | "CIGAR": "Cigar string describing how to align an alternate allele to the reference" 33 | "allele", 34 | "DB": "dbSNP membership", 35 | "DP": "Combined depth across samples", 36 | "END": "End position on CHROM", 37 | "H2": "HapMap2 membership", 38 | "H3": "HapMap3 membership", 39 | "MQ": "RMS mapping quality", 40 | "MQ0": "Number of MAPQ == 0 reads", 41 | "NS": "Number of samples with data", 42 | "SB": "Strand bias", 43 | "SOMATIC": "Somatic mutation", 44 | "VALIDATED": "Validated by follow-up experiment", 45 | "1000G": "1000 Genomes membership", 46 | } 47 | 48 | # [Table 2: Reserved genotype keys] 49 | RESERVED_FORMAT_KEY_DESCRIPTIONS = { 50 | "AD": "Read depth for each allele", 51 | "ADF": "Read depth for each allele on the forward strand", 52 | "ADR": "Read depth for each allele on the reverse strand", 53 | "DP": "Read depth", 54 | "EC": "Expected alternate allele counts", 55 | "FT": 'Filter indicating if this genotype was "called"', 56 | "GL": "Genotype likelihoods", 57 | "GP": "Genotype posterior probabilities", 58 | "GQ": "Conditional genotype quality", 59 | "GT": "Genotype", 60 | "HQ": "Haplotype quality", 61 | "MQ": "RMS mapping quality", 62 | "PL": "Phred-scaled genotype likelihoods rounded to the closest integer", 63 | "PP": 
"Phred-scaled genotype posterior probabilities rounded to the closest " 64 | "integer", 65 | "PQ": "Phasing quality", 66 | "PS": "Phase set", 67 | } 68 | 69 | 70 | def dims(arr): 71 | return arr.attrs["_ARRAY_DIMENSIONS"] 72 | 73 | 74 | def write_vcf( 75 | vcz, 76 | output, 77 | *, 78 | header_only: bool = False, 79 | no_header: bool = False, 80 | no_version: bool = False, 81 | regions=None, 82 | targets=None, 83 | no_update=None, 84 | samples=None, 85 | force_samples: bool = False, 86 | drop_genotypes: bool = False, 87 | include: str | None = None, 88 | exclude: str | None = None, 89 | ) -> None: 90 | root = zarr.open(vcz, mode="r") 91 | 92 | with open_file_like(output) as output: 93 | if samples and drop_genotypes: 94 | raise ValueError("Cannot select samples and drop genotypes.") 95 | elif drop_genotypes: 96 | sample_ids = [] 97 | samples_selection = np.array([]) 98 | else: 99 | all_samples = root["sample_id"][:] 100 | sample_ids, samples_selection = parse_samples( 101 | samples, all_samples, force_samples=force_samples 102 | ) 103 | 104 | # Need to try parsing filter expressions before writing header 105 | filter_mod.FilterExpression( 106 | field_names=set(root), include=include, exclude=exclude 107 | ) 108 | 109 | if not no_header: 110 | force_ac_an_header = not drop_genotypes and samples_selection is not None 111 | vcf_header = _generate_header( 112 | root, 113 | sample_ids, 114 | no_version=no_version, 115 | force_ac_an=force_ac_an_header, 116 | ) 117 | print(vcf_header, end="", file=output) 118 | 119 | if header_only: 120 | return 121 | 122 | contigs = root["contig_id"][:].astype("S") 123 | filters = get_filter_ids(root) 124 | 125 | for chunk_data in retrieval.variant_chunk_iter( 126 | root, 127 | regions=regions, 128 | targets=targets, 129 | include=include, 130 | exclude=exclude, 131 | samples_selection=samples_selection, 132 | ): 133 | c_chunk_to_vcf( 134 | chunk_data, 135 | samples_selection, 136 | contigs, 137 | filters, 138 | output, 139 | 
drop_genotypes=drop_genotypes, 140 | no_update=no_update, 141 | ) 142 | 143 | 144 | def c_chunk_to_vcf( 145 | chunk_data, 146 | samples_selection, 147 | contigs, 148 | filters, 149 | output, 150 | *, 151 | drop_genotypes, 152 | no_update, 153 | ): 154 | format_fields = {} 155 | info_fields = {} 156 | num_samples = len(samples_selection) if samples_selection is not None else None 157 | 158 | # TODO check we don't truncate silently by doing this 159 | pos = chunk_data["variant_position"].astype(np.int32) 160 | num_variants = len(pos) 161 | if num_variants == 0: 162 | return "" 163 | # Required fields 164 | chrom = contigs[chunk_data["variant_contig"]] 165 | alleles = chunk_data["variant_allele"] 166 | 167 | # Optional fields which we fill in with "all missing" defaults 168 | if "variant_id" in chunk_data: 169 | id = chunk_data["variant_id"].astype("S") 170 | else: 171 | id = np.array(["."] * num_variants, dtype="S") 172 | if "variant_quality" in chunk_data: 173 | qual = chunk_data["variant_quality"] 174 | else: 175 | qual = np.full(num_variants, FLOAT32_MISSING, dtype=np.float32) 176 | 177 | # Filter defaults to "PASS" if not present 178 | if "variant_filter" in chunk_data: 179 | filter_ = chunk_data["variant_filter"] 180 | else: 181 | filter_ = np.ones((num_variants, 1), dtype=bool) 182 | 183 | gt = None 184 | gt_phased = None 185 | 186 | if "call_genotype" in chunk_data and not drop_genotypes: 187 | gt = chunk_data["call_genotype"] 188 | 189 | if ( 190 | "call_genotype_phased" in chunk_data 191 | and not drop_genotypes 192 | and (samples_selection is None or num_samples != 0) 193 | ): 194 | gt_phased = chunk_data["call_genotype_phased"] 195 | else: 196 | # Default to unphased if call_genotype_phased not present 197 | gt_phased = np.zeros(gt.shape[:2], dtype=bool) 198 | 199 | for name, array in chunk_data.items(): 200 | if ( 201 | name.startswith("call_") 202 | and not name == "call_mask" 203 | and not name.startswith("call_genotype") 204 | and num_samples != 0 205 
| ): 206 | vcf_name = name[len("call_") :] 207 | format_fields[vcf_name] = array 208 | if num_samples is None: 209 | num_samples = array.shape[1] 210 | elif name.startswith("variant_") and name not in RESERVED_VARIABLE_NAMES: 211 | vcf_name = name[len("variant_") :] 212 | info_fields[vcf_name] = array 213 | 214 | ref = alleles[:, 0].astype("S") 215 | alt = alleles[:, 1:].astype("S") 216 | 217 | if len(id.shape) == 1: 218 | id = id.reshape((-1, 1)) 219 | if ( 220 | not no_update 221 | and samples_selection is not None 222 | and "call_genotype" in chunk_data 223 | and not drop_genotypes 224 | ): 225 | # Recompute INFO/AC and INFO/AN 226 | info_fields |= _compute_info_fields(gt, alt) 227 | if num_samples == 0: 228 | gt = None 229 | if gt is not None and num_samples is None: 230 | num_samples = gt.shape[1] 231 | 232 | encoder = _vcztools.VcfEncoder( 233 | num_variants, 234 | num_samples if num_samples is not None else 0, 235 | chrom=chrom, 236 | pos=pos, 237 | id=id, 238 | alt=alt, 239 | ref=ref, 240 | qual=qual, 241 | filter_ids=filters, 242 | filter=filter_, 243 | ) 244 | # print(encoder.arrays) 245 | if gt is not None: 246 | encoder.add_gt_field(gt, gt_phased) 247 | for name, zarray in info_fields.items(): 248 | # print(array.dtype.kind) 249 | if zarray.dtype.kind in ("O", "U"): 250 | zarray = zarray.astype("S") 251 | if len(zarray.shape) == 1: 252 | zarray = zarray.reshape((num_variants, 1)) 253 | encoder.add_info_field(name, zarray) 254 | 255 | if num_samples != 0: 256 | for name, zarray in format_fields.items(): 257 | if zarray.dtype.kind in ("O", "U"): 258 | zarray = zarray.astype("S") 259 | if len(zarray.shape) == 2: 260 | zarray = zarray.reshape((num_variants, num_samples, 1)) 261 | encoder.add_format_field(name, zarray) 262 | 263 | # TODO: (1) make a guess at this based on number of fields and samples, 264 | # and (2) log a DEBUG message when we have to double. 
265 | buflen = 1024 266 | for j in range(num_variants): 267 | failed = True 268 | while failed: 269 | try: 270 | line = encoder.encode(j, buflen) 271 | failed = False 272 | except _vcztools.VczBufferTooSmall: 273 | buflen *= 2 274 | # print("Bumping buflen to", buflen) 275 | print(line, file=output) 276 | 277 | 278 | def get_filter_ids(root): 279 | """ 280 | Returns the filter IDs from the specified Zarr store. If the array 281 | does not exist, return a single filter "PASS" by default. 282 | """ 283 | if "filter_id" in root: 284 | filters = root["filter_id"][:].astype("S") 285 | else: 286 | filters = np.array(["PASS"], dtype="S") 287 | return filters 288 | 289 | 290 | def _generate_header( 291 | ds, 292 | sample_ids, 293 | *, 294 | no_version: bool = False, 295 | force_ac_an: bool = False, 296 | ): 297 | output = io.StringIO() 298 | 299 | contigs = list(ds["contig_id"][:]) 300 | filters = list(get_filter_ids(ds).astype("U")) 301 | info_fields = [] 302 | format_fields = [] 303 | 304 | if "call_genotype" in ds and len(sample_ids) > 0: 305 | # GT must be the first field if present, per the spec (section 1.6.2) 306 | format_fields.append("GT") 307 | 308 | for var in sorted(ds.keys()): 309 | arr = ds[var] 310 | if ( 311 | var.startswith("variant_") 312 | and not var.endswith("_fill") 313 | and not var.endswith("_mask") 314 | and var not in RESERVED_VARIABLE_NAMES 315 | and dims(arr)[0] == "variants" 316 | ): 317 | key = var[len("variant_") :] 318 | info_fields.append(key) 319 | elif ( 320 | len(sample_ids) > 0 321 | and var.startswith("call_") 322 | and not var.endswith("_fill") 323 | and not var.endswith("_mask") 324 | and dims(arr)[0] == "variants" 325 | and dims(arr)[1] == "samples" 326 | ): 327 | key = var[len("call_") :] 328 | if key in ("genotype", "genotype_phased"): 329 | continue 330 | format_fields.append(key) 331 | 332 | # [1.4.1 File format] 333 | print("##fileformat=VCFv4.3", file=output) 334 | 335 | if "source" in ds.attrs: 336 | 
print(f'##source={ds.attrs["source"]}', file=output) 337 | 338 | # [1.4.2 Information field format] 339 | for key in info_fields: 340 | arr = ds[f"variant_{key}"] 341 | category = "INFO" 342 | vcf_number = _array_to_vcf_number(category, key, arr) 343 | vcf_type = _array_to_vcf_type(arr) 344 | vcf_description = arr.attrs.get( 345 | "description", RESERVED_INFO_KEY_DESCRIPTIONS.get(key, "") 346 | ) 347 | print( 348 | f'##INFO=', 349 | file=output, 350 | ) 351 | 352 | if force_ac_an: 353 | # bcftools always recomputes the AC and AN fields when samples are specified, 354 | # even if these fields don't exist before 355 | for key, number in [("AC", "A"), ("AN", "1")]: 356 | if key not in info_fields: 357 | print( 358 | f"##INFO=', 360 | file=output, 361 | ) 362 | 363 | # [1.4.3 Filter field format] 364 | filter_descriptions = ( 365 | ds["filter_description"] if "filter_description" in ds else None 366 | ) 367 | for i, filter in enumerate(filters): 368 | filter_description = ( 369 | "" if filter_descriptions is None else filter_descriptions[i] 370 | ) 371 | print( 372 | f'##FILTER=', 373 | file=output, 374 | ) 375 | 376 | # [1.4.4 Individual format field format] 377 | for key in format_fields: 378 | if key == "GT": 379 | print( 380 | '##FORMAT=', 381 | file=output, 382 | ) 383 | else: 384 | arr = ds[f"call_{key}"] 385 | category = "FORMAT" 386 | vcf_number = _array_to_vcf_number(category, key, arr) 387 | vcf_type = _array_to_vcf_type(arr) 388 | vcf_description = arr.attrs.get( 389 | "description", RESERVED_FORMAT_KEY_DESCRIPTIONS.get(key, "") 390 | ) 391 | print( 392 | f'##FORMAT=', 393 | file=output, 394 | ) 395 | 396 | # [1.4.7 Contig field format] 397 | contig_lengths = ds["contig_length"] if "contig_length" in ds else None 398 | for i, contig in enumerate(contigs): 399 | if contig_lengths is None: 400 | print(f"##contig=", file=output) 401 | else: 402 | print(f"##contig=", file=output) 403 | 404 | if not no_version: 405 | print( 406 | f"##vcztools_viewCommand={' 
'.join(sys.argv[1:])}; Date={datetime.now()}", 407 | file=output, 408 | ) 409 | 410 | # Other meta information lines not covered above 411 | if "vcf_meta_information" in ds.attrs: 412 | for key, value in ds.attrs["vcf_meta_information"]: 413 | if key not in ("fileformat", "source"): 414 | print(f"##{key}={value}", file=output) 415 | 416 | # [1.5 Header line syntax] 417 | print( 418 | "#CHROM", 419 | "POS", 420 | "ID", 421 | "REF", 422 | "ALT", 423 | "QUAL", 424 | "FILTER", 425 | "INFO", 426 | sep="\t", 427 | end="", 428 | file=output, 429 | ) 430 | 431 | if len(sample_ids) > 0: 432 | print(end="\t", file=output) 433 | print("FORMAT", *sample_ids, sep="\t", file=output) 434 | else: 435 | print(file=output) 436 | 437 | return output.getvalue() 438 | 439 | 440 | def _array_to_vcf_number(category, key, a): 441 | # reverse of vcf_number_to_dimension_and_size 442 | if a.dtype == bool: 443 | return 0 444 | elif category == "INFO" and len(dims(a)) == 1: 445 | return 1 446 | elif category == "FORMAT" and len(dims(a)) == 2: 447 | return 1 448 | 449 | last_dim = dims(a)[-1] 450 | if last_dim == "alt_alleles": 451 | return "A" 452 | elif last_dim == "alleles": 453 | return "R" 454 | elif last_dim == "genotypes": 455 | return "G" 456 | elif last_dim == f"{category}_{key}_dim": 457 | return a.shape[-1] 458 | else: 459 | raise ValueError( 460 | f"Cannot determine VCF Number for dimension name '{last_dim}' in {a}" 461 | ) 462 | 463 | 464 | def _array_to_vcf_type(a): 465 | if a.dtype == bool: 466 | return "Flag" 467 | elif np.issubdtype(a.dtype, np.integer): 468 | return "Integer" 469 | elif np.issubdtype(a.dtype, np.float32): 470 | return "Float" 471 | elif a.dtype.str[1:] in ("S1", "U1"): 472 | return "Character" 473 | elif a.dtype.kind in ("O", "S", "U"): 474 | return "String" 475 | else: 476 | raise ValueError(f"Unsupported dtype: {a.dtype}") 477 | 478 | 479 | def _compute_info_fields(gt: np.ndarray, alt: np.ndarray): 480 | flatter_gt = gt.reshape((gt.shape[0], -1)) 481 | 
allele_count = alt.shape[1] + 1 482 | 483 | def filter_and_bincount(values: np.ndarray): 484 | positive = values[values > 0] 485 | return np.bincount(positive, minlength=allele_count)[1:] 486 | 487 | computed_ac = np.apply_along_axis(filter_and_bincount, 1, flatter_gt).astype( 488 | np.int32 489 | ) 490 | computed_ac[alt == b""] = constants.INT_FILL 491 | computed_an = np.sum(flatter_gt >= 0, axis=1, dtype=np.int32) 492 | 493 | return { 494 | "AC": computed_ac, 495 | "AN": computed_an, 496 | } 497 | -------------------------------------------------------------------------------- /tests/test_vcf_writer.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | import sys 4 | from io import StringIO 5 | 6 | import numpy as np 7 | import pytest 8 | import zarr 9 | from cyvcf2 import VCF 10 | from numpy.testing import assert_array_equal 11 | 12 | from vcztools.constants import INT_FILL, INT_MISSING 13 | from vcztools.vcf_writer import _compute_info_fields, c_chunk_to_vcf, write_vcf 14 | 15 | from .utils import assert_vcfs_close, vcz_path_cache 16 | 17 | 18 | @pytest.mark.parametrize("output_is_path", [True, False]) 19 | def test_write_vcf(tmp_path, output_is_path): 20 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 21 | vcz = vcz_path_cache(original) 22 | output = tmp_path.joinpath("output.vcf") 23 | 24 | if output_is_path: 25 | write_vcf(vcz, output, no_version=True) 26 | else: 27 | output_str = StringIO() 28 | write_vcf(vcz, output_str, no_version=True) 29 | with open(output, "w") as f: 30 | f.write(output_str.getvalue()) 31 | 32 | v = VCF(output) 33 | 34 | assert v.samples == ["NA00001", "NA00002", "NA00003"] 35 | 36 | variant = next(v) 37 | 38 | assert variant.CHROM == "19" 39 | assert variant.POS == 111 40 | assert variant.ID is None 41 | assert variant.REF == "A" 42 | assert variant.ALT == ["C"] 43 | assert variant.QUAL == pytest.approx(9.6) 44 | assert variant.FILTER is None 45 | 46 | 
assert variant.genotypes == [[0, 0, True], [0, 0, True], [0, 1, False]] 47 | 48 | assert_array_equal( 49 | variant.format("HQ"), 50 | [[10, 15], [10, 10], [3, 3]], 51 | ) 52 | 53 | # check headers are the same 54 | assert_vcfs_close(original, output) 55 | 56 | 57 | @pytest.mark.parametrize( 58 | ("include", "exclude", "expected_chrom_pos"), 59 | [ 60 | ("POS < 1000", None, [("19", 111), ("19", 112), ("X", 10)]), 61 | ( 62 | None, 63 | "POS < 1000", 64 | [ 65 | ("20", 14370), 66 | ("20", 17330), 67 | ("20", 1110696), 68 | ("20", 1230237), 69 | ("20", 1234567), 70 | ("20", 1235237), 71 | ], 72 | ), 73 | ], 74 | ) 75 | def test_write_vcf__filtering(tmp_path, include, exclude, expected_chrom_pos): 76 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 77 | vcz = vcz_path_cache(original) 78 | output = tmp_path.joinpath("output.vcf") 79 | 80 | write_vcf(vcz, output, include=include, exclude=exclude) 81 | 82 | v = VCF(str(output)) 83 | variants = list(v) 84 | 85 | assert len(variants) == len(expected_chrom_pos) 86 | assert v.samples == ["NA00001", "NA00002", "NA00003"] 87 | 88 | for variant, chrom_pos in zip(variants, expected_chrom_pos): 89 | chrom, pos = chrom_pos 90 | assert variant.CHROM == chrom 91 | assert variant.POS == pos 92 | 93 | 94 | # fmt: off 95 | @pytest.mark.parametrize( 96 | ("regions", "targets", "expected_chrom_pos"), 97 | [ 98 | # regions only 99 | ("19", None, [("19", 111), ("19", 112)]), 100 | ("19:112", None, [("19", 112)]), 101 | ("20:1230236-", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 102 | ("20:1230237-", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 103 | ("20:1230238-", None, [("20", 1234567), ("20", 1235237)]), 104 | ("20:1230237-1235236", None, [("20", 1230237), ("20", 1234567)]), 105 | ("20:1230237-1235237", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 106 | ("20:1230237-1235238", None, [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 107 | ("19,X", 
None, [("19", 111), ("19", 112), ("X", 10)]), 108 | ("X:11", None, [("X", 10)]), # note differs from targets 109 | 110 | # targets only 111 | (None, "19", [("19", 111), ("19", 112)]), 112 | (None, "19:112", [("19", 112)]), 113 | (None, "20:1230236-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 114 | (None, "20:1230237-", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), 115 | (None, "20:1230238-", [("20", 1234567), ("20", 1235237)]), 116 | (None, "20:1230237-1235236", [("20", 1230237), ("20", 1234567)]), 117 | (None, "20:1230237-1235237", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 118 | (None, "20:1230237-1235238", [("20", 1230237), ("20", 1234567), ("20", 1235237)]), # noqa: E501 119 | (None, "19,X", [("19", 111), ("19", 112), ("X", 10)]), 120 | (None, "X:11", []), 121 | (None, "^19,20:1-1234567", [("20", 1235237), ("X", 10)]), # complement 122 | 123 | # regions and targets 124 | ("20", "^20:1110696-", [("20", 14370), ("20", 17330)]) 125 | ] 126 | ) 127 | # fmt: on 128 | def test_write_vcf__regions(tmp_path, regions, targets, expected_chrom_pos): 129 | 130 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 131 | vcz = vcz_path_cache(original) 132 | output = tmp_path.joinpath("output.vcf") 133 | 134 | write_vcf(vcz, output, regions=regions, targets=targets) 135 | 136 | v = VCF(output) 137 | variants = list(v) 138 | assert len(variants) == len(expected_chrom_pos) 139 | 140 | assert v.samples == ["NA00001", "NA00002", "NA00003"] 141 | 142 | for variant, chrom_pos in zip(variants, expected_chrom_pos): 143 | chrom, pos = chrom_pos 144 | assert variant.CHROM == chrom 145 | assert variant.POS == pos 146 | 147 | 148 | @pytest.mark.parametrize( 149 | ("samples", "force_samples", "expected_samples", "expected_genotypes"), 150 | [ 151 | ("NA00001", False, ["NA00001"], [[0, 0, True]]), 152 | ( 153 | "NA00001,NA00003", 154 | False, 155 | ["NA00001", "NA00003"], 156 | [[0, 0, True], [0, 1, False]], 157 | ), 158 | ( 159 | 
"NA00003,NA00001", 160 | False, 161 | ["NA00003", "NA00001"], 162 | [[0, 1, False], [0, 0, True]], 163 | ), 164 | ("^NA00002", False, ["NA00001", "NA00003"], [[0, 0, True], [0, 1, False]]), 165 | ("^NA00003,NA00002", False, ["NA00001"], [[0, 0, True]]), 166 | ("^NA00003,NA00002,NA00003", False, ["NA00001"], [[0, 0, True]]), 167 | ("NO_SAMPLE", True, [], None), 168 | ], 169 | ) 170 | def test_write_vcf__samples( 171 | tmp_path, samples, force_samples, expected_samples, expected_genotypes 172 | ): 173 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 174 | vcz = vcz_path_cache(original) 175 | output = tmp_path.joinpath("output.vcf") 176 | 177 | write_vcf(vcz, output, samples=samples, force_samples=force_samples) 178 | 179 | v = VCF(output) 180 | 181 | assert v.samples == expected_samples 182 | 183 | variant = next(v) 184 | 185 | assert variant.CHROM == "19" 186 | assert variant.POS == 111 187 | assert variant.ID is None 188 | assert variant.REF == "A" 189 | assert variant.ALT == ["C"] 190 | assert variant.QUAL == pytest.approx(9.6) 191 | assert variant.FILTER is None 192 | 193 | assert variant.genotypes == expected_genotypes 194 | 195 | 196 | def test_write_vcf__non_existent_sample(tmp_path): 197 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 198 | vcz = vcz_path_cache(original) 199 | output = tmp_path.joinpath("output.vcf") 200 | 201 | with pytest.raises( 202 | ValueError, 203 | match=re.escape( 204 | "subset called for sample(s) not in header: NO_SAMPLE. " 205 | 'Use "--force-samples" to ignore this error.' 
206 | ), 207 | ): 208 | write_vcf(vcz, output, samples="NO_SAMPLE") 209 | 210 | 211 | def test_write_vcf__no_samples(tmp_path): 212 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 213 | vcz = vcz_path_cache(original) 214 | output = tmp_path.joinpath("output.vcf") 215 | 216 | write_vcf(vcz, output, drop_genotypes=True) 217 | 218 | v = VCF(output) 219 | 220 | assert v.samples == [] 221 | 222 | 223 | @pytest.mark.parametrize( 224 | ("regions", "targets", "samples", "include", "expected_chrom_pos"), 225 | [ 226 | # Test that sample filtering takes place after include filtering. 227 | ("20", None, "NA00001", "FMT/GQ > 60", [("20", 1230237)]), 228 | # Test that region filtering and include expression are combined. 229 | ("19", None, "NA00001", "POS > 200", []), 230 | # Test that target filtering and include expression are combined. 231 | (None, "19", "NA00001", "POS > 200", []), 232 | # Test that empty output in the no-regions cases works 233 | (None, None, "NA00001", "POS < 1", []), 234 | # Test that empty output in the no-regions cases works 235 | (None, None, None, "POS < 1", []), 236 | ], 237 | ) 238 | def test_write_vcf__regions_samples_filtering( 239 | tmp_path, regions, targets, samples, include, expected_chrom_pos 240 | ): 241 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 242 | vcz = vcz_path_cache(original) 243 | output = tmp_path.joinpath("output.vcf") 244 | 245 | write_vcf( 246 | vcz, 247 | output, 248 | regions=regions, 249 | targets=targets, 250 | samples=samples, 251 | include=include, 252 | ) 253 | 254 | v = VCF(str(output)) 255 | variants = list(v) 256 | 257 | assert len(variants) == len(expected_chrom_pos) 258 | if samples is not None: 259 | assert v.samples == [samples] 260 | 261 | for variant, chrom_pos in zip(variants, expected_chrom_pos): 262 | chrom, pos = chrom_pos 263 | assert variant.CHROM == chrom 264 | assert variant.POS == pos 265 | 266 | 267 | def test_write_vcf__include_exclude(tmp_path): 268 | original = 
pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 269 | vcz = vcz_path_cache(original) 270 | output = tmp_path.joinpath("output.vcf") 271 | 272 | variant_site_filter = "POS > 1" 273 | 274 | with pytest.raises( 275 | ValueError, 276 | match=re.escape( 277 | "Cannot handle both an include expression and an exclude expression." 278 | ), 279 | ): 280 | write_vcf(vcz, output, include=variant_site_filter, exclude=variant_site_filter) 281 | 282 | 283 | def test_write_vcf__header_flags(tmp_path): 284 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 285 | vcz = vcz_path_cache(original) 286 | output = tmp_path.joinpath("output.vcf") 287 | 288 | output_header = StringIO() 289 | write_vcf(vcz, output_header, header_only=True, no_version=True) 290 | 291 | output_no_header = StringIO() 292 | write_vcf(vcz, output_no_header, no_header=True, no_version=True) 293 | assert not output_no_header.getvalue().startswith("#") 294 | 295 | # combine outputs and check VCFs match 296 | output_str = output_header.getvalue() + output_no_header.getvalue() 297 | with open(output, "w") as f: 298 | f.write(output_str) 299 | assert_vcfs_close(original, output) 300 | 301 | 302 | def test_write_vcf__generate_header(): 303 | original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz" 304 | vcz = vcz_path_cache(original) 305 | 306 | output_header = StringIO() 307 | write_vcf(vcz, output_header, header_only=True, no_version=True) 308 | 309 | expected_vcf_header = """##fileformat=VCFv4.3 310 | ##source={} 311 | ##INFO= 312 | ##INFO= 313 | ##INFO= 314 | ##INFO= 315 | ##INFO= 316 | ##INFO= 317 | ##INFO= 318 | ##INFO= 319 | ##FILTER= 320 | ##FILTER= 321 | ##FILTER= 322 | ##FORMAT= 323 | ##FORMAT= 324 | ##FORMAT= 325 | ##FORMAT= 326 | ##contig= 327 | ##contig= 328 | ##contig= 329 | ##fileDate=20090805 330 | ##reference=1000GenomesPilot-NCBI36 331 | ##phasing=partial 332 | ##ALT= 333 | ##ALT= 334 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 335 | """ # noqa: E501 336 | 
def test_compute_info_fields():
    """_compute_info_fields recomputes INFO/AC and INFO/AN from genotypes."""
    gt = np.array(
        [
            [[0, 0], [0, 1], [1, 1]],
            [[0, 0], [0, 2], [2, 2]],
            [[0, 1], [1, 2], [2, 2]],
            [
                [INT_MISSING, INT_MISSING],
                [INT_MISSING, INT_MISSING],
                [INT_FILL, INT_FILL],
            ],
            [[INT_MISSING, INT_MISSING], [0, 3], [INT_FILL, INT_FILL]],
        ]
    )
    alt = np.array(
        [
            [b"A", b"B", b""],
            [b"A", b"B", b"C"],
            [b"A", b"B", b"C"],
            [b"", b"", b""],
            [b"A", b"B", b"C"],
        ]
    )
    expected_result = {
        # AC is INT_FILL wherever the corresponding ALT allele is empty.
        "AC": np.array(
            [
                [3, 0, INT_FILL],
                [0, 3, 0],
                [2, 3, 0],
                [INT_FILL, INT_FILL, INT_FILL],
                [0, 0, 1],
            ]
        ),
        # AN counts all non-missing, non-fill allele calls per variant.
        "AN": np.array([6, 6, 6, 0, 2]),
    }

    computed_info_fields = _compute_info_fields(gt, alt)

    assert expected_result.keys() == computed_info_fields.keys()

    for key in expected_result.keys():
        np.testing.assert_array_equal(expected_result[key], computed_info_fields[key])


class TestApiErrors:
    """Error paths of the write_vcf Python API."""

    @pytest.fixture()
    def vcz(self):
        original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
        return vcz_path_cache(original)

    def test_samples_and_drop_genotypes(self, vcz):
        with pytest.raises(
            ValueError, match="Cannot select samples and drop genotypes"
        ):
            write_vcf(vcz, sys.stdout, samples=["NA00001"], drop_genotypes=True)

    def test_no_output_filter_parse_error(self, vcz):
        # A filter parse failure must be raised before any output is written.
        output = StringIO()
        with pytest.raises(ValueError, match='the tag "Not" is not defined'):
            write_vcf(vcz, output, include="Not a valid expression")
        assert output.getvalue() == ""


def minimal_vcf_chunk(num_variants, num_samples, ploidy=2):
    """Return the smallest chunk dict accepted by c_chunk_to_vcf."""
    return {
        "variant_position": 1 + np.arange(num_variants, dtype=np.int32),
        "variant_contig": np.zeros(num_variants, dtype=np.int32),
        # "variant_id": np.array(["."] * num_variants, dtype="S1"),
        "variant_id": np.array(["."] * num_variants, dtype="S").reshape(
            (num_variants, 1)
        ),
        "variant_allele": np.array([("A", "T")] * num_variants),
        "variant_quality": np.zeros(num_variants, dtype=np.float32),
        "variant_filter": np.ones(num_variants, dtype=bool).reshape((num_variants, 1)),
        "call_genotype": np.zeros((num_variants, num_samples, ploidy), dtype=np.int8),
    }


def chunk_to_vcf(chunk):
    """Encode a chunk dict to VCF record text (no header lines)."""
    filters = np.array([b"PASS"])
    contigs = np.array([b"chr1"])
    output = StringIO()
    c_chunk_to_vcf(
        chunk,
        samples_selection=None,
        contigs=contigs,
        filters=filters,
        output=output,
        drop_genotypes=False,
        no_update=False,
    )
    return output.getvalue()


def chunk_to_vcf_file(chunk):
    """
    Simple function just to get the data out to a minimal file for
    testing and evaluation
    """
    num_samples = chunk["call_genotype"].shape[1]

    output = StringIO()
    print("##fileformat=VCFv4.3", file=output)
    # NOTE(review): the two meta-line literals below lost their
    # angle-bracket payloads in the archived copy of this file; they are
    # restored to the standard VCFv4.3 forms — confirm against upstream.
    print("##contig=<ID=chr1>", file=output)
    print(
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        file=output,
    )
    print(
        "#CHROM",
        "POS",
        "ID",
        "REF",
        "ALT",
        "QUAL",
        "FILTER",
        "INFO",
        sep="\t",
        end="",
        file=output,
    )
    print(end="\t", file=output)
    sample_ids = [f"x{j}" for j in range(num_samples)]
    print("FORMAT", *sample_ids, sep="\t", file=output)
    return output.getvalue() + chunk_to_vcf(chunk)


class TestEncoding:
    """Record-level encoding behaviour of c_chunk_to_vcf."""

    def test_basic_example(self):
        chunk = minimal_vcf_chunk(1, 2)
        out = chunk_to_vcf(chunk)
        line = "\t".join(
            ["chr1", "1", ".", "A", "T", "0", "PASS", ".", "GT", "0/0", "0/0"]
        )
        assert out == line + "\n"

    def test_mixed_ploidy(self):
        # -2 is the end-of-ploidy fill: 0/-2 should render as a haploid "0".
        chunk = minimal_vcf_chunk(2, 2)
        chunk["call_genotype"][0, 0, 1] = -2
        chunk["call_genotype"][1, 1, 1] = -2
        out = chunk_to_vcf(chunk)
        lines = [
            ["chr1", "1", ".", "A", "T", "0", "PASS", ".", "GT", "0", "0/0"],
            ["chr1", "2", ".", "A", "T", "0", "PASS", ".", "GT", "0/0", "0"],
        ]
        lines = "\n".join("\t".join(line) for line in lines)
        assert out == lines + "\n"

    def test_zero_ploidy(self):
        # An all-fill genotype renders as an empty GT value.
        chunk = minimal_vcf_chunk(2, 2)
        chunk["call_genotype"][0, 0] = -2
        chunk["call_genotype"][1, 1] = -2
        out = chunk_to_vcf(chunk)
        lines = [
            ["chr1", "1", ".", "A", "T", "0", "PASS", ".", "GT", "", "0/0"],
            ["chr1", "2", ".", "A", "T", "0", "PASS", ".", "GT", "0/0", ""],
        ]
        lines = "\n".join("\t".join(line) for line in lines)
        assert out == lines + "\n"

        # NOTE bcftools/htslib doesn't like this
        # [E::vcf_parse_format] Couldn't read GT data:
        # value not a number or '.' at chr1:1
at chr1:1 509 | 510 | # with open("zero-ploidy.vcf", "w") as f: 511 | # print(chunk_to_vcf_file(chunk), file=f, end="") 512 | -------------------------------------------------------------------------------- /tests/test_filter.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy as np 4 | import numpy.testing as nt 5 | import pyparsing as pp 6 | import pytest 7 | import zarr 8 | 9 | from tests.utils import vcz_path_cache 10 | from vcztools import filter as filter_mod 11 | 12 | 13 | class TestFilterExpressionParser: 14 | @pytest.fixture() 15 | def parser(self): 16 | return filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False) 17 | 18 | @pytest.mark.parametrize( 19 | "expression", 20 | [ 21 | "", 22 | "| |", 23 | "a +", 24 | '"stri + 2', 25 | ], 26 | ) 27 | def test_invalid_expressions(self, parser, expression): 28 | with pytest.raises(pp.ParseException): 29 | parser.parse_string(expression, parse_all=True) 30 | 31 | @pytest.mark.parametrize( 32 | ("expression", "exception_class"), 33 | [ 34 | # NOTE: using an integer here so that we don't trigger the 35 | # generic string issue. 
class TestFilterExpressionSample:
    """End-to-end filter evaluation against the converted sample.vcf.gz dataset.

    Expected results are per-variant masks (1-D) for site-level expressions and
    per-variant, per-sample masks (2-D) for FORMAT-level expressions.
    """

    @pytest.mark.parametrize(
        ("expression", "expected_result"),
        [
            ('CHROM = "20"', [0, 0, 1, 1, 1, 1, 1, 1, 0]),
            ("POS < 1000", [1, 1, 0, 0, 0, 0, 0, 0, 1]),
            ("INFO/DP > 10", [0, 0, 1, 1, 0, 1, 0, 0, 0]),
            (
                "FMT/GQ > 20",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 0, 1],
                    [1, 0, 1],
                    [1, 1, 1],
                    [0, 0, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "FMT/DP >= 5 && FMT/GQ > 10",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 0, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                # Single "&" requires both conditions within the same sample,
                # matching bcftools semantics (distinct from "&&").
                "FMT/DP >= 5 & FMT/GQ > 10",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 1, 1],
                    [0, 0, 0],
                    [1, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "QUAL > 10 || FMT/GQ > 10",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "(QUAL > 10 || FMT/GQ > 10) && POS > 100000",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [1, 1, 1],
                    [1, 1, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
            (
                "(FMT/DP >= 8 | FMT/GQ > 40) && POS > 100000",
                [
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                    [1, 1, 1],
                    [0, 0, 0],
                    [0, 0, 0],
                    [0, 0, 0],
                ],
            ),
        ],
    )
    def test(self, expression, expected_result):
        """Check include=expression and exclude=expression are complements."""
        original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
        vcz = vcz_path_cache(original)
        root = zarr.open(vcz, mode="r")
        data = {field: root[field][:] for field in root.keys()}
        filter_expr = filter_mod.FilterExpression(
            field_names=set(root), include=expression
        )
        result = filter_expr.evaluate(data)
        nt.assert_array_equal(result, expected_result)

        # The same expression used as an exclusion must select exactly the
        # logical complement of the inclusion mask.
        filter_expr = filter_mod.FilterExpression(
            field_names=set(root), exclude=expression
        )
        result = filter_expr.evaluate(data)
        nt.assert_array_equal(result, np.logical_not(expected_result))


def numpify_values(data):
    """Convert each value of *data* to a numpy array, keeping keys unchanged."""
    return {k: np.array(v) for k, v in data.items()}


class TestFilterExpression:
    """Unit tests for FilterExpression evaluation on small in-memory data."""

    @pytest.mark.parametrize(
        ("expression", "data", "expected"),
        [
            ("POS<5", {"variant_position": [1, 5, 6, 10]}, [1, 0, 0, 0]),
            ("INFO/XX>=10", {"variant_XX": [1, 5, 6, 10]}, [0, 0, 0, 1]),
            ("INFO/XX / 2 >=5", {"variant_XX": [1, 5, 6, 10]}, [0, 0, 0, 1]),
            ("POS<5 | POS>8", {"variant_position": [1, 5, 6, 10]}, [1, 0, 0, 1]),
            (
                "POS<0 & POS<1 & POS<2 & POS<3 & POS<4",
                {"variant_position": range(10)},
                np.zeros(10, dtype=bool),
            ),
        ],
    )
    def test_evaluate(self, expression, data, expected):
        fee = filter_mod.FilterExpression(field_names=data.keys(), include=expression)
        result = fee.evaluate(numpify_values(data))
        nt.assert_array_equal(result, expected)

    @pytest.mark.parametrize(
        ("expression", "expected"),
        [
            # FILTER comparisons: "=" matches the exact filter set, "~" matches
            # a subset, and "!"-forms are the respective negations.
            ('FILTER="PASS"', [False, True, False, False, False, False]),
            ('FILTER="."', [True, False, False, False, False, False]),
            ('FILTER="A"', [False, False, True, False, False, False]),
            ('FILTER!="A"', [True, True, False, True, True, True]),
            ('FILTER~"A"', [False, False, True, False, True, True]),
            ('FILTER="A;B"', [False, False, False, False, True, False]),
            ('FILTER="B;A"', [False, False, False, False, True, False]),
            ('FILTER!="A;B"', [True, True, True, True, False, True]),
            ('FILTER~"A;B"', [False, False, False, False, True, True]),
            ('FILTER~"B;A"', [False, False, False, False, True, True]),
            ('FILTER!~"A;B"', [True, True, True, True, False, False]),
        ],
    )
    def test_evaluate_filter_comparison(self, expression, expected):
        # variant_filter rows are boolean indicators over filter_id columns;
        # row 0 has no filters set, which matches FILTER=".".
        data = {
            "variant_filter": [
                [False, False, False, False],
                [True, False, False, False],
                [False, True, False, False],
                [False, False, True, False],
                [False, True, True, False],
                [False, True, True, True],
            ],
            "filter_id": ["PASS", "A", "B", "C"],
        }
        fee = filter_mod.FilterExpression(include=expression)
        result = fee.evaluate(numpify_values(data))
        nt.assert_array_equal(result, expected)

    @pytest.mark.parametrize(
        ("expression", "expected"),
        [
            # TYPE is derived from REF/ALT alleles: "=" requires all ALTs to be
            # of that type, "~" requires at least one.
            ('TYPE="ref"', [True, False, False, False, False, False]),
            ('TYPE=="ref"', [True, False, False, False, False, False]),
            ('TYPE!="ref"', [False, True, True, True, True, True]),
            ('TYPE~"ref"', [True, False, False, False, False, False]),
            ('TYPE!~"ref"', [False, True, True, True, True, True]),
            ('TYPE="snp"', [False, True, False, False, False, True]),
            ('TYPE=="snp"', [False, True, False, False, False, True]),
            ('TYPE!="snp"', [True, False, True, True, True, False]),
            ('TYPE~"snp"', [False, True, False, False, True, True]),
            ('TYPE!~"snp"', [True, False, True, True, False, False]),
        ],
    )
    def test_evaluate_type_operation(self, expression, expected):
        data = {
            "variant_allele": [
                ["A", "", "", ""],
                ["A", "T", "", ""],
                ["A", "AT", "", ""],
                ["A", "CT", "", ""],
                ["A", "T", "CT", ""],
                ["A", "T", "G", "C"],
            ],
        }
        fee = filter_mod.FilterExpression(include=expression)
        result = fee.evaluate(numpify_values(data))
        nt.assert_array_equal(result, expected)

    @pytest.mark.parametrize(
        ("expr", "expected"),
        [
            ("a == b", {"variant_a", "variant_b"}),
            ("a == b + c", {"variant_a", "variant_b", "variant_c"}),
            ("(a + 1) < (b + c) - d / a", {f"variant_{x}" for x in "abcd"}),
            ("-(a + b)", {f"variant_{x}" for x in "ab"}),
        ],
    )
    def test_referenced_fields(self, expr, expected):
        """Every VCF identifier in the expression maps to a variant_* field."""
        fe = filter_mod.FilterExpression(
            field_names={f"variant_{x}" for x in "abcd"}, include=expr
        )
        assert fe.referenced_fields == expected

    @pytest.mark.parametrize(
        ("expr", "expected"),
        [
            ("a == b", "(variant_a)==(variant_b)"),
            ("a + 1", "(variant_a)+(1)"),
            ("-a + 1", "(-(variant_a))+(1)"),
            ("a + 1 + 2", "(variant_a)+(1)+(2)"),
            ("a + (1 + 2)", "(variant_a)+((1)+(2))"),
            ("POS<10", "(variant_position)<(10)"),
            ('ID=="rs6054257"', "(variant_id)==('rs6054257')"),
        ],
    )
    def test_repr(self, expr, expected):
        """The parse tree repr shows fully parenthesised, mapped identifiers."""
        fe = filter_mod.FilterExpression(
            field_names={"variant_a", "variant_b"}, include=expr
        )
        assert repr(fe.parse_result[0]) == expected


class TestBcftoolsParser:
    """Tests for the low-level bcftools expression parser/evaluator.

    Where possible the parser's result is cross-checked against Python's own
    eval of the same expression text.
    """

    @pytest.mark.parametrize(
        "expr",
        [
            "2",
            "2 + 2",
            "(2 + 3) / 2",
            "2 / (2 + 3)",
            "1 + 1 + 1 + 1 + 1",
            "5 * (2 / 3)",
            "5 * 2 / 3",
            "1 + 2 - 3 / 4 * 5 + 6 * 7 / 8",
            "5 / (1 + 2 - 4) / (4 * 5 + 6 * 7 / 8)",
            "5 < 2",
            "5 > 2",
            "0 == 0",
            "0 != 0",
            "(1 + 2) == 0",
            "1 + 2 == 0",
            "1 + 2 == 1 + 2 + 3",
            "(1 + 2) == (1 + 2 + 3)",
            "(1 == 1) != (2 == 2)",
            "-1 == 1 + 2 - 4",
            '("x" == "x")',
            '"x"',
            '"INFO/STRING"',
        ],
    )
    def test_python_arithmetic_expressions(self, expr):
        """Constant expressions must agree with Python's eval of the same text."""
        parser = filter_mod.make_bcftools_filter_parser()
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval({})
        assert result == eval(expr)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ('("x" == "x")', {}),
            ('"x"', {}),
            ('"INFO/STRING"', {}),
            ('a == "string"', {"a": "string"}),
        ],
    )
    def test_python_string_expressions_data(self, expr, data):
        # map_vcf_identifiers=False so bare names resolve directly in *data*.
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval(data)
        assert result == eval(expr, data)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ("a", {"a": 1}),
            ("a + a", {"a": 1}),
            ("a + 2 * a - 1", {"a": 7}),
            ("a - b < a + b", {"a": 7, "b": 6}),
            ("(a - b) < (a + b)", {"a": 7, "b": 6}),
            ("(a - b) < (a + b)", {"a": 7.0, "b": 6.666}),
            ("a == a", {"a": 1}),
            ("-a == -a", {"a": 1}),
            ("-a == b", {"a": 1, "b": -1}),
        ],
    )
    def test_python_arithmetic_expressions_data(self, expr, data):
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval(data)
        assert result == eval(expr, data)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ("a", {"a": [1, 2, 3]}),
            ("a + a", {"a": [1, 2, 3]}),
            ("1 + a + a", {"a": [1, 2, 3]}),
            ("a + b", {"a": [1, 2, 3], "b": [5, 6, 7]}),
            ("(a + b) < c", {"a": [1, 2, 3], "b": [5, 6, 7], "c": [5, 10, 15]}),
        ],
    )
    def test_numpy_arithmetic_expressions_data(self, expr, data):
        """Vector operands must evaluate elementwise, matching numpy broadcasting."""
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        npdata = numpify_values(data)
        result = parsed[0].eval(npdata)
        evaled = eval(expr, npdata)
        nt.assert_array_equal(result, evaled)

    @pytest.mark.parametrize(
        ("expr", "data"),
        [
            ("call_a", {"call_a": [[[1]], [[2]], [[3]]]}),
        ],
    )
    def test_numpy_higher_dimension_arithmetic_expressions_data(self, expr, data):
        # FORMAT fields with more than two dimensions are not supported yet.
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        npdata = numpify_values(data)
        with pytest.raises(filter_mod.UnsupportedHigherDimensionalFormatFieldsError):
            parsed[0].eval(npdata)

    @pytest.mark.parametrize(
        ("expr", "expected"),
        [
            ("1 & 1", True),
            ("0 & 1", False),
            ("1 & 0", False),
            ("0 & 0", False),
            ("1 | 1", True),
            ("0 | 1", True),
            ("1 | 0", True),
            ("0 | 0", False),
            ("(1 < 2) | 0", True),
            ("(1 < 2) & 0", False),
        ],
    )
    def test_boolean_operator_expressions(self, expr, expected):
        parser = filter_mod.make_bcftools_filter_parser()
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval({})
        assert result == expected

    @pytest.mark.parametrize(
        ("expr", "data", "expected"),
        [
            ("a == b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
            ("a = b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
            ("a & b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
            ("a | b", {"a": [0, 1], "b": [1, 1]}, [True, True]),
            ("(a < 2) & (b > 1)", {"a": [0, 1], "b": [1, 2]}, [False, True]),
            # AND has precedence over OR
            ("t | f & f", {"t": [1], "f": [0]}, [True or False and False]),
            ("(t | f) & f", {"t": [1], "f": [0]}, [(True or False) and False]),
            (
                "call_a && call_b",
                {
                    "call_a": [
                        [0, 0, 0, 0],
                        [0, 0, 1, 1],
                        [0, 0, 0, 0],
                    ],
                    "call_b": [
                        [0, 0, 0, 0],
                        [0, 1, 0, 1],
                        [1, 1, 1, 1],
                    ],
                },
                [
                    [False, False, False, False],
                    [False, True, True, True],
                    # all False since condition a is not met (all 0)
                    [False, False, False, False],
                ],
            ),
            (
                "call_a || call_b",
                {
                    "call_a": [
                        [0, 0, 0, 0],
                        [0, 0, 1, 1],
                        [0, 0, 0, 0],
                    ],
                    "call_b": [
                        [0, 0, 0, 0],
                        [0, 1, 0, 1],
                        [1, 1, 1, 1],
                    ],
                },
                [
                    [False, False, False, False],
                    # all True since variant site is included
                    [True, True, True, True],
                    [True, True, True, True],
                ],
            ),
        ],
    )
    def test_boolean_operator_expressions_data(self, expr, data, expected):
        parser = filter_mod.make_bcftools_filter_parser(map_vcf_identifiers=False)
        parsed = parser.parse_string(expr, parse_all=True)
        result = parsed[0].eval(numpify_values(data))
        nt.assert_array_equal(result, expected)


class TestAPIErrors:
    """Error handling for invalid FilterExpression constructor arguments."""

    def test_include_and_exclude(self):
        with pytest.raises(ValueError, match="Cannot handle both an include "):
            filter_mod.FilterExpression(include="x", exclude="y")