├── .cirun.yml ├── .coveragerc ├── .flake8 ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── scripts │ ├── test_sgkit.py │ ├── test_sgkit_bgen.py │ ├── test_sgkit_plink.py │ └── upstream_install.py └── workflows │ ├── benchmark.yml │ ├── build-gpu.yml │ ├── build-numpy-1.yml │ ├── build.yml │ ├── check-docs.yml │ ├── cubed.yml │ ├── docs.yml │ ├── upstream.yml │ ├── validation.yml │ ├── wheels.yml │ └── windows.yml ├── .gitignore ├── .mergify.yml ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── GOVERNANCE.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── __init__.py ├── asv.conf.json ├── benchmarks_plink.py └── benchmarks_stats.py ├── conftest.py ├── docs ├── .gitignore ├── Makefile ├── _static │ ├── data-structures-xarray.jpg │ ├── docsearch.sbt.css │ ├── docsearch.sbt.js │ ├── mydask.png │ ├── numfocus_logo.png │ ├── order.png │ ├── sgkit_blue_trnsprnt.png │ ├── sgkit_trnsprnt.png │ └── switcher.json ├── about.rst ├── api.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── examples │ ├── 1kg.schema.json │ ├── gwas_tutorial.ipynb │ ├── index.rst │ └── relatedness_tutorial.ipynb ├── extensions │ └── typed_returns.py ├── getting_started.rst ├── how_do_i.rst ├── index.rst ├── news.rst ├── news │ └── introducing_sgkit.md └── user_guide.rst ├── pyproject.toml ├── requirements-dev.txt ├── requirements-doc.txt ├── requirements-numpy1-dev.txt ├── requirements-numpy1.txt ├── requirements.txt ├── sgkit ├── __init__.py ├── accelerate.py ├── cohorts.py ├── display.py ├── display_numba_fns.py ├── distance │ ├── __init__.py │ ├── api.py │ └── metrics.py ├── distarray.py ├── io │ ├── __init__.py │ ├── bgen │ │ ├── __init__.py │ │ └── bgen_reader.py │ ├── dataset.py │ ├── plink │ │ ├── __init__.py │ │ ├── plink_reader.py │ │ └── plink_writer.py │ └── utils.py ├── model.py ├── py.typed ├── stats │ ├── __init__.py │ ├── aggregation.py │ ├── aggregation_numba_fns.py │ ├── association.py │ ├── cohort_numba_fns.py │ ├── conversion.py │ ├── conversion_numba_fns.py │ 
├── genedrop.py │ ├── genedrop_numba_fns.py │ ├── genee.py │ ├── genee_momentchi2py.py │ ├── grm.py │ ├── hwe.py │ ├── ibs.py │ ├── ibs_numba_fns.py │ ├── ld.py │ ├── pc_relate.py │ ├── pca.py │ ├── pedigree.py │ ├── pedigree_numba_fns.py │ ├── popgen.py │ ├── popgen_numba_fns.py │ ├── preprocessing.py │ ├── regenie.py │ ├── truncated_svd.py │ └── utils.py ├── testing.py ├── tests │ ├── __init__.py │ ├── data │ │ └── sample.bed │ ├── io │ │ ├── __init__.py │ │ ├── bgen │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── .gitignore │ │ │ │ ├── example-no-samples.bgen │ │ │ │ ├── example-separate-samples.bgen │ │ │ │ ├── example-separate-samples.sample │ │ │ │ ├── example.bgen │ │ │ │ └── samples │ │ │ └── test_bgen_reader.py │ │ ├── data │ │ │ ├── sample.vcf.gz │ │ │ └── sample.vcf.gz.tbi │ │ ├── plink │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── example.bed │ │ │ │ ├── example.bim │ │ │ │ ├── example.fam │ │ │ │ ├── example.map │ │ │ │ ├── example.nosex │ │ │ │ ├── example.ped │ │ │ │ ├── example_with_fam.bed │ │ │ │ ├── example_with_fam.bim │ │ │ │ ├── example_with_fam.fam │ │ │ │ ├── plink_sim_10s_100v_10pmiss.bed │ │ │ │ ├── plink_sim_10s_100v_10pmiss.bim │ │ │ │ └── plink_sim_10s_100v_10pmiss.fam │ │ │ ├── test_plink_reader.py │ │ │ └── test_plink_writer.py │ │ ├── test_dataset.py │ │ └── test_vcf2zarr_compat.py │ ├── test_aggregation.py │ ├── test_association.py │ ├── test_cohort_numba_fns.py │ ├── test_cohorts.py │ ├── test_conversion.py │ ├── test_display.py │ ├── test_distance.py │ ├── test_genedrop.py │ ├── test_genee.py │ ├── test_genee │ │ ├── gene_list.csv │ │ ├── ld.csv │ │ ├── mydata.csv │ │ └── result.csv │ ├── test_genee_momentchi2py.py │ ├── test_grm.py │ ├── test_grm │ │ ├── AGHmatrix_sol100_A.csv │ │ ├── AGHmatrix_sol100_H_tau1.2_omega1.csv │ │ ├── AGHmatrix_sol100_H_tau1_omega0.9.csv │ │ ├── AGHmatrix_sol100_H_tau1_omega1.csv │ │ ├── AGHmatrix_sol30_A.csv │ │ ├── AGHmatrix_sol30_G.csv │ │ ├── AGHmatrix_sol30_H_tau0.8_omega1.csv │ │ ├── 
AGHmatrix_sol30_H_tau1_omega1.1.csv │ │ ├── AGHmatrix_sol30_H_tau1_omega1.csv │ │ ├── Legara2009_G_matrix.txt │ │ ├── Legara2009_H_matrix.txt │ │ ├── Legara2009_pedigree.txt │ │ ├── pine_snps_100_500.csv │ │ ├── pine_snps_100_500_A_matrix.txt │ │ ├── pine_snps_100_500_EJ_matrix.txt │ │ ├── sim4x_snps_A_matrix.txt │ │ └── sim4x_snps_EJ_matrix.txt │ ├── test_hwe.py │ ├── test_hwe │ │ └── sim_01.csv │ ├── test_ibs.py │ ├── test_ibs │ │ ├── hierfstat.sim1.beta.txt │ │ ├── hierfstat.sim1.dose.txt │ │ ├── hierfstat.sim2.beta.txt │ │ ├── hierfstat.sim2.dose.txt │ │ ├── hierfstat.sim3.beta.txt │ │ └── hierfstat.sim3.dose.txt │ ├── test_import_star.py │ ├── test_ld.py │ ├── test_mis.py │ ├── test_model.py │ ├── test_pc_relate.py │ ├── test_pca.py │ ├── test_pedigree.py │ ├── test_pedigree │ │ ├── hamilton_kerr_A_matrix.txt │ │ ├── hamilton_kerr_A_matrix_inv.txt │ │ ├── hamilton_kerr_inbreeding.txt │ │ ├── hamilton_kerr_kinship.txt │ │ ├── hamilton_kerr_kinship_inv.txt │ │ ├── hamilton_kerr_pedigree.csv │ │ ├── kinship2_kinship.txt │ │ ├── kinship2_pedigree.csv │ │ ├── pedkin_sim_founder.txt │ │ ├── pedkin_sim_interest.txt │ │ ├── pedkin_sim_out.txt │ │ └── pedkin_sim_ped.txt │ ├── test_popgen.py │ ├── test_preprocessing.py │ ├── test_regenie.py │ ├── test_regenie │ │ ├── config.yml │ │ ├── dataset │ │ │ ├── sim_sm_01 │ │ │ │ ├── beta_covariate.csv │ │ │ │ ├── beta_variant.csv │ │ │ │ ├── covariates.csv │ │ │ │ ├── genotypes.zarr.zip │ │ │ │ └── traits.csv │ │ │ └── sim_sm_02 │ │ │ │ ├── beta_covariate.csv │ │ │ │ ├── beta_variant.csv │ │ │ │ ├── covariates.csv │ │ │ │ ├── genotypes.zarr.zip │ │ │ │ ├── glow_offsets.zarr.zip │ │ │ │ ├── glow_offsets_nocovariate.zarr.zip │ │ │ │ └── traits.csv │ │ └── result │ │ │ ├── sim_sm_01-wgr_01 │ │ │ ├── gwas.csv │ │ │ ├── predictions.csv │ │ │ └── reduced_blocks_flat.csv.gz │ │ │ └── sim_sm_02-wgr_02 │ │ │ ├── gwas.csv │ │ │ ├── gwas_loco.csv │ │ │ ├── gwas_loco_nocovariate.csv │ │ │ ├── predictions.csv │ │ │ └── 
reduced_blocks_flat.csv.gz │ ├── test_stats_utils.py │ ├── test_testing.py │ ├── test_utils.py │ ├── test_variables.py │ └── test_window.py ├── typing.py ├── utils.py ├── variables.py └── window.py └── validation ├── __init__.py └── gwas ├── __init__.py ├── docker ├── Dockerfile ├── README.md ├── environment-dev.yml ├── environment-glow.yml ├── environment-hail.yml └── environment.yml └── method ├── __init__.py ├── hwe ├── Makefile ├── README.md ├── __init__.py ├── chwe.c ├── chwe.o ├── data │ └── sim_01.csv ├── hwe_unit_test.ipynb ├── invoke.yaml ├── logging.ini └── tasks.py ├── pc_relate ├── Dockerfile ├── README.md ├── convert_plink_to_gds.R ├── pc_relate.R ├── run.sh └── validate_pc_relate.py ├── regenie ├── .gitignore ├── README.md ├── __init__.py ├── config.yml ├── glow_wgr.py ├── hail_sim.py ├── invoke.yaml ├── logging.ini ├── sgkit_zarr.py ├── tasks.py └── unit_test_dev.ipynb └── regenie_loco_regression ├── GlowGR_continuous.ipynb ├── README.md └── environment.yml /.cirun.yml: -------------------------------------------------------------------------------- 1 | # Self-Hosted Github Action Runners on GCP via Cirun.io 2 | # Reference: https://docs.cirun.io/reference/yaml 3 | runners: 4 | - name: gpu-runner 5 | # Cloud Provider: GCP 6 | cloud: gcp 7 | # Cheapest GPU on GCP 8 | gpu: nvidia-tesla-t4 9 | # Cheapest VM on GCP, with GPU attachable 10 | instance_type: n1-standard-1 11 | # Custom image with NVIDIA drivers installed on Ubuntu-20.4 12 | # to reduce provision time 13 | # Format => project_name:image_name 14 | machine_image: sgkit-dev:cirun-nvidia-v2 15 | region: 16 | - us-central1-a 17 | - us-central1-b 18 | - us-central1-c 19 | - us-central1-f 20 | - us-east1-c 21 | - us-east1-d 22 | - us-east4-a 23 | - us-east4-b 24 | - us-east4-c 25 | - us-west1-a 26 | - us-west1-b 27 | - us-west2-b 28 | - us-west2-c 29 | - us-west4-a 30 | - us-west4-b 31 | # preemptible instances seems quite less reliable. 
32 | preemptible: false 33 | # Adding the GPU label, this matches the runs-on param from .github/workflows/build-gpu.yml 34 | labels: 35 | - cirun-gpu-runner 36 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | sgkit/tests/* 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # whitespace before ':' - doesn't work well with black 4 | E203 5 | E402 6 | # line too long - let black worry about that 7 | E501 8 | # do not assign a lambda expression, use a def 9 | E731 10 | # line break before binary operator 11 | W503 12 | 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [ ] Fixes #xxxx 4 | - [ ] Tests added 5 | - [ ] User visible changes (including notable bug fixes) are documented in `changelog.rst` 6 | - [ ] New functions are listed in `api.rst` 7 | -------------------------------------------------------------------------------- /.github/scripts/test_sgkit.py: -------------------------------------------------------------------------------- 1 | import sgkit as sg 2 | 3 | if __name__ == "__main__": 4 | ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=50, n_contig=23) 5 | print(ds) 6 | -------------------------------------------------------------------------------- /.github/scripts/test_sgkit_bgen.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | from sgkit.io.bgen import read_bgen 4 | 5 | if __name__ == "__main__": 6 | urllib.request.urlretrieve( 7 | 
"https://github.com/sgkit-dev/sgkit/raw/main/sgkit/tests/io/bgen/data/example.bgen", 8 | "example.bgen", 9 | ) 10 | ds = read_bgen("example.bgen") 11 | print(ds) 12 | -------------------------------------------------------------------------------- /.github/scripts/test_sgkit_plink.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | from sgkit.io.plink import read_plink 4 | 5 | if __name__ == "__main__": 6 | for ext in (".bed", ".bim", ".fam"): 7 | urllib.request.urlretrieve( 8 | f"https://github.com/sgkit-dev/sgkit/raw/main/sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss{ext}", 9 | f"plink_sim_10s_100v_10pmiss{ext}", 10 | ) 11 | ds = read_plink(path="plink_sim_10s_100v_10pmiss") 12 | print(ds) 13 | -------------------------------------------------------------------------------- /.github/scripts/upstream_install.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from pathlib import Path 4 | 5 | 6 | def install_deps() -> None: 7 | # NOTE: need to use legacy-resolver due to https://github.com/dask/community/issues/124 8 | install_cmd = ( 9 | sys.executable, 10 | "-m", 11 | "pip", 12 | "install", 13 | "--use-deprecated=legacy-resolver", 14 | "--upgrade", 15 | ) 16 | upstream_deps = ( 17 | "git+https://github.com/dask/dask.git#egg=dask[array,dataframe]", 18 | "git+https://github.com/dask/distributed.git#egg=distributed", 19 | "git+https://github.com/pandas-dev/pandas#egg=pandas", 20 | "git+https://github.com/pangeo-data/rechunker.git#egg=rechunker", 21 | "git+https://github.com/pydata/xarray.git#egg=xarray", 22 | "git+https://github.com/zarr-developers/zarr-python.git#egg=zarr", 23 | ) 24 | full_cmd_upstream = install_cmd + upstream_deps 25 | print(f"Install upstream dependencies via: {full_cmd_upstream}") 26 | subprocess.check_call(full_cmd_upstream) 27 | req_deps = set(Path("requirements.txt").read_text().splitlines()) 28 | 
req_upstream = [x.split("egg=")[-1].strip() for x in upstream_deps] 29 | req_left = tuple(x for x in req_deps if not any(y in x for y in req_upstream)) 30 | full_cmd_left_over = install_cmd + req_left 31 | print(f"Install left over dependencies via: {full_cmd_left_over}") 32 | subprocess.check_call(full_cmd_left_over) 33 | 34 | 35 | def install_self() -> None: 36 | install_cmd = ( 37 | sys.executable, 38 | "-m", 39 | "pip", 40 | "install", 41 | "--no-deps", 42 | "-e" ".", 43 | ) 44 | print(f"Install sgkit via: `{install_cmd}`") 45 | subprocess.check_call(install_cmd) 46 | 47 | 48 | if __name__ == "__main__": 49 | install_deps() 50 | install_self() 51 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmarks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} 10 | BENCHMARKS_REPO: sgkit-dev/sgkit-benchmarks-asv 11 | ASV_CONFIG: benchmarks/asv.conf.json 12 | MACHINE_NAME: github-actions # to identify github actions machine as hostname changes everytime 13 | 14 | jobs: 15 | build: 16 | # This workflow only runs on the origin org 17 | if: github.repository_owner == 'sgkit-dev' 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 # To fetch all commits to be able to generate benchmarks html 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: "3.10" 27 | - name: Install dependencies 28 | run: | 29 | sudo apt update -y 30 | python -m pip install --upgrade pip 31 | pip install asv 32 | 33 | - name: Set and log asv machine configuration 34 | run: | 35 | asv machine --yes --config benchmarks/asv.conf.json 36 | echo "Machine Configuration:" 37 | cat ~/.asv-machine.json 38 | rm ~/.asv-machine.json 39 | 40 | echo "Setting machine name to 
$MACHINE_NAME" 41 | asv machine --machine $MACHINE_NAME --yes --config $ASV_CONFIG -v 42 | 43 | - name: Run benchmarks 44 | run: | 45 | asv run --config $ASV_CONFIG -v 46 | 47 | - name: Copy benchmarks to benchmarks repo directory 48 | run: | 49 | git clone https://$GITHUB_TOKEN@github.com/$BENCHMARKS_REPO.git ~/$BENCHMARKS_REPO 50 | RESULTS_DIR=~/$BENCHMARKS_REPO/results 51 | if [ -d "$RESULTS_DIR" ] 52 | then 53 | cp -r $RESULTS_DIR/$MACHINE_NAME/* benchmarks/results/$MACHINE_NAME/ 54 | else 55 | echo "results/ directory does not exist in the benchmarks repository" 56 | fi 57 | asv publish --config $ASV_CONFIG -v 58 | cp -r benchmarks/html/* ~/$BENCHMARKS_REPO/ 59 | cp -r benchmarks/results ~/$BENCHMARKS_REPO/ 60 | 61 | - name: Push benchmarks 62 | run: | 63 | cd ~/$BENCHMARKS_REPO 64 | git add . 65 | git config --global user.email "project@sgkit.dev" 66 | git config --global user.name "sgkit benchmark bot" 67 | git commit -m "Update benchmarks" 68 | git push origin main 69 | -------------------------------------------------------------------------------- /.github/workflows/build-gpu.yml: -------------------------------------------------------------------------------- 1 | name: Build GPU 2 | 3 | on: 4 | workflow_dispatch: 5 | # Disabled on 2024-10-14 as this has been broken for over six months 6 | # https://github.com/sgkit-dev/sgkit/issues/1270 7 | # push: 8 | # branches: 9 | # - main 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: "cirun-gpu-runner--${{ github.run_id }}" 15 | defaults: 16 | run: 17 | shell: bash -l {0} 18 | strategy: 19 | matrix: 20 | python-version: ["3.10"] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Run Nvidia-smi 26 | run: | 27 | nvidia-smi 28 | 29 | - name: Set up Python 30 | uses: conda-incubator/setup-miniconda@v2.2.0 31 | env: 32 | CONDA: /home/runnerx/miniconda3 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | miniconda-version: "latest" 36 | 37 | - name: Conda info 38 | run: | 39 | conda info 40 
| conda list 41 | 42 | - name: Installing cudatoolkit and dependencies 43 | run: | 44 | conda install -c nvidia cudatoolkit 45 | pip install -r requirements.txt -r requirements-dev.txt 46 | 47 | - name: Numba Information 48 | run: | 49 | numba -s 50 | 51 | - name: Run GPU tagged tests 52 | run: | 53 | pytest -m gpu -v 54 | -------------------------------------------------------------------------------- /.github/workflows/build-numpy-1.yml: -------------------------------------------------------------------------------- 1 | name: Build NumPy 1 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | # Scheduled runs only on the origin org 10 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ["3.10", "3.11"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r requirements-numpy1.txt -r requirements-numpy1-dev.txt 26 | # - name: Run pre-commit 27 | # uses: pre-commit/action@v3.0.1 28 | - name: Test with pytest and coverage 29 | run: | 30 | pytest -v --cov=sgkit --cov-report=term-missing 31 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Run at the end of every day 8 | - cron: "0 0 * * *" 9 | 10 | jobs: 11 | build: 12 | # Scheduled runs only on the origin org 13 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | 
python-version: ["3.10", "3.11", "3.12"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt -r requirements-dev.txt 29 | - name: Run pre-commit 30 | uses: pre-commit/action@v3.0.1 31 | - name: Test with pytest and coverage 32 | run: | 33 | pytest -v --cov=sgkit --cov-report=term-missing 34 | - name: Upload coverage to Codecov 35 | uses: codecov/codecov-action@v3 36 | with: 37 | token: ${{ secrets.CODECOV_TOKEN }} 38 | 39 | test-zarr-version: 40 | name: Test Zarr Python v3 41 | # Scheduled runs only on the origin org 42 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 43 | runs-on: ubuntu-latest 44 | strategy: 45 | matrix: 46 | zarr: [">=3"] 47 | steps: 48 | - uses: actions/checkout@v4 49 | - uses: actions/setup-python@v5 50 | with: 51 | python-version: '3.11' 52 | - name: Install dependencies 53 | run: | 54 | python -m pip install --upgrade pip 55 | pip install -r requirements.txt -r requirements-dev.txt 56 | - name: Install zarr${{ matrix.zarr }} 57 | run: | 58 | python -m pip install --pre 'zarr${{ matrix.zarr }}' 59 | python -m pip uninstall -y bio2zarr # TODO: remove when bio2zarr supports Zarr Python 3 60 | - name: Run tests 61 | run: | 62 | pytest 63 | -------------------------------------------------------------------------------- /.github/workflows/check-docs.yml: -------------------------------------------------------------------------------- 1 | name: Check docs 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Run at the end of every day 8 | - cron: "0 0 * * *" 9 | 10 | jobs: 11 | build: 12 | # Scheduled runs only on the origin org 13 | if: (github.event_name == 'schedule' && github.repository_owner == 
'sgkit-dev') || (github.event_name != 'schedule') 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | sudo apt update -y 28 | sudo apt install graphviz # Needed for documentation 29 | python -m pip install --upgrade pip 30 | pip install -r requirements.txt -r requirements-dev.txt -r requirements-doc.txt 31 | pip install -U dask distributed # need latest versions to successully build docs 32 | - name: Run pre-commit 33 | uses: pre-commit/action@v3.0.1 34 | - name: Check for Sphinx doc warnings 35 | run: | 36 | cd docs 37 | make html SPHINXOPTS="-W --keep-going -n" 38 | - uses: actions/upload-artifact@v4 39 | if: failure() 40 | with: 41 | name: gwas_tutorial 42 | path: /home/runner/work/sgkit/sgkit/docs/_build/html/reports/examples/gwas_tutorial.err.log 43 | - uses: actions/upload-artifact@v4 44 | if: failure() 45 | with: 46 | name: relatedness_tutorial 47 | path: /home/runner/work/sgkit/sgkit/docs/_build/html/reports/examples/relatedness_tutorial.err.log 48 | -------------------------------------------------------------------------------- /.github/workflows/cubed.yml: -------------------------------------------------------------------------------- 1 | name: Cubed 2 | 3 | on: 4 | push: 5 | pull_request: 6 | # manual trigger 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | # This workflow only runs on the origin org 12 | # if: github.repository_owner == 'sgkit-dev' 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install deps and 
sgkit 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install -r requirements.txt -r requirements-dev.txt 29 | python -m pip install -U git+https://github.com/cubed-dev/cubed.git -U git+https://github.com/cubed-dev/cubed-xarray.git -U git+https://github.com/pydata/xarray.git 30 | 31 | - name: Test with pytest 32 | run: | 33 | pytest -v sgkit/tests/test_{aggregation,association,hwe,pca,window}.py \ 34 | -k "test_count_call_alleles or \ 35 | test_gwas_linear_regression or \ 36 | test_hwep or \ 37 | test_sample_stats or \ 38 | (test_count_variant_alleles and not test_count_variant_alleles__chunked[call_genotype]) or \ 39 | (test_variant_stats and not test_variant_stats__chunks[chunks2-False]) or \ 40 | (test_pca__array_backend and tsqr) or \ 41 | (test_window and not 12-5-4-4)" \ 42 | --use-cubed 43 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.11" 19 | - name: Install dependencies 20 | run: | 21 | sudo apt update -y 22 | sudo apt install graphviz # Needed for documentation 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt -r requirements-dev.txt -r requirements-doc.txt 25 | pip install -U dask distributed # need latest versions to successully build docs 26 | - name: Build Sphinx documentation 27 | run: | 28 | cd docs 29 | make html SPHINXOPTS="-W --keep-going -n" 30 | - name: Commit documentation changes to gh-pages branch 31 | run: | 32 | git clone https://github.com/sgkit-dev/sgkit.git --branch gh-pages --single-branch gh-pages 33 | mkdir -p gh-pages/latest 34 | cp -r 
docs/_build/html/* gh-pages/latest 35 | cd gh-pages 36 | git config --local user.email "action@github.com" 37 | git config --local user.name "GitHub Action" 38 | git add . 39 | git commit -m "Update latest documentation" -a || true # Ignore error if no changes present 40 | - name: Push changes 41 | uses: ad-m/github-push-action@master 42 | with: 43 | branch: gh-pages 44 | directory: gh-pages 45 | force: true 46 | github_token: ${{ secrets.GITHUB_TOKEN }} 47 | -------------------------------------------------------------------------------- /.github/workflows/upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream 2 | 3 | on: 4 | push: 5 | schedule: 6 | - cron: "0 1 * * *" 7 | # manual trigger 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | # This workflow only runs on the origin org 13 | if: github.repository_owner == 'sgkit-dev' 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install deps and sgkit 27 | run: | 28 | sudo apt update -y 29 | python -m pip install --upgrade pip 30 | python .github/scripts/upstream_install.py 31 | python -m pip install -r requirements-dev.txt 32 | 33 | - name: Test with pytest 34 | run: | 35 | python -m pip freeze 36 | pytest -v 37 | -------------------------------------------------------------------------------- /.github/workflows/validation.yml: -------------------------------------------------------------------------------- 1 | name: Validation 2 | 3 | on: 4 | # schedule: 5 | # Run at the end of every day 6 | # Disabled on 2024-09-02 as this has been broken for over a year, and no-one is interested 7 | # in fixing it. 
https://github.com/sgkit-dev/sgkit/issues/1112 8 | # - cron: "0 0 * * *" 9 | # manual trigger 10 | workflow_dispatch: 11 | 12 | jobs: 13 | validation_suite: 14 | # This workflow only runs on the origin org 15 | if: github.repository_owner == 'sgkit-dev' 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: google-github-actions/setup-gcloud@v0 20 | with: 21 | project_id: ${{ secrets.GCP_PROJECT_ID }} 22 | service_account_key: ${{ secrets.GCP_SA_KEY }} 23 | export_default_credentials: true 24 | - name: Download public test data (real HapMap data) 25 | run: gsutil -u $GCLOUD_PROJECT cp gs://sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip /tmp/ 26 | - name: Validate PC Relate 27 | run: ./validation/gwas/method/pc_relate/run.sh /tmp/hapmap_JPT_CHB_r23a_filtered.zip 28 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | - test 9 | tags: 10 | - '*' 11 | release: 12 | types: [published] 13 | 14 | jobs: 15 | build: 16 | # This workflow only runs on the origin org 17 | if: github.repository_owner == 'sgkit-dev' 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ["3.10"] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install setuptools twine wheel build 33 | - name: Build a source distribution and a wheel 34 | run: | 35 | python -m build --sdist --wheel 36 | python -m twine check --strict dist/* 37 | - name: Upload artifacts 38 | uses: actions/upload-artifact@v4 39 | with: 40 | path: dist 41 | 42 | unix-test: 43 | # This workflow only runs 
on the origin org 44 | if: github.repository_owner == 'sgkit-dev' 45 | needs: ['build'] 46 | strategy: 47 | matrix: 48 | os: [ubuntu-latest, macos-latest] 49 | python-version: ["3.10", "3.11"] 50 | runs-on: ${{ matrix.os }} 51 | steps: 52 | # checkout repo to subdirectory to get access to scripts 53 | - uses: actions/checkout@v4 54 | with: 55 | path: sgkit-copy 56 | - name: Download artifacts 57 | uses: actions/download-artifact@v4.1.7 58 | - name: Set up Python ${{ matrix.python-version }} 59 | uses: actions/setup-python@v5 60 | with: 61 | python-version: ${{ matrix.python-version }} 62 | - name: Install wheel and test 63 | run: | 64 | python -VV 65 | # Install the local wheel 66 | wheel=$(ls artifact/sgkit-*.whl) 67 | pip install ${wheel} ${wheel}[bgen] ${wheel}[plink] 68 | python sgkit-copy/.github/scripts/test_sgkit.py 69 | python sgkit-copy/.github/scripts/test_sgkit_bgen.py 70 | python sgkit-copy/.github/scripts/test_sgkit_plink.py 71 | 72 | windows-test: 73 | # This workflow only runs on the origin org 74 | if: github.repository_owner == 'sgkit-dev' 75 | runs-on: windows-latest 76 | needs: ['build'] 77 | strategy: 78 | matrix: 79 | python-version: ["3.10"] 80 | steps: 81 | # checkout repo to subdirectory to get access to scripts 82 | - uses: actions/checkout@v4 83 | with: 84 | path: sgkit-copy 85 | - name: Download artifacts 86 | uses: actions/download-artifact@v4.1.7 87 | - name: Set up Python ${{ matrix.python-version }} 88 | uses: actions/setup-python@v5 89 | with: 90 | python-version: ${{ matrix.python-version }} 91 | - name: Install wheel and test 92 | run: | 93 | python -VV 94 | # Install the local wheel 95 | $env:wheel = $(ls artifact/sgkit-*.whl) 96 | pip install $env:wheel "$env:wheel[bgen]" "$env:wheel[plink]" 97 | python sgkit-copy/.github/scripts/test_sgkit.py 98 | python sgkit-copy/.github/scripts/test_sgkit_bgen.py 99 | python sgkit-copy/.github/scripts/test_sgkit_plink.py 100 | 101 | 102 | pypi-upload: 103 | if: github.repository_owner == 
'sgkit-dev' 104 | runs-on: ubuntu-latest 105 | needs: ['unix-test', 'windows-test'] 106 | steps: 107 | - name: Download all 108 | uses: actions/download-artifact@v4.1.7 109 | - name: Move to dist 110 | run: | 111 | mkdir dist 112 | cp */*.{whl,gz} dist/. 113 | - name: Publish package to TestPyPI 114 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 115 | uses: pypa/gh-action-pypi-publish@release/v1 116 | with: 117 | user: __token__ 118 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 119 | repository_url: https://test.pypi.org/legacy/ 120 | - name: Publish package to PyPI 121 | if: github.event_name == 'release' 122 | uses: pypa/gh-action-pypi-publish@release/v1 123 | with: 124 | user: __token__ 125 | password: ${{ secrets.PYPI_API_TOKEN }} 126 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Windows 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Run at the end of every day 8 | - cron: "0 0 * * *" 9 | 10 | jobs: 11 | win_build: 12 | # Scheduled runs only on the origin org 13 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 14 | runs-on: windows-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Miniconda with Python version ${{ matrix.python-version }} 22 | uses: conda-incubator/setup-miniconda@v2 23 | with: 24 | auto-update-conda: true 25 | channels: conda-forge,numba 26 | miniconda-version: "latest" 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | # activate conda 30 | shell: bash -l {0} 31 | # conda can't install all dev tools, so we need to split it between conda and pip 32 | run: | 33 | conda install --file requirements.txt msprime 34 | pip install -r requirements-dev.txt 35 | 
- name: Test with pytest 36 | # activate conda 37 | shell: bash -l {0} 38 | # To avoid: 'UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1' 39 | env: 40 | OMP_NUM_THREADS: 1 41 | run: | 42 | pytest -v 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | datasets/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # IDE 133 | .vscode 134 | .idea 135 | .DS_Store 136 | 137 | # sgkit 138 | docs/generated 139 | docs/mydask.png 140 | docs/order.png 141 | benchmarks/html 142 | benchmarks/results 143 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: default 3 | conditions: 4 | - base=main 5 | - status-success=build (3.10) 6 | - status-success=build (3.11) 7 | - status-success=win_build (3.10) 8 | - approved-reviews-by=@sgkit-dev/committers 9 | - "#approved-reviews-by>=1" 10 | - label=auto-merge 11 | 12 | pull_request_rules: 13 | - name: automatic merge 14 | conditions: 15 | - base=main 16 | - status-success=build (3.10) 17 | - status-success=build (3.11) 18 | - status-success=win_build (3.10) 19 | - approved-reviews-by=@sgkit-dev/committers 20 | - "#approved-reviews-by>=1" 21 | - label=auto-merge 22 | actions: 23 | queue: 24 | name: default 25 | method: rebase 26 | - name: deleting merged branch 27 | conditions: 28 | - merged 29 | actions: 30 | delete_head_branch: {} 31 | - name: ping author on conflicts 32 | conditions: 33 | - conflict 34 | actions: 35 | comment: 36 | message: This PR has conflicts, @{{author}} please rebase and push updated version 🙏 37 | label: 38 | add: 39 | - conflict 40 | - name: remove conflict label if not needed 
41 | conditions: 42 | - -conflict 43 | actions: 44 | label: 45 | remove: 46 | - conflict 47 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: debug-statements 7 | - id: mixed-line-ending 8 | - id: check-case-conflict 9 | - id: check-yaml 10 | - repo: https://github.com/timothycrosley/isort 11 | rev: 5.12.0 12 | hooks: 13 | - id: isort 14 | - repo: https://github.com/python/black 15 | rev: 23.1.0 16 | hooks: 17 | - id: black 18 | language_version: python3 19 | - repo: https://github.com/pycqa/flake8 20 | rev: 6.1.0 21 | hooks: 22 | - id: flake8 23 | language_version: python3 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. 2 | 3 | For general information on how to contribute see https://sgkit-dev.github.io/sgkit/latest/contributing.html. 4 | -------------------------------------------------------------------------------- /GOVERNANCE.md: -------------------------------------------------------------------------------- 1 | Please see our [code of conduct](https://github.com/sgkit-dev/.github/blob/master/CODE_OF_CONDUCT.md) for more information. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune .github 2 | prune sgkit/tests 3 | prune validation 4 | exclude .coveragerc .gitignore .mergify.yml .pre-commit-config.yaml conftest.py 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sgkit: Scalable genetics toolkit in Python 2 | [![Build status](https://github.com/sgkit-dev/sgkit/workflows/Build/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Build%22+branch%3Amain) 3 | [![Windows build status](https://github.com/sgkit-dev/sgkit/workflows/Windows/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Windows%22+branch%3Amain) 4 | [![Documentation status](https://github.com/sgkit-dev/sgkit/workflows/Docs/badge.svg?branch=main)](https://sgkit-dev.github.io/sgkit/) 5 | [![Validation status](https://github.com/sgkit-dev/sgkit/workflows/Validation/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Validation%22+branch%3Amain) 6 | [![Upstream status](https://github.com/sgkit-dev/sgkit/workflows/Upstream/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Upstream%22+branch%3Amain) 7 | [![asv](https://img.shields.io/badge/Benchmarked%20by-asv-green.svg?style=flat)](https://sgkit-dev.github.io/sgkit-benchmarks-asv/) 8 | [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](http://numfocus.org) 9 | 10 | Sgkit is a Python package that provides a variety of analytical genetics methods through the use of 11 | general-purpose frameworks such as [Xarray](http://xarray.pydata.org/en/stable/), [Pandas](https://pandas.pydata.org/docs/), 12 | 
[Dask](https://docs.dask.org/en/latest/) and [Zarr](https://zarr.readthedocs.io/en/stable/). 13 | 14 | For more information on using sgkit, see the [documentation](https://sgkit-dev.github.io/sgkit/). 15 | 16 | [//]: # (numfocus-fiscal-sponsor-attribution) 17 | 18 | The sgkit project uses a [custom governance model](./GOVERNANCE.md) 19 | and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). Consider making 20 | a [tax-deductible donation](https://numfocus.org/donate-to-sgkit) to help the project 21 | pay for developer time, professional services, travel, workshops, and a variety of other needs. 22 | 23 |
24 | 25 | 28 | 29 |
30 |
31 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmarks_plink.py: -------------------------------------------------------------------------------- 1 | """Benchmark suite for PLINK module.""" 2 | import tempfile 3 | import time 4 | from pathlib import Path 5 | 6 | from sgkit.io.plink.plink_writer import write_plink 7 | from sgkit.testing import simulate_genotype_call_dataset 8 | 9 | 10 | class PlinkSpeedSuite: 11 | def setup(self) -> None: 12 | self.ds = simulate_genotype_call_dataset( 13 | n_variant=1000000, n_sample=1000, seed=0 14 | ) 15 | 16 | self.dir = Path(tempfile.mkdtemp()) 17 | self.output_plink = self.dir / "plink_out" 18 | 19 | # use track_* asv methods since we want to measure speed (MB/s) not time 20 | 21 | def track_write_plink_speed(self) -> float: 22 | # throw away first run due to numba jit compilation 23 | for _ in range(2): 24 | duration = _time_func(write_plink, self.ds, path=self.output_plink) 25 | return _to_mb_per_s(get_dir_size(self.dir), duration) 26 | 27 | 28 | def _time_func(func, *args, **kwargs): 29 | start = time.time() 30 | func(*args, **kwargs) 31 | end = time.time() 32 | return end - start 33 | 34 | 35 | def _to_mb_per_s(bytes, duration): 36 | return bytes / (1_000_000 * duration) 37 | 38 | 39 | def get_dir_size(dir): 40 | return sum(f.stat().st_size for f in dir.glob("**/*") if f.is_file()) 41 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_stats.py: -------------------------------------------------------------------------------- 1 | """Benchmark suite for stats module.""" 2 | 3 | import numpy as np 4 | import xarray as xr 5 |
6 | from sgkit import ( 7 | count_call_alleles, 8 | count_cohort_alleles, 9 | simulate_genotype_call_dataset, 10 | ) 11 | 12 | 13 | class TimeSuite: 14 | def setup(self) -> None: 15 | self.count_call_alleles_ds = simulate_genotype_call_dataset( 16 | n_variant=100_000, n_sample=1000 17 | ) 18 | self.count_cohort_alleles_ds = simulate_genotype_call_dataset( 19 | n_variant=100_000, n_sample=1000 20 | ) 21 | sample_cohort = np.repeat( 22 | [0, 1], self.count_cohort_alleles_ds.dims["samples"] // 2 23 | ) 24 | self.count_cohort_alleles_ds["sample_cohort"] = xr.DataArray( 25 | sample_cohort, dims="samples" 26 | ) 27 | 28 | def time_count_call_alleles(self) -> None: 29 | count_call_alleles(self.count_call_alleles_ds) 30 | 31 | def time_count_cohort_alleles(self) -> None: 32 | count_cohort_alleles(self.count_cohort_alleles_ds) 33 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | collect_ignore_glob = ["benchmarks/**", ".github/scripts/*.py"] 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--use-cubed", action="store_true", default=False, help="run with cubed" 7 | ) 8 | 9 | 10 | def use_cubed(): 11 | import dask 12 | import xarray as xr 13 | 14 | # set xarray to use cubed by default 15 | xr.set_options(chunk_manager="cubed") 16 | 17 | # ensure that dask compute raises if it is ever called 18 | class AlwaysRaiseScheduler: 19 | def __call__(self, dsk, keys, **kwargs): 20 | raise RuntimeError("Dask 'compute' was called") 21 | 22 | dask.config.set(scheduler=AlwaysRaiseScheduler()) 23 | 24 | 25 | def pytest_configure(config) -> None: # type: ignore 26 | # Add "gpu" marker 27 | config.addinivalue_line("markers", "gpu:Run tests that run on GPU") 28 | 29 | if config.getoption("--use-cubed"): 30 | use_cubed() 31 | -------------------------------------------------------------------------------- /docs/.gitignore: 
-------------------------------------------------------------------------------- 1 | savefig/ 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | PYPATH=${PWD}/.. 5 | 6 | # You can set these variables from the command line, and also 7 | # from the environment for the first two. 8 | SPHINXOPTS ?= 9 | SPHINXBUILD ?= sphinx-build 10 | SOURCEDIR = . 11 | BUILDDIR = _build 12 | 13 | # Put it first so that "make" without argument is like "make help". 14 | help: 15 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 16 | 17 | .PHONY: help Makefile 18 | 19 | .PHONY: clean 20 | clean: 21 | rm -rf $(BUILDDIR)/* 22 | rm -rf generated/* 23 | 24 | # Catch-all target: route all unknown targets to Sphinx using the new 25 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 26 | %: Makefile 27 | @PYTHONPATH=${PYPATH} $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 28 | -------------------------------------------------------------------------------- /docs/_static/data-structures-xarray.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/data-structures-xarray.jpg -------------------------------------------------------------------------------- /docs/_static/docsearch.sbt.css: -------------------------------------------------------------------------------- 1 | #site-navigation { overflow: visible; } 2 | -------------------------------------------------------------------------------- /docs/_static/docsearch.sbt.js: -------------------------------------------------------------------------------- 1 | docsearch({ 2 | apiKey: 'b547668ae472e6a13ae311fb4a8928a3', 3 | indexName: 'sgkit', 4 | inputSelector: 
'#search-input', 5 | debug: false // Set debug to true if you want to inspect the dropdown 6 | }); 7 | -------------------------------------------------------------------------------- /docs/_static/mydask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/mydask.png -------------------------------------------------------------------------------- /docs/_static/numfocus_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/numfocus_logo.png -------------------------------------------------------------------------------- /docs/_static/order.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/order.png -------------------------------------------------------------------------------- /docs/_static/sgkit_blue_trnsprnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/sgkit_blue_trnsprnt.png -------------------------------------------------------------------------------- /docs/_static/sgkit_trnsprnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/sgkit_trnsprnt.png -------------------------------------------------------------------------------- /docs/_static/switcher.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "version": "latest", 4 | "url": "https://sgkit-dev.github.io/sgkit/latest/" 5 | }, 6 | { 7 | "name": "0.10.0 (stable)", 
8 | "version": "0.10.0", 9 | "url": "https://sgkit-dev.github.io/sgkit/0.10.0/" 10 | }, 11 | { 12 | "name": "0.9.0", 13 | "version": "0.9.0", 14 | "url": "https://sgkit-dev.github.io/sgkit/0.9.0/" 15 | }, 16 | { 17 | "name": "0.8.0", 18 | "version": "0.8.0", 19 | "url": "https://sgkit-dev.github.io/sgkit/0.8.0/" 20 | }, 21 | { 22 | "name": "0.7.0", 23 | "version": "0.7.0", 24 | "url": "https://sgkit-dev.github.io/sgkit/0.7.0/" 25 | }, 26 | { 27 | "name": "0.6.0", 28 | "version": "0.6.0", 29 | "url": "https://sgkit-dev.github.io/sgkit/0.6.0/" 30 | }, 31 | { 32 | "name": "0.5.0", 33 | "version": "0.5.0", 34 | "url": "https://sgkit-dev.github.io/sgkit/0.5.0/" 35 | }, 36 | { 37 | "version": "0.4.0", 38 | "url": "https://sgkit-dev.github.io/sgkit/0.4.0/" 39 | }, 40 | { 41 | "version": "0.3.0", 42 | "url": "https://sgkit-dev.github.io/sgkit/0.3.0/" 43 | }, 44 | { 45 | "version": "0.2.0a1", 46 | "url": "https://sgkit-dev.github.io/sgkit/0.2.0a1/" 47 | }, 48 | { 49 | "version": "0.1.0a1", 50 | "url": "https://sgkit-dev.github.io/sgkit/0.1.0a1/" 51 | } 52 | ] -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | .. _about: 2 | 3 | ***** 4 | About 5 | ***** 6 | 7 | .. image:: _static/numfocus_logo.png 8 | :scale: 50 % 9 | :target: https://numfocus.org/ 10 | 11 | Sgkit is a fiscally sponsored project of NumFOCUS_, a nonprofit dedicated 12 | to supporting the open-source scientific computing community. If you like 13 | sgkit and want to support our mission, please consider making a donation_ 14 | to support our efforts. 15 | 16 | NumFOCUS is 501(c)(3) non-profit charity in the United States; as such, 17 | donations to NumFOCUS are tax-deductible as allowed by law. As with any 18 | donation, you should consult with your personal tax adviser or the IRS 19 | about your particular tax situation. 20 | 21 | .. _NumFOCUS: https://numfocus.org 22 | .. 
_donation: https://numfocus.org/donate-to-sgkit -------------------------------------------------------------------------------- /docs/examples/index.rst: -------------------------------------------------------------------------------- 1 | ######## 2 | Examples 3 | ######## 4 | 5 | Example notebooks showing how to use sgkit. 6 | 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :hidden: 11 | 12 | gwas_tutorial 13 | relatedness_tutorial 14 | -------------------------------------------------------------------------------- /docs/extensions/typed_returns.py: -------------------------------------------------------------------------------- 1 | """ 2 | This extension is taken directly from scanpy here: 3 | https://github.com/theislab/scanpy/blob/5533b644e796379fd146bf8e659fd49f92f718cd/docs/extensions/typed_returns.py 4 | 5 | to fix this issue: https://github.com/theislab/scanpydoc/issues/7 6 | """ 7 | import re 8 | from typing import Iterator, List 9 | 10 | from sphinx.application import Sphinx 11 | from sphinx.ext.napoleon import NumpyDocstring 12 | 13 | 14 | def process_return(lines: List[str]) -> Iterator[str]: 15 | for line in lines: 16 | m = re.fullmatch(r"(?P\w+)\s+:\s+(?P[\w.]+)", line) 17 | if m: 18 | # Once this is in scanpydoc, we can use the fancy hover stuff 19 | yield f'**{m["param"]}** : :class:`~{m["type"]}`' 20 | else: 21 | yield line 22 | 23 | 24 | def scanpy_parse_returns_section(self: NumpyDocstring, section: str) -> List[str]: 25 | lines_raw = list(process_return(self._dedent(self._consume_to_next_section()))) 26 | lines: List[str] = self._format_block(":returns: ", lines_raw) 27 | if lines and lines[-1]: 28 | lines.append("") 29 | return lines 30 | 31 | 32 | def setup(app: Sphinx) -> None: 33 | NumpyDocstring._parse_returns_section = scanpy_parse_returns_section 34 | -------------------------------------------------------------------------------- /docs/how_do_i.rst: -------------------------------------------------------------------------------- 1 | .. 
currentmodule:: sgkit 2 | 3 | .. _how_do_i: 4 | 5 | ************ 6 | How do I ... 7 | ************ 8 | 9 | .. contents:: 10 | :local: 11 | 12 | Create a test dataset? 13 | ---------------------- 14 | 15 | Call :py:func:`simulate_genotype_call_dataset` to create a test :class:`xarray.Dataset`: 16 | 17 | .. ipython:: python 18 | 19 | import sgkit as sg 20 | ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=50, n_contig=23, missing_pct=.1) 21 | 22 | Look at the dataset summary? 23 | ---------------------------- 24 | 25 | Print using the :class:`xarray.Dataset` ``repr``: 26 | 27 | .. ipython:: python 28 | 29 | ds 30 | 31 | Get the values for a variable in a dataset? 32 | ------------------------------------------- 33 | 34 | Call :attr:`xarray.Variable.values`: 35 | 36 | .. ipython:: python 37 | 38 | ds.variant_contig.values 39 | ds["variant_contig"].values # equivalent alternative 40 | 41 | .. warning:: 42 | 43 | Calling ``values`` materializes a variable's data in memory, so is only suitable for small datasets. 44 | 45 | Find the definition for a variable in a dataset? 46 | ------------------------------------------------ 47 | 48 | Use the ``comment`` attribute on the variable: 49 | 50 | .. ipython:: python 51 | 52 | ds.variant_contig.comment 53 | 54 | All the variables defined in sgkit are documented on the :ref:`api_variables` API page. 55 | 56 | Look at the genotypes? 57 | ---------------------- 58 | 59 | Call :py:func:`display_genotypes`: 60 | 61 | .. ipython:: python 62 | 63 | sg.display_genotypes(ds, max_variants=10) 64 | 65 | 66 | Subset the variables? 67 | --------------------- 68 | 69 | Use Xarray's pandas-like method for `selecting variables `_: 70 | 71 | .. ipython:: python 72 | 73 | ds[["variant_contig", "variant_position", "variant_allele"]] 74 | 75 | Alternatively, you can `drop variables `_ that you want to remove: 76 | 77 | .. 
ipython:: python 78 | 79 | ds.drop_vars(["variant_contig", "variant_position", "variant_allele"]) 80 | 81 | Subset to a genomic range? 82 | -------------------------- 83 | 84 | Set an index on the dataset, then call :meth:`xarray.Dataset.sel`: 85 | 86 | .. ipython:: python 87 | 88 | ds.set_index(variants=("variant_contig", "variant_position")).sel(variants=(0, slice(2, 4))) 89 | 90 | An API to make this easier is under discussion. Please add your requirements to https://github.com/sgkit-dev/sgkit/pull/658. 91 | 92 | Get the list of samples? 93 | ------------------------ 94 | 95 | Get the values for the ``sample_id`` variable: 96 | 97 | .. ipython:: python 98 | 99 | ds.sample_id.values 100 | 101 | Subset the samples? 102 | ------------------- 103 | 104 | Call :meth:`xarray.Dataset.sel` and :meth:`xarray.DataArray.isin`: 105 | 106 | .. ipython:: python 107 | 108 | ds.sel(samples=ds.sample_id.isin(["S30", "S32"])) 109 | 110 | Define a new variable based on others? 111 | -------------------------------------- 112 | 113 | Use Xarray's `dictionary like methods `_, or :meth:`xarray.Dataset.assign`: 114 | 115 | .. ipython:: python 116 | 117 | ds["pos0"] = ds.variant_position - 1 118 | ds.assign(pos0 = ds.variant_position - 1) # alternative 119 | 120 | Get summary stats? 121 | ------------------ 122 | 123 | Call :py:func:`sample_stats` or :py:func:`variant_stats` as appropriate: 124 | 125 | .. ipython:: python 126 | 127 | sg.sample_stats(ds) 128 | sg.variant_stats(ds) 129 | 130 | Filter variants? 131 | ---------------- 132 | 133 | Call :meth:`xarray.Dataset.sel` on the ``variants`` dimension: 134 | 135 | .. ipython:: python 136 | 137 | ds2 = sg.hardy_weinberg_test(ds) 138 | ds2.sel(variants=(ds2.variant_hwe_p_value > 1e-2).compute()) 139 | 140 | .. note:: 141 | 142 | The call to ``compute`` is needed to avoid an Xarray error. 143 | 144 | Find which new variables were added by a method? 
145 | ------------------------------------------------ 146 | 147 | Use :py:attr:`xarray.Dataset.data_vars` to compare the new dataset variables to the old: 148 | 149 | .. ipython:: python 150 | 151 | ds2 = sg.sample_stats(ds) 152 | set(ds2.data_vars) - set(ds.data_vars) 153 | 154 | Save results to a Zarr file? 155 | ---------------------------- 156 | 157 | Call :py:func:`save_dataset`: 158 | 159 | .. ipython:: python 160 | 161 | sg.save_dataset(ds, "ds.zarr") 162 | 163 | .. note:: 164 | 165 | Zarr datasets must have equal-sized chunks (except for the final chunk, which may be smaller), 166 | so you may have to `rechunk the dataset `_ first. 167 | 168 | Load a dataset from Zarr? 169 | ------------------------- 170 | 171 | Call :py:func:`load_dataset`: 172 | 173 | .. ipython:: python 174 | 175 | ds = sg.load_dataset("ds.zarr") 176 | @suppress 177 | !rm -r ds.zarr 178 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | sgkit: Statistical genetics toolkit in Python 2 | ============================================= 3 | 4 | Sgkit is a Python package that provides a variety of analytical genetics methods through the use of 5 | general-purpose frameworks such as `Xarray `_, `Pandas `_, 6 | `Dask `_ and `Zarr `_. The sgkit API makes as 7 | few assumptions as possible about the origin, structure, and intended use of genetic data by adopting a set of 8 | domain-specific conventions that allow such data to be used within this broader ecosystem of tools. The package is 9 | designed for complex workflows over large distributed datasets but attempts to make it as easy as possible to scale 10 | down to smaller datasets and access simpler functionality for those that may be new to Python (though there is still 11 | a good bit of work to be done on this front). See :ref:`getting_started` for more details. 
12 | 13 | Sgkit is inspired heavily by `scikit-allel `_ and `Hail `_, 14 | both popular Python genetics toolkits with a respective focus on population and quantitative genetics. 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :caption: Contents 19 | 20 | getting_started 21 | user_guide 22 | examples/index 23 | api 24 | how_do_i 25 | contributing 26 | about 27 | news 28 | changelog 29 | 30 | Indices and tables 31 | ================== 32 | 33 | * :ref:`genindex` 34 | * :ref:`search` 35 | -------------------------------------------------------------------------------- /docs/news.rst: -------------------------------------------------------------------------------- 1 | .. _blog: 2 | 3 | **** 4 | News 5 | **** 6 | 7 | .. postlist:: 10 8 | :date: %Y-%m-%d 9 | :format: {date} - {title} 10 | :list-style: none 11 | :excerpts: -------------------------------------------------------------------------------- /docs/news/introducing_sgkit.md: -------------------------------------------------------------------------------- 1 | # Introducing sgkit 2 | 3 | ```{post} 2022-08-01 4 | --- 5 | category: releases 6 | author: hammer 7 | --- 8 | ``` 9 | 10 | The sgkit team is pleased to announce the release of [sgkit 0.5.0](https://github.com/sgkit-dev/sgkit/releases/tag/0.5.0)! This release adds support for the [VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec), which describes an encoding of VCF data in chunked-columnar form using the [Zarr format](https://zarr.readthedocs.io/en/stable/). 11 | 12 | With this release, we also introduce our news page, where we will announce future releases and provide other relevant updates for the `sgkit` project. 13 | 14 | Oxford and Related Sciences began collaborating in early 2020 on `sgkit` as a successor to the popular [scikit-allel](https://github.com/cggh/scikit-allel) library. 
We’ve worked closely with third-party library authors to read and write data stored in VCF ([cyvcf2](https://github.com/brentp/cyvcf2)), BGEN ([cbgen](https://github.com/limix/cbgen)), and PLINK ([bed_reader](https://github.com/fastlmm/bed-reader)) files. We’ve designed an [Xarray](https://github.com/pydata/xarray)-based [data model](https://sgkit-dev.github.io/sgkit/latest/getting_started.html#data-structures) and implemented many common methods from statistical and population genetics, including variant and sample [quality control](https://sgkit-dev.github.io/sgkit/latest/examples/gwas_tutorial.html#quality-control), [kinship analysis](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.pc_relate.html#sgkit-pc-relate), genome-wide [selection scans](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.Garud_H.html), and genome-wide [association analyses](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.gwas_linear_regression.html), as well as a [novel implementation](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.regenie.html#sgkit-regenie) of the recently developed [REGENIE algorithm](https://github.com/rgcgithub/regenie). 15 | 16 | `sgkit` was accepted as a [NumFOCUS Sponsored Project](https://numfocus.org/project/sgkit) in 2021, and we now have developers in the US, the UK, and New Zealand. 17 | 18 | If you think sgkit might be useful for your project, please don't hesitate to file an [issue](https://github.com/sgkit-dev/sgkit/issues) or start a [discussion](https://github.com/sgkit-dev/sgkit/discussions) with questions and feedback! 
19 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | callee 2 | pre-commit 3 | pytest 4 | pytest-cov 5 | pytest-datadir 6 | pytest-mock 7 | hypothesis 8 | scikit-allel 9 | statsmodels 10 | msprime>=1.0 11 | scikit-learn 12 | partd 13 | bed-reader 14 | rechunker 15 | cbgen > 1.0.5 16 | bio2zarr[vcf]; platform_system != "Windows" 17 | yarl 18 | matplotlib 19 | asv 20 | networkx 21 | aiohttp 22 | requests 23 | graphviz 24 | -------------------------------------------------------------------------------- /requirements-doc.txt: -------------------------------------------------------------------------------- 1 | myst_nb 2 | pydata-sphinx-theme 3 | sphinx==6.2.1 4 | sphinx_autodoc_typehints>=1.14.0 5 | sphinx-copybutton 6 | scanpydoc 7 | ipython 8 | matplotlib 9 | seaborn 10 | ablog!=0.10.27 11 | pickleshare 12 | -------------------------------------------------------------------------------- /requirements-numpy1-dev.txt: -------------------------------------------------------------------------------- 1 | callee 2 | pre-commit 3 | pytest 4 | pytest-cov 5 | pytest-datadir 6 | pytest-mock 7 | hypothesis 8 | scikit-allel 9 | statsmodels 10 | msprime>=1.0 11 | scikit-learn 12 | partd 13 | bed-reader 14 | rechunker 15 | cbgen < 1.0.5 16 | bio2zarr[vcf]; platform_system != "Windows" 17 | yarl 18 | matplotlib 19 | asv 20 | networkx 21 | aiohttp 22 | requests 23 | graphviz 24 | -------------------------------------------------------------------------------- /requirements-numpy1.txt: -------------------------------------------------------------------------------- 1 | numpy < 2 2 | xarray < 2025.03.1 3 | dask[array,dataframe] >= 2023.01.0, <= 2024.8.0 4 | distributed >= 2023.01.0, <= 2024.8.0 5 | scipy 6 | typing-extensions 7 | numba 8 | zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2, < 3 9 | fsspec != 2021.6.* 10 | scikit-learn 11 | pandas 
12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy < 2.2 2 | xarray < 2025.03.1 3 | dask[array,dataframe] >= 2023.01.0, <= 2024.8.0 4 | distributed >= 2023.01.0, <= 2024.8.0 5 | scipy 6 | typing-extensions 7 | numba 8 | zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2, < 3 9 | fsspec != 2021.6.* 10 | scikit-learn 11 | pandas 12 | -------------------------------------------------------------------------------- /sgkit/__init__.py: -------------------------------------------------------------------------------- 1 | from .display import display_genotypes, display_pedigree 2 | from .distance.api import pairwise_distance 3 | from .io.dataset import load_dataset, save_dataset 4 | from .model import ( 5 | DIM_ALLELE, 6 | DIM_PLOIDY, 7 | DIM_SAMPLE, 8 | DIM_VARIANT, 9 | create_genotype_call_dataset, 10 | create_genotype_dosage_dataset, 11 | ) 12 | from .stats.aggregation import ( 13 | call_allele_frequencies, 14 | cohort_allele_frequencies, 15 | count_call_alleles, 16 | count_cohort_alleles, 17 | count_variant_alleles, 18 | count_variant_genotypes, 19 | individual_heterozygosity, 20 | infer_call_ploidy, 21 | infer_sample_ploidy, 22 | infer_variant_ploidy, 23 | sample_stats, 24 | variant_stats, 25 | ) 26 | from .stats.association import gwas_linear_regression, regenie_loco_regression 27 | from .stats.conversion import convert_call_to_index, convert_probability_to_call 28 | from .stats.genedrop import simulate_genedrop 29 | from .stats.genee import genee 30 | from .stats.grm import ( 31 | genomic_relationship, 32 | hybrid_inverse_relationship, 33 | hybrid_relationship, 34 | invert_relationship_matrix, 35 | ) 36 | from .stats.hwe import hardy_weinberg_test 37 | from .stats.ibs import Weir_Goudet_beta, identity_by_state 38 | from .stats.ld import ld_matrix, ld_prune, maximal_independent_set 39 | from .stats.pc_relate import pc_relate 40 
| from .stats.pca import pca 41 | from .stats.pedigree import ( 42 | parent_indices, 43 | pedigree_contribution, 44 | pedigree_inbreeding, 45 | pedigree_inverse_kinship, 46 | pedigree_kinship, 47 | pedigree_sel, 48 | ) 49 | from .stats.popgen import ( 50 | Fst, 51 | Garud_H, 52 | Tajimas_D, 53 | divergence, 54 | diversity, 55 | observed_heterozygosity, 56 | pbs, 57 | ) 58 | from .stats.preprocessing import filter_partial_calls 59 | from .stats.regenie import regenie 60 | from .testing import simulate_genotype_call_dataset 61 | from .window import ( 62 | window_by_genome, 63 | window_by_interval, 64 | window_by_position, 65 | window_by_variant, 66 | ) 67 | 68 | __version__ = "unknown" 69 | try: 70 | from . import _version 71 | 72 | __version__ = _version.version # pragma: nocover 73 | except ImportError: # pragma: nocover 74 | pass 75 | 76 | __all__ = [ 77 | "DIM_ALLELE", 78 | "DIM_PLOIDY", 79 | "DIM_SAMPLE", 80 | "DIM_VARIANT", 81 | "call_allele_frequencies", 82 | "create_genotype_call_dataset", 83 | "cohort_allele_frequencies", 84 | "convert_call_to_index", 85 | "convert_probability_to_call", 86 | "count_variant_alleles", 87 | "count_call_alleles", 88 | "count_cohort_alleles", 89 | "count_variant_genotypes", 90 | "create_genotype_dosage_dataset", 91 | "display_genotypes", 92 | "display_pedigree", 93 | "filter_partial_calls", 94 | "genee", 95 | "genomic_relationship", 96 | "gwas_linear_regression", 97 | "regenie", 98 | "regenie_loco_regression", 99 | "hardy_weinberg_test", 100 | "hybrid_relationship", 101 | "hybrid_inverse_relationship", 102 | "identity_by_state", 103 | "individual_heterozygosity", 104 | "infer_call_ploidy", 105 | "infer_sample_ploidy", 106 | "infer_variant_ploidy", 107 | "invert_relationship_matrix", 108 | "ld_matrix", 109 | "ld_prune", 110 | "maximal_independent_set", 111 | "parent_indices", 112 | "pedigree_contribution", 113 | "pedigree_inbreeding", 114 | "pedigree_inverse_kinship", 115 | "pedigree_kinship", 116 | "pedigree_sel", 117 | 
"sample_stats", 118 | "variant_stats", 119 | "diversity", 120 | "divergence", 121 | "Fst", 122 | "Garud_H", 123 | "Tajimas_D", 124 | "pbs", 125 | "pc_relate", 126 | "simulate_genedrop", 127 | "simulate_genotype_call_dataset", 128 | "variables", 129 | "observed_heterozygosity", 130 | "pca", 131 | "Weir_Goudet_beta", 132 | "window_by_genome", 133 | "window_by_interval", 134 | "window_by_position", 135 | "window_by_variant", 136 | "load_dataset", 137 | "save_dataset", 138 | "pairwise_distance", 139 | ] 140 | -------------------------------------------------------------------------------- /sgkit/accelerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Callable 3 | 4 | from numba import guvectorize, jit 5 | 6 | _DISABLE_CACHE = os.environ.get("SGKIT_DISABLE_NUMBA_CACHE", "1") 7 | 8 | try: 9 | CACHE_NUMBA = {"0": True, "1": False}[_DISABLE_CACHE] 10 | except KeyError as e: # pragma: no cover 11 | raise KeyError( 12 | "Environment variable 'SGKIT_DISABLE_NUMBA_CACHE' must be '0' or '1'" 13 | ) from e 14 | 15 | 16 | DEFAULT_NUMBA_ARGS = { 17 | "nopython": True, 18 | "cache": CACHE_NUMBA, 19 | } 20 | 21 | 22 | def numba_jit(*args, **kwargs) -> Callable: # pragma: no cover 23 | kwargs_ = DEFAULT_NUMBA_ARGS.copy() 24 | kwargs_.update(kwargs) 25 | return jit(*args, **kwargs_) 26 | 27 | 28 | def numba_guvectorize(*args, **kwargs) -> Callable: # pragma: no cover 29 | kwargs_ = DEFAULT_NUMBA_ARGS.copy() 30 | kwargs_.update(kwargs) 31 | return guvectorize(*args, **kwargs_) 32 | -------------------------------------------------------------------------------- /sgkit/cohorts.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple, Union 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def _tuple_len(t: Union[int, Tuple[int, ...], str, Tuple[str, ...]]) -> int: 8 | """Return the length of a tuple, or 1 for an int or 
string value.""" 9 | if isinstance(t, int) or isinstance(t, str): 10 | return 1 11 | return len(t) 12 | 13 | 14 | def _cohorts_to_array( 15 | cohorts: Sequence[Union[int, Tuple[int, ...], str, Tuple[str, ...]]], 16 | index: Optional[pd.Index] = None, 17 | ) -> np.ndarray: 18 | """Convert cohorts or cohort tuples specified as a sequence of values or 19 | tuples to an array of ints used to match samples in ``sample_cohorts``. 20 | 21 | Cohorts can be specified by index (as used in ``sample_cohorts``), or a label, in 22 | which case an ``index`` must be provided to find index locations for cohorts. 23 | 24 | Parameters 25 | ---------- 26 | cohorts 27 | A sequence of values or tuple representing cohorts or cohort tuples. 28 | index 29 | An index to turn labels into index locations, by default None. 30 | 31 | Returns 32 | ------- 33 | An array of shape ``(len(cohorts), tuple_len)``, where ``tuple_len`` is the length 34 | of the tuples, or 1 if ``cohorts`` is a sequence of values. 35 | 36 | Raises 37 | ------ 38 | ValueError 39 | If the cohort tuples are not all the same length. 
40 | 41 | Examples 42 | -------- 43 | 44 | >>> import pandas as pd 45 | >>> from sgkit.cohorts import _cohorts_to_array 46 | >>> _cohorts_to_array([(0, 1), (2, 1)]) # doctest: +SKIP 47 | array([[0, 1], 48 | [2, 1]], dtype=int32) 49 | >>> _cohorts_to_array([("c0", "c1"), ("c2", "c1")], pd.Index(["c0", "c1", "c2"])) # doctest: +SKIP 50 | array([[0, 1], 51 | [2, 1]], dtype=int32) 52 | """ 53 | if len(cohorts) == 0: 54 | return np.array([], np.int32) 55 | 56 | tuple_len = _tuple_len(cohorts[0]) 57 | if not all(_tuple_len(cohort) == tuple_len for cohort in cohorts): 58 | raise ValueError("Cohort tuples must all be the same length") 59 | 60 | # convert cohort IDs using an index 61 | if index is not None: 62 | if isinstance(cohorts[0], str): 63 | cohorts = [index.get_loc(id) for id in cohorts] 64 | elif tuple_len > 1 and isinstance(cohorts[0][0], str): # type: ignore 65 | cohorts = [tuple(index.get_loc(id) for id in t) for t in cohorts] # type: ignore 66 | 67 | ct = np.empty((len(cohorts), tuple_len), np.int32) 68 | for n, t in enumerate(cohorts): 69 | ct[n, :] = t 70 | return ct 71 | -------------------------------------------------------------------------------- /sgkit/display_numba_fns.py: -------------------------------------------------------------------------------- 1 | from sgkit.accelerate import numba_guvectorize 2 | from sgkit.typing import ArrayLike 3 | 4 | 5 | @numba_guvectorize( # type: ignore 6 | [ 7 | "void(uint8[:], uint8[:], boolean[:], uint8[:], uint8[:])", 8 | ], 9 | "(b),(),(),(c)->(c)", 10 | ) 11 | def _format_genotype_bytes( 12 | chars: ArrayLike, ploidy: int, phased: bool, _: ArrayLike, out: ArrayLike 13 | ) -> None: # pragma: no cover 14 | ploidy = ploidy[0] 15 | sep = 124 if phased[0] else 47 # "|" or "/" 16 | chars_per_allele = len(chars) // ploidy 17 | slot = 0 18 | for slot in range(ploidy): 19 | offset_inp = slot * chars_per_allele 20 | offset_out = slot * (chars_per_allele + 1) 21 | if slot > 0: 22 | out[offset_out - 1] = sep 23 | for char in 
@numba_guvectorize(  # type: ignore
    [
        "void(uint8[:], uint8[:], boolean[:], uint8[:], uint8[:])",
    ],
    "(b),(),(),(c)->(c)",
)
def _format_genotype_bytes(
    chars: ArrayLike, ploidy: int, phased: bool, _: ArrayLike, out: ArrayLike
) -> None:  # pragma: no cover
    """Render one genotype call as ASCII bytes.

    Alleles (already stringified into ``chars``, ``chars_per_allele`` bytes
    per slot) are joined with "|" when phased or "/" when unphased. An
    allele string "-1" is rewritten as "." (unknown); any other leading "-"
    is treated as a gap and the slot (plus its separator) is dropped.
    Unused byte positions are zero and compacted to the end of ``out``.
    """
    ploidy = ploidy[0]
    sep = 124 if phased[0] else 47  # "|" or "/"
    # assumes len(chars) divides evenly into ploidy fixed-width allele slots
    chars_per_allele = len(chars) // ploidy
    slot = 0
    for slot in range(ploidy):
        offset_inp = slot * chars_per_allele
        # output slots are one byte wider to leave room for the separator
        offset_out = slot * (chars_per_allele + 1)
        if slot > 0:
            out[offset_out - 1] = sep
        for char in range(chars_per_allele):
            i = offset_inp + char
            j = offset_out + char
            val = chars[i]
            if val == 45:  # "-"
                if chars[i + 1] == 49:  # "1"
                    # this is an unknown allele
                    out[j] = 46  # "."
                    out[j + 1 : j + chars_per_allele] = 0
                    break
                else:
                    # < -1 indicates a gap
                    out[j : j + chars_per_allele] = 0
                    if slot > 0:
                        # remove separator
                        out[offset_out - 1] = 0
                    break
            else:
                out[j] = val
    # shuffle zeros to end
    c = len(out)
    for i in range(c):
        if out[i] == 0:
            # swap with the next non-zero byte so text stays contiguous
            for j in range(i + 1, c):
                if out[j] != 0:
                    out[i] = out[j]
                    out[j] = 0
                    break
def save_dataset(
    ds: Dataset,
    store: Union[PathType, MutableMapping[str, bytes]],
    storage_options: Optional[Dict[str, str]] = None,
    auto_rechunk: Optional[bool] = None,
    zarr_format: int = 2,
    **kwargs: Any,
) -> None:
    """Save a dataset to Zarr storage.

    This function is a thin wrapper around :meth:`xarray.Dataset.to_zarr`
    that uses sensible defaults and makes it easier to use in a pipeline.

    Parameters
    ----------
    ds
        Dataset to save.
    store
        Zarr store or path to directory in file system to save to.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    auto_rechunk
        If True, automatically rechunk the dataset to uniform chunks before saving,
        if necessary. This is required for Zarr, but can be expensive. Defaults to False.
    zarr_format
        Zarr storage format version to write; only forwarded to ``to_zarr``
        when the installed xarray supports it (v2024.10.0+). Defaults to 2.
    kwargs
        Additional arguments to pass to :meth:`xarray.Dataset.to_zarr`.

    Raises
    ------
    ValueError
        If the dataset chunks are not uniform and ``auto_rechunk`` is not enabled.
    """
    if auto_rechunk is None:
        auto_rechunk = False
    for v in ds:
        # Workaround for https://github.com/pydata/xarray/issues/4380
        ds[v].encoding.pop("chunks", None)

        # Remove VLenUTF8 from filters to avoid double encoding error https://github.com/pydata/xarray/issues/3476
        filters = ds[v].encoding.get("filters", None)
        var_len_str_codec = numcodecs.VLenUTF8()
        if filters is not None and var_len_str_codec in filters:
            filters = list(filters)
            filters.remove(var_len_str_codec)
        ds[v].encoding["filters"] = filters

    if auto_rechunk:
        # This logic for checking if rechunking is necessary is
        # taken from xarray/backends/zarr.py#L109.
        # We can't try to save and catch the error as by that
        # point the zarr store is non-empty.
        if any(len(set(chunks[:-1])) > 1 for chunks in ds.chunks.values()) or any(
            (chunks[0] < chunks[-1]) for chunks in ds.chunks.values()
        ):
            # Here we use the max chunk size as the target chunk size as for the commonest
            # case of subsetting an existing dataset, this will be closest to the original
            # intended chunk size.
            ds = ds.chunk(
                chunks={dim: max(chunks) for dim, chunks in ds.chunks.items()}
            )

    # Catch unequal chunking errors to provide a more helpful error message
    try:
        if has_keyword(ds.to_zarr, "zarr_format"):  # from xarray v2024.10.0
            kwargs["zarr_format"] = zarr_format
        ds.to_zarr(store, storage_options=storage_options, **kwargs)
    except ValueError as e:
        if "Zarr requires uniform chunk sizes" in str(
            e
        ) or "Final chunk of Zarr array must be the same size" in str(e):
            # BUG FIX: the two adjacent string literals previously joined as
            # "to`save_dataset`" (missing space between them).
            raise ValueError(
                "Zarr requires uniform chunk sizes. Use the `auto_rechunk` argument "
                "to `save_dataset` to automatically rechunk the dataset."
            ) from e
        else:
            raise e
def load_dataset(
    store: Union[PathType, MutableMapping[str, bytes]],
    storage_options: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> Dataset:
    """Load a dataset from Zarr storage.

    This function is a thin wrapper around :func:`xarray.open_zarr`
    that uses sensible defaults and makes it easier to use in a pipeline.

    Parameters
    ----------
    store
        Zarr store or path to directory in file system to load from.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    kwargs
        Additional arguments to pass to :func:`xarray.open_zarr`.

    Returns
    -------

    Dataset
        The dataset loaded from the Zarr store or file system.
    """
    ds: Dataset = xr.open_zarr(store, storage_options=storage_options, concat_characters=False, **kwargs)  # type: ignore[no-untyped-call]
    # Workaround for https://github.com/pydata/xarray/issues/4386:
    # coerce round-tripped mask variables back to their boolean dtype.
    for name in ds:
        if name.endswith("_mask"):  # type: ignore
            ds[name] = ds[name].astype(bool)
    return ds
def dataframe_to_dict(
    df: dd.DataFrame, dtype: Optional[Mapping[str, DType]] = None
) -> Mapping[str, ArrayLike]:
    """Convert dask dataframe to dictionary of arrays.

    Parameters
    ----------
    df
        Dask dataframe whose columns are converted to dask arrays.
    dtype
        Optional mapping from column name to target dtype. String dtypes
        (``U``/``S``) are widened to the column's maximum string length.

    Returns
    -------
    Mapping from column name to dask array.
    """
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        dt = df[c].dtype
        if dtype:
            dt = dtype[c]
        kind = np.dtype(dt).kind
        if kind in ["U", "S"]:
            # Compute fixed-length string dtype for array
            max_len = max_str_len(a)
            # BUG FIX: dtype string was garbled ("{kind}79,562"); build the
            # fixed-width string dtype from the computed max length instead.
            dt = f"{kind}{max_len}"
        arrs[c] = a.astype(dt)
    return arrs


def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
    """Encode contig identifiers as integer ids plus an array of names."""
    # TODO: test preservation of int16
    # If contigs are already integers, use them as-is
    if np.issubdtype(contig.dtype, np.integer):
        ids = contig
        names = np.unique(np.asarray(ids)).astype(str)  # type: ignore[no-untyped-call]
    # Otherwise create index for contig names based
    # on order of appearance in underlying file
    else:
        ids, names = encode_array(np.asarray(contig, dtype=str))
    return ids, names
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:], uint8[:], uint8[:])",
        "void(int16[:], uint8[:], uint8[:])",
        "void(int32[:], uint8[:], uint8[:])",
        "void(int64[:], uint8[:], uint8[:])",
        "void(int8[:], uint64[:], uint64[:])",
        "void(int16[:], uint64[:], uint64[:])",
        "void(int32[:], uint64[:], uint64[:])",
        "void(int64[:], uint64[:], uint64[:])",
    ],
    "(k),(n)->(n)",
)
def count_alleles(
    g: ArrayLike, _: ArrayLike, out: ArrayLike
) -> None:  # pragma: no cover
    """Generalized U-function for computing per sample allele counts.

    Parameters
    ----------
    g
        Genotype call of shape (ploidy,) containing alleles encoded as
        type `int` with values < 0 indicating a missing allele.
    _
        Dummy variable of type `uint8` or `uint64` and shape (alleles,)
        used to define the number of unique alleles to be counted in the
        return value. The dtype of this array determines the dtype of the
        returned array.

    Returns
    -------
    ac : ndarray
        Allele counts with shape (alleles,) and values corresponding to
        the number of non-missing occurrences of each allele.

    """
    out[:] = 0
    # Tally each non-missing allele; negative values (missing) are skipped.
    n_allele = len(g)
    for i in range(n_allele):
        a = g[i]
        if a >= 0:
            out[a] += 1


@numba_jit(nogil=True)
def _classify_hom(genotype: ArrayLike) -> int:  # pragma: no cover
    # Classify a genotype call: -1 = missing, 0 = hom-ref, 1 = hom-alt
    # (first allele > 0), 2 = het. Any missing allele makes the whole
    # call missing.
    a0 = genotype[0]
    cat = min(a0, 1)  # -1, 0, 1
    for i in range(1, len(genotype)):
        if cat < 0:
            break
        a = genotype[i]
        if a != a0:
            cat = 2  # het
        if a < 0:
            cat = -1
    return cat
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:,:], uint64[:], int64[:])",
        "void(int16[:,:], uint64[:], int64[:])",
        "void(int32[:,:], uint64[:], int64[:])",
        "void(int64[:,:], uint64[:], int64[:])",
    ],
    "(n, k),(c)->(c)",
)
def count_hom(
    genotypes: ArrayLike, _: ArrayLike, out: ArrayLike
) -> None:  # pragma: no cover
    """Generalized U-function for counting homozygous and heterozygous genotypes.

    Parameters
    ----------
    g
        Genotype call of shape (ploidy,) containing alleles encoded as
        type `int` with values < 0 indicating a missing allele.
    _
        Dummy variable of type `uint64` with length 3 which determines the
        number of categories returned (this should always be 3).

    Note
    ----
    This method is not suitable for mixed-ploidy genotypes.

    Returns
    -------
    counts : ndarray
        Counts of homozygous reference, homozygous alternate, and heterozygous genotypes.
    """
    out[:] = 0
    # _classify_hom returns -1 for missing calls, which are not counted.
    for i in range(len(genotypes)):
        index = _classify_hom(genotypes[i])
        if index >= 0:
            out[index] += 1


def count_hom_new_axis(genotypes: ArrayLike, _: ArrayLike) -> ArrayLike:
    # Insert a length-1 middle axis so per-block counts can be combined
    # along it by the caller.
    return count_hom(genotypes, _)[:, np.newaxis, :]
def genee_EM(betas, reg_covar=0.000001):
    """Estimate the epsilon (non-null) effect variance from observed betas.

    Fits Gaussian mixtures with 1-9 components (fixed random seed for
    reproducibility), selects the fit with the lowest BIC, and returns the
    second-largest component variance (or the sole variance when the best
    fit has a single component).
    """
    # based on https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_selection.html#sphx-glr-auto-examples-mixture-plot-gmm-selection-py
    candidates = [
        GaussianMixture(
            n_components=k, reg_covar=reg_covar, random_state=0
        ).fit(betas)
        for k in range(1, 10)
    ]
    # min() keeps the first model attaining the lowest BIC, matching the
    # original strictly-lower comparison's tie-breaking.
    best_gmm = min(candidates, key=lambda model: model.bic(betas))

    covars = best_gmm.covariances_.squeeze()
    if best_gmm.n_components == 1:  # pragma: no cover
        epsilon_effect = covars[0]
    else:
        # TODO: handle case where first component composed more than 50% SNPs
        # https://github.com/ramachandran-lab/genee/blob/a357a956241df93f16e07664e24f3aeac65f4177/genee/R/genee_EM.R#L28-L29
        covars_decreasing = np.sort(covars)[::-1]
        epsilon_effect = covars_decreasing[1]

    return epsilon_effect
def ensure_positive_real(x):
    """Strip any negligible imaginary parts from ``x`` and floor all
    non-positive entries at 1e-20, so downstream p-value math stays in
    the positive real domain. Mutates (and returns) the array in place
    when no complex-to-real conversion is needed.
    """
    x = np.real_if_close(x)
    nonpositive = x <= 0.0
    x[nonpositive] = 1e-20
    return x
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:,:,:], float64[:,:], float64[:,:])",
        "void(int16[:,:,:], float64[:,:], float64[:,:])",
        "void(int32[:,:,:], float64[:,:], float64[:,:])",
        "void(int64[:,:,:], float64[:,:], float64[:,:])",
    ],
    "(v,s,k)->(s,s),(s,s)",
)
def allele_matching_diag(
    gt: ArrayLike,
    numerator: ArrayLike,
    denominator: ArrayLike,
) -> None:  # pragma: no cover
    """Accumulate pairwise identity-by-state numerators/denominators for all
    sample pairs within one block of genotypes (alleles < 0 are missing).
    Only the lower triangle (s1 <= s0) is computed and mirrored to keep the
    output symmetric.
    """
    n_variant, n_sample, ploidy = gt.shape
    numerator[:] = 0.0
    denominator[:] = 0.0
    for v in range(n_variant):
        for s0 in range(n_sample):
            for s1 in range(s0 + 1):
                # local IBS prob to ensure even weighting of loci
                local_num = 0
                local_denom = 0
                for i in range(ploidy):
                    a0 = gt[v, s0, i]
                    if a0 >= 0:
                        for j in range(ploidy):
                            a1 = gt[v, s1, j]
                            if a1 >= 0:
                                local_denom += 1
                                if a0 == a1:
                                    local_num += 1
                if local_denom > 0:
                    p_ibs = local_num / local_denom
                    numerator[s0, s1] += p_ibs
                    numerator[s1, s0] += p_ibs
                    denominator[s0, s1] += 1.0
                    denominator[s1, s0] += 1.0
                    # undo double addition to diagonal
                    if local_denom > 0:
                        numerator[s0, s0] -= p_ibs
                        denominator[s0, s0] -= 1.0
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:,:,:], int8[:,:,:], float64[:,:], float64[:,:])",
        "void(int16[:,:,:], int16[:,:,:], float64[:,:], float64[:,:])",
        "void(int32[:,:,:], int32[:,:,:], float64[:,:], float64[:,:])",
        "void(int64[:,:,:], int64[:,:,:], float64[:,:], float64[:,:])",
    ],
    "(v,s0,k),(v,s1,k)->(s0,s1),(s0,s1)",
)
def allele_matching_block(
    gt0: ArrayLike,
    gt1: ArrayLike,
    numerator: ArrayLike,
    denominator: ArrayLike,
) -> None:  # pragma: no cover
    """Accumulate pairwise identity-by-state numerators/denominators between
    two distinct sample blocks sharing the same variants (alleles < 0 are
    missing). Unlike the diagonal variant, every (s0, s1) pair is distinct,
    so no mirroring or diagonal correction is needed.
    """
    n_variant, n_sample0, ploidy = gt0.shape
    _, n_sample1, _ = gt1.shape
    numerator[:] = 0.0
    denominator[:] = 0.0
    for v in range(n_variant):
        for s0 in range(n_sample0):
            for s1 in range(n_sample1):
                # local IBS prob to ensure even weighting of loci
                local_num = 0
                local_denom = 0
                for i in range(ploidy):
                    a0 = gt0[v, s0, i]
                    if a0 >= 0:
                        for j in range(ploidy):
                            a1 = gt1[v, s1, j]
                            if a1 >= 0:
                                local_denom += 1
                                if a0 == a1:
                                    local_num += 1
                if local_denom > 0:
                    p_ibs = local_num / local_denom
                    numerator[s0, s1] += p_ibs
                    denominator[s0, s1] += 1.0
def r2_score(YP: ArrayLike, YT: ArrayLike) -> ArrayLike:
    """R2 score calculator for batches of vector pairs.

    Parameters
    ----------
    YP
        ArrayLike (..., M)
        Predicted values; any shape >= 1D with leading dimensions
        broadcastable against those of `YT`.
    YT
        ArrayLike (..., M)
        True values; any shape >= 1D with leading dimensions
        broadcastable against those of `YP`.

    Returns
    -------
    R2 : (...) ArrayLike
        R2 scores with shape equal to all leading (batch) dimensions
        of the provided arrays.
    """
    YP, YT = np.broadcast_arrays(YP, YT)  # type: ignore[no-untyped-call]
    # Total and residual sums of squares along the last (sample) axis.
    ss_tot = np.power(YT - YT.mean(axis=-1, keepdims=True), 2).sum(
        axis=-1, keepdims=True
    )
    ss_res = np.power(YT - YP, 2).sum(axis=-1, keepdims=True)
    has_res, has_tot = ss_res != 0, ss_tot != 0
    # Degenerate cases: perfect fit -> 1; constant truth with error -> 0.
    fallback = np.where(has_res & ~has_tot, 0, 1)
    # Hide warnings rather than use masked division
    # because the latter is not supported by dask
    with np.errstate(divide="ignore", invalid="ignore"):
        scores = np.where(has_res & has_tot, 1 - ss_res / ss_tot, fallback)
    return np.squeeze(scores, axis=-1)
def assert_block_shape(x: Array, *args: int) -> None:
    """Validate block shape (i.e. x.numblocks)"""
    expected = tuple(args)
    assert x.numblocks == tuple(
        expected
    ), f"Expecting block shape {expected}, found {x.numblocks}"


def assert_chunk_shape(x: Array, *args: int) -> None:
    """Validate chunk shape (i.e. x.chunksize)"""
    expected = tuple(args)
    assert x.chunksize == expected, f"Expecting chunk shape {expected}, found {x.chunksize}"


def assert_array_shape(x: ArrayLike, *args: int) -> None:
    """Validate array shape (i.e. x.shape)"""
    expected = tuple(args)
    assert x.shape == expected, f"Expecting array shape {expected}, found {x.shape}"


def map_blocks_asnumpy(x: Array) -> Array:
    # Convert cupy-backed blocks to numpy arrays; a no-op for any other
    # array backend.
    if hasattr(x, "_meta") and da.utils.is_cupy_type(x._meta):  # pragma: no cover
        import cupy as cp  # type: ignore[import]

        x = x.map_blocks(cp.asnumpy)
    return x
This function is primarily a convenience on 29 | generating :class:`xarray.Dataset` containers so quantities of interest 30 | should be overwritten, where appropriate, within the 31 | context of a more specific application. 32 | 33 | Parameters 34 | ---------- 35 | n_variant 36 | Number of variants to simulate 37 | n_sample 38 | Number of samples to simulate 39 | n_ploidy 40 | Number of chromosome copies in each sample 41 | n_allele 42 | Number of alleles to simulate 43 | n_contig 44 | optional 45 | Number of contigs to partition variants with, 46 | controlling values in ``variant_contig``. Values 47 | will all be 0 by default when ``n_contig`` is 1. 48 | seed 49 | Seed for random number generation, optional 50 | missing_pct 51 | The percentage of missing calls, must be within [0.0, 1.0], optional 52 | phased 53 | Whether genotypes are phased, default is unphased, optional 54 | additional_variant_fields 55 | Additional variant fields to add to the dataset as a dictionary of 56 | {field_name: field_dtype}, optional 57 | 58 | Returns 59 | ------- 60 | A dataset containing the following variables: 61 | 62 | - :data:`sgkit.variables.variant_contig_spec` (variants) 63 | - :data:`sgkit.variables.variant_position_spec` (variants) 64 | - :data:`sgkit.variables.variant_allele_spec` (variants) 65 | - :data:`sgkit.variables.sample_id_spec` (samples) 66 | - :data:`sgkit.variables.call_genotype_spec` (variants, samples, ploidy) 67 | - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy) 68 | - :data:`sgkit.variables.call_genotype_phased_spec` (variants, samples), if ``phased`` is not None 69 | - Those specified in ``additional_variant_fields``, if provided 70 | """ 71 | if missing_pct and (missing_pct < 0.0 or missing_pct > 1.0): 72 | raise ValueError("missing_pct must be within [0.0, 1.0]") 73 | rs = np.random.RandomState(seed=seed) 74 | call_genotype = rs.randint( 75 | 0, n_allele, size=(n_variant, n_sample, n_ploidy), dtype=np.int8 76 | ) 77 | if 
missing_pct: 78 | call_genotype = np.where( 79 | rs.rand(*call_genotype.shape) < missing_pct, -1, call_genotype 80 | ) 81 | if phased is None: 82 | call_genotype_phased = None 83 | else: 84 | call_genotype_phased = np.full((n_variant, n_sample), phased, dtype=bool) 85 | 86 | contig_size = split_array_chunks(n_variant, n_contig) 87 | contig = np.repeat(np.arange(n_contig), contig_size) 88 | contig_names = np.unique(contig).astype(str).tolist() # type: ignore[no-untyped-call] 89 | position = np.concatenate([np.arange(contig_size[i]) for i in range(n_contig)]) # type: ignore[no-untyped-call] 90 | assert position.size == contig.size 91 | alleles: ArrayLike = rs.choice( 92 | ["A", "C", "G", "T"], size=(n_variant, n_allele) 93 | ).astype("S") 94 | sample_id = np.array([f"S{i}" for i in range(n_sample)]) 95 | ds = create_genotype_call_dataset( 96 | variant_contig_names=contig_names, 97 | variant_contig=contig, 98 | variant_position=position, 99 | variant_allele=alleles, 100 | sample_id=sample_id, 101 | call_genotype=call_genotype, 102 | call_genotype_phased=call_genotype_phased, 103 | ) 104 | # Add in each of the additional variant fields, if provided with random data 105 | if additional_variant_fields is not None: 106 | for field_name, field_dtype in additional_variant_fields.items(): 107 | if field_dtype in (np.float32, np.float64): 108 | field = rs.rand(n_variant).astype(field_dtype) 109 | elif field_dtype in (np.int8, np.int16, np.int32, np.int64): 110 | field = rs.randint(0, 100, n_variant, dtype=field_dtype) 111 | elif field_dtype is bool: 112 | field = rs.rand(n_variant) > 0.5 113 | elif field_dtype is str: 114 | field = np.arange(n_variant).astype("S") 115 | else: 116 | raise ValueError(f"Unrecognized dtype {field_dtype}") 117 | ds[field_name] = (("variants",), field) 118 | return ds 119 | -------------------------------------------------------------------------------- /sgkit/tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/data/sample.bed: -------------------------------------------------------------------------------- 1 | chr0 0 10 2 | chr0 10 20 3 | chr1 0 10 4 | chr1 20 30 5 | chr1 30 40 6 | chr1 50 60 -------------------------------------------------------------------------------- /sgkit/tests/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.metadata2.mmm 2 | *.metafile 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example-no-samples.bgen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/data/example-no-samples.bgen -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example-separate-samples.bgen: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/data/example-separate-samples.bgen -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example-separate-samples.sample: -------------------------------------------------------------------------------- 1 | ID 2 | 0 3 | s1 4 | s2 5 | s3 6 | s4 7 | s5 8 | -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example.bgen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/data/example.bgen -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/samples: -------------------------------------------------------------------------------- 1 | sample_001 2 | sample_002 3 | sample_003 4 | sample_004 5 | sample_005 6 | -------------------------------------------------------------------------------- /sgkit/tests/io/data/sample.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/data/sample.vcf.gz -------------------------------------------------------------------------------- /sgkit/tests/io/data/sample.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/data/sample.vcf.gz.tbi -------------------------------------------------------------------------------- /sgkit/tests/io/plink/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/data/example.bed -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.bim: -------------------------------------------------------------------------------- 1 | 1 1_10 0 10 A G 2 | 1 1_20 0 20 T C 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.fam: -------------------------------------------------------------------------------- 1 | ind0 ind0 0 0 0 -9 2 | ind1 ind1 0 0 0 -9 3 | ind2 ind2 0 0 0 -9 4 | ind3 ind3 0 0 0 -9 5 | ind4 ind4 0 0 0 -9 6 | ind5 ind5 0 0 0 -9 7 | ind6 ind6 0 0 0 -9 8 | ind7 ind7 0 0 0 -9 9 | ind8 ind8 0 0 0 -9 10 | ind9 ind9 0 0 0 -9 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.map: -------------------------------------------------------------------------------- 1 | 1 1_10 0 10 2 | 1 1_20 0 20 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.nosex: -------------------------------------------------------------------------------- 1 | ind0 ind0 2 | ind1 ind1 3 | ind2 ind2 4 | ind3 ind3 5 | ind4 ind4 6 | ind5 ind5 7 | ind6 ind6 8 | ind7 ind7 9 | ind8 ind8 10 | ind9 ind9 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.ped: -------------------------------------------------------------------------------- 1 | ind0 ind0 0 0 0 0 A A T T 2 | ind1 ind1 0 0 0 0 A A T T 3 | ind2 
ind2 0 0 0 0 A A T T 4 | ind3 ind3 0 0 0 0 G G T T 5 | ind4 ind4 0 0 0 0 G G C C 6 | ind5 ind5 0 0 0 0 G G C C 7 | ind6 ind6 0 0 0 0 G G C C 8 | ind7 ind7 0 0 0 0 G G C C 9 | ind8 ind8 0 0 0 0 G G C C 10 | ind9 ind9 0 0 0 0 G G C C 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example_with_fam.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/data/example_with_fam.bed -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example_with_fam.bim: -------------------------------------------------------------------------------- 1 | 1 1_10 0 10 A G 2 | 1 1_20 0 20 T C 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example_with_fam.fam: -------------------------------------------------------------------------------- 1 | ind0 ind0 0 0 0 -9 2 | ind1 ind1 0 0 0 -9 3 | ind2 ind2 ind1 ind0 2 1 4 | ind3 ind3 ind1 ind0 1 2 5 | ind4 ind4 0 0 0 -9 6 | ind5 ind5 0 0 0 -9 7 | ind6 ind6 0 0 0 -9 8 | ind7 ind7 0 0 0 -9 9 | ind8 ind8 0 0 0 -9 10 | ind9 ind9 0 0 0 -9 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.bed -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.bim: -------------------------------------------------------------------------------- 1 | 1 1:1:G:CGCGCG 0.0 1 CGCGCG G 2 | 1 1:2:ACT:G 0.0 2 G ACT 3 | 1 1:3:ACT:G 0.0 3 G ACT 4 | 1 1:4:G:CGCGCG 
0.0 4 CGCGCG G 5 | 1 1:5:G:CGCGCG 0.0 5 CGCGCG G 6 | 1 1:6:ACT:G 0.0 6 G ACT 7 | 1 1:7:G:CGCGCG 0.0 7 CGCGCG G 8 | 1 1:8:T:GTGG 0.0 8 GTGG T 9 | 1 1:9:T:GTGG 0.0 9 GTGG T 10 | 1 1:10:A:C 0.0 10 C A 11 | 1 1:11:ACT:G 0.0 11 G ACT 12 | 1 1:12:G:CGCGCG 0.0 12 CGCGCG G 13 | 1 1:13:G:CGCGCG 0.0 13 CGCGCG G 14 | 1 1:14:T:GTGG 0.0 14 GTGG T 15 | 1 1:15:ACT:G 0.0 15 G ACT 16 | 1 1:16:A:C 0.0 16 C A 17 | 1 1:17:ACT:G 0.0 17 G ACT 18 | 1 1:18:T:GTGG 0.0 18 GTGG T 19 | 1 1:19:A:C 0.0 19 C A 20 | 1 1:20:A:C 0.0 20 C A 21 | 1 1:21:T:GTGG 0.0 21 GTGG T 22 | 1 1:22:G:CGCGCG 0.0 22 CGCGCG G 23 | 1 1:23:T:GTGG 0.0 23 GTGG T 24 | 1 1:24:A:C 0.0 24 C A 25 | 1 1:25:A:C 0.0 25 C A 26 | 1 1:26:ACT:G 0.0 26 G ACT 27 | 1 1:27:G:CGCGCG 0.0 27 CGCGCG G 28 | 1 1:28:ACT:G 0.0 28 G ACT 29 | 1 1:29:T:GTGG 0.0 29 GTGG T 30 | 1 1:30:A:C 0.0 30 C A 31 | 1 1:31:T:GTGG 0.0 31 GTGG T 32 | 1 1:32:G:CGCGCG 0.0 32 CGCGCG G 33 | 1 1:33:ACT:G 0.0 33 G ACT 34 | 1 1:34:G:CGCGCG 0.0 34 CGCGCG G 35 | 1 1:35:A:C 0.0 35 C A 36 | 1 1:36:G:CGCGCG 0.0 36 CGCGCG G 37 | 1 1:37:T:GTGG 0.0 37 GTGG T 38 | 1 1:38:A:C 0.0 38 C A 39 | 1 1:39:A:C 0.0 39 C A 40 | 1 1:40:T:GTGG 0.0 40 GTGG T 41 | 1 1:41:A:C 0.0 41 C A 42 | 1 1:42:G:CGCGCG 0.0 42 CGCGCG G 43 | 1 1:43:T:GTGG 0.0 43 GTGG T 44 | 1 1:44:ACT:G 0.0 44 G ACT 45 | 1 1:45:G:CGCGCG 0.0 45 CGCGCG G 46 | 1 1:46:ACT:G 0.0 46 G ACT 47 | 1 1:47:G:CGCGCG 0.0 47 CGCGCG G 48 | 1 1:48:A:C 0.0 48 C A 49 | 1 1:49:A:C 0.0 49 C A 50 | 1 1:50:A:C 0.0 50 C A 51 | 1 1:51:G:CGCGCG 0.0 51 CGCGCG G 52 | 1 1:52:A:C 0.0 52 C A 53 | 1 1:53:ACT:G 0.0 53 G ACT 54 | 1 1:54:A:C 0.0 54 C A 55 | 1 1:55:G:CGCGCG 0.0 55 CGCGCG G 56 | 1 1:56:T:GTGG 0.0 56 GTGG T 57 | 1 1:57:G:CGCGCG 0.0 57 CGCGCG G 58 | 1 1:58:A:C 0.0 58 C A 59 | 1 1:59:T:GTGG 0.0 59 GTGG T 60 | 1 1:60:G:CGCGCG 0.0 60 CGCGCG G 61 | 1 1:61:ACT:G 0.0 61 G ACT 62 | 1 1:62:A:C 0.0 62 C A 63 | 1 1:63:G:CGCGCG 0.0 63 CGCGCG G 64 | 1 1:64:T:GTGG 0.0 64 GTGG T 65 | 1 1:65:T:GTGG 0.0 65 GTGG T 66 | 1 1:66:ACT:G 0.0 66 G ACT 67 | 1 
1:67:T:GTGG 0.0 67 GTGG T 68 | 1 1:68:ACT:G 0.0 68 G ACT 69 | 1 1:69:G:CGCGCG 0.0 69 CGCGCG G 70 | 1 1:70:G:CGCGCG 0.0 70 CGCGCG G 71 | 1 1:71:ACT:G 0.0 71 G ACT 72 | 1 1:72:G:CGCGCG 0.0 72 CGCGCG G 73 | 1 1:73:A:C 0.0 73 C A 74 | 1 1:74:A:C 0.0 74 C A 75 | 1 1:75:T:GTGG 0.0 75 GTGG T 76 | 1 1:76:A:C 0.0 76 C A 77 | 1 1:77:ACT:G 0.0 77 G ACT 78 | 1 1:78:ACT:G 0.0 78 G ACT 79 | 1 1:79:A:C 0.0 79 C A 80 | 1 1:80:A:C 0.0 80 C A 81 | 1 1:81:A:C 0.0 81 C A 82 | 1 1:82:T:GTGG 0.0 82 GTGG T 83 | 1 1:83:A:C 0.0 83 C A 84 | 1 1:84:ACT:G 0.0 84 G ACT 85 | 1 1:85:A:C 0.0 85 C A 86 | 1 1:86:G:CGCGCG 0.0 86 CGCGCG G 87 | 1 1:87:ACT:G 0.0 87 G ACT 88 | 1 1:88:A:C 0.0 88 C A 89 | 1 1:89:A:C 0.0 89 C A 90 | 1 1:90:T:GTGG 0.0 90 GTGG T 91 | 1 1:91:T:GTGG 0.0 91 GTGG T 92 | 1 1:92:T:GTGG 0.0 92 GTGG T 93 | 1 1:93:A:C 0.0 93 C A 94 | 1 1:94:A:C 0.0 94 C A 95 | 1 1:95:A:C 0.0 95 C A 96 | 1 1:96:A:C 0.0 96 C A 97 | 1 1:97:T:GTGG 0.0 97 GTGG T 98 | 1 1:98:ACT:G 0.0 98 G ACT 99 | 1 1:99:T:GTGG 0.0 99 GTGG T 100 | 1 1:100:A:C 0.0 100 C A 101 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.fam: -------------------------------------------------------------------------------- 1 | 0 000 0 0 0 NA 2 | 0 001 0 0 0 NA 3 | 0 002 0 0 0 NA 4 | 0 003 0 0 0 NA 5 | 0 004 0 0 0 NA 6 | 0 005 0 0 0 NA 7 | 0 006 0 0 0 NA 8 | 0 007 0 0 0 NA 9 | 0 008 0 0 0 NA 10 | 0 009 0 0 0 NA 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/test_plink_writer.py: -------------------------------------------------------------------------------- 1 | from filecmp import cmp 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from sgkit.io.plink import plink_to_zarr, read_plink 7 | from sgkit.io.plink.plink_reader import read_bim, read_fam 8 | from sgkit.io.plink.plink_writer import write_plink, zarr_to_plink 9 | from sgkit.testing import simulate_genotype_call_dataset 10 | 11 | 
# Names of the PLINK fixture datasets under the shared data directory
example_dataset_1 = "plink_sim_10s_100v_10pmiss"
example_dataset_2 = "example"
example_dataset_3 = "example_with_fam"


@pytest.fixture(params=[dict()])
def ds1(shared_datadir, request):
    # Read the simulated PLINK dataset for tests that request it
    path = shared_datadir / example_dataset_1
    return read_plink(path=path, bim_sep="\t", fam_sep="\t", **request.param)


@pytest.mark.parametrize(
    "plink_in, fam_sep",
    [
        (example_dataset_1, "\t"),
        (example_dataset_2, " "),
        (example_dataset_3, " "),
    ],
)
def test_write_plink(shared_datadir, tmp_path, plink_in, fam_sep):
    # read plink file as a dataset then write it out again
    ds = read_plink(path=shared_datadir / plink_in, fam_sep=fam_sep)
    path = tmp_path / "plink_out"
    path.mkdir(parents=True, exist_ok=False)
    write_plink(ds, path=path)

    # check bed files are the same
    bed_path_expected = (shared_datadir / plink_in).with_suffix(".bed")
    bed_path_actual = path.with_suffix(".bed")
    assert cmp(bed_path_expected, bed_path_actual)

    # check bim files are the same
    bim_expected = read_bim((shared_datadir / plink_in).with_suffix(".bim")).compute()
    bim_actual = read_bim(path.with_suffix(".bim")).compute()
    pd.testing.assert_frame_equal(bim_expected, bim_actual)

    # check fam files are the same
    fam_expected = read_fam(
        (shared_datadir / plink_in).with_suffix(".fam"), sep=fam_sep
    ).compute()
    fam_actual = read_fam(path.with_suffix(".fam")).compute()
    pd.testing.assert_frame_equal(fam_expected, fam_actual)


@pytest.mark.parametrize(
    "plink_in, fam_sep",
    [
        (example_dataset_1, "\t"),
        (example_dataset_2, " "),
        (example_dataset_3, " "),
    ],
)
def test_zarr_to_plink(shared_datadir, tmp_path, plink_in, fam_sep):
    # read plink file as a zarr file then write it out again
    zarr_path = tmp_path / "plink.zarr"
    plink_to_zarr(path=shared_datadir / plink_in, output=zarr_path, fam_sep=fam_sep)
    path = tmp_path / "plink_out"
    path.mkdir(parents=True, exist_ok=False)
    zarr_to_plink(zarr_path, path=path)

    # check bed files are the same
    bed_path_expected = (shared_datadir / plink_in).with_suffix(".bed")
    bed_path_actual = path.with_suffix(".bed")
    assert cmp(bed_path_expected, bed_path_actual)

    # check bim files are the same
    bim_expected = read_bim((shared_datadir / plink_in).with_suffix(".bim")).compute()
    bim_actual = read_bim(path.with_suffix(".bim")).compute()
    pd.testing.assert_frame_equal(bim_expected, bim_actual)

    # check fam files are the same
    fam_expected = read_fam(
        (shared_datadir / plink_in).with_suffix(".fam"), sep=fam_sep
    ).compute()
    fam_actual = read_fam(path.with_suffix(".fam")).compute()
    pd.testing.assert_frame_equal(fam_expected, fam_actual)


def test_raise_on_both_path_types(ds1):
    # `path` and the individual `{bed,bim,fam}_path` arguments are exclusive
    with pytest.raises(
        ValueError,
        match="Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both",
    ):
        write_plink(ds1, path="x", bed_path="x")


def test_genotype_inputs_checks():
    # Writer only supports diploid, biallelic genotypes
    g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
    with pytest.raises(
        ValueError, match="write_plink only works for diploid genotypes"
    ):
        write_plink(g_wrong_ploidy, path="x")

    g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
    with pytest.raises(
        ValueError, match="write_plink only works for biallelic genotypes"
    ):
        write_plink(g_non_biallelic, path="x")
-------------------------------------------------------------------------------- /sgkit/tests/io/test_dataset.py: --------------------------------------------------------------------------------
from typing import MutableMapping

import pytest
import xarray as xr
import zarr
from packaging.version import Version
from xarray import Dataset

from sgkit import load_dataset, save_dataset
from sgkit.testing import simulate_genotype_call_dataset


def assert_identical(ds1: Dataset, ds2: Dataset) -> None:
    """Assert two Datasets are identical, including dtypes for all variables."""
    xr.testing.assert_identical(ds1, ds2)
    assert all([ds1[v].dtype == ds2[v].dtype for v in ds1.data_vars])


@pytest.mark.parametrize(
    "is_path",
    [True, False],
)
def test_save_and_load_dataset(tmp_path, is_path):
    # Round-trip a dataset through zarr via both Path and str path arguments
    path = tmp_path / "ds.zarr"
    if not is_path:
        path = str(path)
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    save_dataset(ds, path)
    ds2 = load_dataset(path)
    assert_identical(ds, ds2)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    path2 = tmp_path / "ds2.zarr"
    if not is_path:
        path2 = str(path2)
    save_dataset(ds2, path2)
    assert_identical(ds, load_dataset(path2))


def test_save_and_load_dataset__mutable_mapping():
    # An in-memory dict works as a zarr store as well
    store: MutableMapping[str, bytes] = {}
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    save_dataset(ds, store)
    ds2 = load_dataset(store)
    assert_identical(ds, ds2)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    store2: MutableMapping[str, bytes] = {}
    save_dataset(ds2, store2)
    assert_identical(ds, load_dataset(store2))


def test_save_unequal_chunks_error():
    # Make all dimensions the same size for ease of testing
    ds = simulate_genotype_call_dataset(
        n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
    )
    # Normal zarr errors shouldn't be caught
    with pytest.raises(
        (FileExistsError, ValueError),
        match="(path '' contains an array|is not empty)",
    ):
        save_dataset(ds, {".zarray": ""})

    # Make the dataset have unequal chunk sizes across all dimensions
    ds = ds.chunk({dim: (1, 3, 5, 1) for dim in ds.sizes})

    # Check we get the sgkit error message
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {})

    # xarray gives a different error message when there are two chunks, so check that too
    ds = ds.chunk({dim: (4, 6) for dim in ds.sizes})
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {})


@pytest.mark.skipif(
    Version(zarr.__version__).major >= 3, reason="Fails for Zarr Python 3"
)
def test_save_auto_rechunk():
    # Make all dimensions the same size for ease of testing
    ds = simulate_genotype_call_dataset(
        n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
    )
    # Make the dataset have unequal chunk sizes across all dimensions
    ds = ds.chunk({dim: (1, 3, 5, 1) for dim in ds.sizes})

    # Default is to not rechunk
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {})

    # Rechunking off
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {}, auto_rechunk=False)

    store = {}
    save_dataset(ds, store, auto_rechunk=True)
    assert_identical(ds, load_dataset(store))

    # An equal chunked ds retains its original chunking
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    ds = ds.chunk({dim: 5 for dim in ds.sizes})
    store2 = {}
    save_dataset(ds, store2, auto_rechunk=True)
    ds_loaded = load_dataset(store2)
    assert_identical(ds, ds_loaded)
    assert ds_loaded.chunks == ds.chunks
-------------------------------------------------------------------------------- /sgkit/tests/test_cohort_numba_fns.py: --------------------------------------------------------------------------------
import dask.array as da
import numpy as np
import pytest

from sgkit.stats.cohort_numba_fns import (
    cohort_mean,
    cohort_nanmean,
    cohort_nansum,
    cohort_sum,
)


def _random_cohort_data(chunks, n, axis, missing=0.0, scale=1, dtype=float, seed=0):
    # Build a random dask array with the given chunking plus a random
    # cohort assignment (-1 means "no cohort") along `axis`
    shape = tuple(np.sum(tup) for tup in chunks)
    np.random.seed(seed)
    x = np.random.rand(*shape) * scale
    idx = np.random.choice([1, 0], shape, p=[missing, 1 - missing]).astype(bool)
    x[idx] = np.nan
    x = da.asarray(x, chunks=chunks, dtype=dtype)
    cohort = np.random.randint(-1, n, size=shape[axis])
    return x, cohort, n, axis


def _cohort_reduction(func, x, cohort, n, axis=-1):
    # reference implementation
    out = []
    for i in range(n):
        idx = np.where(cohort == i)[0]
        x_c = np.take(x, idx, axis=axis)
        out.append(func(x_c, axis=axis))
    out = np.swapaxes(np.array(out), 0, axis)
    return out


@pytest.mark.parametrize(
    "x, cohort, n, axis",
    [
        _random_cohort_data((20,), n=3, axis=0),
        _random_cohort_data((20, 20), n=2, axis=0, dtype=np.float32),
        _random_cohort_data((10, 10), n=2, axis=-1, scale=30,
dtype=np.int16),
        _random_cohort_data((20, 20), n=3, axis=-1, missing=0.3),
        _random_cohort_data((7, 103, 4), n=5, axis=1, scale=7, missing=0.3),
        _random_cohort_data(
            ((3, 4), (50, 50, 3), 4), n=5, axis=1, scale=7, dtype=np.uint8
        ),
        _random_cohort_data(
            ((6, 6), (50, 50, 7), (3, 1)), n=5, axis=1, scale=7, missing=0.3
        ),
    ],
)
@pytest.mark.parametrize(
    "reduction, func",
    [
        (cohort_sum, np.sum),
        (cohort_nansum, np.nansum),
        (cohort_mean, np.mean),
        (cohort_nanmean, np.nanmean),
    ],
)
def test_cohort_reductions(reduction, func, x, cohort, n, axis):
    # Compare each numba cohort reduction against the plain numpy reference
    expect = _cohort_reduction(func, x, cohort, n, axis=axis)
    actual = reduction(x, cohort, n, axis=axis)
    np.testing.assert_array_almost_equal(expect, actual)
-------------------------------------------------------------------------------- /sgkit/tests/test_cohorts.py: --------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import pytest

from sgkit.cohorts import _cohorts_to_array, _tuple_len


def test_tuple_len():
    assert _tuple_len(tuple()) == 0
    assert _tuple_len(1) == 1
    assert _tuple_len("a") == 1
    assert _tuple_len("ab") == 1
    assert _tuple_len((1,)) == 1
    assert _tuple_len(("a",)) == 1
    assert _tuple_len(("ab",)) == 1
    assert _tuple_len((1, 2)) == 2
    assert _tuple_len(("a", "b")) == 2
    assert _tuple_len(("ab", "cd")) == 2


def test_cohorts_to_array__indexes():
    # Cohorts given as integer indexes
    with pytest.raises(ValueError, match="Cohort tuples must all be the same length"):
        _cohorts_to_array([(0, 1), (0, 1, 2)])

    np.testing.assert_equal(_cohorts_to_array([]), np.array([]))
    np.testing.assert_equal(_cohorts_to_array([0, 1]), np.array([[0], [1]]))
    np.testing.assert_equal(
        _cohorts_to_array([(0, 1), (2, 1)]), np.array([[0, 1], [2, 1]])
    )
    np.testing.assert_equal(
        _cohorts_to_array([(0, 1, 2), (3, 1, 2)]), np.array([[0, 1, 2], [3, 1, 2]])
    )


def test_cohorts_to_array__ids():
    # Cohorts given as string identifiers resolved against an index
    with pytest.raises(ValueError, match="Cohort tuples must all be the same length"):
        _cohorts_to_array([("c0", "c1"), ("c0", "c1", "c2")])

    np.testing.assert_equal(_cohorts_to_array([]), np.array([]))
    np.testing.assert_equal(
        _cohorts_to_array(["c0", "c1"], pd.Index(["c0", "c1"])),
        np.array([[0], [1]]),
    )
    np.testing.assert_equal(
        _cohorts_to_array([("c0", "c1"), ("c2", "c1")], pd.Index(["c0", "c1", "c2"])),
        np.array([[0, 1], [2, 1]]),
    )
    np.testing.assert_equal(
        _cohorts_to_array(
            [("c0", "c1", "c2"), ("c3", "c1", "c2")], pd.Index(["c0", "c1", "c2", "c3"])
        ),
        np.array([[0, 1, 2], [3, 1, 2]]),
    )
-------------------------------------------------------------------------------- /sgkit/tests/test_genee.py: --------------------------------------------------------------------------------
import numpy as np
import numpy.testing as npt
import pandas as pd

from sgkit import genee
from sgkit.model import create_genotype_call_dataset
from sgkit.utils import encode_array


def test_genee(datadir):
    # Simulated test data was created using https://github.com/ramachandran-lab/genee
    #
    # Edit Simulation.R to create a smaller dataset:
    #
    # -ngene=100; min_gsize=5; max_gsize=20; nsnp_intergenic=400
    # +ngene=5; min_gsize=5; max_gsize=10; nsnp_intergenic=20
    #
    # Then run
    # R --vanilla < Simulation.R
    #
    # Followed by
    #
    # library("genee")
    # load("Simulated_LD.RData")
    # load("Simulated_Summary_Statistics.RData")
    # load("gene_list.RData")
    # # use alpha = -1 for OLS
    # result = genee(mydata, ld, alpha = -1, gene_list = gene_list)
    # write.csv(mydata, "/path/to/sgkit/sgkit/tests/test_genee/mydata.csv")
    #
# write.csv(ld, "/path/to/sgkit/sgkit/tests/test_genee/ld.csv")
    # write.csv(result, "/path/to/sgkit/sgkit/tests/test_genee/result.csv")
    # write.csv(t(sapply(gene_list, unlist)), "/path/to/sgkit/sgkit/tests/test_genee/gene_list.csv")

    mydata = pd.read_csv(datadir / "mydata.csv", index_col=0)
    ld = pd.read_csv(datadir / "ld.csv", index_col=0)

    # This was extracted from gene_list.csv
    gene_list = "1:7,8:14,15:19,20:25,26:35"
    gene_list = [[int(s) for s in ss.split(":")] for ss in gene_list.split(",")]
    gene_start, gene_stop = list(zip(*gene_list))
    gene_start = np.array(gene_start) - 1  # make 0-based
    gene_stop = np.array(gene_stop)

    ds = to_sgkit(mydata)

    # turn ld into an array
    ld = ld.to_numpy()

    # genes are windows in this simple example
    ds["window_contig"] = (["windows"], np.full(len(gene_start), 0))
    ds["window_start"] = (["windows"], gene_start)
    ds["window_stop"] = (["windows"], gene_stop)

    df = genee(ds, ld).compute()

    expected = pd.read_csv(datadir / "result.csv", index_col=0)
    expected = expected.reset_index()

    npt.assert_allclose(df["test_q"], expected["test_q"])
    npt.assert_allclose(df["q_var"], expected["q_var"], rtol=0.01)
    npt.assert_allclose(
        df[df["pval"] > 1e-6]["pval"],
        expected[expected["pval"] > 1e-6]["pval"],
        rtol=0.04,
    )


def to_sgkit(mydata):
    """Convert summary stats produced by genee R package to sgkit dataset"""
    variant_contig, variant_contig_names = encode_array(mydata.V1.to_numpy())
    variant_contig = variant_contig.astype("int16")
    variant_contig_names = [str(contig) for contig in variant_contig_names]
    variant_position = mydata.V3.to_numpy()
    variant_id = mydata.V2.to_numpy()
    variant_allele = np.array([["A"]] * len(variant_contig), dtype="S1")  # not used
    sample_id = ["SAMPLE1"]
    ds = create_genotype_call_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_allele=variant_allele,
        sample_id=sample_id,
        variant_id=variant_id,
    )
    ds["beta"] = (["variants"], mydata.V4.to_numpy())
    return ds
-------------------------------------------------------------------------------- /sgkit/tests/test_genee/gene_list.csv: --------------------------------------------------------------------------------
"","V1","V2","V3","V4","V5"
"1",1:7,8:14,15:19,20:25,26:35
-------------------------------------------------------------------------------- /sgkit/tests/test_genee/mydata.csv: --------------------------------------------------------------------------------
"","V1","V2","V3","V4"
"1","1","rs1","1","0.0589981193774866"
"2","1","rs2","2","-0.0479887297468593"
"3","1","rs3","3","0.00755718359618298"
"4","1","rs4","4","0.0403655126478955"
"5","1","rs5","5","0.00386474332262596"
"6","1","rs6","6","-0.00906859548746008"
"7","1","rs7","7","0.0116812908619256"
"8","1","rs8","8","0.0403476488344958"
"9","1","rs9","9","0.0429894641101545"
"10","1","rs10","10","0.033776804193074"
"11","1","rs11","11","0.0162144868632937"
"12","1","rs12","12","-0.0151074458113783"
"13","1","rs13","13","0.0273565177791348"
"14","1","rs14","14","-0.0587960969211619"
"15","1","rs15","15","0.132980436833378"
"16","1","rs16","16","-0.0394732368188627"
"17","1","rs17","17","-0.0713981147997422"
"18","1","rs18","18","0.0100579967683699"
"19","1","rs19","19","0.0314712686930575"
"20","1","rs20","20","-0.022557235722201"
"21","1","rs21","21","-0.0514902521917086"
"22","1","rs22","22","-0.380823961633008"
"23","1","rs23","23","0.0376285736111005"
"24","1","rs24","24","0.0813696952243066"
"25","1","rs25","25","-0.0600195191576378"
"26","1","rs26","26","0.0763255078792159"
"27","1","rs27","27","-0.0157340586805173" 29 | "28","1","rs28","28","0.0053491779420878" 30 | "29","1","rs29","29","-0.0230096268786541" 31 | "30","1","rs30","30","-0.0166889799904788" 32 | "31","1","rs31","31","0.0440867194984021" 33 | "32","1","rs32","32","-0.0271740281102383" 34 | "33","1","rs33","33","0.0769936887394494" 35 | "34","1","rs34","34","0.0515012035053683" 36 | "35","1","rs35","35","0.11405453000561" 37 | "36","1","rs36","36","-0.0623497642279973" 38 | "37","1","rs37","37","-0.0439702026965466" 39 | "38","1","rs38","38","-0.0148583809493149" 40 | "39","1","rs39","39","0.0425562446140515" 41 | "40","1","rs40","40","0.0376324933539782" 42 | "41","1","rs41","41","0.15247764456262" 43 | "42","1","rs42","42","-0.0849894739154603" 44 | "43","1","rs43","43","0.0417361653291142" 45 | "44","1","rs44","44","0.057569406750715" 46 | "45","1","rs45","45","0.0155708994519841" 47 | "46","1","rs46","46","-0.212154240453643" 48 | "47","1","rs47","47","-0.302317171280787" 49 | "48","1","rs48","48","-0.317259261470206" 50 | "49","1","rs49","49","-0.0902619746076583" 51 | "50","1","rs50","50","0.0713954763102486" 52 | "51","1","rs51","51","-0.250385712358405" 53 | "52","1","rs52","52","0.0269857555070088" 54 | "53","1","rs53","53","0.0724172261358925" 55 | "54","1","rs54","54","-0.0115076932928384" 56 | "55","1","rs55","55","-0.241041012957552" 57 | -------------------------------------------------------------------------------- /sgkit/tests/test_genee/result.csv: -------------------------------------------------------------------------------- 1 | "","test_q","q_var","pval" 2 | "1",0.00770381012929696,5.97664199541393e-05,0.916111258824932 3 | "2",0.00931340387379375,5.97766731532365e-05,0.866617486805112 4 | "3",0.0254312278544544,4.2639105994213e-05,0.121151813035285 5 | "4",0.159826244240868,5.11662269290079e-05,5.26183874249853e-10 6 | "5",0.0311806269653318,8.54603570912353e-05,0.38271258751765 7 | 
class HbeTests(unittest.TestCase):
    """Tests for the hbe (Hall-Buckley-Eagleson) chi-square approximation.

    Ported from
    https://github.com/deanbodenham/momentchi2py/blob/master/tests/test_momentchi2.py
    """

    def _check_scalar(self, ans, soln, places=3):
        # Shared scalar comparison used by every single-value case.
        self.assertAlmostEqual(ans, soln, places=places, msg=None, delta=None)

    def _check_sequence(self, ans, soln, places=3):
        # Shared elementwise comparison used by every vector case.
        self.assertEqual(len(ans), len(soln))
        for got, want in zip(ans, soln):
            self.assertAlmostEqual(got, want, places=places, msg=None, delta=None)

    def test_hbe1(self):
        """hbe with x float, coeff list"""
        self._check_scalar(hbe([1.5, 1.5, 0.5, 0.5], 10.203), 0.949)

    def test_hbe2(self):
        """hbe with x float, coeff list, specifying arguments"""
        self._check_scalar(hbe(x=10.203, coeff=[1.5, 1.5, 0.5, 0.5]), 0.949)

    def test_hbe3(self):
        """hbe with x float, coeff list, specifying arguments"""
        self._check_scalar(hbe(x=0.627, coeff=[1.5, 1.5, 0.5, 0.5]), 0.0285)

    def test_hbe4(self):
        """hbe with x list, coeff list, specifying arguments"""
        ans = hbe(coeff=[1.5, 1.5, 0.5, 0.5], x=[0.627, 10.203])
        # A list input must produce a list output.
        self.assertTrue(isinstance(ans, list))
        # NOTE: this case uses a looser tolerance (2 places) than the others.
        self._check_sequence(ans, [0.0285, 0.949], places=2)

    def test_hbe5(self):
        """hbe with x float, coeff numpy array"""
        self._check_scalar(hbe(np.array([1.5, 1.5, 0.5, 0.5]), 10.203), 0.949)

    def test_hbe6(self):
        """hbe with x numpy array, coeff numpy array"""
        ans = hbe(np.array([1.5, 1.5, 0.5, 0.5]), np.array([0.627, 10.203]))
        self._check_sequence(ans, np.array([0.0285, 0.949]))

    def test_hbe7(self):
        """hbe with x numpy array one element, coeff numpy array"""
        ans = hbe(np.array([1.5, 1.5, 0.5, 0.5]), np.array([0.627]))
        self._check_sequence(ans, np.array([0.0285]))

    def test_hbe8(self):
        """hbe with x numpy array, coeff list"""
        ans = hbe([1.5, 1.5, 0.5, 0.5], np.array([0.627, 10.203]))
        self._check_sequence(ans, np.array([0.0285, 0.949]))

    def test_hbe9(self):
        """hbe with x list, coeff numpy array"""
        ans = hbe([1.5, 1.5, 0.5, 0.5], np.array([0.627, 10.203]))
        self._check_sequence(ans, np.array([0.0285, 0.949]))
0.35 0.35 0.35 0.43 0.35 0.26 0.34 0.39 2 | 0.00 1.00 0.18 0.18 0.18 0.18 0.18 0.18 0.50 0.35 0.35 0.35 0.43 0.35 0.26 0.34 0.39 3 | 0.18 0.18 1.00 0.00 0.18 0.18 0.18 0.18 0.35 0.50 0.35 0.35 0.43 0.35 0.18 0.30 0.39 4 | 0.18 0.18 0.00 1.00 0.18 0.18 0.18 0.18 0.35 0.50 0.35 0.35 0.43 0.35 0.68 0.55 0.39 5 | 0.18 0.18 0.18 0.18 1.00 0.00 0.18 0.18 0.35 0.35 0.50 0.35 0.35 0.43 0.34 0.34 0.39 6 | 0.18 0.18 0.18 0.18 0.00 1.00 0.18 0.18 0.35 0.35 0.50 0.35 0.35 0.43 0.34 0.34 0.39 7 | 0.18 0.18 0.18 0.18 0.18 0.18 1.00 0.00 0.35 0.35 0.35 0.50 0.35 0.43 0.26 0.31 0.39 8 | 0.18 0.18 0.18 0.18 0.18 0.18 0.00 1.00 0.35 0.35 0.35 0.50 0.35 0.43 0.26 0.31 0.39 9 | 0.50 0.50 0.35 0.35 0.35 0.35 0.35 0.35 1.00 0.70 0.70 0.70 0.85 0.70 0.53 0.69 0.78 10 | 0.35 0.35 0.50 0.50 0.35 0.35 0.35 0.35 0.70 1.00 0.70 0.70 0.85 0.70 0.60 0.73 0.78 11 | 0.35 0.35 0.35 0.35 0.50 0.50 0.35 0.35 0.70 0.70 1.00 0.70 0.70 0.85 0.68 0.69 0.78 12 | 0.35 0.35 0.35 0.35 0.35 0.35 0.50 0.50 0.70 0.70 0.70 1.00 0.70 0.85 0.53 0.61 0.78 13 | 0.43 0.43 0.43 0.43 0.35 0.35 0.35 0.35 0.85 0.85 0.70 0.70 1.35 0.70 0.56 0.96 1.03 14 | 0.35 0.35 0.35 0.35 0.43 0.43 0.43 0.43 0.70 0.70 0.85 0.85 0.70 1.35 0.60 0.65 1.03 15 | 0.26 0.26 0.18 0.68 0.34 0.34 0.26 0.26 0.53 0.60 0.68 0.53 0.56 0.60 1.18 0.87 0.58 16 | 0.34 0.34 0.30 0.55 0.34 0.34 0.31 0.31 0.69 0.73 0.69 0.61 0.96 0.65 0.87 1.41 0.80 17 | 0.39 0.39 0.39 0.39 0.39 0.39 0.39 0.39 0.78 0.78 0.78 0.78 1.03 1.03 0.58 0.80 1.53 18 | -------------------------------------------------------------------------------- /sgkit/tests/test_grm/Legara2009_pedigree.txt: -------------------------------------------------------------------------------- 1 | -1 -1 2 | -1 -1 3 | -1 -1 4 | -1 -1 5 | -1 -1 6 | -1 -1 7 | -1 -1 8 | -1 -1 9 | 0 1 10 | 2 3 11 | 4 5 12 | 6 7 13 | 8 9 14 | 10 11 15 | 3 10 16 | 12 14 17 | 12 13 18 | -------------------------------------------------------------------------------- /sgkit/tests/test_ibs/hierfstat.sim1.beta.txt: 
-------------------------------------------------------------------------------- 1 | 0.366651780276887 -0.221631277658236 -0.188745889326459 -0.227721164386342 0.0658113759084082 0.0743372173277576 -0.219195322966993 0.0779911493646216 0.0548495797978157 0.0889529454752141 0.0609394665259227 -0.0645122000730788 -0.164386342414031 -0.0486784945800009 0.00856644066420351 -0.0352807437781659 2 | -0.221631277658236 0.392229304534935 -0.168040274450895 -0.203361617473915 0.0962608095489423 0.0974787868945636 -0.18143802525273 -0.215541390930129 0.0986967642401851 0.0755551946733791 0.0926068775120781 -0.185091957289595 -0.191181844017701 -0.0632942227274573 -0.0657301774187001 -0.0730380414924283 3 | -0.188745889326459 -0.168040274450895 0.371523689659372 -0.211887458893265 -0.150988591612196 -0.169258251796517 0.0840810360927286 -0.219195322966993 -0.135154886119118 -0.141244772847225 -0.122975112662904 -0.0681661321099428 0.00856644066420351 -0.0584223133449718 -0.0973975884048558 -0.079127928220535 4 | -0.227721164386342 -0.203361617473915 -0.211887458893265 0.384921440461208 -0.235029028460071 -0.202143640128294 0.0767731720190004 0.0865169907839714 -0.196053753400187 -0.226503187040721 -0.208233526856401 0.0950428322033211 0.0999147415858066 -0.0584223133449718 -0.0511144492712436 -0.0291908570500589 5 | 0.0658113759084082 0.0962608095489423 -0.150988591612196 -0.235029028460071 0.405627055336771 0.0658113759084082 -0.199707685437051 -0.0839998376030205 0.247290000405993 0.239982136332264 0.242418091023507 -0.113231293897933 -0.170476229142138 0.0317080102310097 0.0597214891803012 0.039015874304738 6 | 0.0743372173277576 0.0974787868945636 -0.169258251796517 -0.202143640128294 0.0658113759084082 0.408063010028013 -0.188745889326459 -0.0937436563679915 0.237546181641022 0.232674272258536 0.237546181641022 -0.140026795501604 -0.180220047907109 0.00856644066420351 0.0426698063416019 0.0110023953554463 7 | -0.219195322966993 -0.18143802525273 0.0840810360927286 
0.0767731720190004 -0.199707685437051 -0.188745889326459 0.394665259226179 -0.0973975884048558 -0.182656002598352 -0.199707685437051 -0.176566115870245 0.162031586212496 0.285047298120255 0.0536316024521944 0.0110023953554463 0.0670293532540294 8 | 0.0779911493646216 -0.215541390930129 -0.219195322966993 0.0865169907839714 -0.0839998376030205 -0.0937436563679915 -0.0973975884048558 0.398319191263042 -0.0754739961836711 -0.0803459055661565 -0.0827818602573993 0.148633835410661 0.02318216881166 -0.0304088343956804 0.0414518289959807 0.0146563273923105 9 | 0.0548495797978157 0.0986967642401851 -0.135154886119118 -0.196053753400187 0.247290000405993 0.237546181641022 -0.182656002598352 -0.0754739961836711 0.527424789898908 0.247290000405993 0.391011327189314 -0.112013316552312 -0.153424546303439 0.108440583005156 0.125492265843855 0.106004628313913 10 | 0.0889529454752141 0.0755551946733791 -0.141244772847225 -0.226503187040721 0.239982136332264 0.232674272258536 -0.199707685437051 -0.0803459055661565 0.247290000405993 0.521334903170801 0.373959644350615 -0.138808818155982 -0.188745889326459 0.0804271040558646 0.116966424424506 0.0852990134383501 11 | 0.0609394665259227 0.0926068775120781 -0.122975112662904 -0.208233526856401 0.242418091023507 0.237546181641022 -0.176566115870245 -0.0827818602573993 0.391011327189314 0.373959644350615 0.629734886931103 -0.121757135317283 -0.170476229142138 0.21805854411108 0.232674272258536 0.207096748000487 12 | -0.0645122000730788 -0.185091957289595 -0.0681661321099428 0.0950428322033211 -0.113231293897933 -0.140026795501604 0.162031586212496 0.148633835410661 -0.112013316552312 -0.138808818155982 -0.121757135317283 0.468961877309082 0.315496731760789 0.0865169907839714 0.187609110470545 0.171775404977467 13 | -0.164386342414031 -0.191181844017701 0.00856644066420351 0.0999147415858066 -0.170476229142138 -0.180220047907109 0.285047298120255 0.02318216881166 -0.153424546303439 -0.188745889326459 -0.170476229142138 0.315496731760789 
0.596849498599326 0.20953270269173 0.103568673622671 0.202224838618002 14 | -0.0486784945800009 -0.0632942227274573 -0.0584223133449718 -0.0584223133449718 0.0317080102310097 0.00856644066420351 0.0536316024521944 -0.0304088343956804 0.108440583005156 0.0804271040558646 0.21805854411108 0.0865169907839714 0.20953270269173 0.401973123299906 0.149851812756283 0.286265275465876 15 | 0.00856644066420351 -0.0657301774187001 -0.0973975884048558 -0.0511144492712436 0.0597214891803012 0.0426698063416019 0.0110023953554463 0.0414518289959807 0.125492265843855 0.116966424424506 0.232674272258536 0.187609110470545 0.103568673622671 0.149851812756283 0.437294466322926 0.30697089034144 16 | -0.0352807437781659 -0.0730380414924283 -0.079127928220535 -0.0291908570500589 0.039015874304738 0.0110023953554463 0.0670293532540294 0.0146563273923105 0.106004628313913 0.0852990134383501 0.207096748000487 0.171775404977467 0.202224838618002 0.286265275465876 0.30697089034144 0.590759611871219 -------------------------------------------------------------------------------- /sgkit/tests/test_ibs/hierfstat.sim2.beta.txt: -------------------------------------------------------------------------------- 1 | 0.41475192333518 -0.197024512031376 0.10099131722595 2 | -0.197024512031376 0.407868471660385 0.0960331948054254 3 | 0.10099131722595 0.0960331948054254 0.40829863307675 -------------------------------------------------------------------------------- /sgkit/tests/test_ibs/hierfstat.sim3.beta.txt: -------------------------------------------------------------------------------- 1 | nan nan nan 2 | nan 0.34496319451447 0 3 | nan 0 0.34543905426274 -------------------------------------------------------------------------------- /sgkit/tests/test_import_star.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # Basic test to ensure we can import * and the the __all__ array is 3 | # well formed. 
def plink_mis(idi: "NDArray", idj: "NDArray", cmp: "Optional[NDArray]" = None) -> "List[int]":
    """Reference maximal-independent-set pruning, ported from PLINK.

    Direct port of
    https://groups.google.com/forum/#!msg/plink2-users/w5TuZo2fgsQ/WbNnE16_xDIJ

    Parameters
    ----------
    idi, idj
        Edge endpoint arrays; edge ``k`` connects vertices ``idi[k]`` and
        ``idj[k]``.
    cmp
        Optional per-edge comparison values. A negative value means the
        first endpoint of the edge loses; otherwise the second endpoint
        loses. Defaults to all zeros (second endpoint always loses).

    Returns
    -------
    Sorted list of vertex ids removed ("lost") from the graph.
    """
    # Local import keeps this helper self-contained; previously this
    # grouping used the third-party ``toolz.groupby``, which the stdlib
    # ``defaultdict`` replaces with identical semantics (insertion order
    # within each group is preserved).
    from collections import defaultdict

    if cmp is None:
        cmp = np.zeros(len(idi))
    lost = set()
    grps = defaultdict(list)
    for edge in zip(idi, idj, cmp):
        grps[edge[0]].append(edge)
    for i in sorted(grps.keys()):
        if i in lost:
            continue
        for t in sorted(grps[i]):
            j, c = t[1:]
            if j <= i:
                # Skip self-loops and edges already seen from the other side.
                continue
            if c < 0:
                # This vertex compares unfavorably: it loses, stop scanning.
                lost.add(i)
                break
            else:
                lost.add(j)
    return sorted(lost)
@pytest.mark.parametrize("mis", mis_fns)
def test_path_graph(mis):
    """MIS behaviour on the 3-node path A-B and B-C under each comparison weighting."""
    edges_i, edges_j = to_vertex_ids(nx.path_graph(3))
    # No comparison values: only the middle node is lost, first and third kept.
    assert mis(edges_i, edges_j) == [1]
    # Favoring later nodes: only the third node is kept.
    assert mis(edges_i, edges_j, cmp=np.array([-1, -1])) == [0, 1]
    # Favoring earlier nodes: the middle node is lost.
    assert mis(edges_i, edges_j, cmp=np.array([1, 1])) == [1]
    # Middle node largest: first and third are lost.
    assert mis(edges_i, edges_j, cmp=np.array([-1, 1])) == [0, 2]
/sgkit/tests/test_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_array_equal 3 | 4 | from sgkit import ( 5 | DIM_ALLELE, 6 | DIM_PLOIDY, 7 | DIM_SAMPLE, 8 | DIM_VARIANT, 9 | __version__, 10 | create_genotype_call_dataset, 11 | create_genotype_dosage_dataset, 12 | display_genotypes, 13 | ) 14 | 15 | 16 | def test_create_genotype_call_dataset(): 17 | variant_contig_names = ["chr1"] 18 | variant_contig = np.array([0, 0], dtype="i1") 19 | variant_position = np.array([1000, 2000], dtype="i4") 20 | variant_allele = np.array([["A", "C"], ["G", "A"]], dtype="S1") 21 | variant_id = np.array(["rs1", "rs2"], dtype=str) 22 | sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) 23 | call_genotype = np.array( 24 | [[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]], dtype="i1" 25 | ) 26 | call_genotype_phased = np.array( 27 | [[True, True, False], [True, False, False]], dtype=bool 28 | ) 29 | ds = create_genotype_call_dataset( 30 | variant_contig_names=variant_contig_names, 31 | variant_contig=variant_contig, 32 | variant_position=variant_position, 33 | variant_allele=variant_allele, 34 | sample_id=sample_id, 35 | call_genotype=call_genotype, 36 | call_genotype_phased=call_genotype_phased, 37 | variant_id=variant_id, 38 | ) 39 | 40 | assert DIM_VARIANT in ds.sizes 41 | assert DIM_SAMPLE in ds.sizes 42 | assert DIM_PLOIDY in ds.sizes 43 | assert DIM_ALLELE in ds.sizes 44 | 45 | assert ds.attrs["source"] == f"sgkit-{__version__}" 46 | assert_array_equal(ds["contig_id"], variant_contig_names) 47 | assert_array_equal(ds["variant_contig"], variant_contig) 48 | assert_array_equal(ds["variant_position"], variant_position) 49 | assert_array_equal(ds["variant_allele"], variant_allele) 50 | assert_array_equal(ds["variant_id"], variant_id) 51 | assert_array_equal(ds["sample_id"], sample_id) 52 | assert_array_equal(ds["call_genotype"], call_genotype) 53 | 
def test_create_genotype_dosage_dataset():
    """Inputs to create_genotype_dosage_dataset should round-trip into the dataset."""
    contig_names = ["chr1"]
    contig = np.array([0, 0], dtype="i1")
    position = np.array([1000, 2000], dtype="i4")
    allele = np.array([["A", "C"], ["G", "A"]], dtype="S1")
    rs_ids = np.array(["rs1", "rs2"], dtype=str)
    samples = np.array(["sample_1", "sample_2", "sample_3"], dtype=str)
    dosage = np.array([[0.8, 0.9, np.nan], [1.0, 1.1, 1.2]], dtype="f4")
    probability = np.array(
        [
            [[0.1, 0.5, 0.4], [0.2, 0.2, 0.6], [np.nan, np.nan, np.nan]],
            [[0.1, 0.5, 0.4], [0.2, 0.2, 0.6], [0.3, 0.1, 0.6]],
        ],
        dtype="f4",
    )
    ds = create_genotype_dosage_dataset(
        variant_contig_names=contig_names,
        variant_contig=contig,
        variant_position=position,
        variant_allele=allele,
        sample_id=samples,
        call_dosage=dosage,
        call_genotype_probability=probability,
        variant_id=rs_ids,
    )

    # Both core dimensions must be present.
    assert DIM_VARIANT in ds.sizes
    assert DIM_SAMPLE in ds.sizes

    # Every input array should appear unchanged in the dataset.
    assert_array_equal(ds["contig_id"], contig_names)
    assert_array_equal(ds["variant_contig"], contig)
    assert_array_equal(ds["variant_position"], position)
    assert_array_equal(ds["variant_allele"], allele)
    assert_array_equal(ds["variant_id"], rs_ids)
    assert_array_equal(ds["sample_id"], samples)
    assert_array_equal(ds["call_dosage"], dosage)
    # Mask variables are derived from NaN positions in the inputs.
    assert_array_equal(ds["call_dosage_mask"], np.isnan(dosage))
    assert_array_equal(ds["call_genotype_probability"], probability)
    assert_array_equal(
        ds["call_genotype_probability_mask"], np.isnan(probability)
    )
-1.14865097063812 3.24295388830658 0 -1.13070701800157 -1.48939682975337 0 0 4 | -24.3902439024391 0 0 24.3902439024391 0 0 0 0 5 | -1.13070701800157 0 -1.13070701800157 0 2.26141403600313 0 0 0 6 | -1.48939682975337 0.868552171058725 -1.48939682975337 0 0 2.97487736745144 -0.868552171058725 -0.868552171058725 7 | 0 -0.868552171058725 0 0 0 -0.868552171058725 1.73710434211745 0 8 | 0 -0.868552171058725 0 0 0 -0.868552171058725 0 1.73710434211745 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_inbreeding.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 0.0770650669127826 3 | 0.231195200738348 4 | 0.041 5 | 0 6 | 0.196786032918013 7 | 0.196285927754704 8 | 0.196285927754704 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_kinship.txt: -------------------------------------------------------------------------------- 1 | 0.500000000000001 -2.26083190537432e-17 -2.87742242502186e-17 0.500000000000001 0.25 0.25 0.125 0.125 2 | 1.20724902574995e-17 0.307798800184587 0.307798800184587 1.20724902574995e-17 0.153899400092293 0.153899400092293 0.23084910013844 0.23084910013844 3 | 4.24280452978546e-17 0.307798800184587 0.615597600369174 4.24280452978546e-17 0.307798800184587 0.307798800184587 0.307798800184587 0.307798800184587 4 | 0.500000000000001 -2.68905032575161e-17 -3.58540043433548e-17 0.520500000000001 0.25 0.25 0.125 0.125 5 | 0.25 0.153899400092293 0.307798800184587 0.25 0.5 0.278899400092294 0.216399400092293 0.216399400092293 6 | 0.25 0.153899400092293 0.307798800184587 0.25 0.278899400092294 0.39758952468851 0.275744462390402 0.275744462390402 7 | 0.125 0.23084910013844 0.307798800184587 0.125 0.216399400092294 0.275744462390402 0.397214445816028 0.253296781264421 8 | 0.125 0.23084910013844 0.307798800184587 0.125 0.216399400092294 0.275744462390402 
0.253296781264421 0.397214445816028 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_kinship_inv.txt: -------------------------------------------------------------------------------- 1 | 54.0175200192724 0 3.23703221439428 -48.7804878048781 -2.26141403600313 -4.21265039278542 0 0 2 | 0 9.97195980867268 -3.24887556221889 0 0 3.4742086842349 -3.4742086842349 -3.4742086842349 3 | 3.23703221439428 -3.24887556221889 6.48590777661317 0 -2.26141403600313 -4.21265039278542 0 0 4 | -48.7804878048781 0 0 48.7804878048781 0 0 0 0 5 | -2.26141403600313 0 -2.26141403600313 0 4.52282807200626 0 0 0 6 | -4.21265039278542 3.4742086842349 -4.21265039278542 0 0 11.8995094698057 -3.4742086842349 -3.4742086842349 7 | 0 -3.4742086842349 0 0 0 -3.4742086842349 6.9484173684698 0 8 | 0 -3.4742086842349 0 0 0 -3.4742086842349 0 6.9484173684698 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_pedigree.csv: -------------------------------------------------------------------------------- 1 | "INDIV.ID","SIRE.ID","DAM.ID","SIRE.GAMETE.PLOIDY","SIRE.LAMBDA","DAM.GAMETE.PLOIDY","DAM.LAMBDA","SIRE.SEGREGATION","DAM.SEGREGATION","INDIV.PLOIDY" 2 | 1,0,0,1,0,1,0,"Normal","Normal","Diploid" 3 | 2,0,0,2,0.167,2,0.167,"Normal","Normal","Tetraploid" 4 | 3,0,2,0,0,2,0.167,"NA","Normal","Diploid" 5 | 4,1,0,2,0.041,0,0,"First division restitution","NA","Diploid" 6 | 5,1,3,1,0,1,0,"Normal","Normal","Diploid" 7 | 6,1,3,2,0.918,2,0.041,"Second division restitution","First division restitution","Tetraploid" 8 | 7,6,2,2,0.167,2,0.167,"Normal","Normal","Tetraploid" 9 | 8,6,2,2,0.167,2,0.167,"Normal","Normal","Tetraploid" 10 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/kinship2_pedigree.csv: -------------------------------------------------------------------------------- 1 | 
"","ped","id","father","mother","sex","affected","avail" 2 | "1",1,101,0,0,1,0,0 3 | "2",1,102,0,0,2,1,0 4 | "3",1,103,135,136,1,1,0 5 | "4",1,104,0,0,2,0,0 6 | "5",1,105,0,0,1,NA,0 7 | "6",1,106,0,0,2,NA,0 8 | "7",1,107,0,0,1,1,0 9 | "8",1,108,0,0,2,0,0 10 | "9",1,109,101,102,2,0,1 11 | "10",1,110,103,104,1,1,1 12 | "11",1,111,103,104,2,1,0 13 | "12",1,112,103,104,1,1,0 14 | "13",1,113,0,0,2,0,1 15 | "14",1,114,103,104,1,1,0 16 | "15",1,115,105,106,2,0,0 17 | "16",1,116,105,106,2,1,1 18 | "17",1,117,0,0,1,1,0 19 | "18",1,118,105,106,2,1,1 20 | "19",1,119,105,106,1,1,1 21 | "20",1,120,107,108,2,0,0 22 | "21",1,121,110,109,1,1,0 23 | "22",1,122,110,109,2,0,0 24 | "23",1,123,110,109,2,0,0 25 | "24",1,124,110,109,1,1,1 26 | "25",1,125,112,118,2,0,1 27 | "26",1,126,112,118,2,0,1 28 | "27",1,127,114,115,1,1,1 29 | "28",1,128,114,115,1,1,1 30 | "29",1,129,117,116,1,0,1 31 | "30",1,130,119,120,1,0,1 32 | "31",1,131,119,120,1,1,0 33 | "32",1,132,119,120,1,0,0 34 | "33",1,133,119,120,2,0,1 35 | "34",1,134,119,120,2,1,0 36 | "35",1,135,0,0,1,NA,0 37 | "36",1,136,0,0,2,NA,0 38 | "37",1,137,0,0,1,NA,0 39 | "38",1,138,135,136,2,NA,0 40 | "39",1,139,137,138,1,1,0 41 | "40",1,140,137,138,2,0,1 42 | "41",1,141,137,138,2,0,1 43 | "42",2,201,0,0,1,1,1 44 | "43",2,202,0,0,2,NA,0 45 | "44",2,203,0,0,1,1,1 46 | "45",2,204,201,202,2,0,1 47 | "46",2,205,201,202,1,NA,0 48 | "47",2,206,201,202,2,1,1 49 | "48",2,207,201,202,2,1,1 50 | "49",2,208,201,202,2,0,0 51 | "50",2,209,0,0,1,0,0 52 | "51",2,210,203,204,1,0,0 53 | "52",2,211,203,204,1,0,1 54 | "53",2,212,209,208,2,0,1 55 | "54",2,213,209,208,1,0,0 56 | "55",2,214,209,208,1,1,1 57 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/pedkin_sim_founder.txt: -------------------------------------------------------------------------------- 1 | 1 1 1 0.0625 2 | 1 2 1 0.03125 3 | 1 2 2 0.0 4 | 1 3 1 0.078125 5 | 1 3 2 0.0625 6 | 1 3 3 0.0 7 | 1 4 1 0.1015625 8 | 1 4 2 0.0 9 | 1 4 3 0.0 
10 | 1 4 4 0.0 11 | 1 5 1 0.1171875 12 | 1 5 2 0.03125 13 | 1 5 3 0.25 14 | 1 5 4 0.09375 15 | 1 5 5 0.0 16 | 1 6 1 0.15234375 17 | 1 6 2 0.015625 18 | 1 6 3 0.0078125 19 | 1 6 4 0.18359375 20 | 1 6 5 0.08203125 21 | 1 6 6 0.078125 22 | 1 7 1 0.1015625 23 | 1 7 2 0.015625 24 | 1 7 3 0.140625 25 | 1 7 4 0.0546875 26 | 1 7 5 0.1015625 27 | 1 7 6 0.060546875 28 | 1 7 7 0.0 29 | 1 8 1 0.10546875 30 | 1 8 2 0.0625 31 | 1 8 3 0.15625 32 | 1 8 4 0.0625 33 | 1 8 5 0.09375 34 | 1 8 6 0.080078125 35 | 1 8 7 0.15625 36 | 1 8 8 0.0 37 | 1 9 1 0.0673828125 38 | 1 9 2 0.0 39 | 1 9 3 0.0390625 40 | 1 9 4 0.0703125 41 | 1 9 5 0.078125 42 | 1 9 6 0.07568359375 43 | 1 9 7 0.068359375 44 | 1 9 8 0.015625 45 | 1 9 9 0.125 46 | 1 10 1 0.015625 47 | 1 10 2 0.375 48 | 1 10 3 0.03125 49 | 1 10 4 0.0 50 | 1 10 5 0.015625 51 | 1 10 6 0.0078125 52 | 1 10 7 0.0078125 53 | 1 10 8 0.03125 54 | 1 10 9 0.0 55 | 1 10 10 0.25 56 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/pedkin_sim_interest.txt: -------------------------------------------------------------------------------- 1 | 1 1 2 | 1 2 3 | 1 3 4 | 1 4 5 | 1 5 6 | 1 6 7 | 1 7 8 | 1 8 9 | 1 9 10 | 1 10 11 | 1 11 12 | 1 12 13 | 1 13 14 | 1 14 15 | 1 15 16 | 1 16 17 | 1 17 18 | 1 18 19 | 1 19 20 | 1 20 21 | 1 21 22 | 1 22 23 | 1 23 24 | 1 24 25 | 1 25 26 | 1 26 27 | 1 27 28 | 1 28 29 | 1 29 30 | 1 30 31 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/pedkin_sim_ped.txt: -------------------------------------------------------------------------------- 1 | 1 1 0 0 1 0 2 | 1 2 0 0 2 0 3 | 1 3 0 0 1 0 4 | 1 4 0 0 2 0 5 | 1 5 0 0 1 0 6 | 1 6 0 0 2 0 7 | 1 7 0 0 1 0 8 | 1 8 0 0 2 0 9 | 1 9 0 0 1 0 10 | 1 10 0 0 2 0 11 | 1 11 7 10 1 0 12 | 1 12 1 4 2 0 13 | 1 13 7 12 1 0 14 | 1 14 1 2 2 0 15 | 1 15 3 10 1 0 16 | 1 16 15 12 2 0 17 | 1 17 9 14 1 0 18 | 1 18 5 10 2 0 19 | 1 19 11 6 1 0 20 | 1 20 9 6 2 0 21 | 1 21 9 16 1 0 22 | 1 22 
15 20 2 0 23 | 1 23 3 16 1 0 24 | 1 24 1 14 2 0 25 | 1 25 19 20 1 0 26 | 1 26 15 14 2 0 27 | 1 27 19 4 1 0 28 | 1 28 1 26 2 0 29 | 1 29 3 18 1 0 30 | 1 30 17 28 2 0 31 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/config.yml: -------------------------------------------------------------------------------- 1 | datasets: 2 | sim_sm_01: 3 | n_contigs: 1 4 | n_covars: 3 5 | n_samples: 50 6 | n_traits: 1 7 | n_variants: 250 8 | sim_sm_02: 9 | n_contigs: 10 10 | n_covars: 3 11 | n_samples: 50 12 | n_traits: 5 13 | n_variants: 250 14 | paramsets: 15 | wgr_01: 16 | alphas: 17 | - 1000 18 | sample_block_size: 10 19 | variant_block_size: 10 20 | wgr_02: 21 | alphas: null 22 | sample_block_size: 10 23 | variant_block_size: 10 24 | runs: 25 | - dataset: sim_sm_01 26 | name: sim_sm_01-wgr_01 27 | paramset: wgr_01 28 | - dataset: sim_sm_02 29 | name: sim_sm_02-wgr_02 30 | paramset: wgr_02 31 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/beta_covariate.csv: -------------------------------------------------------------------------------- 1 | ,Y0000 2 | B-X000,3.764052345967664 3 | B-X001,0.0 4 | B-X002,0.0 5 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/covariates.csv: -------------------------------------------------------------------------------- 1 | sample_id,X000,X001,X002 2 | S0000001,1.764052345967664,0.4001572083672233,0.9787379841057392 3 | S0000002,2.240893199201458,1.8675579901499675,-0.977277879876411 4 | S0000003,0.9500884175255894,-0.1513572082976979,-0.10321885179355784 5 | S0000004,0.41059850193837233,0.144043571160878,1.454273506962975 6 | S0000005,0.7610377251469934,0.12167501649282841,0.44386323274542566 7 | S0000006,0.33367432737426683,1.4940790731576061,-0.20515826376580087 8 | 
S0000007,0.31306770165090136,-0.8540957393017248,-2.5529898158340787 9 | S0000008,0.6536185954403606,0.8644361988595057,-0.7421650204064419 10 | S0000009,2.2697546239876076,-1.4543656745987648,0.04575851730144607 11 | S0000010,-0.1871838500258336,1.5327792143584575,1.469358769900285 12 | S0000011,0.1549474256969163,0.37816251960217356,-0.8877857476301128 13 | S0000012,-1.980796468223927,-0.3479121493261526,0.15634896910398005 14 | S0000013,1.2302906807277207,1.2023798487844113,-0.3873268174079523 15 | S0000014,-0.30230275057533557,-1.0485529650670926,-1.4200179371789752 16 | S0000015,-1.7062701906250126,1.9507753952317897,-0.5096521817516535 17 | S0000016,-0.4380743016111864,-1.2527953600499262,0.7774903558319101 18 | S0000017,-1.6138978475579515,-0.2127402802139687,-0.8954665611936756 19 | S0000018,0.386902497859262,-0.510805137568873,-1.180632184122412 20 | S0000019,-0.028182228338654868,0.42833187053041766,0.06651722238316789 21 | S0000020,0.3024718977397814,-0.6343220936809636,-0.3627411659871381 22 | S0000021,-0.672460447775951,-0.3595531615405413,-0.813146282044454 23 | S0000022,-1.7262826023316769,0.17742614225375283,-0.4017809362082619 24 | S0000023,-1.6301983469660446,0.4627822555257742,-0.9072983643832422 25 | S0000024,0.05194539579613895,0.7290905621775369,0.12898291075741067 26 | S0000025,1.1394006845433007,-1.2348258203536526,0.402341641177549 27 | S0000026,-0.6848100909403132,-0.8707971491818818,-0.5788496647644155 28 | S0000027,-0.31155253212737266,0.05616534222974544,-1.1651498407833565 29 | S0000028,0.9008264869541871,0.46566243973045984,-1.5362436862772237 30 | S0000029,1.4882521937955997,1.8958891760305832,1.1787795711596507 31 | S0000030,-0.17992483581235091,-1.0707526215105425,1.0544517269311366 32 | S0000031,-0.40317694697317963,1.2224450703824274,0.2082749780768603 33 | S0000032,0.9766390364837128,0.3563663971744019,0.7065731681919482 34 | S0000033,0.010500020720820478,1.7858704939058352,0.12691209270361992 35 | 
S0000034,0.40198936344470165,1.8831506970562544,-1.3477590611424464 36 | S0000035,-1.2704849984857336,0.9693967081580112,-1.17312340511416 37 | S0000036,1.9436211856492926,-0.41361898075974735,-0.7474548114407578 38 | S0000037,1.9229420264803847,1.4805147914344243,1.8675589604265699 39 | S0000038,0.9060446582753853,-0.8612256850547025,1.9100649530990337 40 | S0000039,-0.2680033709513804,0.8024563957963952,0.947251967773748 41 | S0000040,-0.1550100930908342,0.6140793703460803,0.9222066715665268 42 | S0000041,0.37642553115562943,-1.0994007905841945,0.298238174206056 43 | S0000042,1.3263858966870303,-0.6945678597313655,-0.14963454032767076 44 | S0000043,-0.43515355172163744,1.8492637284793418,0.6722947570124355 45 | S0000044,0.40746183624111043,-0.7699160744453164,0.5392491912918173 46 | S0000045,-0.6743326606573761,0.03183055827435118,-0.635846078378881 47 | S0000046,0.6764332949464997,0.5765908166149409,-0.20829875557799488 48 | S0000047,0.3960067126616453,-1.0930615087305058,-1.4912575927056055 49 | S0000048,0.4393917012645369,0.16667349537252904,0.6350314368921064 50 | S0000049,2.383144774863942,0.9444794869904138,-0.9128222254441586 51 | S0000050,1.117016288095853,-1.3159074105115212,-0.461584604814709 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/genotypes.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_01/genotypes.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/traits.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000 2 | S0000001,-254.72480109910808 3 | S0000002,-230.09027706391151 4 | S0000003,-218.05493689539847 5 | S0000004,-219.5811261299788 6 | 
S0000005,-234.95921431366196 7 | S0000006,-227.22086660345207 8 | S0000007,-257.5205067301366 9 | S0000008,-214.26290999909807 10 | S0000009,-246.80612947384586 11 | S0000010,-221.09857320678782 12 | S0000011,-242.27165206670597 13 | S0000012,-259.2204842996455 14 | S0000013,-246.72979301500683 15 | S0000014,-209.59462849991064 16 | S0000015,-263.1263009837309 17 | S0000016,-254.4080374180435 18 | S0000017,-257.1430792551615 19 | S0000018,-248.76872179158278 20 | S0000019,-240.40214308086217 21 | S0000020,-261.4762675865956 22 | S0000021,-256.7216762782688 23 | S0000022,-255.42224125778205 24 | S0000023,-224.3321512357823 25 | S0000024,-236.59008887723687 26 | S0000025,-252.33028531206247 27 | S0000026,-235.45704328058943 28 | S0000027,-207.56874118449966 29 | S0000028,-246.30202546302775 30 | S0000029,-215.04988433004772 31 | S0000030,-261.991321162662 32 | S0000031,-245.70590215857987 33 | S0000032,-228.11108069502387 34 | S0000033,-251.53285322533534 35 | S0000034,-210.74627747729176 36 | S0000035,-250.5371665715462 37 | S0000036,-228.9568905365803 38 | S0000037,-220.6352585896156 39 | S0000038,-226.25148497511077 40 | S0000039,-247.16395771745957 41 | S0000040,-219.69166073267803 42 | S0000041,-206.30541134510108 43 | S0000042,-211.72156415779116 44 | S0000043,-243.20881457193406 45 | S0000044,-244.71227068608678 46 | S0000045,-267.4724732836421 47 | S0000046,-222.5489850793145 48 | S0000047,-250.73358170555258 49 | S0000048,-235.08853364235185 50 | S0000049,-220.23218045254927 51 | S0000050,-221.99172372182065 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/beta_covariate.csv: -------------------------------------------------------------------------------- 1 | ,Y0000,Y0001,Y0002,Y0003,Y0004 2 | B-X000,3.764052345967664,2.400157208367223,2.9787379841057393,4.240893199201458,3.8675579901499675 3 | B-X001,0.0,0.0,0.0,0.0,0.0 4 | B-X002,0.0,0.0,0.0,0.0,0.0 5 | 
-------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/covariates.csv: -------------------------------------------------------------------------------- 1 | sample_id,X000,X001,X002 2 | S0000001,1.764052345967664,0.4001572083672233,0.9787379841057392 3 | S0000002,2.240893199201458,1.8675579901499675,-0.977277879876411 4 | S0000003,0.9500884175255894,-0.1513572082976979,-0.10321885179355784 5 | S0000004,0.41059850193837233,0.144043571160878,1.454273506962975 6 | S0000005,0.7610377251469934,0.12167501649282841,0.44386323274542566 7 | S0000006,0.33367432737426683,1.4940790731576061,-0.20515826376580087 8 | S0000007,0.31306770165090136,-0.8540957393017248,-2.5529898158340787 9 | S0000008,0.6536185954403606,0.8644361988595057,-0.7421650204064419 10 | S0000009,2.2697546239876076,-1.4543656745987648,0.04575851730144607 11 | S0000010,-0.1871838500258336,1.5327792143584575,1.469358769900285 12 | S0000011,0.1549474256969163,0.37816251960217356,-0.8877857476301128 13 | S0000012,-1.980796468223927,-0.3479121493261526,0.15634896910398005 14 | S0000013,1.2302906807277207,1.2023798487844113,-0.3873268174079523 15 | S0000014,-0.30230275057533557,-1.0485529650670926,-1.4200179371789752 16 | S0000015,-1.7062701906250126,1.9507753952317897,-0.5096521817516535 17 | S0000016,-0.4380743016111864,-1.2527953600499262,0.7774903558319101 18 | S0000017,-1.6138978475579515,-0.2127402802139687,-0.8954665611936756 19 | S0000018,0.386902497859262,-0.510805137568873,-1.180632184122412 20 | S0000019,-0.028182228338654868,0.42833187053041766,0.06651722238316789 21 | S0000020,0.3024718977397814,-0.6343220936809636,-0.3627411659871381 22 | S0000021,-0.672460447775951,-0.3595531615405413,-0.813146282044454 23 | S0000022,-1.7262826023316769,0.17742614225375283,-0.4017809362082619 24 | S0000023,-1.6301983469660446,0.4627822555257742,-0.9072983643832422 25 | S0000024,0.05194539579613895,0.7290905621775369,0.12898291075741067 26 | 
S0000025,1.1394006845433007,-1.2348258203536526,0.402341641177549 27 | S0000026,-0.6848100909403132,-0.8707971491818818,-0.5788496647644155 28 | S0000027,-0.31155253212737266,0.05616534222974544,-1.1651498407833565 29 | S0000028,0.9008264869541871,0.46566243973045984,-1.5362436862772237 30 | S0000029,1.4882521937955997,1.8958891760305832,1.1787795711596507 31 | S0000030,-0.17992483581235091,-1.0707526215105425,1.0544517269311366 32 | S0000031,-0.40317694697317963,1.2224450703824274,0.2082749780768603 33 | S0000032,0.9766390364837128,0.3563663971744019,0.7065731681919482 34 | S0000033,0.010500020720820478,1.7858704939058352,0.12691209270361992 35 | S0000034,0.40198936344470165,1.8831506970562544,-1.3477590611424464 36 | S0000035,-1.2704849984857336,0.9693967081580112,-1.17312340511416 37 | S0000036,1.9436211856492926,-0.41361898075974735,-0.7474548114407578 38 | S0000037,1.9229420264803847,1.4805147914344243,1.8675589604265699 39 | S0000038,0.9060446582753853,-0.8612256850547025,1.9100649530990337 40 | S0000039,-0.2680033709513804,0.8024563957963952,0.947251967773748 41 | S0000040,-0.1550100930908342,0.6140793703460803,0.9222066715665268 42 | S0000041,0.37642553115562943,-1.0994007905841945,0.298238174206056 43 | S0000042,1.3263858966870303,-0.6945678597313655,-0.14963454032767076 44 | S0000043,-0.43515355172163744,1.8492637284793418,0.6722947570124355 45 | S0000044,0.40746183624111043,-0.7699160744453164,0.5392491912918173 46 | S0000045,-0.6743326606573761,0.03183055827435118,-0.635846078378881 47 | S0000046,0.6764332949464997,0.5765908166149409,-0.20829875557799488 48 | S0000047,0.3960067126616453,-1.0930615087305058,-1.4912575927056055 49 | S0000048,0.4393917012645369,0.16667349537252904,0.6350314368921064 50 | S0000049,2.383144774863942,0.9444794869904138,-0.9128222254441586 51 | S0000050,1.117016288095853,-1.3159074105115212,-0.461584604814709 52 | -------------------------------------------------------------------------------- 
/sgkit/tests/test_regenie/dataset/sim_sm_02/genotypes.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_02/genotypes.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets_nocovariate.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets_nocovariate.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/traits.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000,Y0001,Y0002,Y0003,Y0004 2 | S0000001,-289.36482961044777,-275.55161096990247,-271.5112918054582,-271.35578637967285,-249.9802561342581 3 | S0000002,-280.2617466598422,-253.9720298942713,-273.6459148573497,-282.49078841243517,-241.09338997825571 4 | S0000003,-254.9285482992192,-244.26794278688027,-255.66389861945876,-247.8402123732413,-238.82817113331336 5 | S0000004,-260.4022375715708,-242.13835393304,-255.01612792818455,-268.44036534547627,-250.70721888268997 6 | S0000005,-258.1133404800941,-255.57357980548673,-261.90921405575847,-285.7029592534616,-245.24453454678022 7 | 
S0000006,-265.47689844301016,-244.28013875467363,-264.23891346125174,-266.07770608431025,-228.28301757796825 8 | S0000007,-283.2871761140315,-292.6260787637163,-292.1640603923218,-299.39471029355775,-264.3175954620037 9 | S0000008,-219.49833815684846,-229.40456661041102,-226.29208752330103,-222.02459238865347,-207.13382636978892 10 | S0000009,-279.51270464588004,-283.6975894756584,-273.7096726753412,-280.3554738029803,-259.4180219647341 11 | S0000010,-259.8032291695713,-251.36129374443573,-257.8977581771694,-269.23078726351764,-253.13279293137492 12 | S0000011,-269.9919814699405,-247.93206164429185,-261.7601867956449,-275.0298365515039,-244.21932805458428 13 | S0000012,-285.77393491791605,-275.3890397383568,-274.55380982704963,-284.9299666091712,-254.9928111785241 14 | S0000013,-277.1926809934609,-285.6689827198898,-266.3253403090971,-269.2727324762928,-250.21589155337878 15 | S0000014,-262.4163352438781,-267.4995773197119,-256.5335711240192,-271.9041593346388,-240.7477340675035 16 | S0000015,-287.73560131012886,-298.9007710385901,-279.4414838792006,-288.2026950107911,-261.6035479689111 17 | S0000016,-274.42311861954556,-258.80197244752253,-250.78878192452643,-255.90333311243012,-254.96719346171648 18 | S0000017,-288.78156475051264,-283.69484431811884,-282.0562102999267,-296.18754615077813,-269.81982529752344 19 | S0000018,-267.92458695783586,-274.333207503416,-270.16690545867147,-271.22687144367785,-257.084711352308 20 | S0000019,-259.652239029143,-271.3373968635834,-272.6600069993165,-264.2709560634566,-236.87420837724466 21 | S0000020,-276.432731006855,-265.11250963911374,-291.6925850053143,-287.2029220253524,-257.54740093719437 22 | S0000021,-302.2583816107363,-268.10402480224985,-266.5553387094999,-281.9858242858026,-258.8950016696872 23 | S0000022,-279.8812519534122,-276.5331115117956,-277.2710749855309,-290.26865041089155,-257.7871674979621 24 | S0000023,-259.2277460394356,-243.66122076394095,-245.76668836892475,-256.1007519571617,-241.23568200144058 25 | 
S0000024,-271.15634538304914,-278.2742395123379,-253.89982381679815,-266.3033017713841,-260.81345761229016 26 | S0000025,-289.12221816694904,-288.2213062217917,-299.17923356228863,-317.70814291174673,-270.87527151421574 27 | S0000026,-256.69561439734474,-266.6609715391943,-271.26324726800226,-272.1222328091294,-254.5707542300432 28 | S0000027,-260.47466710051197,-240.05822457504374,-247.51137384414386,-278.822915534984,-232.1627874599786 29 | S0000028,-293.83634241663555,-267.5942462465462,-260.3977803416625,-266.3495161503439,-249.55915696107422 30 | S0000029,-255.7649864858529,-256.6956682332754,-265.0818014497492,-248.47063293488537,-222.06254119680125 31 | S0000030,-292.7603769436315,-288.48829299572657,-272.85558338442576,-302.4533434187978,-264.89426716806315 32 | S0000031,-272.075281401142,-278.9048993798882,-261.982508345537,-266.32711738718075,-251.50346760610367 33 | S0000032,-251.07077191223377,-275.0877046973688,-261.11750205717243,-255.13447427146772,-242.20585453407114 34 | S0000033,-291.8325541138616,-287.63711513064686,-282.20486664981166,-265.826203021402,-264.4394057946944 35 | S0000034,-240.43647998894556,-238.03828868014054,-241.15607912899287,-241.52289541196632,-225.6261811358186 36 | S0000035,-269.6160134380013,-269.95418840713427,-259.3331428408944,-260.835568915071,-248.99036245952718 37 | S0000036,-283.17410117996593,-277.9604314834706,-281.65162507321173,-275.1431456639316,-236.6681812549374 38 | S0000037,-270.73264002439265,-267.19423360129593,-260.76691021417,-256.81109619851037,-230.82345757977177 39 | S0000038,-263.58004859805374,-248.3362358353548,-252.88775092166745,-256.1529742816787,-228.57582977376472 40 | S0000039,-272.1377582391363,-294.10446434385204,-265.3826971259937,-282.08726061350546,-254.95214449897563 41 | S0000040,-244.0143803641179,-238.3050884849436,-247.577635457108,-253.59230615277642,-225.592932356266 42 | S0000041,-248.44496391618878,-230.13158447036224,-232.32370220739398,-239.39271395267718,-218.43116642254466 
43 | S0000042,-230.70463381624643,-226.273342148801,-248.36637606352767,-254.18185949642907,-220.11091567559257 44 | S0000043,-273.4363194314351,-254.1295932375416,-280.1206296032896,-263.7965130748505,-247.54407948763603 45 | S0000044,-250.08973677324587,-280.75695152448344,-252.1930476986954,-273.8235387462245,-255.0579489393799 46 | S0000045,-288.30584497419505,-277.2842374326607,-279.7480695633359,-297.1405472260583,-271.80540976936373 47 | S0000046,-256.8741187019728,-242.220619536689,-228.36044361473154,-230.24503842898756,-226.1419976707234 48 | S0000047,-281.39239364093703,-272.7648883331534,-275.1145231121791,-283.38041504923456,-248.68732803897694 49 | S0000048,-250.56543207518007,-260.88344107712805,-266.7397915037219,-288.52101293377217,-255.29649191317182 50 | S0000049,-250.8053322211505,-254.3889142578776,-251.58362762547668,-251.51574318129602,-237.56303756234547 51 | S0000050,-264.62721748673806,-261.7129193668437,-258.7481379119824,-255.61982190239425,-239.65055065580412 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_01-wgr_01/predictions.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000 2 | S0000001,0.6712206418512147 3 | S0000002,1.0675227946373933 4 | S0000003,0.3281001891790848 5 | S0000004,0.07792072438052661 6 | S0000005,0.2552996199145314 7 | S0000006,0.22526782376240384 8 | S0000007,0.09321159868391961 9 | S0000008,0.31969664357687166 10 | S0000009,0.7484631415574241 11 | S0000010,-0.04301884891150365 12 | S0000011,-0.12676226976916605 13 | S0000012,-0.8045886595639526 14 | S0000013,0.3486980726044114 15 | S0000014,-0.5345076110144904 16 | S0000015,-0.36148164377437925 17 | S0000016,-0.4878276520639582 18 | S0000017,-0.7319003293934547 19 | S0000018,-0.23188330032239363 20 | S0000019,-0.12008692522353148 21 | S0000020,-0.23228033372830584 22 | S0000021,-0.5377574701429808 23 | S0000022,-0.8639491051767361 24 
| S0000023,-0.8415366191036098 25 | S0000024,-0.07839636461289698 26 | S0000025,0.18951125522929352 27 | S0000026,-0.569037828028304 28 | S0000027,-0.38903076208959475 29 | S0000028,0.08970628557699697 30 | S0000029,0.6735764020859089 31 | S0000030,-0.25358601756754817 32 | S0000031,0.04476691419131551 33 | S0000032,0.27895891483490426 34 | S0000033,0.21584900716468297 35 | S0000034,0.31290525875527464 36 | S0000035,-0.20718579770757645 37 | S0000036,0.408707089373321 38 | S0000037,0.6633782570769876 39 | S0000038,0.11939878625988729 40 | S0000039,0.03138343037095803 41 | S0000040,0.03444503439417827 42 | S0000041,-0.1837695563335075 43 | S0000042,0.1060011221467489 44 | S0000043,0.09311322939642908 45 | S0000044,-0.11954146368470456 46 | S0000045,-0.2647727548596352 47 | S0000046,0.1491181301129147 48 | S0000047,-0.19269728742877307 49 | S0000048,0.036116074120831745 50 | S0000049,0.611630749049116 51 | S0000050,-0.044042060654685466 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_01-wgr_01/reduced_blocks_flat.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/result/sim_sm_01-wgr_01/reduced_blocks_flat.csv.gz -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_02-wgr_02/predictions.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000,Y0001,Y0002,Y0003,Y0004 2 | S0000001,0.45374320321611866,0.4384553793358616,0.2692125245910143,0.5947202967913898,0.5975712436135826 3 | S0000002,0.3981979032191132,0.5213412372969929,-0.38081352848913874,0.8162941188969363,1.0644121156840316 4 | S0000003,0.28761389591743136,0.14834927965448796,0.12970160532604444,0.34870688419043405,0.3923761073751637 5 | 
S0000004,0.5237540570114866,0.3729791215556517,0.8998382594704468,0.4512697584572887,0.2666976547679285 6 | S0000005,0.3882711007908075,0.262365812438763,0.45600096357874265,0.40959612781192584,0.37754481318684524 7 | S0000006,0.5002068541742315,0.4519055333223547,0.4961881559317826,0.5880647385794258,0.5742797329186986 8 | S0000007,-0.050120684838683496,-0.31098003865765617,-0.49118524618893705,-0.01082592895721236,0.26300114538132086 9 | S0000008,0.3509199180044097,0.27754405527102205,0.1374680521267019,0.46008836105481554,0.5496752408451508 10 | S0000009,0.11755497500258066,-0.03703853409091502,-0.3816524713382645,0.23960844697355968,0.411377907012545 11 | S0000010,0.6999566456601272,0.6345581771142822,1.2594057651645734,0.6565669958786838,0.40319389416846113 12 | S0000011,-0.39764469195427143,-0.21402586638468526,-0.20721917625257816,-0.1989266921107567,-0.21683364253091156 13 | S0000012,-0.5032268326878345,-0.514720795289701,-0.5033526868738674,-0.756449833999863,-0.9810125870606645 14 | S0000013,-0.43716796587146806,-0.09898909240851166,-0.14822990713492612,0.2428228060089122,0.23013561388200385 15 | S0000014,-0.2000575369243856,-0.21823108090123566,-0.04486600391835377,-0.683842471389722,-0.6163148493612544 16 | S0000015,-0.8147850072512632,-0.6948186360042015,-0.8870978332981618,-0.15460919416204447,-0.4524977513386532 17 | S0000016,-0.23446318084410064,-0.14247111244689167,0.00963030563653732,-0.6313989480650365,-0.7300478652398097 18 | S0000017,-0.43722629504209853,-0.49055825470421416,-0.4371099278521622,-0.6960305745944001,-0.8283528779793553 19 | S0000018,-0.23760478198555937,-0.14331440643863386,-0.01827476109328028,-0.3907625159706337,-0.3252337774743284 20 | S0000019,-0.4524300143822794,-0.24682960143824897,-0.26764009022708674,-0.17728958361891117,-0.28629147131853017 21 | S0000020,-0.2256479259004916,-0.10215100433559084,-0.01962565702161251,-0.4022826765823528,-0.38151857255796007 22 | 
S0000021,-0.3910327350435797,-0.3245350157754625,-0.27853736601812795,-0.44382738029595614,-0.5134205253297407 23 | S0000022,-0.38979608421126516,-0.4198451605709752,-0.2696449211896824,-0.5170422020223361,-0.7491195444360533 24 | S0000023,-0.35655551643903965,-0.29736923275296573,-0.20093284218899046,-0.47544578234073537,-0.6581891393390545 25 | S0000024,-0.08770623978970846,-0.13276000303256172,-0.042926244220280615,-0.010938885879558317,-0.09987402556104427 26 | S0000025,-0.22369919290553647,-0.29331837449806203,-0.26876169304844094,-0.16406706931170847,-0.09366979019564187 27 | S0000026,-0.43113107067087575,-0.41924347744348384,-0.3687095403489385,-0.5345322401025586,-0.6077323051357736 28 | S0000027,-0.31671923425616233,-0.18135492115944263,-0.20325757280115372,-0.3195749298248845,-0.3135842352061409 29 | S0000028,-0.16821016031430422,0.08485280283732696,-0.043447535246061014,-0.018910971202518504,0.14252135190646437 30 | S0000029,0.2931403001919625,0.15480978255735667,0.24189957499918444,0.5927053912085299,0.551739262963243 31 | S0000030,-0.293739725712008,-0.5261225630418921,-0.33169926546552025,-0.355204796820649,-0.4907529449624141 32 | S0000031,0.3470118702581462,0.29708534982377605,0.43519825153927866,0.41765794844395393,0.18720076993734178 33 | S0000032,0.3318167512354224,0.22390402851647723,0.16634714550404497,0.3496263576075134,0.3281124859129637 34 | S0000033,0.44750718046198507,0.247512816053288,0.4315668753366107,0.5395368036693038,0.3303910160469289 35 | S0000034,0.41622328503983463,0.5394475323443995,0.5146392873606955,0.5553606926459201,0.40979552834829286 36 | S0000035,0.19991364164501324,0.6248724038935171,0.6265223337755907,0.3273351488744321,0.0028951826724831066 37 | S0000036,0.20293018449655612,0.5254677435558607,0.09575682466456487,0.26025542478773206,0.40378027253483806 38 | S0000037,0.6064421187499152,-0.1118526815564854,0.04597658064221027,0.5911238158854771,0.6422206838586847 39 | 
S0000038,0.21136837143972206,0.037161756652827574,-0.007952513678630087,0.15474909210690133,0.16248768309272021 40 | S0000039,0.3368525586794071,0.166108529390026,0.3264216843469296,0.36246236377831675,0.15356485781461562 41 | S0000040,0.309397876460777,0.17291587830510743,0.28929168222313123,0.34097924184533973,0.15087623793051533 42 | S0000041,0.14380592397674863,0.22759321110326292,-0.07136785852710745,-0.3316379862204535,-0.23385955478283238 43 | S0000042,0.22195461967311647,0.25740467623271074,0.0823696499706631,-0.0936122477002144,0.08424362953109349 44 | S0000043,-0.43298137231403444,-0.5981809394028718,-0.32079056196912537,0.15048545900437957,0.05321960377760775 45 | S0000044,0.11929418010813775,0.12782464901718665,-0.061418346619223026,-0.2520582490946169,-0.1813526262771483 46 | S0000045,-0.2686680377620926,-0.11196177609786492,-0.35826598672762855,-0.31202896706441896,-0.27645414098442067 47 | S0000046,-0.12558303037907784,-0.1553277499978641,-0.10305564670217215,0.038733231821486094,0.13429110392270913 48 | S0000047,0.032923167186658046,0.3368044172825969,-0.1346720258718968,-0.38144757192046075,-0.16625814580045456 49 | S0000048,-0.026908928558184442,-0.12285015254170514,-0.09110967206862998,-0.04780712480923875,-0.02069248105103286 50 | S0000049,0.053886792561909765,-0.030554565718910614,0.1734557366607655,0.40743642020101406,0.6476327872039673 51 | S0000050,0.25566722791774016,0.4211396835494259,0.05589731550765741,-0.2702755586740005,-0.05793637160149775 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_02-wgr_02/reduced_blocks_flat.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/result/sim_sm_02-wgr_02/reduced_blocks_flat.csv.gz -------------------------------------------------------------------------------- 
/sgkit/tests/test_testing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | from sgkit.testing import simulate_genotype_call_dataset 8 | 9 | 10 | @pytest.mark.filterwarnings( 11 | "ignore::UserWarning" 12 | ) # codec `vlen-utf8` not in Zarr v3 spec` 13 | def test_simulate_genotype_call_dataset__zarr(tmp_path): 14 | path = str(tmp_path / "ds.zarr") 15 | ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10) 16 | assert "call_genotype_phased" not in ds 17 | ds.to_zarr(path) 18 | xr.testing.assert_equal(ds, xr.open_zarr(path, concat_characters=False)) 19 | 20 | 21 | def test_simulate_genotype_call_dataset__invalid_missing_pct(): 22 | with pytest.raises( 23 | ValueError, match=re.escape("missing_pct must be within [0.0, 1.0]") 24 | ): 25 | simulate_genotype_call_dataset(n_variant=10, n_sample=10, missing_pct=-1.0) 26 | 27 | 28 | def test_simulate_genotype_call_dataset__phased(tmp_path): 29 | ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, phased=True) 30 | assert "call_genotype_phased" in ds 31 | assert np.all(ds["call_genotype_phased"]) 32 | ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, phased=False) 33 | assert "call_genotype_phased" in ds 34 | assert not np.any(ds["call_genotype_phased"]) 35 | 36 | 37 | def test_simulate_genotype_call_dataset__additional_variant_fields(): 38 | ds = simulate_genotype_call_dataset( 39 | n_variant=10, 40 | n_sample=10, 41 | phased=True, 42 | additional_variant_fields={ 43 | "variant_id": str, 44 | "variant_filter": bool, 45 | "variant_quality": np.int8, 46 | "variant_yummyness": np.float32, 47 | }, 48 | ) 49 | assert "variant_id" in ds 50 | assert np.all(ds["variant_id"] == np.arange(10).astype("S")) 51 | assert "variant_filter" in ds 52 | assert ds["variant_filter"].dtype == bool 53 | assert "variant_quality" in ds 54 | assert ds["variant_quality"].dtype == np.int8 55 | 
assert "variant_yummyness" in ds 56 | assert ds["variant_yummyness"].dtype == np.float32 57 | 58 | with pytest.raises(ValueError, match="Unrecognized dtype"): 59 | simulate_genotype_call_dataset( 60 | n_variant=10, 61 | n_sample=10, 62 | phased=True, 63 | additional_variant_fields={ 64 | "variant_id": None, 65 | }, 66 | ) 67 | -------------------------------------------------------------------------------- /sgkit/typing.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Union 3 | 4 | import dask.array as da 5 | import numpy as np 6 | 7 | ArrayLike = Union[np.ndarray, da.Array] 8 | DType = Any 9 | NDArray = Any 10 | PathType = Union[str, Path] 11 | RandomStateType = Union[np.random.RandomState, da.random.RandomState, int] 12 | -------------------------------------------------------------------------------- /validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/__init__.py -------------------------------------------------------------------------------- /validation/gwas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/__init__.py -------------------------------------------------------------------------------- /validation/gwas/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # sgkit gwas validation docker image 2 | FROM jupyter/minimal-notebook:54462805efcb 3 | ENV WORK_DIR=$HOME/work 4 | RUN mkdir $WORK_DIR/repos $WORK_DIR/auth $WORK_DIR/data $WORK_DIR/logs 5 | 6 | USER root 7 | 8 | # Install Hail 9 | RUN mkdir -p /usr/share/man/man1 && \ 10 | apt-get update && apt-get install -y \ 11 | openjdk-8-jre-headless \ 12 | && rm -rf 
/var/lib/apt/lists/* 13 | COPY environment-hail.yml /tmp/ 14 | RUN conda env create -p $CONDA_DIR/envs/hail -f /tmp/environment-hail.yml && \ 15 | conda clean --all -f -y 16 | RUN $CONDA_DIR/envs/hail/bin/pip install hail==0.2.47 17 | RUN $CONDA_DIR/envs/hail/bin/python -m ipykernel install --user --name=hail 18 | 19 | # Install Glow 20 | COPY environment-glow.yml /tmp/ 21 | RUN conda env create -p $CONDA_DIR/envs/glow -f /tmp/environment-glow.yml && \ 22 | conda clean --all -f -y 23 | RUN $CONDA_DIR/envs/glow/bin/pip install glow.py==0.5.0 24 | RUN $CONDA_DIR/envs/glow/bin/python -m ipykernel install --user --name=glow 25 | 26 | 27 | # Install base environment dependencies 28 | COPY environment.yml environment-dev.yml /tmp/ 29 | RUN conda env update -n base --file /tmp/environment.yml 30 | RUN conda env update -n base --file /tmp/environment-dev.yml 31 | 32 | # Install pysnptools separately (does not work as pip install with conda env update) 33 | RUN pip install --no-cache-dir pysnptools==0.4.19 34 | 35 | # Ensure this always occurs last before user switch 36 | RUN fix-permissions $CONDA_DIR && \ 37 | fix-permissions /home/$NB_USER 38 | 39 | USER $NB_UID 40 | 41 | ENV PYTHONPATH="${PYTHONPATH}:$WORK_DIR/repos/sgkit" 42 | ENV PYTHONPATH="${PYTHONPATH}:$WORK_DIR/repos/sgkit-plink" 43 | 44 | ENV OMP_NUM_THREADS=1 45 | ENV MKL_NUM_THREADS=1 46 | ENV OPENBLAS_NUM_THREADS=1 47 | 48 | ARG SPARK_DRIVER_MEMORY=64g 49 | ENV SPARK_DRIVER_MEMORY=$SPARK_DRIVER_MEMORY 50 | 51 | # Set this as needed to avoid https://issues.apache.org/jira/browse/SPARK-29367 52 | # with any pyspark 2.4.x + pyarrow >= 0.15.x 53 | # See: https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x 54 | ENV ARROW_PRE_0_15_IPC_FORMAT=1 -------------------------------------------------------------------------------- /validation/gwas/docker/README.md: -------------------------------------------------------------------------------- 1 
| ### Sgkit GWAS Validation Docker Image 2 | 3 | This image installs a variety of useful dependencies for testing and validating genetic methods, with a focus on GWAS and related QC (e.g. [Hail](https://hail.is/index.html) and [Glow](https://projectglow.io/)). 4 | 5 | #### Build Image 6 | 7 | ```bash 8 | docker build -t sgkit-gwas-validation . 9 | ``` 10 | 11 | #### Run Image 12 | 13 | This will launch a conatiner with jupyter lab accessible at http://localhost:8888 and Spark UI at http://localhost:4040. 14 | 15 | ```bash 16 | # Adjust these as necessary for your setup 17 | DATA_DIR=/tmp # Set data directory to share locally 18 | REPO_DIR=$HOME/repos # Set local (host) repo dir containing sgkit 19 | JUPYTER_TOKEN=orDiAMbliNfI # Jupyter token for login 20 | SPARK_DRIVER_MEMORY=64g 21 | 22 | # Launch ephemeral container (remove `--rm` to persist state) 23 | WORK_DIR=/home/jovyan/work 24 | docker run --rm -ti \ 25 | -e GRANT_SUDO=yes --user=root \ 26 | -p 8888:8888 -p 4040:4040 \ 27 | -e JUPYTER_TOKEN=$JUPYTER_TOKEN \ 28 | -e SPARK_DRIVER_MEMORY=$SPARK_DRIVER_MEMORY \ 29 | -e JUPYTER_ENABLE_LAB=yes \ 30 | -v $DATA_DIR:$WORK_DIR/data \ 31 | -v $REPO_DIR/sgkit:$WORK_DIR/repos/sgkit \ 32 | -v $REPO_DIR/sgkit-plink:$WORK_DIR/repos/sgkit-plink \ 33 | sgkit-gwas-validation 34 | ``` -------------------------------------------------------------------------------- /validation/gwas/docker/environment-dev.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | # CICD dependencies 6 | - black 7 | - flake8 8 | - isort 9 | - mypy 10 | - pre-commit 11 | - pylint 12 | - pytest 13 | - pytest-datadir 14 | - pytest-cov 15 | - hypothesis 16 | # Validation/visualization dependencies 17 | - statsmodels 18 | - python-dotenv 19 | - matplotlib 20 | - rope # for code refactor 21 | - graphviz 22 | - python-graphviz 23 | - scikit-learn 24 | - fire 25 | - invoke 26 | - pyarrow 27 | 
-------------------------------------------------------------------------------- /validation/gwas/docker/environment-glow.yml: -------------------------------------------------------------------------------- 1 | name: glow 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - numpy 7 | - scipy 8 | - nomkl 9 | # See: https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#recommended-pandas-and-pyarrow-versions 10 | # If conflict occurs, downgrade pandas to 0.24.2 and let it upgrade on glow pip install to 0.25.3 11 | - pandas=0.25.3 12 | - pyarrow=0.15.1 13 | - pyspark=2.4.5 14 | - ipython=7.16.1 15 | - ipykernel=5.3.3 16 | - pyyaml=5.3.1 17 | - fire=0.3.1 18 | - pip 19 | - pip: 20 | - typeguard==2.5.0 21 | - nptyping==1.1.0 22 | -------------------------------------------------------------------------------- /validation/gwas/docker/environment-hail.yml: -------------------------------------------------------------------------------- 1 | name: hail 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - ipython=7.16.1 7 | - ipykernel=5.3.3 8 | - pyarrow=0.15.1 9 | - pyyaml=5.3.1 10 | - fire=0.3.1 11 | - pip -------------------------------------------------------------------------------- /validation/gwas/docker/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | - numpy 6 | - xarray 7 | - dask[complete] 8 | - dask-ml 9 | - scipy 10 | - numba 11 | - zarr 12 | -------------------------------------------------------------------------------- /validation/gwas/method/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/__init__.py -------------------------------------------------------------------------------- /validation/gwas/method/hwe/Makefile: 
-------------------------------------------------------------------------------- 1 | all: clean chwe.so 2 | 3 | clean: 4 | rm -f *.o *.so 5 | 6 | chwe.so: chwe.o 7 | gcc -shared -o libchwe.so chwe.o 8 | 9 | chwe.o: chwe.c 10 | gcc -c -Wall -Werror -fpic chwe.c 11 | 12 | 13 | -------------------------------------------------------------------------------- /validation/gwas/method/hwe/README.md: -------------------------------------------------------------------------------- 1 | ## HWE Exact Test Validation 2 | 3 | This validation produces simulated genotype counts and corresponding HWE statistics from the (C) implementation described in [Wigginton et al. 2005](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1199378). 4 | 5 | The `invoke` [tasks](tasks.py) will compile the C code, simulate genotype counts (inputs for unit tests), and attach p values (outputs for unit tests) from the C code to the genotype counts, as a dataframe. 6 | 7 | The [hwe_unit_test.ipynb](hwe_unit_test.ipynb) is only instructive and shows how to debug and possibly extend test cases, perhaps validating inputs/outputs on a scale that wouldn't be included in unit testing. 
8 | 9 | To export the unit test data, all steps can be run as follows: 10 | 11 | ```bash 12 | > invoke compile simulate export 13 | Building reference C library 14 | rm -f *.o *.so 15 | gcc -c -Wall -Werror -fpic chwe.c 16 | gcc -shared -o libchwe.so chwe.o 17 | Build complete 18 | Generating unit test data 19 | Unit test data written to data/sim_01.csv 20 | Exporting test data to /home/jovyan/work/repos/sgkit/sgkit/tests/test_hwe 21 | Clearing test datadir at /home/jovyan/work/repos/sgkit/sgkit/tests/test_hwe 22 | Copying data/sim_01.csv to /home/jovyan/work/repos/sgkit/sgkit/tests/test_hwe/sim_01.csv 23 | Export complete 24 | ``` -------------------------------------------------------------------------------- /validation/gwas/method/hwe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/hwe/__init__.py -------------------------------------------------------------------------------- /validation/gwas/method/hwe/chwe.c: -------------------------------------------------------------------------------- 1 | // Lift from http://csg.sph.umich.edu/abecasis/Exact/snp_hwe.c 2 | #include 3 | #include 4 | 5 | double hwep(int obs_hets, int obs_hom1, int obs_hom2){ 6 | if (obs_hom1 < 0 || obs_hom2 < 0 || obs_hets < 0) 7 | { 8 | printf("FATAL ERROR - SNP-HWE: Current genotype configuration (%d %d %d ) includes a" 9 | " negative count", obs_hets, obs_hom1, obs_hom2); 10 | exit(EXIT_FAILURE); 11 | } 12 | 13 | int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; 14 | int obs_homr = obs_hom1 < obs_hom2 ? 
obs_hom1 : obs_hom2; 15 | 16 | int rare_copies = 2 * obs_homr + obs_hets; 17 | int genotypes = obs_hets + obs_homc + obs_homr; 18 | 19 | double * het_probs = (double *) malloc((size_t) (rare_copies + 1) * sizeof(double)); 20 | if (het_probs == NULL) 21 | { 22 | printf("FATAL ERROR - SNP-HWE: Unable to allocate array for heterozygote probabilities" ); 23 | exit(EXIT_FAILURE); 24 | } 25 | 26 | int i; 27 | for (i = 0; i <= rare_copies; i++) 28 | het_probs[i] = 0.0; 29 | 30 | /* start at midpoint */ 31 | int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); 32 | 33 | /* check to ensure that midpoint and rare alleles have same parity */ 34 | if ((rare_copies & 1) ^ (mid & 1)) 35 | mid++; 36 | 37 | int curr_hets = mid; 38 | int curr_homr = (rare_copies - mid) / 2; 39 | int curr_homc = genotypes - curr_hets - curr_homr; 40 | 41 | het_probs[mid] = 1.0; 42 | double sum = het_probs[mid]; 43 | for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) 44 | { 45 | het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) 46 | / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); 47 | sum += het_probs[curr_hets - 2]; 48 | 49 | /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ 50 | curr_homr++; 51 | curr_homc++; 52 | } 53 | 54 | curr_hets = mid; 55 | curr_homr = (rare_copies - mid) / 2; 56 | curr_homc = genotypes - curr_hets - curr_homr; 57 | for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) 58 | { 59 | het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc 60 | /((curr_hets + 2.0) * (curr_hets + 1.0)); 61 | sum += het_probs[curr_hets + 2]; 62 | 63 | /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ 64 | curr_homr--; 65 | curr_homc--; 66 | } 67 | 68 | for (i = 0; i <= rare_copies; i++) 69 | het_probs[i] /= (sum > 0 ? 
sum : 1e-128); 70 | 71 | /* alternate p-value calculation for p_hi/p_lo 72 | double p_hi = het_probs[obs_hets]; 73 | for (i = obs_hets + 1; i <= rare_copies; i++) 74 | p_hi += het_probs[i]; 75 | 76 | double p_lo = het_probs[obs_hets]; 77 | for (i = obs_hets - 1; i >= 0; i--) 78 | p_lo += het_probs[i]; 79 | 80 | 81 | double p_hi_lo = p_hi < p_lo ? 2.0 * p_hi : 2.0 * p_lo; 82 | */ 83 | 84 | double p_hwe = 0.0; 85 | /* p-value calculation for p_hwe */ 86 | for (i = 0; i <= rare_copies; i++) 87 | { 88 | if (het_probs[i] > het_probs[obs_hets]) 89 | continue; 90 | p_hwe += het_probs[i]; 91 | } 92 | 93 | p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; 94 | 95 | free(het_probs); 96 | 97 | return p_hwe; 98 | } 99 | -------------------------------------------------------------------------------- /validation/gwas/method/hwe/chwe.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/hwe/chwe.o -------------------------------------------------------------------------------- /validation/gwas/method/hwe/invoke.yaml: -------------------------------------------------------------------------------- 1 | tasks: 2 | auto_dash_names: false -------------------------------------------------------------------------------- /validation/gwas/method/hwe/logging.ini: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=console 6 | 7 | [formatters] 8 | keys=console_formatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [handler_console] 15 | level=INFO 16 | class=StreamHandler 17 | formatter=console_formatter 18 | args=(sys.stdout,) 19 | 20 | [formatter_console_formatter] 21 | format=%(asctime)s|%(levelname)s|%(name)s.%(funcName)s:%(lineno)d| %(message)s -------------------------------------------------------------------------------- 
/validation/gwas/method/hwe/tasks.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import glob 3 | import logging 4 | import logging.config 5 | import os 6 | import shutil 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from invoke import task 12 | 13 | logging.config.fileConfig("logging.ini") 14 | logger = logging.getLogger(__name__) 15 | 16 | DEFAULT_SIM_DATADIR = os.getenv("SIM_DATADIR", "data") 17 | DEFAULT_TEST_DATADIR = os.getenv("TEST_DATADIR", "../../../../sgkit/tests/test_hwe") 18 | 19 | 20 | @task 21 | def compile(ctx): 22 | """Build reference implementation C library""" 23 | logger.info("Building reference C library") 24 | ctx.run("make") 25 | logger.info("Build complete") 26 | 27 | 28 | def get_genotype_counts(): 29 | """Generate genotype counts for testing.""" 30 | rs = np.random.RandomState(0) 31 | n, s = 10_000, 50 32 | n_het = np.expand_dims(np.arange(n, step=s) + 1, -1) 33 | frac = rs.uniform(0.3, 0.7, size=(n // s, 2)) 34 | n_hom = frac * n_het 35 | n_hom = n_hom.astype(int) 36 | return pd.DataFrame( 37 | np.concatenate((n_het, n_hom), axis=1), columns=["n_het", "n_hom_1", "n_hom_2"] 38 | ) 39 | 40 | 41 | @task 42 | def simulate(ctx, sim_datadir=DEFAULT_SIM_DATADIR): 43 | """Create inputs and outputs for unit tests.""" 44 | logger.info("Generating unit test data") 45 | libc = ctypes.CDLL("./libchwe.so") 46 | chwep = libc.hwep 47 | chwep.restype = ctypes.c_double 48 | df = get_genotype_counts() 49 | df["p"] = df.apply( 50 | lambda r: chwep(int(r["n_het"]), int(r["n_hom_1"]), int(r["n_hom_2"])), axis=1 51 | ) 52 | output_dir = Path(sim_datadir) 53 | if not output_dir.exists(): 54 | output_dir.mkdir(parents=True, exist_ok=True) 55 | path = output_dir / "sim_01.csv" 56 | df.to_csv(path, index=False) 57 | logger.info(f"Unit test data written to {path}") 58 | 59 | 60 | @task 61 | def export( 62 | ctx, 63 | sim_datadir=DEFAULT_SIM_DATADIR, 64 | 
test_datadir=DEFAULT_TEST_DATADIR, 65 | clear=True, 66 | runs=None, 67 | ): 68 | sim_datadir = Path(sim_datadir) 69 | test_datadir = Path(test_datadir).resolve() 70 | logger.info(f"Exporting test data to {test_datadir}") 71 | if clear and test_datadir.exists(): 72 | logger.info(f"Clearing test datadir at {test_datadir}") 73 | shutil.rmtree(test_datadir) 74 | test_datadir.mkdir(exist_ok=True) 75 | for f in glob.glob(str(sim_datadir / "*.csv")): 76 | src = f 77 | dst = test_datadir / Path(f).name 78 | logger.info(f"Copying {src} to {dst}") 79 | shutil.copy(src, dst) 80 | logger.info("Export complete") 81 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rstudio/r-base:4.0-focal 2 | 3 | # Note: We freeze versions because we want point in time validation 4 | # See: https://github.com/sgkit-dev/sgkit/pull/228 5 | 6 | RUN apt-get update \ 7 | && apt-get install python3 python3-pip git pkg-config -y \ 8 | && rm -rf /var/lib/apt/lists/* 9 | 10 | RUN R -e 'install.packages("https://cran.r-project.org/src/contrib/data.table_1.13.0.tar.gz", type="source", repos=NULL)' 11 | RUN R -e 'install.packages("tictoc", version = "1.0", repos = "http://cran.us.r-project.org")' 12 | RUN R -e 'install.packages("BiocManager", version = "1.30.10", repos = "http://cran.us.r-project.org")' 13 | RUN R -e 'BiocManager::install(version = "3.11", ask = FALSE)' || \ 14 | R -e 'BiocManager::install(version = "3.11", ask = FALSE, force = TRUE)' 15 | RUN R -e 'BiocManager::install("SNPRelate", version = "3.11", ask = FALSE)' 16 | RUN R -e 'BiocManager::install("gdsfmt", version = "3.11", ask = FALSE)' 17 | RUN R -e 'BiocManager::install("GWASTools", version = "3.11", ask = FALSE)' 18 | RUN R -e 'BiocManager::install("GENESIS", version = "3.11", ask = FALSE)' 19 | 
-------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/README.md: -------------------------------------------------------------------------------- 1 | This runs test to validate our implementation gets the same 2 | exact results as the reference R implementation for production 3 | HapMap data. 4 | 5 | This code is scheduled as part of the Github Actions CI. 6 | 7 | To run manually, you need to first download the test data 8 | from `https://storage.googleapis.com/sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip`, 9 | the file size is about 32MB. 10 | 11 | ```bash 12 | wget https://storage.googleapis.com/sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip -P /tmp/ 13 | ./run.sh /tmp/hapmap_JPT_CHB_r23a_filtered.zip 14 | ``` 15 | 16 | `run.sh` will: 17 | * convert plink data to GDS 18 | * run reference [R PC-Relate implementation](pc_relate.R) 19 | * run [our PC-Relate and compare results](validate_pc_relate.py) 20 | 21 | The only requirement is that you have Docker and Bash installed. 
22 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/convert_plink_to_gds.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(gdsfmt) 4 | library(SNPRelate) 5 | 6 | args <- commandArgs(trailingOnly=TRUE) 7 | 8 | if (length(args) < 2) { 9 | stop("usage: ", call.=FALSE) 10 | } 11 | 12 | snpgdsBED2GDS(bed.fn=paste(args[1], ".bed", sep = ""), 13 | bim.fn=paste(args[1], ".bim", sep = ""), 14 | fam.fn=paste(args[1], ".fam", sep = ""), 15 | out.gdsfn=args[2]) 16 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/pc_relate.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(gdsfmt) 4 | library(SNPRelate) 5 | library(GWASTools) 6 | library(GENESIS) 7 | library(tictoc) 8 | 9 | args <- commandArgs(trailingOnly=TRUE) 10 | 11 | if (length(args) < 1) { 12 | stop("usage: ", call.=FALSE) 13 | } 14 | 15 | gds_filepath = args[1] 16 | 17 | tic("KING kinship") 18 | genofile <- snpgdsOpen(gds_filepath) 19 | king_mat <- snpgdsIBDKING(genofile, num.thread=8) 20 | king_mat_2 = kingToMatrix(king_mat) 21 | toc(log = TRUE) 22 | snpgdsClose(genofile) 23 | 24 | reader <- GdsGenotypeReader(gds_filepath, "scan,snp") 25 | geno_data <- GenotypeData(reader) 26 | 27 | tic("PC-AIR") 28 | pcair_result <- pcair(geno_data, 29 | kinobj = king_mat_2, 30 | divobj = king_mat_2) 31 | toc(log = TRUE) 32 | summary(pcair_result) 33 | 34 | write.csv(pcair_result$vectors[,1:2], file = "pcs.csv") 35 | 36 | geno_data <- GenotypeBlockIterator(geno_data) 37 | tic("PC-Relate") 38 | pcrelate_result <- pcrelate(geno_data, 39 | pcs = pcair_result$vectors[,1:2]) 40 | toc(log = TRUE) 41 | 42 | 43 | write.csv(pcrelate_result$kinSelf, "kinself.csv") 44 | write.csv(pcrelate_result$kinBtwn, "kinbtwe.csv") 45 | write.csv(pcair_result$unrels, 
"unrels.csv") 46 | write.csv(pcair_result$rels, "rels.csv") 47 | summary(pcrelate_result) 48 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | REPO_ROOT="$( cd "$DIR/../../../.." >/dev/null 2>&1 && pwd )" 8 | 9 | TEST_DATA="$1" 10 | 11 | if [[ -z "$TEST_DATA" ]]; then 12 | echo "usage $0 " >&2 13 | echo "You can download real test data from https://storage.googleapis.com/sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip" >&2 14 | exit 1 15 | fi 16 | 17 | if [[ -z "$RUNNING_IN_SGKIT_PC_RELATE_VALIDATION_DOCKER" ]]; then 18 | # Note: to speed up the process up, we could: 19 | # * build the docker image and push it to the GHCR/GCR 20 | # * or add docker layer caching https://github.com/marketplace/actions/docker-layer-caching 21 | # 22 | # For now, we just build new docker image each time, if we want to run 23 | # validation more often than weekly or just want to get results 24 | # faster we could add any of the above in the future. 25 | echo "Building validation docker image, this will take about ~20 minutes ..." 26 | docker build -t sgkit_pc_relate_validation -f "$DIR/Dockerfile" "$DIR" 27 | docker run --rm \ 28 | -v $DIR:/work \ 29 | -v $REPO_ROOT:/code \ 30 | -v $TEST_DATA:/test_data.zip \ 31 | -e RUNNING_IN_SGKIT_PC_RELATE_VALIDATION_DOCKER=1 \ 32 | sgkit_pc_relate_validation /work/run.sh "$1" 33 | else 34 | echo "Running inside docker, will crunch data ..." 35 | unzip test_data.zip 36 | /work/convert_plink_to_gds.R hapmap_JPT_CHB_r23a_filtered hapmap_JPT_CHB_r23a_filtered.gds 37 | /work/pc_relate.R hapmap_JPT_CHB_r23a_filtered.gds 38 | cp /work/validate_pc_relate.py . 
39 | pip3 install --upgrade '/code[plink]' pytest 40 | PYTHONPATH=/code:$PYTHONPATH pytest ./validate_pc_relate.py 41 | fi 42 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/validate_pc_relate.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import dask.array as da 4 | import numpy as np 5 | import pandas as pd 6 | import xarray as xr 7 | 8 | from sgkit import pc_relate 9 | from sgkit.io.plink import read_plink 10 | from sgkit.variables import sample_pca_projection 11 | 12 | 13 | def test_same_as_the_reference_implementation() -> None: 14 | """ 15 | This test validates that our implementation gets exactly 16 | the same results as the reference R implementation. 17 | """ 18 | 19 | d = Path(__file__).parent 20 | ds = read_plink(path="hapmap_JPT_CHB_r23a_filtered") 21 | 22 | pcs = da.from_array( 23 | pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy() 24 | ) 25 | ds[sample_pca_projection] = (("samples", "components"), pcs) 26 | phi = pc_relate(ds).pc_relate_phi.compute() 27 | 28 | n_samples = 90 29 | assert isinstance(phi, xr.DataArray) 30 | assert phi.shape == (n_samples, n_samples) 31 | 32 | # Get genesis/reference results: 33 | genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv")) 34 | genesis_phi = genesis_phi[["kin"]].to_numpy() 35 | 36 | phi_s = phi.data[np.triu_indices_from(phi.data, 1)] # type: ignore[no-untyped-call] 37 | assert phi_s.size == genesis_phi.size 38 | assert np.allclose(phi_s, genesis_phi.T) 39 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | hail-*.log -------------------------------------------------------------------------------- /validation/gwas/method/regenie/README.md: 
-------------------------------------------------------------------------------- 1 | ## REGENIE Validation 2 | 3 | The scripts in this directory are used to generate data and validate results from a reference implementation, specifically [GloWGR](https://glow.readthedocs.io/en/latest/tertiary/whole-genome-regression.html). 4 | 5 | The general flow for this process is: 6 | 7 | 1. Generate simulated genotypes, covariates, and traits saved as PLINK (via Hail) and pandas DataFrames 8 | 2. Convert PLINK results to Zarr 9 | 3. Run Glow WGR to produce results to compare against 10 | 4. Export a subset of these results and the configuration that defines them to a unit test directory 11 | 12 | *Note*: The initial PLINK output is used for compatibility with the REGENIE C++ application 13 | 14 | All of the above are represented as pyinvoke tasks in [tasks.py](tasks.py). 15 | 16 | The definition of each simulated dataset and parameterizations run against them can be seen in [config.yml](config.yml). 17 | 18 | At time of writing, these commands were used to generate the current test data: 19 | 20 | ```bash 21 | # Build the simulated inputs and outputs 22 | invoke build 23 | # Export select results to build unit tests against 24 | invoke export --runs sim_sm_02-wgr_02 --runs sim_sm_01-wgr_01 25 | ``` 26 | 27 | ### Glow WGR Release 28 | 29 | This validation was run for [glow.py==0.5.0](https://pypi.org/project/glow.py/0.5.0/). At this time, binary traits are not yet supported and the REGENIE implementation hasn't even been officially released. Support for [binary traits should come in the next release](https://github.com/projectglow/glow/issues/256) along with official support at which time this validation should be updated. From that point onward, there is little need to update this data unless either implementation (sgkit or Glow) has been shown to be incorrect. 
30 | 31 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/regenie/__init__.py -------------------------------------------------------------------------------- /validation/gwas/method/regenie/config.yml: -------------------------------------------------------------------------------- 1 | datasets: 2 | sim_sm_01: 3 | n_variants: 250 4 | n_samples: 50 5 | n_covars: 3 6 | n_contigs: 1 7 | n_traits: 1 8 | sim_sm_02: 9 | n_variants: 250 10 | n_samples: 50 11 | n_covars: 3 12 | n_contigs: 10 13 | n_traits: 5 14 | sim_sm_03: 15 | n_variants: 250 16 | n_samples: 50 17 | n_covars: 3 18 | n_contigs: 10 19 | n_traits: 1 20 | sim_md_01: 21 | n_variants: 1000 22 | n_samples: 250 23 | n_covars: 3 24 | n_contigs: 1 25 | n_traits: 1 26 | paramsets: 27 | wgr_01: 28 | variant_block_size: 10 29 | sample_block_size: 10 30 | alphas: [1000] 31 | wgr_02: 32 | variant_block_size: 10 33 | sample_block_size: 10 34 | alphas: null 35 | wgr_03: 36 | variant_block_size: 100 37 | sample_block_size: 50 38 | alphas: [1000] 39 | runs: 40 | - {dataset: sim_sm_01, paramset: wgr_01, name: sim_sm_01-wgr_01} 41 | - {dataset: sim_sm_02, paramset: wgr_01, name: sim_sm_02-wgr_01} 42 | - {dataset: sim_sm_02, paramset: wgr_02, name: sim_sm_02-wgr_02} 43 | - {dataset: sim_sm_03, paramset: wgr_01, name: sim_sm_03-wgr_01} 44 | - {dataset: sim_md_01, paramset: wgr_03, name: sim_md_01-wgr_01} -------------------------------------------------------------------------------- /validation/gwas/method/regenie/invoke.yaml: -------------------------------------------------------------------------------- 1 | tasks: 2 | auto_dash_names: false 3 | 4 | -------------------------------------------------------------------------------- 
/validation/gwas/method/regenie/logging.ini: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=console 6 | 7 | [formatters] 8 | keys=console_formatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [handler_console] 15 | level=INFO 16 | class=StreamHandler 17 | formatter=console_formatter 18 | args=(sys.stdout,) 19 | 20 | [formatter_console_formatter] 21 | format=%(asctime)s|%(levelname)s|%(name)s.%(funcName)s:%(lineno)d| %(message)s -------------------------------------------------------------------------------- /validation/gwas/method/regenie/sgkit_zarr.py: -------------------------------------------------------------------------------- 1 | #!/opt/conda/bin/python 2 | # coding: utf-8 3 | 4 | import logging 5 | import logging.config 6 | from pathlib import Path 7 | 8 | import fire 9 | import yaml 10 | import zarr 11 | from sgkit_plink import read_plink 12 | 13 | logging.config.fileConfig("logging.ini") 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def run(dataset: str, dataset_dir="data/dataset"): 18 | dataset_dir = Path(dataset_dir) 19 | plink_path = dataset_dir / dataset / "genotypes" 20 | zarr_path = dataset_dir / dataset / "genotypes.zarr.zip" 21 | ds = read_plink(path=plink_path, bim_sep="\t", fam_sep="\t") 22 | # Pre-compute string lengths until this is done: 23 | # https://github.com/sgkit-dev/sgkit-plink/issues/12 24 | ds = ds.compute() 25 | logger.info(f"Loaded dataset {dataset}:") 26 | logger.info("\n" + str(ds)) 27 | store = zarr.ZipStore(zarr_path, mode="w") 28 | ds.to_zarr(store, mode="w") 29 | store.close() 30 | logger.info(f"Conversion to zarr at {zarr_path} successful") 31 | 32 | 33 | def run_from_config(): 34 | with open("config.yml") as fd: 35 | config = yaml.load(fd, Loader=yaml.FullLoader) 36 | for dataset in config["datasets"]: 37 | run(dataset) 38 | 39 | 40 | fire.Fire() 41 | 
-------------------------------------------------------------------------------- /validation/gwas/method/regenie/tasks.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import logging.config 4 | import os 5 | import shutil 6 | from pathlib import Path 7 | 8 | import yaml 9 | from invoke import task 10 | 11 | logging.config.fileConfig("logging.ini") 12 | logger = logging.getLogger(__name__) 13 | 14 | HAILPY = os.environ.get("HAIL_PYTHON_EXECUTABLE", "/opt/conda/envs/hail/bin/python") 15 | GLOWPY = os.environ.get("GLOW_PYTHON_EXECUTABLE", "/opt/conda/envs/glow/bin/python") 16 | BASEPY = os.environ.get("BASE_PYTHON_EXECUTABLE", "/opt/conda/bin/python") 17 | DEFAULT_TEST_DATADIR = os.getenv( 18 | "TEST_DATADIR", str(Path(__file__).parents[4] / "sgkit/tests/test_regenie") 19 | ) 20 | 21 | 22 | def get_config(): 23 | with open("config.yml") as fd: 24 | return yaml.load(fd, Loader=yaml.FullLoader) 25 | 26 | 27 | def filter_config(config, runs): 28 | res = {"datasets": {}, "paramsets": {}, "runs": []} 29 | for run in config["runs"]: 30 | name = run["name"] 31 | if name not in runs: 32 | continue 33 | if run["dataset"] not in res["datasets"]: 34 | res["datasets"][run["dataset"]] = config["datasets"][run["dataset"]] 35 | if run["paramset"] not in res["paramsets"]: 36 | res["paramsets"][run["paramset"]] = config["paramsets"][run["paramset"]] 37 | res["runs"].append(run) 38 | return res 39 | 40 | 41 | @task 42 | def run_simulation(ctx, dataset): 43 | logger.info(f"Running simulation for dataset {dataset}") 44 | ctx.run(f"{HAILPY} hail_sim.py run_from_config {dataset}") 45 | 46 | 47 | @task 48 | def run_simulations(ctx): 49 | config = get_config() 50 | for dataset in config["datasets"]: 51 | run_simulation(ctx, dataset) 52 | 53 | 54 | @task 55 | def run_glow_wgr(ctx, dataset, paramset): 56 | logger.info(f"Running Glow WGR for dataset {dataset}, paramset {paramset}") 57 | ctx.run(f"{GLOWPY} glow_wgr.py 
run_from_config {dataset} {paramset}") 58 | 59 | 60 | @task 61 | def run_plink_to_zarr(ctx): 62 | ctx.run(f"{BASEPY} sgkit_zarr.py run_from_config") 63 | 64 | 65 | @task 66 | def run_all_glow_wgr(ctx): 67 | config = get_config() 68 | for run in config["runs"]: 69 | run_glow_wgr(ctx, run["dataset"], run["paramset"]) 70 | 71 | 72 | def copy_files(src, dst, patterns): 73 | logger.info(f"Copying files from {src} to {dst}") 74 | dst.mkdir(parents=True, exist_ok=True) 75 | files = [Path(f) for pattern in patterns for f in glob.glob(str(src / pattern))] 76 | for f in files: 77 | logger.info(f"\tCopying path: {f}") 78 | if f.is_dir(): 79 | shutil.copytree(f, dst / f.name) 80 | else: 81 | shutil.copy(f, dst) 82 | 83 | 84 | @task(iterable=["runs"]) 85 | def export(ctx, test_datadir=DEFAULT_TEST_DATADIR, clear=True, runs=None): 86 | test_datadir = Path(test_datadir).resolve() 87 | src_datadir = Path("data") 88 | if clear and test_datadir.exists(): 89 | logger.info(f"Clearing test datadir at {test_datadir}") 90 | shutil.rmtree(test_datadir) 91 | test_datadir.mkdir(exist_ok=True) 92 | config = get_config() 93 | if runs is not None: 94 | config = filter_config(config, runs) 95 | # Export datasets 96 | for dataset in config["datasets"]: 97 | dst = test_datadir / "dataset" / dataset 98 | src = src_datadir / "dataset" / dataset 99 | copy_files(src, dst, ["*.csv", "*.csv.gz", "*.zarr.zip"]) 100 | # Export results 101 | for run in config["runs"]: 102 | name = run["name"] 103 | dst = test_datadir / "result" / name 104 | src = src_datadir / "result" / name 105 | copy_files(src, dst, ["*.csv", "*.csv.gz"]) 106 | # Export config 107 | config_path = test_datadir / "config.yml" 108 | with open(config_path, "w") as fd: 109 | yaml.dump(config, fd) 110 | logger.info(f"Config written to {config_path}") 111 | logger.info("Export complete") 112 | 113 | 114 | @task(pre=[run_simulations, run_all_glow_wgr, run_plink_to_zarr]) 115 | def build(ctx): 116 | logger.info("Test data generation complete") 
117 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie_loco_regression/README.md: -------------------------------------------------------------------------------- 1 | This notebook is used to generate validation data for `sgkit.stats.association.regenie_loco_regression`. It generates offsets to pass as a parameter to the function as well as results from GLOW to check it against. 2 | 3 | Follow these steps to start the `.ipynb` notebooks 4 | 5 | 1. Create and activate a conda environment: 6 | 7 | ``` 8 | conda env create -f environment.yml 9 | conda activate glow 10 | ``` 11 | 12 | 2. Find the location of the corresponding pyspark binary, by typing the following commands in a python console: 13 | 14 | ``` 15 | python -c "import pyspark; print(pyspark.__path__)" 16 | ``` 17 | 18 | 3. Start the Jupyter notebook (make sure to replace `/path/to/pyspark` by that from the command above): 19 | 20 | ``` 21 | PYSPARK_DRIVER_PYTHON=jupyter-lab PYSPARK_DRIVER_PYTHON_OPTS="--ip 0.0.0.0 --port 9999 --no-browser" /path/to/pyspark/bin/pyspark --packages io.projectglow:glow-spark3_2.12:1.0.1 --conf spark.hadoop.io.compression.codecs=io.projectglow.sql.util.BGZFCodec 22 | ``` 23 | 24 | If your notebook is running on a remote server and you can't connect directly to port 9999, run the command below to tunnel the remote server's port 9999 to your local host. 
25 | 26 | ``` 27 | ssh -N -L localhost:9999:localhost:9999 @ 28 | ``` 29 | 30 | Note: The notebooks are based on the two `.rst` files provided in the glow repository: `docs/source/tertiary/regression-tests.rst` and `whole-genome-regression.rst` 31 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie_loco_regression/environment.yml: -------------------------------------------------------------------------------- 1 | name: glow 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | dependencies: 6 | - python=3.7 7 | - glow=1.0.1=pyh44b312d_0 8 | - jupyterlab=3 9 | - numpy=1.18.1 10 | - pandas=1.0.1 11 | - pyspark=3.1.2 12 | - xarray 13 | - zarr 14 | --------------------------------------------------------------------------------