├── .cirun.yml ├── .coveragerc ├── .flake8 ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── scripts │ ├── test_sgkit.py │ ├── test_sgkit_bgen.py │ ├── test_sgkit_plink.py │ └── upstream_install.py └── workflows │ ├── benchmark.yml │ ├── build-gpu.yml │ ├── build-numpy-1.yml │ ├── build.yml │ ├── check-docs.yml │ ├── cubed.yml │ ├── docs.yml │ ├── upstream.yml │ ├── validation.yml │ ├── wheels.yml │ └── windows.yml ├── .gitignore ├── .mergify.yml ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── GOVERNANCE.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── __init__.py ├── asv.conf.json ├── benchmarks_plink.py └── benchmarks_stats.py ├── conftest.py ├── docs ├── .gitignore ├── Makefile ├── _static │ ├── data-structures-xarray.jpg │ ├── docsearch.sbt.css │ ├── docsearch.sbt.js │ ├── mydask.png │ ├── numfocus_logo.png │ ├── order.png │ ├── sgkit_blue_trnsprnt.png │ ├── sgkit_trnsprnt.png │ └── switcher.json ├── about.rst ├── api.rst ├── changelog.rst ├── conf.py ├── contributing.rst ├── examples │ ├── 1kg.schema.json │ ├── gwas_tutorial.ipynb │ ├── index.rst │ └── relatedness_tutorial.ipynb ├── extensions │ └── typed_returns.py ├── getting_started.rst ├── how_do_i.rst ├── index.rst ├── news.rst ├── news │ └── introducing_sgkit.md └── user_guide.rst ├── pyproject.toml ├── requirements-dev.txt ├── requirements-doc.txt ├── requirements-numpy1-dev.txt ├── requirements-numpy1.txt ├── requirements.txt ├── sgkit ├── __init__.py ├── accelerate.py ├── cohorts.py ├── display.py ├── display_numba_fns.py ├── distance │ ├── __init__.py │ ├── api.py │ └── metrics.py ├── distarray.py ├── io │ ├── __init__.py │ ├── bgen │ │ ├── __init__.py │ │ └── bgen_reader.py │ ├── dataset.py │ ├── plink │ │ ├── __init__.py │ │ ├── plink_reader.py │ │ └── plink_writer.py │ └── utils.py ├── model.py ├── py.typed ├── stats │ ├── __init__.py │ ├── aggregation.py │ ├── aggregation_numba_fns.py │ ├── association.py │ ├── cohort_numba_fns.py │ ├── conversion.py │ ├── conversion_numba_fns.py │ 
├── genedrop.py │ ├── genedrop_numba_fns.py │ ├── genee.py │ ├── genee_momentchi2py.py │ ├── grm.py │ ├── hwe.py │ ├── ibs.py │ ├── ibs_numba_fns.py │ ├── ld.py │ ├── pc_relate.py │ ├── pca.py │ ├── pedigree.py │ ├── pedigree_numba_fns.py │ ├── popgen.py │ ├── popgen_numba_fns.py │ ├── preprocessing.py │ ├── regenie.py │ ├── truncated_svd.py │ └── utils.py ├── testing.py ├── tests │ ├── __init__.py │ ├── data │ │ └── sample.bed │ ├── io │ │ ├── __init__.py │ │ ├── bgen │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── .gitignore │ │ │ │ ├── example-no-samples.bgen │ │ │ │ ├── example-separate-samples.bgen │ │ │ │ ├── example-separate-samples.sample │ │ │ │ ├── example.bgen │ │ │ │ └── samples │ │ │ └── test_bgen_reader.py │ │ ├── data │ │ │ ├── sample.vcf.gz │ │ │ └── sample.vcf.gz.tbi │ │ ├── plink │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── example.bed │ │ │ │ ├── example.bim │ │ │ │ ├── example.fam │ │ │ │ ├── example.map │ │ │ │ ├── example.nosex │ │ │ │ ├── example.ped │ │ │ │ ├── example_with_fam.bed │ │ │ │ ├── example_with_fam.bim │ │ │ │ ├── example_with_fam.fam │ │ │ │ ├── plink_sim_10s_100v_10pmiss.bed │ │ │ │ ├── plink_sim_10s_100v_10pmiss.bim │ │ │ │ └── plink_sim_10s_100v_10pmiss.fam │ │ │ ├── test_plink_reader.py │ │ │ └── test_plink_writer.py │ │ ├── test_dataset.py │ │ └── test_vcf2zarr_compat.py │ ├── test_aggregation.py │ ├── test_association.py │ ├── test_cohort_numba_fns.py │ ├── test_cohorts.py │ ├── test_conversion.py │ ├── test_display.py │ ├── test_distance.py │ ├── test_genedrop.py │ ├── test_genee.py │ ├── test_genee │ │ ├── gene_list.csv │ │ ├── ld.csv │ │ ├── mydata.csv │ │ └── result.csv │ ├── test_genee_momentchi2py.py │ ├── test_grm.py │ ├── test_grm │ │ ├── AGHmatrix_sol100_A.csv │ │ ├── AGHmatrix_sol100_H_tau1.2_omega1.csv │ │ ├── AGHmatrix_sol100_H_tau1_omega0.9.csv │ │ ├── AGHmatrix_sol100_H_tau1_omega1.csv │ │ ├── AGHmatrix_sol30_A.csv │ │ ├── AGHmatrix_sol30_G.csv │ │ ├── AGHmatrix_sol30_H_tau0.8_omega1.csv │ │ ├── 
AGHmatrix_sol30_H_tau1_omega1.1.csv │ │ ├── AGHmatrix_sol30_H_tau1_omega1.csv │ │ ├── Legara2009_G_matrix.txt │ │ ├── Legara2009_H_matrix.txt │ │ ├── Legara2009_pedigree.txt │ │ ├── pine_snps_100_500.csv │ │ ├── pine_snps_100_500_A_matrix.txt │ │ ├── pine_snps_100_500_EJ_matrix.txt │ │ ├── sim4x_snps_A_matrix.txt │ │ └── sim4x_snps_EJ_matrix.txt │ ├── test_hwe.py │ ├── test_hwe │ │ └── sim_01.csv │ ├── test_ibs.py │ ├── test_ibs │ │ ├── hierfstat.sim1.beta.txt │ │ ├── hierfstat.sim1.dose.txt │ │ ├── hierfstat.sim2.beta.txt │ │ ├── hierfstat.sim2.dose.txt │ │ ├── hierfstat.sim3.beta.txt │ │ └── hierfstat.sim3.dose.txt │ ├── test_import_star.py │ ├── test_ld.py │ ├── test_mis.py │ ├── test_model.py │ ├── test_pc_relate.py │ ├── test_pca.py │ ├── test_pedigree.py │ ├── test_pedigree │ │ ├── hamilton_kerr_A_matrix.txt │ │ ├── hamilton_kerr_A_matrix_inv.txt │ │ ├── hamilton_kerr_inbreeding.txt │ │ ├── hamilton_kerr_kinship.txt │ │ ├── hamilton_kerr_kinship_inv.txt │ │ ├── hamilton_kerr_pedigree.csv │ │ ├── kinship2_kinship.txt │ │ ├── kinship2_pedigree.csv │ │ ├── pedkin_sim_founder.txt │ │ ├── pedkin_sim_interest.txt │ │ ├── pedkin_sim_out.txt │ │ └── pedkin_sim_ped.txt │ ├── test_popgen.py │ ├── test_preprocessing.py │ ├── test_regenie.py │ ├── test_regenie │ │ ├── config.yml │ │ ├── dataset │ │ │ ├── sim_sm_01 │ │ │ │ ├── beta_covariate.csv │ │ │ │ ├── beta_variant.csv │ │ │ │ ├── covariates.csv │ │ │ │ ├── genotypes.zarr.zip │ │ │ │ └── traits.csv │ │ │ └── sim_sm_02 │ │ │ │ ├── beta_covariate.csv │ │ │ │ ├── beta_variant.csv │ │ │ │ ├── covariates.csv │ │ │ │ ├── genotypes.zarr.zip │ │ │ │ ├── glow_offsets.zarr.zip │ │ │ │ ├── glow_offsets_nocovariate.zarr.zip │ │ │ │ └── traits.csv │ │ └── result │ │ │ ├── sim_sm_01-wgr_01 │ │ │ ├── gwas.csv │ │ │ ├── predictions.csv │ │ │ └── reduced_blocks_flat.csv.gz │ │ │ └── sim_sm_02-wgr_02 │ │ │ ├── gwas.csv │ │ │ ├── gwas_loco.csv │ │ │ ├── gwas_loco_nocovariate.csv │ │ │ ├── predictions.csv │ │ │ └── 
reduced_blocks_flat.csv.gz │ ├── test_stats_utils.py │ ├── test_testing.py │ ├── test_utils.py │ ├── test_variables.py │ └── test_window.py ├── typing.py ├── utils.py ├── variables.py └── window.py └── validation ├── __init__.py └── gwas ├── __init__.py ├── docker ├── Dockerfile ├── README.md ├── environment-dev.yml ├── environment-glow.yml ├── environment-hail.yml └── environment.yml └── method ├── __init__.py ├── hwe ├── Makefile ├── README.md ├── __init__.py ├── chwe.c ├── chwe.o ├── data │ └── sim_01.csv ├── hwe_unit_test.ipynb ├── invoke.yaml ├── logging.ini └── tasks.py ├── pc_relate ├── Dockerfile ├── README.md ├── convert_plink_to_gds.R ├── pc_relate.R ├── run.sh └── validate_pc_relate.py ├── regenie ├── .gitignore ├── README.md ├── __init__.py ├── config.yml ├── glow_wgr.py ├── hail_sim.py ├── invoke.yaml ├── logging.ini ├── sgkit_zarr.py ├── tasks.py └── unit_test_dev.ipynb └── regenie_loco_regression ├── GlowGR_continuous.ipynb ├── README.md └── environment.yml /.cirun.yml: -------------------------------------------------------------------------------- 1 | # Self-Hosted Github Action Runners on GCP via Cirun.io 2 | # Reference: https://docs.cirun.io/reference/yaml 3 | runners: 4 | - name: gpu-runner 5 | # Cloud Provider: GCP 6 | cloud: gcp 7 | # Cheapest GPU on GCP 8 | gpu: nvidia-tesla-t4 9 | # Cheapest VM on GCP, with GPU attachable 10 | instance_type: n1-standard-1 11 | # Custom image with NVIDIA drivers installed on Ubuntu-20.4 12 | # to reduce provision time 13 | # Format => project_name:image_name 14 | machine_image: sgkit-dev:cirun-nvidia-v2 15 | region: 16 | - us-central1-a 17 | - us-central1-b 18 | - us-central1-c 19 | - us-central1-f 20 | - us-east1-c 21 | - us-east1-d 22 | - us-east4-a 23 | - us-east4-b 24 | - us-east4-c 25 | - us-west1-a 26 | - us-west1-b 27 | - us-west2-b 28 | - us-west2-c 29 | - us-west4-a 30 | - us-west4-b 31 | # preemptible instances seems quite less reliable. 
32 | preemptible: false 33 | # Adding the GPU label, this matches the runs-on param from .github/workflows/build-gpu.yml 34 | labels: 35 | - cirun-gpu-runner 36 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | sgkit/tests/* 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | # whitespace before ':' - doesn't work well with black 4 | E203 5 | E402 6 | # line too long - let black worry about that 7 | E501 8 | # do not assign a lambda expression, use a def 9 | E731 10 | # line break before binary operator 11 | W503 12 | 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [ ] Fixes #xxxx 4 | - [ ] Tests added 5 | - [ ] User visible changes (including notable bug fixes) are documented in `changelog.rst` 6 | - [ ] New functions are listed in `api.rst` 7 | -------------------------------------------------------------------------------- /.github/scripts/test_sgkit.py: -------------------------------------------------------------------------------- 1 | import sgkit as sg 2 | 3 | if __name__ == "__main__": 4 | ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=50, n_contig=23) 5 | print(ds) 6 | -------------------------------------------------------------------------------- /.github/scripts/test_sgkit_bgen.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | from sgkit.io.bgen import read_bgen 4 | 5 | if __name__ == "__main__": 6 | urllib.request.urlretrieve( 7 | 
"https://github.com/sgkit-dev/sgkit/raw/main/sgkit/tests/io/bgen/data/example.bgen", 8 | "example.bgen", 9 | ) 10 | ds = read_bgen("example.bgen") 11 | print(ds) 12 | -------------------------------------------------------------------------------- /.github/scripts/test_sgkit_plink.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | from sgkit.io.plink import read_plink 4 | 5 | if __name__ == "__main__": 6 | for ext in (".bed", ".bim", ".fam"): 7 | urllib.request.urlretrieve( 8 | f"https://github.com/sgkit-dev/sgkit/raw/main/sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss{ext}", 9 | f"plink_sim_10s_100v_10pmiss{ext}", 10 | ) 11 | ds = read_plink(path="plink_sim_10s_100v_10pmiss") 12 | print(ds) 13 | -------------------------------------------------------------------------------- /.github/scripts/upstream_install.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from pathlib import Path 4 | 5 | 6 | def install_deps() -> None: 7 | # NOTE: need to use legacy-resolver due to https://github.com/dask/community/issues/124 8 | install_cmd = ( 9 | sys.executable, 10 | "-m", 11 | "pip", 12 | "install", 13 | "--use-deprecated=legacy-resolver", 14 | "--upgrade", 15 | ) 16 | upstream_deps = ( 17 | "git+https://github.com/dask/dask.git#egg=dask[array,dataframe]", 18 | "git+https://github.com/dask/distributed.git#egg=distributed", 19 | "git+https://github.com/pandas-dev/pandas#egg=pandas", 20 | "git+https://github.com/pangeo-data/rechunker.git#egg=rechunker", 21 | "git+https://github.com/pydata/xarray.git#egg=xarray", 22 | "git+https://github.com/zarr-developers/zarr-python.git#egg=zarr", 23 | ) 24 | full_cmd_upstream = install_cmd + upstream_deps 25 | print(f"Install upstream dependencies via: {full_cmd_upstream}") 26 | subprocess.check_call(full_cmd_upstream) 27 | req_deps = set(Path("requirements.txt").read_text().splitlines()) 28 | 
req_upstream = [x.split("egg=")[-1].strip() for x in upstream_deps] 29 | req_left = tuple(x for x in req_deps if not any(y in x for y in req_upstream)) 30 | full_cmd_left_over = install_cmd + req_left 31 | print(f"Install left over dependencies via: {full_cmd_left_over}") 32 | subprocess.check_call(full_cmd_left_over) 33 | 34 | 35 | def install_self() -> None: 36 | install_cmd = ( 37 | sys.executable, 38 | "-m", 39 | "pip", 40 | "install", 41 | "--no-deps", 42 | "-e" ".", 43 | ) 44 | print(f"Install sgkit via: `{install_cmd}`") 45 | subprocess.check_call(install_cmd) 46 | 47 | 48 | if __name__ == "__main__": 49 | install_deps() 50 | install_self() 51 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmarks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} 10 | BENCHMARKS_REPO: sgkit-dev/sgkit-benchmarks-asv 11 | ASV_CONFIG: benchmarks/asv.conf.json 12 | MACHINE_NAME: github-actions # to identify github actions machine as hostname changes everytime 13 | 14 | jobs: 15 | build: 16 | # This workflow only runs on the origin org 17 | if: github.repository_owner == 'sgkit-dev' 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 # To fetch all commits to be able to generate benchmarks html 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: "3.10" 27 | - name: Install dependencies 28 | run: | 29 | sudo apt update -y 30 | python -m pip install --upgrade pip 31 | pip install asv 32 | 33 | - name: Set and log asv machine configuration 34 | run: | 35 | asv machine --yes --config benchmarks/asv.conf.json 36 | echo "Machine Configuration:" 37 | cat ~/.asv-machine.json 38 | rm ~/.asv-machine.json 39 | 40 | echo "Setting machine name to 
$MACHINE_NAME" 41 | asv machine --machine $MACHINE_NAME --yes --config $ASV_CONFIG -v 42 | 43 | - name: Run benchmarks 44 | run: | 45 | asv run --config $ASV_CONFIG -v 46 | 47 | - name: Copy benchmarks to benchmarks repo directory 48 | run: | 49 | git clone https://$GITHUB_TOKEN@github.com/$BENCHMARKS_REPO.git ~/$BENCHMARKS_REPO 50 | RESULTS_DIR=~/$BENCHMARKS_REPO/results 51 | if [ -d "$RESULTS_DIR" ] 52 | then 53 | cp -r $RESULTS_DIR/$MACHINE_NAME/* benchmarks/results/$MACHINE_NAME/ 54 | else 55 | echo "results/ directory does not exist in the benchmarks repository" 56 | fi 57 | asv publish --config $ASV_CONFIG -v 58 | cp -r benchmarks/html/* ~/$BENCHMARKS_REPO/ 59 | cp -r benchmarks/results ~/$BENCHMARKS_REPO/ 60 | 61 | - name: Push benchmarks 62 | run: | 63 | cd ~/$BENCHMARKS_REPO 64 | git add . 65 | git config --global user.email "project@sgkit.dev" 66 | git config --global user.name "sgkit benchmark bot" 67 | git commit -m "Update benchmarks" 68 | git push origin main 69 | -------------------------------------------------------------------------------- /.github/workflows/build-gpu.yml: -------------------------------------------------------------------------------- 1 | name: Build GPU 2 | 3 | on: 4 | workflow_dispatch: 5 | # Disabled on 2024-10-14 as this has been broken for over six months 6 | # https://github.com/sgkit-dev/sgkit/issues/1270 7 | # push: 8 | # branches: 9 | # - main 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: "cirun-gpu-runner--${{ github.run_id }}" 15 | defaults: 16 | run: 17 | shell: bash -l {0} 18 | strategy: 19 | matrix: 20 | python-version: ["3.10"] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Run Nvidia-smi 26 | run: | 27 | nvidia-smi 28 | 29 | - name: Set up Python 30 | uses: conda-incubator/setup-miniconda@v2.2.0 31 | env: 32 | CONDA: /home/runnerx/miniconda3 33 | with: 34 | python-version: ${{ matrix.python-version }} 35 | miniconda-version: "latest" 36 | 37 | - name: Conda info 38 | run: | 39 | conda info 40 
| conda list 41 | 42 | - name: Installing cudatoolkit and dependencies 43 | run: | 44 | conda install -c nvidia cudatoolkit 45 | pip install -r requirements.txt -r requirements-dev.txt 46 | 47 | - name: Numba Information 48 | run: | 49 | numba -s 50 | 51 | - name: Run GPU tagged tests 52 | run: | 53 | pytest -m gpu -v 54 | -------------------------------------------------------------------------------- /.github/workflows/build-numpy-1.yml: -------------------------------------------------------------------------------- 1 | name: Build NumPy 1 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | # Scheduled runs only on the origin org 10 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ["3.10", "3.11"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r requirements-numpy1.txt -r requirements-numpy1-dev.txt 26 | # - name: Run pre-commit 27 | # uses: pre-commit/action@v3.0.1 28 | - name: Test with pytest and coverage 29 | run: | 30 | pytest -v --cov=sgkit --cov-report=term-missing 31 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Run at the end of every day 8 | - cron: "0 0 * * *" 9 | 10 | jobs: 11 | build: 12 | # Scheduled runs only on the origin org 13 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | 
python-version: ["3.10", "3.11", "3.12"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt -r requirements-dev.txt 29 | - name: Run pre-commit 30 | uses: pre-commit/action@v3.0.1 31 | - name: Test with pytest and coverage 32 | run: | 33 | pytest -v --cov=sgkit --cov-report=term-missing 34 | - name: Upload coverage to Codecov 35 | uses: codecov/codecov-action@v3 36 | with: 37 | token: ${{ secrets.CODECOV_TOKEN }} 38 | 39 | test-zarr-version: 40 | name: Test Zarr Python v3 41 | # Scheduled runs only on the origin org 42 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 43 | runs-on: ubuntu-latest 44 | strategy: 45 | matrix: 46 | zarr: [">=3"] 47 | steps: 48 | - uses: actions/checkout@v4 49 | - uses: actions/setup-python@v5 50 | with: 51 | python-version: '3.11' 52 | - name: Install dependencies 53 | run: | 54 | python -m pip install --upgrade pip 55 | pip install -r requirements.txt -r requirements-dev.txt 56 | - name: Install zarr${{ matrix.zarr }} 57 | run: | 58 | python -m pip install --pre 'zarr${{ matrix.zarr }}' 59 | python -m pip uninstall -y bio2zarr # TODO: remove when bio2zarr supports Zarr Python 3 60 | - name: Run tests 61 | run: | 62 | pytest 63 | -------------------------------------------------------------------------------- /.github/workflows/check-docs.yml: -------------------------------------------------------------------------------- 1 | name: Check docs 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Run at the end of every day 8 | - cron: "0 0 * * *" 9 | 10 | jobs: 11 | build: 12 | # Scheduled runs only on the origin org 13 | if: (github.event_name == 'schedule' && github.repository_owner == 
'sgkit-dev') || (github.event_name != 'schedule') 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | sudo apt update -y 28 | sudo apt install graphviz # Needed for documentation 29 | python -m pip install --upgrade pip 30 | pip install -r requirements.txt -r requirements-dev.txt -r requirements-doc.txt 31 | pip install -U dask distributed # need latest versions to successully build docs 32 | - name: Run pre-commit 33 | uses: pre-commit/action@v3.0.1 34 | - name: Check for Sphinx doc warnings 35 | run: | 36 | cd docs 37 | make html SPHINXOPTS="-W --keep-going -n" 38 | - uses: actions/upload-artifact@v4 39 | if: failure() 40 | with: 41 | name: gwas_tutorial 42 | path: /home/runner/work/sgkit/sgkit/docs/_build/html/reports/examples/gwas_tutorial.err.log 43 | - uses: actions/upload-artifact@v4 44 | if: failure() 45 | with: 46 | name: relatedness_tutorial 47 | path: /home/runner/work/sgkit/sgkit/docs/_build/html/reports/examples/relatedness_tutorial.err.log 48 | -------------------------------------------------------------------------------- /.github/workflows/cubed.yml: -------------------------------------------------------------------------------- 1 | name: Cubed 2 | 3 | on: 4 | push: 5 | pull_request: 6 | # manual trigger 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | # This workflow only runs on the origin org 12 | # if: github.repository_owner == 'sgkit-dev' 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install deps and 
sgkit 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install -r requirements.txt -r requirements-dev.txt 29 | python -m pip install -U git+https://github.com/cubed-dev/cubed.git -U git+https://github.com/cubed-dev/cubed-xarray.git -U git+https://github.com/pydata/xarray.git 30 | 31 | - name: Test with pytest 32 | run: | 33 | pytest -v sgkit/tests/test_{aggregation,association,hwe,pca,window}.py \ 34 | -k "test_count_call_alleles or \ 35 | test_gwas_linear_regression or \ 36 | test_hwep or \ 37 | test_sample_stats or \ 38 | (test_count_variant_alleles and not test_count_variant_alleles__chunked[call_genotype]) or \ 39 | (test_variant_stats and not test_variant_stats__chunks[chunks2-False]) or \ 40 | (test_pca__array_backend and tsqr) or \ 41 | (test_window and not 12-5-4-4)" \ 42 | --use-cubed 43 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.11" 19 | - name: Install dependencies 20 | run: | 21 | sudo apt update -y 22 | sudo apt install graphviz # Needed for documentation 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt -r requirements-dev.txt -r requirements-doc.txt 25 | pip install -U dask distributed # need latest versions to successully build docs 26 | - name: Build Sphinx documentation 27 | run: | 28 | cd docs 29 | make html SPHINXOPTS="-W --keep-going -n" 30 | - name: Commit documentation changes to gh-pages branch 31 | run: | 32 | git clone https://github.com/sgkit-dev/sgkit.git --branch gh-pages --single-branch gh-pages 33 | mkdir -p gh-pages/latest 34 | cp -r 
docs/_build/html/* gh-pages/latest 35 | cd gh-pages 36 | git config --local user.email "action@github.com" 37 | git config --local user.name "GitHub Action" 38 | git add . 39 | git commit -m "Update latest documentation" -a || true # Ignore error if no changes present 40 | - name: Push changes 41 | uses: ad-m/github-push-action@master 42 | with: 43 | branch: gh-pages 44 | directory: gh-pages 45 | force: true 46 | github_token: ${{ secrets.GITHUB_TOKEN }} 47 | -------------------------------------------------------------------------------- /.github/workflows/upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream 2 | 3 | on: 4 | push: 5 | schedule: 6 | - cron: "0 1 * * *" 7 | # manual trigger 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | # This workflow only runs on the origin org 13 | if: github.repository_owner == 'sgkit-dev' 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install deps and sgkit 27 | run: | 28 | sudo apt update -y 29 | python -m pip install --upgrade pip 30 | python .github/scripts/upstream_install.py 31 | python -m pip install -r requirements-dev.txt 32 | 33 | - name: Test with pytest 34 | run: | 35 | python -m pip freeze 36 | pytest -v 37 | -------------------------------------------------------------------------------- /.github/workflows/validation.yml: -------------------------------------------------------------------------------- 1 | name: Validation 2 | 3 | on: 4 | # schedule: 5 | # Run at the end of every day 6 | # Disabled on 2024-09-02 as this has been broken for over a year, and no-one is interested 7 | # in fixing it. 
https://github.com/sgkit-dev/sgkit/issues/1112 8 | # - cron: "0 0 * * *" 9 | # manual trigger 10 | workflow_dispatch: 11 | 12 | jobs: 13 | validation_suite: 14 | # This workflow only runs on the origin org 15 | if: github.repository_owner == 'sgkit-dev' 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: google-github-actions/setup-gcloud@v0 20 | with: 21 | project_id: ${{ secrets.GCP_PROJECT_ID }} 22 | service_account_key: ${{ secrets.GCP_SA_KEY }} 23 | export_default_credentials: true 24 | - name: Download public test data (real HapMap data) 25 | run: gsutil -u $GCLOUD_PROJECT cp gs://sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip /tmp/ 26 | - name: Validate PC Relate 27 | run: ./validation/gwas/method/pc_relate/run.sh /tmp/hapmap_JPT_CHB_r23a_filtered.zip 28 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Wheels 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | - test 9 | tags: 10 | - '*' 11 | release: 12 | types: [published] 13 | 14 | jobs: 15 | build: 16 | # This workflow only runs on the origin org 17 | if: github.repository_owner == 'sgkit-dev' 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ["3.10"] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install setuptools twine wheel build 33 | - name: Build a source distribution and a wheel 34 | run: | 35 | python -m build --sdist --wheel 36 | python -m twine check --strict dist/* 37 | - name: Upload artifacts 38 | uses: actions/upload-artifact@v4 39 | with: 40 | path: dist 41 | 42 | unix-test: 43 | # This workflow only runs 
on the origin org 44 | if: github.repository_owner == 'sgkit-dev' 45 | needs: ['build'] 46 | strategy: 47 | matrix: 48 | os: [ubuntu-latest, macos-latest] 49 | python-version: ["3.10", "3.11"] 50 | runs-on: ${{ matrix.os }} 51 | steps: 52 | # checkout repo to subdirectory to get access to scripts 53 | - uses: actions/checkout@v4 54 | with: 55 | path: sgkit-copy 56 | - name: Download artifacts 57 | uses: actions/download-artifact@v4.1.7 58 | - name: Set up Python ${{ matrix.python-version }} 59 | uses: actions/setup-python@v5 60 | with: 61 | python-version: ${{ matrix.python-version }} 62 | - name: Install wheel and test 63 | run: | 64 | python -VV 65 | # Install the local wheel 66 | wheel=$(ls artifact/sgkit-*.whl) 67 | pip install ${wheel} ${wheel}[bgen] ${wheel}[plink] 68 | python sgkit-copy/.github/scripts/test_sgkit.py 69 | python sgkit-copy/.github/scripts/test_sgkit_bgen.py 70 | python sgkit-copy/.github/scripts/test_sgkit_plink.py 71 | 72 | windows-test: 73 | # This workflow only runs on the origin org 74 | if: github.repository_owner == 'sgkit-dev' 75 | runs-on: windows-latest 76 | needs: ['build'] 77 | strategy: 78 | matrix: 79 | python-version: ["3.10"] 80 | steps: 81 | # checkout repo to subdirectory to get access to scripts 82 | - uses: actions/checkout@v4 83 | with: 84 | path: sgkit-copy 85 | - name: Download artifacts 86 | uses: actions/download-artifact@v4.1.7 87 | - name: Set up Python ${{ matrix.python-version }} 88 | uses: actions/setup-python@v5 89 | with: 90 | python-version: ${{ matrix.python-version }} 91 | - name: Install wheel and test 92 | run: | 93 | python -VV 94 | # Install the local wheel 95 | $env:wheel = $(ls artifact/sgkit-*.whl) 96 | pip install $env:wheel "$env:wheel[bgen]" "$env:wheel[plink]" 97 | python sgkit-copy/.github/scripts/test_sgkit.py 98 | python sgkit-copy/.github/scripts/test_sgkit_bgen.py 99 | python sgkit-copy/.github/scripts/test_sgkit_plink.py 100 | 101 | 102 | pypi-upload: 103 | if: github.repository_owner == 
'sgkit-dev' 104 | runs-on: ubuntu-latest 105 | needs: ['unix-test', 'windows-test'] 106 | steps: 107 | - name: Download all 108 | uses: actions/download-artifact@v4.1.7 109 | - name: Move to dist 110 | run: | 111 | mkdir dist 112 | cp */*.{whl,gz} dist/. 113 | - name: Publish package to TestPyPI 114 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 115 | uses: pypa/gh-action-pypi-publish@release/v1 116 | with: 117 | user: __token__ 118 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 119 | repository_url: https://test.pypi.org/legacy/ 120 | - name: Publish package to PyPI 121 | if: github.event_name == 'release' 122 | uses: pypa/gh-action-pypi-publish@release/v1 123 | with: 124 | user: __token__ 125 | password: ${{ secrets.PYPI_API_TOKEN }} 126 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Windows 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | # Run at the end of every day 8 | - cron: "0 0 * * *" 9 | 10 | jobs: 11 | win_build: 12 | # Scheduled runs only on the origin org 13 | if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule') 14 | runs-on: windows-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Set up Miniconda with Python version ${{ matrix.python-version }} 22 | uses: conda-incubator/setup-miniconda@v2 23 | with: 24 | auto-update-conda: true 25 | channels: conda-forge,numba 26 | miniconda-version: "latest" 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | # activate conda 30 | shell: bash -l {0} 31 | # conda can't install all dev tools, so we need to split it between conda and pip 32 | run: | 33 | conda install --file requirements.txt msprime 34 | pip install -r requirements-dev.txt 35 | 
- name: Test with pytest 36 | # activate conda 37 | shell: bash -l {0} 38 | # To avoid: 'UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1' 39 | env: 40 | OMP_NUM_THREADS: 1 41 | run: | 42 | pytest -v 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | datasets/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # IDE 133 | .vscode 134 | .idea 135 | .DS_Store 136 | 137 | # sgkit 138 | docs/generated 139 | docs/mydask.png 140 | docs/order.png 141 | benchmarks/html 142 | benchmarks/results 143 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: default 3 | conditions: 4 | - base=main 5 | - status-success=build (3.10) 6 | - status-success=build (3.11) 7 | - status-success=win_build (3.10) 8 | - approved-reviews-by=@sgkit-dev/committers 9 | - "#approved-reviews-by>=1" 10 | - label=auto-merge 11 | 12 | pull_request_rules: 13 | - name: automatic merge 14 | conditions: 15 | - base=main 16 | - status-success=build (3.10) 17 | - status-success=build (3.11) 18 | - status-success=win_build (3.10) 19 | - approved-reviews-by=@sgkit-dev/committers 20 | - "#approved-reviews-by>=1" 21 | - label=auto-merge 22 | actions: 23 | queue: 24 | name: default 25 | method: rebase 26 | - name: deleting merged branch 27 | conditions: 28 | - merged 29 | actions: 30 | delete_head_branch: {} 31 | - name: ping author on conflicts 32 | conditions: 33 | - conflict 34 | actions: 35 | comment: 36 | message: This PR has conflicts, @{{author}} please rebase and push updated version 🙏 37 | label: 38 | add: 39 | - conflict 40 | - name: remove conflict label if not needed 
41 | conditions: 42 | - -conflict 43 | actions: 44 | label: 45 | remove: 46 | - conflict 47 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: debug-statements 7 | - id: mixed-line-ending 8 | - id: check-case-conflict 9 | - id: check-yaml 10 | - repo: https://github.com/timothycrosley/isort 11 | rev: 5.12.0 12 | hooks: 13 | - id: isort 14 | - repo: https://github.com/python/black 15 | rev: 23.1.0 16 | hooks: 17 | - id: black 18 | language_version: python3 19 | - repo: https://github.com/pycqa/flake8 20 | rev: 6.1.0 21 | hooks: 22 | - id: flake8 23 | language_version: python3 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. 2 | 3 | For general information on how to contribute see https://sgkit-dev.github.io/sgkit/latest/contributing.html. 4 | -------------------------------------------------------------------------------- /GOVERNANCE.md: -------------------------------------------------------------------------------- 1 | Please see our [code of conduct](https://github.com/sgkit-dev/.github/blob/master/CODE_OF_CONDUCT.md) for more information. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune .github 2 | prune sgkit/tests 3 | prune validation 4 | exclude .coveragerc .gitignore .mergify.yml .pre-commit-config.yaml conftest.py 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sgkit: Scalable genetics toolkit in Python 2 | [![Build status](https://github.com/sgkit-dev/sgkit/workflows/Build/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Build%22+branch%3Amain) 3 | [![Windows build status](https://github.com/sgkit-dev/sgkit/workflows/Windows/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Windows%22+branch%3Amain) 4 | [![Documentation status](https://github.com/sgkit-dev/sgkit/workflows/Docs/badge.svg?branch=main)](https://sgkit-dev.github.io/sgkit/) 5 | [![Validation status](https://github.com/sgkit-dev/sgkit/workflows/Validation/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Validation%22+branch%3Amain) 6 | [![Upstream status](https://github.com/sgkit-dev/sgkit/workflows/Upstream/badge.svg?branch=main)](https://github.com/sgkit-dev/sgkit/actions?query=workflow%3A%22Upstream%22+branch%3Amain) 7 | [![asv](https://img.shields.io/badge/Benchmarked%20by-asv-green.svg?style=flat)](https://sgkit-dev.github.io/sgkit-benchmarks-asv/) 8 | [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](http://numfocus.org) 9 | 10 | Sgkit is a Python package that provides a variety of analytical genetics methods through the use of 11 | general-purpose frameworks such as [Xarray](http://xarray.pydata.org/en/stable/), [Pandas](https://pandas.pydata.org/docs/), 12 | 
[Dask](https://docs.dask.org/en/latest/) and [Zarr](https://zarr.readthedocs.io/en/stable/). 13 | 14 | For more information on using sgkit, see the [documentation](https://sgkit-dev.github.io/sgkit/). 15 | 16 | [//]: # (numfocus-fiscal-sponsor-attribution) 17 | 18 | The sgkit project uses a [custom governance model](./GOVERNANCE.md) 19 | and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). Consider making 20 | a [tax-deductible donation](https://numfocus.org/donate-to-sgkit) to help the project 21 | pay for developer time, professional services, travel, workshops, and a variety of other needs. 22 | 23 |
24 | 25 | 28 | 29 |
30 |
31 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmarks_plink.py: -------------------------------------------------------------------------------- 1 | """Benchmark suite for PLINK module.""" 2 | import tempfile 3 | import time 4 | from pathlib import Path 5 | 6 | from sgkit.io.plink.plink_writer import write_plink 7 | from sgkit.testing import simulate_genotype_call_dataset 8 | 9 | 10 | class PlinkSpeedSuite: 11 | def setup(self) -> None: 12 | self.ds = simulate_genotype_call_dataset( 13 | n_variant=1000000, n_sample=1000, seed=0 14 | ) 15 | 16 | self.dir = Path(tempfile.mkdtemp()) 17 | self.output_plink = self.dir / "plink_out" 18 | 19 | # use track_* asv methods since we want to measure speed (MB/s) not time 20 | 21 | def track_write_plink_speed(self) -> float: 22 | # throw away first run due to numba jit compilation 23 | for _ in range(2): 24 | duration = _time_func(write_plink, self.ds, path=self.output_plink) 25 | return _to_mb_per_s(get_dir_size(self.dir), duration) 26 | 27 | 28 | def _time_func(func, *args, **kwargs): 29 | start = time.time() 30 | func(*args, **kwargs) 31 | end = time.time() 32 | return end - start 33 | 34 | 35 | def _to_mb_per_s(bytes, duration): 36 | return bytes / (1_000_000 * duration) 37 | 38 | 39 | def get_dir_size(dir): 40 | return sum(f.stat().st_size for f in dir.glob("**/*") if f.is_file()) 41 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_stats.py: -------------------------------------------------------------------------------- 1 | """Benchmark suite for stats module.""" 2 | 3 | import numpy as np 4 | import xarray as xr 5 |
6 | from sgkit import ( 7 | count_call_alleles, 8 | count_cohort_alleles, 9 | simulate_genotype_call_dataset, 10 | ) 11 | 12 | 13 | class TimeSuite: 14 | def setup(self) -> None: 15 | self.count_call_alleles_ds = simulate_genotype_call_dataset( 16 | n_variant=100_000, n_sample=1000 17 | ) 18 | self.count_cohort_alleles_ds = simulate_genotype_call_dataset( 19 | n_variant=100_000, n_sample=1000 20 | ) 21 | sample_cohort = np.repeat( 22 | [0, 1], self.count_cohort_alleles_ds.dims["samples"] // 2 23 | ) 24 | self.count_cohort_alleles_ds["sample_cohort"] = xr.DataArray( 25 | sample_cohort, dims="samples" 26 | ) 27 | 28 | def time_count_call_alleles(self) -> None: 29 | count_call_alleles(self.count_call_alleles_ds) 30 | 31 | def time_count_cohort_alleles(self) -> None: 32 | count_cohort_alleles(self.count_cohort_alleles_ds) 33 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | collect_ignore_glob = ["benchmarks/**", ".github/scripts/*.py"] 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption( 6 | "--use-cubed", action="store_true", default=False, help="run with cubed" 7 | ) 8 | 9 | 10 | def use_cubed(): 11 | import dask 12 | import xarray as xr 13 | 14 | # set xarray to use cubed by default 15 | xr.set_options(chunk_manager="cubed") 16 | 17 | # ensure that dask compute raises if it is ever called 18 | class AlwaysRaiseScheduler: 19 | def __call__(self, dsk, keys, **kwargs): 20 | raise RuntimeError("Dask 'compute' was called") 21 | 22 | dask.config.set(scheduler=AlwaysRaiseScheduler()) 23 | 24 | 25 | def pytest_configure(config) -> None: # type: ignore 26 | # Add "gpu" marker 27 | config.addinivalue_line("markers", "gpu:Run tests that run on GPU") 28 | 29 | if config.getoption("--use-cubed"): 30 | use_cubed() 31 | -------------------------------------------------------------------------------- /docs/.gitignore: 
-------------------------------------------------------------------------------- 1 | savefig/ 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | PYPATH=${PWD}/.. 5 | 6 | # You can set these variables from the command line, and also 7 | # from the environment for the first two. 8 | SPHINXOPTS ?= 9 | SPHINXBUILD ?= sphinx-build 10 | SOURCEDIR = . 11 | BUILDDIR = _build 12 | 13 | # Put it first so that "make" without argument is like "make help". 14 | help: 15 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 16 | 17 | .PHONY: help Makefile 18 | 19 | .PHONY: clean 20 | clean: 21 | rm -rf $(BUILDDIR)/* 22 | rm -rf generated/* 23 | 24 | # Catch-all target: route all unknown targets to Sphinx using the new 25 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 26 | %: Makefile 27 | @PYTHONPATH=${PYPATH} $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 28 | -------------------------------------------------------------------------------- /docs/_static/data-structures-xarray.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/data-structures-xarray.jpg -------------------------------------------------------------------------------- /docs/_static/docsearch.sbt.css: -------------------------------------------------------------------------------- 1 | #site-navigation { overflow: visible; } 2 | -------------------------------------------------------------------------------- /docs/_static/docsearch.sbt.js: -------------------------------------------------------------------------------- 1 | docsearch({ 2 | apiKey: 'b547668ae472e6a13ae311fb4a8928a3', 3 | indexName: 'sgkit', 4 | inputSelector: 
'#search-input', 5 | debug: false // Set debug to true if you want to inspect the dropdown 6 | }); 7 | -------------------------------------------------------------------------------- /docs/_static/mydask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/mydask.png -------------------------------------------------------------------------------- /docs/_static/numfocus_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/numfocus_logo.png -------------------------------------------------------------------------------- /docs/_static/order.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/order.png -------------------------------------------------------------------------------- /docs/_static/sgkit_blue_trnsprnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/sgkit_blue_trnsprnt.png -------------------------------------------------------------------------------- /docs/_static/sgkit_trnsprnt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/docs/_static/sgkit_trnsprnt.png -------------------------------------------------------------------------------- /docs/_static/switcher.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "version": "latest", 4 | "url": "https://sgkit-dev.github.io/sgkit/latest/" 5 | }, 6 | { 7 | "name": "0.10.0 (stable)", 
8 | "version": "0.10.0", 9 | "url": "https://sgkit-dev.github.io/sgkit/0.10.0/" 10 | }, 11 | { 12 | "name": "0.9.0", 13 | "version": "0.9.0", 14 | "url": "https://sgkit-dev.github.io/sgkit/0.9.0/" 15 | }, 16 | { 17 | "name": "0.8.0", 18 | "version": "0.8.0", 19 | "url": "https://sgkit-dev.github.io/sgkit/0.8.0/" 20 | }, 21 | { 22 | "name": "0.7.0", 23 | "version": "0.7.0", 24 | "url": "https://sgkit-dev.github.io/sgkit/0.7.0/" 25 | }, 26 | { 27 | "name": "0.6.0", 28 | "version": "0.6.0", 29 | "url": "https://sgkit-dev.github.io/sgkit/0.6.0/" 30 | }, 31 | { 32 | "name": "0.5.0", 33 | "version": "0.5.0", 34 | "url": "https://sgkit-dev.github.io/sgkit/0.5.0/" 35 | }, 36 | { 37 | "version": "0.4.0", 38 | "url": "https://sgkit-dev.github.io/sgkit/0.4.0/" 39 | }, 40 | { 41 | "version": "0.3.0", 42 | "url": "https://sgkit-dev.github.io/sgkit/0.3.0/" 43 | }, 44 | { 45 | "version": "0.2.0a1", 46 | "url": "https://sgkit-dev.github.io/sgkit/0.2.0a1/" 47 | }, 48 | { 49 | "version": "0.1.0a1", 50 | "url": "https://sgkit-dev.github.io/sgkit/0.1.0a1/" 51 | } 52 | ] -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | .. _about: 2 | 3 | ***** 4 | About 5 | ***** 6 | 7 | .. image:: _static/numfocus_logo.png 8 | :scale: 50 % 9 | :target: https://numfocus.org/ 10 | 11 | Sgkit is a fiscally sponsored project of NumFOCUS_, a nonprofit dedicated 12 | to supporting the open-source scientific computing community. If you like 13 | sgkit and want to support our mission, please consider making a donation_ 14 | to support our efforts. 15 | 16 | NumFOCUS is 501(c)(3) non-profit charity in the United States; as such, 17 | donations to NumFOCUS are tax-deductible as allowed by law. As with any 18 | donation, you should consult with your personal tax adviser or the IRS 19 | about your particular tax situation. 20 | 21 | .. _NumFOCUS: https://numfocus.org 22 | .. 
_donation: https://numfocus.org/donate-to-sgkit -------------------------------------------------------------------------------- /docs/examples/index.rst: -------------------------------------------------------------------------------- 1 | ######## 2 | Examples 3 | ######## 4 | 5 | Example notebooks showing how to use sgkit. 6 | 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :hidden: 11 | 12 | gwas_tutorial 13 | relatedness_tutorial 14 | -------------------------------------------------------------------------------- /docs/extensions/typed_returns.py: -------------------------------------------------------------------------------- 1 | """ 2 | This extension is taken directly from scanpy here: 3 | https://github.com/theislab/scanpy/blob/5533b644e796379fd146bf8e659fd49f92f718cd/docs/extensions/typed_returns.py 4 | 5 | to fix this issue: https://github.com/theislab/scanpydoc/issues/7 6 | """ 7 | import re 8 | from typing import Iterator, List 9 | 10 | from sphinx.application import Sphinx 11 | from sphinx.ext.napoleon import NumpyDocstring 12 | 13 | 14 | def process_return(lines: List[str]) -> Iterator[str]: 15 | for line in lines: 16 | m = re.fullmatch(r"(?P\w+)\s+:\s+(?P[\w.]+)", line) 17 | if m: 18 | # Once this is in scanpydoc, we can use the fancy hover stuff 19 | yield f'**{m["param"]}** : :class:`~{m["type"]}`' 20 | else: 21 | yield line 22 | 23 | 24 | def scanpy_parse_returns_section(self: NumpyDocstring, section: str) -> List[str]: 25 | lines_raw = list(process_return(self._dedent(self._consume_to_next_section()))) 26 | lines: List[str] = self._format_block(":returns: ", lines_raw) 27 | if lines and lines[-1]: 28 | lines.append("") 29 | return lines 30 | 31 | 32 | def setup(app: Sphinx) -> None: 33 | NumpyDocstring._parse_returns_section = scanpy_parse_returns_section 34 | -------------------------------------------------------------------------------- /docs/how_do_i.rst: -------------------------------------------------------------------------------- 1 | .. 
currentmodule:: sgkit 2 | 3 | .. _how_do_i: 4 | 5 | ************ 6 | How do I ... 7 | ************ 8 | 9 | .. contents:: 10 | :local: 11 | 12 | Create a test dataset? 13 | ---------------------- 14 | 15 | Call :py:func:`simulate_genotype_call_dataset` to create a test :class:`xarray.Dataset`: 16 | 17 | .. ipython:: python 18 | 19 | import sgkit as sg 20 | ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=50, n_contig=23, missing_pct=.1) 21 | 22 | Look at the dataset summary? 23 | ---------------------------- 24 | 25 | Print using the :class:`xarray.Dataset` ``repr``: 26 | 27 | .. ipython:: python 28 | 29 | ds 30 | 31 | Get the values for a variable in a dataset? 32 | ------------------------------------------- 33 | 34 | Call :attr:`xarray.Variable.values`: 35 | 36 | .. ipython:: python 37 | 38 | ds.variant_contig.values 39 | ds["variant_contig"].values # equivalent alternative 40 | 41 | .. warning:: 42 | 43 | Calling ``values`` materializes a variable's data in memory, so is only suitable for small datasets. 44 | 45 | Find the definition for a variable in a dataset? 46 | ------------------------------------------------ 47 | 48 | Use the ``comment`` attribute on the variable: 49 | 50 | .. ipython:: python 51 | 52 | ds.variant_contig.comment 53 | 54 | All the variables defined in sgkit are documented on the :ref:`api_variables` API page. 55 | 56 | Look at the genotypes? 57 | ---------------------- 58 | 59 | Call :py:func:`display_genotypes`: 60 | 61 | .. ipython:: python 62 | 63 | sg.display_genotypes(ds, max_variants=10) 64 | 65 | 66 | Subset the variables? 67 | --------------------- 68 | 69 | Use Xarray's pandas-like method for `selecting variables `_: 70 | 71 | .. ipython:: python 72 | 73 | ds[["variant_contig", "variant_position", "variant_allele"]] 74 | 75 | Alternatively, you can `drop variables `_ that you want to remove: 76 | 77 | .. 
ipython:: python 78 | 79 | ds.drop_vars(["variant_contig", "variant_position", "variant_allele"]) 80 | 81 | Subset to a genomic range? 82 | -------------------------- 83 | 84 | Set an index on the dataset, then call :meth:`xarray.Dataset.sel`: 85 | 86 | .. ipython:: python 87 | 88 | ds.set_index(variants=("variant_contig", "variant_position")).sel(variants=(0, slice(2, 4))) 89 | 90 | An API to make this easier is under discussion. Please add your requirements to https://github.com/sgkit-dev/sgkit/pull/658. 91 | 92 | Get the list of samples? 93 | ------------------------ 94 | 95 | Get the values for the ``sample_id`` variable: 96 | 97 | .. ipython:: python 98 | 99 | ds.sample_id.values 100 | 101 | Subset the samples? 102 | ------------------- 103 | 104 | Call :meth:`xarray.Dataset.sel` and :meth:`xarray.DataArray.isin`: 105 | 106 | .. ipython:: python 107 | 108 | ds.sel(samples=ds.sample_id.isin(["S30", "S32"])) 109 | 110 | Define a new variable based on others? 111 | -------------------------------------- 112 | 113 | Use Xarray's `dictionary like methods `_, or :meth:`xarray.Dataset.assign`: 114 | 115 | .. ipython:: python 116 | 117 | ds["pos0"] = ds.variant_position - 1 118 | ds.assign(pos0 = ds.variant_position - 1) # alternative 119 | 120 | Get summary stats? 121 | ------------------ 122 | 123 | Call :py:func:`sample_stats` or :py:func:`variant_stats` as appropriate: 124 | 125 | .. ipython:: python 126 | 127 | sg.sample_stats(ds) 128 | sg.variant_stats(ds) 129 | 130 | Filter variants? 131 | ---------------- 132 | 133 | Call :meth:`xarray.Dataset.sel` on the ``variants`` dimension: 134 | 135 | .. ipython:: python 136 | 137 | ds2 = sg.hardy_weinberg_test(ds) 138 | ds2.sel(variants=(ds2.variant_hwe_p_value > 1e-2).compute()) 139 | 140 | .. note:: 141 | 142 | The call to ``compute`` is needed to avoid an Xarray error. 143 | 144 | Find which new variables were added by a method? 
145 | ------------------------------------------------ 146 | 147 | Use :py:attr:`xarray.Dataset.data_vars` to compare the new dataset variables to the old: 148 | 149 | .. ipython:: python 150 | 151 | ds2 = sg.sample_stats(ds) 152 | set(ds2.data_vars) - set(ds.data_vars) 153 | 154 | Save results to a Zarr file? 155 | ---------------------------- 156 | 157 | Call :py:func:`save_dataset`: 158 | 159 | .. ipython:: python 160 | 161 | sg.save_dataset(ds, "ds.zarr") 162 | 163 | .. note:: 164 | 165 | Zarr datasets must have equal-sized chunks (except for the final chunk, which may be smaller), 166 | so you may have to `rechunk the dataset `_ first. 167 | 168 | Load a dataset from Zarr? 169 | ------------------------- 170 | 171 | Call :py:func:`load_dataset`: 172 | 173 | .. ipython:: python 174 | 175 | ds = sg.load_dataset("ds.zarr") 176 | @suppress 177 | !rm -r ds.zarr 178 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | sgkit: Statistical genetics toolkit in Python 2 | ============================================= 3 | 4 | Sgkit is a Python package that provides a variety of analytical genetics methods through the use of 5 | general-purpose frameworks such as `Xarray `_, `Pandas `_, 6 | `Dask `_ and `Zarr `_. The sgkit API makes as 7 | few assumptions as possible about the origin, structure, and intended use of genetic data by adopting a set of 8 | domain-specific conventions that allow such data to be used within this broader ecosystem of tools. The package is 9 | designed for complex workflows over large distributed datasets but attempts to make it as easy as possible to scale 10 | down to smaller datasets and access simpler functionality for those that may be new to Python (though there is still 11 | a good bit of work to be done on this front). See :ref:`getting_started` for more details. 
12 | 13 | Sgkit is inspired heavily by `scikit-allel `_ and `Hail `_, 14 | both popular Python genetics toolkits with a respective focus on population and quantitative genetics. 15 | 16 | .. toctree:: 17 | :maxdepth: 2 18 | :caption: Contents 19 | 20 | getting_started 21 | user_guide 22 | examples/index 23 | api 24 | how_do_i 25 | contributing 26 | about 27 | news 28 | changelog 29 | 30 | Indices and tables 31 | ================== 32 | 33 | * :ref:`genindex` 34 | * :ref:`search` 35 | -------------------------------------------------------------------------------- /docs/news.rst: -------------------------------------------------------------------------------- 1 | .. _blog: 2 | 3 | **** 4 | News 5 | **** 6 | 7 | .. postlist:: 10 8 | :date: %Y-%m-%d 9 | :format: {date} - {title} 10 | :list-style: none 11 | :excerpts: -------------------------------------------------------------------------------- /docs/news/introducing_sgkit.md: -------------------------------------------------------------------------------- 1 | # Introducing sgkit 2 | 3 | ```{post} 2022-08-01 4 | --- 5 | category: releases 6 | author: hammer 7 | --- 8 | ``` 9 | 10 | The sgkit team is pleased to announce the release of [sgkit 0.5.0](https://github.com/sgkit-dev/sgkit/releases/tag/0.5.0)! This release adds support for the [VCF Zarr specification](https://github.com/sgkit-dev/vcf-zarr-spec), which describes an encoding of VCF data in chunked-columnar form using the [Zarr format](https://zarr.readthedocs.io/en/stable/). 11 | 12 | With this release, we also introduce our news page, where we will announce future releases and provide other relevant updates for the `sgkit` project. 13 | 14 | Oxford and Related Sciences began collaborating in early 2020 on `sgkit` as a successor to the popular [scikit-allel](https://github.com/cggh/scikit-allel) library. 
We’ve worked closely with third-party library authors to read and write data stored in VCF ([cyvcf2](https://github.com/brentp/cyvcf2)), BGEN ([cbgen](https://github.com/limix/cbgen)), and PLINK ([bed_reader](https://github.com/fastlmm/bed-reader)) files. We’ve designed an [Xarray](https://github.com/pydata/xarray)-based [data model](https://sgkit-dev.github.io/sgkit/latest/getting_started.html#data-structures) and implemented many common methods from statistical and population genetics, including variant and sample [quality control](https://sgkit-dev.github.io/sgkit/latest/examples/gwas_tutorial.html#quality-control), [kinship analysis](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.pc_relate.html#sgkit-pc-relate), genome-wide [selection scans](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.Garud_H.html), and genome-wide [association analyses](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.gwas_linear_regression.html), as well as a [novel implementation](https://sgkit-dev.github.io/sgkit/latest/generated/sgkit.regenie.html#sgkit-regenie) of the recently developed [REGENIE algorithm](https://github.com/rgcgithub/regenie). 15 | 16 | `sgkit` was accepted as a [NumFOCUS Sponsored Project](https://numfocus.org/project/sgkit) in 2021, and we now have developers in the US, the UK, and New Zealand. 17 | 18 | If you think sgkit might be useful for your project, please don't hesitate to file an [issue](https://github.com/sgkit-dev/sgkit/issues) or start a [discussion](https://github.com/sgkit-dev/sgkit/discussions) with questions and feedback! 
19 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | callee 2 | pre-commit 3 | pytest 4 | pytest-cov 5 | pytest-datadir 6 | pytest-mock 7 | hypothesis 8 | scikit-allel 9 | statsmodels 10 | msprime>=1.0 11 | scikit-learn 12 | partd 13 | bed-reader 14 | rechunker 15 | cbgen > 1.0.5 16 | bio2zarr[vcf]; platform_system != "Windows" 17 | yarl 18 | matplotlib 19 | asv 20 | networkx 21 | aiohttp 22 | requests 23 | graphviz 24 | -------------------------------------------------------------------------------- /requirements-doc.txt: -------------------------------------------------------------------------------- 1 | myst_nb 2 | pydata-sphinx-theme 3 | sphinx==6.2.1 4 | sphinx_autodoc_typehints>=1.14.0 5 | sphinx-copybutton 6 | scanpydoc 7 | ipython 8 | matplotlib 9 | seaborn 10 | ablog!=0.10.27 11 | pickleshare 12 | -------------------------------------------------------------------------------- /requirements-numpy1-dev.txt: -------------------------------------------------------------------------------- 1 | callee 2 | pre-commit 3 | pytest 4 | pytest-cov 5 | pytest-datadir 6 | pytest-mock 7 | hypothesis 8 | scikit-allel 9 | statsmodels 10 | msprime>=1.0 11 | scikit-learn 12 | partd 13 | bed-reader 14 | rechunker 15 | cbgen < 1.0.5 16 | bio2zarr[vcf]; platform_system != "Windows" 17 | yarl 18 | matplotlib 19 | asv 20 | networkx 21 | aiohttp 22 | requests 23 | graphviz 24 | -------------------------------------------------------------------------------- /requirements-numpy1.txt: -------------------------------------------------------------------------------- 1 | numpy < 2 2 | xarray < 2025.03.1 3 | dask[array,dataframe] >= 2023.01.0, <= 2024.8.0 4 | distributed >= 2023.01.0, <= 2024.8.0 5 | scipy 6 | typing-extensions 7 | numba 8 | zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2, < 3 9 | fsspec != 2021.6.* 10 | scikit-learn 11 | pandas 
12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy < 2.2 2 | xarray < 2025.03.1 3 | dask[array,dataframe] >= 2023.01.0, <= 2024.8.0 4 | distributed >= 2023.01.0, <= 2024.8.0 5 | scipy 6 | typing-extensions 7 | numba 8 | zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2, < 3 9 | fsspec != 2021.6.* 10 | scikit-learn 11 | pandas 12 | -------------------------------------------------------------------------------- /sgkit/__init__.py: -------------------------------------------------------------------------------- 1 | from .display import display_genotypes, display_pedigree 2 | from .distance.api import pairwise_distance 3 | from .io.dataset import load_dataset, save_dataset 4 | from .model import ( 5 | DIM_ALLELE, 6 | DIM_PLOIDY, 7 | DIM_SAMPLE, 8 | DIM_VARIANT, 9 | create_genotype_call_dataset, 10 | create_genotype_dosage_dataset, 11 | ) 12 | from .stats.aggregation import ( 13 | call_allele_frequencies, 14 | cohort_allele_frequencies, 15 | count_call_alleles, 16 | count_cohort_alleles, 17 | count_variant_alleles, 18 | count_variant_genotypes, 19 | individual_heterozygosity, 20 | infer_call_ploidy, 21 | infer_sample_ploidy, 22 | infer_variant_ploidy, 23 | sample_stats, 24 | variant_stats, 25 | ) 26 | from .stats.association import gwas_linear_regression, regenie_loco_regression 27 | from .stats.conversion import convert_call_to_index, convert_probability_to_call 28 | from .stats.genedrop import simulate_genedrop 29 | from .stats.genee import genee 30 | from .stats.grm import ( 31 | genomic_relationship, 32 | hybrid_inverse_relationship, 33 | hybrid_relationship, 34 | invert_relationship_matrix, 35 | ) 36 | from .stats.hwe import hardy_weinberg_test 37 | from .stats.ibs import Weir_Goudet_beta, identity_by_state 38 | from .stats.ld import ld_matrix, ld_prune, maximal_independent_set 39 | from .stats.pc_relate import pc_relate 40 
| from .stats.pca import pca 41 | from .stats.pedigree import ( 42 | parent_indices, 43 | pedigree_contribution, 44 | pedigree_inbreeding, 45 | pedigree_inverse_kinship, 46 | pedigree_kinship, 47 | pedigree_sel, 48 | ) 49 | from .stats.popgen import ( 50 | Fst, 51 | Garud_H, 52 | Tajimas_D, 53 | divergence, 54 | diversity, 55 | observed_heterozygosity, 56 | pbs, 57 | ) 58 | from .stats.preprocessing import filter_partial_calls 59 | from .stats.regenie import regenie 60 | from .testing import simulate_genotype_call_dataset 61 | from .window import ( 62 | window_by_genome, 63 | window_by_interval, 64 | window_by_position, 65 | window_by_variant, 66 | ) 67 | 68 | __version__ = "unknown" 69 | try: 70 | from . import _version 71 | 72 | __version__ = _version.version # pragma: nocover 73 | except ImportError: # pragma: nocover 74 | pass 75 | 76 | __all__ = [ 77 | "DIM_ALLELE", 78 | "DIM_PLOIDY", 79 | "DIM_SAMPLE", 80 | "DIM_VARIANT", 81 | "call_allele_frequencies", 82 | "create_genotype_call_dataset", 83 | "cohort_allele_frequencies", 84 | "convert_call_to_index", 85 | "convert_probability_to_call", 86 | "count_variant_alleles", 87 | "count_call_alleles", 88 | "count_cohort_alleles", 89 | "count_variant_genotypes", 90 | "create_genotype_dosage_dataset", 91 | "display_genotypes", 92 | "display_pedigree", 93 | "filter_partial_calls", 94 | "genee", 95 | "genomic_relationship", 96 | "gwas_linear_regression", 97 | "regenie", 98 | "regenie_loco_regression", 99 | "hardy_weinberg_test", 100 | "hybrid_relationship", 101 | "hybrid_inverse_relationship", 102 | "identity_by_state", 103 | "individual_heterozygosity", 104 | "infer_call_ploidy", 105 | "infer_sample_ploidy", 106 | "infer_variant_ploidy", 107 | "invert_relationship_matrix", 108 | "ld_matrix", 109 | "ld_prune", 110 | "maximal_independent_set", 111 | "parent_indices", 112 | "pedigree_contribution", 113 | "pedigree_inbreeding", 114 | "pedigree_inverse_kinship", 115 | "pedigree_kinship", 116 | "pedigree_sel", 117 | 
"sample_stats", 118 | "variant_stats", 119 | "diversity", 120 | "divergence", 121 | "Fst", 122 | "Garud_H", 123 | "Tajimas_D", 124 | "pbs", 125 | "pc_relate", 126 | "simulate_genedrop", 127 | "simulate_genotype_call_dataset", 128 | "variables", 129 | "observed_heterozygosity", 130 | "pca", 131 | "Weir_Goudet_beta", 132 | "window_by_genome", 133 | "window_by_interval", 134 | "window_by_position", 135 | "window_by_variant", 136 | "load_dataset", 137 | "save_dataset", 138 | "pairwise_distance", 139 | ] 140 | -------------------------------------------------------------------------------- /sgkit/accelerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Callable 3 | 4 | from numba import guvectorize, jit 5 | 6 | _DISABLE_CACHE = os.environ.get("SGKIT_DISABLE_NUMBA_CACHE", "1") 7 | 8 | try: 9 | CACHE_NUMBA = {"0": True, "1": False}[_DISABLE_CACHE] 10 | except KeyError as e: # pragma: no cover 11 | raise KeyError( 12 | "Environment variable 'SGKIT_DISABLE_NUMBA_CACHE' must be '0' or '1'" 13 | ) from e 14 | 15 | 16 | DEFAULT_NUMBA_ARGS = { 17 | "nopython": True, 18 | "cache": CACHE_NUMBA, 19 | } 20 | 21 | 22 | def numba_jit(*args, **kwargs) -> Callable: # pragma: no cover 23 | kwargs_ = DEFAULT_NUMBA_ARGS.copy() 24 | kwargs_.update(kwargs) 25 | return jit(*args, **kwargs_) 26 | 27 | 28 | def numba_guvectorize(*args, **kwargs) -> Callable: # pragma: no cover 29 | kwargs_ = DEFAULT_NUMBA_ARGS.copy() 30 | kwargs_.update(kwargs) 31 | return guvectorize(*args, **kwargs_) 32 | -------------------------------------------------------------------------------- /sgkit/cohorts.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple, Union 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def _tuple_len(t: Union[int, Tuple[int, ...], str, Tuple[str, ...]]) -> int: 8 | """Return the length of a tuple, or 1 for an int or 
string value.""" 9 | if isinstance(t, int) or isinstance(t, str): 10 | return 1 11 | return len(t) 12 | 13 | 14 | def _cohorts_to_array( 15 | cohorts: Sequence[Union[int, Tuple[int, ...], str, Tuple[str, ...]]], 16 | index: Optional[pd.Index] = None, 17 | ) -> np.ndarray: 18 | """Convert cohorts or cohort tuples specified as a sequence of values or 19 | tuples to an array of ints used to match samples in ``sample_cohorts``. 20 | 21 | Cohorts can be specified by index (as used in ``sample_cohorts``), or a label, in 22 | which case an ``index`` must be provided to find index locations for cohorts. 23 | 24 | Parameters 25 | ---------- 26 | cohorts 27 | A sequence of values or tuple representing cohorts or cohort tuples. 28 | index 29 | An index to turn labels into index locations, by default None. 30 | 31 | Returns 32 | ------- 33 | An array of shape ``(len(cohorts), tuple_len)``, where ``tuple_len`` is the length 34 | of the tuples, or 1 if ``cohorts`` is a sequence of values. 35 | 36 | Raises 37 | ------ 38 | ValueError 39 | If the cohort tuples are not all the same length. 
40 | 41 | Examples 42 | -------- 43 | 44 | >>> import pandas as pd 45 | >>> from sgkit.cohorts import _cohorts_to_array 46 | >>> _cohorts_to_array([(0, 1), (2, 1)]) # doctest: +SKIP 47 | array([[0, 1], 48 | [2, 1]], dtype=int32) 49 | >>> _cohorts_to_array([("c0", "c1"), ("c2", "c1")], pd.Index(["c0", "c1", "c2"])) # doctest: +SKIP 50 | array([[0, 1], 51 | [2, 1]], dtype=int32) 52 | """ 53 | if len(cohorts) == 0: 54 | return np.array([], np.int32) 55 | 56 | tuple_len = _tuple_len(cohorts[0]) 57 | if not all(_tuple_len(cohort) == tuple_len for cohort in cohorts): 58 | raise ValueError("Cohort tuples must all be the same length") 59 | 60 | # convert cohort IDs using an index 61 | if index is not None: 62 | if isinstance(cohorts[0], str): 63 | cohorts = [index.get_loc(id) for id in cohorts] 64 | elif tuple_len > 1 and isinstance(cohorts[0][0], str): # type: ignore 65 | cohorts = [tuple(index.get_loc(id) for id in t) for t in cohorts] # type: ignore 66 | 67 | ct = np.empty((len(cohorts), tuple_len), np.int32) 68 | for n, t in enumerate(cohorts): 69 | ct[n, :] = t 70 | return ct 71 | -------------------------------------------------------------------------------- /sgkit/display_numba_fns.py: -------------------------------------------------------------------------------- 1 | from sgkit.accelerate import numba_guvectorize 2 | from sgkit.typing import ArrayLike 3 | 4 | 5 | @numba_guvectorize( # type: ignore 6 | [ 7 | "void(uint8[:], uint8[:], boolean[:], uint8[:], uint8[:])", 8 | ], 9 | "(b),(),(),(c)->(c)", 10 | ) 11 | def _format_genotype_bytes( 12 | chars: ArrayLike, ploidy: int, phased: bool, _: ArrayLike, out: ArrayLike 13 | ) -> None: # pragma: no cover 14 | ploidy = ploidy[0] 15 | sep = 124 if phased[0] else 47 # "|" or "/" 16 | chars_per_allele = len(chars) // ploidy 17 | slot = 0 18 | for slot in range(ploidy): 19 | offset_inp = slot * chars_per_allele 20 | offset_out = slot * (chars_per_allele + 1) 21 | if slot > 0: 22 | out[offset_out - 1] = sep 23 | for char in 
@numba_guvectorize(  # type: ignore
    [
        "void(uint8[:], uint8[:], boolean[:], uint8[:], uint8[:])",
    ],
    "(b),(),(),(c)->(c)",
)
def _format_genotype_bytes(
    chars: ArrayLike, ploidy: int, phased: bool, _: ArrayLike, out: ArrayLike
) -> None:  # pragma: no cover
    """Render one genotype call as ASCII bytes.

    Alleles (already stringified into ``chars``, ``chars_per_allele`` bytes
    per slot) are joined with "|" when phased or "/" when unphased. An
    allele string "-1" is rewritten as "." (unknown); any other leading "-"
    is treated as a gap and the slot (plus its separator) is dropped.
    Unused byte positions are zero and compacted to the end of ``out``.
    """
    ploidy = ploidy[0]
    sep = 124 if phased[0] else 47  # "|" or "/"
    # assumes len(chars) divides evenly into ploidy fixed-width allele slots
    chars_per_allele = len(chars) // ploidy
    slot = 0
    for slot in range(ploidy):
        offset_inp = slot * chars_per_allele
        # output slots are one byte wider to leave room for the separator
        offset_out = slot * (chars_per_allele + 1)
        if slot > 0:
            out[offset_out - 1] = sep
        for char in range(chars_per_allele):
            i = offset_inp + char
            j = offset_out + char
            val = chars[i]
            if val == 45:  # "-"
                if chars[i + 1] == 49:  # "1"
                    # this is an unknown allele
                    out[j] = 46  # "."
                    out[j + 1 : j + chars_per_allele] = 0
                    break
                else:
                    # < -1 indicates a gap
                    out[j : j + chars_per_allele] = 0
                    if slot > 0:
                        # remove separator
                        out[offset_out - 1] = 0
                    break
            else:
                out[j] = val
    # shuffle zeros to end
    c = len(out)
    for i in range(c):
        if out[i] == 0:
            # swap with the next non-zero byte so text stays contiguous
            for j in range(i + 1, c):
                if out[j] != 0:
                    out[i] = out[j]
                    out[j] = 0
                    break
def save_dataset(
    ds: Dataset,
    store: Union[PathType, MutableMapping[str, bytes]],
    storage_options: Optional[Dict[str, str]] = None,
    auto_rechunk: Optional[bool] = None,
    zarr_format: int = 2,
    **kwargs: Any,
) -> None:
    """Save a dataset to Zarr storage.

    This function is a thin wrapper around :meth:`xarray.Dataset.to_zarr`
    that uses sensible defaults and makes it easier to use in a pipeline.

    Parameters
    ----------
    ds
        Dataset to save.
    store
        Zarr store or path to directory in file system to save to.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    auto_rechunk
        If True, automatically rechunk the dataset to uniform chunks before saving,
        if necessary. This is required for Zarr, but can be expensive. Defaults to False.
    zarr_format
        Zarr storage format version to write; only forwarded to ``to_zarr``
        when the installed xarray supports it (v2024.10.0+). Defaults to 2.
    kwargs
        Additional arguments to pass to :meth:`xarray.Dataset.to_zarr`.

    Raises
    ------
    ValueError
        If the dataset chunks are not uniform and ``auto_rechunk`` is not enabled.
    """
    if auto_rechunk is None:
        auto_rechunk = False
    for v in ds:
        # Workaround for https://github.com/pydata/xarray/issues/4380
        ds[v].encoding.pop("chunks", None)

        # Remove VLenUTF8 from filters to avoid double encoding error https://github.com/pydata/xarray/issues/3476
        filters = ds[v].encoding.get("filters", None)
        var_len_str_codec = numcodecs.VLenUTF8()
        if filters is not None and var_len_str_codec in filters:
            filters = list(filters)
            filters.remove(var_len_str_codec)
        ds[v].encoding["filters"] = filters

    if auto_rechunk:
        # This logic for checking if rechunking is necessary is
        # taken from xarray/backends/zarr.py#L109.
        # We can't try to save and catch the error as by that
        # point the zarr store is non-empty.
        if any(len(set(chunks[:-1])) > 1 for chunks in ds.chunks.values()) or any(
            (chunks[0] < chunks[-1]) for chunks in ds.chunks.values()
        ):
            # Here we use the max chunk size as the target chunk size as for the commonest
            # case of subsetting an existing dataset, this will be closest to the original
            # intended chunk size.
            ds = ds.chunk(
                chunks={dim: max(chunks) for dim, chunks in ds.chunks.items()}
            )

    # Catch unequal chunking errors to provide a more helpful error message
    try:
        if has_keyword(ds.to_zarr, "zarr_format"):  # from xarray v2024.10.0
            kwargs["zarr_format"] = zarr_format
        ds.to_zarr(store, storage_options=storage_options, **kwargs)
    except ValueError as e:
        if "Zarr requires uniform chunk sizes" in str(
            e
        ) or "Final chunk of Zarr array must be the same size" in str(e):
            # BUG FIX: the two adjacent string literals previously joined as
            # "to`save_dataset`" (missing space between them).
            raise ValueError(
                "Zarr requires uniform chunk sizes. Use the `auto_rechunk` argument "
                "to `save_dataset` to automatically rechunk the dataset."
            ) from e
        else:
            raise e
def load_dataset(
    store: Union[PathType, MutableMapping[str, bytes]],
    storage_options: Optional[Dict[str, str]] = None,
    **kwargs: Any,
) -> Dataset:
    """Load a dataset from Zarr storage.

    This function is a thin wrapper around :func:`xarray.open_zarr`
    that uses sensible defaults and makes it easier to use in a pipeline.

    Parameters
    ----------
    store
        Zarr store or path to directory in file system to load from.
    storage_options
        Any additional parameters for the storage backend (see ``fsspec.open``).
    kwargs
        Additional arguments to pass to :func:`xarray.open_zarr`.

    Returns
    -------

    Dataset
        The dataset loaded from the Zarr store or file system.
    """
    ds: Dataset = xr.open_zarr(store, storage_options=storage_options, concat_characters=False, **kwargs)  # type: ignore[no-untyped-call]
    # Workaround for https://github.com/pydata/xarray/issues/4386:
    # coerce round-tripped mask variables back to their boolean dtype.
    for name in ds:
        if name.endswith("_mask"):  # type: ignore
            ds[name] = ds[name].astype(bool)
    return ds
def dataframe_to_dict(
    df: dd.DataFrame, dtype: Optional[Mapping[str, DType]] = None
) -> Mapping[str, ArrayLike]:
    """Convert dask dataframe to dictionary of arrays.

    Parameters
    ----------
    df
        Dask dataframe whose columns are converted to dask arrays.
    dtype
        Optional mapping from column name to target dtype. String dtypes
        (``U``/``S``) are widened to the column's maximum string length.

    Returns
    -------
    Mapping from column name to dask array.
    """
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        dt = df[c].dtype
        if dtype:
            dt = dtype[c]
        kind = np.dtype(dt).kind
        if kind in ["U", "S"]:
            # Compute fixed-length string dtype for array
            max_len = max_str_len(a)
            # BUG FIX: dtype string was garbled ("{kind}79,562"); build the
            # fixed-width string dtype from the computed max length instead.
            dt = f"{kind}{max_len}"
        arrs[c] = a.astype(dt)
    return arrs


def encode_contigs(contig: ArrayLike) -> Tuple[ArrayLike, ArrayLike]:
    """Encode contig identifiers as integer ids plus an array of names."""
    # TODO: test preservation of int16
    # If contigs are already integers, use them as-is
    if np.issubdtype(contig.dtype, np.integer):
        ids = contig
        names = np.unique(np.asarray(ids)).astype(str)  # type: ignore[no-untyped-call]
    # Otherwise create index for contig names based
    # on order of appearance in underlying file
    else:
        ids, names = encode_array(np.asarray(contig, dtype=str))
    return ids, names
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:], uint8[:], uint8[:])",
        "void(int16[:], uint8[:], uint8[:])",
        "void(int32[:], uint8[:], uint8[:])",
        "void(int64[:], uint8[:], uint8[:])",
        "void(int8[:], uint64[:], uint64[:])",
        "void(int16[:], uint64[:], uint64[:])",
        "void(int32[:], uint64[:], uint64[:])",
        "void(int64[:], uint64[:], uint64[:])",
    ],
    "(k),(n)->(n)",
)
def count_alleles(
    g: ArrayLike, _: ArrayLike, out: ArrayLike
) -> None:  # pragma: no cover
    """Generalized U-function for computing per sample allele counts.

    Parameters
    ----------
    g
        Genotype call of shape (ploidy,) containing alleles encoded as
        type `int` with values < 0 indicating a missing allele.
    _
        Dummy variable of type `uint8` or `uint64` and shape (alleles,)
        used to define the number of unique alleles to be counted in the
        return value. The dtype of this array determines the dtype of the
        returned array.

    Returns
    -------
    ac : ndarray
        Allele counts with shape (alleles,) and values corresponding to
        the number of non-missing occurrences of each allele.

    """
    out[:] = 0
    # Tally each non-missing allele; negative values (missing) are skipped.
    n_allele = len(g)
    for i in range(n_allele):
        a = g[i]
        if a >= 0:
            out[a] += 1


@numba_jit(nogil=True)
def _classify_hom(genotype: ArrayLike) -> int:  # pragma: no cover
    # Classify a genotype call: -1 = missing, 0 = hom-ref, 1 = hom-alt
    # (first allele > 0), 2 = het. Any missing allele makes the whole
    # call missing.
    a0 = genotype[0]
    cat = min(a0, 1)  # -1, 0, 1
    for i in range(1, len(genotype)):
        if cat < 0:
            break
        a = genotype[i]
        if a != a0:
            cat = 2  # het
        if a < 0:
            cat = -1
    return cat
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:,:], uint64[:], int64[:])",
        "void(int16[:,:], uint64[:], int64[:])",
        "void(int32[:,:], uint64[:], int64[:])",
        "void(int64[:,:], uint64[:], int64[:])",
    ],
    "(n, k),(c)->(c)",
)
def count_hom(
    genotypes: ArrayLike, _: ArrayLike, out: ArrayLike
) -> None:  # pragma: no cover
    """Generalized U-function for counting homozygous and heterozygous genotypes.

    Parameters
    ----------
    g
        Genotype call of shape (ploidy,) containing alleles encoded as
        type `int` with values < 0 indicating a missing allele.
    _
        Dummy variable of type `uint64` with length 3 which determines the
        number of categories returned (this should always be 3).

    Note
    ----
    This method is not suitable for mixed-ploidy genotypes.

    Returns
    -------
    counts : ndarray
        Counts of homozygous reference, homozygous alternate, and heterozygous genotypes.
    """
    out[:] = 0
    # _classify_hom returns -1 for missing calls, which are not counted.
    for i in range(len(genotypes)):
        index = _classify_hom(genotypes[i])
        if index >= 0:
            out[index] += 1


def count_hom_new_axis(genotypes: ArrayLike, _: ArrayLike) -> ArrayLike:
    # Insert a length-1 middle axis so per-block counts can be combined
    # along it by the caller.
    return count_hom(genotypes, _)[:, np.newaxis, :]
def genee_EM(betas, reg_covar=0.000001):
    """Estimate the epsilon (non-null) effect variance from observed betas.

    Fits Gaussian mixtures with 1-9 components (fixed random seed for
    reproducibility), selects the fit with the lowest BIC, and returns the
    second-largest component variance (or the sole variance when the best
    fit has a single component).
    """
    # based on https://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_selection.html#sphx-glr-auto-examples-mixture-plot-gmm-selection-py
    candidates = [
        GaussianMixture(
            n_components=k, reg_covar=reg_covar, random_state=0
        ).fit(betas)
        for k in range(1, 10)
    ]
    # min() keeps the first model attaining the lowest BIC, matching the
    # original strictly-lower comparison's tie-breaking.
    best_gmm = min(candidates, key=lambda model: model.bic(betas))

    covars = best_gmm.covariances_.squeeze()
    if best_gmm.n_components == 1:  # pragma: no cover
        epsilon_effect = covars[0]
    else:
        # TODO: handle case where first component composed more than 50% SNPs
        # https://github.com/ramachandran-lab/genee/blob/a357a956241df93f16e07664e24f3aeac65f4177/genee/R/genee_EM.R#L28-L29
        covars_decreasing = np.sort(covars)[::-1]
        epsilon_effect = covars_decreasing[1]

    return epsilon_effect
def ensure_positive_real(x):
    """Strip any negligible imaginary parts from ``x`` and floor all
    non-positive entries at 1e-20, so downstream p-value math stays in
    the positive real domain. Mutates (and returns) the array in place
    when no complex-to-real conversion is needed.
    """
    x = np.real_if_close(x)
    nonpositive = x <= 0.0
    x[nonpositive] = 1e-20
    return x
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:,:,:], float64[:,:], float64[:,:])",
        "void(int16[:,:,:], float64[:,:], float64[:,:])",
        "void(int32[:,:,:], float64[:,:], float64[:,:])",
        "void(int64[:,:,:], float64[:,:], float64[:,:])",
    ],
    "(v,s,k)->(s,s),(s,s)",
)
def allele_matching_diag(
    gt: ArrayLike,
    numerator: ArrayLike,
    denominator: ArrayLike,
) -> None:  # pragma: no cover
    """Accumulate pairwise identity-by-state numerators/denominators for all
    sample pairs within one block of genotypes (alleles < 0 are missing).
    Only the lower triangle (s1 <= s0) is computed and mirrored to keep the
    output symmetric.
    """
    n_variant, n_sample, ploidy = gt.shape
    numerator[:] = 0.0
    denominator[:] = 0.0
    for v in range(n_variant):
        for s0 in range(n_sample):
            for s1 in range(s0 + 1):
                # local IBS prob to ensure even weighting of loci
                local_num = 0
                local_denom = 0
                for i in range(ploidy):
                    a0 = gt[v, s0, i]
                    if a0 >= 0:
                        for j in range(ploidy):
                            a1 = gt[v, s1, j]
                            if a1 >= 0:
                                local_denom += 1
                                if a0 == a1:
                                    local_num += 1
                if local_denom > 0:
                    p_ibs = local_num / local_denom
                    numerator[s0, s1] += p_ibs
                    numerator[s1, s0] += p_ibs
                    denominator[s0, s1] += 1.0
                    denominator[s1, s0] += 1.0
                    # undo double addition to diagonal
                    if local_denom > 0:
                        numerator[s0, s0] -= p_ibs
                        denominator[s0, s0] -= 1.0
@numba_guvectorize(  # type: ignore
    [
        "void(int8[:,:,:], int8[:,:,:], float64[:,:], float64[:,:])",
        "void(int16[:,:,:], int16[:,:,:], float64[:,:], float64[:,:])",
        "void(int32[:,:,:], int32[:,:,:], float64[:,:], float64[:,:])",
        "void(int64[:,:,:], int64[:,:,:], float64[:,:], float64[:,:])",
    ],
    "(v,s0,k),(v,s1,k)->(s0,s1),(s0,s1)",
)
def allele_matching_block(
    gt0: ArrayLike,
    gt1: ArrayLike,
    numerator: ArrayLike,
    denominator: ArrayLike,
) -> None:  # pragma: no cover
    """Accumulate pairwise identity-by-state numerators/denominators between
    two distinct sample blocks sharing the same variants (alleles < 0 are
    missing). Unlike the diagonal variant, every (s0, s1) pair is distinct,
    so no mirroring or diagonal correction is needed.
    """
    n_variant, n_sample0, ploidy = gt0.shape
    _, n_sample1, _ = gt1.shape
    numerator[:] = 0.0
    denominator[:] = 0.0
    for v in range(n_variant):
        for s0 in range(n_sample0):
            for s1 in range(n_sample1):
                # local IBS prob to ensure even weighting of loci
                local_num = 0
                local_denom = 0
                for i in range(ploidy):
                    a0 = gt0[v, s0, i]
                    if a0 >= 0:
                        for j in range(ploidy):
                            a1 = gt1[v, s1, j]
                            if a1 >= 0:
                                local_denom += 1
                                if a0 == a1:
                                    local_num += 1
                if local_denom > 0:
                    p_ibs = local_num / local_denom
                    numerator[s0, s1] += p_ibs
                    denominator[s0, s1] += 1.0
def r2_score(YP: ArrayLike, YT: ArrayLike) -> ArrayLike:
    """R2 score calculator for batches of vector pairs.

    Parameters
    ----------
    YP
        ArrayLike (..., M)
        Predicted values; any shape >= 1D with leading dimensions
        broadcastable against those of `YT`.
    YT
        ArrayLike (..., M)
        True values; any shape >= 1D with leading dimensions
        broadcastable against those of `YP`.

    Returns
    -------
    R2 : (...) ArrayLike
        R2 scores with shape equal to all leading (batch) dimensions
        of the provided arrays.
    """
    YP, YT = np.broadcast_arrays(YP, YT)  # type: ignore[no-untyped-call]
    # Total and residual sums of squares along the last (sample) axis.
    ss_tot = np.power(YT - YT.mean(axis=-1, keepdims=True), 2).sum(
        axis=-1, keepdims=True
    )
    ss_res = np.power(YT - YP, 2).sum(axis=-1, keepdims=True)
    has_res, has_tot = ss_res != 0, ss_tot != 0
    # Degenerate cases: perfect fit -> 1; constant truth with error -> 0.
    fallback = np.where(has_res & ~has_tot, 0, 1)
    # Hide warnings rather than use masked division
    # because the latter is not supported by dask
    with np.errstate(divide="ignore", invalid="ignore"):
        scores = np.where(has_res & has_tot, 1 - ss_res / ss_tot, fallback)
    return np.squeeze(scores, axis=-1)
def assert_block_shape(x: Array, *args: int) -> None:
    """Validate block shape (i.e. x.numblocks)"""
    expected = tuple(args)
    assert x.numblocks == tuple(
        expected
    ), f"Expecting block shape {expected}, found {x.numblocks}"


def assert_chunk_shape(x: Array, *args: int) -> None:
    """Validate chunk shape (i.e. x.chunksize)"""
    expected = tuple(args)
    assert x.chunksize == expected, f"Expecting chunk shape {expected}, found {x.chunksize}"


def assert_array_shape(x: ArrayLike, *args: int) -> None:
    """Validate array shape (i.e. x.shape)"""
    expected = tuple(args)
    assert x.shape == expected, f"Expecting array shape {expected}, found {x.shape}"


def map_blocks_asnumpy(x: Array) -> Array:
    # Convert cupy-backed blocks to numpy arrays; a no-op for any other
    # array backend.
    if hasattr(x, "_meta") and da.utils.is_cupy_type(x._meta):  # pragma: no cover
        import cupy as cp  # type: ignore[import]

        x = x.map_blocks(cp.asnumpy)
    return x
This function is primarily a convenience on 29 | generating :class:`xarray.Dataset` containers so quantities of interest 30 | should be overwritten, where appropriate, within the 31 | context of a more specific application. 32 | 33 | Parameters 34 | ---------- 35 | n_variant 36 | Number of variants to simulate 37 | n_sample 38 | Number of samples to simulate 39 | n_ploidy 40 | Number of chromosome copies in each sample 41 | n_allele 42 | Number of alleles to simulate 43 | n_contig 44 | optional 45 | Number of contigs to partition variants with, 46 | controlling values in ``variant_contig``. Values 47 | will all be 0 by default when ``n_contig`` is 1. 48 | seed 49 | Seed for random number generation, optional 50 | missing_pct 51 | The percentage of missing calls, must be within [0.0, 1.0], optional 52 | phased 53 | Whether genotypes are phased, default is unphased, optional 54 | additional_variant_fields 55 | Additional variant fields to add to the dataset as a dictionary of 56 | {field_name: field_dtype}, optional 57 | 58 | Returns 59 | ------- 60 | A dataset containing the following variables: 61 | 62 | - :data:`sgkit.variables.variant_contig_spec` (variants) 63 | - :data:`sgkit.variables.variant_position_spec` (variants) 64 | - :data:`sgkit.variables.variant_allele_spec` (variants) 65 | - :data:`sgkit.variables.sample_id_spec` (samples) 66 | - :data:`sgkit.variables.call_genotype_spec` (variants, samples, ploidy) 67 | - :data:`sgkit.variables.call_genotype_mask_spec` (variants, samples, ploidy) 68 | - :data:`sgkit.variables.call_genotype_phased_spec` (variants, samples), if ``phased`` is not None 69 | - Those specified in ``additional_variant_fields``, if provided 70 | """ 71 | if missing_pct and (missing_pct < 0.0 or missing_pct > 1.0): 72 | raise ValueError("missing_pct must be within [0.0, 1.0]") 73 | rs = np.random.RandomState(seed=seed) 74 | call_genotype = rs.randint( 75 | 0, n_allele, size=(n_variant, n_sample, n_ploidy), dtype=np.int8 76 | ) 77 | if 
missing_pct: 78 | call_genotype = np.where( 79 | rs.rand(*call_genotype.shape) < missing_pct, -1, call_genotype 80 | ) 81 | if phased is None: 82 | call_genotype_phased = None 83 | else: 84 | call_genotype_phased = np.full((n_variant, n_sample), phased, dtype=bool) 85 | 86 | contig_size = split_array_chunks(n_variant, n_contig) 87 | contig = np.repeat(np.arange(n_contig), contig_size) 88 | contig_names = np.unique(contig).astype(str).tolist() # type: ignore[no-untyped-call] 89 | position = np.concatenate([np.arange(contig_size[i]) for i in range(n_contig)]) # type: ignore[no-untyped-call] 90 | assert position.size == contig.size 91 | alleles: ArrayLike = rs.choice( 92 | ["A", "C", "G", "T"], size=(n_variant, n_allele) 93 | ).astype("S") 94 | sample_id = np.array([f"S{i}" for i in range(n_sample)]) 95 | ds = create_genotype_call_dataset( 96 | variant_contig_names=contig_names, 97 | variant_contig=contig, 98 | variant_position=position, 99 | variant_allele=alleles, 100 | sample_id=sample_id, 101 | call_genotype=call_genotype, 102 | call_genotype_phased=call_genotype_phased, 103 | ) 104 | # Add in each of the additional variant fields, if provided with random data 105 | if additional_variant_fields is not None: 106 | for field_name, field_dtype in additional_variant_fields.items(): 107 | if field_dtype in (np.float32, np.float64): 108 | field = rs.rand(n_variant).astype(field_dtype) 109 | elif field_dtype in (np.int8, np.int16, np.int32, np.int64): 110 | field = rs.randint(0, 100, n_variant, dtype=field_dtype) 111 | elif field_dtype is bool: 112 | field = rs.rand(n_variant) > 0.5 113 | elif field_dtype is str: 114 | field = np.arange(n_variant).astype("S") 115 | else: 116 | raise ValueError(f"Unrecognized dtype {field_dtype}") 117 | ds[field_name] = (("variants",), field) 118 | return ds 119 | -------------------------------------------------------------------------------- /sgkit/tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/data/sample.bed: -------------------------------------------------------------------------------- 1 | chr0 0 10 2 | chr0 10 20 3 | chr1 0 10 4 | chr1 20 30 5 | chr1 30 40 6 | chr1 50 60 -------------------------------------------------------------------------------- /sgkit/tests/io/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.metadata2.mmm 2 | *.metafile 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example-no-samples.bgen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/data/example-no-samples.bgen -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example-separate-samples.bgen: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/data/example-separate-samples.bgen -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example-separate-samples.sample: -------------------------------------------------------------------------------- 1 | ID 2 | 0 3 | s1 4 | s2 5 | s3 6 | s4 7 | s5 8 | -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/example.bgen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/bgen/data/example.bgen -------------------------------------------------------------------------------- /sgkit/tests/io/bgen/data/samples: -------------------------------------------------------------------------------- 1 | sample_001 2 | sample_002 3 | sample_003 4 | sample_004 5 | sample_005 6 | -------------------------------------------------------------------------------- /sgkit/tests/io/data/sample.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/data/sample.vcf.gz -------------------------------------------------------------------------------- /sgkit/tests/io/data/sample.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/data/sample.vcf.gz.tbi -------------------------------------------------------------------------------- /sgkit/tests/io/plink/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/__init__.py -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/data/example.bed -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.bim: -------------------------------------------------------------------------------- 1 | 1 1_10 0 10 A G 2 | 1 1_20 0 20 T C 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.fam: -------------------------------------------------------------------------------- 1 | ind0 ind0 0 0 0 -9 2 | ind1 ind1 0 0 0 -9 3 | ind2 ind2 0 0 0 -9 4 | ind3 ind3 0 0 0 -9 5 | ind4 ind4 0 0 0 -9 6 | ind5 ind5 0 0 0 -9 7 | ind6 ind6 0 0 0 -9 8 | ind7 ind7 0 0 0 -9 9 | ind8 ind8 0 0 0 -9 10 | ind9 ind9 0 0 0 -9 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.map: -------------------------------------------------------------------------------- 1 | 1 1_10 0 10 2 | 1 1_20 0 20 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.nosex: -------------------------------------------------------------------------------- 1 | ind0 ind0 2 | ind1 ind1 3 | ind2 ind2 4 | ind3 ind3 5 | ind4 ind4 6 | ind5 ind5 7 | ind6 ind6 8 | ind7 ind7 9 | ind8 ind8 10 | ind9 ind9 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example.ped: -------------------------------------------------------------------------------- 1 | ind0 ind0 0 0 0 0 A A T T 2 | ind1 ind1 0 0 0 0 A A T T 3 | ind2 
ind2 0 0 0 0 A A T T 4 | ind3 ind3 0 0 0 0 G G T T 5 | ind4 ind4 0 0 0 0 G G C C 6 | ind5 ind5 0 0 0 0 G G C C 7 | ind6 ind6 0 0 0 0 G G C C 8 | ind7 ind7 0 0 0 0 G G C C 9 | ind8 ind8 0 0 0 0 G G C C 10 | ind9 ind9 0 0 0 0 G G C C 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example_with_fam.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/data/example_with_fam.bed -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example_with_fam.bim: -------------------------------------------------------------------------------- 1 | 1 1_10 0 10 A G 2 | 1 1_20 0 20 T C 3 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/example_with_fam.fam: -------------------------------------------------------------------------------- 1 | ind0 ind0 0 0 0 -9 2 | ind1 ind1 0 0 0 -9 3 | ind2 ind2 ind1 ind0 2 1 4 | ind3 ind3 ind1 ind0 1 2 5 | ind4 ind4 0 0 0 -9 6 | ind5 ind5 0 0 0 -9 7 | ind6 ind6 0 0 0 -9 8 | ind7 ind7 0 0 0 -9 9 | ind8 ind8 0 0 0 -9 10 | ind9 ind9 0 0 0 -9 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.bed -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.bim: -------------------------------------------------------------------------------- 1 | 1 1:1:G:CGCGCG 0.0 1 CGCGCG G 2 | 1 1:2:ACT:G 0.0 2 G ACT 3 | 1 1:3:ACT:G 0.0 3 G ACT 4 | 1 1:4:G:CGCGCG 
0.0 4 CGCGCG G 5 | 1 1:5:G:CGCGCG 0.0 5 CGCGCG G 6 | 1 1:6:ACT:G 0.0 6 G ACT 7 | 1 1:7:G:CGCGCG 0.0 7 CGCGCG G 8 | 1 1:8:T:GTGG 0.0 8 GTGG T 9 | 1 1:9:T:GTGG 0.0 9 GTGG T 10 | 1 1:10:A:C 0.0 10 C A 11 | 1 1:11:ACT:G 0.0 11 G ACT 12 | 1 1:12:G:CGCGCG 0.0 12 CGCGCG G 13 | 1 1:13:G:CGCGCG 0.0 13 CGCGCG G 14 | 1 1:14:T:GTGG 0.0 14 GTGG T 15 | 1 1:15:ACT:G 0.0 15 G ACT 16 | 1 1:16:A:C 0.0 16 C A 17 | 1 1:17:ACT:G 0.0 17 G ACT 18 | 1 1:18:T:GTGG 0.0 18 GTGG T 19 | 1 1:19:A:C 0.0 19 C A 20 | 1 1:20:A:C 0.0 20 C A 21 | 1 1:21:T:GTGG 0.0 21 GTGG T 22 | 1 1:22:G:CGCGCG 0.0 22 CGCGCG G 23 | 1 1:23:T:GTGG 0.0 23 GTGG T 24 | 1 1:24:A:C 0.0 24 C A 25 | 1 1:25:A:C 0.0 25 C A 26 | 1 1:26:ACT:G 0.0 26 G ACT 27 | 1 1:27:G:CGCGCG 0.0 27 CGCGCG G 28 | 1 1:28:ACT:G 0.0 28 G ACT 29 | 1 1:29:T:GTGG 0.0 29 GTGG T 30 | 1 1:30:A:C 0.0 30 C A 31 | 1 1:31:T:GTGG 0.0 31 GTGG T 32 | 1 1:32:G:CGCGCG 0.0 32 CGCGCG G 33 | 1 1:33:ACT:G 0.0 33 G ACT 34 | 1 1:34:G:CGCGCG 0.0 34 CGCGCG G 35 | 1 1:35:A:C 0.0 35 C A 36 | 1 1:36:G:CGCGCG 0.0 36 CGCGCG G 37 | 1 1:37:T:GTGG 0.0 37 GTGG T 38 | 1 1:38:A:C 0.0 38 C A 39 | 1 1:39:A:C 0.0 39 C A 40 | 1 1:40:T:GTGG 0.0 40 GTGG T 41 | 1 1:41:A:C 0.0 41 C A 42 | 1 1:42:G:CGCGCG 0.0 42 CGCGCG G 43 | 1 1:43:T:GTGG 0.0 43 GTGG T 44 | 1 1:44:ACT:G 0.0 44 G ACT 45 | 1 1:45:G:CGCGCG 0.0 45 CGCGCG G 46 | 1 1:46:ACT:G 0.0 46 G ACT 47 | 1 1:47:G:CGCGCG 0.0 47 CGCGCG G 48 | 1 1:48:A:C 0.0 48 C A 49 | 1 1:49:A:C 0.0 49 C A 50 | 1 1:50:A:C 0.0 50 C A 51 | 1 1:51:G:CGCGCG 0.0 51 CGCGCG G 52 | 1 1:52:A:C 0.0 52 C A 53 | 1 1:53:ACT:G 0.0 53 G ACT 54 | 1 1:54:A:C 0.0 54 C A 55 | 1 1:55:G:CGCGCG 0.0 55 CGCGCG G 56 | 1 1:56:T:GTGG 0.0 56 GTGG T 57 | 1 1:57:G:CGCGCG 0.0 57 CGCGCG G 58 | 1 1:58:A:C 0.0 58 C A 59 | 1 1:59:T:GTGG 0.0 59 GTGG T 60 | 1 1:60:G:CGCGCG 0.0 60 CGCGCG G 61 | 1 1:61:ACT:G 0.0 61 G ACT 62 | 1 1:62:A:C 0.0 62 C A 63 | 1 1:63:G:CGCGCG 0.0 63 CGCGCG G 64 | 1 1:64:T:GTGG 0.0 64 GTGG T 65 | 1 1:65:T:GTGG 0.0 65 GTGG T 66 | 1 1:66:ACT:G 0.0 66 G ACT 67 | 1 
1:67:T:GTGG 0.0 67 GTGG T 68 | 1 1:68:ACT:G 0.0 68 G ACT 69 | 1 1:69:G:CGCGCG 0.0 69 CGCGCG G 70 | 1 1:70:G:CGCGCG 0.0 70 CGCGCG G 71 | 1 1:71:ACT:G 0.0 71 G ACT 72 | 1 1:72:G:CGCGCG 0.0 72 CGCGCG G 73 | 1 1:73:A:C 0.0 73 C A 74 | 1 1:74:A:C 0.0 74 C A 75 | 1 1:75:T:GTGG 0.0 75 GTGG T 76 | 1 1:76:A:C 0.0 76 C A 77 | 1 1:77:ACT:G 0.0 77 G ACT 78 | 1 1:78:ACT:G 0.0 78 G ACT 79 | 1 1:79:A:C 0.0 79 C A 80 | 1 1:80:A:C 0.0 80 C A 81 | 1 1:81:A:C 0.0 81 C A 82 | 1 1:82:T:GTGG 0.0 82 GTGG T 83 | 1 1:83:A:C 0.0 83 C A 84 | 1 1:84:ACT:G 0.0 84 G ACT 85 | 1 1:85:A:C 0.0 85 C A 86 | 1 1:86:G:CGCGCG 0.0 86 CGCGCG G 87 | 1 1:87:ACT:G 0.0 87 G ACT 88 | 1 1:88:A:C 0.0 88 C A 89 | 1 1:89:A:C 0.0 89 C A 90 | 1 1:90:T:GTGG 0.0 90 GTGG T 91 | 1 1:91:T:GTGG 0.0 91 GTGG T 92 | 1 1:92:T:GTGG 0.0 92 GTGG T 93 | 1 1:93:A:C 0.0 93 C A 94 | 1 1:94:A:C 0.0 94 C A 95 | 1 1:95:A:C 0.0 95 C A 96 | 1 1:96:A:C 0.0 96 C A 97 | 1 1:97:T:GTGG 0.0 97 GTGG T 98 | 1 1:98:ACT:G 0.0 98 G ACT 99 | 1 1:99:T:GTGG 0.0 99 GTGG T 100 | 1 1:100:A:C 0.0 100 C A 101 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss.fam: -------------------------------------------------------------------------------- 1 | 0 000 0 0 0 NA 2 | 0 001 0 0 0 NA 3 | 0 002 0 0 0 NA 4 | 0 003 0 0 0 NA 5 | 0 004 0 0 0 NA 6 | 0 005 0 0 0 NA 7 | 0 006 0 0 0 NA 8 | 0 007 0 0 0 NA 9 | 0 008 0 0 0 NA 10 | 0 009 0 0 0 NA 11 | -------------------------------------------------------------------------------- /sgkit/tests/io/plink/test_plink_writer.py: -------------------------------------------------------------------------------- 1 | from filecmp import cmp 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from sgkit.io.plink import plink_to_zarr, read_plink 7 | from sgkit.io.plink.plink_reader import read_bim, read_fam 8 | from sgkit.io.plink.plink_writer import write_plink, zarr_to_plink 9 | from sgkit.testing import simulate_genotype_call_dataset 10 | 11 | 
# Names of the PLINK fixture datasets under the shared data directory
example_dataset_1 = "plink_sim_10s_100v_10pmiss"
example_dataset_2 = "example"
example_dataset_3 = "example_with_fam"


@pytest.fixture(params=[dict()])
def ds1(shared_datadir, request):
    # Read the simulated PLINK dataset for tests that request it
    path = shared_datadir / example_dataset_1
    return read_plink(path=path, bim_sep="\t", fam_sep="\t", **request.param)


@pytest.mark.parametrize(
    "plink_in, fam_sep",
    [
        (example_dataset_1, "\t"),
        (example_dataset_2, " "),
        (example_dataset_3, " "),
    ],
)
def test_write_plink(shared_datadir, tmp_path, plink_in, fam_sep):
    # read plink file as a dataset then write it out again
    ds = read_plink(path=shared_datadir / plink_in, fam_sep=fam_sep)
    path = tmp_path / "plink_out"
    path.mkdir(parents=True, exist_ok=False)
    write_plink(ds, path=path)

    # check bed files are the same
    bed_path_expected = (shared_datadir / plink_in).with_suffix(".bed")
    bed_path_actual = path.with_suffix(".bed")
    assert cmp(bed_path_expected, bed_path_actual)

    # check bim files are the same
    bim_expected = read_bim((shared_datadir / plink_in).with_suffix(".bim")).compute()
    bim_actual = read_bim(path.with_suffix(".bim")).compute()
    pd.testing.assert_frame_equal(bim_expected, bim_actual)

    # check fam files are the same
    fam_expected = read_fam(
        (shared_datadir / plink_in).with_suffix(".fam"), sep=fam_sep
    ).compute()
    fam_actual = read_fam(path.with_suffix(".fam")).compute()
    pd.testing.assert_frame_equal(fam_expected, fam_actual)


@pytest.mark.parametrize(
    "plink_in, fam_sep",
    [
        (example_dataset_1, "\t"),
        (example_dataset_2, " "),
        (example_dataset_3, " "),
    ],
)
def test_zarr_to_plink(shared_datadir, tmp_path, plink_in, fam_sep):
    # read plink file as a zarr file then write it out again
    zarr_path = tmp_path / "plink.zarr"
    plink_to_zarr(path=shared_datadir / plink_in, output=zarr_path, fam_sep=fam_sep)
    path = tmp_path / "plink_out"
    path.mkdir(parents=True, exist_ok=False)
    zarr_to_plink(zarr_path, path=path)

    # check bed files are the same
    bed_path_expected = (shared_datadir / plink_in).with_suffix(".bed")
    bed_path_actual = path.with_suffix(".bed")
    assert cmp(bed_path_expected, bed_path_actual)

    # check bim files are the same
    bim_expected = read_bim((shared_datadir / plink_in).with_suffix(".bim")).compute()
    bim_actual = read_bim(path.with_suffix(".bim")).compute()
    pd.testing.assert_frame_equal(bim_expected, bim_actual)

    # check fam files are the same
    fam_expected = read_fam(
        (shared_datadir / plink_in).with_suffix(".fam"), sep=fam_sep
    ).compute()
    fam_actual = read_fam(path.with_suffix(".fam")).compute()
    pd.testing.assert_frame_equal(fam_expected, fam_actual)


def test_raise_on_both_path_types(ds1):
    # `path` and the individual `{bed,bim,fam}_path` arguments are exclusive
    with pytest.raises(
        ValueError,
        match="Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both",
    ):
        write_plink(ds1, path="x", bed_path="x")


def test_genotype_inputs_checks():
    # Writer only supports diploid, biallelic genotypes
    g_wrong_ploidy = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
    with pytest.raises(
        ValueError, match="write_plink only works for diploid genotypes"
    ):
        write_plink(g_wrong_ploidy, path="x")

    g_non_biallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
    with pytest.raises(
        ValueError, match="write_plink only works for biallelic genotypes"
    ):
        write_plink(g_non_biallelic, path="x")
-------------------------------------------------------------------------------- /sgkit/tests/io/test_dataset.py: --------------------------------------------------------------------------------
from typing import MutableMapping

import pytest
import xarray as xr
import zarr
from packaging.version import Version
from xarray import Dataset

from sgkit import load_dataset, save_dataset
from sgkit.testing import simulate_genotype_call_dataset


def assert_identical(ds1: Dataset, ds2: Dataset) -> None:
    """Assert two Datasets are identical, including dtypes for all variables."""
    xr.testing.assert_identical(ds1, ds2)
    assert all([ds1[v].dtype == ds2[v].dtype for v in ds1.data_vars])


@pytest.mark.parametrize(
    "is_path",
    [True, False],
)
def test_save_and_load_dataset(tmp_path, is_path):
    # Round-trip a dataset through zarr via both Path and str path arguments
    path = tmp_path / "ds.zarr"
    if not is_path:
        path = str(path)
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    save_dataset(ds, path)
    ds2 = load_dataset(path)
    assert_identical(ds, ds2)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    path2 = tmp_path / "ds2.zarr"
    if not is_path:
        path2 = str(path2)
    save_dataset(ds2, path2)
    assert_identical(ds, load_dataset(path2))


def test_save_and_load_dataset__mutable_mapping():
    # An in-memory dict works as a zarr store as well
    store: MutableMapping[str, bytes] = {}
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    save_dataset(ds, store)
    ds2 = load_dataset(store)
    assert_identical(ds, ds2)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    store2: MutableMapping[str, bytes] = {}
    save_dataset(ds2, store2)
    assert_identical(ds, load_dataset(store2))


def test_save_unequal_chunks_error():
    # Make all dimensions the same size for ease of testing
    ds = simulate_genotype_call_dataset(
        n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
    )
    # Normal zarr errors shouldn't be caught
    with pytest.raises(
        (FileExistsError, ValueError),
        match="(path '' contains an array|is not empty)",
    ):
        save_dataset(ds, {".zarray": ""})

    # Make the dataset have unequal chunk sizes across all dimensions
    ds = ds.chunk({dim: (1, 3, 5, 1) for dim in ds.sizes})

    # Check we get the sgkit error message
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {})

    # xarray gives a different error message when there are two chunks, so check that too
    ds = ds.chunk({dim: (4, 6) for dim in ds.sizes})
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {})


@pytest.mark.skipif(
    Version(zarr.__version__).major >= 3, reason="Fails for Zarr Python 3"
)
def test_save_auto_rechunk():
    # Make all dimensions the same size for ease of testing
    ds = simulate_genotype_call_dataset(
        n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
    )
    # Make the dataset have unequal chunk sizes across all dimensions
    ds = ds.chunk({dim: (1, 3, 5, 1) for dim in ds.sizes})

    # Default is to not rechunk
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {})

    # Rechunking off
    with pytest.raises(
        ValueError, match="Zarr requires uniform chunk sizes. Use the `auto_rechunk`"
    ):
        save_dataset(ds, {}, auto_rechunk=False)

    store = {}
    save_dataset(ds, store, auto_rechunk=True)
    assert_identical(ds, load_dataset(store))

    # An equal chunked ds retains its original chunking
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    ds = ds.chunk({dim: 5 for dim in ds.sizes})
    store2 = {}
    save_dataset(ds, store2, auto_rechunk=True)
    ds_loaded = load_dataset(store2)
    assert_identical(ds, ds_loaded)
    assert ds_loaded.chunks == ds.chunks
-------------------------------------------------------------------------------- /sgkit/tests/test_cohort_numba_fns.py: --------------------------------------------------------------------------------
import dask.array as da
import numpy as np
import pytest

from sgkit.stats.cohort_numba_fns import (
    cohort_mean,
    cohort_nanmean,
    cohort_nansum,
    cohort_sum,
)


def _random_cohort_data(chunks, n, axis, missing=0.0, scale=1, dtype=float, seed=0):
    # Build a random dask array with the given chunking plus a random
    # cohort assignment (-1 means "no cohort") along `axis`
    shape = tuple(np.sum(tup) for tup in chunks)
    np.random.seed(seed)
    x = np.random.rand(*shape) * scale
    idx = np.random.choice([1, 0], shape, p=[missing, 1 - missing]).astype(bool)
    x[idx] = np.nan
    x = da.asarray(x, chunks=chunks, dtype=dtype)
    cohort = np.random.randint(-1, n, size=shape[axis])
    return x, cohort, n, axis


def _cohort_reduction(func, x, cohort, n, axis=-1):
    # reference implementation
    out = []
    for i in range(n):
        idx = np.where(cohort == i)[0]
        x_c = np.take(x, idx, axis=axis)
        out.append(func(x_c, axis=axis))
    out = np.swapaxes(np.array(out), 0, axis)
    return out


@pytest.mark.parametrize(
    "x, cohort, n, axis",
    [
        _random_cohort_data((20,), n=3, axis=0),
        _random_cohort_data((20, 20), n=2, axis=0, dtype=np.float32),
        _random_cohort_data((10, 10), n=2, axis=-1, scale=30,
dtype=np.int16),
        _random_cohort_data((20, 20), n=3, axis=-1, missing=0.3),
        _random_cohort_data((7, 103, 4), n=5, axis=1, scale=7, missing=0.3),
        _random_cohort_data(
            ((3, 4), (50, 50, 3), 4), n=5, axis=1, scale=7, dtype=np.uint8
        ),
        _random_cohort_data(
            ((6, 6), (50, 50, 7), (3, 1)), n=5, axis=1, scale=7, missing=0.3
        ),
    ],
)
@pytest.mark.parametrize(
    "reduction, func",
    [
        (cohort_sum, np.sum),
        (cohort_nansum, np.nansum),
        (cohort_mean, np.mean),
        (cohort_nanmean, np.nanmean),
    ],
)
def test_cohort_reductions(reduction, func, x, cohort, n, axis):
    # Compare each numba cohort reduction against the plain numpy reference
    expect = _cohort_reduction(func, x, cohort, n, axis=axis)
    actual = reduction(x, cohort, n, axis=axis)
    np.testing.assert_array_almost_equal(expect, actual)
-------------------------------------------------------------------------------- /sgkit/tests/test_cohorts.py: --------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import pytest

from sgkit.cohorts import _cohorts_to_array, _tuple_len


def test_tuple_len():
    assert _tuple_len(tuple()) == 0
    assert _tuple_len(1) == 1
    assert _tuple_len("a") == 1
    assert _tuple_len("ab") == 1
    assert _tuple_len((1,)) == 1
    assert _tuple_len(("a",)) == 1
    assert _tuple_len(("ab",)) == 1
    assert _tuple_len((1, 2)) == 2
    assert _tuple_len(("a", "b")) == 2
    assert _tuple_len(("ab", "cd")) == 2


def test_cohorts_to_array__indexes():
    # Cohorts given as integer indexes
    with pytest.raises(ValueError, match="Cohort tuples must all be the same length"):
        _cohorts_to_array([(0, 1), (0, 1, 2)])

    np.testing.assert_equal(_cohorts_to_array([]), np.array([]))
    np.testing.assert_equal(_cohorts_to_array([0, 1]), np.array([[0], [1]]))
    np.testing.assert_equal(
        _cohorts_to_array([(0, 1), (2, 1)]), np.array([[0, 1], [2, 1]])
    )
    np.testing.assert_equal(
        _cohorts_to_array([(0, 1, 2), (3, 1, 2)]), np.array([[0, 1, 2], [3, 1, 2]])
    )


def test_cohorts_to_array__ids():
    # Cohorts given as string identifiers resolved against an index
    with pytest.raises(ValueError, match="Cohort tuples must all be the same length"):
        _cohorts_to_array([("c0", "c1"), ("c0", "c1", "c2")])

    np.testing.assert_equal(_cohorts_to_array([]), np.array([]))
    np.testing.assert_equal(
        _cohorts_to_array(["c0", "c1"], pd.Index(["c0", "c1"])),
        np.array([[0], [1]]),
    )
    np.testing.assert_equal(
        _cohorts_to_array([("c0", "c1"), ("c2", "c1")], pd.Index(["c0", "c1", "c2"])),
        np.array([[0, 1], [2, 1]]),
    )
    np.testing.assert_equal(
        _cohorts_to_array(
            [("c0", "c1", "c2"), ("c3", "c1", "c2")], pd.Index(["c0", "c1", "c2", "c3"])
        ),
        np.array([[0, 1, 2], [3, 1, 2]]),
    )
-------------------------------------------------------------------------------- /sgkit/tests/test_genee.py: --------------------------------------------------------------------------------
import numpy as np
import numpy.testing as npt
import pandas as pd

from sgkit import genee
from sgkit.model import create_genotype_call_dataset
from sgkit.utils import encode_array


def test_genee(datadir):
    # Simulated test data was created using https://github.com/ramachandran-lab/genee
    #
    # Edit Simulation.R to create a smaller dataset:
    #
    # -ngene=100; min_gsize=5; max_gsize=20; nsnp_intergenic=400
    # +ngene=5; min_gsize=5; max_gsize=10; nsnp_intergenic=20
    #
    # Then run
    # R --vanilla < Simulation.R
    #
    # Followed by
    #
    # library("genee")
    # load("Simulated_LD.RData")
    # load("Simulated_Summary_Statistics.RData")
    # load("gene_list.RData")
    # # use alpha = -1 for OLS
    # result = genee(mydata, ld, alpha = -1, gene_list = gene_list)
    # write.csv(mydata, "/path/to/sgkit/sgkit/tests/test_genee/mydata.csv")
    #
# write.csv(ld, "/path/to/sgkit/sgkit/tests/test_genee/ld.csv")
    # write.csv(result, "/path/to/sgkit/sgkit/tests/test_genee/result.csv")
    # write.csv(t(sapply(gene_list, unlist)), "/path/to/sgkit/sgkit/tests/test_genee/gene_list.csv")

    mydata = pd.read_csv(datadir / "mydata.csv", index_col=0)
    ld = pd.read_csv(datadir / "ld.csv", index_col=0)

    # This was extracted from gene_list.csv
    gene_list = "1:7,8:14,15:19,20:25,26:35"
    gene_list = [[int(s) for s in ss.split(":")] for ss in gene_list.split(",")]
    gene_start, gene_stop = list(zip(*gene_list))
    gene_start = np.array(gene_start) - 1  # make 0-based
    gene_stop = np.array(gene_stop)

    ds = to_sgkit(mydata)

    # turn ld into an array
    ld = ld.to_numpy()

    # genes are windows in this simple example
    ds["window_contig"] = (["windows"], np.full(len(gene_start), 0))
    ds["window_start"] = (["windows"], gene_start)
    ds["window_stop"] = (["windows"], gene_stop)

    df = genee(ds, ld).compute()

    expected = pd.read_csv(datadir / "result.csv", index_col=0)
    expected = expected.reset_index()

    npt.assert_allclose(df["test_q"], expected["test_q"])
    npt.assert_allclose(df["q_var"], expected["q_var"], rtol=0.01)
    npt.assert_allclose(
        df[df["pval"] > 1e-6]["pval"],
        expected[expected["pval"] > 1e-6]["pval"],
        rtol=0.04,
    )


def to_sgkit(mydata):
    """Convert summary stats produced by genee R package to sgkit dataset"""
    variant_contig, variant_contig_names = encode_array(mydata.V1.to_numpy())
    variant_contig = variant_contig.astype("int16")
    variant_contig_names = [str(contig) for contig in variant_contig_names]
    variant_position = mydata.V3.to_numpy()
    variant_id = mydata.V2.to_numpy()
    variant_allele = np.array([["A"]] * len(variant_contig), dtype="S1")  # not used
    sample_id = ["SAMPLE1"]
    ds = create_genotype_call_dataset(
        variant_contig_names=variant_contig_names,
        variant_contig=variant_contig,
        variant_position=variant_position,
        variant_allele=variant_allele,
        sample_id=sample_id,
        variant_id=variant_id,
    )
    ds["beta"] = (["variants"], mydata.V4.to_numpy())
    return ds
-------------------------------------------------------------------------------- /sgkit/tests/test_genee/gene_list.csv: --------------------------------------------------------------------------------
"","V1","V2","V3","V4","V5"
"1",1:7,8:14,15:19,20:25,26:35
-------------------------------------------------------------------------------- /sgkit/tests/test_genee/mydata.csv: --------------------------------------------------------------------------------
"","V1","V2","V3","V4"
"1","1","rs1","1","0.0589981193774866"
"2","1","rs2","2","-0.0479887297468593"
"3","1","rs3","3","0.00755718359618298"
"4","1","rs4","4","0.0403655126478955"
"5","1","rs5","5","0.00386474332262596"
"6","1","rs6","6","-0.00906859548746008"
"7","1","rs7","7","0.0116812908619256"
"8","1","rs8","8","0.0403476488344958"
"9","1","rs9","9","0.0429894641101545"
"10","1","rs10","10","0.033776804193074"
"11","1","rs11","11","0.0162144868632937"
"12","1","rs12","12","-0.0151074458113783"
"13","1","rs13","13","0.0273565177791348"
"14","1","rs14","14","-0.0587960969211619"
"15","1","rs15","15","0.132980436833378"
"16","1","rs16","16","-0.0394732368188627"
"17","1","rs17","17","-0.0713981147997422"
"18","1","rs18","18","0.0100579967683699"
"19","1","rs19","19","0.0314712686930575"
"20","1","rs20","20","-0.022557235722201"
"21","1","rs21","21","-0.0514902521917086"
"22","1","rs22","22","-0.380823961633008"
"23","1","rs23","23","0.0376285736111005"
"24","1","rs24","24","0.0813696952243066"
"25","1","rs25","25","-0.0600195191576378"
"26","1","rs26","26","0.0763255078792159"
"27","1","rs27","27","-0.0157340586805173" 29 | "28","1","rs28","28","0.0053491779420878" 30 | "29","1","rs29","29","-0.0230096268786541" 31 | "30","1","rs30","30","-0.0166889799904788" 32 | "31","1","rs31","31","0.0440867194984021" 33 | "32","1","rs32","32","-0.0271740281102383" 34 | "33","1","rs33","33","0.0769936887394494" 35 | "34","1","rs34","34","0.0515012035053683" 36 | "35","1","rs35","35","0.11405453000561" 37 | "36","1","rs36","36","-0.0623497642279973" 38 | "37","1","rs37","37","-0.0439702026965466" 39 | "38","1","rs38","38","-0.0148583809493149" 40 | "39","1","rs39","39","0.0425562446140515" 41 | "40","1","rs40","40","0.0376324933539782" 42 | "41","1","rs41","41","0.15247764456262" 43 | "42","1","rs42","42","-0.0849894739154603" 44 | "43","1","rs43","43","0.0417361653291142" 45 | "44","1","rs44","44","0.057569406750715" 46 | "45","1","rs45","45","0.0155708994519841" 47 | "46","1","rs46","46","-0.212154240453643" 48 | "47","1","rs47","47","-0.302317171280787" 49 | "48","1","rs48","48","-0.317259261470206" 50 | "49","1","rs49","49","-0.0902619746076583" 51 | "50","1","rs50","50","0.0713954763102486" 52 | "51","1","rs51","51","-0.250385712358405" 53 | "52","1","rs52","52","0.0269857555070088" 54 | "53","1","rs53","53","0.0724172261358925" 55 | "54","1","rs54","54","-0.0115076932928384" 56 | "55","1","rs55","55","-0.241041012957552" 57 | -------------------------------------------------------------------------------- /sgkit/tests/test_genee/result.csv: -------------------------------------------------------------------------------- 1 | "","test_q","q_var","pval" 2 | "1",0.00770381012929696,5.97664199541393e-05,0.916111258824932 3 | "2",0.00931340387379375,5.97766731532365e-05,0.866617486805112 4 | "3",0.0254312278544544,4.2639105994213e-05,0.121151813035285 5 | "4",0.159826244240868,5.11662269290079e-05,5.26183874249853e-10 6 | "5",0.0311806269653318,8.54603570912353e-05,0.38271258751765 7 | 
class HbeTests(unittest.TestCase):
    """Tests for the hbe (Hall-Buckley-Eagleson) chi-square approximation.

    Ported from
    https://github.com/deanbodenham/momentchi2py/blob/master/tests/test_momentchi2.py
    """

    def _check_scalar(self, ans, soln, places=3):
        # Shared scalar comparison used by every single-value case.
        self.assertAlmostEqual(ans, soln, places=places, msg=None, delta=None)

    def _check_sequence(self, ans, soln, places=3):
        # Shared elementwise comparison used by every vector case.
        self.assertEqual(len(ans), len(soln))
        for got, want in zip(ans, soln):
            self.assertAlmostEqual(got, want, places=places, msg=None, delta=None)

    def test_hbe1(self):
        """hbe with x float, coeff list"""
        self._check_scalar(hbe([1.5, 1.5, 0.5, 0.5], 10.203), 0.949)

    def test_hbe2(self):
        """hbe with x float, coeff list, specifying arguments"""
        self._check_scalar(hbe(x=10.203, coeff=[1.5, 1.5, 0.5, 0.5]), 0.949)

    def test_hbe3(self):
        """hbe with x float, coeff list, specifying arguments"""
        self._check_scalar(hbe(x=0.627, coeff=[1.5, 1.5, 0.5, 0.5]), 0.0285)

    def test_hbe4(self):
        """hbe with x list, coeff list, specifying arguments"""
        ans = hbe(coeff=[1.5, 1.5, 0.5, 0.5], x=[0.627, 10.203])
        # A list input must produce a list output.
        self.assertTrue(isinstance(ans, list))
        # NOTE: this case uses a looser tolerance (2 places) than the others.
        self._check_sequence(ans, [0.0285, 0.949], places=2)

    def test_hbe5(self):
        """hbe with x float, coeff numpy array"""
        self._check_scalar(hbe(np.array([1.5, 1.5, 0.5, 0.5]), 10.203), 0.949)

    def test_hbe6(self):
        """hbe with x numpy array, coeff numpy array"""
        ans = hbe(np.array([1.5, 1.5, 0.5, 0.5]), np.array([0.627, 10.203]))
        self._check_sequence(ans, np.array([0.0285, 0.949]))

    def test_hbe7(self):
        """hbe with x numpy array one element, coeff numpy array"""
        ans = hbe(np.array([1.5, 1.5, 0.5, 0.5]), np.array([0.627]))
        self._check_sequence(ans, np.array([0.0285]))

    def test_hbe8(self):
        """hbe with x numpy array, coeff list"""
        ans = hbe([1.5, 1.5, 0.5, 0.5], np.array([0.627, 10.203]))
        self._check_sequence(ans, np.array([0.0285, 0.949]))

    def test_hbe9(self):
        """hbe with x list, coeff numpy array"""
        ans = hbe([1.5, 1.5, 0.5, 0.5], np.array([0.627, 10.203]))
        self._check_sequence(ans, np.array([0.0285, 0.949]))
0.35 0.35 0.35 0.43 0.35 0.26 0.34 0.39 2 | 0.00 1.00 0.18 0.18 0.18 0.18 0.18 0.18 0.50 0.35 0.35 0.35 0.43 0.35 0.26 0.34 0.39 3 | 0.18 0.18 1.00 0.00 0.18 0.18 0.18 0.18 0.35 0.50 0.35 0.35 0.43 0.35 0.18 0.30 0.39 4 | 0.18 0.18 0.00 1.00 0.18 0.18 0.18 0.18 0.35 0.50 0.35 0.35 0.43 0.35 0.68 0.55 0.39 5 | 0.18 0.18 0.18 0.18 1.00 0.00 0.18 0.18 0.35 0.35 0.50 0.35 0.35 0.43 0.34 0.34 0.39 6 | 0.18 0.18 0.18 0.18 0.00 1.00 0.18 0.18 0.35 0.35 0.50 0.35 0.35 0.43 0.34 0.34 0.39 7 | 0.18 0.18 0.18 0.18 0.18 0.18 1.00 0.00 0.35 0.35 0.35 0.50 0.35 0.43 0.26 0.31 0.39 8 | 0.18 0.18 0.18 0.18 0.18 0.18 0.00 1.00 0.35 0.35 0.35 0.50 0.35 0.43 0.26 0.31 0.39 9 | 0.50 0.50 0.35 0.35 0.35 0.35 0.35 0.35 1.00 0.70 0.70 0.70 0.85 0.70 0.53 0.69 0.78 10 | 0.35 0.35 0.50 0.50 0.35 0.35 0.35 0.35 0.70 1.00 0.70 0.70 0.85 0.70 0.60 0.73 0.78 11 | 0.35 0.35 0.35 0.35 0.50 0.50 0.35 0.35 0.70 0.70 1.00 0.70 0.70 0.85 0.68 0.69 0.78 12 | 0.35 0.35 0.35 0.35 0.35 0.35 0.50 0.50 0.70 0.70 0.70 1.00 0.70 0.85 0.53 0.61 0.78 13 | 0.43 0.43 0.43 0.43 0.35 0.35 0.35 0.35 0.85 0.85 0.70 0.70 1.35 0.70 0.56 0.96 1.03 14 | 0.35 0.35 0.35 0.35 0.43 0.43 0.43 0.43 0.70 0.70 0.85 0.85 0.70 1.35 0.60 0.65 1.03 15 | 0.26 0.26 0.18 0.68 0.34 0.34 0.26 0.26 0.53 0.60 0.68 0.53 0.56 0.60 1.18 0.87 0.58 16 | 0.34 0.34 0.30 0.55 0.34 0.34 0.31 0.31 0.69 0.73 0.69 0.61 0.96 0.65 0.87 1.41 0.80 17 | 0.39 0.39 0.39 0.39 0.39 0.39 0.39 0.39 0.78 0.78 0.78 0.78 1.03 1.03 0.58 0.80 1.53 18 | -------------------------------------------------------------------------------- /sgkit/tests/test_grm/Legara2009_pedigree.txt: -------------------------------------------------------------------------------- 1 | -1 -1 2 | -1 -1 3 | -1 -1 4 | -1 -1 5 | -1 -1 6 | -1 -1 7 | -1 -1 8 | -1 -1 9 | 0 1 10 | 2 3 11 | 4 5 12 | 6 7 13 | 8 9 14 | 10 11 15 | 3 10 16 | 12 14 17 | 12 13 18 | -------------------------------------------------------------------------------- /sgkit/tests/test_ibs/hierfstat.sim1.beta.txt: 
-------------------------------------------------------------------------------- 1 | 0.366651780276887 -0.221631277658236 -0.188745889326459 -0.227721164386342 0.0658113759084082 0.0743372173277576 -0.219195322966993 0.0779911493646216 0.0548495797978157 0.0889529454752141 0.0609394665259227 -0.0645122000730788 -0.164386342414031 -0.0486784945800009 0.00856644066420351 -0.0352807437781659 2 | -0.221631277658236 0.392229304534935 -0.168040274450895 -0.203361617473915 0.0962608095489423 0.0974787868945636 -0.18143802525273 -0.215541390930129 0.0986967642401851 0.0755551946733791 0.0926068775120781 -0.185091957289595 -0.191181844017701 -0.0632942227274573 -0.0657301774187001 -0.0730380414924283 3 | -0.188745889326459 -0.168040274450895 0.371523689659372 -0.211887458893265 -0.150988591612196 -0.169258251796517 0.0840810360927286 -0.219195322966993 -0.135154886119118 -0.141244772847225 -0.122975112662904 -0.0681661321099428 0.00856644066420351 -0.0584223133449718 -0.0973975884048558 -0.079127928220535 4 | -0.227721164386342 -0.203361617473915 -0.211887458893265 0.384921440461208 -0.235029028460071 -0.202143640128294 0.0767731720190004 0.0865169907839714 -0.196053753400187 -0.226503187040721 -0.208233526856401 0.0950428322033211 0.0999147415858066 -0.0584223133449718 -0.0511144492712436 -0.0291908570500589 5 | 0.0658113759084082 0.0962608095489423 -0.150988591612196 -0.235029028460071 0.405627055336771 0.0658113759084082 -0.199707685437051 -0.0839998376030205 0.247290000405993 0.239982136332264 0.242418091023507 -0.113231293897933 -0.170476229142138 0.0317080102310097 0.0597214891803012 0.039015874304738 6 | 0.0743372173277576 0.0974787868945636 -0.169258251796517 -0.202143640128294 0.0658113759084082 0.408063010028013 -0.188745889326459 -0.0937436563679915 0.237546181641022 0.232674272258536 0.237546181641022 -0.140026795501604 -0.180220047907109 0.00856644066420351 0.0426698063416019 0.0110023953554463 7 | -0.219195322966993 -0.18143802525273 0.0840810360927286 
0.0767731720190004 -0.199707685437051 -0.188745889326459 0.394665259226179 -0.0973975884048558 -0.182656002598352 -0.199707685437051 -0.176566115870245 0.162031586212496 0.285047298120255 0.0536316024521944 0.0110023953554463 0.0670293532540294 8 | 0.0779911493646216 -0.215541390930129 -0.219195322966993 0.0865169907839714 -0.0839998376030205 -0.0937436563679915 -0.0973975884048558 0.398319191263042 -0.0754739961836711 -0.0803459055661565 -0.0827818602573993 0.148633835410661 0.02318216881166 -0.0304088343956804 0.0414518289959807 0.0146563273923105 9 | 0.0548495797978157 0.0986967642401851 -0.135154886119118 -0.196053753400187 0.247290000405993 0.237546181641022 -0.182656002598352 -0.0754739961836711 0.527424789898908 0.247290000405993 0.391011327189314 -0.112013316552312 -0.153424546303439 0.108440583005156 0.125492265843855 0.106004628313913 10 | 0.0889529454752141 0.0755551946733791 -0.141244772847225 -0.226503187040721 0.239982136332264 0.232674272258536 -0.199707685437051 -0.0803459055661565 0.247290000405993 0.521334903170801 0.373959644350615 -0.138808818155982 -0.188745889326459 0.0804271040558646 0.116966424424506 0.0852990134383501 11 | 0.0609394665259227 0.0926068775120781 -0.122975112662904 -0.208233526856401 0.242418091023507 0.237546181641022 -0.176566115870245 -0.0827818602573993 0.391011327189314 0.373959644350615 0.629734886931103 -0.121757135317283 -0.170476229142138 0.21805854411108 0.232674272258536 0.207096748000487 12 | -0.0645122000730788 -0.185091957289595 -0.0681661321099428 0.0950428322033211 -0.113231293897933 -0.140026795501604 0.162031586212496 0.148633835410661 -0.112013316552312 -0.138808818155982 -0.121757135317283 0.468961877309082 0.315496731760789 0.0865169907839714 0.187609110470545 0.171775404977467 13 | -0.164386342414031 -0.191181844017701 0.00856644066420351 0.0999147415858066 -0.170476229142138 -0.180220047907109 0.285047298120255 0.02318216881166 -0.153424546303439 -0.188745889326459 -0.170476229142138 0.315496731760789 
0.596849498599326 0.20953270269173 0.103568673622671 0.202224838618002 14 | -0.0486784945800009 -0.0632942227274573 -0.0584223133449718 -0.0584223133449718 0.0317080102310097 0.00856644066420351 0.0536316024521944 -0.0304088343956804 0.108440583005156 0.0804271040558646 0.21805854411108 0.0865169907839714 0.20953270269173 0.401973123299906 0.149851812756283 0.286265275465876 15 | 0.00856644066420351 -0.0657301774187001 -0.0973975884048558 -0.0511144492712436 0.0597214891803012 0.0426698063416019 0.0110023953554463 0.0414518289959807 0.125492265843855 0.116966424424506 0.232674272258536 0.187609110470545 0.103568673622671 0.149851812756283 0.437294466322926 0.30697089034144 16 | -0.0352807437781659 -0.0730380414924283 -0.079127928220535 -0.0291908570500589 0.039015874304738 0.0110023953554463 0.0670293532540294 0.0146563273923105 0.106004628313913 0.0852990134383501 0.207096748000487 0.171775404977467 0.202224838618002 0.286265275465876 0.30697089034144 0.590759611871219 -------------------------------------------------------------------------------- /sgkit/tests/test_ibs/hierfstat.sim2.beta.txt: -------------------------------------------------------------------------------- 1 | 0.41475192333518 -0.197024512031376 0.10099131722595 2 | -0.197024512031376 0.407868471660385 0.0960331948054254 3 | 0.10099131722595 0.0960331948054254 0.40829863307675 -------------------------------------------------------------------------------- /sgkit/tests/test_ibs/hierfstat.sim3.beta.txt: -------------------------------------------------------------------------------- 1 | nan nan nan 2 | nan 0.34496319451447 0 3 | nan 0 0.34543905426274 -------------------------------------------------------------------------------- /sgkit/tests/test_import_star.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # Basic test to ensure we can import * and the the __all__ array is 3 | # well formed. 
def plink_mis(idi: "NDArray", idj: "NDArray", cmp: "Optional[NDArray]" = None) -> "List[int]":
    """Reference maximal-independent-set pruning, ported from PLINK.

    Direct port of
    https://groups.google.com/forum/#!msg/plink2-users/w5TuZo2fgsQ/WbNnE16_xDIJ

    Parameters
    ----------
    idi, idj
        Edge endpoint arrays; edge ``k`` connects vertices ``idi[k]`` and
        ``idj[k]``.
    cmp
        Optional per-edge comparison values. A negative value means the
        first endpoint of the edge loses; otherwise the second endpoint
        loses. Defaults to all zeros (second endpoint always loses).

    Returns
    -------
    Sorted list of vertex ids removed ("lost") from the graph.
    """
    # Local import keeps this helper self-contained; previously this
    # grouping used the third-party ``toolz.groupby``, which the stdlib
    # ``defaultdict`` replaces with identical semantics (insertion order
    # within each group is preserved).
    from collections import defaultdict

    if cmp is None:
        cmp = np.zeros(len(idi))
    lost = set()
    grps = defaultdict(list)
    for edge in zip(idi, idj, cmp):
        grps[edge[0]].append(edge)
    for i in sorted(grps.keys()):
        if i in lost:
            continue
        for t in sorted(grps[i]):
            j, c = t[1:]
            if j <= i:
                # Skip self-loops and edges already seen from the other side.
                continue
            if c < 0:
                # This vertex compares unfavorably: it loses, stop scanning.
                lost.add(i)
                break
            else:
                lost.add(j)
    return sorted(lost)
@pytest.mark.parametrize("mis", mis_fns)
def test_path_graph(mis):
    """MIS behaviour on the 3-node path A-B and B-C under each comparison weighting."""
    edges_i, edges_j = to_vertex_ids(nx.path_graph(3))
    # No comparison values: only the middle node is lost, first and third kept.
    assert mis(edges_i, edges_j) == [1]
    # Favoring later nodes: only the third node is kept.
    assert mis(edges_i, edges_j, cmp=np.array([-1, -1])) == [0, 1]
    # Favoring earlier nodes: the middle node is lost.
    assert mis(edges_i, edges_j, cmp=np.array([1, 1])) == [1]
    # Middle node largest: first and third are lost.
    assert mis(edges_i, edges_j, cmp=np.array([-1, 1])) == [0, 2]
/sgkit/tests/test_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_array_equal 3 | 4 | from sgkit import ( 5 | DIM_ALLELE, 6 | DIM_PLOIDY, 7 | DIM_SAMPLE, 8 | DIM_VARIANT, 9 | __version__, 10 | create_genotype_call_dataset, 11 | create_genotype_dosage_dataset, 12 | display_genotypes, 13 | ) 14 | 15 | 16 | def test_create_genotype_call_dataset(): 17 | variant_contig_names = ["chr1"] 18 | variant_contig = np.array([0, 0], dtype="i1") 19 | variant_position = np.array([1000, 2000], dtype="i4") 20 | variant_allele = np.array([["A", "C"], ["G", "A"]], dtype="S1") 21 | variant_id = np.array(["rs1", "rs2"], dtype=str) 22 | sample_id = np.array(["sample_1", "sample_2", "sample_3"], dtype=str) 23 | call_genotype = np.array( 24 | [[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]], dtype="i1" 25 | ) 26 | call_genotype_phased = np.array( 27 | [[True, True, False], [True, False, False]], dtype=bool 28 | ) 29 | ds = create_genotype_call_dataset( 30 | variant_contig_names=variant_contig_names, 31 | variant_contig=variant_contig, 32 | variant_position=variant_position, 33 | variant_allele=variant_allele, 34 | sample_id=sample_id, 35 | call_genotype=call_genotype, 36 | call_genotype_phased=call_genotype_phased, 37 | variant_id=variant_id, 38 | ) 39 | 40 | assert DIM_VARIANT in ds.sizes 41 | assert DIM_SAMPLE in ds.sizes 42 | assert DIM_PLOIDY in ds.sizes 43 | assert DIM_ALLELE in ds.sizes 44 | 45 | assert ds.attrs["source"] == f"sgkit-{__version__}" 46 | assert_array_equal(ds["contig_id"], variant_contig_names) 47 | assert_array_equal(ds["variant_contig"], variant_contig) 48 | assert_array_equal(ds["variant_position"], variant_position) 49 | assert_array_equal(ds["variant_allele"], variant_allele) 50 | assert_array_equal(ds["variant_id"], variant_id) 51 | assert_array_equal(ds["sample_id"], sample_id) 52 | assert_array_equal(ds["call_genotype"], call_genotype) 53 | 
def test_create_genotype_dosage_dataset():
    """Inputs to create_genotype_dosage_dataset should round-trip into the dataset."""
    contig_names = ["chr1"]
    contig = np.array([0, 0], dtype="i1")
    position = np.array([1000, 2000], dtype="i4")
    allele = np.array([["A", "C"], ["G", "A"]], dtype="S1")
    rs_ids = np.array(["rs1", "rs2"], dtype=str)
    samples = np.array(["sample_1", "sample_2", "sample_3"], dtype=str)
    dosage = np.array([[0.8, 0.9, np.nan], [1.0, 1.1, 1.2]], dtype="f4")
    probability = np.array(
        [
            [[0.1, 0.5, 0.4], [0.2, 0.2, 0.6], [np.nan, np.nan, np.nan]],
            [[0.1, 0.5, 0.4], [0.2, 0.2, 0.6], [0.3, 0.1, 0.6]],
        ],
        dtype="f4",
    )
    ds = create_genotype_dosage_dataset(
        variant_contig_names=contig_names,
        variant_contig=contig,
        variant_position=position,
        variant_allele=allele,
        sample_id=samples,
        call_dosage=dosage,
        call_genotype_probability=probability,
        variant_id=rs_ids,
    )

    # Both core dimensions must be present.
    assert DIM_VARIANT in ds.sizes
    assert DIM_SAMPLE in ds.sizes

    # Every input array should appear unchanged in the dataset.
    assert_array_equal(ds["contig_id"], contig_names)
    assert_array_equal(ds["variant_contig"], contig)
    assert_array_equal(ds["variant_position"], position)
    assert_array_equal(ds["variant_allele"], allele)
    assert_array_equal(ds["variant_id"], rs_ids)
    assert_array_equal(ds["sample_id"], samples)
    assert_array_equal(ds["call_dosage"], dosage)
    # Mask variables are derived from NaN positions in the inputs.
    assert_array_equal(ds["call_dosage_mask"], np.isnan(dosage))
    assert_array_equal(ds["call_genotype_probability"], probability)
    assert_array_equal(
        ds["call_genotype_probability_mask"], np.isnan(probability)
    )
-1.14865097063812 3.24295388830658 0 -1.13070701800157 -1.48939682975337 0 0 4 | -24.3902439024391 0 0 24.3902439024391 0 0 0 0 5 | -1.13070701800157 0 -1.13070701800157 0 2.26141403600313 0 0 0 6 | -1.48939682975337 0.868552171058725 -1.48939682975337 0 0 2.97487736745144 -0.868552171058725 -0.868552171058725 7 | 0 -0.868552171058725 0 0 0 -0.868552171058725 1.73710434211745 0 8 | 0 -0.868552171058725 0 0 0 -0.868552171058725 0 1.73710434211745 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_inbreeding.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 0.0770650669127826 3 | 0.231195200738348 4 | 0.041 5 | 0 6 | 0.196786032918013 7 | 0.196285927754704 8 | 0.196285927754704 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_kinship.txt: -------------------------------------------------------------------------------- 1 | 0.500000000000001 -2.26083190537432e-17 -2.87742242502186e-17 0.500000000000001 0.25 0.25 0.125 0.125 2 | 1.20724902574995e-17 0.307798800184587 0.307798800184587 1.20724902574995e-17 0.153899400092293 0.153899400092293 0.23084910013844 0.23084910013844 3 | 4.24280452978546e-17 0.307798800184587 0.615597600369174 4.24280452978546e-17 0.307798800184587 0.307798800184587 0.307798800184587 0.307798800184587 4 | 0.500000000000001 -2.68905032575161e-17 -3.58540043433548e-17 0.520500000000001 0.25 0.25 0.125 0.125 5 | 0.25 0.153899400092293 0.307798800184587 0.25 0.5 0.278899400092294 0.216399400092293 0.216399400092293 6 | 0.25 0.153899400092293 0.307798800184587 0.25 0.278899400092294 0.39758952468851 0.275744462390402 0.275744462390402 7 | 0.125 0.23084910013844 0.307798800184587 0.125 0.216399400092294 0.275744462390402 0.397214445816028 0.253296781264421 8 | 0.125 0.23084910013844 0.307798800184587 0.125 0.216399400092294 0.275744462390402 
0.253296781264421 0.397214445816028 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_kinship_inv.txt: -------------------------------------------------------------------------------- 1 | 54.0175200192724 0 3.23703221439428 -48.7804878048781 -2.26141403600313 -4.21265039278542 0 0 2 | 0 9.97195980867268 -3.24887556221889 0 0 3.4742086842349 -3.4742086842349 -3.4742086842349 3 | 3.23703221439428 -3.24887556221889 6.48590777661317 0 -2.26141403600313 -4.21265039278542 0 0 4 | -48.7804878048781 0 0 48.7804878048781 0 0 0 0 5 | -2.26141403600313 0 -2.26141403600313 0 4.52282807200626 0 0 0 6 | -4.21265039278542 3.4742086842349 -4.21265039278542 0 0 11.8995094698057 -3.4742086842349 -3.4742086842349 7 | 0 -3.4742086842349 0 0 0 -3.4742086842349 6.9484173684698 0 8 | 0 -3.4742086842349 0 0 0 -3.4742086842349 0 6.9484173684698 9 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/hamilton_kerr_pedigree.csv: -------------------------------------------------------------------------------- 1 | "INDIV.ID","SIRE.ID","DAM.ID","SIRE.GAMETE.PLOIDY","SIRE.LAMBDA","DAM.GAMETE.PLOIDY","DAM.LAMBDA","SIRE.SEGREGATION","DAM.SEGREGATION","INDIV.PLOIDY" 2 | 1,0,0,1,0,1,0,"Normal","Normal","Diploid" 3 | 2,0,0,2,0.167,2,0.167,"Normal","Normal","Tetraploid" 4 | 3,0,2,0,0,2,0.167,"NA","Normal","Diploid" 5 | 4,1,0,2,0.041,0,0,"First division restitution","NA","Diploid" 6 | 5,1,3,1,0,1,0,"Normal","Normal","Diploid" 7 | 6,1,3,2,0.918,2,0.041,"Second division restitution","First division restitution","Tetraploid" 8 | 7,6,2,2,0.167,2,0.167,"Normal","Normal","Tetraploid" 9 | 8,6,2,2,0.167,2,0.167,"Normal","Normal","Tetraploid" 10 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/kinship2_pedigree.csv: -------------------------------------------------------------------------------- 1 | 
"","ped","id","father","mother","sex","affected","avail" 2 | "1",1,101,0,0,1,0,0 3 | "2",1,102,0,0,2,1,0 4 | "3",1,103,135,136,1,1,0 5 | "4",1,104,0,0,2,0,0 6 | "5",1,105,0,0,1,NA,0 7 | "6",1,106,0,0,2,NA,0 8 | "7",1,107,0,0,1,1,0 9 | "8",1,108,0,0,2,0,0 10 | "9",1,109,101,102,2,0,1 11 | "10",1,110,103,104,1,1,1 12 | "11",1,111,103,104,2,1,0 13 | "12",1,112,103,104,1,1,0 14 | "13",1,113,0,0,2,0,1 15 | "14",1,114,103,104,1,1,0 16 | "15",1,115,105,106,2,0,0 17 | "16",1,116,105,106,2,1,1 18 | "17",1,117,0,0,1,1,0 19 | "18",1,118,105,106,2,1,1 20 | "19",1,119,105,106,1,1,1 21 | "20",1,120,107,108,2,0,0 22 | "21",1,121,110,109,1,1,0 23 | "22",1,122,110,109,2,0,0 24 | "23",1,123,110,109,2,0,0 25 | "24",1,124,110,109,1,1,1 26 | "25",1,125,112,118,2,0,1 27 | "26",1,126,112,118,2,0,1 28 | "27",1,127,114,115,1,1,1 29 | "28",1,128,114,115,1,1,1 30 | "29",1,129,117,116,1,0,1 31 | "30",1,130,119,120,1,0,1 32 | "31",1,131,119,120,1,1,0 33 | "32",1,132,119,120,1,0,0 34 | "33",1,133,119,120,2,0,1 35 | "34",1,134,119,120,2,1,0 36 | "35",1,135,0,0,1,NA,0 37 | "36",1,136,0,0,2,NA,0 38 | "37",1,137,0,0,1,NA,0 39 | "38",1,138,135,136,2,NA,0 40 | "39",1,139,137,138,1,1,0 41 | "40",1,140,137,138,2,0,1 42 | "41",1,141,137,138,2,0,1 43 | "42",2,201,0,0,1,1,1 44 | "43",2,202,0,0,2,NA,0 45 | "44",2,203,0,0,1,1,1 46 | "45",2,204,201,202,2,0,1 47 | "46",2,205,201,202,1,NA,0 48 | "47",2,206,201,202,2,1,1 49 | "48",2,207,201,202,2,1,1 50 | "49",2,208,201,202,2,0,0 51 | "50",2,209,0,0,1,0,0 52 | "51",2,210,203,204,1,0,0 53 | "52",2,211,203,204,1,0,1 54 | "53",2,212,209,208,2,0,1 55 | "54",2,213,209,208,1,0,0 56 | "55",2,214,209,208,1,1,1 57 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/pedkin_sim_founder.txt: -------------------------------------------------------------------------------- 1 | 1 1 1 0.0625 2 | 1 2 1 0.03125 3 | 1 2 2 0.0 4 | 1 3 1 0.078125 5 | 1 3 2 0.0625 6 | 1 3 3 0.0 7 | 1 4 1 0.1015625 8 | 1 4 2 0.0 9 | 1 4 3 0.0 
10 | 1 4 4 0.0 11 | 1 5 1 0.1171875 12 | 1 5 2 0.03125 13 | 1 5 3 0.25 14 | 1 5 4 0.09375 15 | 1 5 5 0.0 16 | 1 6 1 0.15234375 17 | 1 6 2 0.015625 18 | 1 6 3 0.0078125 19 | 1 6 4 0.18359375 20 | 1 6 5 0.08203125 21 | 1 6 6 0.078125 22 | 1 7 1 0.1015625 23 | 1 7 2 0.015625 24 | 1 7 3 0.140625 25 | 1 7 4 0.0546875 26 | 1 7 5 0.1015625 27 | 1 7 6 0.060546875 28 | 1 7 7 0.0 29 | 1 8 1 0.10546875 30 | 1 8 2 0.0625 31 | 1 8 3 0.15625 32 | 1 8 4 0.0625 33 | 1 8 5 0.09375 34 | 1 8 6 0.080078125 35 | 1 8 7 0.15625 36 | 1 8 8 0.0 37 | 1 9 1 0.0673828125 38 | 1 9 2 0.0 39 | 1 9 3 0.0390625 40 | 1 9 4 0.0703125 41 | 1 9 5 0.078125 42 | 1 9 6 0.07568359375 43 | 1 9 7 0.068359375 44 | 1 9 8 0.015625 45 | 1 9 9 0.125 46 | 1 10 1 0.015625 47 | 1 10 2 0.375 48 | 1 10 3 0.03125 49 | 1 10 4 0.0 50 | 1 10 5 0.015625 51 | 1 10 6 0.0078125 52 | 1 10 7 0.0078125 53 | 1 10 8 0.03125 54 | 1 10 9 0.0 55 | 1 10 10 0.25 56 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/pedkin_sim_interest.txt: -------------------------------------------------------------------------------- 1 | 1 1 2 | 1 2 3 | 1 3 4 | 1 4 5 | 1 5 6 | 1 6 7 | 1 7 8 | 1 8 9 | 1 9 10 | 1 10 11 | 1 11 12 | 1 12 13 | 1 13 14 | 1 14 15 | 1 15 16 | 1 16 17 | 1 17 18 | 1 18 19 | 1 19 20 | 1 20 21 | 1 21 22 | 1 22 23 | 1 23 24 | 1 24 25 | 1 25 26 | 1 26 27 | 1 27 28 | 1 28 29 | 1 29 30 | 1 30 31 | -------------------------------------------------------------------------------- /sgkit/tests/test_pedigree/pedkin_sim_ped.txt: -------------------------------------------------------------------------------- 1 | 1 1 0 0 1 0 2 | 1 2 0 0 2 0 3 | 1 3 0 0 1 0 4 | 1 4 0 0 2 0 5 | 1 5 0 0 1 0 6 | 1 6 0 0 2 0 7 | 1 7 0 0 1 0 8 | 1 8 0 0 2 0 9 | 1 9 0 0 1 0 10 | 1 10 0 0 2 0 11 | 1 11 7 10 1 0 12 | 1 12 1 4 2 0 13 | 1 13 7 12 1 0 14 | 1 14 1 2 2 0 15 | 1 15 3 10 1 0 16 | 1 16 15 12 2 0 17 | 1 17 9 14 1 0 18 | 1 18 5 10 2 0 19 | 1 19 11 6 1 0 20 | 1 20 9 6 2 0 21 | 1 21 9 16 1 0 22 | 1 22 
15 20 2 0 23 | 1 23 3 16 1 0 24 | 1 24 1 14 2 0 25 | 1 25 19 20 1 0 26 | 1 26 15 14 2 0 27 | 1 27 19 4 1 0 28 | 1 28 1 26 2 0 29 | 1 29 3 18 1 0 30 | 1 30 17 28 2 0 31 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/config.yml: -------------------------------------------------------------------------------- 1 | datasets: 2 | sim_sm_01: 3 | n_contigs: 1 4 | n_covars: 3 5 | n_samples: 50 6 | n_traits: 1 7 | n_variants: 250 8 | sim_sm_02: 9 | n_contigs: 10 10 | n_covars: 3 11 | n_samples: 50 12 | n_traits: 5 13 | n_variants: 250 14 | paramsets: 15 | wgr_01: 16 | alphas: 17 | - 1000 18 | sample_block_size: 10 19 | variant_block_size: 10 20 | wgr_02: 21 | alphas: null 22 | sample_block_size: 10 23 | variant_block_size: 10 24 | runs: 25 | - dataset: sim_sm_01 26 | name: sim_sm_01-wgr_01 27 | paramset: wgr_01 28 | - dataset: sim_sm_02 29 | name: sim_sm_02-wgr_02 30 | paramset: wgr_02 31 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/beta_covariate.csv: -------------------------------------------------------------------------------- 1 | ,Y0000 2 | B-X000,3.764052345967664 3 | B-X001,0.0 4 | B-X002,0.0 5 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/covariates.csv: -------------------------------------------------------------------------------- 1 | sample_id,X000,X001,X002 2 | S0000001,1.764052345967664,0.4001572083672233,0.9787379841057392 3 | S0000002,2.240893199201458,1.8675579901499675,-0.977277879876411 4 | S0000003,0.9500884175255894,-0.1513572082976979,-0.10321885179355784 5 | S0000004,0.41059850193837233,0.144043571160878,1.454273506962975 6 | S0000005,0.7610377251469934,0.12167501649282841,0.44386323274542566 7 | S0000006,0.33367432737426683,1.4940790731576061,-0.20515826376580087 8 | 
S0000007,0.31306770165090136,-0.8540957393017248,-2.5529898158340787 9 | S0000008,0.6536185954403606,0.8644361988595057,-0.7421650204064419 10 | S0000009,2.2697546239876076,-1.4543656745987648,0.04575851730144607 11 | S0000010,-0.1871838500258336,1.5327792143584575,1.469358769900285 12 | S0000011,0.1549474256969163,0.37816251960217356,-0.8877857476301128 13 | S0000012,-1.980796468223927,-0.3479121493261526,0.15634896910398005 14 | S0000013,1.2302906807277207,1.2023798487844113,-0.3873268174079523 15 | S0000014,-0.30230275057533557,-1.0485529650670926,-1.4200179371789752 16 | S0000015,-1.7062701906250126,1.9507753952317897,-0.5096521817516535 17 | S0000016,-0.4380743016111864,-1.2527953600499262,0.7774903558319101 18 | S0000017,-1.6138978475579515,-0.2127402802139687,-0.8954665611936756 19 | S0000018,0.386902497859262,-0.510805137568873,-1.180632184122412 20 | S0000019,-0.028182228338654868,0.42833187053041766,0.06651722238316789 21 | S0000020,0.3024718977397814,-0.6343220936809636,-0.3627411659871381 22 | S0000021,-0.672460447775951,-0.3595531615405413,-0.813146282044454 23 | S0000022,-1.7262826023316769,0.17742614225375283,-0.4017809362082619 24 | S0000023,-1.6301983469660446,0.4627822555257742,-0.9072983643832422 25 | S0000024,0.05194539579613895,0.7290905621775369,0.12898291075741067 26 | S0000025,1.1394006845433007,-1.2348258203536526,0.402341641177549 27 | S0000026,-0.6848100909403132,-0.8707971491818818,-0.5788496647644155 28 | S0000027,-0.31155253212737266,0.05616534222974544,-1.1651498407833565 29 | S0000028,0.9008264869541871,0.46566243973045984,-1.5362436862772237 30 | S0000029,1.4882521937955997,1.8958891760305832,1.1787795711596507 31 | S0000030,-0.17992483581235091,-1.0707526215105425,1.0544517269311366 32 | S0000031,-0.40317694697317963,1.2224450703824274,0.2082749780768603 33 | S0000032,0.9766390364837128,0.3563663971744019,0.7065731681919482 34 | S0000033,0.010500020720820478,1.7858704939058352,0.12691209270361992 35 | 
S0000034,0.40198936344470165,1.8831506970562544,-1.3477590611424464 36 | S0000035,-1.2704849984857336,0.9693967081580112,-1.17312340511416 37 | S0000036,1.9436211856492926,-0.41361898075974735,-0.7474548114407578 38 | S0000037,1.9229420264803847,1.4805147914344243,1.8675589604265699 39 | S0000038,0.9060446582753853,-0.8612256850547025,1.9100649530990337 40 | S0000039,-0.2680033709513804,0.8024563957963952,0.947251967773748 41 | S0000040,-0.1550100930908342,0.6140793703460803,0.9222066715665268 42 | S0000041,0.37642553115562943,-1.0994007905841945,0.298238174206056 43 | S0000042,1.3263858966870303,-0.6945678597313655,-0.14963454032767076 44 | S0000043,-0.43515355172163744,1.8492637284793418,0.6722947570124355 45 | S0000044,0.40746183624111043,-0.7699160744453164,0.5392491912918173 46 | S0000045,-0.6743326606573761,0.03183055827435118,-0.635846078378881 47 | S0000046,0.6764332949464997,0.5765908166149409,-0.20829875557799488 48 | S0000047,0.3960067126616453,-1.0930615087305058,-1.4912575927056055 49 | S0000048,0.4393917012645369,0.16667349537252904,0.6350314368921064 50 | S0000049,2.383144774863942,0.9444794869904138,-0.9128222254441586 51 | S0000050,1.117016288095853,-1.3159074105115212,-0.461584604814709 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/genotypes.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_01/genotypes.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_01/traits.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000 2 | S0000001,-254.72480109910808 3 | S0000002,-230.09027706391151 4 | S0000003,-218.05493689539847 5 | S0000004,-219.5811261299788 6 | 
S0000005,-234.95921431366196 7 | S0000006,-227.22086660345207 8 | S0000007,-257.5205067301366 9 | S0000008,-214.26290999909807 10 | S0000009,-246.80612947384586 11 | S0000010,-221.09857320678782 12 | S0000011,-242.27165206670597 13 | S0000012,-259.2204842996455 14 | S0000013,-246.72979301500683 15 | S0000014,-209.59462849991064 16 | S0000015,-263.1263009837309 17 | S0000016,-254.4080374180435 18 | S0000017,-257.1430792551615 19 | S0000018,-248.76872179158278 20 | S0000019,-240.40214308086217 21 | S0000020,-261.4762675865956 22 | S0000021,-256.7216762782688 23 | S0000022,-255.42224125778205 24 | S0000023,-224.3321512357823 25 | S0000024,-236.59008887723687 26 | S0000025,-252.33028531206247 27 | S0000026,-235.45704328058943 28 | S0000027,-207.56874118449966 29 | S0000028,-246.30202546302775 30 | S0000029,-215.04988433004772 31 | S0000030,-261.991321162662 32 | S0000031,-245.70590215857987 33 | S0000032,-228.11108069502387 34 | S0000033,-251.53285322533534 35 | S0000034,-210.74627747729176 36 | S0000035,-250.5371665715462 37 | S0000036,-228.9568905365803 38 | S0000037,-220.6352585896156 39 | S0000038,-226.25148497511077 40 | S0000039,-247.16395771745957 41 | S0000040,-219.69166073267803 42 | S0000041,-206.30541134510108 43 | S0000042,-211.72156415779116 44 | S0000043,-243.20881457193406 45 | S0000044,-244.71227068608678 46 | S0000045,-267.4724732836421 47 | S0000046,-222.5489850793145 48 | S0000047,-250.73358170555258 49 | S0000048,-235.08853364235185 50 | S0000049,-220.23218045254927 51 | S0000050,-221.99172372182065 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/beta_covariate.csv: -------------------------------------------------------------------------------- 1 | ,Y0000,Y0001,Y0002,Y0003,Y0004 2 | B-X000,3.764052345967664,2.400157208367223,2.9787379841057393,4.240893199201458,3.8675579901499675 3 | B-X001,0.0,0.0,0.0,0.0,0.0 4 | B-X002,0.0,0.0,0.0,0.0,0.0 5 | 
-------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/covariates.csv: -------------------------------------------------------------------------------- 1 | sample_id,X000,X001,X002 2 | S0000001,1.764052345967664,0.4001572083672233,0.9787379841057392 3 | S0000002,2.240893199201458,1.8675579901499675,-0.977277879876411 4 | S0000003,0.9500884175255894,-0.1513572082976979,-0.10321885179355784 5 | S0000004,0.41059850193837233,0.144043571160878,1.454273506962975 6 | S0000005,0.7610377251469934,0.12167501649282841,0.44386323274542566 7 | S0000006,0.33367432737426683,1.4940790731576061,-0.20515826376580087 8 | S0000007,0.31306770165090136,-0.8540957393017248,-2.5529898158340787 9 | S0000008,0.6536185954403606,0.8644361988595057,-0.7421650204064419 10 | S0000009,2.2697546239876076,-1.4543656745987648,0.04575851730144607 11 | S0000010,-0.1871838500258336,1.5327792143584575,1.469358769900285 12 | S0000011,0.1549474256969163,0.37816251960217356,-0.8877857476301128 13 | S0000012,-1.980796468223927,-0.3479121493261526,0.15634896910398005 14 | S0000013,1.2302906807277207,1.2023798487844113,-0.3873268174079523 15 | S0000014,-0.30230275057533557,-1.0485529650670926,-1.4200179371789752 16 | S0000015,-1.7062701906250126,1.9507753952317897,-0.5096521817516535 17 | S0000016,-0.4380743016111864,-1.2527953600499262,0.7774903558319101 18 | S0000017,-1.6138978475579515,-0.2127402802139687,-0.8954665611936756 19 | S0000018,0.386902497859262,-0.510805137568873,-1.180632184122412 20 | S0000019,-0.028182228338654868,0.42833187053041766,0.06651722238316789 21 | S0000020,0.3024718977397814,-0.6343220936809636,-0.3627411659871381 22 | S0000021,-0.672460447775951,-0.3595531615405413,-0.813146282044454 23 | S0000022,-1.7262826023316769,0.17742614225375283,-0.4017809362082619 24 | S0000023,-1.6301983469660446,0.4627822555257742,-0.9072983643832422 25 | S0000024,0.05194539579613895,0.7290905621775369,0.12898291075741067 26 | 
S0000025,1.1394006845433007,-1.2348258203536526,0.402341641177549 27 | S0000026,-0.6848100909403132,-0.8707971491818818,-0.5788496647644155 28 | S0000027,-0.31155253212737266,0.05616534222974544,-1.1651498407833565 29 | S0000028,0.9008264869541871,0.46566243973045984,-1.5362436862772237 30 | S0000029,1.4882521937955997,1.8958891760305832,1.1787795711596507 31 | S0000030,-0.17992483581235091,-1.0707526215105425,1.0544517269311366 32 | S0000031,-0.40317694697317963,1.2224450703824274,0.2082749780768603 33 | S0000032,0.9766390364837128,0.3563663971744019,0.7065731681919482 34 | S0000033,0.010500020720820478,1.7858704939058352,0.12691209270361992 35 | S0000034,0.40198936344470165,1.8831506970562544,-1.3477590611424464 36 | S0000035,-1.2704849984857336,0.9693967081580112,-1.17312340511416 37 | S0000036,1.9436211856492926,-0.41361898075974735,-0.7474548114407578 38 | S0000037,1.9229420264803847,1.4805147914344243,1.8675589604265699 39 | S0000038,0.9060446582753853,-0.8612256850547025,1.9100649530990337 40 | S0000039,-0.2680033709513804,0.8024563957963952,0.947251967773748 41 | S0000040,-0.1550100930908342,0.6140793703460803,0.9222066715665268 42 | S0000041,0.37642553115562943,-1.0994007905841945,0.298238174206056 43 | S0000042,1.3263858966870303,-0.6945678597313655,-0.14963454032767076 44 | S0000043,-0.43515355172163744,1.8492637284793418,0.6722947570124355 45 | S0000044,0.40746183624111043,-0.7699160744453164,0.5392491912918173 46 | S0000045,-0.6743326606573761,0.03183055827435118,-0.635846078378881 47 | S0000046,0.6764332949464997,0.5765908166149409,-0.20829875557799488 48 | S0000047,0.3960067126616453,-1.0930615087305058,-1.4912575927056055 49 | S0000048,0.4393917012645369,0.16667349537252904,0.6350314368921064 50 | S0000049,2.383144774863942,0.9444794869904138,-0.9128222254441586 51 | S0000050,1.117016288095853,-1.3159074105115212,-0.461584604814709 52 | -------------------------------------------------------------------------------- 
/sgkit/tests/test_regenie/dataset/sim_sm_02/genotypes.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_02/genotypes.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets_nocovariate.zarr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets_nocovariate.zarr.zip -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/dataset/sim_sm_02/traits.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000,Y0001,Y0002,Y0003,Y0004 2 | S0000001,-289.36482961044777,-275.55161096990247,-271.5112918054582,-271.35578637967285,-249.9802561342581 3 | S0000002,-280.2617466598422,-253.9720298942713,-273.6459148573497,-282.49078841243517,-241.09338997825571 4 | S0000003,-254.9285482992192,-244.26794278688027,-255.66389861945876,-247.8402123732413,-238.82817113331336 5 | S0000004,-260.4022375715708,-242.13835393304,-255.01612792818455,-268.44036534547627,-250.70721888268997 6 | S0000005,-258.1133404800941,-255.57357980548673,-261.90921405575847,-285.7029592534616,-245.24453454678022 7 | 
S0000006,-265.47689844301016,-244.28013875467363,-264.23891346125174,-266.07770608431025,-228.28301757796825 8 | S0000007,-283.2871761140315,-292.6260787637163,-292.1640603923218,-299.39471029355775,-264.3175954620037 9 | S0000008,-219.49833815684846,-229.40456661041102,-226.29208752330103,-222.02459238865347,-207.13382636978892 10 | S0000009,-279.51270464588004,-283.6975894756584,-273.7096726753412,-280.3554738029803,-259.4180219647341 11 | S0000010,-259.8032291695713,-251.36129374443573,-257.8977581771694,-269.23078726351764,-253.13279293137492 12 | S0000011,-269.9919814699405,-247.93206164429185,-261.7601867956449,-275.0298365515039,-244.21932805458428 13 | S0000012,-285.77393491791605,-275.3890397383568,-274.55380982704963,-284.9299666091712,-254.9928111785241 14 | S0000013,-277.1926809934609,-285.6689827198898,-266.3253403090971,-269.2727324762928,-250.21589155337878 15 | S0000014,-262.4163352438781,-267.4995773197119,-256.5335711240192,-271.9041593346388,-240.7477340675035 16 | S0000015,-287.73560131012886,-298.9007710385901,-279.4414838792006,-288.2026950107911,-261.6035479689111 17 | S0000016,-274.42311861954556,-258.80197244752253,-250.78878192452643,-255.90333311243012,-254.96719346171648 18 | S0000017,-288.78156475051264,-283.69484431811884,-282.0562102999267,-296.18754615077813,-269.81982529752344 19 | S0000018,-267.92458695783586,-274.333207503416,-270.16690545867147,-271.22687144367785,-257.084711352308 20 | S0000019,-259.652239029143,-271.3373968635834,-272.6600069993165,-264.2709560634566,-236.87420837724466 21 | S0000020,-276.432731006855,-265.11250963911374,-291.6925850053143,-287.2029220253524,-257.54740093719437 22 | S0000021,-302.2583816107363,-268.10402480224985,-266.5553387094999,-281.9858242858026,-258.8950016696872 23 | S0000022,-279.8812519534122,-276.5331115117956,-277.2710749855309,-290.26865041089155,-257.7871674979621 24 | S0000023,-259.2277460394356,-243.66122076394095,-245.76668836892475,-256.1007519571617,-241.23568200144058 25 | 
S0000024,-271.15634538304914,-278.2742395123379,-253.89982381679815,-266.3033017713841,-260.81345761229016 26 | S0000025,-289.12221816694904,-288.2213062217917,-299.17923356228863,-317.70814291174673,-270.87527151421574 27 | S0000026,-256.69561439734474,-266.6609715391943,-271.26324726800226,-272.1222328091294,-254.5707542300432 28 | S0000027,-260.47466710051197,-240.05822457504374,-247.51137384414386,-278.822915534984,-232.1627874599786 29 | S0000028,-293.83634241663555,-267.5942462465462,-260.3977803416625,-266.3495161503439,-249.55915696107422 30 | S0000029,-255.7649864858529,-256.6956682332754,-265.0818014497492,-248.47063293488537,-222.06254119680125 31 | S0000030,-292.7603769436315,-288.48829299572657,-272.85558338442576,-302.4533434187978,-264.89426716806315 32 | S0000031,-272.075281401142,-278.9048993798882,-261.982508345537,-266.32711738718075,-251.50346760610367 33 | S0000032,-251.07077191223377,-275.0877046973688,-261.11750205717243,-255.13447427146772,-242.20585453407114 34 | S0000033,-291.8325541138616,-287.63711513064686,-282.20486664981166,-265.826203021402,-264.4394057946944 35 | S0000034,-240.43647998894556,-238.03828868014054,-241.15607912899287,-241.52289541196632,-225.6261811358186 36 | S0000035,-269.6160134380013,-269.95418840713427,-259.3331428408944,-260.835568915071,-248.99036245952718 37 | S0000036,-283.17410117996593,-277.9604314834706,-281.65162507321173,-275.1431456639316,-236.6681812549374 38 | S0000037,-270.73264002439265,-267.19423360129593,-260.76691021417,-256.81109619851037,-230.82345757977177 39 | S0000038,-263.58004859805374,-248.3362358353548,-252.88775092166745,-256.1529742816787,-228.57582977376472 40 | S0000039,-272.1377582391363,-294.10446434385204,-265.3826971259937,-282.08726061350546,-254.95214449897563 41 | S0000040,-244.0143803641179,-238.3050884849436,-247.577635457108,-253.59230615277642,-225.592932356266 42 | S0000041,-248.44496391618878,-230.13158447036224,-232.32370220739398,-239.39271395267718,-218.43116642254466 
43 | S0000042,-230.70463381624643,-226.273342148801,-248.36637606352767,-254.18185949642907,-220.11091567559257 44 | S0000043,-273.4363194314351,-254.1295932375416,-280.1206296032896,-263.7965130748505,-247.54407948763603 45 | S0000044,-250.08973677324587,-280.75695152448344,-252.1930476986954,-273.8235387462245,-255.0579489393799 46 | S0000045,-288.30584497419505,-277.2842374326607,-279.7480695633359,-297.1405472260583,-271.80540976936373 47 | S0000046,-256.8741187019728,-242.220619536689,-228.36044361473154,-230.24503842898756,-226.1419976707234 48 | S0000047,-281.39239364093703,-272.7648883331534,-275.1145231121791,-283.38041504923456,-248.68732803897694 49 | S0000048,-250.56543207518007,-260.88344107712805,-266.7397915037219,-288.52101293377217,-255.29649191317182 50 | S0000049,-250.8053322211505,-254.3889142578776,-251.58362762547668,-251.51574318129602,-237.56303756234547 51 | S0000050,-264.62721748673806,-261.7129193668437,-258.7481379119824,-255.61982190239425,-239.65055065580412 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_01-wgr_01/predictions.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000 2 | S0000001,0.6712206418512147 3 | S0000002,1.0675227946373933 4 | S0000003,0.3281001891790848 5 | S0000004,0.07792072438052661 6 | S0000005,0.2552996199145314 7 | S0000006,0.22526782376240384 8 | S0000007,0.09321159868391961 9 | S0000008,0.31969664357687166 10 | S0000009,0.7484631415574241 11 | S0000010,-0.04301884891150365 12 | S0000011,-0.12676226976916605 13 | S0000012,-0.8045886595639526 14 | S0000013,0.3486980726044114 15 | S0000014,-0.5345076110144904 16 | S0000015,-0.36148164377437925 17 | S0000016,-0.4878276520639582 18 | S0000017,-0.7319003293934547 19 | S0000018,-0.23188330032239363 20 | S0000019,-0.12008692522353148 21 | S0000020,-0.23228033372830584 22 | S0000021,-0.5377574701429808 23 | S0000022,-0.8639491051767361 24 
| S0000023,-0.8415366191036098 25 | S0000024,-0.07839636461289698 26 | S0000025,0.18951125522929352 27 | S0000026,-0.569037828028304 28 | S0000027,-0.38903076208959475 29 | S0000028,0.08970628557699697 30 | S0000029,0.6735764020859089 31 | S0000030,-0.25358601756754817 32 | S0000031,0.04476691419131551 33 | S0000032,0.27895891483490426 34 | S0000033,0.21584900716468297 35 | S0000034,0.31290525875527464 36 | S0000035,-0.20718579770757645 37 | S0000036,0.408707089373321 38 | S0000037,0.6633782570769876 39 | S0000038,0.11939878625988729 40 | S0000039,0.03138343037095803 41 | S0000040,0.03444503439417827 42 | S0000041,-0.1837695563335075 43 | S0000042,0.1060011221467489 44 | S0000043,0.09311322939642908 45 | S0000044,-0.11954146368470456 46 | S0000045,-0.2647727548596352 47 | S0000046,0.1491181301129147 48 | S0000047,-0.19269728742877307 49 | S0000048,0.036116074120831745 50 | S0000049,0.611630749049116 51 | S0000050,-0.044042060654685466 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_01-wgr_01/reduced_blocks_flat.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/result/sim_sm_01-wgr_01/reduced_blocks_flat.csv.gz -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_02-wgr_02/predictions.csv: -------------------------------------------------------------------------------- 1 | sample_id,Y0000,Y0001,Y0002,Y0003,Y0004 2 | S0000001,0.45374320321611866,0.4384553793358616,0.2692125245910143,0.5947202967913898,0.5975712436135826 3 | S0000002,0.3981979032191132,0.5213412372969929,-0.38081352848913874,0.8162941188969363,1.0644121156840316 4 | S0000003,0.28761389591743136,0.14834927965448796,0.12970160532604444,0.34870688419043405,0.3923761073751637 5 | 
S0000004,0.5237540570114866,0.3729791215556517,0.8998382594704468,0.4512697584572887,0.2666976547679285 6 | S0000005,0.3882711007908075,0.262365812438763,0.45600096357874265,0.40959612781192584,0.37754481318684524 7 | S0000006,0.5002068541742315,0.4519055333223547,0.4961881559317826,0.5880647385794258,0.5742797329186986 8 | S0000007,-0.050120684838683496,-0.31098003865765617,-0.49118524618893705,-0.01082592895721236,0.26300114538132086 9 | S0000008,0.3509199180044097,0.27754405527102205,0.1374680521267019,0.46008836105481554,0.5496752408451508 10 | S0000009,0.11755497500258066,-0.03703853409091502,-0.3816524713382645,0.23960844697355968,0.411377907012545 11 | S0000010,0.6999566456601272,0.6345581771142822,1.2594057651645734,0.6565669958786838,0.40319389416846113 12 | S0000011,-0.39764469195427143,-0.21402586638468526,-0.20721917625257816,-0.1989266921107567,-0.21683364253091156 13 | S0000012,-0.5032268326878345,-0.514720795289701,-0.5033526868738674,-0.756449833999863,-0.9810125870606645 14 | S0000013,-0.43716796587146806,-0.09898909240851166,-0.14822990713492612,0.2428228060089122,0.23013561388200385 15 | S0000014,-0.2000575369243856,-0.21823108090123566,-0.04486600391835377,-0.683842471389722,-0.6163148493612544 16 | S0000015,-0.8147850072512632,-0.6948186360042015,-0.8870978332981618,-0.15460919416204447,-0.4524977513386532 17 | S0000016,-0.23446318084410064,-0.14247111244689167,0.00963030563653732,-0.6313989480650365,-0.7300478652398097 18 | S0000017,-0.43722629504209853,-0.49055825470421416,-0.4371099278521622,-0.6960305745944001,-0.8283528779793553 19 | S0000018,-0.23760478198555937,-0.14331440643863386,-0.01827476109328028,-0.3907625159706337,-0.3252337774743284 20 | S0000019,-0.4524300143822794,-0.24682960143824897,-0.26764009022708674,-0.17728958361891117,-0.28629147131853017 21 | S0000020,-0.2256479259004916,-0.10215100433559084,-0.01962565702161251,-0.4022826765823528,-0.38151857255796007 22 | 
S0000021,-0.3910327350435797,-0.3245350157754625,-0.27853736601812795,-0.44382738029595614,-0.5134205253297407 23 | S0000022,-0.38979608421126516,-0.4198451605709752,-0.2696449211896824,-0.5170422020223361,-0.7491195444360533 24 | S0000023,-0.35655551643903965,-0.29736923275296573,-0.20093284218899046,-0.47544578234073537,-0.6581891393390545 25 | S0000024,-0.08770623978970846,-0.13276000303256172,-0.042926244220280615,-0.010938885879558317,-0.09987402556104427 26 | S0000025,-0.22369919290553647,-0.29331837449806203,-0.26876169304844094,-0.16406706931170847,-0.09366979019564187 27 | S0000026,-0.43113107067087575,-0.41924347744348384,-0.3687095403489385,-0.5345322401025586,-0.6077323051357736 28 | S0000027,-0.31671923425616233,-0.18135492115944263,-0.20325757280115372,-0.3195749298248845,-0.3135842352061409 29 | S0000028,-0.16821016031430422,0.08485280283732696,-0.043447535246061014,-0.018910971202518504,0.14252135190646437 30 | S0000029,0.2931403001919625,0.15480978255735667,0.24189957499918444,0.5927053912085299,0.551739262963243 31 | S0000030,-0.293739725712008,-0.5261225630418921,-0.33169926546552025,-0.355204796820649,-0.4907529449624141 32 | S0000031,0.3470118702581462,0.29708534982377605,0.43519825153927866,0.41765794844395393,0.18720076993734178 33 | S0000032,0.3318167512354224,0.22390402851647723,0.16634714550404497,0.3496263576075134,0.3281124859129637 34 | S0000033,0.44750718046198507,0.247512816053288,0.4315668753366107,0.5395368036693038,0.3303910160469289 35 | S0000034,0.41622328503983463,0.5394475323443995,0.5146392873606955,0.5553606926459201,0.40979552834829286 36 | S0000035,0.19991364164501324,0.6248724038935171,0.6265223337755907,0.3273351488744321,0.0028951826724831066 37 | S0000036,0.20293018449655612,0.5254677435558607,0.09575682466456487,0.26025542478773206,0.40378027253483806 38 | S0000037,0.6064421187499152,-0.1118526815564854,0.04597658064221027,0.5911238158854771,0.6422206838586847 39 | 
S0000038,0.21136837143972206,0.037161756652827574,-0.007952513678630087,0.15474909210690133,0.16248768309272021 40 | S0000039,0.3368525586794071,0.166108529390026,0.3264216843469296,0.36246236377831675,0.15356485781461562 41 | S0000040,0.309397876460777,0.17291587830510743,0.28929168222313123,0.34097924184533973,0.15087623793051533 42 | S0000041,0.14380592397674863,0.22759321110326292,-0.07136785852710745,-0.3316379862204535,-0.23385955478283238 43 | S0000042,0.22195461967311647,0.25740467623271074,0.0823696499706631,-0.0936122477002144,0.08424362953109349 44 | S0000043,-0.43298137231403444,-0.5981809394028718,-0.32079056196912537,0.15048545900437957,0.05321960377760775 45 | S0000044,0.11929418010813775,0.12782464901718665,-0.061418346619223026,-0.2520582490946169,-0.1813526262771483 46 | S0000045,-0.2686680377620926,-0.11196177609786492,-0.35826598672762855,-0.31202896706441896,-0.27645414098442067 47 | S0000046,-0.12558303037907784,-0.1553277499978641,-0.10305564670217215,0.038733231821486094,0.13429110392270913 48 | S0000047,0.032923167186658046,0.3368044172825969,-0.1346720258718968,-0.38144757192046075,-0.16625814580045456 49 | S0000048,-0.026908928558184442,-0.12285015254170514,-0.09110967206862998,-0.04780712480923875,-0.02069248105103286 50 | S0000049,0.053886792561909765,-0.030554565718910614,0.1734557366607655,0.40743642020101406,0.6476327872039673 51 | S0000050,0.25566722791774016,0.4211396835494259,0.05589731550765741,-0.2702755586740005,-0.05793637160149775 52 | -------------------------------------------------------------------------------- /sgkit/tests/test_regenie/result/sim_sm_02-wgr_02/reduced_blocks_flat.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/sgkit/tests/test_regenie/result/sim_sm_02-wgr_02/reduced_blocks_flat.csv.gz -------------------------------------------------------------------------------- 
/sgkit/tests/test_testing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pytest 5 | import xarray as xr 6 | 7 | from sgkit.testing import simulate_genotype_call_dataset 8 | 9 | 10 | @pytest.mark.filterwarnings( 11 | "ignore::UserWarning" 12 | ) # codec `vlen-utf8` not in Zarr v3 spec` 13 | def test_simulate_genotype_call_dataset__zarr(tmp_path): 14 | path = str(tmp_path / "ds.zarr") 15 | ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10) 16 | assert "call_genotype_phased" not in ds 17 | ds.to_zarr(path) 18 | xr.testing.assert_equal(ds, xr.open_zarr(path, concat_characters=False)) 19 | 20 | 21 | def test_simulate_genotype_call_dataset__invalid_missing_pct(): 22 | with pytest.raises( 23 | ValueError, match=re.escape("missing_pct must be within [0.0, 1.0]") 24 | ): 25 | simulate_genotype_call_dataset(n_variant=10, n_sample=10, missing_pct=-1.0) 26 | 27 | 28 | def test_simulate_genotype_call_dataset__phased(tmp_path): 29 | ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, phased=True) 30 | assert "call_genotype_phased" in ds 31 | assert np.all(ds["call_genotype_phased"]) 32 | ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, phased=False) 33 | assert "call_genotype_phased" in ds 34 | assert not np.any(ds["call_genotype_phased"]) 35 | 36 | 37 | def test_simulate_genotype_call_dataset__additional_variant_fields(): 38 | ds = simulate_genotype_call_dataset( 39 | n_variant=10, 40 | n_sample=10, 41 | phased=True, 42 | additional_variant_fields={ 43 | "variant_id": str, 44 | "variant_filter": bool, 45 | "variant_quality": np.int8, 46 | "variant_yummyness": np.float32, 47 | }, 48 | ) 49 | assert "variant_id" in ds 50 | assert np.all(ds["variant_id"] == np.arange(10).astype("S")) 51 | assert "variant_filter" in ds 52 | assert ds["variant_filter"].dtype == bool 53 | assert "variant_quality" in ds 54 | assert ds["variant_quality"].dtype == np.int8 55 | 
assert "variant_yummyness" in ds 56 | assert ds["variant_yummyness"].dtype == np.float32 57 | 58 | with pytest.raises(ValueError, match="Unrecognized dtype"): 59 | simulate_genotype_call_dataset( 60 | n_variant=10, 61 | n_sample=10, 62 | phased=True, 63 | additional_variant_fields={ 64 | "variant_id": None, 65 | }, 66 | ) 67 | -------------------------------------------------------------------------------- /sgkit/typing.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Union 3 | 4 | import dask.array as da 5 | import numpy as np 6 | 7 | ArrayLike = Union[np.ndarray, da.Array] 8 | DType = Any 9 | NDArray = Any 10 | PathType = Union[str, Path] 11 | RandomStateType = Union[np.random.RandomState, da.random.RandomState, int] 12 | -------------------------------------------------------------------------------- /validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/__init__.py -------------------------------------------------------------------------------- /validation/gwas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/__init__.py -------------------------------------------------------------------------------- /validation/gwas/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # sgkit gwas validation docker image 2 | FROM jupyter/minimal-notebook:54462805efcb 3 | ENV WORK_DIR=$HOME/work 4 | RUN mkdir $WORK_DIR/repos $WORK_DIR/auth $WORK_DIR/data $WORK_DIR/logs 5 | 6 | USER root 7 | 8 | # Install Hail 9 | RUN mkdir -p /usr/share/man/man1 && \ 10 | apt-get update && apt-get install -y \ 11 | openjdk-8-jre-headless \ 12 | && rm -rf 
/var/lib/apt/lists/* 13 | COPY environment-hail.yml /tmp/ 14 | RUN conda env create -p $CONDA_DIR/envs/hail -f /tmp/environment-hail.yml && \ 15 | conda clean --all -f -y 16 | RUN $CONDA_DIR/envs/hail/bin/pip install hail==0.2.47 17 | RUN $CONDA_DIR/envs/hail/bin/python -m ipykernel install --user --name=hail 18 | 19 | # Install Glow 20 | COPY environment-glow.yml /tmp/ 21 | RUN conda env create -p $CONDA_DIR/envs/glow -f /tmp/environment-glow.yml && \ 22 | conda clean --all -f -y 23 | RUN $CONDA_DIR/envs/glow/bin/pip install glow.py==0.5.0 24 | RUN $CONDA_DIR/envs/glow/bin/python -m ipykernel install --user --name=glow 25 | 26 | 27 | # Install base environment dependencies 28 | COPY environment.yml environment-dev.yml /tmp/ 29 | RUN conda env update -n base --file /tmp/environment.yml 30 | RUN conda env update -n base --file /tmp/environment-dev.yml 31 | 32 | # Install pysnptools separately (does not work as pip install with conda env update) 33 | RUN pip install --no-cache-dir pysnptools==0.4.19 34 | 35 | # Ensure this always occurs last before user switch 36 | RUN fix-permissions $CONDA_DIR && \ 37 | fix-permissions /home/$NB_USER 38 | 39 | USER $NB_UID 40 | 41 | ENV PYTHONPATH="${PYTHONPATH}:$WORK_DIR/repos/sgkit" 42 | ENV PYTHONPATH="${PYTHONPATH}:$WORK_DIR/repos/sgkit-plink" 43 | 44 | ENV OMP_NUM_THREADS=1 45 | ENV MKL_NUM_THREADS=1 46 | ENV OPENBLAS_NUM_THREADS=1 47 | 48 | ARG SPARK_DRIVER_MEMORY=64g 49 | ENV SPARK_DRIVER_MEMORY=$SPARK_DRIVER_MEMORY 50 | 51 | # Set this as needed to avoid https://issues.apache.org/jira/browse/SPARK-29367 52 | # with any pyspark 2.4.x + pyarrow >= 0.15.x 53 | # See: https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x 54 | ENV ARROW_PRE_0_15_IPC_FORMAT=1 -------------------------------------------------------------------------------- /validation/gwas/docker/README.md: -------------------------------------------------------------------------------- 1 
| ### Sgkit GWAS Validation Docker Image 2 | 3 | This image installs a variety of useful dependencies for testing and validating genetic methods, with a focus on GWAS and related QC (e.g. [Hail](https://hail.is/index.html) and [Glow](https://projectglow.io/)). 4 | 5 | #### Build Image 6 | 7 | ```bash 8 | docker build -t sgkit-gwas-validation . 9 | ``` 10 | 11 | #### Run Image 12 | 13 | This will launch a conatiner with jupyter lab accessible at http://localhost:8888 and Spark UI at http://localhost:4040. 14 | 15 | ```bash 16 | # Adjust these as necessary for your setup 17 | DATA_DIR=/tmp # Set data directory to share locally 18 | REPO_DIR=$HOME/repos # Set local (host) repo dir containing sgkit 19 | JUPYTER_TOKEN=orDiAMbliNfI # Jupyter token for login 20 | SPARK_DRIVER_MEMORY=64g 21 | 22 | # Launch ephemeral container (remove `--rm` to persist state) 23 | WORK_DIR=/home/jovyan/work 24 | docker run --rm -ti \ 25 | -e GRANT_SUDO=yes --user=root \ 26 | -p 8888:8888 -p 4040:4040 \ 27 | -e JUPYTER_TOKEN=$JUPYTER_TOKEN \ 28 | -e SPARK_DRIVER_MEMORY=$SPARK_DRIVER_MEMORY \ 29 | -e JUPYTER_ENABLE_LAB=yes \ 30 | -v $DATA_DIR:$WORK_DIR/data \ 31 | -v $REPO_DIR/sgkit:$WORK_DIR/repos/sgkit \ 32 | -v $REPO_DIR/sgkit-plink:$WORK_DIR/repos/sgkit-plink \ 33 | sgkit-gwas-validation 34 | ``` -------------------------------------------------------------------------------- /validation/gwas/docker/environment-dev.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | # CICD dependencies 6 | - black 7 | - flake8 8 | - isort 9 | - mypy 10 | - pre-commit 11 | - pylint 12 | - pytest 13 | - pytest-datadir 14 | - pytest-cov 15 | - hypothesis 16 | # Validation/visualization dependencies 17 | - statsmodels 18 | - python-dotenv 19 | - matplotlib 20 | - rope # for code refactor 21 | - graphviz 22 | - python-graphviz 23 | - scikit-learn 24 | - fire 25 | - invoke 26 | - pyarrow 27 | 
-------------------------------------------------------------------------------- /validation/gwas/docker/environment-glow.yml: -------------------------------------------------------------------------------- 1 | name: glow 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - numpy 7 | - scipy 8 | - nomkl 9 | # See: https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#recommended-pandas-and-pyarrow-versions 10 | # If conflict occurs, downgrade pandas to 0.24.2 and let it upgrade on glow pip install to 0.25.3 11 | - pandas=0.25.3 12 | - pyarrow=0.15.1 13 | - pyspark=2.4.5 14 | - ipython=7.16.1 15 | - ipykernel=5.3.3 16 | - pyyaml=5.3.1 17 | - fire=0.3.1 18 | - pip 19 | - pip: 20 | - typeguard==2.5.0 21 | - nptyping==1.1.0 22 | -------------------------------------------------------------------------------- /validation/gwas/docker/environment-hail.yml: -------------------------------------------------------------------------------- 1 | name: hail 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - ipython=7.16.1 7 | - ipykernel=5.3.3 8 | - pyarrow=0.15.1 9 | - pyyaml=5.3.1 10 | - fire=0.3.1 11 | - pip -------------------------------------------------------------------------------- /validation/gwas/docker/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | - numpy 6 | - xarray 7 | - dask[complete] 8 | - dask-ml 9 | - scipy 10 | - numba 11 | - zarr 12 | -------------------------------------------------------------------------------- /validation/gwas/method/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/__init__.py -------------------------------------------------------------------------------- /validation/gwas/method/hwe/Makefile: 
-------------------------------------------------------------------------------- 1 | all: clean chwe.so 2 | 3 | clean: 4 | rm -f *.o *.so 5 | 6 | chwe.so: chwe.o 7 | gcc -shared -o libchwe.so chwe.o 8 | 9 | chwe.o: chwe.c 10 | gcc -c -Wall -Werror -fpic chwe.c 11 | 12 | 13 | -------------------------------------------------------------------------------- /validation/gwas/method/hwe/README.md: -------------------------------------------------------------------------------- 1 | ## HWE Exact Test Validation 2 | 3 | This validation produces simulated genotype counts and corresponding HWE statistics from the (C) implementation described in [Wigginton et al. 2005](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1199378). 4 | 5 | The `invoke` [tasks](tasks.py) will compile the C code, simulate genotype counts (inputs for unit tests), and attach p values (outputs for unit tests) from the C code to the genotype counts, as a dataframe. 6 | 7 | The [hwe_unit_test.ipynb](hwe_unit_test.ipynb) is only instructive and shows how to debug and possibly extend test cases, perhaps validating inputs/outputs on a scale that wouldn't be included in unit testing. 
8 | 9 | To export the unit test data, all steps can be run as follows: 10 | 11 | ```bash 12 | > invoke compile simulate export 13 | Building reference C library 14 | rm -f *.o *.so 15 | gcc -c -Wall -Werror -fpic chwe.c 16 | gcc -shared -o libchwe.so chwe.o 17 | Build complete 18 | Generating unit test data 19 | Unit test data written to data/sim_01.csv 20 | Exporting test data to /home/jovyan/work/repos/sgkit/sgkit/tests/test_hwe 21 | Clearing test datadir at /home/jovyan/work/repos/sgkit/sgkit/tests/test_hwe 22 | Copying data/sim_01.csv to /home/jovyan/work/repos/sgkit/sgkit/tests/test_hwe/sim_01.csv 23 | Export complete 24 | ``` -------------------------------------------------------------------------------- /validation/gwas/method/hwe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/hwe/__init__.py -------------------------------------------------------------------------------- /validation/gwas/method/hwe/chwe.c: -------------------------------------------------------------------------------- 1 | // Lift from http://csg.sph.umich.edu/abecasis/Exact/snp_hwe.c 2 | #include 3 | #include 4 | 5 | double hwep(int obs_hets, int obs_hom1, int obs_hom2){ 6 | if (obs_hom1 < 0 || obs_hom2 < 0 || obs_hets < 0) 7 | { 8 | printf("FATAL ERROR - SNP-HWE: Current genotype configuration (%d %d %d ) includes a" 9 | " negative count", obs_hets, obs_hom1, obs_hom2); 10 | exit(EXIT_FAILURE); 11 | } 12 | 13 | int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; 14 | int obs_homr = obs_hom1 < obs_hom2 ? 
obs_hom1 : obs_hom2; 15 | 16 | int rare_copies = 2 * obs_homr + obs_hets; 17 | int genotypes = obs_hets + obs_homc + obs_homr; 18 | 19 | double * het_probs = (double *) malloc((size_t) (rare_copies + 1) * sizeof(double)); 20 | if (het_probs == NULL) 21 | { 22 | printf("FATAL ERROR - SNP-HWE: Unable to allocate array for heterozygote probabilities" ); 23 | exit(EXIT_FAILURE); 24 | } 25 | 26 | int i; 27 | for (i = 0; i <= rare_copies; i++) 28 | het_probs[i] = 0.0; 29 | 30 | /* start at midpoint */ 31 | int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); 32 | 33 | /* check to ensure that midpoint and rare alleles have same parity */ 34 | if ((rare_copies & 1) ^ (mid & 1)) 35 | mid++; 36 | 37 | int curr_hets = mid; 38 | int curr_homr = (rare_copies - mid) / 2; 39 | int curr_homc = genotypes - curr_hets - curr_homr; 40 | 41 | het_probs[mid] = 1.0; 42 | double sum = het_probs[mid]; 43 | for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) 44 | { 45 | het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) 46 | / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); 47 | sum += het_probs[curr_hets - 2]; 48 | 49 | /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ 50 | curr_homr++; 51 | curr_homc++; 52 | } 53 | 54 | curr_hets = mid; 55 | curr_homr = (rare_copies - mid) / 2; 56 | curr_homc = genotypes - curr_hets - curr_homr; 57 | for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) 58 | { 59 | het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc 60 | /((curr_hets + 2.0) * (curr_hets + 1.0)); 61 | sum += het_probs[curr_hets + 2]; 62 | 63 | /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ 64 | curr_homr--; 65 | curr_homc--; 66 | } 67 | 68 | for (i = 0; i <= rare_copies; i++) 69 | het_probs[i] /= (sum > 0 ? 
sum : 1e-128); 70 | 71 | /* alternate p-value calculation for p_hi/p_lo 72 | double p_hi = het_probs[obs_hets]; 73 | for (i = obs_hets + 1; i <= rare_copies; i++) 74 | p_hi += het_probs[i]; 75 | 76 | double p_lo = het_probs[obs_hets]; 77 | for (i = obs_hets - 1; i >= 0; i--) 78 | p_lo += het_probs[i]; 79 | 80 | 81 | double p_hi_lo = p_hi < p_lo ? 2.0 * p_hi : 2.0 * p_lo; 82 | */ 83 | 84 | double p_hwe = 0.0; 85 | /* p-value calculation for p_hwe */ 86 | for (i = 0; i <= rare_copies; i++) 87 | { 88 | if (het_probs[i] > het_probs[obs_hets]) 89 | continue; 90 | p_hwe += het_probs[i]; 91 | } 92 | 93 | p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; 94 | 95 | free(het_probs); 96 | 97 | return p_hwe; 98 | } 99 | -------------------------------------------------------------------------------- /validation/gwas/method/hwe/chwe.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/hwe/chwe.o -------------------------------------------------------------------------------- /validation/gwas/method/hwe/invoke.yaml: -------------------------------------------------------------------------------- 1 | tasks: 2 | auto_dash_names: false -------------------------------------------------------------------------------- /validation/gwas/method/hwe/logging.ini: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=console 6 | 7 | [formatters] 8 | keys=console_formatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [handler_console] 15 | level=INFO 16 | class=StreamHandler 17 | formatter=console_formatter 18 | args=(sys.stdout,) 19 | 20 | [formatter_console_formatter] 21 | format=%(asctime)s|%(levelname)s|%(name)s.%(funcName)s:%(lineno)d| %(message)s -------------------------------------------------------------------------------- 
/validation/gwas/method/hwe/tasks.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import glob 3 | import logging 4 | import logging.config 5 | import os 6 | import shutil 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from invoke import task 12 | 13 | logging.config.fileConfig("logging.ini") 14 | logger = logging.getLogger(__name__) 15 | 16 | DEFAULT_SIM_DATADIR = os.getenv("SIM_DATADIR", "data") 17 | DEFAULT_TEST_DATADIR = os.getenv("TEST_DATADIR", "../../../../sgkit/tests/test_hwe") 18 | 19 | 20 | @task 21 | def compile(ctx): 22 | """Build reference implementation C library""" 23 | logger.info("Building reference C library") 24 | ctx.run("make") 25 | logger.info("Build complete") 26 | 27 | 28 | def get_genotype_counts(): 29 | """Generate genotype counts for testing.""" 30 | rs = np.random.RandomState(0) 31 | n, s = 10_000, 50 32 | n_het = np.expand_dims(np.arange(n, step=s) + 1, -1) 33 | frac = rs.uniform(0.3, 0.7, size=(n // s, 2)) 34 | n_hom = frac * n_het 35 | n_hom = n_hom.astype(int) 36 | return pd.DataFrame( 37 | np.concatenate((n_het, n_hom), axis=1), columns=["n_het", "n_hom_1", "n_hom_2"] 38 | ) 39 | 40 | 41 | @task 42 | def simulate(ctx, sim_datadir=DEFAULT_SIM_DATADIR): 43 | """Create inputs and outputs for unit tests.""" 44 | logger.info("Generating unit test data") 45 | libc = ctypes.CDLL("./libchwe.so") 46 | chwep = libc.hwep 47 | chwep.restype = ctypes.c_double 48 | df = get_genotype_counts() 49 | df["p"] = df.apply( 50 | lambda r: chwep(int(r["n_het"]), int(r["n_hom_1"]), int(r["n_hom_2"])), axis=1 51 | ) 52 | output_dir = Path(sim_datadir) 53 | if not output_dir.exists(): 54 | output_dir.mkdir(parents=True, exist_ok=True) 55 | path = output_dir / "sim_01.csv" 56 | df.to_csv(path, index=False) 57 | logger.info(f"Unit test data written to {path}") 58 | 59 | 60 | @task 61 | def export( 62 | ctx, 63 | sim_datadir=DEFAULT_SIM_DATADIR, 64 | 
test_datadir=DEFAULT_TEST_DATADIR, 65 | clear=True, 66 | runs=None, 67 | ): 68 | sim_datadir = Path(sim_datadir) 69 | test_datadir = Path(test_datadir).resolve() 70 | logger.info(f"Exporting test data to {test_datadir}") 71 | if clear and test_datadir.exists(): 72 | logger.info(f"Clearing test datadir at {test_datadir}") 73 | shutil.rmtree(test_datadir) 74 | test_datadir.mkdir(exist_ok=True) 75 | for f in glob.glob(str(sim_datadir / "*.csv")): 76 | src = f 77 | dst = test_datadir / Path(f).name 78 | logger.info(f"Copying {src} to {dst}") 79 | shutil.copy(src, dst) 80 | logger.info("Export complete") 81 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rstudio/r-base:4.0-focal 2 | 3 | # Note: We freeze versions because we want point in time validation 4 | # See: https://github.com/sgkit-dev/sgkit/pull/228 5 | 6 | RUN apt-get update \ 7 | && apt-get install python3 python3-pip git pkg-config -y \ 8 | && rm -rf /var/lib/apt/lists/* 9 | 10 | RUN R -e 'install.packages("https://cran.r-project.org/src/contrib/data.table_1.13.0.tar.gz", type="source", repos=NULL)' 11 | RUN R -e 'install.packages("tictoc", version = "1.0", repos = "http://cran.us.r-project.org")' 12 | RUN R -e 'install.packages("BiocManager", version = "1.30.10", repos = "http://cran.us.r-project.org")' 13 | RUN R -e 'BiocManager::install(version = "3.11", ask = FALSE)' || \ 14 | R -e 'BiocManager::install(version = "3.11", ask = FALSE, force = TRUE)' 15 | RUN R -e 'BiocManager::install("SNPRelate", version = "3.11", ask = FALSE)' 16 | RUN R -e 'BiocManager::install("gdsfmt", version = "3.11", ask = FALSE)' 17 | RUN R -e 'BiocManager::install("GWASTools", version = "3.11", ask = FALSE)' 18 | RUN R -e 'BiocManager::install("GENESIS", version = "3.11", ask = FALSE)' 19 | 
-------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/README.md: -------------------------------------------------------------------------------- 1 | This runs test to validate our implementation gets the same 2 | exact results as the reference R implementation for production 3 | HapMap data. 4 | 5 | This code is scheduled as part of the Github Actions CI. 6 | 7 | To run manually, you need to first download the test data 8 | from `https://storage.googleapis.com/sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip`, 9 | the file size is about 32MB. 10 | 11 | ```bash 12 | wget https://storage.googleapis.com/sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip -P /tmp/ 13 | ./run.sh /tmp/hapmap_JPT_CHB_r23a_filtered.zip 14 | ``` 15 | 16 | `run.sh` will: 17 | * convert plink data to GDS 18 | * run reference [R PC-Relate implementation](pc_relate.R) 19 | * run [our PC-Relate and compare results](validate_pc_relate.py) 20 | 21 | The only requirement is that you have Docker and Bash installed. 
22 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/convert_plink_to_gds.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(gdsfmt) 4 | library(SNPRelate) 5 | 6 | args <- commandArgs(trailingOnly=TRUE) 7 | 8 | if (length(args) < 2) { 9 | stop("usage: ", call.=FALSE) 10 | } 11 | 12 | snpgdsBED2GDS(bed.fn=paste(args[1], ".bed", sep = ""), 13 | bim.fn=paste(args[1], ".bim", sep = ""), 14 | fam.fn=paste(args[1], ".fam", sep = ""), 15 | out.gdsfn=args[2]) 16 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/pc_relate.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(gdsfmt) 4 | library(SNPRelate) 5 | library(GWASTools) 6 | library(GENESIS) 7 | library(tictoc) 8 | 9 | args <- commandArgs(trailingOnly=TRUE) 10 | 11 | if (length(args) < 1) { 12 | stop("usage: ", call.=FALSE) 13 | } 14 | 15 | gds_filepath = args[1] 16 | 17 | tic("KING kinship") 18 | genofile <- snpgdsOpen(gds_filepath) 19 | king_mat <- snpgdsIBDKING(genofile, num.thread=8) 20 | king_mat_2 = kingToMatrix(king_mat) 21 | toc(log = TRUE) 22 | snpgdsClose(genofile) 23 | 24 | reader <- GdsGenotypeReader(gds_filepath, "scan,snp") 25 | geno_data <- GenotypeData(reader) 26 | 27 | tic("PC-AIR") 28 | pcair_result <- pcair(geno_data, 29 | kinobj = king_mat_2, 30 | divobj = king_mat_2) 31 | toc(log = TRUE) 32 | summary(pcair_result) 33 | 34 | write.csv(pcair_result$vectors[,1:2], file = "pcs.csv") 35 | 36 | geno_data <- GenotypeBlockIterator(geno_data) 37 | tic("PC-Relate") 38 | pcrelate_result <- pcrelate(geno_data, 39 | pcs = pcair_result$vectors[,1:2]) 40 | toc(log = TRUE) 41 | 42 | 43 | write.csv(pcrelate_result$kinSelf, "kinself.csv") 44 | write.csv(pcrelate_result$kinBtwn, "kinbtwe.csv") 45 | write.csv(pcair_result$unrels, 
"unrels.csv") 46 | write.csv(pcair_result$rels, "rels.csv") 47 | summary(pcrelate_result) 48 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | 6 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 7 | REPO_ROOT="$( cd "$DIR/../../../.." >/dev/null 2>&1 && pwd )" 8 | 9 | TEST_DATA="$1" 10 | 11 | if [[ -z "$TEST_DATA" ]]; then 12 | echo "usage $0 " >&2 13 | echo "You can download real test data from https://storage.googleapis.com/sgkit-data/validation/hapmap_JPT_CHB_r23a_filtered.zip" >&2 14 | exit 1 15 | fi 16 | 17 | if [[ -z "$RUNNING_IN_SGKIT_PC_RELATE_VALIDATION_DOCKER" ]]; then 18 | # Note: to speed up the process up, we could: 19 | # * build the docker image and push it to the GHCR/GCR 20 | # * or add docker layer caching https://github.com/marketplace/actions/docker-layer-caching 21 | # 22 | # For now, we just build new docker image each time, if we want to run 23 | # validation more often than weekly or just want to get results 24 | # faster we could add any of the above in the future. 25 | echo "Building validation docker image, this will take about ~20 minutes ..." 26 | docker build -t sgkit_pc_relate_validation -f "$DIR/Dockerfile" "$DIR" 27 | docker run --rm \ 28 | -v $DIR:/work \ 29 | -v $REPO_ROOT:/code \ 30 | -v $TEST_DATA:/test_data.zip \ 31 | -e RUNNING_IN_SGKIT_PC_RELATE_VALIDATION_DOCKER=1 \ 32 | sgkit_pc_relate_validation /work/run.sh "$1" 33 | else 34 | echo "Running inside docker, will crunch data ..." 35 | unzip test_data.zip 36 | /work/convert_plink_to_gds.R hapmap_JPT_CHB_r23a_filtered hapmap_JPT_CHB_r23a_filtered.gds 37 | /work/pc_relate.R hapmap_JPT_CHB_r23a_filtered.gds 38 | cp /work/validate_pc_relate.py . 
39 | pip3 install --upgrade '/code[plink]' pytest 40 | PYTHONPATH=/code:$PYTHONPATH pytest ./validate_pc_relate.py 41 | fi 42 | -------------------------------------------------------------------------------- /validation/gwas/method/pc_relate/validate_pc_relate.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import dask.array as da 4 | import numpy as np 5 | import pandas as pd 6 | import xarray as xr 7 | 8 | from sgkit import pc_relate 9 | from sgkit.io.plink import read_plink 10 | from sgkit.variables import sample_pca_projection 11 | 12 | 13 | def test_same_as_the_reference_implementation() -> None: 14 | """ 15 | This test validates that our implementation gets exactly 16 | the same results as the reference R implementation. 17 | """ 18 | 19 | d = Path(__file__).parent 20 | ds = read_plink(path="hapmap_JPT_CHB_r23a_filtered") 21 | 22 | pcs = da.from_array( 23 | pd.read_csv(d.joinpath("pcs.csv").as_posix(), usecols=[1, 2]).to_numpy() 24 | ) 25 | ds[sample_pca_projection] = (("samples", "components"), pcs) 26 | phi = pc_relate(ds).pc_relate_phi.compute() 27 | 28 | n_samples = 90 29 | assert isinstance(phi, xr.DataArray) 30 | assert phi.shape == (n_samples, n_samples) 31 | 32 | # Get genesis/reference results: 33 | genesis_phi = pd.read_csv(d.joinpath("kinbtwe.csv")) 34 | genesis_phi = genesis_phi[["kin"]].to_numpy() 35 | 36 | phi_s = phi.data[np.triu_indices_from(phi.data, 1)] # type: ignore[no-untyped-call] 37 | assert phi_s.size == genesis_phi.size 38 | assert np.allclose(phi_s, genesis_phi.T) 39 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | hail-*.log -------------------------------------------------------------------------------- /validation/gwas/method/regenie/README.md: 
-------------------------------------------------------------------------------- 1 | ## REGENIE Validation 2 | 3 | The scripts in this directory are used to generate data and validate results from a reference implementation, specifically [GloWGR](https://glow.readthedocs.io/en/latest/tertiary/whole-genome-regression.html). 4 | 5 | The general flow for this process is: 6 | 7 | 1. Generate simulated genotypes, covariates, and traits saved as PLINK (via Hail) and pandas DataFrames 8 | 2. Convert PLINK results to Zarr 9 | 3. Run Glow WGR to produce results to compare against 10 | 4. Export a subset of these results and the configuration that defines them to a unit test directory 11 | 12 | *Note*: The initial PLINK output is used for compatibility with the REGENIE C++ application 13 | 14 | All of the above are represented as pyinvoke tasks in [tasks.py](tasks.py). 15 | 16 | The definition of each simulated dataset and parameterizations run against them can be seen in [config.yml](config.yml). 17 | 18 | At time of writing, these commands were used to generate the current test data: 19 | 20 | ```bash 21 | # Build the simulated inputs and outputs 22 | invoke build 23 | # Export select results to build unit tests against 24 | invoke export --runs sim_sm_02-wgr_02 --runs sim_sm_01-wgr_01 25 | ``` 26 | 27 | ### Glow WGR Release 28 | 29 | This validation was run for [glow.py==0.5.0](https://pypi.org/project/glow.py/0.5.0/). At this time, binary traits are not yet supported and the REGENIE implementation hasn't even been officially released. Support for [binary traits should come in the next release](https://github.com/projectglow/glow/issues/256) along with official support at which time this validation should be updated. From that point onward, there is little need to update this data unless either implementation (sgkit or Glow) has been shown to be incorrect. 
30 | 31 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sgkit-dev/sgkit/ddc7076e058f032571afc2e291fb6fc25a5dcc34/validation/gwas/method/regenie/__init__.py -------------------------------------------------------------------------------- /validation/gwas/method/regenie/config.yml: -------------------------------------------------------------------------------- 1 | datasets: 2 | sim_sm_01: 3 | n_variants: 250 4 | n_samples: 50 5 | n_covars: 3 6 | n_contigs: 1 7 | n_traits: 1 8 | sim_sm_02: 9 | n_variants: 250 10 | n_samples: 50 11 | n_covars: 3 12 | n_contigs: 10 13 | n_traits: 5 14 | sim_sm_03: 15 | n_variants: 250 16 | n_samples: 50 17 | n_covars: 3 18 | n_contigs: 10 19 | n_traits: 1 20 | sim_md_01: 21 | n_variants: 1000 22 | n_samples: 250 23 | n_covars: 3 24 | n_contigs: 1 25 | n_traits: 1 26 | paramsets: 27 | wgr_01: 28 | variant_block_size: 10 29 | sample_block_size: 10 30 | alphas: [1000] 31 | wgr_02: 32 | variant_block_size: 10 33 | sample_block_size: 10 34 | alphas: null 35 | wgr_03: 36 | variant_block_size: 100 37 | sample_block_size: 50 38 | alphas: [1000] 39 | runs: 40 | - {dataset: sim_sm_01, paramset: wgr_01, name: sim_sm_01-wgr_01} 41 | - {dataset: sim_sm_02, paramset: wgr_01, name: sim_sm_02-wgr_01} 42 | - {dataset: sim_sm_02, paramset: wgr_02, name: sim_sm_02-wgr_02} 43 | - {dataset: sim_sm_03, paramset: wgr_01, name: sim_sm_03-wgr_01} 44 | - {dataset: sim_md_01, paramset: wgr_03, name: sim_md_01-wgr_01} -------------------------------------------------------------------------------- /validation/gwas/method/regenie/invoke.yaml: -------------------------------------------------------------------------------- 1 | tasks: 2 | auto_dash_names: false 3 | 4 | -------------------------------------------------------------------------------- 
/validation/gwas/method/regenie/logging.ini: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root 3 | 4 | [handlers] 5 | keys=console 6 | 7 | [formatters] 8 | keys=console_formatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=console 13 | 14 | [handler_console] 15 | level=INFO 16 | class=StreamHandler 17 | formatter=console_formatter 18 | args=(sys.stdout,) 19 | 20 | [formatter_console_formatter] 21 | format=%(asctime)s|%(levelname)s|%(name)s.%(funcName)s:%(lineno)d| %(message)s -------------------------------------------------------------------------------- /validation/gwas/method/regenie/sgkit_zarr.py: -------------------------------------------------------------------------------- 1 | #!/opt/conda/bin/python 2 | # coding: utf-8 3 | 4 | import logging 5 | import logging.config 6 | from pathlib import Path 7 | 8 | import fire 9 | import yaml 10 | import zarr 11 | from sgkit_plink import read_plink 12 | 13 | logging.config.fileConfig("logging.ini") 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def run(dataset: str, dataset_dir="data/dataset"): 18 | dataset_dir = Path(dataset_dir) 19 | plink_path = dataset_dir / dataset / "genotypes" 20 | zarr_path = dataset_dir / dataset / "genotypes.zarr.zip" 21 | ds = read_plink(path=plink_path, bim_sep="\t", fam_sep="\t") 22 | # Pre-compute string lengths until this is done: 23 | # https://github.com/sgkit-dev/sgkit-plink/issues/12 24 | ds = ds.compute() 25 | logger.info(f"Loaded dataset {dataset}:") 26 | logger.info("\n" + str(ds)) 27 | store = zarr.ZipStore(zarr_path, mode="w") 28 | ds.to_zarr(store, mode="w") 29 | store.close() 30 | logger.info(f"Conversion to zarr at {zarr_path} successful") 31 | 32 | 33 | def run_from_config(): 34 | with open("config.yml") as fd: 35 | config = yaml.load(fd, Loader=yaml.FullLoader) 36 | for dataset in config["datasets"]: 37 | run(dataset) 38 | 39 | 40 | fire.Fire() 41 | 
-------------------------------------------------------------------------------- /validation/gwas/method/regenie/tasks.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import logging.config 4 | import os 5 | import shutil 6 | from pathlib import Path 7 | 8 | import yaml 9 | from invoke import task 10 | 11 | logging.config.fileConfig("logging.ini") 12 | logger = logging.getLogger(__name__) 13 | 14 | HAILPY = os.environ.get("HAIL_PYTHON_EXECUTABLE", "/opt/conda/envs/hail/bin/python") 15 | GLOWPY = os.environ.get("GLOW_PYTHON_EXECUTABLE", "/opt/conda/envs/glow/bin/python") 16 | BASEPY = os.environ.get("BASE_PYTHON_EXECUTABLE", "/opt/conda/bin/python") 17 | DEFAULT_TEST_DATADIR = os.getenv( 18 | "TEST_DATADIR", str(Path(__file__).parents[4] / "sgkit/tests/test_regenie") 19 | ) 20 | 21 | 22 | def get_config(): 23 | with open("config.yml") as fd: 24 | return yaml.load(fd, Loader=yaml.FullLoader) 25 | 26 | 27 | def filter_config(config, runs): 28 | res = {"datasets": {}, "paramsets": {}, "runs": []} 29 | for run in config["runs"]: 30 | name = run["name"] 31 | if name not in runs: 32 | continue 33 | if run["dataset"] not in res["datasets"]: 34 | res["datasets"][run["dataset"]] = config["datasets"][run["dataset"]] 35 | if run["paramset"] not in res["paramsets"]: 36 | res["paramsets"][run["paramset"]] = config["paramsets"][run["paramset"]] 37 | res["runs"].append(run) 38 | return res 39 | 40 | 41 | @task 42 | def run_simulation(ctx, dataset): 43 | logger.info(f"Running simulation for dataset {dataset}") 44 | ctx.run(f"{HAILPY} hail_sim.py run_from_config {dataset}") 45 | 46 | 47 | @task 48 | def run_simulations(ctx): 49 | config = get_config() 50 | for dataset in config["datasets"]: 51 | run_simulation(ctx, dataset) 52 | 53 | 54 | @task 55 | def run_glow_wgr(ctx, dataset, paramset): 56 | logger.info(f"Running Glow WGR for dataset {dataset}, paramset {paramset}") 57 | ctx.run(f"{GLOWPY} glow_wgr.py 
run_from_config {dataset} {paramset}") 58 | 59 | 60 | @task 61 | def run_plink_to_zarr(ctx): 62 | ctx.run(f"{BASEPY} sgkit_zarr.py run_from_config") 63 | 64 | 65 | @task 66 | def run_all_glow_wgr(ctx): 67 | config = get_config() 68 | for run in config["runs"]: 69 | run_glow_wgr(ctx, run["dataset"], run["paramset"]) 70 | 71 | 72 | def copy_files(src, dst, patterns): 73 | logger.info(f"Copying files from {src} to {dst}") 74 | dst.mkdir(parents=True, exist_ok=True) 75 | files = [Path(f) for pattern in patterns for f in glob.glob(str(src / pattern))] 76 | for f in files: 77 | logger.info(f"\tCopying path: {f}") 78 | if f.is_dir(): 79 | shutil.copytree(f, dst / f.name) 80 | else: 81 | shutil.copy(f, dst) 82 | 83 | 84 | @task(iterable=["runs"]) 85 | def export(ctx, test_datadir=DEFAULT_TEST_DATADIR, clear=True, runs=None): 86 | test_datadir = Path(test_datadir).resolve() 87 | src_datadir = Path("data") 88 | if clear and test_datadir.exists(): 89 | logger.info(f"Clearing test datadir at {test_datadir}") 90 | shutil.rmtree(test_datadir) 91 | test_datadir.mkdir(exist_ok=True) 92 | config = get_config() 93 | if runs is not None: 94 | config = filter_config(config, runs) 95 | # Export datasets 96 | for dataset in config["datasets"]: 97 | dst = test_datadir / "dataset" / dataset 98 | src = src_datadir / "dataset" / dataset 99 | copy_files(src, dst, ["*.csv", "*.csv.gz", "*.zarr.zip"]) 100 | # Export results 101 | for run in config["runs"]: 102 | name = run["name"] 103 | dst = test_datadir / "result" / name 104 | src = src_datadir / "result" / name 105 | copy_files(src, dst, ["*.csv", "*.csv.gz"]) 106 | # Export config 107 | config_path = test_datadir / "config.yml" 108 | with open(config_path, "w") as fd: 109 | yaml.dump(config, fd) 110 | logger.info(f"Config written to {config_path}") 111 | logger.info("Export complete") 112 | 113 | 114 | @task(pre=[run_simulations, run_all_glow_wgr, run_plink_to_zarr]) 115 | def build(ctx): 116 | logger.info("Test data generation complete") 
117 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie_loco_regression/README.md: -------------------------------------------------------------------------------- 1 | This notebook is used to generate validation data for `sgkit.stats.association.regenie_loco_regression`. It generates offsets to pass as a parameter to the function as well as results from GLOW to check it against. 2 | 3 | Follow these steps to start the `.ipynb` notebooks 4 | 5 | 1. Create and activate a conda environment: 6 | 7 | ``` 8 | conda env create -f environment.yml 9 | conda activate glow 10 | ``` 11 | 12 | 2. Find the location of the corresponding pyspark binary, by typing the following commands in a python console: 13 | 14 | ``` 15 | python -c "import pyspark; print(pyspark.__path__)" 16 | ``` 17 | 18 | 3. Start the Jupyter notebook (make sure to replace `/path/to/pyspark` by that from the command above): 19 | 20 | ``` 21 | PYSPARK_DRIVER_PYTHON=jupyter-lab PYSPARK_DRIVER_PYTHON_OPTS="--ip 0.0.0.0 --port 9999 --no-browser" /path/to/pyspark/bin/pyspark --packages io.projectglow:glow-spark3_2.12:1.0.1 --conf spark.hadoop.io.compression.codecs=io.projectglow.sql.util.BGZFCodec 22 | ``` 23 | 24 | If your notebook is running on a remote server and you can't connect directly to port 9999, run the command below to tunnel the remote server's port 9999 to your local host. 
25 | 26 | ``` 27 | ssh -N -L localhost:9999:localhost:9999 @ 28 | ``` 29 | 30 | Note: The notebooks are based on the two `.rst` files provided in the glow repository: `docs/source/tertiary/regression-tests.rst` and `whole-genome-regression.rst` 31 | -------------------------------------------------------------------------------- /validation/gwas/method/regenie_loco_regression/environment.yml: -------------------------------------------------------------------------------- 1 | name: glow 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | dependencies: 6 | - python=3.7 7 | - glow=1.0.1=pyh44b312d_0 8 | - jupyterlab=3 9 | - numpy=1.18.1 10 | - pandas=1.0.1 11 | - pyspark=3.1.2 12 | - xarray 13 | - zarr 14 | --------------------------------------------------------------------------------