├── .circleci └── config.yml ├── .flake8 ├── .github └── workflows │ ├── docker │ ├── buildwheel.sh │ └── shared.env │ ├── docs.yml │ ├── tests.yml │ └── wheels.yml ├── .gitignore ├── .mergify.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── _tsinfermodule.c ├── convert_hdf5.py ├── dev.py ├── docs ├── .gitignore ├── CHANGELOG.md ├── CITATION.md ├── Makefile ├── _config.yml ├── _static │ ├── .README │ ├── P_dom_chr24_phased.vcf.gz │ ├── P_dom_chr24_phased.vcf.gz.tbi │ ├── ancestor_grouping.png │ ├── example_ancestral_state.fa │ ├── example_ancestral_state.fa.fai │ ├── example_data.vcz │ │ ├── .zattrs │ │ ├── .zgroup │ │ ├── .zmetadata │ │ ├── call_genotype │ │ │ ├── .zarray │ │ │ ├── .zattrs │ │ │ └── 0.0.0 │ │ ├── call_genotype_mask │ │ │ ├── .zarray │ │ │ ├── .zattrs │ │ │ └── 0.0.0 │ │ ├── call_genotype_phased │ │ │ ├── .zarray │ │ │ ├── .zattrs │ │ │ └── 0.0 │ │ ├── contig_id │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ │ ├── sample_id │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ │ ├── variant_allele │ │ │ ├── .zarray │ │ │ └── 0.0 │ │ ├── variant_contig │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ │ └── variant_position │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ ├── example_flow.svg │ └── tree_at_1Mb.svg ├── _templates │ └── .README ├── _toc.yml ├── api.rst ├── build.sh ├── cli.rst ├── development.rst ├── file_formats.rst ├── index.md ├── inference.md ├── installation.rst ├── introduction.rst ├── large_scale.md ├── simulation-example.py ├── tsinfer_logo.svg └── usage.md ├── evaluation.py ├── lib ├── .clang-format ├── ancestor_builder.c ├── ancestor_matcher.c ├── avl.c ├── avl.h ├── err.c ├── err.h ├── meson.build ├── object_heap.c ├── object_heap.h ├── subprojects │ ├── README │ ├── tskit.wrap │ └── tskit │ │ ├── .gitignore │ │ ├── CHANGELOG.rst │ │ ├── VERSION.txt │ │ ├── examples │ │ ├── Makefile │ │ ├── api_structure.c │ │ ├── cpp_sorting_example.cpp │ │ ├── error_handling.c │ │ ├── haploid_wright_fisher.c │ │ ├── streaming.c │ │ ├── take_ownership.c │ │ ├── tree_iteration.c │ │ └── tree_traversal.c │ │ ├── meson.build │ │ ├── meson_options.txt │ │ ├── subprojects │ │ └── kastore │ │ │ ├── README.md │ │ │ ├── VERSION.txt │ │ │ ├── kastore.c │ │ │ ├── kastore.h │ │ │ └── meson.build │ │ ├── tests │ │ ├── test_convert.c │ │ ├── test_core.c │ │ ├── test_file_format.c │ │ ├── test_genotypes.c │ │ ├── test_haplotype_matching.c │ │ ├── test_minimal_cpp.cpp │ │ ├── test_stats.c │ │ ├── test_tables.c │ │ ├── test_trees.c │ │ ├── testlib.c │ │ └── testlib.h │ │ ├── tskit.h │ │ └── tskit │ │ ├── convert.c │ │ ├── convert.h │ │ ├── core.c │ │ ├── core.h │ │ ├── genotypes.c │ │ ├── genotypes.h │ │ ├── haplotype_matching.c │ │ ├── haplotype_matching.h │ │ ├── stats.c │ │ ├── stats.h │ │ ├── tables.c │ │ ├── tables.h │ │ ├── trees.c │ │ └── trees.h ├── tests │ └── tests.c ├── tree_sequence_builder.c └── tsinfer.h ├── pyproject.toml ├── requirements ├── CI-docs │ └── requirements.txt ├── CI-tests-complete │ └── requirements.txt ├── CI-tests-conda │ └── requirements.txt └── development.txt ├── setup.cfg ├── setup.py ├── tests ├── conftest.py ├── data │ ├── bugs │ │ └── invalid_pc_ancestor_time.samples │ └── old_formats │ │ └── medium_sd_fixture_0.2.3.samples ├── test_ancestors.py ├── test_cli.py ├── test_evaluation.py ├── test_formats.py ├── test_inference.py ├── test_low_level.py ├── test_provenance.py ├── test_variantdata.py └── tsutil.py ├── tsinfer ├── __init__.py ├── __main__.py ├── algorithm.py 
├── ancestors.py ├── cli.py ├── constants.py ├── eval_util.py ├── exceptions.py ├── formats.py ├── inference.py ├── progress.py ├── provenance.py └── threads.py └── visualisation.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | codecov: codecov/codecov@3.2.4 5 | 6 | jobs: 7 | build: 8 | docker: 9 | - image: cimg/python:3.9 10 | steps: 11 | - checkout 12 | 13 | - restore_cache: 14 | key: tsinfer-{{ .Branch }}-v3 15 | 16 | - run: 17 | name: Install dependencies and set PATH 18 | command: | 19 | sudo apt-get update 20 | sudo apt-get install libgsl-dev libcap-dev libnuma-dev libcunit1-dev \ 21 | libconfig-dev ninja-build valgrind clang python3-pip 22 | # set path persistently https://circleci.com/docs/2.0/env-vars/#setting-path 23 | echo 'export PATH=/home/circleci/.local/bin:$PATH' >> $BASH_ENV 24 | 25 | - run: 26 | name: Install development dependencies 27 | command: | 28 | pyenv global 3.9 29 | pip install -r requirements/CI-tests-complete/requirements.txt --user 30 | pyenv rehash 31 | 32 | - save_cache: 33 | key: tsinfer-{{ .Branch }}-v1 34 | paths: 35 | - "/home/circleci/.local" 36 | 37 | - run: 38 | name: Checkout submodules 39 | command: | 40 | git submodule update --init --recursive 41 | # Write out the status for debugging purposes. Are we checked out at tags? 42 | git submodule status --recursive 43 | 44 | - run: 45 | name: Build the distribution tarball. 46 | command: | 47 | python -m build --sdist 48 | python setup.py check 49 | python -m twine check dist/*.tar.gz --strict 50 | rm dist/* 51 | python -m build 52 | 53 | - run: 54 | name: Install from the distribution tarball 55 | command: | 56 | python -m venv venv 57 | source venv/bin/activate 58 | pip install dist/*.tar.gz 59 | python -c 'import tsinfer; print(tsinfer.__version__)' 60 | 61 | #Also check the wheel 62 | pip uninstall --yes tsinfer 63 | pip install dist/*.whl 64 | python -c 'import tsinfer; print(tsinfer.__version__)' 65 | deactivate 66 | rm -rf venv 67 | 68 | - run: 69 | name: Compile Python 70 | command: | 71 | python setup.py build_ext --inplace 72 | 73 | - run: 74 | name: Run Python tests and upload coverage 75 | command: | 76 | python3 -m pytest --cov=tsinfer --cov-report=xml --cov-branch -xvs tests 77 | rm .coverage 78 | 79 | - codecov/upload: 80 | flags: python 81 | token: CODECOV_TOKEN 82 | 83 | - run: 84 | name: Compile C with gcc 85 | command: | 86 | CFLAGS=--coverage meson lib/ build-gcc 87 | ninja -C build-gcc 88 | 89 | - run: 90 | name: Compile C with clang 91 | command: | 92 | CC=clang CXX=clang++ meson lib/ build-clang 93 | ninja -C build-clang 94 | 95 | - run: 96 | name: Run the low-level tests. 97 | command: | 98 | cd build-gcc 99 | ./tests 100 | 101 | - run: 102 | name: Run gcov manually, as the one used in codecov doesn't work here. 103 | command: | 104 | gcov -pb -o ./build/temp.linux*/ _tsinfermodule.c 105 | cd build-gcc 106 | # TODO should be able to do this with 'find', but it's tricky and opaque. 107 | gcov -pb ./libtsinfer.a.p/ancestor_builder.c.gcno ../lib/ancestor_builder.c 108 | gcov -pb ./libtsinfer.a.p/ancestor_matcher.c.gcno ../lib/ancestor_matcher.c 109 | gcov -pb ./libtsinfer.a.p/tree_sequence_builder.c.gcno ../lib/tree_sequence_builder.c 110 | gcov -pb ./libtsinfer.a.p/object_heap.c.gcno ../lib/object_heap.c 111 | gcov -pb ./libtsinfer.a.p/err.c.gcno ../lib/err.c 112 | cd .. 
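          # A hypothetical 'find'-based version of the explicit per-file gcov calls
          # above, as suggested by the TODO (an untested sketch, left as a comment
          # because the per-file commands are easier to debug when a single file
          # fails to report coverage):
          # find build-gcc/libtsinfer.a.p -name '*.gcno' -exec gcov -pb {} \;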
113 | 114 | - codecov/upload: 115 | flags: C 116 | token: CODECOV_TOKEN 117 | 118 | - run: 119 | name: Valgrind for C tests. 120 | command: | 121 | valgrind --leak-check=full --error-exitcode=1 ./build-gcc/tests 122 | 123 | - run: 124 | name: Run clang-compiled C tests 125 | command: | 126 | ninja -C build-clang test 127 | 128 | 129 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Based directly on Black's recommendations: 3 | # https://black.readthedocs.io/en/stable/the_black_code_style.html#line-length 4 | max-line-length = 81 5 | select = C,E,F,W,B,B950 6 | ignore = E203, E501, W503 7 | -------------------------------------------------------------------------------- /.github/workflows/docker/buildwheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DOCKER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | source "$DOCKER_DIR/shared.env" 4 | 5 | set -e -x 6 | 7 | ARCH=`uname -p` 8 | echo "arch=$ARCH" 9 | 10 | # We're running as root in the docker container so git commands issued by 11 | # setuptools_scm will fail without this: 12 | git config --global --add safe.directory /project 13 | # Fetch the full history as we'll be missing tags otherwise. 14 | git fetch --unshallow 15 | for V in "${PYTHON_VERSIONS[@]}"; do 16 | git reset --hard 17 | git clean -fd 18 | PYBIN=/opt/python/$V/bin 19 | rm -rf build/ # Avoid lib build by narrow Python is used by wide python 20 | # Instead of letting setup.py install a newer numpy we install it here 21 | # using the oldest supported version for ABI compatibility 22 | $PYBIN/python -m venv env 23 | source env/bin/activate 24 | $PYBIN/python -m pip install --upgrade build 25 | SETUPTOOLS_SCM_DEBUG=1 $PYBIN/python -m build 26 | done 27 | 28 | cd dist 29 | for whl in *.whl; do 30 | auditwheel repair "$whl" 31 | rm "$whl" 32 | done -------------------------------------------------------------------------------- /.github/workflows/docker/shared.env: -------------------------------------------------------------------------------- 1 | PYTHON_VERSIONS=( 2 | cp312-cp312 3 | cp311-cp311 4 | cp310-cp310 5 | cp39-cp39 6 | ) 7 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | tags: 8 | - '*' 9 | 10 | env: 11 | COMMIT_EMAIL: ben.jeffery.well+adminbot@gmail.com 12 | MAKE_TARGET: all 13 | OWNER: tskit-dev 14 | REPO: tsinfer 15 | 16 | jobs: 17 | build-deploy-docs: 18 | name: Docs 19 | runs-on: ubuntu-24.04 20 | steps: 21 | - name: Cancel Previous Runs 22 | uses: styfle/cancel-workflow-action@0.12.1 23 | with: 24 | access_token: ${{ github.token }} 25 | 26 | - uses: actions/checkout@v4.2.2 27 | with: 28 | submodules: true 29 | 30 | - uses: actions/setup-python@v5.4.0 31 | with: 32 | python-version: "3.11" 33 | cache: "pip" 34 | 35 | - name: Install deps (one by one to avoid conflict errors) 36 | run: | 37 | pip install --upgrade pip wheel 38 | pip install -r requirements/CI-docs/requirements.txt 39 | sudo apt-get install -y tabix 40 | 41 | - name: Build C module 42 | if: env.MAKE_TARGET 43 | run: | 44 | make $MAKE_TARGET 45 | 46 | - name: Build Docs 47 | run: | 48 | cd docs && make dist 49 | 50 | - name: Trigger docs site rebuild 51 | if: 
github.ref == 'refs/heads/main' 52 | run: | 53 | curl -X POST https://api.github.com/repos/tskit-dev/tskit-site/dispatches \ 54 | -H 'Accept: application/vnd.github.everest-preview+json' \ 55 | -u AdminBot-tskit:${{ secrets.ADMINBOT_TOKEN }} \ 56 | --data '{"event_type":"build-docs"}' 57 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | name: Lint 11 | runs-on: ubuntu-24.04 12 | steps: 13 | - name: Cancel Previous Runs 14 | uses: styfle/cancel-workflow-action@0.12.1 15 | with: 16 | access_token: ${{ github.token }} 17 | - uses: actions/checkout@v4.2.2 18 | - uses: actions/setup-python@v5.4.0 19 | with: 20 | python-version: '3.10' 21 | - name: install clang-format 22 | if: steps.clang_format.outputs.cache-hit != 'true' 23 | run: | 24 | sudo pip install clang-format==6.0.1 25 | sudo ln -s /usr/local/bin/clang-format /usr/local/bin/clang-format-6.0 26 | - uses: pre-commit/action@v3.0.1 27 | 28 | test: 29 | name: Python 30 | runs-on: ${{ matrix.os }} 31 | strategy: 32 | fail-fast: false 33 | matrix: 34 | python: [ 3.9, "3.12" ] 35 | os: [ macos-latest, ubuntu-24.04, windows-latest ] 36 | defaults: 37 | run: 38 | shell: bash 39 | steps: 40 | - name: Cancel Previous Runs 41 | uses: styfle/cancel-workflow-action@0.12.1 42 | with: 43 | access_token: ${{ github.token }} 44 | 45 | - name: Checkout 46 | uses: actions/checkout@v4.2.2 47 | with: 48 | submodules: true 49 | 50 | - name: Cache conda and dependancies 51 | id: cache 52 | uses: actions/cache@v4.2.2 53 | with: 54 | path: ${{ env.CONDA }}/envs 55 | key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.python}}-conda-v5-${{ hashFiles('requirements/CI-tests-conda/requirements.txt')}} 56 | 57 | - name: Install Miniconda with Mamba 58 | uses: conda-incubator/setup-miniconda@v3.1.1 59 | if: steps.cache.outputs.cache-hit != 'true' 60 | with: 61 | activate-environment: anaconda-client-env 62 | python-version: ${{ matrix.python }} 63 | channels: conda-forge 64 | # channel-priority: strict 65 | auto-update-conda: true 66 | # mamba-version: "*" 67 | # use-mamba: true 68 | 69 | - name: Fix windows .profile 70 | if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'windows-latest' 71 | run: | 72 | cp ~/.bash_profile ~/.profile 73 | 74 | # Work around weird issues on OSX possibly caused by mixed compilers 75 | # https://github.com/tskit-dev/tsinfer/issues/376 76 | - name: Install compiler from conda 77 | if: steps.cache.outputs.cache-hit != 'true' 78 | shell: bash -l {0} #We need a login shell to get conda 79 | run: conda install --yes c-compiler 80 | 81 | - name: Install conda deps 82 | if: steps.cache.outputs.cache-hit != 'true' 83 | shell: bash -l {0} #We need a login shell to get conda 84 | run: conda install --yes --file=requirements/CI-tests-conda/requirements.txt 85 | 86 | - name: Install cyvcf2 #Fails if done via conda due to no windows support. 
87 | if: steps.cache.outputs.cache-hit != 'true' && matrix.os != 'windows-latest' 88 | run: | 89 | source ~/.profile 90 | conda activate anaconda-client-env 91 | #Install these by pip so we don't pull in cbgen with conda as it isn't available on 3.12 92 | pip install sgkit==0.9.0 cyvcf2==0.31.1 yarl==1.9.4 aiohttp==3.9.5 requests==2.32.3 93 | 94 | - name: Install sgkit only on windows 95 | if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'windows-latest' 96 | run: | 97 | source ~/.profile 98 | conda activate anaconda-client-env 99 | #Install these by pip so we don't pull in cbgen with conda as it isn't available on 3.12 100 | pip install sgkit==0.9.0 101 | 102 | - name: Build module 103 | run: | 104 | source ~/.profile 105 | conda activate anaconda-client-env 106 | # Use numpy2 to build the module 107 | pip install "numpy>=2" 108 | python setup.py build_ext --inplace 109 | 110 | - name: Run tests 111 | run: | 112 | source ~/.profile 113 | conda activate anaconda-client-env 114 | # Test with numpy<2 for numba 115 | pip install "numpy<2" 116 | python -m pytest -xv 117 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and test wheels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - test 8 | tags: 9 | - '*' 10 | release: 11 | types: [published] 12 | 13 | jobs: 14 | OSX: 15 | runs-on: macos-latest 16 | strategy: 17 | matrix: 18 | python: [3.9, "3.10", 3.11, 3.12] 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4.2.2 22 | with: 23 | submodules: true 24 | - name: Set up Python ${{ matrix.python }} 25 | uses: actions/setup-python@v5.4.0 26 | with: 27 | python-version: ${{ matrix.python }} 28 | - name: Install deps 29 | run: | 30 | pip install --upgrade pip build delocate 31 | - name: Build Wheel 32 | run: | 33 | python -m build --wheel 34 | - name: Delocate to bundle dynamic libs 35 | run: | 36 | delocate-wheel -v dist/*.whl 37 | - name: Upload Wheels 38 | uses: actions/upload-artifact@v4.6.1 39 | with: 40 | name: osx-wheel-${{ matrix.python }} 41 | path: dist 42 | 43 | windows: 44 | runs-on: windows-latest 45 | strategy: 46 | matrix: 47 | python: [3.9, "3.10", 3.11, 3.12] 48 | wordsize: [64] 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v4.2.2 52 | with: 53 | submodules: true 54 | - name: Install deps 55 | env: 56 | PYTHON: "py -${{ matrix.python }}-${{ matrix.wordsize }}" 57 | shell: bash 58 | run: | 59 | set -ex 60 | ${PYTHON} -m pip install --upgrade pip build 61 | - name: Build Wheel 62 | env: 63 | PYTHON: "py -${{ matrix.python }}-${{ matrix.wordsize }}" 64 | shell: bash 65 | run: | 66 | set -ex 67 | ${PYTHON} -m build --wheel 68 | - name: Upload Wheels 69 | uses: actions/upload-artifact@v4.6.1 70 | with: 71 | name: win-wheel-${{ matrix.python }}-${{ matrix.wordsize }} 72 | path: dist 73 | 74 | manylinux: 75 | runs-on: ubuntu-24.04 76 | steps: 77 | - name: Checkout 78 | uses: actions/checkout@v4.2.2 79 | with: 80 | submodules: true 81 | 82 | - name: Set up Python 3.9 83 | uses: actions/setup-python@v5.4.0 84 | with: 85 | python-version: 3.9 86 | 87 | - name: Build sdist 88 | shell: bash 89 | run: | 90 | pip install --upgrade pip build 91 | python -m build --sdist 92 | 93 | - name: Upload sdist 94 | uses: actions/upload-artifact@v4.6.1 95 | with: 96 | name: sdist 97 | path: dist 98 | 99 | - name: Build wheels in docker 100 | shell: bash 101 | run: | 102 | docker run --rm -v `pwd`:/project 
-w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh 103 | 104 | - name: Upload Wheels 105 | uses: actions/upload-artifact@v4.6.1 106 | with: 107 | name: linux-wheels 108 | path: dist/wheelhouse 109 | 110 | OSX-test: 111 | needs: ['OSX'] 112 | runs-on: macos-latest 113 | strategy: 114 | matrix: 115 | python: [3.9, "3.10", 3.11, 3.12] 116 | steps: 117 | - name: Download wheels 118 | uses: actions/download-artifact@v4.2.0 119 | with: 120 | name: osx-wheel-${{ matrix.python }} 121 | - name: Set up Python ${{ matrix.python }} 122 | uses: actions/setup-python@v5.4.0 123 | with: 124 | python-version: ${{ matrix.python }} 125 | - name: Install wheel and test 126 | run: | 127 | python -VV 128 | # Install the local wheel 129 | pip install --no-index --no-deps --find-links=. tsinfer 130 | pip install tsinfer 131 | python -c "import tsinfer" 132 | 133 | windows-test: 134 | needs: ['windows'] 135 | runs-on: windows-latest 136 | strategy: 137 | matrix: 138 | python: [3.9, "3.10", 3.11, 3.12] 139 | wordsize: [64] 140 | steps: 141 | - name: Download wheels 142 | uses: actions/download-artifact@v4.2.0 143 | with: 144 | name: win-wheel-${{ matrix.python }}-${{ matrix.wordsize }} 145 | - name: Set up Python ${{ matrix.python }} 146 | uses: actions/setup-python@v5.4.0 147 | with: 148 | python-version: ${{ matrix.python }} 149 | - name: Install wheel and test 150 | run: | 151 | python -VV 152 | #patch-ng required to build lmdb 153 | pip install patch-ng 154 | # Install the local wheel 155 | pip install --no-index --no-deps --find-links=. tsinfer 156 | pip install tsinfer 157 | python -c "import tsinfer" 158 | 159 | manylinux-test: 160 | runs-on: ubuntu-24.04 161 | needs: ['manylinux'] 162 | strategy: 163 | matrix: 164 | python: [3.9, "3.10", 3.11, 3.12] 165 | steps: 166 | - name: Download wheels 167 | uses: actions/download-artifact@v4.2.0 168 | with: 169 | name: linux-wheels 170 | - name: Set up Python 171 | uses: actions/setup-python@v5.4.0 172 | with: 173 | python-version: ${{ matrix.python }} 174 | - name: Install wheel and test 175 | run: | 176 | python -VV 177 | # Install the local wheel 178 | pip install --no-index --no-deps --find-links=. tsinfer 179 | pip install tsinfer 180 | python -c "import tsinfer" 181 | 182 | 183 | PyPI_Upload: 184 | runs-on: ubuntu-24.04 185 | environment: release 186 | needs: ['windows-test', 'OSX-test', 'manylinux-test'] 187 | permissions: 188 | id-token: write 189 | steps: 190 | - name: Download all 191 | uses: actions/download-artifact@v4.2.0 192 | - name: Move to dist 193 | run: | 194 | mkdir dist 195 | cp */*.{whl,gz} dist/. 
196 | - name: Publish distribution to Test PyPI 197 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 198 | uses: pypa/gh-action-pypi-publish@v1.12.4 199 | with: 200 | repository_url: https://test.pypi.org/legacy/ 201 | - name: Publish distribution to PRODUCTION PyPI 202 | if: github.event_name == 'release' 203 | uses: pypa/gh-action-pypi-publish@v1.12.4 204 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | local_settings.py 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # OS X stuff 63 | .DS_Store 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | *.svg 94 | tsinfer/_version.py 95 | 96 | # Mac OS 97 | .DS_Store 98 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: default 3 | queue_conditions: 4 | - "-merged" 5 | - "#approved-reviews-by>=1" 6 | - "#changes-requested-reviews-by=0" 7 | - base=main 8 | - label=AUTOMERGE-REQUESTED 9 | - status-success=Lint 10 | - status-success=Python (3.9, macos-latest) 11 | - status-success=Python (3.12, macos-latest) 12 | - status-success=Python (3.9, ubuntu-24.04) 13 | - status-success=Python (3.12, ubuntu-24.04) 14 | - status-success=Python (3.9, windows-latest) 15 | - status-success=Python (3.12, windows-latest) 16 | - "status-success=ci/circleci: build" 17 | merge_conditions: 18 | - "#approved-reviews-by>=1" 19 | - "#changes-requested-reviews-by=0" 20 | - status-success=Lint 21 | - status-success=Python (3.9, macos-latest) 22 | - status-success=Python (3.12, macos-latest) 23 | - status-success=Python (3.9, ubuntu-24.04) 24 | - status-success=Python (3.12, ubuntu-24.04) 25 | - status-success=Python (3.9, windows-latest) 26 | - status-success=Python (3.12, windows-latest) 27 | - "status-success=ci/circleci: build" 28 | merge_method: rebase 29 | update_method: rebase 30 | 31 | pull_request_rules: 32 | - name: Remove label after merge 33 | conditions: 34 | - merged 35 | - label=AUTOMERGE-REQUESTED 36 | actions: 37 | 
label: 38 | remove: 39 | - AUTOMERGE-REQUESTED 40 | - name: refactored queue action rule 41 | conditions: [] 42 | actions: 43 | queue: 44 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: debug-statements 7 | - id: mixed-line-ending 8 | - id: check-case-conflict 9 | - id: check-yaml 10 | - repo: https://github.com/benjeffery/pre-commit-clang-format 11 | rev: '1.0' 12 | hooks: 13 | - id: clang-format 14 | exclude: avl 15 | verbose: true 16 | - repo: https://github.com/asottile/reorder_python_imports 17 | rev: v3.14.0 18 | hooks: 19 | - id: reorder-python-imports 20 | args: [ --unclassifiable-application-module=_tsinfer ] 21 | - repo: https://github.com/asottile/pyupgrade 22 | rev: v3.19.1 23 | hooks: 24 | - id: pyupgrade 25 | args: [ --py3-plus, --py39-plus ] 26 | - repo: https://github.com/psf/black 27 | rev: 25.1.0 28 | hooks: 29 | - id: black 30 | language_version: python3 31 | - repo: https://github.com/asottile/blacken-docs 32 | rev: 1.19.1 33 | hooks: 34 | - id: blacken-docs 35 | args: [--skip-errors] 36 | additional_dependencies: [black==22.3.0] 37 | language_version: python3 38 | - repo: https://github.com/pycqa/flake8 39 | rev: 7.1.2 40 | hooks: 41 | - id: flake8 42 | args: [--config=.flake8] 43 | additional_dependencies: ["flake8-bugbear==22.10.27", "flake8-builtins==2.0.1"] -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | (sec_citation)= 2 | 3 | # Citing tsinfer 4 | 5 | If you use `tsinfer` in your work, please cite the 6 | [2019 Nature Genetics paper](): 7 | 8 | > Jerome Kelleher, Yan Wong, Anthony W. Wohns, 9 | > Chaimaa Fadil, Patrick K. Albers & Gil McVean (2019) 10 | > *Inferring whole-genome histories in large population datasets*, 11 | > Nature Genetics, Volume 51, 1330–1338. https://doi.org/10.1038/s41588-019-0483-y 12 | 13 | Bibtex record: 14 | 15 | ```bibtex 16 | 17 | @article{Kelleher2019, 18 | doi = {10.1038/s41588-019-0483-y}, 19 | url = {https://doi.org/10.1038/s41588-019-0483-y}, 20 | year = {2019}, 21 | month = sep, 22 | publisher = {Springer Science and Business Media {LLC}}, 23 | volume = {51}, 24 | number = {9}, 25 | pages = {1330--1338}, 26 | author = {Jerome Kelleher and Yan Wong and Anthony W. Wohns and Chaimaa Fadil and Patrick K. 
Albers and Gil McVean}, 27 | title = {Inferring whole-genome histories in large population datasets}, 28 | journal = {Nature Genetics} 29 | } -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include lib/*.h 2 | include lib/subprojects/tskit/c/*.h 3 | include lib/subprojects/tskit/c/tskit/*.h 4 | include lib/subprojects/tskit/c/subprojects/kastore/*.h 5 | include README.txt 6 | include LICENSE 7 | recursive-include tests *.py 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC?=gcc 2 | CFLAGS=-std=c99 -g -O3 -march=native -funroll-loops -ffast-math \ 3 | # -ftree-vectorize \ 4 | # -ftree-vectorizer-verbose=6 \ 5 | # -fopt-info-vec-missed 6 | 7 | all: _tsinfer.cpython-34m.so 8 | 9 | _tsinfer.cpython-34m.so: _tsinfermodule.c 10 | CC="${CC}" CFLAGS="${CFLAGS}" python setup.py build_ext --inplace 11 | 12 | ctags: 13 | ctags lib/*.c lib/*.h tsinfer/*.py 14 | 15 | clean: 16 | rm -f *.so *.o tags 17 | rm -fR build 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tsinfer 2 | 3 | [![CircleCI](https://circleci.com/gh/tskit-dev/tsinfer.svg?style=svg)](https://circleci.com/gh/tskit-dev/tsinfer) [![Build Status](https://travis-ci.org/tskit-dev/tsinfer.svg?branch=main)](https://travis-ci.org/tskit-dev/tsinfer) [![Docs Build](https://github.com/tskit-dev/tsinfer/actions/workflows/docs.yml/badge.svg)](https://tskit.dev/tsinfer/docs/stable/introduction.html) [![codecov](https://codecov.io/gh/tskit-dev/tsinfer/branch/main/graph/badge.svg)](https://codecov.io/gh/tskit-dev/tsinfer) 4 | 5 | 6 | Infer a tree sequence from genetic variation data 7 | 8 | The [documentation](https://tskit.dev/tsinfer/docs/latest) contains details of how to use this software, including [installation instructions](https://tskit.dev/tsinfer/docs/latest/installation.html). 9 | 10 | The initial algorithm, its rationale, and results from testing on simulated and real data are described in the following [Nature Genetics paper](https://doi.org/10.1038/s41588-019-0483-y): 11 | 12 | > Jerome Kelleher, Yan Wong, Anthony W Wohns, Chaimaa Fadil, Patrick K Albers and Gil McVean (2019) *Inferring whole-genome histories in large population datasets*. Nature Genetics **51**: 1330-1338 13 | 14 | _Tsinfer_ versions [0.2.0](https://github.com/tskit-dev/tsinfer/releases/tag/0.2.0) onwards allow missing data and provide a fully parameterised Li & Stephens matching algorithm (i.e. which allows mismatch). These improvements are described in the 15 | following [Science paper](https://doi.org/10.1126/science.abi8264): 16 | 17 | > Anthony Wilder Wohns, Yan Wong, Ben Jeffery, Ali Akbari, Swapan Mallick, Ron Pinhasi, Nick Patterson, David Reich, Jerome Kelleher, and Gil McVean (2022) A unified genealogy of modern and ancient genomes. Science 375: eabi8264 18 | 19 | Please cite either or both of these if you use ``tsinfer`` in your work. Code to reproduce the results in the first paper is present in a [separate GitHub repository](https://github.com/mcveanlab/treeseq-inference). 20 | 21 | Note that `tsinfer` does not attempt to infer node times (i.e. branch lengths of the 22 | inferred trees). 
If you require a tree sequence where the dates of common ancestors 23 | are expressed in calendar or generation times, you should post-process the ``tsinfer`` 24 | output using software such as [``tsdate``](https://github.com/tskit-dev/tsdate). 25 | -------------------------------------------------------------------------------- /convert_hdf5.py: -------------------------------------------------------------------------------- 1 | # Simple script to convert input data into HDF5 format so that 2 | # we can feed it into the C development CLI. 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | 8 | import tsinfer 9 | 10 | 11 | def main(infile, outfile): 12 | sample_data = tsinfer.SampleData.load(infile) 13 | print(sample_data) 14 | shape = (sample_data.num_inference_sites, sample_data.num_samples) 15 | G = np.empty(shape, dtype=np.int8) 16 | for j, (_, genotypes) in enumerate(sample_data.genotypes(inference_sites=True)): 17 | G[j] = genotypes 18 | with h5py.File(outfile, "w") as root: 19 | root["haplotypes"] = G.T 20 | 21 | 22 | if __name__ == "__main__": 23 | main(sys.argv[1], sys.argv[2]) 24 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | notebook-simulation.trees 2 | notebook-simulation.samples 3 | notebook-simulation-source.trees 4 | notebook-simulation.vc* 5 | notebook-simulation-AA.npy 6 | P_dom_chr24_phased.samples 7 | sparrows.vcz 8 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /docs/CITATION.md: -------------------------------------------------------------------------------- 1 | ../CITATION.md -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Need to set PYTHONPATH so that we pick up the local tsinfer 3 | PYPATH=$(shell pwd)/../ 4 | TSINF_VERSION:=$(shell PYTHONPATH=${PYPATH} \ 5 | python -c 'import tsinfer; print(tsinfer.__version__.split("+")[0])') 6 | 7 | BUILDDIR = _build 8 | 9 | all: dev 10 | 11 | dev: 12 | PYTHONPATH=${PYPATH} ./build.sh 13 | 14 | dist: 15 | @echo Building distribution for tskit version ${TSINF_VERSION} 16 | sed -i -e s/__TSINFER_VERSION__/${TSINF_VERSION}/g _config.yml 17 | PYTHONPATH=${PYPATH} ./build.sh 18 | 19 | clean: 20 | rm -fR $(BUILDDIR) 21 | rm -rf _static/example_data.vcz/ancestral_state 22 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: Tsinfer manual 5 | author: Tskit Developers 6 | copyright: "2018" 7 | only_build_toc_files: true 8 | logo: tsinfer_logo.svg 9 | 10 | execute: 11 | execute_notebooks: cache 12 | 13 | launch_buttons: 14 | binderhub_url: "" 15 | 16 | repository: 17 | url: https://github.com/tskit-dev/tsinfer 18 | branch: main 19 | path_to_book: docs 20 | 21 | html: 22 | use_issues_button: true 23 | use_repository_button: true 24 | use_edit_page_button: true 25 | # Do not edit this - the version placeholder is replaced by the 26 | # current version during a distribution build in the Makefile 
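  # (the substitution itself is performed by the `dist` target in docs/Makefile,
  #  roughly: sed -i -e s/__TSINFER_VERSION__/${TSINF_VERSION}/g _config.yml)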
27 | extra_navbar: tsinfer __TSINFER_VERSION__ 28 | extra_footer: tsinfer __TSINFER_VERSION__ 29 | 30 | sphinx: 31 | extra_extensions: 32 | - sphinx.ext.autodoc 33 | - sphinx.ext.autosummary 34 | - sphinx.ext.todo 35 | - sphinx.ext.viewcode 36 | - sphinx.ext.intersphinx 37 | - sphinx_issues 38 | - sphinxarg.ext 39 | - IPython.sphinxext.ipython_console_highlighting 40 | 41 | config: 42 | html_theme: sphinx_book_theme 43 | html_theme_options: 44 | pygments_dark_style: monokai 45 | pygments_style: monokai 46 | myst_enable_extensions: 47 | - colon_fence 48 | - deflist 49 | issues_github_path: tskit-dev/tsinfer 50 | todo_include_todos: true 51 | intersphinx_mapping: 52 | python: ["https://docs.python.org/3/", null] 53 | tskit: ["https://tskit.dev/tskit/docs/stable", null] 54 | msprime: ["https://tskit.dev/msprime/docs/stable", null] 55 | tutorials: ["https://tskit.dev/tutorials/", null] 56 | numpy: ["https://numpy.org/doc/stable/", null] 57 | numcodecs: ["https://numcodecs.readthedocs.io/en/stable/", null] 58 | zarr: ["https://zarr.readthedocs.io/en/stable/", null] 59 | nitpicky: true 60 | 61 | autodoc_member_order: bysource 62 | 63 | # Without this option, autodoc tries to put links for all return types 64 | # in terms of the fully-qualified classnames 65 | # (e.g. msprime.demography.Demography) which we don't want, and also 66 | # leads to broken links and nitpick failures. So, until we tackle 67 | # typehints fully, this is the simplest approach. 68 | autodoc_typehints: none 69 | 70 | # Note we have to use the regex version here because of 71 | # https://github.com/sphinx-doc/sphinx/issues/9748 72 | nitpick_ignore_regex: [ 73 | [ "py:class", "arraylike" ], 74 | [ "py:class", "array_like" ], 75 | [ "py:class", "array" ], 76 | [ "py:class", "dtype=float64" ], 77 | [ "py:class", "dtype=uint32" ], 78 | [ "py:class", "dtype=int8" ], 79 | [ "py:class", "iter" ], 80 | ] 81 | -------------------------------------------------------------------------------- /docs/_static/.README: -------------------------------------------------------------------------------- 1 | Placeholder file to make git store this directory. 
2 | -------------------------------------------------------------------------------- /docs/_static/P_dom_chr24_phased.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/docs/_static/P_dom_chr24_phased.vcf.gz -------------------------------------------------------------------------------- /docs/_static/P_dom_chr24_phased.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/docs/_static/P_dom_chr24_phased.vcf.gz.tbi -------------------------------------------------------------------------------- /docs/_static/ancestor_grouping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/docs/_static/ancestor_grouping.png -------------------------------------------------------------------------------- /docs/_static/example_ancestral_state.fa: -------------------------------------------------------------------------------- 1 | >chr1 2 | nnnnnnnnnnnnnnnGnnnnnnnnnnnnnnnnnnnnnnnnnnnGnnnnnCnnnnTnnnnnnnnnnnnnnnCnnnAnnnnnnnnnTnnnnnnnnnAnnnn -------------------------------------------------------------------------------- /docs/_static/example_ancestral_state.fa.fai: -------------------------------------------------------------------------------- 1 | chr1 99 6 99 99 2 | -------------------------------------------------------------------------------- /docs/_static/example_data.vcz/.zattrs: -------------------------------------------------------------------------------- 1 | { 2 | "contigs": [ 3 | "0" 4 | ], 5 | "source": "sgkit-0.9.0" 6 | } -------------------------------------------------------------------------------- /docs/_static/example_data.vcz/.zgroup: -------------------------------------------------------------------------------- 1 | { 2 | "zarr_format": 2 3 | } -------------------------------------------------------------------------------- /docs/_static/example_data.vcz/.zmetadata: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | ".zattrs": { 4 | "contigs": [ 5 | "0" 6 | ], 7 | "source": "sgkit-0.9.0" 8 | }, 9 | ".zgroup": { 10 | "zarr_format": 2 11 | }, 12 | "call_genotype/.zarray": { 13 | "chunks": [ 14 | 8, 15 | 3, 16 | 2 17 | ], 18 | "compressor": { 19 | "blocksize": 0, 20 | "clevel": 5, 21 | "cname": "lz4", 22 | "id": "blosc", 23 | "shuffle": 1 24 | }, 25 | "dtype": "|i1", 26 | "fill_value": null, 27 | "filters": null, 28 | "order": "C", 29 | "shape": [ 30 | 8, 31 | 3, 32 | 2 33 | ], 34 | "zarr_format": 2 35 | }, 36 | "call_genotype/.zattrs": { 37 | "_ARRAY_DIMENSIONS": [ 38 | "variants", 39 | "samples", 40 | "ploidy" 41 | ], 42 | "comment": "Call genotype. 
Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.", 43 | "mixed_ploidy": false 44 | }, 45 | "call_genotype_mask/.zarray": { 46 | "chunks": [ 47 | 8, 48 | 3, 49 | 2 50 | ], 51 | "compressor": { 52 | "blocksize": 0, 53 | "clevel": 5, 54 | "cname": "lz4", 55 | "id": "blosc", 56 | "shuffle": 1 57 | }, 58 | "dtype": "|i1", 59 | "fill_value": null, 60 | "filters": null, 61 | "order": "C", 62 | "shape": [ 63 | 8, 64 | 3, 65 | 2 66 | ], 67 | "zarr_format": 2 68 | }, 69 | "call_genotype_mask/.zattrs": { 70 | "_ARRAY_DIMENSIONS": [ 71 | "variants", 72 | "samples", 73 | "ploidy" 74 | ], 75 | "comment": "A flag for each call indicating which values are missing.", 76 | "dtype": "bool" 77 | }, 78 | "call_genotype_phased/.zarray": { 79 | "chunks": [ 80 | 8, 81 | 3 82 | ], 83 | "compressor": { 84 | "blocksize": 0, 85 | "clevel": 5, 86 | "cname": "lz4", 87 | "id": "blosc", 88 | "shuffle": 1 89 | }, 90 | "dtype": "|i1", 91 | "fill_value": null, 92 | "filters": null, 93 | "order": "C", 94 | "shape": [ 95 | 8, 96 | 3 97 | ], 98 | "zarr_format": 2 99 | }, 100 | "call_genotype_phased/.zattrs": { 101 | "_ARRAY_DIMENSIONS": [ 102 | "variants", 103 | "samples" 104 | ], 105 | "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.", 106 | "dtype": "bool" 107 | }, 108 | "contig_id/.zarray": { 109 | "chunks": [ 110 | 1 111 | ], 112 | "compressor": { 113 | "blocksize": 0, 114 | "clevel": 5, 115 | "cname": "lz4", 116 | "id": "blosc", 117 | "shuffle": 1 118 | }, 119 | "dtype": "`. There are two 14 | equivalent ways to invoke this program: 15 | 16 | .. code-block:: bash 17 | 18 | $ tsinfer 19 | 20 | or 21 | 22 | .. code-block:: bash 23 | 24 | $ python3 -m tsinfer 25 | 26 | The first form is more intuitive and works well most of the time. The second 27 | form is useful when multiple versions of Python are installed or if the 28 | :command:`tsinfer` executable is not installed on your path. 29 | 30 | The :command:`tsinfer` program has five subcommands: :command:`list` prints a 31 | summary of the data held in one of tsinfer's :ref:`file formats `; 32 | :command:`infer` runs the complete :ref:`inference process ` for a given 33 | input SampleData file; and 34 | :command:`generate-ancestors`, :command:`match-ancestors` and 35 | :command:`match-samples` run the three parts of this inference 36 | process as separate steps. Running the inference as separate steps like this 37 | is recommended for large inferences as it allows for greater control over 38 | the inference process. 39 | 40 | ++++++++++++++++ 41 | Argument details 42 | ++++++++++++++++ 43 | 44 | .. argparse:: 45 | :module: tsinfer 46 | :func: get_cli_parser 47 | :prog: tsinfer 48 | :nodefault: 49 | 50 | -------------------------------------------------------------------------------- /docs/development.rst: -------------------------------------------------------------------------------- 1 | .. _sec_development: 2 | 3 | ======================= 4 | Developer documentation 5 | ======================= 6 | 7 | .. todo:: Write developer documentation. 8 | -------------------------------------------------------------------------------- /docs/file_formats.rst: -------------------------------------------------------------------------------- 1 | .. 
_sec_file_formats: 2 | 3 | ============ 4 | File formats 5 | ============ 6 | 7 | ``tsinfer`` uses the excellent `zarr library `_ 8 | to encode data in a form that is both compact and efficient to process. 9 | See the :ref:`API documentation ` for details on 10 | how to construct and manipulate these files using Python. The 11 | :ref:`tsinfer list ` command provides a way to print out a 12 | summary of these files. 13 | 14 | 15 | .. _sec_file_formats_ancestors: 16 | 17 | ************** 18 | Ancestors File 19 | ************** 20 | 21 | The ancestors file contains the ancestral haplotype data inferred from the 22 | sample data in the :ref:`sec_inference_generate_ancestors` step. 23 | 24 | .. todo:: Document the structure of the ancestors file. 25 | 26 | 27 | .. _sec_file_formats_tree_sequences: 28 | 29 | ************** 30 | Tree sequences 31 | ************** 32 | 33 | The goal of ``tsinfer`` is to infer correlated genealogies from variation 34 | data, and it uses the very efficient `succinct tree sequence 35 | `_ data structure 36 | to encode this output. Please see the `tskit documentation 37 | `_ for details on how to 38 | process and manipulate such tree sequences. 39 | 40 | The intermediate ``.ancestors.trees`` file produced by the 41 | :ref:`sec_inference_match_ancestors` step is also a 42 | tree sequence and can be loaded and analysed using the 43 | `tskit API `_. 44 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to tsinfer's documentation! 2 | 3 | This is the documentation for {program}`tsinfer`, a method for inferring correlated 4 | genealogies (a.k.a. tree sequence or ARGs) from genetic variation data. 5 | 6 | Besides this manual, there are a number of other resources 7 | available for learning about {program}`tskit` and {program}`tsinfer`: 8 | 9 | - The [tskit tutorials](https://tskit.dev/tutorials) site contains 10 | in-depth tutorials on analysis of the {program}`tskit` tree sequences produced by 11 | {program}`tsinfer`. 12 | 13 | - Our [Discussions board](https://github.com/tskit-dev/tsinfer/discussions) 14 | is a great place to ask questions like "how do I do X" or "what's the best 15 | way to do Y". Please make questions as clear as possible, and be respectful, 16 | helpful, and kind. 17 | 18 | ```{important} 19 | If you use {program}`tsinfer` in your work, please remember to 20 | cite it appropriately: see the {ref}`citations` page 21 | for details. 22 | ``` 23 | 24 | 25 | ## Contents 26 | 27 | ```{tableofcontents} 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _sec_installation: 2 | 3 | ############ 4 | Installation 5 | ############ 6 | 7 | Python 3.9 or newer is required for ``tsinfer``. Any Unix-like platform should 8 | work (``tsinfer`` is tested on Linux, OS X, and Windows). 
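Whichever installation route you choose below, a quick way to confirm that the
install worked is to import the package and print its version (the same check
used in this project's continuous integration), for example::

    $ python3 -c "import tsinfer; print(tsinfer.__version__)"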
9 | 10 | *************** 11 | Binary packages 12 | *************** 13 | 14 | The most reliable way to install ``tsinfer`` is to install the binary conda package: 15 | e.g.:: 16 | 17 | $ conda install tsinfer -c conda-forge 18 | 19 | you can then ``import tsinfer`` in python or use the ``tsinfer`` executable directly:: 20 | 21 | $ tsinfer --help 22 | 23 | ********************** 24 | Installing from source 25 | ********************** 26 | 27 | It is also possible to install from source via ``pip`` (although see the issues below): 28 | 29 | $ python -m pip install tsinfer --user 30 | 31 | which will install ``tsinfer`` to the Python installation corresponding to your 32 | ``python`` executable. All requirements should be installed automatically. 33 | 34 | To run the command line interface to ``tsinfer`` you can then use:: 35 | 36 | $ python -m tsinfer --help 37 | 38 | 39 | If your ``PATH`` is set up to point at the corresponding ``bin`` directory 40 | you can also use the ``tsinfer`` executable directly:: 41 | 42 | $ tsinfer --help 43 | 44 | You may wish to install into a virtual environment 45 | first using `venv `_:: 46 | 47 | $ python -m venv tsinfer-venv 48 | $ source tsinfer-venv/bin/activate 49 | (tsinfer-venv) $ python -m pip install tsinfer 50 | (tsinfer-venv) $ tsinfer --help 51 | 52 | **************** 53 | Potential issues 54 | **************** 55 | 56 | #. One of the dependencies of ``tsinfer``, 57 | `numcodecs `_, is compiled to 58 | use AVX2 instructions (where available) when installed using pip. This can lead to 59 | issues when ``numcodecs`` is compiled on a machine that supports AVX2 60 | and subsequently run on older machines that do not. To resolve this, ``numcodecs`` 61 | has a ``DISABLE_NUMCODECS_AVX2`` variable which can be turned on before calling 62 | ``pip install``, see 63 | `these instructions `_ 64 | for details. 65 | 66 | #. There can be problems compiling from source using the default compilers under Mac OS 67 | (see https://github.com/tskit-dev/tsinfer/issues/376). The current workaround is 68 | either to compile from source by installing alternative python and C compilers via 69 | conda (``conda install -c conda-forge c-compiler``) or to install the binary 70 | packages via conda as recommended at the top of this page. 71 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _sec_introduction: 2 | 3 | ============ 4 | Introduction 5 | ============ 6 | 7 | The goal of ``tsinfer`` is to infer *succinct tree sequences* from observed 8 | genetic variation data. A succinct tree sequence (or 9 | :ref:`tree sequence`, for short) 10 | is an efficient way of representing the correlated genealogies that 11 | describe the ancestry of many species. By inferring these tree sequences, we 12 | make two very important gains: 13 | 14 | 1. We obtain an approximation of the true history of our sampled data, which 15 | may be useful for other inferential tasks. 16 | 17 | 2. The data structure itself is an extremely concise and efficient means of 18 | storing and processing the data that we have. 19 | 20 | The output of ``tsinfer`` is a :class:`tskit.TreeSequence` and so the 21 | full `tskit API `_ can be used to 22 | analyse real data, in precisely the same way that it is commonly used 23 | to analyse simulation data, for example, from `msprime `_. 24 | 25 | .. 
note:: 26 | 27 | ``Tsinfer`` infers the genetic relationships between sampled genomes, but does not 28 | attempt to infer the *times* of most recent common ancestors (tMRCAs) in the genealogy. 29 | If you are using the output of ``tsinfer`` in downstream analysis that relies on 30 | node times, you are advised not to use the inferred tree sequences directly; instead, 31 | you should post-process the ``tsinfer`` output using software such as 32 | `tsdate `_ that attempts to assign calendar or 33 | generation times to the tree sequence nodes. -------------------------------------------------------------------------------- /docs/simulation-example.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import subprocess 3 | import sys 4 | 5 | import msprime 6 | import numpy as np 7 | from Bio import bgzf 8 | 9 | if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook 10 | num_diploids, seq_len = 100, 10_000 11 | name = "notebook-simulation" 12 | else: # Take parameters from the command-line 13 | num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) 14 | name = "cli-simulation" 15 | 16 | ts = msprime.sim_ancestry( 17 | num_diploids, 18 | population_size=10**4, 19 | recombination_rate=1e-8, 20 | sequence_length=seq_len, 21 | random_seed=6, 22 | ) 23 | ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) 24 | ts.dump(name + "-source.trees") 25 | print( 26 | f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", 27 | f"{ts.num_trees} trees and {ts.num_sites} sites", 28 | ) 29 | 30 | # Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see 31 | # https://github.com/sgkit-dev/bio2zarr/issues/232 32 | np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()]) 33 | vcf_name = f"{name}.vcf.gz" 34 | with bgzf.open(vcf_name, "wt") as f: 35 | ts.write_vcf(f) 36 | subprocess.run(["tabix", vcf_name]) 37 | ret = subprocess.run( 38 | "python -m bio2zarr vcf2zarr convert --force".split() + [vcf_name, name + ".vcz"], 39 | stderr=subprocess.DEVNULL if name == "notebook-simulation" else None, 40 | ) 41 | if ret.returncode == 0: 42 | print(f"Converted to {name}.vcz") 43 | -------------------------------------------------------------------------------- /docs/tsinfer_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | image/svg+xml -------------------------------------------------------------------------------- /lib/.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: GNU 3 | SortIncludes: false 4 | AllowShortIfStatementsOnASingleLine: false 5 | BreakBeforeBraces: Linux 6 | TabWidth: 4 7 | IndentWidth: 4 8 | ColumnLimit: 89 9 | SpaceBeforeParens: 10 | ControlStatements 11 | SpacesInCStyleCastParentheses: false 12 | SpaceAfterCStyleCast: true 13 | IndentCaseLabels: true 14 | AlignAfterOpenBracket: DontAlign 15 | BinPackArguments: true 16 | BinPackParameters: true 17 | AlwaysBreakAfterReturnType: AllDefinitions 18 | 19 | # These are disabled for version 6 compatibility 20 | # StatementMacros: ["PyObject_HEAD"] 21 | # AlignConsecutiveMacros: true 22 | -------------------------------------------------------------------------------- /lib/avl.h: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | 3 | avl.h - Source code for the AVL-tree library. 
4 | 5 | Copyright (C) 1998 Michael H. Buselli 6 | Copyright (C) 2000-2002 Wessel Dankers 7 | 8 | This library is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU Lesser General Public 10 | License as published by the Free Software Foundation; either 11 | version 2.1 of the License, or (at your option) any later version. 12 | 13 | This library is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public 19 | License along with this library; if not, write to the Free Software 20 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 | 22 | Augmented AVL-tree. Original by Michael H. Buselli . 23 | 24 | Modified by Wessel Dankers to add a bunch of bloat to 25 | the sourcecode, change the interface and squash a few bugs. 26 | Mail him if you find new bugs. 27 | 28 | *****************************************************************************/ 29 | 30 | #ifndef _AVL_H 31 | #define _AVL_H 32 | 33 | /* We need either depths, counts or both (the latter being the default) */ 34 | #if !defined(AVL_DEPTH) && !defined(AVL_COUNT) 35 | #define AVL_DEPTH 36 | #define AVL_COUNT 37 | #endif 38 | 39 | /* User supplied function to compare two items like strcmp() does. 40 | * For example: cmp(a,b) will return: 41 | * -1 if a < b 42 | * 0 if a = b 43 | * 1 if a > b 44 | */ 45 | typedef int (*avl_compare_t)(const void *, const void *); 46 | 47 | /* User supplied function to delete an item when a node is free()d. 48 | * If NULL, the item is not free()d. 49 | */ 50 | typedef void (*avl_freeitem_t)(void *); 51 | 52 | typedef struct avl_node_t { 53 | struct avl_node_t *next; 54 | struct avl_node_t *prev; 55 | struct avl_node_t *parent; 56 | struct avl_node_t *left; 57 | struct avl_node_t *right; 58 | void *item; 59 | #ifdef AVL_COUNT 60 | unsigned int count; 61 | #endif 62 | #ifdef AVL_DEPTH 63 | unsigned char depth; 64 | #endif 65 | } avl_node_t; 66 | 67 | typedef struct avl_tree_t { 68 | avl_node_t *head; 69 | avl_node_t *tail; 70 | avl_node_t *top; 71 | avl_compare_t cmp; 72 | avl_freeitem_t freeitem; 73 | } avl_tree_t; 74 | 75 | /* Initializes a new tree for elements that will be ordered using 76 | * the supplied strcmp()-like function. 77 | * Returns the value of avltree (even if it's NULL). 78 | * O(1) */ 79 | extern avl_tree_t *avl_init_tree(avl_tree_t *avltree, avl_compare_t, avl_freeitem_t); 80 | 81 | /* Allocates and initializes a new tree for elements that will be 82 | * ordered using the supplied strcmp()-like function. 83 | * Returns NULL if memory could not be allocated. 84 | * O(1) */ 85 | extern avl_tree_t *avl_alloc_tree(avl_compare_t, avl_freeitem_t); 86 | 87 | /* Frees the entire tree efficiently. Nodes will be free()d. 88 | * If the tree's freeitem is not NULL it will be invoked on every item. 89 | * O(n) */ 90 | extern void avl_free_tree(avl_tree_t *); 91 | 92 | /* Reinitializes the tree structure for reuse. Nothing is free()d. 93 | * Compare and freeitem functions are left alone. 94 | * O(1) */ 95 | extern void avl_clear_tree(avl_tree_t *); 96 | 97 | /* Free()s all nodes in the tree but leaves the tree itself. 98 | * If the tree's freeitem is not NULL it will be invoked on every item. 
99 | * O(n) */ 100 | extern void avl_free_nodes(avl_tree_t *); 101 | 102 | /* Initializes memory for use as a node. Returns NULL if avlnode is NULL. 103 | * O(1) */ 104 | extern avl_node_t *avl_init_node(avl_node_t *avlnode, void *item); 105 | 106 | /* Insert an item into the tree and return the new node. 107 | * Returns NULL and sets errno if memory for the new node could not be 108 | * allocated or if the node is already in the tree (EEXIST). 109 | * O(lg n) */ 110 | extern avl_node_t *avl_insert(avl_tree_t *, void *item); 111 | 112 | /* Insert a node into the tree and return it. 113 | * Returns NULL if the node is already in the tree. 114 | * O(lg n) */ 115 | extern avl_node_t *avl_insert_node(avl_tree_t *, avl_node_t *); 116 | 117 | /* Insert a node in an empty tree. If avlnode is NULL, the tree will be 118 | * cleared and ready for re-use. 119 | * If the tree is not empty, the old nodes are left dangling. 120 | * O(1) */ 121 | extern avl_node_t *avl_insert_top(avl_tree_t *, avl_node_t *avlnode); 122 | 123 | /* Insert a node before another node. Returns the new node. 124 | * If old is NULL, the item is appended to the tree. 125 | * O(lg n) */ 126 | extern avl_node_t *avl_insert_before(avl_tree_t *, avl_node_t *old, avl_node_t *new); 127 | 128 | /* Insert a node after another node. Returns the new node. 129 | * If old is NULL, the item is prepended to the tree. 130 | * O(lg n) */ 131 | extern avl_node_t *avl_insert_after(avl_tree_t *, avl_node_t *old, avl_node_t *new); 132 | 133 | /* Deletes a node from the tree. Returns immediately if the node is NULL. 134 | * The item will not be free()d regardless of the tree's freeitem handler. 135 | * This function comes in handy if you need to update the search key. 136 | * O(lg n) */ 137 | extern void avl_unlink_node(avl_tree_t *, avl_node_t *); 138 | 139 | /* Deletes a node from the tree. Returns immediately if the node is NULL. 140 | * If the tree's freeitem is not NULL, it is invoked on the item. 141 | * If it is, returns the item. 142 | * O(lg n) */ 143 | extern void *avl_delete_node(avl_tree_t *, avl_node_t *); 144 | 145 | /* Searches for an item in the tree and deletes it if found. 146 | * If the tree's freeitem is not NULL, it is invoked on the item. 147 | * If it is, returns the item. 148 | * O(lg n) */ 149 | extern void *avl_delete(avl_tree_t *, const void *item); 150 | 151 | /* If exactly one node is moved in memory, this will fix the pointers 152 | * in the tree that refer to it. It must be an exact shallow copy. 153 | * Returns the pointer to the old position. 154 | * O(1) */ 155 | extern avl_node_t *avl_fixup_node(avl_tree_t *, avl_node_t *new); 156 | 157 | /* Searches for a node with the key closest (or equal) to the given item. 158 | * If avlnode is not NULL, *avlnode will be set to the node found or NULL 159 | * if the tree is empty. Return values: 160 | * -1 if the returned node is smaller 161 | * 0 if the returned node is equal or if the tree is empty 162 | * 1 if the returned node is greater 163 | * O(lg n) */ 164 | extern int avl_search_closest(const avl_tree_t *, const void *item, avl_node_t **avlnode); 165 | 166 | /* Searches for the item in the tree and returns a matching node if found 167 | * or NULL if not. 168 | * O(lg n) */ 169 | extern avl_node_t *avl_search(const avl_tree_t *, const void *item); 170 | 171 | #ifdef AVL_COUNT 172 | /* Returns the number of nodes in the tree. 173 | * O(1) */ 174 | extern unsigned int avl_count(const avl_tree_t *); 175 | 176 | /* Searches a node by its rank in the list. 
Counting starts at 0. 177 | * Returns NULL if the index exceeds the number of nodes in the tree. 178 | * O(lg n) */ 179 | extern avl_node_t *avl_at(const avl_tree_t *, unsigned int); 180 | 181 | /* Returns the rank of a node in the list. Counting starts at 0. 182 | * O(lg n) */ 183 | extern unsigned int avl_index(const avl_node_t *); 184 | #endif 185 | 186 | #endif 187 | -------------------------------------------------------------------------------- /lib/err.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** Copyright (C) 2020 University of Oxford 3 | ** 4 | ** This file is part of tsinfer. 5 | ** 6 | ** tsinfer is free software: you can redistribute it and/or modify 7 | ** it under the terms of the GNU General Public License as published by 8 | ** the Free Software Foundation, either version 3 of the License, or 9 | ** (at your option) any later version. 10 | ** 11 | ** tsinfer is distributed in the hope that it will be useful, 12 | ** but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ** GNU General Public License for more details. 15 | ** 16 | ** You should have received a copy of the GNU General Public License 17 | ** along with tsinfer. If not, see . 18 | */ 19 | 20 | #include "err.h" 21 | #include 22 | 23 | const char * 24 | tsi_strerror(int err) 25 | { 26 | const char *ret = "Unknown error"; 27 | 28 | switch (err) { 29 | case 0: 30 | ret = "Normal exit condition. This is not an error!"; 31 | break; 32 | 33 | case TSI_ERR_GENERIC: 34 | ret = "Generic tsinfer error - please file a bug report."; 35 | break; 36 | case TSI_ERR_NO_MEMORY: 37 | ret = "Out of memory"; 38 | break; 39 | case TSI_ERR_NONCONTIGUOUS_EDGES: 40 | ret = "Edges must be contiguous"; 41 | break; 42 | case TSI_ERR_UNSORTED_EDGES: 43 | ret = "Edges must be sorted"; 44 | break; 45 | case TSI_ERR_PC_ANCESTOR_TIME: 46 | ret = "Failure generating time for path compression ancestor"; 47 | break; 48 | case TSI_ERR_BAD_PATH_CHILD: 49 | ret = "Bad path information: child node"; 50 | break; 51 | case TSI_ERR_BAD_PATH_PARENT: 52 | ret = "Bad path information: parent node"; 53 | break; 54 | case TSI_ERR_BAD_PATH_TIME: 55 | ret = "Bad path information: time"; 56 | break; 57 | case TSI_ERR_BAD_PATH_INTERVAL: 58 | ret = "Bad path information: left >= right"; 59 | break; 60 | case TSI_ERR_BAD_PATH_LEFT_LESS_ZERO: 61 | ret = "Bad path information: left < 0"; 62 | break; 63 | case TSI_ERR_BAD_PATH_RIGHT_GREATER_NUM_SITES: 64 | ret = "Bad path information: right > num_sites"; 65 | break; 66 | case TSI_ERR_MATCH_IMPOSSIBLE: 67 | ret = "Unexpected failure to find matching haplotype; please open " 68 | "an issue on GitHub"; 69 | break; 70 | case TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA: 71 | ret = "Cannot find match: the specified mismatch probability is " 72 | "0 or 1 and no matches are possible with these parameters"; 73 | break; 74 | case TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION: 75 | ret = "Cannot find match: the specified recombination probability is" 76 | "zero and no matches could be found. 
Increasing the 'precision' " 77 | "may help, but recombination values of 0 are not recommended."; 78 | break; 79 | case TSI_ERR_BAD_HAPLOTYPE_ALLELE: 80 | ret = "Input haplotype contains bad allele information."; 81 | break; 82 | case TSI_ERR_BAD_NUM_ALLELES: 83 | ret = "The number of alleles must be between 2 and 127"; 84 | break; 85 | case TSI_ERR_BAD_MUTATION_NODE: 86 | ret = "Bad mutation information: node"; 87 | break; 88 | case TSI_ERR_BAD_MUTATION_SITE: 89 | ret = "Bad mutation information: site"; 90 | break; 91 | case TSI_ERR_BAD_MUTATION_DERIVED_STATE: 92 | ret = "Bad mutation information: derived state"; 93 | break; 94 | case TSI_ERR_BAD_MUTATION_DUPLICATE_NODE: 95 | ret = "Bad mutation information: mutation already exists for this node."; 96 | break; 97 | case TSI_ERR_BAD_NUM_SAMPLES: 98 | ret = "Must have at least 2 samples."; 99 | break; 100 | case TSI_ERR_TOO_MANY_SITES: 101 | ret = "Cannot add more sites than the specified maximum."; 102 | break; 103 | case TSI_ERR_BAD_FOCAL_SITE: 104 | ret = "Bad focal site."; 105 | break; 106 | case TSI_ERR_ONE_BIT_NON_BINARY: 107 | ret = "One-bit genotype encoding only supports binary 0/1 data"; 108 | break; 109 | case TSI_ERR_IO: 110 | ret = tsk_strerror(TSK_ERR_IO); 111 | break; 112 | } 113 | return ret; 114 | } 115 | -------------------------------------------------------------------------------- /lib/err.h: -------------------------------------------------------------------------------- 1 | #ifndef __ERR_H__ 2 | #define __ERR_H__ 3 | 4 | // clang-format off 5 | #define TSI_ERR_GENERIC -1 6 | #define TSI_ERR_NO_MEMORY -2 7 | #define TSI_ERR_NONCONTIGUOUS_EDGES -3 8 | #define TSI_ERR_UNSORTED_EDGES -4 9 | #define TSI_ERR_PC_ANCESTOR_TIME -5 10 | #define TSI_ERR_BAD_PATH_CHILD -6 11 | #define TSI_ERR_BAD_PATH_PARENT -7 12 | #define TSI_ERR_BAD_PATH_TIME -8 13 | #define TSI_ERR_BAD_PATH_INTERVAL -9 14 | #define TSI_ERR_BAD_PATH_LEFT_LESS_ZERO -10 15 | #define TSI_ERR_BAD_PATH_RIGHT_GREATER_NUM_SITES -11 16 | #define TSI_ERR_MATCH_IMPOSSIBLE -12 17 | #define TSI_ERR_BAD_HAPLOTYPE_ALLELE -13 18 | #define TSI_ERR_BAD_NUM_ALLELES -14 19 | #define TSI_ERR_BAD_MUTATION_NODE -15 20 | #define TSI_ERR_BAD_MUTATION_SITE -16 21 | #define TSI_ERR_BAD_MUTATION_DERIVED_STATE -17 22 | #define TSI_ERR_BAD_MUTATION_DUPLICATE_NODE -18 23 | #define TSI_ERR_BAD_NUM_SAMPLES -19 24 | #define TSI_ERR_TOO_MANY_SITES -20 25 | #define TSI_ERR_BAD_FOCAL_SITE -21 26 | #define TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA -22 27 | #define TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION -23 28 | #define TSI_ERR_ONE_BIT_NON_BINARY -24 29 | #define TSI_ERR_IO -25 30 | // clang-format on 31 | 32 | #ifdef __GNUC__ 33 | #define WARN_UNUSED __attribute__((warn_unused_result)) 34 | #define unlikely(expr) __builtin_expect(!!(expr), 0) 35 | #define likely(expr) __builtin_expect(!!(expr), 1) 36 | #else 37 | /* On windows we don't do any perf related stuff */ 38 | #define WARN_UNUSED 39 | #define restrict 40 | #define unlikely(expr) (expr) 41 | #define likely(expr) (expr) 42 | #endif 43 | 44 | const char *tsi_strerror(int err); 45 | 46 | #endif /*__ERR_H__*/ 47 | -------------------------------------------------------------------------------- /lib/meson.build: -------------------------------------------------------------------------------- 1 | project('tsinfer', 'c') 2 | 3 | tskit_proj = subproject('tskit') 4 | tskit_dep = tskit_proj.get_variable('tskit_dep') 5 | 6 | cc = meson.get_compiler('c') 7 | m_dep = cc.find_library('m', required : false) 8 | cunit_dep = 
dependency('cunit') 9 | 10 | extra_c_args = [ 11 | '-std=c99', '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W', 12 | '-Wmissing-prototypes', '-Wstrict-prototypes', 13 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 14 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 15 | '-fshort-enums', '-fno-common'] 16 | 17 | tsinfer_sources =[ 18 | 'ancestor_matcher.c', 'ancestor_builder.c', 'tree_sequence_builder.c', 19 | 'object_heap.c', 'err.c'] 20 | 21 | avl_lib = static_library('avl', sources: ['avl.c']) 22 | tsinfer_lib = static_library('tsinfer', 23 | sources: tsinfer_sources, dependencies: [m_dep, tskit_dep], 24 | c_args: extra_c_args, link_with:[avl_lib]) 25 | 26 | unit_tests = executable('tests', 27 | sources: ['tests/tests.c'], 28 | link_with: [tsinfer_lib], dependencies:[cunit_dep, tskit_dep]) 29 | test('Unit tests', unit_tests) 30 | -------------------------------------------------------------------------------- /lib/object_heap.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** Copyright (C) 2018 University of Oxford 3 | ** 4 | ** This file is part of tsinfer. 5 | ** 6 | ** tsinfer is free software: you can redistribute it and/or modify 7 | ** it under the terms of the GNU General Public License as published by 8 | ** the Free Software Foundation, either version 3 of the License, or 9 | ** (at your option) any later version. 10 | ** 11 | ** tsinfer is distributed in the hope that it will be useful, 12 | ** but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ** GNU General Public License for more details. 15 | ** 16 | ** You should have received a copy of the GNU General Public License 17 | ** along with tsinfer. If not, see . 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "err.h" 26 | #include "object_heap.h" 27 | 28 | /* memory heap manager */ 29 | 30 | size_t 31 | object_heap_get_num_allocated(object_heap_t *self) 32 | { 33 | return self->size - self->top; 34 | } 35 | 36 | void 37 | object_heap_print_state(object_heap_t *self, FILE *out) 38 | { 39 | fprintf(out, "object heap %p::\n", (void *) self); 40 | fprintf(out, "\tsize = %d\n", (int) self->size); 41 | fprintf(out, "\ttop = %d\n", (int) self->top); 42 | fprintf(out, "\tblock_size = %d\n", (int) self->block_size); 43 | fprintf(out, "\tnum_blocks = %d\n", (int) self->num_blocks); 44 | fprintf(out, "\ttotal allocated = %d\n", (int) object_heap_get_num_allocated(self)); 45 | } 46 | 47 | static void 48 | object_heap_add_block(object_heap_t *self, char *mem_block) 49 | { 50 | size_t j, index; 51 | 52 | for (j = 0; j < self->block_size; j++) { 53 | self->heap[j] = mem_block + j * self->object_size; 54 | if (self->init_object != NULL) { 55 | index = j + (self->num_blocks - 1) * self->block_size; 56 | self->init_object(self->heap[j], index); 57 | } 58 | } 59 | self->top = self->block_size; 60 | } 61 | 62 | int WARN_UNUSED 63 | object_heap_expand(object_heap_t *self) 64 | { 65 | int ret = -1; 66 | void *p; 67 | 68 | p = realloc(self->mem_blocks, (self->num_blocks + 1) * sizeof(void *)); 69 | if (p == NULL) { 70 | ret = TSI_ERR_NO_MEMORY; 71 | goto out; 72 | } 73 | self->mem_blocks = p; 74 | p = calloc(self->block_size, self->object_size); 75 | if (p == NULL) { 76 | ret = TSI_ERR_NO_MEMORY; 77 | goto out; 78 | } 79 | self->mem_blocks[self->num_blocks] = p; 80 | self->num_blocks++; 81 | /* Now we increase the size of the heap. 
Since it is currently empty, 82 | * we avoid the copying cost of realloc and free before making a new 83 | * heap. 84 | */ 85 | free(self->heap); 86 | self->heap = NULL; 87 | self->size += self->block_size; 88 | self->heap = calloc(self->size, sizeof(void *)); 89 | if (self->heap == NULL) { 90 | ret = TSI_ERR_NO_MEMORY; 91 | goto out; 92 | } 93 | object_heap_add_block(self, p); 94 | ret = 0; 95 | out: 96 | return ret; 97 | } 98 | 99 | /* 100 | * Returns the jth object in the memory buffers. 101 | */ 102 | inline void *WARN_UNUSED 103 | object_heap_get_object(object_heap_t *self, size_t index) 104 | { 105 | void *ret = NULL; 106 | size_t block, obj; 107 | 108 | block = index / self->block_size; 109 | obj = index % self->block_size; 110 | if (block < self->num_blocks && obj < self->block_size) { 111 | ret = self->mem_blocks[block] + obj * self->object_size; 112 | } 113 | return ret; 114 | } 115 | 116 | inline int WARN_UNUSED 117 | object_heap_empty(object_heap_t *self) 118 | { 119 | return self->top == 0; 120 | } 121 | 122 | inline void *WARN_UNUSED 123 | object_heap_alloc_object(object_heap_t *self) 124 | { 125 | void *ret = NULL; 126 | 127 | if (self->top > 0) { 128 | self->top--; 129 | ret = self->heap[self->top]; 130 | } 131 | return ret; 132 | } 133 | 134 | inline void 135 | object_heap_free_object(object_heap_t *self, void *obj) 136 | { 137 | assert(self->top < self->size); 138 | self->heap[self->top] = obj; 139 | self->top++; 140 | } 141 | 142 | int WARN_UNUSED 143 | object_heap_init(object_heap_t *self, size_t object_size, size_t block_size, 144 | void (*init_object)(void **, size_t)) 145 | { 146 | int ret = -1; 147 | 148 | assert(block_size > 0); 149 | memset(self, 0, sizeof(object_heap_t)); 150 | self->block_size = block_size; 151 | self->size = block_size; 152 | self->object_size = object_size; 153 | self->init_object = init_object; 154 | self->num_blocks = 1; 155 | self->heap = calloc(self->size, sizeof(void *)); 156 | self->mem_blocks = calloc(1, sizeof(void *)); 157 | if (self->heap == NULL || self->mem_blocks == NULL) { 158 | ret = TSI_ERR_NO_MEMORY; 159 | goto out; 160 | } 161 | self->mem_blocks[0] = calloc(self->size, self->object_size); 162 | if (self->mem_blocks[0] == NULL) { 163 | ret = TSI_ERR_NO_MEMORY; 164 | goto out; 165 | } 166 | self->top = 0; 167 | object_heap_add_block(self, self->mem_blocks[0]); 168 | ret = 0; 169 | out: 170 | return ret; 171 | } 172 | 173 | void 174 | object_heap_free(object_heap_t *self) 175 | { 176 | size_t j; 177 | 178 | if (self->mem_blocks != NULL) { 179 | for (j = 0; j < self->num_blocks; j++) { 180 | if (self->mem_blocks[j] != NULL) { 181 | free(self->mem_blocks[j]); 182 | } 183 | } 184 | free(self->mem_blocks); 185 | } 186 | if (self->heap != NULL) { 187 | free(self->heap); 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /lib/object_heap.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef OBJECT_HEAP_H 3 | #define OBJECT_HEAP_H 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | typedef struct { 10 | size_t object_size; 11 | size_t block_size; /* number of objects in a block */ 12 | size_t top; 13 | size_t size; 14 | size_t num_blocks; 15 | void **heap; 16 | char **mem_blocks; 17 | void (*init_object)(void **obj, size_t index); 18 | } object_heap_t; 19 | 20 | extern size_t object_heap_get_num_allocated(object_heap_t *self); 21 | extern void object_heap_print_state(object_heap_t *self, FILE *out); 22 | extern int 
object_heap_expand(object_heap_t *self); 23 | extern void *object_heap_get_object(object_heap_t *self, size_t index); 24 | extern int object_heap_empty(object_heap_t *self); 25 | extern void *object_heap_alloc_object(object_heap_t *self); 26 | extern void object_heap_free_object(object_heap_t *self, void *obj); 27 | extern int object_heap_init(object_heap_t *self, size_t object_size, size_t block_size, 28 | void (*init_object)(void **, size_t)); 29 | extern void object_heap_free(object_heap_t *self); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /lib/subprojects/README: -------------------------------------------------------------------------------- 1 | This wrapfile is just used by meson for compiling the C code for 2 | tests. It's not used by the top-level Python module in any 3 | way - that uses a git submodule to get the tskit code. 4 | -------------------------------------------------------------------------------- /lib/subprojects/tskit.wrap: -------------------------------------------------------------------------------- 1 | [wrap-file] 2 | directory = tskit-1.1.1 3 | 4 | source_url = https://github.com/tskit-dev/tskit/releases/download/C_1.1.1/tskit-1.1.1.tar.xz 5 | source_filename = tskit-1.1.1.tar.xz 6 | source_hash = 12e9de302686fbc58be7a40066a2e478faa7da44a0b038f6d7f87e7f3a319984 7 | 8 | 9 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.1.1 -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/Makefile: -------------------------------------------------------------------------------- 1 | # Simple Makefile for building examples. 2 | # This will build the examples in the current directory by compiling in the 3 | # full tskit source into each of the examples. This is *not* recommended for 4 | # real projects! 5 | # 6 | # To use, type "make" in the this directory. If you have GSL installed you 7 | # should then get two example programs built. 8 | # 9 | # **Note**: This repo uses git submodules, and these must be checked out 10 | # correctly for this makefile to work, e.g.: 11 | # 12 | # $ git clone git@github.com:tskit-dev/tskit.git --recurse-submodules 13 | # 14 | # See the documentation (https://tskit.dev/tskit/docs/stable/c-api.html) 15 | # for more details on how to use the C API, and the tskit build examples 16 | # repo (https://github.com/tskit-dev/tskit-build-examples) for examples 17 | # of how to set up a production-ready build with tskit. 
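#
# For reference, the static pattern rule below expands each example to a
# command of roughly this form (illustrative only; the actual compiler and
# flags come from $(CC) and the CFLAGS defined below):
#
#   cc -I../ -I../subprojects/kastore -o streaming streaming.c \
#       ../tskit/*.c ../subprojects/kastore/kastore.c -lm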
18 | # 19 | 20 | CFLAGS=-I../ -I../subprojects/kastore 21 | TSKIT_SOURCE=../tskit/*.c ../subprojects/kastore/kastore.c 22 | 23 | targets = api_structure error_handling \ 24 | haploid_wright_fisher streaming \ 25 | tree_iteration tree_traversal \ 26 | take_ownership 27 | 28 | all: $(targets) 29 | 30 | $(targets): %: %.c 31 | ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lm 32 | 33 | clean: 34 | rm -f $(targets) 35 | 36 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/api_structure.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define check_tsk_error(val) \ 6 | if (val < 0) { \ 7 | fprintf(stderr, "line %d: %s", __LINE__, tsk_strerror(val)); \ 8 | exit(EXIT_FAILURE); \ 9 | } 10 | 11 | int 12 | main(int argc, char **argv) 13 | { 14 | int j, ret; 15 | tsk_edge_table_t edges; 16 | 17 | ret = tsk_edge_table_init(&edges, 0); 18 | check_tsk_error(ret); 19 | for (j = 0; j < 5; j++) { 20 | ret = tsk_edge_table_add_row(&edges, 0, 1, j + 1, j, NULL, 0); 21 | check_tsk_error(ret); 22 | } 23 | tsk_edge_table_print_state(&edges, stdout); 24 | tsk_edge_table_free(&edges); 25 | 26 | return EXIT_SUCCESS; 27 | } 28 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/cpp_sorting_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | static void 11 | handle_tskit_return_code(int code) 12 | { 13 | if (code != 0) { 14 | std::ostringstream o; 15 | o << tsk_strerror(code); 16 | throw std::runtime_error(o.str()); 17 | } 18 | } 19 | 20 | struct edge_plus_time { 21 | double time; 22 | tsk_id_t parent, child; 23 | double left, right; 24 | }; 25 | 26 | int 27 | sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) 28 | { 29 | if (sorter->tables->edges.metadata_length != 0) { 30 | throw std::invalid_argument( 31 | "the sorter does not currently handle edge metadata"); 32 | } 33 | if (start != 0) { 34 | throw std::invalid_argument("the sorter requires start==0"); 35 | } 36 | 37 | std::vector temp; 38 | temp.reserve(static_cast(sorter->tables->edges.num_rows)); 39 | 40 | auto edges = &sorter->tables->edges; 41 | auto nodes = &sorter->tables->nodes; 42 | 43 | for (tsk_size_t i = 0; i < sorter->tables->edges.num_rows; ++i) { 44 | temp.push_back(edge_plus_time{ nodes->time[edges->parent[i]], edges->parent[i], 45 | edges->child[i], edges->left[i], edges->right[i] }); 46 | } 47 | 48 | std::sort(begin(temp), end(temp), 49 | [](const edge_plus_time &lhs, const edge_plus_time &rhs) { 50 | if (lhs.time == rhs.time) { 51 | if (lhs.parent == rhs.parent) { 52 | if (lhs.child == rhs.child) { 53 | return lhs.left < rhs.left; 54 | } 55 | return lhs.child < rhs.child; 56 | } 57 | return lhs.parent < rhs.parent; 58 | } 59 | return lhs.time < rhs.time; 60 | }); 61 | 62 | for (std::size_t i = 0; i < temp.size(); ++i) { 63 | edges->left[i] = temp[i].left; 64 | edges->right[i] = temp[i].right; 65 | edges->parent[i] = temp[i].parent; 66 | edges->child[i] = temp[i].child; 67 | } 68 | 69 | return 0; 70 | } 71 | 72 | int 73 | main(int argc, char **argv) 74 | { 75 | if (argc != 3) { 76 | std::cerr << "Usage: " << argv[0] << " input.trees output.trees\n"; 77 | std::exit(0); 78 | } 79 | const char *infile = argv[1]; 80 | const char *outfile = argv[2]; 81 | 82 | tsk_table_collection_t tables; 
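/* Load the table collection from the input .trees file; handle_tskit_return_code
 * (defined above) turns any non-zero tskit error code into a C++ exception. */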
83 | auto ret = tsk_table_collection_load(&tables, infile, 0); 84 | handle_tskit_return_code(ret); 85 | 86 | tsk_table_sorter_t sorter; 87 | ret = tsk_table_sorter_init(&sorter, &tables, 0); 88 | handle_tskit_return_code(ret); 89 | sorter.sort_edges = sort_edges; 90 | try { 91 | ret = tsk_table_sorter_run(&sorter, NULL); 92 | } catch (std::exception &e) { 93 | std::cerr << e.what() << '\n'; 94 | std::exit(1); 95 | } 96 | handle_tskit_return_code(ret); 97 | ret = tsk_table_collection_dump(&tables, outfile, 0); 98 | handle_tskit_return_code(ret); 99 | ret = tsk_table_collection_free(&tables); 100 | handle_tskit_return_code(ret); 101 | } 102 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/error_handling.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int 6 | main(int argc, char **argv) 7 | { 8 | int ret; 9 | tsk_treeseq_t ts; 10 | 11 | if (argc != 2) { 12 | fprintf(stderr, "usage: "); 13 | exit(EXIT_FAILURE); 14 | } 15 | ret = tsk_treeseq_load(&ts, argv[1], 0); 16 | if (ret < 0) { 17 | /* Error condition. Free and exit */ 18 | tsk_treeseq_free(&ts); 19 | fprintf(stderr, "%s", tsk_strerror(ret)); 20 | exit(EXIT_FAILURE); 21 | } 22 | printf("Loaded tree sequence with %lld nodes and %lld edges from %s\n", 23 | (long long) tsk_treeseq_get_num_nodes(&ts), 24 | (long long) tsk_treeseq_get_num_edges(&ts), argv[1]); 25 | tsk_treeseq_free(&ts); 26 | 27 | return EXIT_SUCCESS; 28 | } 29 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/haploid_wright_fisher.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #define check_tsk_error(val) \ 9 | if (val < 0) { \ 10 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 11 | } 12 | 13 | void 14 | simulate(tsk_table_collection_t *tables, int N, int T, int simplify_interval) 15 | { 16 | tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; 17 | double breakpoint; 18 | int ret, j, t, b; 19 | 20 | assert(simplify_interval != 0); // leads to division by zero 21 | buffer = malloc(2 * N * sizeof(tsk_id_t)); 22 | if (buffer == NULL) { 23 | errx(EXIT_FAILURE, "Out of memory"); 24 | } 25 | tables->sequence_length = 1.0; 26 | parents = buffer; 27 | for (j = 0; j < N; j++) { 28 | parents[j] 29 | = tsk_node_table_add_row(&tables->nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); 30 | check_tsk_error(parents[j]); 31 | } 32 | b = 0; 33 | for (t = T - 1; t >= 0; t--) { 34 | /* Alternate between using the first and last N values in the buffer */ 35 | parents = buffer + (b * N); 36 | b = (b + 1) % 2; 37 | children = buffer + (b * N); 38 | for (j = 0; j < N; j++) { 39 | child = tsk_node_table_add_row( 40 | &tables->nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); 41 | check_tsk_error(child); 42 | /* NOTE: the use of rand() is discouraged for 43 | * research code and proper random number generator 44 | * libraries should be preferred. 45 | */ 46 | left_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; 47 | right_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; 48 | do { 49 | breakpoint = rand() / (1. 
+ RAND_MAX); 50 | } while (breakpoint == 0); /* tiny proba of breakpoint being 0 */ 51 | ret = tsk_edge_table_add_row( 52 | &tables->edges, 0, breakpoint, left_parent, child, NULL, 0); 53 | check_tsk_error(ret); 54 | ret = tsk_edge_table_add_row( 55 | &tables->edges, breakpoint, 1, right_parent, child, NULL, 0); 56 | check_tsk_error(ret); 57 | children[j] = child; 58 | } 59 | if (t % simplify_interval == 0) { 60 | printf("Simplify at generation %lld: (%lld nodes %lld edges)", (long long) t, 61 | (long long) tables->nodes.num_rows, (long long) tables->edges.num_rows); 62 | /* Note: Edges must be sorted for simplify to work, and we use a brute force 63 | * approach of sorting each time here for simplicity. This is inefficient. */ 64 | ret = tsk_table_collection_sort(tables, NULL, 0); 65 | check_tsk_error(ret); 66 | ret = tsk_table_collection_simplify(tables, children, N, 0, NULL); 67 | check_tsk_error(ret); 68 | printf(" -> (%lld nodes %lld edges)\n", (long long) tables->nodes.num_rows, 69 | (long long) tables->edges.num_rows); 70 | for (j = 0; j < N; j++) { 71 | children[j] = j; 72 | } 73 | } 74 | } 75 | free(buffer); 76 | } 77 | 78 | int 79 | main(int argc, char **argv) 80 | { 81 | int ret; 82 | tsk_table_collection_t tables; 83 | 84 | if (argc != 6) { 85 | errx(EXIT_FAILURE, "usage: N T simplify-interval output-file seed"); 86 | } 87 | ret = tsk_table_collection_init(&tables, 0); 88 | check_tsk_error(ret); 89 | srand((unsigned) atoi(argv[5])); 90 | simulate(&tables, atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); 91 | ret = tsk_table_collection_dump(&tables, argv[4], 0); 92 | check_tsk_error(ret); 93 | 94 | tsk_table_collection_free(&tables); 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/streaming.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define check_tsk_error(val) \ 6 | if (val < 0) { \ 7 | fprintf(stderr, "Error: line %d: %s\n", __LINE__, tsk_strerror(val)); \ 8 | exit(EXIT_FAILURE); \ 9 | } 10 | 11 | int 12 | main(int argc, char **argv) 13 | { 14 | int ret; 15 | int j = 0; 16 | tsk_table_collection_t tables; 17 | 18 | ret = tsk_table_collection_init(&tables, 0); 19 | check_tsk_error(ret); 20 | 21 | while (true) { 22 | ret = tsk_table_collection_loadf(&tables, stdin, TSK_NO_INIT); 23 | if (ret == TSK_ERR_EOF) { 24 | break; 25 | } 26 | check_tsk_error(ret); 27 | fprintf(stderr, "Tree sequence %d had %lld mutations\n", j, 28 | (long long) tables.mutations.num_rows); 29 | ret = tsk_mutation_table_truncate(&tables.mutations, 0); 30 | check_tsk_error(ret); 31 | ret = tsk_table_collection_dumpf(&tables, stdout, 0); 32 | check_tsk_error(ret); 33 | j++; 34 | } 35 | tsk_table_collection_free(&tables); 36 | return EXIT_SUCCESS; 37 | } 38 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/take_ownership.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define check_tsk_error(val) \ 7 | if (val < 0) { \ 8 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 9 | } 10 | 11 | int 12 | main(int argc, char **argv) 13 | { 14 | tsk_table_collection_t *tables; 15 | tsk_treeseq_t treeseq; 16 | int rv; 17 | 18 | tables = malloc(sizeof(*tables)); 19 | rv = tsk_table_collection_init(tables, 0); 20 | check_tsk_error(rv); 21 | 22 | /* NOTE: you must set sequence 
length AFTER initialization */ 23 | tables->sequence_length = 1.0; 24 | 25 | /* Do your regular table operations */ 26 | rv = tsk_node_table_add_row(&tables->nodes, 0, 0.0, -1, -1, NULL, 0); 27 | check_tsk_error(rv); 28 | 29 | /* Initalize the tree sequence, transferring all responsibility 30 | * for the table collection's memory managment 31 | */ 32 | rv = tsk_treeseq_init( 33 | &treeseq, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); 34 | check_tsk_error(rv); 35 | 36 | /* WARNING: calling tsk_table_collection_free is now a memory error! */ 37 | tsk_treeseq_free(&treeseq); 38 | } 39 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/tree_iteration.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #define check_tsk_error(val) \ 8 | if (val < 0) { \ 9 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 10 | } 11 | 12 | int 13 | main(int argc, char **argv) 14 | { 15 | int ret; 16 | tsk_treeseq_t ts; 17 | tsk_tree_t tree; 18 | 19 | if (argc != 2) { 20 | errx(EXIT_FAILURE, "usage: "); 21 | } 22 | ret = tsk_treeseq_load(&ts, argv[1], 0); 23 | check_tsk_error(ret); 24 | ret = tsk_tree_init(&tree, &ts, 0); 25 | check_tsk_error(ret); 26 | 27 | printf("Iterate forwards\n"); 28 | for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { 29 | printf("\ttree %lld has %lld roots\n", (long long) tree.index, 30 | (long long) tsk_tree_get_num_roots(&tree)); 31 | } 32 | check_tsk_error(ret); 33 | 34 | printf("Iterate backwards\n"); 35 | for (ret = tsk_tree_last(&tree); ret == TSK_TREE_OK; ret = tsk_tree_prev(&tree)) { 36 | printf("\ttree %lld has %lld roots\n", (long long) tree.index, 37 | (long long) tsk_tree_get_num_roots(&tree)); 38 | } 39 | check_tsk_error(ret); 40 | 41 | tsk_tree_free(&tree); 42 | tsk_treeseq_free(&ts); 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/tree_traversal.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #define check_tsk_error(val) \ 8 | if (val < 0) { \ 9 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 10 | } 11 | 12 | static void 13 | traverse_standard(const tsk_tree_t *tree) 14 | { 15 | int ret; 16 | tsk_size_t num_nodes, j; 17 | tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); 18 | 19 | if (nodes == NULL) { 20 | errx(EXIT_FAILURE, "Out of memory"); 21 | } 22 | ret = tsk_tree_preorder(tree, nodes, &num_nodes); 23 | check_tsk_error(ret); 24 | for (j = 0; j < num_nodes; j++) { 25 | printf("Visit preorder %lld\n", (long long) nodes[j]); 26 | } 27 | 28 | ret = tsk_tree_postorder(tree, nodes, &num_nodes); 29 | check_tsk_error(ret); 30 | for (j = 0; j < num_nodes; j++) { 31 | printf("Visit postorder %lld\n", (long long) nodes[j]); 32 | } 33 | 34 | free(nodes); 35 | } 36 | 37 | static void 38 | _traverse(const tsk_tree_t *tree, tsk_id_t u, int depth) 39 | { 40 | tsk_id_t v; 41 | int j; 42 | 43 | for (j = 0; j < depth; j++) { 44 | printf(" "); 45 | } 46 | printf("Visit recursive %lld\n", (long long) u); 47 | for (v = tree->left_child[u]; v != TSK_NULL; v = tree->right_sib[v]) { 48 | _traverse(tree, v, depth + 1); 49 | } 50 | } 51 | 52 | static void 53 | traverse_recursive(const tsk_tree_t *tree) 54 | { 55 | _traverse(tree, 
tree->virtual_root, -1); 56 | } 57 | 58 | static void 59 | traverse_stack(const tsk_tree_t *tree) 60 | { 61 | int stack_top; 62 | tsk_id_t u, v; 63 | tsk_id_t *stack = malloc(tsk_tree_get_size_bound(tree) * sizeof(*stack)); 64 | 65 | if (stack == NULL) { 66 | errx(EXIT_FAILURE, "Out of memory"); 67 | } 68 | stack_top = 0; 69 | stack[stack_top] = tree->virtual_root; 70 | while (stack_top >= 0) { 71 | u = stack[stack_top]; 72 | stack_top--; 73 | printf("Visit stack %lld\n", (long long) u); 74 | /* Put nodes on the stack right-to-left, so we visit in left-to-right */ 75 | for (v = tree->right_child[u]; v != TSK_NULL; v = tree->left_sib[v]) { 76 | stack_top++; 77 | stack[stack_top] = v; 78 | } 79 | } 80 | free(stack); 81 | } 82 | 83 | static void 84 | traverse_upwards(const tsk_tree_t *tree) 85 | { 86 | const tsk_id_t *samples = tsk_treeseq_get_samples(tree->tree_sequence); 87 | tsk_size_t num_samples = tsk_treeseq_get_num_samples(tree->tree_sequence); 88 | tsk_size_t j; 89 | tsk_id_t u; 90 | 91 | for (j = 0; j < num_samples; j++) { 92 | u = samples[j]; 93 | while (u != TSK_NULL) { 94 | printf("Visit upwards: %lld\n", (long long) u); 95 | u = tree->parent[u]; 96 | } 97 | } 98 | } 99 | 100 | int 101 | main(int argc, char **argv) 102 | { 103 | int ret; 104 | tsk_treeseq_t ts; 105 | tsk_tree_t tree; 106 | 107 | if (argc != 2) { 108 | errx(EXIT_FAILURE, "usage: "); 109 | } 110 | ret = tsk_treeseq_load(&ts, argv[1], 0); 111 | check_tsk_error(ret); 112 | ret = tsk_tree_init(&tree, &ts, 0); 113 | check_tsk_error(ret); 114 | ret = tsk_tree_first(&tree); 115 | check_tsk_error(ret); 116 | 117 | traverse_standard(&tree); 118 | 119 | traverse_recursive(&tree); 120 | 121 | traverse_stack(&tree); 122 | 123 | traverse_upwards(&tree); 124 | 125 | tsk_tree_free(&tree); 126 | tsk_treeseq_free(&ts); 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/meson.build: -------------------------------------------------------------------------------- 1 | project('tskit', ['c', 'cpp'], 2 | version: files('VERSION.txt'), 3 | default_options: ['c_std=c99', 'cpp_std=c++11'] 4 | ) 5 | 6 | kastore_proj = subproject('kastore') 7 | kastore_dep = kastore_proj.get_variable('kastore_dep') 8 | kastore_inc = kastore_proj.get_variable('kastore_inc') 9 | 10 | cc = meson.get_compiler('c') 11 | m_dep = cc.find_library('m', required: false) 12 | lib_deps = [m_dep, kastore_dep] 13 | 14 | extra_c_args = [ 15 | '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W', 16 | '-Wmissing-prototypes', '-Wstrict-prototypes', 17 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 18 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 19 | '-fshort-enums', '-fno-common'] 20 | 21 | lib_sources = [ 22 | 'tskit/core.c', 'tskit/tables.c', 'tskit/trees.c', 23 | 'tskit/genotypes.c', 'tskit/stats.c', 'tskit/convert.c', 'tskit/haplotype_matching.c'] 24 | lib_headers = [ 25 | 'tskit/core.h', 'tskit/tables.h', 'tskit/trees.h', 26 | 'tskit/genotypes.h', 'tskit/stats.h', 'tskit/convert.h', 'tskit/haplotype_matching.h'] 27 | 28 | # Subprojects use the static library for simplicity. 29 | tskit_inc = [kastore_inc, include_directories(['.'])] 30 | tskit_lib = static_library('tskit', 31 | sources: lib_sources, dependencies: lib_deps) 32 | tskit_dep = declare_dependency(include_directories:tskit_inc, link_with: tskit_lib) 33 | 34 | if not meson.is_subproject() 35 | 36 | # Shared library install target. 
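# (This install target, and the test executables configured below, are only
# set up when tskit is built as the top-level project; when used as a
# subproject, only the static library declared above is consumed.)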
37 | shared_library('tskit', 38 | sources: lib_sources, dependencies: lib_deps, c_args: extra_c_args, install: true) 39 | install_headers('tskit.h') 40 | install_headers(lib_headers, subdir: 'tskit') 41 | 42 | cunit_dep = dependency('cunit') 43 | # We don't specify extra C args here as CUnit won't pass the checks. 44 | test_lib = static_library('testlib', 45 | sources: ['tests/testlib.c'], dependencies: [cunit_dep, kastore_dep, tskit_dep]) 46 | 47 | test_core = executable('test_core', 48 | sources: ['tests/test_core.c'], 49 | link_with: [tskit_lib, test_lib], 50 | c_args: extra_c_args+['-DMESON_PROJECT_VERSION="@0@"'.format(meson.project_version())], 51 | dependencies: kastore_dep, 52 | ) 53 | test('core', test_core) 54 | 55 | test_tables = executable('test_tables', 56 | sources: ['tests/test_tables.c'], 57 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 58 | test('tables', test_tables) 59 | 60 | test_trees = executable('test_trees', 61 | sources: ['tests/test_trees.c'], 62 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 63 | test('trees', test_trees) 64 | 65 | test_genotypes = executable('test_genotypes', 66 | sources: ['tests/test_genotypes.c'], 67 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 68 | test('genotypes', test_genotypes) 69 | 70 | test_convert = executable('test_convert', 71 | sources: ['tests/test_convert.c'], 72 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 73 | test('convert', test_convert) 74 | 75 | test_stats = executable('test_stats', 76 | sources: ['tests/test_stats.c'], 77 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 78 | test('stats', test_stats) 79 | 80 | test_haplotype_matching = executable('test_haplotype_matching', 81 | sources: ['tests/test_haplotype_matching.c'], 82 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 83 | test('haplotype_matching', test_haplotype_matching) 84 | 85 | test_file_format = executable('test_file_format', 86 | sources: ['tests/test_file_format.c'], 87 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 88 | test('file_format', test_file_format) 89 | 90 | test_minimal_cpp = executable('test_minimal_cpp', 91 | sources: ['tests/test_minimal_cpp.cpp'], link_with: [tskit_lib], 92 | dependencies: kastore_dep) 93 | test('minimal_cpp', test_minimal_cpp) 94 | 95 | if get_option('build_examples') 96 | # These example programs use less portable features, 97 | # and we don't want to always compile them. 
Use, e.g., 98 | # meson build -Dbuild_examples=false 99 | executable('api_structure', 100 | sources: ['examples/api_structure.c'], 101 | link_with: [tskit_lib], dependencies: lib_deps) 102 | executable('error_handling', 103 | sources: ['examples/error_handling.c'], 104 | link_with: [tskit_lib], dependencies: lib_deps) 105 | executable('tree_iteration', 106 | sources: ['examples/tree_iteration.c'], 107 | link_with: [tskit_lib], dependencies: lib_deps) 108 | executable('tree_traversal', 109 | sources: ['examples/tree_traversal.c'], 110 | link_with: [tskit_lib], dependencies: lib_deps) 111 | executable('streaming', 112 | sources: ['examples/streaming.c'], 113 | link_with: [tskit_lib], dependencies: lib_deps) 114 | executable('cpp_sorting_example', 115 | sources: ['examples/cpp_sorting_example.cpp'], 116 | link_with: [tskit_lib], dependencies: lib_deps) 117 | executable('haploid_wright_fisher', 118 | sources: ['examples/haploid_wright_fisher.c'], 119 | link_with: [tskit_lib], dependencies: lib_deps) 120 | endif 121 | endif 122 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/meson_options.txt: -------------------------------------------------------------------------------- 1 | option('build_examples', type : 'boolean', value : true) 2 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/subprojects/kastore/README.md: -------------------------------------------------------------------------------- 1 | This directory is an abbreviated version of the kastore distribution source. 2 | 3 | All files should be updated when we are updating to a new kastore version. 4 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/subprojects/kastore/VERSION.txt: -------------------------------------------------------------------------------- 1 | 2.1.1 2 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/subprojects/kastore/meson.build: -------------------------------------------------------------------------------- 1 | project('kastore', ['c', 'cpp'], 2 | version: files('VERSION.txt'), 3 | default_options: [ 4 | 'c_std=c99', 5 | 'cpp_std=c++11', 6 | 'warning_level=3', 7 | 'werror=true']) 8 | 9 | if not meson.is_subproject() 10 | add_global_arguments([ 11 | '-W', '-Wmissing-prototypes', '-Wstrict-prototypes', 12 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 13 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 14 | '-fshort-enums', '-fno-common'], language : 'c') 15 | endif 16 | 17 | # Subprojects should compile in the static library for simplicity. 18 | kastore_inc = include_directories('.') 19 | kastore = static_library('kastore', 'kastore.c') 20 | kastore_dep = declare_dependency(link_with : kastore, include_directories: kastore_inc) 21 | 22 | if not meson.is_subproject() 23 | 24 | # The shared library can be installed into the system. 25 | install_headers('kastore.h') 26 | shared_library('kastore', 'kastore.c', install: true) 27 | executable('example', ['example.c'], link_with: kastore) 28 | 29 | # Note: we don't declare these as meson tests because they depend on 30 | # being run from the current working directory because of the paths 31 | # to example files. 
32 | cunit_dep = dependency('cunit') 33 | executable('tests', ['tests.c', 'kastore.c'], dependencies: cunit_dep, 34 | c_args: ['-DMESON_VERSION="@0@"'.format(meson.project_version())]) 35 | 36 | executable('cpp_tests', ['cpp_tests.cpp'], link_with: kastore) 37 | 38 | executable('malloc_tests', ['malloc_tests.c', 'kastore.c'], 39 | dependencies: cunit_dep, 40 | link_args:['-Wl,--wrap=malloc', '-Wl,--wrap=realloc', '-Wl,--wrap=calloc']) 41 | 42 | executable('io_tests', ['io_tests.c', 'kastore.c'], 43 | dependencies: cunit_dep, 44 | link_args:[ 45 | '-Wl,--wrap=fwrite', 46 | '-Wl,--wrap=fread', 47 | '-Wl,--wrap=fclose', 48 | '-Wl,--wrap=ftell', 49 | '-Wl,--wrap=fseek']) 50 | endif 51 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tests/test_convert.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2022 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 23 | */ 24 | 25 | #include "testlib.h" 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | static void 32 | test_single_tree_newick(void) 33 | { 34 | int ret; 35 | tsk_treeseq_t ts; 36 | tsk_tree_t t; 37 | size_t buffer_size = 1024; 38 | char newick[buffer_size]; 39 | 40 | tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, 41 | NULL, NULL, NULL, 0); 42 | 43 | ret = tsk_tree_init(&t, &ts, 0); 44 | CU_ASSERT_EQUAL_FATAL(ret, 0) 45 | ret = tsk_tree_first(&t); 46 | CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK) 47 | 48 | ret = tsk_convert_newick(&t, 0, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick); 49 | CU_ASSERT_EQUAL_FATAL(ret, 0); 50 | /* Seems odd, but this is what a single node newick tree looks like. 
51 | * Newick parsers seems to accept it in any case */ 52 | CU_ASSERT_STRING_EQUAL(newick, "1;"); 53 | 54 | ret = tsk_convert_newick(&t, 0, 0, 0, buffer_size, newick); 55 | CU_ASSERT_EQUAL_FATAL(ret, 0); 56 | CU_ASSERT_STRING_EQUAL(newick, "n0;"); 57 | 58 | ret = tsk_convert_newick(&t, 4, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick); 59 | CU_ASSERT_EQUAL_FATAL(ret, 0); 60 | CU_ASSERT_STRING_EQUAL(newick, "(1:1,2:1);"); 61 | ret = tsk_convert_newick(&t, 4, 0, 0, buffer_size, newick); 62 | CU_ASSERT_EQUAL_FATAL(ret, 0); 63 | CU_ASSERT_STRING_EQUAL(newick, "(n0:1,n1:1);"); 64 | 65 | ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick); 66 | CU_ASSERT_EQUAL_FATAL(ret, 0); 67 | CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);"); 68 | 69 | ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick); 70 | CU_ASSERT_EQUAL_FATAL(ret, 0); 71 | CU_ASSERT_STRING_EQUAL(newick, "((n0:1,n1:1):2,(n2:2,n3:2):1);"); 72 | 73 | tsk_tree_free(&t); 74 | tsk_treeseq_free(&ts); 75 | } 76 | 77 | static void 78 | test_single_tree_newick_errors(void) 79 | { 80 | int ret; 81 | tsk_treeseq_t ts; 82 | tsk_tree_t t; 83 | size_t j, len; 84 | size_t buffer_size = 1024; 85 | char newick[buffer_size]; 86 | 87 | tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, 88 | NULL, NULL, NULL, 0); 89 | 90 | ret = tsk_tree_init(&t, &ts, 0); 91 | CU_ASSERT_EQUAL_FATAL(ret, 0) 92 | ret = tsk_tree_first(&t); 93 | CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK) 94 | 95 | ret = tsk_convert_newick(&t, -1, 1, 0, buffer_size, newick); 96 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); 97 | ret = tsk_convert_newick(&t, 7, 1, 0, buffer_size, newick); 98 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); 99 | 100 | ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, NULL); 101 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); 102 | ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick); 103 | CU_ASSERT_EQUAL_FATAL(ret, 0); 104 | len = 1 + strlen(newick); 105 | for (j = 0; j < len; j++) { 106 | ret = tsk_convert_newick(&t, 6, 0, 0, j, newick); 107 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BUFFER_OVERFLOW); 108 | } 109 | ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, len, newick); 110 | 111 | CU_ASSERT_EQUAL_FATAL(ret, 0); 112 | CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);"); 113 | 114 | tsk_tree_free(&t); 115 | tsk_treeseq_free(&ts); 116 | } 117 | 118 | int 119 | main(int argc, char **argv) 120 | { 121 | CU_TestInfo tests[] = { 122 | { "test_single_tree_newick", test_single_tree_newick }, 123 | { "test_single_tree_newick_errors", test_single_tree_newick_errors }, 124 | { NULL, NULL }, 125 | }; 126 | return test_main(tests, argc, argv); 127 | } 128 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tests/test_minimal_cpp.cpp: -------------------------------------------------------------------------------- 1 | /* * MIT License 2 | * 3 | * Copyright (c) 2019-2022 Tskit Developers 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above 
copyright notice and this permission notice shall be included in all 13 | * copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | * SOFTWARE. 22 | */ 23 | 24 | /* Minimal tests to make sure that tskit at least compiles and links 25 | * in a simple C++ program */ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | using namespace std; 37 | 38 | void 39 | test_kas_strerror() 40 | { 41 | std::cout << "test_kas_strerror" << endl; 42 | std::ostringstream o; 43 | o << kas_strerror(KAS_ERR_NO_MEMORY); 44 | assert(std::string("Out of memory").compare(o.str()) == 0); 45 | } 46 | 47 | void 48 | test_strerror() 49 | { 50 | std::cout << "test_strerror" << endl; 51 | std::ostringstream o; 52 | o << tsk_strerror(TSK_ERR_NO_MEMORY); 53 | assert(std::string("Out of memory. (TSK_ERR_NO_MEMORY)").compare(o.str()) == 0); 54 | } 55 | 56 | void 57 | test_load_error() 58 | { 59 | std::cout << "test_open_error" << endl; 60 | tsk_treeseq_t ts; 61 | int ret = tsk_treeseq_load(&ts, "no such file", 0); 62 | assert(ret == TSK_ERR_IO); 63 | tsk_treeseq_free(&ts); 64 | } 65 | 66 | void 67 | test_table_basics() 68 | { 69 | std::cout << "test_table_basics" << endl; 70 | tsk_table_collection_t tables; 71 | int ret = tsk_table_collection_init(&tables, 0); 72 | assert(ret == 0); 73 | 74 | ret = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0); 75 | assert(ret == 0); 76 | ret = tsk_node_table_add_row(&tables.nodes, 0, 2.0, TSK_NULL, TSK_NULL, NULL, 0); 77 | assert(ret == 1); 78 | assert(tables.nodes.num_rows == 2); 79 | 80 | tsk_table_collection_free(&tables); 81 | } 82 | 83 | /* A definition of sort_edges that uses C++ std::sort and inlining of the 84 | * comparison function to achieve significantly better performance than 85 | * the builtin method in tskit. 86 | */ 87 | int 88 | cpp_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) 89 | { 90 | struct _edge { 91 | double left, right; 92 | tsk_id_t parent, child; 93 | 94 | _edge(double l, double r, tsk_id_t p, tsk_id_t c) 95 | : left{ l }, right{ r }, parent{ p }, child{ c } 96 | { 97 | } 98 | }; 99 | tsk_edge_table_t *edges = &sorter->tables->edges; 100 | const double *node_time = sorter->tables->nodes.time; 101 | std::vector<_edge> sorted_edges; 102 | size_t num_edges = edges->num_rows; 103 | size_t j; 104 | 105 | /* This is the comparison function. We cannot define an 106 | * operator < for _edge because we need to bind the node times 107 | * so we have to use a functional method. This is a copy of the cmp 108 | * from fwdpp. Only difference is the final time comparison 109 | * (fwdpp table times go forwards). 
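* The resulting sort key is (parent time, parent id, child id, left
* coordinate), ascending, which is the edge ordering that
* tsk_table_collection_check_integrity verifies under
* TSK_CHECK_EDGE_ORDERING in test_edge_sorting below.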
*/ 110 | const auto cmp = [&node_time](const _edge &lhs, const _edge &rhs) { 111 | auto tl = node_time[lhs.parent]; 112 | auto tr = node_time[rhs.parent]; 113 | if (tl == tr) { 114 | if (lhs.parent == rhs.parent) { 115 | if (lhs.child == rhs.child) { 116 | return lhs.left < rhs.left; 117 | } 118 | return lhs.child < rhs.child; 119 | } 120 | return lhs.parent < rhs.parent; 121 | } 122 | return tl < tr; 123 | }; 124 | 125 | assert(start == 0); 126 | /* Let's not bother with metadata */ 127 | assert(edges->metadata_length == 0); 128 | 129 | sorted_edges.reserve(num_edges); 130 | for (j = 0; j < num_edges; j++) { 131 | sorted_edges.emplace_back( 132 | edges->left[j], edges->right[j], edges->parent[j], edges->child[j]); 133 | } 134 | 135 | std::sort(begin(sorted_edges), end(sorted_edges), cmp); 136 | 137 | for (j = 0; j < num_edges; j++) { 138 | edges->left[j] = sorted_edges[j].left; 139 | edges->right[j] = sorted_edges[j].right; 140 | edges->parent[j] = sorted_edges[j].parent; 141 | edges->child[j] = sorted_edges[j].child; 142 | } 143 | return 0; 144 | } 145 | 146 | void 147 | test_edge_sorting() 148 | { 149 | std::cout << "test_edge_sorting" << endl; 150 | tsk_table_collection_t tables; 151 | tsk_id_t n = 10; 152 | tsk_id_t j; 153 | int ret = tsk_table_collection_init(&tables, 0); 154 | assert(ret == 0); 155 | 156 | tables.sequence_length = 1.0; 157 | /* Make a stick tree */ 158 | /* Add nodes and edges */ 159 | for (j = 0; j < n; j++) { 160 | ret = tsk_node_table_add_row( 161 | &tables.nodes, TSK_NODE_IS_SAMPLE, j + 1, TSK_NULL, TSK_NULL, NULL, 0); 162 | assert(ret == j); 163 | } 164 | for (j = n - 1; j > 0; j--) { 165 | tsk_edge_table_add_row(&tables.edges, 0, 1, j, j - 1, NULL, 0); 166 | } 167 | assert(tables.nodes.num_rows == (tsk_size_t) n); 168 | assert(tables.edges.num_rows == (tsk_size_t) n - 1); 169 | 170 | /* Make sure the edges are unsorted */ 171 | /* Not calling TSK_CHECK_TREES so casting is safe */ 172 | ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING); 173 | assert(ret == TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME); 174 | 175 | /* Sort the tables */ 176 | tsk_table_sorter_t sorter; 177 | ret = tsk_table_sorter_init(&sorter, &tables, 0); 178 | assert(ret == 0); 179 | /* Set the sort_edges to our local C++ version. We could also set some 180 | * persistent state in sorter.params if we wanted to. */ 181 | sorter.sort_edges = cpp_sort_edges; 182 | ret = tsk_table_sorter_run(&sorter, NULL); 183 | assert(ret == 0); 184 | tsk_table_sorter_free(&sorter); 185 | 186 | /* Make sure the edges are now sorted */ 187 | ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING); 188 | assert(ret == 0); 189 | 190 | tsk_table_collection_free(&tables); 191 | } 192 | 193 | int 194 | sort_edges_raises_exception(tsk_table_sorter_t *sorter, tsk_size_t start) 195 | { 196 | throw std::exception(); 197 | return 0; 198 | } 199 | 200 | int 201 | sort_edges_raises_non_exception(tsk_table_sorter_t *sorter, tsk_size_t start) 202 | { 203 | throw 42; 204 | return 0; 205 | } 206 | 207 | int 208 | safe_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) 209 | { 210 | int ret = 0; 211 | if (sorter->user_data == NULL) { 212 | try { 213 | ret = sort_edges_raises_exception(sorter, start); 214 | } catch (...) { 215 | ret = -12345; 216 | } 217 | } else { 218 | try { 219 | ret = sort_edges_raises_non_exception(sorter, start); 220 | } catch (...) 
{ 221 | ret = -12346; 222 | } 223 | } 224 | return ret; 225 | } 226 | 227 | void 228 | test_edge_sorting_errors() 229 | { 230 | /* Some inexplicable error happened here on 32 bit Windows where the 231 | * exceptions were not being caught as expected. This seems much 232 | * more likely to be a platform quirk that a real bug in our code, 233 | * so just disabling the test there. 234 | * 235 | * https://github.com/tskit-dev/tskit/issues/1790 236 | * https://github.com/tskit-dev/tskit/pull/1791 237 | */ 238 | #if !defined(_WIN32) 239 | std::cout << "test_edge_sorting_errors" << endl; 240 | tsk_table_collection_t tables; 241 | tsk_table_sorter_t sorter; 242 | tsk_id_t ret = tsk_table_collection_init(&tables, 0); 243 | 244 | assert(ret == 0); 245 | tables.sequence_length = 1.0; 246 | 247 | ret = tsk_table_sorter_init(&sorter, &tables, 0); 248 | assert(ret == 0); 249 | sorter.sort_edges = safe_sort_edges; 250 | ret = tsk_table_sorter_run(&sorter, NULL); 251 | assert(ret == -12345); 252 | 253 | /* Use the user_data as a way to communicate with the sorter 254 | * function. Here, we want to try out two different types 255 | * of exception that get thrown. */ 256 | sorter.user_data = &tables; 257 | ret = tsk_table_sorter_run(&sorter, NULL); 258 | assert(ret == -12346); 259 | 260 | tsk_table_sorter_free(&sorter); 261 | tsk_table_collection_free(&tables); 262 | #endif 263 | } 264 | 265 | int 266 | main() 267 | { 268 | test_kas_strerror(); 269 | test_strerror(); 270 | test_load_error(); 271 | test_table_basics(); 272 | test_edge_sorting(); 273 | test_edge_sorting_errors(); 274 | return 0; 275 | } 276 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tests/testlib.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2021 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | */ 24 | 25 | #ifndef __TESTLIB_H__ 26 | #define __TESTLIB_H__ 27 | 28 | #define _GNU_SOURCE 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | #include 35 | 36 | /* Global variables used in the test suite */ 37 | 38 | extern char *_tmp_file_name; 39 | extern FILE *_devnull; 40 | 41 | int test_main(CU_TestInfo *tests, int argc, char **argv); 42 | 43 | void tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nodes, 44 | const char *edges, const char *migrations, const char *sites, const char *mutations, 45 | const char *individuals, const char *provenance, tsk_flags_t tc_options); 46 | tsk_treeseq_t *caterpillar_tree( 47 | tsk_size_t num_samples, tsk_size_t num_sites, tsk_size_t num_mutations); 48 | 49 | void parse_nodes(const char *text, tsk_node_table_t *node_table); 50 | void parse_edges(const char *text, tsk_edge_table_t *edge_table); 51 | void parse_sites(const char *text, tsk_site_table_t *site_table); 52 | void parse_mutations(const char *text, tsk_mutation_table_t *mutation_table); 53 | void parse_individuals(const char *text, tsk_individual_table_t *individual_table); 54 | 55 | void unsort_edges(tsk_edge_table_t *edges, size_t start); 56 | 57 | extern const char *single_tree_ex_nodes; 58 | extern const char *single_tree_ex_edges; 59 | extern const char *single_tree_ex_sites; 60 | extern const char *single_tree_ex_mutations; 61 | 62 | extern const char *multiple_tree_ex_nodes; 63 | extern const char *multiple_tree_ex_edges; 64 | 65 | extern const char *odd_tree1_ex_nodes; 66 | extern const char *odd_tree1_ex_edges; 67 | 68 | extern const char *multi_root_tree_ex_nodes; 69 | extern const char *multi_root_tree_ex_edges; 70 | 71 | extern const char *multi_path_tree_ex_nodes; 72 | extern const char *multi_path_tree_ex_edges; 73 | 74 | extern const char *nonbinary_ex_nodes; 75 | extern const char *nonbinary_ex_edges; 76 | extern const char *nonbinary_ex_sites; 77 | extern const char *nonbinary_ex_mutations; 78 | 79 | extern const char *unary_ex_nodes; 80 | extern const char *unary_ex_edges; 81 | extern const char *unary_ex_sites; 82 | extern const char *unary_ex_mutations; 83 | 84 | extern const char *internal_sample_ex_nodes; 85 | extern const char *internal_sample_ex_edges; 86 | extern const char *internal_sample_ex_sites; 87 | extern const char *internal_sample_ex_mutations; 88 | 89 | extern const char *multiroot_ex_nodes; 90 | extern const char *multiroot_ex_edges; 91 | extern const char *multiroot_ex_sites; 92 | extern const char *multiroot_ex_mutations; 93 | 94 | extern const char *empty_ex_nodes; 95 | extern const char *empty_ex_edges; 96 | 97 | extern const char *paper_ex_nodes; 98 | extern const char *paper_ex_edges; 99 | extern const char *paper_ex_sites; 100 | extern const char *paper_ex_mutations; 101 | extern const char *paper_ex_individuals; 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software 
is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 23 | */ 24 | 25 | /** 26 | * @file tskit.h 27 | * @brief Tskit API. 28 | */ 29 | #ifndef __TSKIT_H__ 30 | #define __TSKIT_H__ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/convert.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2018-2021 Tskit Developers 5 | * Copyright (c) 2015-2017 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | 34 | /* ======================================================== * 35 | * Newick output. 36 | * ======================================================== */ 37 | 38 | /* This infrastructure is left-over from an earlier more complex version 39 | * of this algorithm that worked over a tree sequence and cached the newick 40 | * subtrees, updating according to diffs. It's unclear whether this complexity 41 | * was of any real-world use, since newick output for large trees is pretty 42 | * pointless. 
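 For reference, a minimal sketch of how the public entry point at the bottom of
 this file is typically called (hypothetical buffer size and root id; error
 handling omitted):

     char buf[8192];
     int ret = tsk_convert_newick(&tree, root, 6, 0, sizeof(buf), buf);

 A TSK_ERR_BUFFER_OVERFLOW return indicates that the caller should retry with a
 larger buffer.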
*/ 43 | 44 | typedef struct { 45 | unsigned int precision; 46 | tsk_flags_t options; 47 | char *newick; 48 | tsk_id_t *traversal_stack; 49 | const tsk_tree_t *tree; 50 | } tsk_newick_converter_t; 51 | 52 | static int 53 | tsk_newick_converter_run( 54 | tsk_newick_converter_t *self, tsk_id_t root, size_t buffer_size, char *buffer) 55 | { 56 | int ret = TSK_ERR_GENERIC; 57 | const tsk_tree_t *tree = self->tree; 58 | tsk_id_t *stack = self->traversal_stack; 59 | const double *time = self->tree->tree_sequence->tables->nodes.time; 60 | const tsk_flags_t *flags = self->tree->tree_sequence->tables->nodes.flags; 61 | int stack_top = 0; 62 | int label; 63 | size_t s = 0; 64 | int r; 65 | tsk_id_t u, v, w, root_parent; 66 | double branch_length; 67 | bool ms_labels = self->options & TSK_NEWICK_LEGACY_MS_LABELS; 68 | const char *label_format = ms_labels ? "%d" : "n%d"; 69 | 70 | if (root < 0 || root >= (tsk_id_t) self->tree->num_nodes) { 71 | ret = TSK_ERR_NODE_OUT_OF_BOUNDS; 72 | goto out; 73 | } 74 | if (buffer == NULL) { 75 | ret = TSK_ERR_BAD_PARAM_VALUE; 76 | goto out; 77 | } 78 | root_parent = tree->parent[root]; 79 | stack[0] = root; 80 | u = root_parent; 81 | while (stack_top >= 0) { 82 | v = stack[stack_top]; 83 | if (tree->left_child[v] != TSK_NULL && v != u) { 84 | if (s >= buffer_size) { 85 | ret = TSK_ERR_BUFFER_OVERFLOW; 86 | goto out; 87 | } 88 | buffer[s] = '('; 89 | s++; 90 | for (w = tree->right_child[v]; w != TSK_NULL; w = tree->left_sib[w]) { 91 | stack_top++; 92 | stack[stack_top] = w; 93 | } 94 | } else { 95 | u = tree->parent[v]; 96 | stack_top--; 97 | label = -1; 98 | if (ms_labels) { 99 | if (tree->left_child[v] == TSK_NULL) { 100 | label = (int) v + 1; 101 | } 102 | } else if (flags[v] & TSK_NODE_IS_SAMPLE) { 103 | label = (int) v; 104 | } 105 | if (label != -1) { 106 | if (s >= buffer_size) { 107 | ret = TSK_ERR_BUFFER_OVERFLOW; 108 | goto out; 109 | } 110 | r = snprintf(buffer + s, buffer_size - s, label_format, label); 111 | if (r < 0) { 112 | ret = TSK_ERR_IO; 113 | goto out; 114 | } 115 | s += (size_t) r; 116 | if (s >= buffer_size) { 117 | ret = TSK_ERR_BUFFER_OVERFLOW; 118 | goto out; 119 | } 120 | } 121 | if (u != root_parent) { 122 | branch_length = (time[u] - time[v]); 123 | r = snprintf(buffer + s, buffer_size - s, ":%.*f", (int) self->precision, 124 | branch_length); 125 | if (r < 0) { 126 | ret = TSK_ERR_IO; 127 | goto out; 128 | } 129 | s += (size_t) r; 130 | if (s >= buffer_size) { 131 | ret = TSK_ERR_BUFFER_OVERFLOW; 132 | goto out; 133 | } 134 | if (v == tree->right_child[u]) { 135 | buffer[s] = ')'; 136 | } else { 137 | buffer[s] = ','; 138 | } 139 | s++; 140 | } 141 | } 142 | } 143 | if ((s + 1) >= buffer_size) { 144 | ret = TSK_ERR_BUFFER_OVERFLOW; 145 | goto out; 146 | } 147 | buffer[s] = ';'; 148 | buffer[s + 1] = '\0'; 149 | ret = 0; 150 | out: 151 | return ret; 152 | } 153 | 154 | static int 155 | tsk_newick_converter_init(tsk_newick_converter_t *self, const tsk_tree_t *tree, 156 | unsigned int precision, tsk_flags_t options) 157 | { 158 | int ret = 0; 159 | 160 | tsk_memset(self, 0, sizeof(tsk_newick_converter_t)); 161 | self->precision = precision; 162 | self->options = options; 163 | self->tree = tree; 164 | self->traversal_stack 165 | = tsk_malloc(tsk_tree_get_size_bound(tree) * sizeof(*self->traversal_stack)); 166 | if (self->traversal_stack == NULL) { 167 | ret = TSK_ERR_NO_MEMORY; 168 | goto out; 169 | } 170 | out: 171 | return ret; 172 | } 173 | 174 | static int 175 | tsk_newick_converter_free(tsk_newick_converter_t *self) 176 | { 177 | 
tsk_safe_free(self->traversal_stack); 178 | return 0; 179 | } 180 | 181 | int 182 | tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, 183 | tsk_flags_t options, size_t buffer_size, char *buffer) 184 | { 185 | int ret = 0; 186 | tsk_newick_converter_t nc; 187 | 188 | ret = tsk_newick_converter_init(&nc, tree, precision, options); 189 | if (ret != 0) { 190 | goto out; 191 | } 192 | ret = tsk_newick_converter_run(&nc, root, buffer_size, buffer); 193 | out: 194 | tsk_newick_converter_free(&nc); 195 | return ret; 196 | } 197 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/convert.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2018-2021 Tskit Developers 5 | * Copyright (c) 2015-2017 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #ifndef TSK_CONVERT_H 27 | #define TSK_CONVERT_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #include 34 | 35 | #define TSK_NEWICK_LEGACY_MS_LABELS (1 << 0) 36 | 37 | int tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, 38 | tsk_flags_t options, size_t buffer_size, char *buffer); 39 | 40 | #ifdef __cplusplus 41 | } 42 | #endif 43 | #endif 44 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/genotypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2022 Tskit Developers 5 | * Copyright (c) 2016-2018 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 
16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #ifndef TSK_GENOTYPES_H 27 | #define TSK_GENOTYPES_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #include 34 | 35 | #define TSK_ISOLATED_NOT_MISSING (1 << 1) 36 | 37 | /** 38 | @brief A variant at a specific site. 39 | 40 | @rst 41 | Used to generate the genotypes for a given set of samples at a given 42 | site. 43 | @endrst 44 | */ 45 | typedef struct { 46 | /** @brief Unowned reference to the tree sequence of the variant */ 47 | const tsk_treeseq_t *tree_sequence; 48 | /** @brief The site this variant is currently decoded at*/ 49 | tsk_site_t site; 50 | tsk_tree_t tree; 51 | /** @brief Array of allele strings that the genotypes of the variant refer to 52 | * These are not NULL terminated - use `allele_lengths` for example:. 53 | * `printf("%.*s", (int) var->allele_lengths[j], var->alleles[j]);` 54 | */ 55 | const char **alleles; 56 | /** @brief Lengths of the allele strings */ 57 | tsk_size_t *allele_lengths; 58 | /** @brief Length of the allele array */ 59 | tsk_size_t num_alleles; 60 | tsk_size_t max_alleles; 61 | /** @brief If True the genotypes of isolated nodes have been decoded to the "missing" 62 | * genotype. If False they are set to the ancestral state (in the absence of 63 | * mutations above them)*/ 64 | bool has_missing_data; 65 | /** @brief Array of genotypes for the current site */ 66 | int32_t *genotypes; 67 | /** @brief Number of samples */ 68 | tsk_size_t num_samples; 69 | /** @brief Array of sample ids used*/ 70 | tsk_id_t *samples; 71 | 72 | const tsk_id_t *sample_index_map; 73 | bool user_alleles; 74 | char *user_alleles_mem; 75 | tsk_id_t *traversal_stack; 76 | tsk_flags_t options; 77 | tsk_id_t *alt_samples; 78 | tsk_id_t *alt_sample_index_map; 79 | 80 | } tsk_variant_t; 81 | 82 | /* All vargen related structs and methods were deprecated in C API v1.0 */ 83 | typedef struct { 84 | const tsk_treeseq_t *tree_sequence; 85 | tsk_id_t site_index; 86 | tsk_variant_t variant; 87 | } tsk_vargen_t; 88 | 89 | /** 90 | @defgroup VARIANT_API_GROUP Variant API for obtaining genotypes. 91 | @{ 92 | */ 93 | 94 | /** 95 | @brief Initialises the variant by allocating the internal memory 96 | 97 | @rst 98 | This must be called before any operations are performed on the variant. 99 | See the :ref:`sec_c_api_overview_structure` for details on how objects 100 | are initialised and freed. 101 | @endrst 102 | 103 | @param self A pointer to an uninitialised tsk_variant_t object. 104 | @param tree_sequence A pointer to the tree sequence from which this variant 105 | will decode genotypes. No copy is taken, so this tree sequence must persist 106 | for the lifetime of the variant. 107 | @param samples Optional. Either `NULL` or an array of node ids of the samples that are to 108 | have their genotypes decoded. A copy of this array will be taken by the variant. If 109 | `NULL` then the samples from the tree sequence will be used. 
110 | @param num_samples The number of ids in the samples array, ignored if `samples` is `NULL` 111 | @param alleles Optional. Either ``NULL`` or an array of string alleles with a terminal 112 | ``NULL`` sentinel value. 113 | If specified, the genotypes will be decoded to match the index in this allele array. 114 | If ``NULL`` then alleles will be automatically determined from the mutations encountered. 115 | @param options Variant options. Either ``0`` or ``TSK_ISOLATED_NOT_MISSING`` which 116 | if specified indicates that isolated sample nodes should not be decoded as the "missing" 117 | state but as the ancestral state (or the state of any mutation above them). 118 | @return Return 0 on success or a negative value on failure. 119 | */ 120 | int tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, 121 | const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, 122 | tsk_flags_t options); 123 | 124 | /** 125 | @brief Copies the state of this variant to another variant 126 | 127 | @rst 128 | Copies the site, genotypes and alleles from this variant to another. Note that 129 | the other variant should be uninitialised as this method does not free any 130 | memory that the other variant owns. After copying `other` is frozen and 131 | this restricts it from being further decoded at any site. `self` remains unchanged. 132 | @endrst 133 | 134 | @param self A pointer to an initialised and decoded tsk_variant_t object. 135 | @param other A pointer to an uninitialised tsk_variant_t object. 136 | @return Return 0 on success or a negative value on failure. 137 | */ 138 | int tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other); 139 | 140 | /** 141 | @brief Decode the genotypes at the given site, storing them in this variant. 142 | 143 | @rst 144 | Decodes the genotypes for this variant's samples, indexed to this variant's alleles, 145 | at the specified site. 146 | This method is most efficient at decoding sites in-order, either forwards or backwards 147 | along the tree sequence. Resulting genotypes are stored in the ``genotypes`` member of 148 | this variant. 149 | @endrst 150 | 151 | @param self A pointer to an initialised tsk_variant_t object. 152 | @param site_id A valid site id for the tree sequence of this variant. 153 | @param options Bitwise option flags. Currently unused; should be 154 | set to zero to ensure compatibility with later versions of `tskit`. 155 | @return Return 0 on success or a negative value on failure. 156 | */ 157 | int tsk_variant_decode(tsk_variant_t *self, tsk_id_t site_id, tsk_flags_t options); 158 | 159 | /** 160 | @brief Free the internal memory for the specified variant. 161 | 162 | @param self A pointer to an initialised tsk_variant_t object. 163 | @return Always returns 0. 164 | */ 165 | int tsk_variant_free(tsk_variant_t *self); 166 | 167 | /** 168 | @brief Print out the state of this variant to the specified stream. 169 | 170 | This method is intended for debugging purposes and should not be used 171 | in production code. The format of the output should **not** be depended 172 | on and may change arbitrarily between versions. 173 | 174 | @param self A pointer to a tsk_variant_t object. 175 | @param out The stream to write the summary to. 
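
A minimal sketch of the overall variant workflow documented in this group
(names such as ts and do_something_with are hypothetical; each return value
should be checked for 0 in real code):

    tsk_variant_t var;
    tsk_id_t j;
    int ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0);
    for (j = 0; j < (tsk_id_t) tsk_treeseq_get_num_sites(&ts); j++) {
        ret = tsk_variant_decode(&var, j, 0);
        do_something_with(var.genotypes, var.num_alleles);
    }
    tsk_variant_free(&var);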
176 | */ 177 | void tsk_variant_print_state(const tsk_variant_t *self, FILE *out); 178 | 179 | /** @} */ 180 | 181 | /* Deprecated vargen methods (since C API v1.0) */ 182 | int tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, 183 | const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, 184 | tsk_flags_t options); 185 | int tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant); 186 | int tsk_vargen_free(tsk_vargen_t *self); 187 | void tsk_vargen_print_state(const tsk_vargen_t *self, FILE *out); 188 | 189 | #ifdef __cplusplus 190 | } 191 | #endif 192 | #endif 193 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/haplotype_matching.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2022 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | */ 24 | 25 | #ifndef TSK_HAPLOTYPE_MATCHING_H 26 | #define TSK_HAPLOTYPE_MATCHING_H 27 | 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | #include 33 | 34 | /* Seems like we might use this somewhere else as well, so putting it into the middle 35 | * of the flags space */ 36 | #define TSK_ALLELES_ACGT (1 << 16) 37 | 38 | typedef struct { 39 | tsk_id_t tree_node; 40 | tsk_id_t value_index; 41 | double value; 42 | } tsk_value_transition_t; 43 | 44 | typedef struct { 45 | tsk_size_t index; 46 | double value; 47 | } tsk_argsort_t; 48 | 49 | typedef struct { 50 | tsk_id_t tree_node; 51 | tsk_id_t old_state; 52 | tsk_id_t new_state; 53 | tsk_id_t transition_parent; 54 | } tsk_transition_stack_t; 55 | 56 | typedef struct { 57 | double normalisation_factor; 58 | double *value; 59 | tsk_id_t *node; 60 | tsk_size_t num_values; 61 | } tsk_site_probability_t; 62 | 63 | typedef struct { 64 | tsk_treeseq_t *tree_sequence; 65 | tsk_flags_t options; 66 | tsk_size_t num_sites; 67 | tsk_size_t num_samples; 68 | double *normalisation_factor; 69 | tsk_size_t *num_transitions; 70 | double **values; 71 | tsk_id_t **nodes; 72 | tsk_blkalloc_t memory; 73 | } tsk_compressed_matrix_t; 74 | 75 | typedef struct { 76 | tsk_id_t site; 77 | tsk_id_t node; 78 | bool required; 79 | } tsk_recomb_required_record; 80 | 81 | typedef struct { 82 | tsk_compressed_matrix_t matrix; 83 | tsk_recomb_required_record *recombination_required; 84 | tsk_size_t num_recomb_records; 85 | tsk_size_t max_recomb_records; 86 | } tsk_viterbi_matrix_t; 87 | 88 | typedef struct _tsk_ls_hmm_t { 89 | /* input */ 90 | tsk_treeseq_t *tree_sequence; 91 | double *recombination_rate; 92 | double *mutation_rate; 93 | const char ***alleles; 94 | unsigned int precision; 95 | uint32_t *num_alleles; 96 | tsk_size_t num_samples; 97 | tsk_size_t num_sites; 98 | tsk_size_t num_nodes; 99 | /* state */ 100 | tsk_tree_t tree; 101 | tsk_diff_iter_t diffs; 102 | tsk_id_t *parent; 103 | /* The probability value transitions on the tree */ 104 | tsk_value_transition_t *transitions; 105 | tsk_value_transition_t *transitions_copy; 106 | /* Stack used when distributing transitions on the tree */ 107 | tsk_transition_stack_t *transition_stack; 108 | /* Map of node_id to index in the transitions list */ 109 | tsk_id_t *transition_index; 110 | /* Buffer used to argsort the transitions by node time */ 111 | tsk_argsort_t *transition_time_order; 112 | tsk_size_t num_transitions; 113 | tsk_size_t max_transitions; 114 | /* The distinct values in the transitions */ 115 | double *values; 116 | tsk_size_t num_values; 117 | tsk_size_t max_values; 118 | tsk_size_t max_parsimony_words; 119 | /* Number of machine words per node optimal value set. 
*/ 120 | tsk_size_t num_optimal_value_set_words; 121 | uint64_t *optimal_value_sets; 122 | /* The parent transition; used during compression */ 123 | tsk_id_t *transition_parent; 124 | /* The number of samples directly subtended by a transition */ 125 | tsk_size_t *num_transition_samples; 126 | int32_t *allelic_state; 127 | /* Algorithms set these values before they are run */ 128 | int (*next_probability)( 129 | struct _tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *); 130 | double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *); 131 | void *output; 132 | } tsk_ls_hmm_t; 133 | 134 | int tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, 135 | double *recombination_rate, double *mutation_rate, tsk_flags_t options); 136 | int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision); 137 | int tsk_ls_hmm_free(tsk_ls_hmm_t *self); 138 | void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out); 139 | int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, 140 | tsk_compressed_matrix_t *output, tsk_flags_t options); 141 | int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, 142 | tsk_viterbi_matrix_t *output, tsk_flags_t options); 143 | int tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, 144 | int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), 145 | double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output); 146 | 147 | int tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, 148 | tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options); 149 | int tsk_compressed_matrix_free(tsk_compressed_matrix_t *self); 150 | int tsk_compressed_matrix_clear(tsk_compressed_matrix_t *self); 151 | void tsk_compressed_matrix_print_state(tsk_compressed_matrix_t *self, FILE *out); 152 | int tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, 153 | double normalisation_factor, tsk_size_t num_transitions, 154 | const tsk_value_transition_t *transitions); 155 | int tsk_compressed_matrix_decode(tsk_compressed_matrix_t *self, double *values); 156 | 157 | int tsk_viterbi_matrix_init(tsk_viterbi_matrix_t *self, tsk_treeseq_t *tree_sequence, 158 | tsk_size_t block_size, tsk_flags_t options); 159 | int tsk_viterbi_matrix_free(tsk_viterbi_matrix_t *self); 160 | int tsk_viterbi_matrix_clear(tsk_viterbi_matrix_t *self); 161 | void tsk_viterbi_matrix_print_state(tsk_viterbi_matrix_t *self, FILE *out); 162 | int tsk_viterbi_matrix_add_recombination_required( 163 | tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_id_t node, bool required); 164 | int tsk_viterbi_matrix_traceback( 165 | tsk_viterbi_matrix_t *self, tsk_id_t *path, tsk_flags_t options); 166 | 167 | #ifdef __cplusplus 168 | } 169 | #endif 170 | #endif 171 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/stats.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2021 Tskit Developers 5 | * Copyright (c) 2016-2017 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * 
furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #ifndef TSK_STATS_H 27 | #define TSK_STATS_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #include 34 | 35 | typedef struct { 36 | const tsk_treeseq_t *tree_sequence; 37 | tsk_site_t focal_site; 38 | tsk_size_t total_samples; 39 | tsk_size_t focal_samples; 40 | double max_distance; 41 | tsk_size_t max_sites; 42 | tsk_tree_t tree; 43 | tsk_id_t *sample_buffer; 44 | double *result; 45 | tsk_size_t result_length; 46 | } tsk_ld_calc_t; 47 | 48 | int tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence); 49 | int tsk_ld_calc_free(tsk_ld_calc_t *self); 50 | void tsk_ld_calc_print_state(const tsk_ld_calc_t *self, FILE *out); 51 | int tsk_ld_calc_get_r2(tsk_ld_calc_t *self, tsk_id_t a, tsk_id_t b, double *r2); 52 | int tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, 53 | tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values); 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | #endif 59 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "setuptools_scm", 5 | "wheel", 6 | "numpy>=2" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | [tool.setuptools_scm] 11 | write_to = "tsinfer/_version.py" 12 | 13 | [project] 14 | name = "tsinfer" 15 | dynamic = ["version"] 16 | authors = [ 17 | {name = "Tskit Developers", email = "admin@tskit.dev"}, 18 | ] 19 | description = "Infer tree sequences from genetic variation data." 
20 | readme = "README.md" 21 | requires-python = ">=3.9" 22 | license = {text = "GNU GPLv3+"} 23 | classifiers = [ 24 | "Programming Language :: C", 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 33 | "Development Status :: 3 - Alpha", 34 | "Environment :: Other Environment", 35 | "Intended Audience :: Science/Research", 36 | "Operating System :: POSIX", 37 | "Operating System :: MacOS :: MacOS X", 38 | "Operating System :: Microsoft :: Windows", 39 | "Topic :: Scientific/Engineering", 40 | "Topic :: Scientific/Engineering :: Bio-Informatics", 41 | ] 42 | keywords = [ 43 | "population genetics", 44 | "tree sequence", 45 | "ancestral recombination graph", 46 | "evolutionary tree", 47 | "inference", 48 | "tsinfer", 49 | ] 50 | dependencies = [ 51 | "numpy>=1.23.5", 52 | "six", 53 | "tqdm", 54 | "humanize", 55 | "daiquiri", 56 | "tskit>=0.5.3", 57 | "numcodecs>=0.6", 58 | "zarr>=2.2,!=2.11.0,!=2.11.1,!=2.11.2,<3", 59 | "lmdb", 60 | "sortedcontainers", 61 | "attrs>=19.2.0", 62 | "numba", 63 | "psutil>=5.9.0", 64 | ] 65 | 66 | [project.urls] 67 | Homepage = "https://tskit.dev/tsinfer" 68 | Documentation = "https://tskit.dev/tsinfer/docs/stable" 69 | Changelog = "https://tskit.dev/tsinfer/docs/stable/CHANGELOG.html" 70 | "Bug Tracker" = "https://github.com/tskit-dev/tsinfer/issues" 71 | "Source Code" = "https://github.com/tskit-dev/tsinfer/" 72 | 73 | [project.scripts] 74 | tsinfer = "tsinfer.__main__:main" 75 | 76 | [tool.setuptools] 77 | packages = ["tsinfer"] 78 | include-package-data = true 79 | 80 | [tool.pytest.ini_options] 81 | testpaths = ["tests"] 82 | filterwarnings = [ 83 | 'ignore:SampleData' 84 | ] -------------------------------------------------------------------------------- /requirements/CI-docs/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book==1.0.4.post1 2 | sphinx-issues==5.0.0 3 | sphinx-argparse==0.5.2 4 | humanize==4.12.1 5 | lmdb==1.6.2 6 | tqdm==4.67.1 7 | daiquiri==3.3.0 8 | msprime==1.3.3 9 | sgkit[vcf]==0.9.0 10 | ipywidgets==8.1.5 11 | Bio==1.7.1 12 | bio2zarr==0.1.4 13 | sphinx-book-theme #Unpinned to allow easy updating. 
14 | pyfaidx==0.8.1.3 -------------------------------------------------------------------------------- /requirements/CI-tests-complete/requirements.txt: -------------------------------------------------------------------------------- 1 | build==1.2.2.post1 2 | colorama==0.4.6 3 | daiquiri==3.2.5.1 4 | humanize==4.12.1 5 | lmdb==1.6.2 6 | matplotlib==3.9.4 7 | meson==1.7.0 8 | msprime==1.3.3 9 | pytest==8.3.5 10 | pytest-cov==6.0.0 11 | seaborn==0.13.2 12 | sgkit[vcf]==0.9.0 13 | tskit==0.6.0 14 | tqdm==4.67.1 15 | twine==6.1.0 16 | -------------------------------------------------------------------------------- /requirements/CI-tests-conda/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==8.3.5 2 | msprime==1.3.3 3 | humanize==4.12.1 4 | python-lmdb==1.4.1 5 | tqdm==4.67.1 6 | daiquiri==3.0.0 # Pinned as conda package not updating 7 | matplotlib==3.9.4 8 | seaborn==0.13.2 9 | colorama==0.4.6 10 | tskit==0.6.0 -------------------------------------------------------------------------------- /requirements/development.txt: -------------------------------------------------------------------------------- 1 | attrs 2 | codecov 3 | coverage 4 | flake8 5 | six 6 | tqdm 7 | humanize 8 | daiquiri 9 | msprime >= 1.0.0 10 | tskit >= 0.5.3 11 | lmdb 12 | pre-commit 13 | pytest 14 | pytest-coverage 15 | # Only for giving nice error messages for incompatible older files 16 | h5py 17 | # Only needed for the Python implementation. 18 | sortedcontainers 19 | # Optional extras for debugging threads - these modules mainly work on linux 20 | python-prctl; sys_platform == 'linux' 21 | numa; sys_platform == 'linux' 22 | # Needed for building docs. 23 | sphinx 24 | sphinx-argparse 25 | sphinx_rtd_theme 26 | setuptools>=45 27 | setuptools_scm 28 | # Needed for evaluation script. 29 | matplotlib 30 | seaborn 31 | colorama 32 | sgkit[vcf] 33 | sphinx-book-theme 34 | jupyter-book 35 | sphinx-issues 36 | ipywidgets 37 | pyfaidx 38 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = tsinfer 3 | author= Tskit Developers 4 | author_email = admin@tskit.dev 5 | license = GNU GPLv3+ 6 | description= Infer tree sequences from genetic variation data. 
7 | long_description_content_type = text/markdown 8 | long_description = file: README.md 9 | url = https://tskit.dev/tsinfer 10 | project_urls = 11 | Documentation = https://tskit.dev/tsinfer/docs/stable 12 | Changelog = https://tskit.dev/tsinfer/docs/stable/CHANGELOG.html 13 | Bug Tracker = https://github.com/tskit-dev/tsinfer/issues 14 | GitHub = https://github.com/tskit-dev/tsinfer/ 15 | classifiers = 16 | Programming Language :: C 17 | Programming Language :: Python 18 | Programming Language :: Python :: 3 19 | Programming Language :: Python :: 3.9 20 | Programming Language :: Python :: 3.10 21 | Programming Language :: Python :: 3.11 22 | Programming Language :: Python :: 3.12 23 | Programming Language :: Python :: 3 :: Only 24 | License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) 25 | Development Status :: 3 - Alpha 26 | Environment :: Other Environment 27 | Intended Audience :: Science/Research 28 | Operating System :: POSIX 29 | Operating System :: MacOS :: MacOS X 30 | Operating System :: Microsoft :: Windows 31 | Topic :: Scientific/Engineering 32 | Topic :: Scientific/Engineering :: Bio-Informatics 33 | keywords = 34 | population genetics 35 | tree sequence 36 | ancestral recombination graph 37 | evolutionary tree 38 | inference 39 | tsinfer 40 | platforms = 41 | POSIX 42 | Windows 43 | MacOS X 44 | 45 | [options] 46 | packages = tsinfer 47 | python_requires = >=3.9 48 | include_package_data = True 49 | install_requires = 50 | numpy>=1.23.5 51 | six 52 | tqdm 53 | humanize 54 | daiquiri 55 | tskit>=0.5.8 56 | numcodecs>=0.6 57 | # issues 965 and 967 at zarr-python prevent usage of 2.11.0 and 2.11.1 58 | zarr>=2.2,!=2.11.0,!=2.11.1,!=2.11.2,<3 59 | lmdb 60 | sortedcontainers 61 | attrs>=19.2.0 62 | numba 63 | 64 | [options.entry_points] 65 | console_scripts = 66 | tsinfer = tsinfer.__main__:main 67 | 68 | [tool:pytest] 69 | testpaths = 70 | tests 71 | 72 | [bdist_wheel] 73 | # This flag says to generate wheels that support both Python 2 and Python 74 | # 3. If your code will not run unchanged on both Python 2 and 3, you will 75 | # need to generate separate wheels for each Python version that you 76 | # support. 
77 | universal=0 78 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | 4 | import numpy 5 | from setuptools import Extension 6 | from setuptools import setup 7 | 8 | IS_WINDOWS = platform.system() == "Windows" 9 | 10 | libdir = "lib" 11 | tskroot = os.path.join(libdir, "subprojects", "tskit") 12 | tskdir = os.path.join(tskroot, "tskit") 13 | kasdir = os.path.join(tskroot, "subprojects", "kastore") 14 | includes = [libdir, tskroot, tskdir, kasdir] 15 | 16 | tsi_source_files = [ 17 | "ancestor_matcher.c", 18 | "ancestor_builder.c", 19 | "object_heap.c", 20 | "tree_sequence_builder.c", 21 | "err.c", 22 | "avl.c", 23 | ] 24 | tsk_source_files = ["core.c"] 25 | kas_source_files = ["kastore.c"] 26 | 27 | sources = ( 28 | ["_tsinfermodule.c"] 29 | + [os.path.join(libdir, f) for f in tsi_source_files] 30 | + [os.path.join(tskdir, f) for f in tsk_source_files] 31 | + [os.path.join(kasdir, f) for f in kas_source_files] 32 | ) 33 | 34 | libraries = ["Advapi32"] if IS_WINDOWS else [] 35 | 36 | _tsinfer_module = Extension( 37 | "_tsinfer", 38 | sources=sources, 39 | extra_compile_args=["-std=c99"], 40 | libraries=libraries, 41 | undef_macros=["NDEBUG"], 42 | include_dirs=includes + [numpy.get_include()], 43 | ) 44 | 45 | setup( 46 | ext_modules=[_tsinfer_module], 47 | ) 48 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2020 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Configuration and fixtures for pytest. Only put test-suite wide fixtures in here. Module 21 | specific fixtures should live in their modules. 22 | 23 | To use a fixture in a test simply refer to it by name as an argument. This is called 24 | dependancy injection. Note that all fixtures should have the suffix "_fixture" to make 25 | it clear in test code. 26 | 27 | For example to use the `ts` fixture (a tree sequence with data in all fields) in a test: 28 | 29 | class TestClass: 30 | def test_something(self, ts_fixture): 31 | assert ts_fixture.some_method() == expected 32 | 33 | Fixtures can be parameterised etc. see https://docs.pytest.org/en/stable/fixture.html 34 | 35 | Note that fixtures have a "scope" for example `ts_fixture` below is only created once 36 | per test session and re-used for subsequent tests. 
37 | """ 38 | import msprime 39 | import numpy as np 40 | import pytest 41 | import tskit 42 | from pytest import fixture 43 | from tsutil import mark_mutation_times_unknown 44 | 45 | import tsinfer 46 | 47 | 48 | def pytest_addoption(parser): 49 | """ 50 | Add an option to skip tests marked with `@pytest.mark.slow` 51 | """ 52 | parser.addoption( 53 | "--skip-slow", action="store_true", default=False, help="Skip slow tests" 54 | ) 55 | 56 | 57 | def pytest_configure(config): 58 | """ 59 | Add docs on the "slow" marker 60 | """ 61 | config.addinivalue_line("markers", "slow: mark test as slow to run") 62 | 63 | 64 | def pytest_collection_modifyitems(config, items): 65 | if config.getoption("--skip-slow"): 66 | skip_slow = pytest.mark.skip(reason="--skip-slow specified") 67 | for item in items: 68 | if "slow" in item.keywords: 69 | item.add_marker(skip_slow) 70 | 71 | 72 | def num_nonsample_muts(ts): 73 | return np.sum(np.logical_not(np.isin(ts.tables.mutations.node, ts.samples()))) 74 | 75 | 76 | def assign_individual_ids(ts): 77 | tables = ts.dump_tables() 78 | ind_md = [{"id": i} for i in range(ts.num_individuals)] 79 | tables.individuals.metadata_schema = tskit.MetadataSchema.permissive_json() 80 | tables.individuals.packset_metadata( 81 | [tables.individuals.metadata_schema.validate_and_encode_row(r) for r in ind_md] 82 | ) 83 | return tables.tree_sequence() 84 | 85 | 86 | @fixture(scope="session") 87 | def small_ts_fixture(): 88 | """ 89 | A simple 1-tree sequence with at least 2 inference sites 90 | (i.e. mutations above a non-sample node), and no mutation times 91 | """ 92 | ts = msprime.sim_ancestry(10, sequence_length=1000, ploidy=1, random_seed=1) 93 | ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) 94 | ts = assign_individual_ids(ts) 95 | assert num_nonsample_muts(ts) > 1 96 | return mark_mutation_times_unknown(ts) 97 | 98 | 99 | @fixture(scope="session") 100 | def small_sd_fixture(small_ts_fixture): 101 | """ 102 | A sample data instance from the small 1-tree sequence 103 | """ 104 | return tsinfer.SampleData.from_tree_sequence(small_ts_fixture) 105 | 106 | 107 | @fixture(scope="session") 108 | def small_sd_anc_fixture(small_ts_fixture): 109 | """ 110 | A sample data and an ancestors instance from the small 1-tree sequence 111 | """ 112 | sd = tsinfer.SampleData.from_tree_sequence(small_ts_fixture) 113 | return sd, tsinfer.generate_ancestors(sd) 114 | 115 | 116 | @fixture(scope="session") 117 | def medium_ts_fixture(): 118 | """ 119 | A medium sized tree sequence with a good number of trees and inference mutations 120 | (i.e. mutations above a non-sample node), and no mutation times. 
Samples are 121 | haploid, so we have one individual per sample, which has metadata for identification 122 | """ 123 | ts = msprime.sim_ancestry( 124 | 10, sequence_length=1000, ploidy=1, recombination_rate=0.01, random_seed=3 125 | ) 126 | ts = msprime.sim_mutations(ts, rate=0.02, random_seed=3) 127 | ts = assign_individual_ids(ts) 128 | assert ts.num_trees > 10 129 | assert num_nonsample_muts(ts) > 50 130 | return mark_mutation_times_unknown(ts) 131 | 132 | 133 | @fixture(scope="session") 134 | def medium_sd_fixture(medium_ts_fixture): 135 | """ 136 | A sample data instance from the medium-sized tree sequence 137 | """ 138 | return tsinfer.SampleData.from_tree_sequence( 139 | medium_ts_fixture, use_sites_time=False 140 | ) 141 | -------------------------------------------------------------------------------- /tests/data/bugs/invalid_pc_ancestor_time.samples: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/tests/data/bugs/invalid_pc_ancestor_time.samples -------------------------------------------------------------------------------- /tests/data/old_formats/medium_sd_fixture_0.2.3.samples: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/tests/data/old_formats/medium_sd_fixture_0.2.3.samples -------------------------------------------------------------------------------- /tests/test_low_level.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018-2020 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Integrity tests for the low-level module. 21 | """ 22 | import sys 23 | 24 | import pytest 25 | 26 | import _tsinfer 27 | 28 | 29 | IS_WINDOWS = sys.platform == "win32" 30 | 31 | 32 | class TestOutOfMemory: 33 | """ 34 | Make sure we raise the correct error when out of memory occurs in 35 | the library code. 
36 | """ 37 | 38 | @pytest.mark.skipif( 39 | sys.platform == "win32", 40 | reason="windows seems to allow initializing with insane # of nodes" 41 | " (perhaps memory allocation is optimised out at this stage?)", 42 | ) 43 | def test_tree_sequence_builder_too_many_nodes(self): 44 | big = 2**62 45 | with pytest.raises(MemoryError): 46 | _tsinfer.TreeSequenceBuilder([2], max_nodes=big) 47 | 48 | @pytest.mark.skipif( 49 | sys.platform == "win32", 50 | reason="windows raises an assert error not a memory error with 2**62 edges" 51 | " (line 149 of object_heap.c)", 52 | ) 53 | def test_tree_sequence_builder_too_many_edges(self): 54 | big = 2**62 55 | with pytest.raises(MemoryError): 56 | _tsinfer.TreeSequenceBuilder([2], max_edges=big) 57 | 58 | 59 | class TestAncestorMatcher: 60 | """ 61 | Tests for the AncestorMatcher C Python interface. 62 | """ 63 | 64 | def test_init(self): 65 | with pytest.raises(TypeError): 66 | _tsinfer.AncestorMatcher() 67 | with pytest.raises(TypeError): 68 | _tsinfer.AncestorMatcher(None) 69 | tsb = _tsinfer.TreeSequenceBuilder([2]) 70 | with pytest.raises(TypeError): 71 | _tsinfer.AncestorMatcher(tsb) 72 | with pytest.raises(TypeError): 73 | _tsinfer.AncestorMatcher(tsb, [1]) 74 | for bad_type in [None, {}]: 75 | with pytest.raises(TypeError): 76 | _tsinfer.AncestorMatcher(tsb, [1], [1], extended_checks=bad_type) 77 | with pytest.raises(TypeError): 78 | _tsinfer.AncestorMatcher(tsb, [1], [1], precision=bad_type) 79 | for bad_array in [[], [[], []], None, "sdf", [1, 2, 3]]: 80 | with pytest.raises(ValueError): 81 | _tsinfer.AncestorMatcher(tsb, bad_array, [1]) 82 | with pytest.raises(ValueError): 83 | _tsinfer.AncestorMatcher(tsb, [1], bad_array) 84 | 85 | 86 | class TestTreeSequenceBuilder: 87 | """ 88 | Tests for the AncestorMatcher C Python interface. 89 | """ 90 | 91 | def test_init(self): 92 | with pytest.raises(TypeError): 93 | _tsinfer.TreeSequenceBuilder() 94 | for bad_array in [None, "serf", [[], []], ["asdf"], {}]: 95 | with pytest.raises(ValueError): 96 | _tsinfer.TreeSequenceBuilder(bad_array) 97 | 98 | for bad_type in [None, "sdf", {}]: 99 | with pytest.raises(TypeError): 100 | _tsinfer.TreeSequenceBuilder([2], max_nodes=bad_type) 101 | with pytest.raises(TypeError): 102 | _tsinfer.TreeSequenceBuilder([2], max_edges=bad_type) 103 | 104 | 105 | class TestAncestorBuilder: 106 | """ 107 | Tests for the AncestorBuilder C Python interface. 
108 | """ 109 | 110 | def test_init(self): 111 | with pytest.raises(TypeError): 112 | _tsinfer.AncestorBuilder() 113 | for bad_value in [None, "serf", [[], []], ["asdf"], {}]: 114 | with pytest.raises(TypeError): 115 | _tsinfer.AncestorBuilder(num_samples=2, max_sites=bad_value) 116 | with pytest.raises(TypeError): 117 | _tsinfer.AncestorBuilder(num_samples=bad_value, max_sites=2) 118 | with pytest.raises(TypeError): 119 | _tsinfer.AncestorBuilder( 120 | num_samples=2, max_sites=2, genotype_encoding=bad_value 121 | ) 122 | with pytest.raises(TypeError): 123 | _tsinfer.AncestorBuilder(num_samples=2, max_sites=2, mmap_fd=bad_value) 124 | for bad_num_samples in [0, 1]: 125 | with pytest.raises(_tsinfer.LibraryError): 126 | _tsinfer.AncestorBuilder(num_samples=bad_num_samples, max_sites=0) 127 | 128 | @pytest.mark.skipif(IS_WINDOWS, reason="mmap_fd is a no-op on Windows") 129 | def test_bad_fd(self): 130 | with pytest.raises(_tsinfer.LibraryError, match="Bad file desc"): 131 | _tsinfer.AncestorBuilder(num_samples=2, max_sites=2, mmap_fd=-2) 132 | 133 | def test_add_site(self): 134 | ab = _tsinfer.AncestorBuilder(num_samples=2, max_sites=10) 135 | for bad_type in ["sdf", {}, None]: 136 | with pytest.raises(TypeError): 137 | ab.add_site(time=bad_type, genotypes=[0, 0]) 138 | for bad_genotypes in ["asdf", [[], []], [0, 1, 2]]: 139 | with pytest.raises(ValueError): 140 | ab.add_site(time=0, genotypes=bad_genotypes) 141 | 142 | def test_add_too_many_sites(self): 143 | for max_sites in range(10): 144 | ab = _tsinfer.AncestorBuilder(num_samples=2, max_sites=max_sites) 145 | for _ in range(max_sites): 146 | ab.add_site(time=1, genotypes=[0, 1]) 147 | for _ in range(2 * max_sites): 148 | with pytest.raises(_tsinfer.LibraryError) as record: 149 | ab.add_site(time=1, genotypes=[0, 1]) 150 | msg = "Cannot add more sites than the specified maximum." 151 | assert str(record.value) == msg 152 | 153 | # TODO need tester methods for the remaining methonds in the class. 154 | -------------------------------------------------------------------------------- /tsinfer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Tree sequence inference. 21 | """ 22 | import sys 23 | import warnings 24 | 25 | # tsinfer #957. This warning pops up as a result of using fast-math. It bascially means 26 | # that tiny tiny values are being rounded to zero, which should be fine for our purposes. 27 | warnings.filterwarnings( 28 | "ignore", 29 | message="The value of the smallest subnormal for " 30 | "type is zero", 31 | ) 32 | 33 | 34 | if sys.version_info[0] < 3: 35 | raise Exception("Python 3 only") 36 | 37 | __version__ = "undefined" 38 | try: 39 | from . 
import _version 40 | 41 | __version__ = _version.version 42 | except ImportError: 43 | pass 44 | 45 | from .inference import * # NOQA 46 | from .formats import * # NOQA 47 | from .eval_util import * # NOQA 48 | from .exceptions import * # NOQA 49 | from .constants import * # NOQA 50 | from .cli import get_cli_parser # NOQA 51 | -------------------------------------------------------------------------------- /tsinfer/__main__.py: -------------------------------------------------------------------------------- 1 | import tsinfer.cli as cli 2 | 3 | 4 | def main(): 5 | cli.tsinfer_main() 6 | 7 | 8 | if __name__ == "__main__": 9 | main() 10 | -------------------------------------------------------------------------------- /tsinfer/ancestors.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Ancestor handling routines. 21 | """ 22 | import logging 23 | import time as time_ 24 | 25 | import numba 26 | import numpy as np 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def merge_overlapping_ancestors(start, end, time): 32 | # Merge overlapping, same-time ancestors. We do this by scanning along a single 33 | # time epoch from left to right, detecting breaks. 34 | sort_indices = np.lexsort((start, time)) 35 | start = start[sort_indices] 36 | end = end[sort_indices] 37 | time = time[sort_indices] 38 | old_indexes = {} 39 | # For efficiency, pre-allocate the output arrays to the maximum possible size. 40 | new_start = np.full_like(start, -1) 41 | new_end = np.full_like(end, -1) 42 | new_time = np.full_like(time, -1) 43 | 44 | i = 0 45 | new_index_pos = 0 46 | while i < len(start): 47 | j = i + 1 48 | group_overlap = [i] 49 | max_right = end[i] 50 | # While we're in the same time epoch, and the next ancestor 51 | # overlaps with the group, add this ancestor to the group. 52 | while j < len(start) and time[j] == time[i] and start[j] < max_right: 53 | max_right = max(max_right, end[j]) 54 | group_overlap.append(j) 55 | j += 1 56 | 57 | # Emit the found group 58 | old_indexes[new_index_pos] = group_overlap 59 | new_start[new_index_pos] = start[i] 60 | new_end[new_index_pos] = max_right 61 | new_time[new_index_pos] = time[i] 62 | new_index_pos += 1 63 | i = j 64 | # Trim the output arrays to the actual size. 65 | new_start = new_start[:new_index_pos] 66 | new_end = new_end[:new_index_pos] 67 | new_time = new_time[:new_index_pos] 68 | return new_start, new_end, new_time, old_indexes, sort_indices 69 | 70 | 71 | @numba.njit 72 | def run_linesweep(event_times, event_index, event_type, new_time): 73 | # Run the linesweep over the ancestor start-stop events, 74 | # building up the dependency graph as a count of dependencies for each ancestor, 75 | # and a list of dependant children for each ancestor. 
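    # Illustrative sketch of the dependency direction (hypothetical values, not
    # taken from real data): with two merged ancestors A = [0, 8) at time 2.0
    # and B = [0, 4) at time 1.0, processing B's start event while A is active
    # hits the new_time[A] > new_time[B] branch, so B gains an incoming edge and
    # is appended to children[A]. The symmetric branch handles the opposite
    # processing order. Either way, older overlapping ancestors must be matched
    # before the younger ancestors that overlap them.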
76 | n = len(new_time) 77 | 78 | # numba really likes to know the type of the list elements, so we tell it by adding 79 | # a dummy element to the list and then popping it off. 80 | # `active` is the list of ancestors that overlap with the current linesweep position. 81 | active = [-1] 82 | active.pop() 83 | children = [[-1] for _ in range(n)] 84 | for c in range(n): 85 | children[c].pop() 86 | incoming_edge_count = np.zeros(n, dtype=np.int32) 87 | for i in range(len(event_times)): 88 | index = event_index[i] 89 | e_time = event_times[i] 90 | if event_type[i] == 1: 91 | for j in active: 92 | if new_time[j] > e_time: 93 | incoming_edge_count[index] += 1 94 | children[j].append(index) 95 | elif new_time[j] < e_time: 96 | incoming_edge_count[j] += 1 97 | children[index].append(j) 98 | active.append(index) 99 | else: 100 | active.remove(index) 101 | 102 | # Convert children to ragged array format so we can pass arrays to the 103 | # next numba function, `find_groups`. 104 | children_data = [] 105 | children_indices = [0] 106 | for child_list in children: 107 | children_data.extend(child_list) 108 | children_indices.append(len(children_data)) 109 | children_data = np.array(children_data, dtype=np.int32) 110 | children_indices = np.array(children_indices, dtype=np.int32) 111 | return children_data, children_indices, incoming_edge_count 112 | 113 | 114 | @numba.njit 115 | def find_groups(children_data, children_indices, incoming_edge_count): 116 | # We find groups of ancestors that can be matched in parallel by topologically 117 | # sorting the dependency graph. We do this by deconstructing the graph, removing 118 | # nodes with no incoming edges, and adding them to a group. 119 | n = len(children_indices) - 1 120 | group_id = np.full(n, -1, dtype=np.int32) 121 | current_group = 0 122 | while True: 123 | # Find the nodes with no incoming edges 124 | no_incoming = np.where(incoming_edge_count == 0)[0] 125 | if len(no_incoming) == 0: 126 | break 127 | # Remove them from the graph 128 | for i in no_incoming: 129 | incoming_edge_count[i] = -1 130 | incoming_edge_count[ 131 | children_data[children_indices[i] : children_indices[i + 1]] 132 | ] -= 1 133 | # Add them to the group 134 | group_id[no_incoming] = current_group 135 | current_group += 1 136 | 137 | # Check for unassigned nodes (cycles in dependency graph) 138 | if np.any(group_id == -1): 139 | raise ValueError( 140 | "Erroneous cycle in ancestor dependancies, this is often " 141 | "caused by too many unique site times. This fixed by discretising " 142 | "the site times, for example rounding times to the nearest 0.1." 143 | ) 144 | return group_id 145 | 146 | 147 | def group_ancestors_by_linesweep(start, end, time): 148 | # For a given set of ancestors, we want to group them for matching in parallel. 149 | # For each ancestor, any overlapping, older ancestors must be in an earlier group, 150 | # and any overlapping, younger ancestors in a later group. Any overlapping same-age 151 | # ancestors must be in the same group so they don't match to each other. 152 | # We do this by first merging the overlapping same-age ancestors. Then build a 153 | # dependency graph of the ancestors by linesweep. Then form groups by topological 154 | # sort. Finally, we un-merge the same-age ancestors. 
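    # A small illustrative example (hypothetical input, not from the source):
    #
    #   start = np.array([0, 3, 0])
    #   end   = np.array([5, 8, 4])
    #   time  = np.array([2.0, 2.0, 1.0])
    #
    # Ancestors 0 and 1 are the same age and overlap, so they are merged and
    # end up in the same group; ancestor 2 is younger and overlaps them, so it
    # must be matched in a later group. The expected result is
    # {0: [0, 1], 1: [2]}.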
155 | 156 | assert len(start) == len(end) 157 | assert len(start) == len(time) 158 | t = time_.time() 159 | ( 160 | new_start, 161 | new_end, 162 | new_time, 163 | old_indexes, 164 | sort_indices, 165 | ) = merge_overlapping_ancestors(start, end, time) 166 | logger.info(f"Merged to {len(new_start)} ancestors in {time_.time() - t:.2f}s") 167 | 168 | # Build a list of events for the linesweep 169 | t = time_.time() 170 | n = len(new_time) 171 | # Create events arrays by copying and concatenating inputs 172 | event_times = np.concatenate([new_time, new_time]) 173 | event_pos = np.concatenate([new_start, new_end]) 174 | event_index = np.concatenate([np.arange(n), np.arange(n)]) 175 | event_type = np.concatenate([np.ones(n, dtype=np.int8), np.zeros(n, dtype=np.int8)]) 176 | # Sort events by position, then ends before starts 177 | event_sort_indices = np.lexsort((event_type, event_pos)) 178 | event_times = event_times[event_sort_indices] 179 | event_index = event_index[event_sort_indices] 180 | event_type = event_type[event_sort_indices] 181 | logger.info(f"Built {len(event_times)} events in {time_.time() - t:.2f}s") 182 | 183 | t = time_.time() 184 | children_data, children_indices, incoming_edge_count = run_linesweep( 185 | event_times, event_index, event_type, new_time 186 | ) 187 | logger.info( 188 | f"Linesweep generated {np.sum(incoming_edge_count)} dependencies in" 189 | f" {time_.time() - t:.2f}s" 190 | ) 191 | 192 | t = time_.time() 193 | group_id = find_groups(children_data, children_indices, incoming_edge_count) 194 | logger.info(f"Found groups in {time_.time() - t:.2f}s") 195 | 196 | t = time_.time() 197 | # Convert the group id array to lists of ids for each group 198 | ancestor_grouping = {} 199 | for group in np.unique(group_id): 200 | ancestor_grouping[group] = np.where(group_id == group)[0] 201 | 202 | # Now un-merge the same-age ancestors, simultaneously mapping back to the original, 203 | # unsorted indexes 204 | for group in ancestor_grouping: 205 | ancestor_grouping[group] = sorted( 206 | [ 207 | sort_indices[item] 208 | for i in ancestor_grouping[group] 209 | for item in old_indexes[i] 210 | ] 211 | ) 212 | logger.info(f"Un-merged in {time_.time() - t:.2f}s") 213 | logger.info( 214 | f"{len(ancestor_grouping)} groups with median size " 215 | f"{np.median([len(ancestor_grouping[group]) for group in ancestor_grouping])}" 216 | ) 217 | return ancestor_grouping 218 | -------------------------------------------------------------------------------- /tsinfer/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018-2023 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Collection of constants used in tsinfer. We also make use of constants defined in tskit. 
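
For example, nodes added by path compression can be picked out of an inferred
tree sequence by testing the relevant flag bit (a usage sketch, where ts is an
assumed inferred tskit tree sequence):

    is_pc = (ts.tables.nodes.flags & NODE_IS_PC_ANCESTOR) != 0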
21 | """ 22 | import enum 23 | 24 | import numpy as np 25 | 26 | C_ENGINE = "C" 27 | PY_ENGINE = "P" 28 | 29 | 30 | # TODO Change these to use the enum.IntFlag class 31 | 32 | # Bit 16 is set in node flags when they have been created by path compression. 33 | NODE_IS_PC_ANCESTOR = np.uint32(1 << 16) 34 | # Bit 17 is set in node flags when they have been created by a shared recombination 35 | # breakpoint. 36 | NODE_IS_SRB_ANCESTOR = np.uint32(1 << 17) 37 | # Bit 18 is set in node flags when they are samples inserted to augment existing 38 | # ancestors. 39 | NODE_IS_SAMPLE_ANCESTOR = np.uint32(1 << 18) 40 | # Bit 20 is set in node flags when they are samples not at time zero in the sampledata 41 | # file. 42 | NODE_IS_HISTORICAL_SAMPLE = np.uint32(1 << 20) 43 | 44 | # What type of inference have we done at a site? 45 | INFERENCE_NONE = "none" 46 | INFERENCE_FULL = "full" 47 | INFERENCE_PARSIMONY = "parsimony" 48 | 49 | 50 | class GenotypeEncoding(enum.IntEnum): 51 | """ 52 | The encoding scheme used to store genotypes. 53 | """ 54 | 55 | EIGHT_BIT = 0 56 | """ 57 | The default approach of using one byte per genotype. Supports up to 127 alleles 58 | and missing data. 59 | """ 60 | 61 | ONE_BIT = 1 62 | """ 63 | Encode binary genotype data using a single bit. 64 | """ 65 | -------------------------------------------------------------------------------- /tsinfer/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Exceptions raised by tsinfer. 21 | """ 22 | 23 | 24 | class TsinferException(Exception): 25 | """ 26 | Superclass of all exceptions thrown by tsinfer. 27 | """ 28 | 29 | 30 | class FileError(TsinferException): 31 | """ 32 | Exception raised when some non-specific error happens during file handling. 33 | """ 34 | 35 | 36 | class FileFormatError(FileError): 37 | """ 38 | Exception raised when a malformed file is encountered. 39 | """ 40 | 41 | 42 | class FileFormatTooOld(FileError): 43 | """ 44 | Exception raised when a file with a version too old is detected. 45 | """ 46 | 47 | 48 | class FileFormatTooNew(FileError): 49 | """ 50 | Exception raised when a file written by a newer version 51 | of tsinfer is detected. 52 | """ 53 | -------------------------------------------------------------------------------- /tsinfer/progress.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018-2020 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | A progress monitor class for tsinfer. 21 | """ 22 | from tqdm.auto import tqdm 23 | 24 | 25 | class ProgressMonitor: 26 | """ 27 | Class responsible for managing the tqdm progress monitors. 28 | """ 29 | 30 | def __init__( 31 | self, 32 | enabled=True, 33 | generate_ancestors=False, 34 | match_ancestors=False, 35 | augment_ancestors=False, 36 | match_samples=False, 37 | verify=False, 38 | tqdm_kwargs=None, 39 | ): 40 | self.enabled = enabled 41 | self.num_bars = 0 42 | if generate_ancestors: 43 | self.num_bars += 2 44 | if match_ancestors: 45 | self.num_bars += 1 46 | if match_samples: 47 | self.num_bars += 3 48 | if verify: 49 | assert self.num_bars == 0 50 | self.num_bars += 1 51 | if augment_ancestors: 52 | assert self.num_bars == 0 53 | self.num_bars += 2 54 | self.current_count = 0 55 | self.current_instance = None 56 | if not verify: 57 | # Only show extra detail if we are running match-ancestors by itself. 58 | self.show_detail = self.num_bars == 1 59 | self.descriptions = { 60 | "ga_add_sites": "ga-add", 61 | "ga_generate": "ga-gen", 62 | "ma_match": "ma-match", 63 | "ms_match": "ms-match", 64 | "ms_paths": "ms-paths", 65 | "ms_full_mutations": "ms-muts", 66 | "ms_extra_sites": "ms-xsites", 67 | "verify": "verify", 68 | } 69 | if tqdm_kwargs is None: 70 | tqdm_kwargs = {} 71 | self.tqdm_kwargs = tqdm_kwargs 72 | 73 | def set_detail(self, info): 74 | if self.show_detail: 75 | self.current_instance.set_postfix(info) 76 | 77 | def get(self, key, total): 78 | self.current_count += 1 79 | desc = "{:<8} ({}/{})".format( 80 | self.descriptions[key], self.current_count, self.num_bars 81 | ) 82 | bar_format = ( 83 | "{desc}{percentage:3.0f}%|{bar}" 84 | "| {n_fmt}/{total_fmt} [{elapsed}, {rate_fmt}{postfix}]" 85 | ) 86 | self.current_instance = tqdm( 87 | desc=desc, 88 | total=total, 89 | disable=not self.enabled, 90 | bar_format=bar_format, 91 | dynamic_ncols=True, 92 | smoothing=0.01, 93 | unit_scale=True, 94 | **self.tqdm_kwargs, 95 | ) 96 | return self.current_instance 97 | 98 | 99 | class DummyProgress: 100 | """ 101 | Class that mimics the subset of the tqdm API that we use in this module. 102 | """ 103 | 104 | def update(self, n=None): 105 | pass 106 | 107 | def close(self): 108 | pass 109 | 110 | 111 | class DummyProgressMonitor(ProgressMonitor): 112 | """ 113 | Simple class to mimic the interface of the real progress monitor. 114 | """ 115 | 116 | def get(self, key, total): 117 | return DummyProgress() 118 | 119 | def set_detail(self, info): 120 | pass 121 | -------------------------------------------------------------------------------- /tsinfer/provenance.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Common provenance methods used to determine the state and versions 21 | of various dependencies and the OS. 22 | """ 23 | import dataclasses 24 | import platform 25 | import sys 26 | import time 27 | 28 | import lmdb 29 | import numcodecs 30 | import psutil 31 | import tskit 32 | import zarr 33 | 34 | if sys.platform != "win32": 35 | import resource 36 | 37 | 38 | __version__ = "undefined" 39 | try: 40 | from . import _version 41 | 42 | __version__ = _version.version 43 | except ImportError: 44 | pass 45 | 46 | 47 | @dataclasses.dataclass 48 | class ResourceMetrics: 49 | elapsed_time: float 50 | user_time: float 51 | sys_time: float 52 | max_memory: int 53 | 54 | def asdict(self): 55 | return dataclasses.asdict(self) 56 | 57 | @classmethod 58 | def combine(cls, metrics_list): 59 | if not metrics_list: 60 | raise ValueError("Cannot combine empty list of metrics") 61 | return cls( 62 | elapsed_time=sum(m.elapsed_time for m in metrics_list), 63 | user_time=sum(m.user_time for m in metrics_list), 64 | sys_time=sum(m.sys_time for m in metrics_list), 65 | max_memory=max(m.max_memory for m in metrics_list), 66 | ) 67 | 68 | 69 | def get_environment(): 70 | """ 71 | Returns a dictionary describing the environment in which tsinfer 72 | is currently running. 73 | """ 74 | env = { 75 | "libraries": { 76 | "zarr": {"version": zarr.__version__}, 77 | "numcodecs": {"version": numcodecs.__version__}, 78 | "lmdb": {"version": lmdb.__version__}, 79 | "tskit": {"version": tskit.__version__}, 80 | }, 81 | "os": { 82 | "system": platform.system(), 83 | "node": platform.node(), 84 | "release": platform.release(), 85 | "version": platform.version(), 86 | "machine": platform.machine(), 87 | }, 88 | "python": { 89 | "implementation": platform.python_implementation(), 90 | "version": platform.python_version_tuple(), 91 | }, 92 | } 93 | return env 94 | 95 | 96 | def get_provenance_dict(command=None, resources=None, **kwargs): 97 | """ 98 | Returns a dictionary encoding an execution of tsinfer following the 99 | tskit provenance schema. 
100 | 101 | https://tskit.readthedocs.io/en/stable/provenance.html 102 | """ 103 | if command is None: 104 | raise ValueError("Command must be provided") 105 | parameters = dict(kwargs) 106 | parameters["command"] = command 107 | if "simplify" in parameters: 108 | if parameters["simplify"] is None: 109 | del parameters["simplify"] # simplify is a deprecated version of post_process 110 | else: 111 | del parameters["post_process"] 112 | document = { 113 | "schema_version": "1.0.0", 114 | "software": {"name": "tsinfer", "version": __version__}, 115 | "parameters": parameters, 116 | "environment": get_environment(), 117 | } 118 | if resources is not None: 119 | document["resources"] = resources 120 | return document 121 | 122 | 123 | def get_peak_memory_bytes(): 124 | # peak memory usage in bytes 125 | if sys.platform in ("linux", "darwin"): 126 | usage = resource.getrusage(resource.RUSAGE_SELF) 127 | max_rss = usage.ru_maxrss 128 | 129 | if sys.platform == "linux": 130 | # Linux reports in kilobytes 131 | return max_rss * 1024 # Convert KB to bytes 132 | # macOS reports in bytes 133 | return max_rss 134 | 135 | elif sys.platform == "win32": 136 | return psutil.Process().memory_info().peak_wset 137 | 138 | else: 139 | return None 140 | 141 | 142 | class TimingAndMemory: 143 | # Context manager for tracking timing and memory usage. 144 | def __init__(self): 145 | self.metrics = None 146 | 147 | def __enter__(self): 148 | self.start_process = psutil.Process() 149 | self.start_elapsed = time.perf_counter() 150 | self.start_times = self.start_process.cpu_times() 151 | return self 152 | 153 | def __exit__(self, exc_type, exc_val, exc_tb): 154 | end_times = self.start_process.cpu_times() 155 | self.metrics = ResourceMetrics( 156 | elapsed_time=time.perf_counter() - self.start_elapsed, 157 | user_time=end_times.user - self.start_times.user, 158 | sys_time=end_times.system - self.start_times.system, 159 | max_memory=get_peak_memory_bytes(), 160 | ) 161 | -------------------------------------------------------------------------------- /tsinfer/threads.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Utilities for handling threads. 21 | """ 22 | import _thread 23 | import concurrent.futures 24 | import heapq 25 | import logging 26 | import threading 27 | import traceback 28 | 29 | 30 | # prctl is an optional extra; it allows us to assign meaningful names to threads 31 | # for debugging.
32 | _prctl_available = False 33 | try: 34 | import prctl 35 | 36 | _prctl_available = True 37 | except ImportError: 38 | pass 39 | 40 | _numa_available = False 41 | try: 42 | import numa 43 | 44 | _numa_available = True 45 | except ImportError: 46 | pass 47 | 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | 52 | def threaded_map(func, args, num_workers): 53 | results_buffer = [] 54 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: 55 | futures = set() 56 | next_index = 0 57 | for i, arg in enumerate(args): 58 | # +1 so that we're not waiting for the args generator to produce the next arg 59 | while len(futures) >= num_workers + 1: 60 | # If there are too many in-progress tasks, wait for one to complete 61 | done, futures = concurrent.futures.wait( 62 | futures, return_when=concurrent.futures.FIRST_COMPLETED 63 | ) 64 | for future in done: 65 | index, result = future.result() 66 | if index == next_index: 67 | # If this result is the next expected one, yield it immediately 68 | yield result 69 | next_index += 1 70 | else: 71 | heapq.heappush(results_buffer, (index, result)) 72 | 73 | # Yield any results from the buffer that are next in line 74 | while results_buffer and results_buffer[0][0] == next_index: 75 | _, result = heapq.heappop(results_buffer) 76 | yield result 77 | next_index += 1 78 | 79 | # Wraps the function so we can track the index of the argument 80 | futures.add(executor.submit(lambda arg, i=i: (i, func(arg)), arg)) 81 | 82 | concurrent.futures.wait(futures) 83 | for future in futures: 84 | index, result = future.result() 85 | if index == next_index: 86 | yield result 87 | next_index += 1 88 | else: 89 | heapq.heappush(results_buffer, (index, result)) 90 | 91 | # Yield any remaining results in the buffer 92 | while results_buffer: 93 | _, result = heapq.heappop(results_buffer) 94 | yield result 95 | 96 | 97 | def _queue_thread(worker, work_queue, name="tsinfer-worker", index=0, consumer=True): 98 | def thread_target(): 99 | try: 100 | logger.debug(f"thread '{name}' starting") 101 | if _prctl_available: 102 | prctl.set_name(name) 103 | if _numa_available and numa.available(): 104 | numa.set_localalloc() 105 | logger.debug(f"Set NUMA local allocation policy on thread {name}") 106 | worker(index) 107 | logger.debug(f"thread '{name}' finishing") 108 | except Exception: 109 | logger.critical("Exception occurred in thread; exiting") 110 | logger.critical(traceback.format_exc()) 111 | # Communicate back to the main thread that something bad has happened. 112 | # This seems to be the only reliable way to do it. 113 | _thread.interrupt_main() 114 | # Now we still need to make sure that the main thread doesn't block 115 | # on the queue.get/join (as it won't be interrupted). This is an attempt 116 | # to make sure that it unblocks. May not be fool-proof though. 117 | # 118 | # TODO This doesn't really work. We can still block on pushing things 119 | # onto the queue. We'll probably have to do something ourselves using 120 | # timeouts and stuff to see if an error has occurred.
121 | if consumer: 122 | while True: 123 | try: 124 | work_queue.task_done() 125 | except ValueError: 126 | break 127 | else: 128 | work_queue.put(None) 129 | 130 | thread = threading.Thread(target=thread_target, daemon=True) 131 | thread.start() 132 | return thread 133 | 134 | 135 | def queue_producer_thread(worker, work_queue, name="tsinfer-worker", index=0): 136 | """ 137 | Returns a started thread that produces items for the specified queue using the 138 | specified worker function. 139 | """ 140 | return _queue_thread(worker, work_queue, name=name, index=index, consumer=False) 141 | 142 | 143 | def queue_consumer_thread(worker, work_queue, name="tsinfer-worker", index=0): 144 | """ 145 | Returns a started thread that consumes items from the specified queue using the 146 | specified worker function. 147 | """ 148 | return _queue_thread(worker, work_queue, name=name, index=index, consumer=True) 149 | --------------------------------------------------------------------------------
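A brief usage sketch of the threaded_map helper defined in tsinfer/threads.py above. This example is not part of the repository: the worker function slow_square, its sleep times, and the argument values are hypothetical, chosen only so that completion order differs from submission order. It illustrates the property the implementation is built around: results are yielded lazily in input order, with out-of-order completions held in a heap-backed buffer until their turn. Note that tsinfer.threads is an internal module rather than public API.

import time

from tsinfer.threads import threaded_map


def slow_square(x):
    # Later inputs sleep less, so they tend to complete before earlier ones.
    time.sleep(0.01 * (5 - x))
    return x * x


# Prints [0, 1, 4, 9, 16]: input order is preserved despite out-of-order completion.
print(list(threaded_map(slow_square, range(5), num_workers=3)))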