├── .circleci └── config.yml ├── .flake8 ├── .github └── workflows │ ├── docker │ ├── buildwheel.sh │ └── shared.env │ ├── docs.yml │ ├── tests.yml │ └── wheels.yml ├── .gitignore ├── .mergify.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── CITATION.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── _tsinfermodule.c ├── convert_hdf5.py ├── dev.py ├── docs ├── .gitignore ├── CHANGELOG.md ├── CITATION.md ├── Makefile ├── _config.yml ├── _static │ ├── .README │ ├── P_dom_chr24_phased.vcf.gz │ ├── P_dom_chr24_phased.vcf.gz.tbi │ ├── ancestor_grouping.png │ ├── example_ancestral_state.fa │ ├── example_ancestral_state.fa.fai │ ├── example_data.vcz │ │ ├── .zattrs │ │ ├── .zgroup │ │ ├── .zmetadata │ │ ├── call_genotype │ │ │ ├── .zarray │ │ │ ├── .zattrs │ │ │ └── 0.0.0 │ │ ├── call_genotype_mask │ │ │ ├── .zarray │ │ │ ├── .zattrs │ │ │ └── 0.0.0 │ │ ├── call_genotype_phased │ │ │ ├── .zarray │ │ │ ├── .zattrs │ │ │ └── 0.0 │ │ ├── contig_id │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ │ ├── sample_id │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ │ ├── variant_allele │ │ │ ├── .zarray │ │ │ └── 0.0 │ │ ├── variant_contig │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ │ └── variant_position │ │ │ ├── 0 │ │ │ ├── .zarray │ │ │ └── .zattrs │ ├── example_flow.svg │ └── tree_at_1Mb.svg ├── _templates │ └── .README ├── _toc.yml ├── api.rst ├── build.sh ├── cli.rst ├── development.rst ├── file_formats.rst ├── index.md ├── inference.md ├── installation.rst ├── introduction.rst ├── large_scale.md ├── simulation-example.py ├── tsinfer_logo.svg └── usage.md ├── evaluation.py ├── lib ├── .clang-format ├── ancestor_builder.c ├── ancestor_matcher.c ├── avl.c ├── avl.h ├── err.c ├── err.h ├── meson.build ├── object_heap.c ├── object_heap.h ├── subprojects │ ├── README │ ├── tskit.wrap │ └── tskit │ │ ├── .gitignore │ │ ├── CHANGELOG.rst │ │ ├── VERSION.txt │ │ ├── examples │ │ ├── Makefile │ │ ├── api_structure.c │ │ ├── cpp_sorting_example.cpp │ │ ├── error_handling.c │ │ ├── haploid_wright_fisher.c │ │ ├── streaming.c │ │ ├── take_ownership.c │ │ ├── tree_iteration.c │ │ └── tree_traversal.c │ │ ├── meson.build │ │ ├── meson_options.txt │ │ ├── subprojects │ │ └── kastore │ │ │ ├── README.md │ │ │ ├── VERSION.txt │ │ │ ├── kastore.c │ │ │ ├── kastore.h │ │ │ └── meson.build │ │ ├── tests │ │ ├── test_convert.c │ │ ├── test_core.c │ │ ├── test_file_format.c │ │ ├── test_genotypes.c │ │ ├── test_haplotype_matching.c │ │ ├── test_minimal_cpp.cpp │ │ ├── test_stats.c │ │ ├── test_tables.c │ │ ├── test_trees.c │ │ ├── testlib.c │ │ └── testlib.h │ │ ├── tskit.h │ │ └── tskit │ │ ├── convert.c │ │ ├── convert.h │ │ ├── core.c │ │ ├── core.h │ │ ├── genotypes.c │ │ ├── genotypes.h │ │ ├── haplotype_matching.c │ │ ├── haplotype_matching.h │ │ ├── stats.c │ │ ├── stats.h │ │ ├── tables.c │ │ ├── tables.h │ │ ├── trees.c │ │ └── trees.h ├── tests │ └── tests.c ├── tree_sequence_builder.c └── tsinfer.h ├── pyproject.toml ├── requirements ├── CI-docs │ └── requirements.txt ├── CI-tests-complete │ └── requirements.txt ├── CI-tests-conda │ └── requirements.txt └── development.txt ├── setup.cfg ├── setup.py ├── tests ├── conftest.py ├── data │ ├── bugs │ │ └── invalid_pc_ancestor_time.samples │ └── old_formats │ │ └── medium_sd_fixture_0.2.3.samples ├── test_ancestors.py ├── test_cli.py ├── test_evaluation.py ├── test_formats.py ├── test_inference.py ├── test_low_level.py ├── test_provenance.py ├── test_variantdata.py └── tsutil.py ├── tsinfer ├── __init__.py ├── __main__.py ├── algorithm.py 
├── ancestors.py ├── cli.py ├── constants.py ├── eval_util.py ├── exceptions.py ├── formats.py ├── inference.py ├── progress.py ├── provenance.py └── threads.py └── visualisation.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | codecov: codecov/codecov@3.2.4 5 | 6 | jobs: 7 | build: 8 | docker: 9 | - image: cimg/python:3.9 10 | steps: 11 | - checkout 12 | 13 | - restore_cache: 14 | key: tsinfer-{{ .Branch }}-v3 15 | 16 | - run: 17 | name: Install dependencies and set PATH 18 | command: | 19 | sudo apt-get update 20 | sudo apt-get install libgsl-dev libcap-dev libnuma-dev libcunit1-dev \ 21 | libconfig-dev ninja-build valgrind clang python3-pip 22 | # set path persistently https://circleci.com/docs/2.0/env-vars/#setting-path 23 | echo 'export PATH=/home/circleci/.local/bin:$PATH' >> $BASH_ENV 24 | 25 | - run: 26 | name: Install development dependencies 27 | command: | 28 | pyenv global 3.9 29 | pip install -r requirements/CI-tests-complete/requirements.txt --user 30 | pyenv rehash 31 | 32 | - save_cache: 33 | key: tsinfer-{{ .Branch }}-v1 34 | paths: 35 | - "/home/circleci/.local" 36 | 37 | - run: 38 | name: Checkout submodules 39 | command: | 40 | git submodule update --init --recursive 41 | # Write out the status for debugging purposes. Are we checked out at tags? 42 | git submodule status --recursive 43 | 44 | - run: 45 | name: Build the distribution tarball. 46 | command: | 47 | python -m build --sdist 48 | python setup.py check 49 | python -m twine check dist/*.tar.gz --strict 50 | rm dist/* 51 | python -m build 52 | 53 | - run: 54 | name: Install from the distribution tarball 55 | command: | 56 | python -m venv venv 57 | source venv/bin/activate 58 | pip install dist/*.tar.gz 59 | python -c 'import tsinfer; print(tsinfer.__version__)' 60 | 61 | #Also check the wheel 62 | pip uninstall --yes tsinfer 63 | pip install dist/*.whl 64 | python -c 'import tsinfer; print(tsinfer.__version__)' 65 | deactivate 66 | rm -rf venv 67 | 68 | - run: 69 | name: Compile Python 70 | command: | 71 | python setup.py build_ext --inplace 72 | 73 | - run: 74 | name: Run Python tests and upload coverage 75 | command: | 76 | python3 -m pytest --cov=tsinfer --cov-report=xml --cov-branch -xvs tests 77 | rm .coverage 78 | 79 | - codecov/upload: 80 | flags: python 81 | token: CODECOV_TOKEN 82 | 83 | - run: 84 | name: Compile C with gcc 85 | command: | 86 | CFLAGS=--coverage meson lib/ build-gcc 87 | ninja -C build-gcc 88 | 89 | - run: 90 | name: Compile C with clang 91 | command: | 92 | CC=clang CXX=clang++ meson lib/ build-clang 93 | ninja -C build-clang 94 | 95 | - run: 96 | name: Run the low-level tests. 97 | command: | 98 | cd build-gcc 99 | ./tests 100 | 101 | - run: 102 | name: Run gcov manually, as the one used in codecov doesn't work here. 103 | command: | 104 | gcov -pb -o ./build/temp.linux*/ _tsinfermodule.c 105 | cd build-gcc 106 | # TODO should be able to do this with 'find', but it's tricky and opaque. 107 | gcov -pb ./libtsinfer.a.p/ancestor_builder.c.gcno ../lib/ancestor_builder.c 108 | gcov -pb ./libtsinfer.a.p/ancestor_matcher.c.gcno ../lib/ancestor_matcher.c 109 | gcov -pb ./libtsinfer.a.p/tree_sequence_builder.c.gcno ../lib/tree_sequence_builder.c 110 | gcov -pb ./libtsinfer.a.p/object_heap.c.gcno ../lib/object_heap.c 111 | gcov -pb ./libtsinfer.a.p/err.c.gcno ../lib/err.c 112 | cd .. 
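          # A hypothetical 'find'-based version of the explicit per-file gcov calls
          # above, as suggested by the TODO (an untested sketch, left as a comment
          # because the per-file commands are easier to debug when a single file
          # fails to report coverage):
          # find build-gcc/libtsinfer.a.p -name '*.gcno' -exec gcov -pb {} \;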
113 | 114 | - codecov/upload: 115 | flags: C 116 | token: CODECOV_TOKEN 117 | 118 | - run: 119 | name: Valgrind for C tests. 120 | command: | 121 | valgrind --leak-check=full --error-exitcode=1 ./build-gcc/tests 122 | 123 | - run: 124 | name: Run clang-compiled C tests 125 | command: | 126 | ninja -C build-clang test 127 | 128 | 129 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Based directly on Black's recommendations: 3 | # https://black.readthedocs.io/en/stable/the_black_code_style.html#line-length 4 | max-line-length = 81 5 | select = C,E,F,W,B,B950 6 | ignore = E203, E501, W503 7 | -------------------------------------------------------------------------------- /.github/workflows/docker/buildwheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DOCKER_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | source "$DOCKER_DIR/shared.env" 4 | 5 | set -e -x 6 | 7 | ARCH=`uname -p` 8 | echo "arch=$ARCH" 9 | 10 | # We're running as root in the docker container so git commands issued by 11 | # setuptools_scm will fail without this: 12 | git config --global --add safe.directory /project 13 | # Fetch the full history as we'll be missing tags otherwise. 14 | git fetch --unshallow 15 | for V in "${PYTHON_VERSIONS[@]}"; do 16 | git reset --hard 17 | git clean -fd 18 | PYBIN=/opt/python/$V/bin 19 | rm -rf build/ # Avoid lib build by narrow Python is used by wide python 20 | # Instead of letting setup.py install a newer numpy we install it here 21 | # using the oldest supported version for ABI compatibility 22 | $PYBIN/python -m venv env 23 | source env/bin/activate 24 | $PYBIN/python -m pip install --upgrade build 25 | SETUPTOOLS_SCM_DEBUG=1 $PYBIN/python -m build 26 | done 27 | 28 | cd dist 29 | for whl in *.whl; do 30 | auditwheel repair "$whl" 31 | rm "$whl" 32 | done -------------------------------------------------------------------------------- /.github/workflows/docker/shared.env: -------------------------------------------------------------------------------- 1 | PYTHON_VERSIONS=( 2 | cp312-cp312 3 | cp311-cp311 4 | cp310-cp310 5 | cp39-cp39 6 | ) 7 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | tags: 8 | - '*' 9 | 10 | env: 11 | COMMIT_EMAIL: ben.jeffery.well+adminbot@gmail.com 12 | MAKE_TARGET: all 13 | OWNER: tskit-dev 14 | REPO: tsinfer 15 | 16 | jobs: 17 | build-deploy-docs: 18 | name: Docs 19 | runs-on: ubuntu-24.04 20 | steps: 21 | - name: Cancel Previous Runs 22 | uses: styfle/cancel-workflow-action@0.12.1 23 | with: 24 | access_token: ${{ github.token }} 25 | 26 | - uses: actions/checkout@v4.2.2 27 | with: 28 | submodules: true 29 | 30 | - uses: actions/setup-python@v5.4.0 31 | with: 32 | python-version: "3.11" 33 | cache: "pip" 34 | 35 | - name: Install deps (one by one to avoid conflict errors) 36 | run: | 37 | pip install --upgrade pip wheel 38 | pip install -r requirements/CI-docs/requirements.txt 39 | sudo apt-get install -y tabix 40 | 41 | - name: Build C module 42 | if: env.MAKE_TARGET 43 | run: | 44 | make $MAKE_TARGET 45 | 46 | - name: Build Docs 47 | run: | 48 | cd docs && make dist 49 | 50 | - name: Trigger docs site rebuild 51 | if: 
github.ref == 'refs/heads/main' 52 | run: | 53 | curl -X POST https://api.github.com/repos/tskit-dev/tskit-site/dispatches \ 54 | -H 'Accept: application/vnd.github.everest-preview+json' \ 55 | -u AdminBot-tskit:${{ secrets.ADMINBOT_TOKEN }} \ 56 | --data '{"event_type":"build-docs"}' 57 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | name: Lint 11 | runs-on: ubuntu-24.04 12 | steps: 13 | - name: Cancel Previous Runs 14 | uses: styfle/cancel-workflow-action@0.12.1 15 | with: 16 | access_token: ${{ github.token }} 17 | - uses: actions/checkout@v4.2.2 18 | - uses: actions/setup-python@v5.4.0 19 | with: 20 | python-version: '3.10' 21 | - name: install clang-format 22 | if: steps.clang_format.outputs.cache-hit != 'true' 23 | run: | 24 | sudo pip install clang-format==6.0.1 25 | sudo ln -s /usr/local/bin/clang-format /usr/local/bin/clang-format-6.0 26 | - uses: pre-commit/action@v3.0.1 27 | 28 | test: 29 | name: Python 30 | runs-on: ${{ matrix.os }} 31 | strategy: 32 | fail-fast: false 33 | matrix: 34 | python: [ 3.9, "3.12" ] 35 | os: [ macos-latest, ubuntu-24.04, windows-latest ] 36 | defaults: 37 | run: 38 | shell: bash 39 | steps: 40 | - name: Cancel Previous Runs 41 | uses: styfle/cancel-workflow-action@0.12.1 42 | with: 43 | access_token: ${{ github.token }} 44 | 45 | - name: Checkout 46 | uses: actions/checkout@v4.2.2 47 | with: 48 | submodules: true 49 | 50 | - name: Cache conda and dependancies 51 | id: cache 52 | uses: actions/cache@v4.2.2 53 | with: 54 | path: ${{ env.CONDA }}/envs 55 | key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.python}}-conda-v5-${{ hashFiles('requirements/CI-tests-conda/requirements.txt')}} 56 | 57 | - name: Install Miniconda with Mamba 58 | uses: conda-incubator/setup-miniconda@v3.1.1 59 | if: steps.cache.outputs.cache-hit != 'true' 60 | with: 61 | activate-environment: anaconda-client-env 62 | python-version: ${{ matrix.python }} 63 | channels: conda-forge 64 | # channel-priority: strict 65 | auto-update-conda: true 66 | # mamba-version: "*" 67 | # use-mamba: true 68 | 69 | - name: Fix windows .profile 70 | if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'windows-latest' 71 | run: | 72 | cp ~/.bash_profile ~/.profile 73 | 74 | # Work around weird issues on OSX possibly caused by mixed compilers 75 | # https://github.com/tskit-dev/tsinfer/issues/376 76 | - name: Install compiler from conda 77 | if: steps.cache.outputs.cache-hit != 'true' 78 | shell: bash -l {0} #We need a login shell to get conda 79 | run: conda install --yes c-compiler 80 | 81 | - name: Install conda deps 82 | if: steps.cache.outputs.cache-hit != 'true' 83 | shell: bash -l {0} #We need a login shell to get conda 84 | run: conda install --yes --file=requirements/CI-tests-conda/requirements.txt 85 | 86 | - name: Install cyvcf2 #Fails if done via conda due to no windows support. 
87 | if: steps.cache.outputs.cache-hit != 'true' && matrix.os != 'windows-latest' 88 | run: | 89 | source ~/.profile 90 | conda activate anaconda-client-env 91 | #Install these by pip so we don't pull in cbgen with conda as it isn't available on 3.12 92 | pip install sgkit==0.9.0 cyvcf2==0.31.1 yarl==1.9.4 aiohttp==3.9.5 requests==2.32.3 93 | 94 | - name: Install sgkit only on windows 95 | if: steps.cache.outputs.cache-hit != 'true' && matrix.os == 'windows-latest' 96 | run: | 97 | source ~/.profile 98 | conda activate anaconda-client-env 99 | #Install these by pip so we don't pull in cbgen with conda as it isn't available on 3.12 100 | pip install sgkit==0.9.0 101 | 102 | - name: Build module 103 | run: | 104 | source ~/.profile 105 | conda activate anaconda-client-env 106 | # Use numpy2 to build the module 107 | pip install "numpy>=2" 108 | python setup.py build_ext --inplace 109 | 110 | - name: Run tests 111 | run: | 112 | source ~/.profile 113 | conda activate anaconda-client-env 114 | # Test with numpy<2 for numba 115 | pip install "numpy<2" 116 | python -m pytest -xv 117 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and test wheels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - test 8 | tags: 9 | - '*' 10 | release: 11 | types: [published] 12 | 13 | jobs: 14 | OSX: 15 | runs-on: macos-latest 16 | strategy: 17 | matrix: 18 | python: [3.9, "3.10", 3.11, 3.12] 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4.2.2 22 | with: 23 | submodules: true 24 | - name: Set up Python ${{ matrix.python }} 25 | uses: actions/setup-python@v5.4.0 26 | with: 27 | python-version: ${{ matrix.python }} 28 | - name: Install deps 29 | run: | 30 | pip install --upgrade pip build delocate 31 | - name: Build Wheel 32 | run: | 33 | python -m build --wheel 34 | - name: Delocate to bundle dynamic libs 35 | run: | 36 | delocate-wheel -v dist/*.whl 37 | - name: Upload Wheels 38 | uses: actions/upload-artifact@v4.6.1 39 | with: 40 | name: osx-wheel-${{ matrix.python }} 41 | path: dist 42 | 43 | windows: 44 | runs-on: windows-latest 45 | strategy: 46 | matrix: 47 | python: [3.9, "3.10", 3.11, 3.12] 48 | wordsize: [64] 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v4.2.2 52 | with: 53 | submodules: true 54 | - name: Install deps 55 | env: 56 | PYTHON: "py -${{ matrix.python }}-${{ matrix.wordsize }}" 57 | shell: bash 58 | run: | 59 | set -ex 60 | ${PYTHON} -m pip install --upgrade pip build 61 | - name: Build Wheel 62 | env: 63 | PYTHON: "py -${{ matrix.python }}-${{ matrix.wordsize }}" 64 | shell: bash 65 | run: | 66 | set -ex 67 | ${PYTHON} -m build --wheel 68 | - name: Upload Wheels 69 | uses: actions/upload-artifact@v4.6.1 70 | with: 71 | name: win-wheel-${{ matrix.python }}-${{ matrix.wordsize }} 72 | path: dist 73 | 74 | manylinux: 75 | runs-on: ubuntu-24.04 76 | steps: 77 | - name: Checkout 78 | uses: actions/checkout@v4.2.2 79 | with: 80 | submodules: true 81 | 82 | - name: Set up Python 3.9 83 | uses: actions/setup-python@v5.4.0 84 | with: 85 | python-version: 3.9 86 | 87 | - name: Build sdist 88 | shell: bash 89 | run: | 90 | pip install --upgrade pip build 91 | python -m build --sdist 92 | 93 | - name: Upload sdist 94 | uses: actions/upload-artifact@v4.6.1 95 | with: 96 | name: sdist 97 | path: dist 98 | 99 | - name: Build wheels in docker 100 | shell: bash 101 | run: | 102 | docker run --rm -v `pwd`:/project 
-w /project quay.io/pypa/manylinux2014_x86_64 bash .github/workflows/docker/buildwheel.sh 103 | 104 | - name: Upload Wheels 105 | uses: actions/upload-artifact@v4.6.1 106 | with: 107 | name: linux-wheels 108 | path: dist/wheelhouse 109 | 110 | OSX-test: 111 | needs: ['OSX'] 112 | runs-on: macos-latest 113 | strategy: 114 | matrix: 115 | python: [3.9, "3.10", 3.11, 3.12] 116 | steps: 117 | - name: Download wheels 118 | uses: actions/download-artifact@v4.2.0 119 | with: 120 | name: osx-wheel-${{ matrix.python }} 121 | - name: Set up Python ${{ matrix.python }} 122 | uses: actions/setup-python@v5.4.0 123 | with: 124 | python-version: ${{ matrix.python }} 125 | - name: Install wheel and test 126 | run: | 127 | python -VV 128 | # Install the local wheel 129 | pip install --no-index --no-deps --find-links=. tsinfer 130 | pip install tsinfer 131 | python -c "import tsinfer" 132 | 133 | windows-test: 134 | needs: ['windows'] 135 | runs-on: windows-latest 136 | strategy: 137 | matrix: 138 | python: [3.9, "3.10", 3.11, 3.12] 139 | wordsize: [64] 140 | steps: 141 | - name: Download wheels 142 | uses: actions/download-artifact@v4.2.0 143 | with: 144 | name: win-wheel-${{ matrix.python }}-${{ matrix.wordsize }} 145 | - name: Set up Python ${{ matrix.python }} 146 | uses: actions/setup-python@v5.4.0 147 | with: 148 | python-version: ${{ matrix.python }} 149 | - name: Install wheel and test 150 | run: | 151 | python -VV 152 | #patch-ng required to build lmdb 153 | pip install patch-ng 154 | # Install the local wheel 155 | pip install --no-index --no-deps --find-links=. tsinfer 156 | pip install tsinfer 157 | python -c "import tsinfer" 158 | 159 | manylinux-test: 160 | runs-on: ubuntu-24.04 161 | needs: ['manylinux'] 162 | strategy: 163 | matrix: 164 | python: [3.9, "3.10", 3.11, 3.12] 165 | steps: 166 | - name: Download wheels 167 | uses: actions/download-artifact@v4.2.0 168 | with: 169 | name: linux-wheels 170 | - name: Set up Python 171 | uses: actions/setup-python@v5.4.0 172 | with: 173 | python-version: ${{ matrix.python }} 174 | - name: Install wheel and test 175 | run: | 176 | python -VV 177 | # Install the local wheel 178 | pip install --no-index --no-deps --find-links=. tsinfer 179 | pip install tsinfer 180 | python -c "import tsinfer" 181 | 182 | 183 | PyPI_Upload: 184 | runs-on: ubuntu-24.04 185 | environment: release 186 | needs: ['windows-test', 'OSX-test', 'manylinux-test'] 187 | permissions: 188 | id-token: write 189 | steps: 190 | - name: Download all 191 | uses: actions/download-artifact@v4.2.0 192 | - name: Move to dist 193 | run: | 194 | mkdir dist 195 | cp */*.{whl,gz} dist/. 
196 | - name: Publish distribution to Test PyPI 197 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 198 | uses: pypa/gh-action-pypi-publish@v1.12.4 199 | with: 200 | repository_url: https://test.pypi.org/legacy/ 201 | - name: Publish distribution to PRODUCTION PyPI 202 | if: github.event_name == 'release' 203 | uses: pypa/gh-action-pypi-publish@v1.12.4 204 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | local_settings.py 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # OS X stuff 63 | .DS_Store 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | *.svg 94 | tsinfer/_version.py 95 | 96 | # Mac OS 97 | .DS_Store 98 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: default 3 | queue_conditions: 4 | - "-merged" 5 | - "#approved-reviews-by>=1" 6 | - "#changes-requested-reviews-by=0" 7 | - base=main 8 | - label=AUTOMERGE-REQUESTED 9 | - status-success=Lint 10 | - status-success=Python (3.9, macos-latest) 11 | - status-success=Python (3.12, macos-latest) 12 | - status-success=Python (3.9, ubuntu-24.04) 13 | - status-success=Python (3.12, ubuntu-24.04) 14 | - status-success=Python (3.9, windows-latest) 15 | - status-success=Python (3.12, windows-latest) 16 | - "status-success=ci/circleci: build" 17 | merge_conditions: 18 | - "#approved-reviews-by>=1" 19 | - "#changes-requested-reviews-by=0" 20 | - status-success=Lint 21 | - status-success=Python (3.9, macos-latest) 22 | - status-success=Python (3.12, macos-latest) 23 | - status-success=Python (3.9, ubuntu-24.04) 24 | - status-success=Python (3.12, ubuntu-24.04) 25 | - status-success=Python (3.9, windows-latest) 26 | - status-success=Python (3.12, windows-latest) 27 | - "status-success=ci/circleci: build" 28 | merge_method: rebase 29 | update_method: rebase 30 | 31 | pull_request_rules: 32 | - name: Remove label after merge 33 | conditions: 34 | - merged 35 | - label=AUTOMERGE-REQUESTED 36 | actions: 37 | 
label: 38 | remove: 39 | - AUTOMERGE-REQUESTED 40 | - name: refactored queue action rule 41 | conditions: [] 42 | actions: 43 | queue: 44 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: debug-statements 7 | - id: mixed-line-ending 8 | - id: check-case-conflict 9 | - id: check-yaml 10 | - repo: https://github.com/benjeffery/pre-commit-clang-format 11 | rev: '1.0' 12 | hooks: 13 | - id: clang-format 14 | exclude: avl 15 | verbose: true 16 | - repo: https://github.com/asottile/reorder_python_imports 17 | rev: v3.14.0 18 | hooks: 19 | - id: reorder-python-imports 20 | args: [ --unclassifiable-application-module=_tsinfer ] 21 | - repo: https://github.com/asottile/pyupgrade 22 | rev: v3.19.1 23 | hooks: 24 | - id: pyupgrade 25 | args: [ --py3-plus, --py39-plus ] 26 | - repo: https://github.com/psf/black 27 | rev: 25.1.0 28 | hooks: 29 | - id: black 30 | language_version: python3 31 | - repo: https://github.com/asottile/blacken-docs 32 | rev: 1.19.1 33 | hooks: 34 | - id: blacken-docs 35 | args: [--skip-errors] 36 | additional_dependencies: [black==22.3.0] 37 | language_version: python3 38 | - repo: https://github.com/pycqa/flake8 39 | rev: 7.1.2 40 | hooks: 41 | - id: flake8 42 | args: [--config=.flake8] 43 | additional_dependencies: ["flake8-bugbear==22.10.27", "flake8-builtins==2.0.1"] -------------------------------------------------------------------------------- /CITATION.md: -------------------------------------------------------------------------------- 1 | (sec_citation)= 2 | 3 | # Citing tsinfer 4 | 5 | If you use `tsinfer` in your work, please cite the 6 | [2019 Nature Genetics paper](): 7 | 8 | > Jerome Kelleher, Yan Wong, Anthony W. Wohns, 9 | > Chaimaa Fadil, Patrick K. Albers & Gil McVean (2019) 10 | > *Inferring whole-genome histories in large population datasets*, 11 | > Nature Genetics, Volume 51, 1330–1338. https://doi.org/10.1038/s41588-019-0483-y 12 | 13 | Bibtex record: 14 | 15 | ```bibtex 16 | 17 | @article{Kelleher2019, 18 | doi = {10.1038/s41588-019-0483-y}, 19 | url = {https://doi.org/10.1038/s41588-019-0483-y}, 20 | year = {2019}, 21 | month = sep, 22 | publisher = {Springer Science and Business Media {LLC}}, 23 | volume = {51}, 24 | number = {9}, 25 | pages = {1330--1338}, 26 | author = {Jerome Kelleher and Yan Wong and Anthony W. Wohns and Chaimaa Fadil and Patrick K. 
Albers and Gil McVean}, 27 | title = {Inferring whole-genome histories in large population datasets}, 28 | journal = {Nature Genetics} 29 | } -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include lib/*.h 2 | include lib/subprojects/tskit/c/*.h 3 | include lib/subprojects/tskit/c/tskit/*.h 4 | include lib/subprojects/tskit/c/subprojects/kastore/*.h 5 | include README.txt 6 | include LICENSE 7 | recursive-include tests *.py 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC?=gcc 2 | CFLAGS=-std=c99 -g -O3 -march=native -funroll-loops -ffast-math \ 3 | # -ftree-vectorize \ 4 | # -ftree-vectorizer-verbose=6 \ 5 | # -fopt-info-vec-missed 6 | 7 | all: _tsinfer.cpython-34m.so 8 | 9 | _tsinfer.cpython-34m.so: _tsinfermodule.c 10 | CC="${CC}" CFLAGS="${CFLAGS}" python setup.py build_ext --inplace 11 | 12 | ctags: 13 | ctags lib/*.c lib/*.h tsinfer/*.py 14 | 15 | clean: 16 | rm -f *.so *.o tags 17 | rm -fR build 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tsinfer 2 | 3 | [![CircleCI](https://circleci.com/gh/tskit-dev/tsinfer.svg?style=svg)](https://circleci.com/gh/tskit-dev/tsinfer) [![Build Status](https://travis-ci.org/tskit-dev/tsinfer.svg?branch=main)](https://travis-ci.org/tskit-dev/tsinfer) [![Docs Build](https://github.com/tskit-dev/tsinfer/actions/workflows/docs.yml/badge.svg)](https://tskit.dev/tsinfer/docs/stable/introduction.html) [![codecov](https://codecov.io/gh/tskit-dev/tsinfer/branch/main/graph/badge.svg)](https://codecov.io/gh/tskit-dev/tsinfer) 4 | 5 | 6 | Infer a tree sequence from genetic variation data 7 | 8 | The [documentation](https://tskit.dev/tsinfer/docs/latest) contains details of how to use this software, including [installation instructions](https://tskit.dev/tsinfer/docs/latest/installation.html). 9 | 10 | The initial algorithm, its rationale, and results from testing on simulated and real data are described in the following [Nature Genetics paper](https://doi.org/10.1038/s41588-019-0483-y): 11 | 12 | > Jerome Kelleher, Yan Wong, Anthony W Wohns, Chaimaa Fadil, Patrick K Albers and Gil McVean (2019) *Inferring whole-genome histories in large population datasets*. Nature Genetics **51**: 1330-1338 13 | 14 | _Tsinfer_ versions [0.2.0](https://github.com/tskit-dev/tsinfer/releases/tag/0.2.0) onwards allow missing data and provide a fully parameterised Li & Stephens matching algorithm (i.e. which allows mismatch). These improvements are described in the 15 | following [Science paper](https://doi.org/10.1126/science.abi8264): 16 | 17 | > Anthony Wilder Wohns, Yan Wong, Ben Jeffery, Ali Akbari, Swapan Mallick, Ron Pinhasi, Nick Patterson, David Reich, Jerome Kelleher, and Gil McVean (2022) A unified genealogy of modern and ancient genomes. Science 375: eabi8264 18 | 19 | Please cite either or both of these if you use ``tsinfer`` in your work. Code to reproduce the results in the first paper is present in a [separate GitHub repository](https://github.com/mcveanlab/treeseq-inference). 20 | 21 | Note that `tsinfer` does not attempt to infer node times (i.e. branch lengths of the 22 | inferred trees). 
If you require a tree sequence where the dates of common ancestors 23 | are expressed in calendar or generation times, you should post-process the ``tsinfer`` 24 | output using software such as [``tsdate``](https://github.com/tskit-dev/tsdate). 25 | -------------------------------------------------------------------------------- /convert_hdf5.py: -------------------------------------------------------------------------------- 1 | # Simple script to convert input data into HDF5 format so that 2 | # we can feed it into the C development CLI. 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | 8 | import tsinfer 9 | 10 | 11 | def main(infile, outfile): 12 | sample_data = tsinfer.SampleData.load(infile) 13 | print(sample_data) 14 | shape = (sample_data.num_inference_sites, sample_data.num_samples) 15 | G = np.empty(shape, dtype=np.int8) 16 | for j, (_, genotypes) in enumerate(sample_data.genotypes(inference_sites=True)): 17 | G[j] = genotypes 18 | with h5py.File(outfile, "w") as root: 19 | root["haplotypes"] = G.T 20 | 21 | 22 | if __name__ == "__main__": 23 | main(sys.argv[1], sys.argv[2]) 24 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | notebook-simulation.trees 2 | notebook-simulation.samples 3 | notebook-simulation-source.trees 4 | notebook-simulation.vc* 5 | notebook-simulation-AA.npy 6 | P_dom_chr24_phased.samples 7 | sparrows.vcz 8 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /docs/CITATION.md: -------------------------------------------------------------------------------- 1 | ../CITATION.md -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Need to set PYTHONPATH so that we pick up the local tsinfer 3 | PYPATH=$(shell pwd)/../ 4 | TSINF_VERSION:=$(shell PYTHONPATH=${PYPATH} \ 5 | python -c 'import tsinfer; print(tsinfer.__version__.split("+")[0])') 6 | 7 | BUILDDIR = _build 8 | 9 | all: dev 10 | 11 | dev: 12 | PYTHONPATH=${PYPATH} ./build.sh 13 | 14 | dist: 15 | @echo Building distribution for tskit version ${TSINF_VERSION} 16 | sed -i -e s/__TSINFER_VERSION__/${TSINF_VERSION}/g _config.yml 17 | PYTHONPATH=${PYPATH} ./build.sh 18 | 19 | clean: 20 | rm -fR $(BUILDDIR) 21 | rm -rf _static/example_data.vcz/ancestral_state 22 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: Tsinfer manual 5 | author: Tskit Developers 6 | copyright: "2018" 7 | only_build_toc_files: true 8 | logo: tsinfer_logo.svg 9 | 10 | execute: 11 | execute_notebooks: cache 12 | 13 | launch_buttons: 14 | binderhub_url: "" 15 | 16 | repository: 17 | url: https://github.com/tskit-dev/tsinfer 18 | branch: main 19 | path_to_book: docs 20 | 21 | html: 22 | use_issues_button: true 23 | use_repository_button: true 24 | use_edit_page_button: true 25 | # Do not edit this - the version placeholder is replaced by the 26 | # current version during a distribution build in the Makefile 
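  # (the substitution itself is performed by the `dist` target in docs/Makefile,
  #  roughly: sed -i -e s/__TSINFER_VERSION__/${TSINF_VERSION}/g _config.yml)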
27 | extra_navbar: tsinfer __TSINFER_VERSION__ 28 | extra_footer: tsinfer __TSINFER_VERSION__ 29 | 30 | sphinx: 31 | extra_extensions: 32 | - sphinx.ext.autodoc 33 | - sphinx.ext.autosummary 34 | - sphinx.ext.todo 35 | - sphinx.ext.viewcode 36 | - sphinx.ext.intersphinx 37 | - sphinx_issues 38 | - sphinxarg.ext 39 | - IPython.sphinxext.ipython_console_highlighting 40 | 41 | config: 42 | html_theme: sphinx_book_theme 43 | html_theme_options: 44 | pygments_dark_style: monokai 45 | pygments_style: monokai 46 | myst_enable_extensions: 47 | - colon_fence 48 | - deflist 49 | issues_github_path: tskit-dev/tsinfer 50 | todo_include_todos: true 51 | intersphinx_mapping: 52 | python: ["https://docs.python.org/3/", null] 53 | tskit: ["https://tskit.dev/tskit/docs/stable", null] 54 | msprime: ["https://tskit.dev/msprime/docs/stable", null] 55 | tutorials: ["https://tskit.dev/tutorials/", null] 56 | numpy: ["https://numpy.org/doc/stable/", null] 57 | numcodecs: ["https://numcodecs.readthedocs.io/en/stable/", null] 58 | zarr: ["https://zarr.readthedocs.io/en/stable/", null] 59 | nitpicky: true 60 | 61 | autodoc_member_order: bysource 62 | 63 | # Without this option, autodoc tries to put links for all return types 64 | # in terms of the fully-qualified classnames 65 | # (e.g. msprime.demography.Demography) which we don't want, and also 66 | # leads to broken links and nitpick failures. So, until we tackle 67 | # typehints fully, this is the simplest approach. 68 | autodoc_typehints: none 69 | 70 | # Note we have to use the regex version here because of 71 | # https://github.com/sphinx-doc/sphinx/issues/9748 72 | nitpick_ignore_regex: [ 73 | [ "py:class", "arraylike" ], 74 | [ "py:class", "array_like" ], 75 | [ "py:class", "array" ], 76 | [ "py:class", "dtype=float64" ], 77 | [ "py:class", "dtype=uint32" ], 78 | [ "py:class", "dtype=int8" ], 79 | [ "py:class", "iter" ], 80 | ] 81 | -------------------------------------------------------------------------------- /docs/_static/.README: -------------------------------------------------------------------------------- 1 | Placeholder file to make git store this directory. 
2 | -------------------------------------------------------------------------------- /docs/_static/P_dom_chr24_phased.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/docs/_static/P_dom_chr24_phased.vcf.gz -------------------------------------------------------------------------------- /docs/_static/P_dom_chr24_phased.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/docs/_static/P_dom_chr24_phased.vcf.gz.tbi -------------------------------------------------------------------------------- /docs/_static/ancestor_grouping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/docs/_static/ancestor_grouping.png -------------------------------------------------------------------------------- /docs/_static/example_ancestral_state.fa: -------------------------------------------------------------------------------- 1 | >chr1 2 | nnnnnnnnnnnnnnnGnnnnnnnnnnnnnnnnnnnnnnnnnnnGnnnnnCnnnnTnnnnnnnnnnnnnnnCnnnAnnnnnnnnnTnnnnnnnnnAnnnn -------------------------------------------------------------------------------- /docs/_static/example_ancestral_state.fa.fai: -------------------------------------------------------------------------------- 1 | chr1 99 6 99 99 2 | -------------------------------------------------------------------------------- /docs/_static/example_data.vcz/.zattrs: -------------------------------------------------------------------------------- 1 | { 2 | "contigs": [ 3 | "0" 4 | ], 5 | "source": "sgkit-0.9.0" 6 | } -------------------------------------------------------------------------------- /docs/_static/example_data.vcz/.zgroup: -------------------------------------------------------------------------------- 1 | { 2 | "zarr_format": 2 3 | } -------------------------------------------------------------------------------- /docs/_static/example_data.vcz/.zmetadata: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | ".zattrs": { 4 | "contigs": [ 5 | "0" 6 | ], 7 | "source": "sgkit-0.9.0" 8 | }, 9 | ".zgroup": { 10 | "zarr_format": 2 11 | }, 12 | "call_genotype/.zarray": { 13 | "chunks": [ 14 | 8, 15 | 3, 16 | 2 17 | ], 18 | "compressor": { 19 | "blocksize": 0, 20 | "clevel": 5, 21 | "cname": "lz4", 22 | "id": "blosc", 23 | "shuffle": 1 24 | }, 25 | "dtype": "|i1", 26 | "fill_value": null, 27 | "filters": null, 28 | "order": "C", 29 | "shape": [ 30 | 8, 31 | 3, 32 | 2 33 | ], 34 | "zarr_format": 2 35 | }, 36 | "call_genotype/.zattrs": { 37 | "_ARRAY_DIMENSIONS": [ 38 | "variants", 39 | "samples", 40 | "ploidy" 41 | ], 42 | "comment": "Call genotype. 
Encoded as allele values (0 for the reference, 1 for\nthe first allele, 2 for the second allele), -1 to indicate a\nmissing value, or -2 to indicate a non allele in mixed ploidy datasets.", 43 | "mixed_ploidy": false 44 | }, 45 | "call_genotype_mask/.zarray": { 46 | "chunks": [ 47 | 8, 48 | 3, 49 | 2 50 | ], 51 | "compressor": { 52 | "blocksize": 0, 53 | "clevel": 5, 54 | "cname": "lz4", 55 | "id": "blosc", 56 | "shuffle": 1 57 | }, 58 | "dtype": "|i1", 59 | "fill_value": null, 60 | "filters": null, 61 | "order": "C", 62 | "shape": [ 63 | 8, 64 | 3, 65 | 2 66 | ], 67 | "zarr_format": 2 68 | }, 69 | "call_genotype_mask/.zattrs": { 70 | "_ARRAY_DIMENSIONS": [ 71 | "variants", 72 | "samples", 73 | "ploidy" 74 | ], 75 | "comment": "A flag for each call indicating which values are missing.", 76 | "dtype": "bool" 77 | }, 78 | "call_genotype_phased/.zarray": { 79 | "chunks": [ 80 | 8, 81 | 3 82 | ], 83 | "compressor": { 84 | "blocksize": 0, 85 | "clevel": 5, 86 | "cname": "lz4", 87 | "id": "blosc", 88 | "shuffle": 1 89 | }, 90 | "dtype": "|i1", 91 | "fill_value": null, 92 | "filters": null, 93 | "order": "C", 94 | "shape": [ 95 | 8, 96 | 3 97 | ], 98 | "zarr_format": 2 99 | }, 100 | "call_genotype_phased/.zattrs": { 101 | "_ARRAY_DIMENSIONS": [ 102 | "variants", 103 | "samples" 104 | ], 105 | "comment": "A flag for each call indicating if it is phased or not. If omitted\nall calls are unphased.", 106 | "dtype": "bool" 107 | }, 108 | "contig_id/.zarray": { 109 | "chunks": [ 110 | 1 111 | ], 112 | "compressor": { 113 | "blocksize": 0, 114 | "clevel": 5, 115 | "cname": "lz4", 116 | "id": "blosc", 117 | "shuffle": 1 118 | }, 119 | "dtype": "`. There are two 14 | equivalent ways to invoke this program: 15 | 16 | .. code-block:: bash 17 | 18 | $ tsinfer 19 | 20 | or 21 | 22 | .. code-block:: bash 23 | 24 | $ python3 -m tsinfer 25 | 26 | The first form is more intuitive and works well most of the time. The second 27 | form is useful when multiple versions of Python are installed or if the 28 | :command:`tsinfer` executable is not installed on your path. 29 | 30 | The :command:`tsinfer` program has five subcommands: :command:`list` prints a 31 | summary of the data held in one of tsinfer's :ref:`file formats `; 32 | :command:`infer` runs the complete :ref:`inference process ` for a given 33 | input SampleData file; and 34 | :command:`generate-ancestors`, :command:`match-ancestors` and 35 | :command:`match-samples` run the three parts of this inference 36 | process as separate steps. Running the inference as separate steps like this 37 | is recommended for large inferences as it allows for greater control over 38 | the inference process. 39 | 40 | ++++++++++++++++ 41 | Argument details 42 | ++++++++++++++++ 43 | 44 | .. argparse:: 45 | :module: tsinfer 46 | :func: get_cli_parser 47 | :prog: tsinfer 48 | :nodefault: 49 | 50 | -------------------------------------------------------------------------------- /docs/development.rst: -------------------------------------------------------------------------------- 1 | .. _sec_development: 2 | 3 | ======================= 4 | Developer documentation 5 | ======================= 6 | 7 | .. todo:: Write developer documentation. 8 | -------------------------------------------------------------------------------- /docs/file_formats.rst: -------------------------------------------------------------------------------- 1 | .. 
_sec_file_formats: 2 | 3 | ============ 4 | File formats 5 | ============ 6 | 7 | ``tsinfer`` uses the excellent `zarr library `_ 8 | to encode data in a form that is both compact and efficient to process. 9 | See the :ref:`API documentation ` for details on 10 | how to construct and manipulate these files using Python. The 11 | :ref:`tsinfer list ` command provides a way to print out a 12 | summary of these files. 13 | 14 | 15 | .. _sec_file_formats_ancestors: 16 | 17 | ************** 18 | Ancestors File 19 | ************** 20 | 21 | The ancestors file contains the ancestral haplotype data inferred from the 22 | sample data in the :ref:`sec_inference_generate_ancestors` step. 23 | 24 | .. todo:: Document the structure of the ancestors file. 25 | 26 | 27 | .. _sec_file_formats_tree_sequences: 28 | 29 | ************** 30 | Tree sequences 31 | ************** 32 | 33 | The goal of ``tsinfer`` is to infer correlated genealogies from variation 34 | data, and it uses the very efficient `succinct tree sequence 35 | `_ data structure 36 | to encode this output. Please see the `tskit documentation 37 | `_ for details on how to 38 | process and manipulate such tree sequences. 39 | 40 | The intermediate ``.ancestors.trees`` file produced by the 41 | :ref:`sec_inference_match_ancestors` step is also a 42 | tree sequence and can be loaded and analysed using the 43 | `tskit API `_. 44 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to tsinfer's documentation! 2 | 3 | This is the documentation for {program}`tsinfer`, a method for inferring correlated 4 | genealogies (a.k.a. tree sequence or ARGs) from genetic variation data. 5 | 6 | Besides this manual, there are a number of other resources 7 | available for learning about {program}`tskit` and {program}`tsinfer`: 8 | 9 | - The [tskit tutorials](https://tskit.dev/tutorials) site contains 10 | in-depth tutorials on analysis of the {program}`tskit` tree sequences produced by 11 | {program}`tsinfer`. 12 | 13 | - Our [Discussions board](https://github.com/tskit-dev/tsinfer/discussions) 14 | is a great place to ask questions like "how do I do X" or "what's the best 15 | way to do Y". Please make questions as clear as possible, and be respectful, 16 | helpful, and kind. 17 | 18 | ```{important} 19 | If you use {program}`tsinfer` in your work, please remember to 20 | cite it appropriately: see the {ref}`citations` page 21 | for details. 22 | ``` 23 | 24 | 25 | ## Contents 26 | 27 | ```{tableofcontents} 28 | ``` 29 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. _sec_installation: 2 | 3 | ############ 4 | Installation 5 | ############ 6 | 7 | Python 3.9 or newer is required for ``tsinfer``. Any Unix-like platform should 8 | work (``tsinfer`` is tested on Linux, OS X, and Windows). 
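Whichever installation route you choose below, a quick way to confirm that the
install worked is to import the package and print its version (the same check
used in this project's continuous integration), for example::

    $ python3 -c "import tsinfer; print(tsinfer.__version__)"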
9 | 10 | *************** 11 | Binary packages 12 | *************** 13 | 14 | The most reliable way to install ``tsinfer`` is to install the binary conda package: 15 | e.g.:: 16 | 17 | $ conda install tsinfer -c conda-forge 18 | 19 | you can then ``import tsinfer`` in python or use the ``tsinfer`` executable directly:: 20 | 21 | $ tsinfer --help 22 | 23 | ********************** 24 | Installing from source 25 | ********************** 26 | 27 | It is also possible to install from source via ``pip`` (although see the issues below): 28 | 29 | $ python -m pip install tsinfer --user 30 | 31 | which will install ``tsinfer`` to the Python installation corresponding to your 32 | ``python`` executable. All requirements should be installed automatically. 33 | 34 | To run the command line interface to ``tsinfer`` you can then use:: 35 | 36 | $ python -m tsinfer --help 37 | 38 | 39 | If your ``PATH`` is set up to point at the corresponding ``bin`` directory 40 | you can also use the ``tsinfer`` executable directly:: 41 | 42 | $ tsinfer --help 43 | 44 | You may wish to install into a virtual environment 45 | first using `venv `_:: 46 | 47 | $ python -m venv tsinfer-venv 48 | $ source tsinfer-venv/bin/activate 49 | (tsinfer-venv) $ python -m pip install tsinfer 50 | (tsinfer-venv) $ tsinfer --help 51 | 52 | **************** 53 | Potential issues 54 | **************** 55 | 56 | #. One of the dependencies of ``tsinfer``, 57 | `numcodecs `_, is compiled to 58 | use AVX2 instructions (where available) when installed using pip. This can lead to 59 | issues when ``numcodecs`` is compiled on a machine that supports AVX2 60 | and subsequently run on older machines that do not. To resolve this, ``numcodecs`` 61 | has a ``DISABLE_NUMCODECS_AVX2`` variable which can be turned on before calling 62 | ``pip install``, see 63 | `these instructions `_ 64 | for details. 65 | 66 | #. There can be problems compiling from source using the default compilers under Mac OS 67 | (see https://github.com/tskit-dev/tsinfer/issues/376). The current workaround is 68 | either to compile from source by installing alternative python and C compilers via 69 | conda (``conda install -c conda-forge c-compiler``) or to install the binary 70 | packages via conda as recommended at the top of this page. 71 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _sec_introduction: 2 | 3 | ============ 4 | Introduction 5 | ============ 6 | 7 | The goal of ``tsinfer`` is to infer *succinct tree sequences* from observed 8 | genetic variation data. A succinct tree sequence (or 9 | :ref:`tree sequence`, for short) 10 | is an efficient way of representing the correlated genealogies that 11 | describe the ancestry of many species. By inferring these tree sequences, we 12 | make two very important gains: 13 | 14 | 1. We obtain an approximation of the true history of our sampled data, which 15 | may be useful for other inferential tasks. 16 | 17 | 2. The data structure itself is an extremely concise and efficient means of 18 | storing and processing the data that we have. 19 | 20 | The output of ``tsinfer`` is a :class:`tskit.TreeSequence` and so the 21 | full `tskit API `_ can be used to 22 | analyse real data, in precisely the same way that it is commonly used 23 | to analyse simulation data, for example, from `msprime `_. 24 | 25 | .. 
note:: 26 | 27 | ``Tsinfer`` infers the genetic relationships between sampled genomes, but does not 28 | attempt to infer the *times* of most recent common ancestors (tMRCAs) in the genealogy. 29 | If you are using the output of ``tsinfer`` in downstream analysis that relies on 30 | node times, you are advised not to use the inferred tree sequences directly; instead, 31 | you should post-process the ``tsinfer`` output using software such as 32 | `tsdate `_ that attempts to assign calendar or 33 | generation times to the tree sequence nodes. -------------------------------------------------------------------------------- /docs/simulation-example.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | import subprocess 3 | import sys 4 | 5 | import msprime 6 | import numpy as np 7 | from Bio import bgzf 8 | 9 | if getattr(builtins, "__IPYTHON__", False): # if running IPython: e.g. in a notebook 10 | num_diploids, seq_len = 100, 10_000 11 | name = "notebook-simulation" 12 | else: # Take parameters from the command-line 13 | num_diploids, seq_len = int(sys.argv[1]), float(sys.argv[2]) 14 | name = "cli-simulation" 15 | 16 | ts = msprime.sim_ancestry( 17 | num_diploids, 18 | population_size=10**4, 19 | recombination_rate=1e-8, 20 | sequence_length=seq_len, 21 | random_seed=6, 22 | ) 23 | ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=7) 24 | ts.dump(name + "-source.trees") 25 | print( 26 | f"Simulated {ts.num_samples} samples over {seq_len/1e6} Mb:", 27 | f"{ts.num_trees} trees and {ts.num_sites} sites", 28 | ) 29 | 30 | # Convert to a zarr file: this should be easier once a tskit2zarr utility is made, see 31 | # https://github.com/sgkit-dev/bio2zarr/issues/232 32 | np.save(f"{name}-AA.npy", [s.ancestral_state for s in ts.sites()]) 33 | vcf_name = f"{name}.vcf.gz" 34 | with bgzf.open(vcf_name, "wt") as f: 35 | ts.write_vcf(f) 36 | subprocess.run(["tabix", vcf_name]) 37 | ret = subprocess.run( 38 | "python -m bio2zarr vcf2zarr convert --force".split() + [vcf_name, name + ".vcz"], 39 | stderr=subprocess.DEVNULL if name == "notebook-simulation" else None, 40 | ) 41 | if ret.returncode == 0: 42 | print(f"Converted to {name}.vcz") 43 | -------------------------------------------------------------------------------- /docs/tsinfer_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | image/svg+xml -------------------------------------------------------------------------------- /lib/.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | BasedOnStyle: GNU 3 | SortIncludes: false 4 | AllowShortIfStatementsOnASingleLine: false 5 | BreakBeforeBraces: Linux 6 | TabWidth: 4 7 | IndentWidth: 4 8 | ColumnLimit: 89 9 | SpaceBeforeParens: 10 | ControlStatements 11 | SpacesInCStyleCastParentheses: false 12 | SpaceAfterCStyleCast: true 13 | IndentCaseLabels: true 14 | AlignAfterOpenBracket: DontAlign 15 | BinPackArguments: true 16 | BinPackParameters: true 17 | AlwaysBreakAfterReturnType: AllDefinitions 18 | 19 | # These are disabled for version 6 compatibility 20 | # StatementMacros: ["PyObject_HEAD"] 21 | # AlignConsecutiveMacros: true 22 | -------------------------------------------------------------------------------- /lib/avl.h: -------------------------------------------------------------------------------- 1 | /***************************************************************************** 2 | 3 | avl.h - Source code for the AVL-tree library. 
4 | 5 | Copyright (C) 1998 Michael H. Buselli 6 | Copyright (C) 2000-2002 Wessel Dankers 7 | 8 | This library is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU Lesser General Public 10 | License as published by the Free Software Foundation; either 11 | version 2.1 of the License, or (at your option) any later version. 12 | 13 | This library is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public 19 | License along with this library; if not, write to the Free Software 20 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 | 22 | Augmented AVL-tree. Original by Michael H. Buselli . 23 | 24 | Modified by Wessel Dankers to add a bunch of bloat to 25 | the sourcecode, change the interface and squash a few bugs. 26 | Mail him if you find new bugs. 27 | 28 | *****************************************************************************/ 29 | 30 | #ifndef _AVL_H 31 | #define _AVL_H 32 | 33 | /* We need either depths, counts or both (the latter being the default) */ 34 | #if !defined(AVL_DEPTH) && !defined(AVL_COUNT) 35 | #define AVL_DEPTH 36 | #define AVL_COUNT 37 | #endif 38 | 39 | /* User supplied function to compare two items like strcmp() does. 40 | * For example: cmp(a,b) will return: 41 | * -1 if a < b 42 | * 0 if a = b 43 | * 1 if a > b 44 | */ 45 | typedef int (*avl_compare_t)(const void *, const void *); 46 | 47 | /* User supplied function to delete an item when a node is free()d. 48 | * If NULL, the item is not free()d. 49 | */ 50 | typedef void (*avl_freeitem_t)(void *); 51 | 52 | typedef struct avl_node_t { 53 | struct avl_node_t *next; 54 | struct avl_node_t *prev; 55 | struct avl_node_t *parent; 56 | struct avl_node_t *left; 57 | struct avl_node_t *right; 58 | void *item; 59 | #ifdef AVL_COUNT 60 | unsigned int count; 61 | #endif 62 | #ifdef AVL_DEPTH 63 | unsigned char depth; 64 | #endif 65 | } avl_node_t; 66 | 67 | typedef struct avl_tree_t { 68 | avl_node_t *head; 69 | avl_node_t *tail; 70 | avl_node_t *top; 71 | avl_compare_t cmp; 72 | avl_freeitem_t freeitem; 73 | } avl_tree_t; 74 | 75 | /* Initializes a new tree for elements that will be ordered using 76 | * the supplied strcmp()-like function. 77 | * Returns the value of avltree (even if it's NULL). 78 | * O(1) */ 79 | extern avl_tree_t *avl_init_tree(avl_tree_t *avltree, avl_compare_t, avl_freeitem_t); 80 | 81 | /* Allocates and initializes a new tree for elements that will be 82 | * ordered using the supplied strcmp()-like function. 83 | * Returns NULL if memory could not be allocated. 84 | * O(1) */ 85 | extern avl_tree_t *avl_alloc_tree(avl_compare_t, avl_freeitem_t); 86 | 87 | /* Frees the entire tree efficiently. Nodes will be free()d. 88 | * If the tree's freeitem is not NULL it will be invoked on every item. 89 | * O(n) */ 90 | extern void avl_free_tree(avl_tree_t *); 91 | 92 | /* Reinitializes the tree structure for reuse. Nothing is free()d. 93 | * Compare and freeitem functions are left alone. 94 | * O(1) */ 95 | extern void avl_clear_tree(avl_tree_t *); 96 | 97 | /* Free()s all nodes in the tree but leaves the tree itself. 98 | * If the tree's freeitem is not NULL it will be invoked on every item. 
99 | * O(n) */ 100 | extern void avl_free_nodes(avl_tree_t *); 101 | 102 | /* Initializes memory for use as a node. Returns NULL if avlnode is NULL. 103 | * O(1) */ 104 | extern avl_node_t *avl_init_node(avl_node_t *avlnode, void *item); 105 | 106 | /* Insert an item into the tree and return the new node. 107 | * Returns NULL and sets errno if memory for the new node could not be 108 | * allocated or if the node is already in the tree (EEXIST). 109 | * O(lg n) */ 110 | extern avl_node_t *avl_insert(avl_tree_t *, void *item); 111 | 112 | /* Insert a node into the tree and return it. 113 | * Returns NULL if the node is already in the tree. 114 | * O(lg n) */ 115 | extern avl_node_t *avl_insert_node(avl_tree_t *, avl_node_t *); 116 | 117 | /* Insert a node in an empty tree. If avlnode is NULL, the tree will be 118 | * cleared and ready for re-use. 119 | * If the tree is not empty, the old nodes are left dangling. 120 | * O(1) */ 121 | extern avl_node_t *avl_insert_top(avl_tree_t *, avl_node_t *avlnode); 122 | 123 | /* Insert a node before another node. Returns the new node. 124 | * If old is NULL, the item is appended to the tree. 125 | * O(lg n) */ 126 | extern avl_node_t *avl_insert_before(avl_tree_t *, avl_node_t *old, avl_node_t *new); 127 | 128 | /* Insert a node after another node. Returns the new node. 129 | * If old is NULL, the item is prepended to the tree. 130 | * O(lg n) */ 131 | extern avl_node_t *avl_insert_after(avl_tree_t *, avl_node_t *old, avl_node_t *new); 132 | 133 | /* Deletes a node from the tree. Returns immediately if the node is NULL. 134 | * The item will not be free()d regardless of the tree's freeitem handler. 135 | * This function comes in handy if you need to update the search key. 136 | * O(lg n) */ 137 | extern void avl_unlink_node(avl_tree_t *, avl_node_t *); 138 | 139 | /* Deletes a node from the tree. Returns immediately if the node is NULL. 140 | * If the tree's freeitem is not NULL, it is invoked on the item. 141 | * If it is, returns the item. 142 | * O(lg n) */ 143 | extern void *avl_delete_node(avl_tree_t *, avl_node_t *); 144 | 145 | /* Searches for an item in the tree and deletes it if found. 146 | * If the tree's freeitem is not NULL, it is invoked on the item. 147 | * If it is, returns the item. 148 | * O(lg n) */ 149 | extern void *avl_delete(avl_tree_t *, const void *item); 150 | 151 | /* If exactly one node is moved in memory, this will fix the pointers 152 | * in the tree that refer to it. It must be an exact shallow copy. 153 | * Returns the pointer to the old position. 154 | * O(1) */ 155 | extern avl_node_t *avl_fixup_node(avl_tree_t *, avl_node_t *new); 156 | 157 | /* Searches for a node with the key closest (or equal) to the given item. 158 | * If avlnode is not NULL, *avlnode will be set to the node found or NULL 159 | * if the tree is empty. Return values: 160 | * -1 if the returned node is smaller 161 | * 0 if the returned node is equal or if the tree is empty 162 | * 1 if the returned node is greater 163 | * O(lg n) */ 164 | extern int avl_search_closest(const avl_tree_t *, const void *item, avl_node_t **avlnode); 165 | 166 | /* Searches for the item in the tree and returns a matching node if found 167 | * or NULL if not. 168 | * O(lg n) */ 169 | extern avl_node_t *avl_search(const avl_tree_t *, const void *item); 170 | 171 | #ifdef AVL_COUNT 172 | /* Returns the number of nodes in the tree. 173 | * O(1) */ 174 | extern unsigned int avl_count(const avl_tree_t *); 175 | 176 | /* Searches a node by its rank in the list. 
Counting starts at 0. 177 | * Returns NULL if the index exceeds the number of nodes in the tree. 178 | * O(lg n) */ 179 | extern avl_node_t *avl_at(const avl_tree_t *, unsigned int); 180 | 181 | /* Returns the rank of a node in the list. Counting starts at 0. 182 | * O(lg n) */ 183 | extern unsigned int avl_index(const avl_node_t *); 184 | #endif 185 | 186 | #endif 187 | -------------------------------------------------------------------------------- /lib/err.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** Copyright (C) 2020 University of Oxford 3 | ** 4 | ** This file is part of tsinfer. 5 | ** 6 | ** tsinfer is free software: you can redistribute it and/or modify 7 | ** it under the terms of the GNU General Public License as published by 8 | ** the Free Software Foundation, either version 3 of the License, or 9 | ** (at your option) any later version. 10 | ** 11 | ** tsinfer is distributed in the hope that it will be useful, 12 | ** but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ** GNU General Public License for more details. 15 | ** 16 | ** You should have received a copy of the GNU General Public License 17 | ** along with tsinfer. If not, see . 18 | */ 19 | 20 | #include "err.h" 21 | #include 22 | 23 | const char * 24 | tsi_strerror(int err) 25 | { 26 | const char *ret = "Unknown error"; 27 | 28 | switch (err) { 29 | case 0: 30 | ret = "Normal exit condition. This is not an error!"; 31 | break; 32 | 33 | case TSI_ERR_GENERIC: 34 | ret = "Generic tsinfer error - please file a bug report."; 35 | break; 36 | case TSI_ERR_NO_MEMORY: 37 | ret = "Out of memory"; 38 | break; 39 | case TSI_ERR_NONCONTIGUOUS_EDGES: 40 | ret = "Edges must be contiguous"; 41 | break; 42 | case TSI_ERR_UNSORTED_EDGES: 43 | ret = "Edges must be sorted"; 44 | break; 45 | case TSI_ERR_PC_ANCESTOR_TIME: 46 | ret = "Failure generating time for path compression ancestor"; 47 | break; 48 | case TSI_ERR_BAD_PATH_CHILD: 49 | ret = "Bad path information: child node"; 50 | break; 51 | case TSI_ERR_BAD_PATH_PARENT: 52 | ret = "Bad path information: parent node"; 53 | break; 54 | case TSI_ERR_BAD_PATH_TIME: 55 | ret = "Bad path information: time"; 56 | break; 57 | case TSI_ERR_BAD_PATH_INTERVAL: 58 | ret = "Bad path information: left >= right"; 59 | break; 60 | case TSI_ERR_BAD_PATH_LEFT_LESS_ZERO: 61 | ret = "Bad path information: left < 0"; 62 | break; 63 | case TSI_ERR_BAD_PATH_RIGHT_GREATER_NUM_SITES: 64 | ret = "Bad path information: right > num_sites"; 65 | break; 66 | case TSI_ERR_MATCH_IMPOSSIBLE: 67 | ret = "Unexpected failure to find matching haplotype; please open " 68 | "an issue on GitHub"; 69 | break; 70 | case TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA: 71 | ret = "Cannot find match: the specified mismatch probability is " 72 | "0 or 1 and no matches are possible with these parameters"; 73 | break; 74 | case TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION: 75 | ret = "Cannot find match: the specified recombination probability is" 76 | "zero and no matches could be found. 
Increasing the 'precision' " 77 | "may help, but recombination values of 0 are not recommended."; 78 | break; 79 | case TSI_ERR_BAD_HAPLOTYPE_ALLELE: 80 | ret = "Input haplotype contains bad allele information."; 81 | break; 82 | case TSI_ERR_BAD_NUM_ALLELES: 83 | ret = "The number of alleles must be between 2 and 127"; 84 | break; 85 | case TSI_ERR_BAD_MUTATION_NODE: 86 | ret = "Bad mutation information: node"; 87 | break; 88 | case TSI_ERR_BAD_MUTATION_SITE: 89 | ret = "Bad mutation information: site"; 90 | break; 91 | case TSI_ERR_BAD_MUTATION_DERIVED_STATE: 92 | ret = "Bad mutation information: derived state"; 93 | break; 94 | case TSI_ERR_BAD_MUTATION_DUPLICATE_NODE: 95 | ret = "Bad mutation information: mutation already exists for this node."; 96 | break; 97 | case TSI_ERR_BAD_NUM_SAMPLES: 98 | ret = "Must have at least 2 samples."; 99 | break; 100 | case TSI_ERR_TOO_MANY_SITES: 101 | ret = "Cannot add more sites than the specified maximum."; 102 | break; 103 | case TSI_ERR_BAD_FOCAL_SITE: 104 | ret = "Bad focal site."; 105 | break; 106 | case TSI_ERR_ONE_BIT_NON_BINARY: 107 | ret = "One-bit genotype encoding only supports binary 0/1 data"; 108 | break; 109 | case TSI_ERR_IO: 110 | ret = tsk_strerror(TSK_ERR_IO); 111 | break; 112 | } 113 | return ret; 114 | } 115 | -------------------------------------------------------------------------------- /lib/err.h: -------------------------------------------------------------------------------- 1 | #ifndef __ERR_H__ 2 | #define __ERR_H__ 3 | 4 | // clang-format off 5 | #define TSI_ERR_GENERIC -1 6 | #define TSI_ERR_NO_MEMORY -2 7 | #define TSI_ERR_NONCONTIGUOUS_EDGES -3 8 | #define TSI_ERR_UNSORTED_EDGES -4 9 | #define TSI_ERR_PC_ANCESTOR_TIME -5 10 | #define TSI_ERR_BAD_PATH_CHILD -6 11 | #define TSI_ERR_BAD_PATH_PARENT -7 12 | #define TSI_ERR_BAD_PATH_TIME -8 13 | #define TSI_ERR_BAD_PATH_INTERVAL -9 14 | #define TSI_ERR_BAD_PATH_LEFT_LESS_ZERO -10 15 | #define TSI_ERR_BAD_PATH_RIGHT_GREATER_NUM_SITES -11 16 | #define TSI_ERR_MATCH_IMPOSSIBLE -12 17 | #define TSI_ERR_BAD_HAPLOTYPE_ALLELE -13 18 | #define TSI_ERR_BAD_NUM_ALLELES -14 19 | #define TSI_ERR_BAD_MUTATION_NODE -15 20 | #define TSI_ERR_BAD_MUTATION_SITE -16 21 | #define TSI_ERR_BAD_MUTATION_DERIVED_STATE -17 22 | #define TSI_ERR_BAD_MUTATION_DUPLICATE_NODE -18 23 | #define TSI_ERR_BAD_NUM_SAMPLES -19 24 | #define TSI_ERR_TOO_MANY_SITES -20 25 | #define TSI_ERR_BAD_FOCAL_SITE -21 26 | #define TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA -22 27 | #define TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION -23 28 | #define TSI_ERR_ONE_BIT_NON_BINARY -24 29 | #define TSI_ERR_IO -25 30 | // clang-format on 31 | 32 | #ifdef __GNUC__ 33 | #define WARN_UNUSED __attribute__((warn_unused_result)) 34 | #define unlikely(expr) __builtin_expect(!!(expr), 0) 35 | #define likely(expr) __builtin_expect(!!(expr), 1) 36 | #else 37 | /* On windows we don't do any perf related stuff */ 38 | #define WARN_UNUSED 39 | #define restrict 40 | #define unlikely(expr) (expr) 41 | #define likely(expr) (expr) 42 | #endif 43 | 44 | const char *tsi_strerror(int err); 45 | 46 | #endif /*__ERR_H__*/ 47 | -------------------------------------------------------------------------------- /lib/meson.build: -------------------------------------------------------------------------------- 1 | project('tsinfer', 'c') 2 | 3 | tskit_proj = subproject('tskit') 4 | tskit_dep = tskit_proj.get_variable('tskit_dep') 5 | 6 | cc = meson.get_compiler('c') 7 | m_dep = cc.find_library('m', required : false) 8 | cunit_dep = 
dependency('cunit') 9 | 10 | extra_c_args = [ 11 | '-std=c99', '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W', 12 | '-Wmissing-prototypes', '-Wstrict-prototypes', 13 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 14 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 15 | '-fshort-enums', '-fno-common'] 16 | 17 | tsinfer_sources =[ 18 | 'ancestor_matcher.c', 'ancestor_builder.c', 'tree_sequence_builder.c', 19 | 'object_heap.c', 'err.c'] 20 | 21 | avl_lib = static_library('avl', sources: ['avl.c']) 22 | tsinfer_lib = static_library('tsinfer', 23 | sources: tsinfer_sources, dependencies: [m_dep, tskit_dep], 24 | c_args: extra_c_args, link_with:[avl_lib]) 25 | 26 | unit_tests = executable('tests', 27 | sources: ['tests/tests.c'], 28 | link_with: [tsinfer_lib], dependencies:[cunit_dep, tskit_dep]) 29 | test('Unit tests', unit_tests) 30 | -------------------------------------------------------------------------------- /lib/object_heap.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** Copyright (C) 2018 University of Oxford 3 | ** 4 | ** This file is part of tsinfer. 5 | ** 6 | ** tsinfer is free software: you can redistribute it and/or modify 7 | ** it under the terms of the GNU General Public License as published by 8 | ** the Free Software Foundation, either version 3 of the License, or 9 | ** (at your option) any later version. 10 | ** 11 | ** tsinfer is distributed in the hope that it will be useful, 12 | ** but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | ** GNU General Public License for more details. 15 | ** 16 | ** You should have received a copy of the GNU General Public License 17 | ** along with tsinfer. If not, see . 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "err.h" 26 | #include "object_heap.h" 27 | 28 | /* memory heap manager */ 29 | 30 | size_t 31 | object_heap_get_num_allocated(object_heap_t *self) 32 | { 33 | return self->size - self->top; 34 | } 35 | 36 | void 37 | object_heap_print_state(object_heap_t *self, FILE *out) 38 | { 39 | fprintf(out, "object heap %p::\n", (void *) self); 40 | fprintf(out, "\tsize = %d\n", (int) self->size); 41 | fprintf(out, "\ttop = %d\n", (int) self->top); 42 | fprintf(out, "\tblock_size = %d\n", (int) self->block_size); 43 | fprintf(out, "\tnum_blocks = %d\n", (int) self->num_blocks); 44 | fprintf(out, "\ttotal allocated = %d\n", (int) object_heap_get_num_allocated(self)); 45 | } 46 | 47 | static void 48 | object_heap_add_block(object_heap_t *self, char *mem_block) 49 | { 50 | size_t j, index; 51 | 52 | for (j = 0; j < self->block_size; j++) { 53 | self->heap[j] = mem_block + j * self->object_size; 54 | if (self->init_object != NULL) { 55 | index = j + (self->num_blocks - 1) * self->block_size; 56 | self->init_object(self->heap[j], index); 57 | } 58 | } 59 | self->top = self->block_size; 60 | } 61 | 62 | int WARN_UNUSED 63 | object_heap_expand(object_heap_t *self) 64 | { 65 | int ret = -1; 66 | void *p; 67 | 68 | p = realloc(self->mem_blocks, (self->num_blocks + 1) * sizeof(void *)); 69 | if (p == NULL) { 70 | ret = TSI_ERR_NO_MEMORY; 71 | goto out; 72 | } 73 | self->mem_blocks = p; 74 | p = calloc(self->block_size, self->object_size); 75 | if (p == NULL) { 76 | ret = TSI_ERR_NO_MEMORY; 77 | goto out; 78 | } 79 | self->mem_blocks[self->num_blocks] = p; 80 | self->num_blocks++; 81 | /* Now we increase the size of the heap. 
Since it is currently empty, 82 | * we avoid the copying cost of realloc and free before making a new 83 | * heap. 84 | */ 85 | free(self->heap); 86 | self->heap = NULL; 87 | self->size += self->block_size; 88 | self->heap = calloc(self->size, sizeof(void *)); 89 | if (self->heap == NULL) { 90 | ret = TSI_ERR_NO_MEMORY; 91 | goto out; 92 | } 93 | object_heap_add_block(self, p); 94 | ret = 0; 95 | out: 96 | return ret; 97 | } 98 | 99 | /* 100 | * Returns the jth object in the memory buffers. 101 | */ 102 | inline void *WARN_UNUSED 103 | object_heap_get_object(object_heap_t *self, size_t index) 104 | { 105 | void *ret = NULL; 106 | size_t block, obj; 107 | 108 | block = index / self->block_size; 109 | obj = index % self->block_size; 110 | if (block < self->num_blocks && obj < self->block_size) { 111 | ret = self->mem_blocks[block] + obj * self->object_size; 112 | } 113 | return ret; 114 | } 115 | 116 | inline int WARN_UNUSED 117 | object_heap_empty(object_heap_t *self) 118 | { 119 | return self->top == 0; 120 | } 121 | 122 | inline void *WARN_UNUSED 123 | object_heap_alloc_object(object_heap_t *self) 124 | { 125 | void *ret = NULL; 126 | 127 | if (self->top > 0) { 128 | self->top--; 129 | ret = self->heap[self->top]; 130 | } 131 | return ret; 132 | } 133 | 134 | inline void 135 | object_heap_free_object(object_heap_t *self, void *obj) 136 | { 137 | assert(self->top < self->size); 138 | self->heap[self->top] = obj; 139 | self->top++; 140 | } 141 | 142 | int WARN_UNUSED 143 | object_heap_init(object_heap_t *self, size_t object_size, size_t block_size, 144 | void (*init_object)(void **, size_t)) 145 | { 146 | int ret = -1; 147 | 148 | assert(block_size > 0); 149 | memset(self, 0, sizeof(object_heap_t)); 150 | self->block_size = block_size; 151 | self->size = block_size; 152 | self->object_size = object_size; 153 | self->init_object = init_object; 154 | self->num_blocks = 1; 155 | self->heap = calloc(self->size, sizeof(void *)); 156 | self->mem_blocks = calloc(1, sizeof(void *)); 157 | if (self->heap == NULL || self->mem_blocks == NULL) { 158 | ret = TSI_ERR_NO_MEMORY; 159 | goto out; 160 | } 161 | self->mem_blocks[0] = calloc(self->size, self->object_size); 162 | if (self->mem_blocks[0] == NULL) { 163 | ret = TSI_ERR_NO_MEMORY; 164 | goto out; 165 | } 166 | self->top = 0; 167 | object_heap_add_block(self, self->mem_blocks[0]); 168 | ret = 0; 169 | out: 170 | return ret; 171 | } 172 | 173 | void 174 | object_heap_free(object_heap_t *self) 175 | { 176 | size_t j; 177 | 178 | if (self->mem_blocks != NULL) { 179 | for (j = 0; j < self->num_blocks; j++) { 180 | if (self->mem_blocks[j] != NULL) { 181 | free(self->mem_blocks[j]); 182 | } 183 | } 184 | free(self->mem_blocks); 185 | } 186 | if (self->heap != NULL) { 187 | free(self->heap); 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /lib/object_heap.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef OBJECT_HEAP_H 3 | #define OBJECT_HEAP_H 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | typedef struct { 10 | size_t object_size; 11 | size_t block_size; /* number of objects in a block */ 12 | size_t top; 13 | size_t size; 14 | size_t num_blocks; 15 | void **heap; 16 | char **mem_blocks; 17 | void (*init_object)(void **obj, size_t index); 18 | } object_heap_t; 19 | 20 | extern size_t object_heap_get_num_allocated(object_heap_t *self); 21 | extern void object_heap_print_state(object_heap_t *self, FILE *out); 22 | extern int 
object_heap_expand(object_heap_t *self); 23 | extern void *object_heap_get_object(object_heap_t *self, size_t index); 24 | extern int object_heap_empty(object_heap_t *self); 25 | extern void *object_heap_alloc_object(object_heap_t *self); 26 | extern void object_heap_free_object(object_heap_t *self, void *obj); 27 | extern int object_heap_init(object_heap_t *self, size_t object_size, size_t block_size, 28 | void (*init_object)(void **, size_t)); 29 | extern void object_heap_free(object_heap_t *self); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /lib/subprojects/README: -------------------------------------------------------------------------------- 1 | This wrapfile is just used by meson for compiling the C code for 2 | tests. It's not used by the top-level Python module in any 3 | way - that uses a git submodule to get the tskit code. 4 | -------------------------------------------------------------------------------- /lib/subprojects/tskit.wrap: -------------------------------------------------------------------------------- 1 | [wrap-file] 2 | directory = tskit-1.1.1 3 | 4 | source_url = https://github.com/tskit-dev/tskit/releases/download/C_1.1.1/tskit-1.1.1.tar.xz 5 | source_filename = tskit-1.1.1.tar.xz 6 | source_hash = 12e9de302686fbc58be7a40066a2e478faa7da44a0b038f6d7f87e7f3a319984 7 | 8 | 9 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.1.1 -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/Makefile: -------------------------------------------------------------------------------- 1 | # Simple Makefile for building examples. 2 | # This will build the examples in the current directory by compiling in the 3 | # full tskit source into each of the examples. This is *not* recommended for 4 | # real projects! 5 | # 6 | # To use, type "make" in the this directory. If you have GSL installed you 7 | # should then get two example programs built. 8 | # 9 | # **Note**: This repo uses git submodules, and these must be checked out 10 | # correctly for this makefile to work, e.g.: 11 | # 12 | # $ git clone git@github.com:tskit-dev/tskit.git --recurse-submodules 13 | # 14 | # See the documentation (https://tskit.dev/tskit/docs/stable/c-api.html) 15 | # for more details on how to use the C API, and the tskit build examples 16 | # repo (https://github.com/tskit-dev/tskit-build-examples) for examples 17 | # of how to set up a production-ready build with tskit. 
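#
# For reference, the static pattern rule below expands each example to a
# command of roughly this form (illustrative only; the actual compiler and
# flags come from $(CC) and the CFLAGS defined below):
#
#   cc -I../ -I../subprojects/kastore -o streaming streaming.c \
#       ../tskit/*.c ../subprojects/kastore/kastore.c -lm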
18 | # 19 | 20 | CFLAGS=-I../ -I../subprojects/kastore 21 | TSKIT_SOURCE=../tskit/*.c ../subprojects/kastore/kastore.c 22 | 23 | targets = api_structure error_handling \ 24 | haploid_wright_fisher streaming \ 25 | tree_iteration tree_traversal \ 26 | take_ownership 27 | 28 | all: $(targets) 29 | 30 | $(targets): %: %.c 31 | ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lm 32 | 33 | clean: 34 | rm -f $(targets) 35 | 36 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/api_structure.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define check_tsk_error(val) \ 6 | if (val < 0) { \ 7 | fprintf(stderr, "line %d: %s", __LINE__, tsk_strerror(val)); \ 8 | exit(EXIT_FAILURE); \ 9 | } 10 | 11 | int 12 | main(int argc, char **argv) 13 | { 14 | int j, ret; 15 | tsk_edge_table_t edges; 16 | 17 | ret = tsk_edge_table_init(&edges, 0); 18 | check_tsk_error(ret); 19 | for (j = 0; j < 5; j++) { 20 | ret = tsk_edge_table_add_row(&edges, 0, 1, j + 1, j, NULL, 0); 21 | check_tsk_error(ret); 22 | } 23 | tsk_edge_table_print_state(&edges, stdout); 24 | tsk_edge_table_free(&edges); 25 | 26 | return EXIT_SUCCESS; 27 | } 28 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/cpp_sorting_example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | static void 11 | handle_tskit_return_code(int code) 12 | { 13 | if (code != 0) { 14 | std::ostringstream o; 15 | o << tsk_strerror(code); 16 | throw std::runtime_error(o.str()); 17 | } 18 | } 19 | 20 | struct edge_plus_time { 21 | double time; 22 | tsk_id_t parent, child; 23 | double left, right; 24 | }; 25 | 26 | int 27 | sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) 28 | { 29 | if (sorter->tables->edges.metadata_length != 0) { 30 | throw std::invalid_argument( 31 | "the sorter does not currently handle edge metadata"); 32 | } 33 | if (start != 0) { 34 | throw std::invalid_argument("the sorter requires start==0"); 35 | } 36 | 37 | std::vector temp; 38 | temp.reserve(static_cast(sorter->tables->edges.num_rows)); 39 | 40 | auto edges = &sorter->tables->edges; 41 | auto nodes = &sorter->tables->nodes; 42 | 43 | for (tsk_size_t i = 0; i < sorter->tables->edges.num_rows; ++i) { 44 | temp.push_back(edge_plus_time{ nodes->time[edges->parent[i]], edges->parent[i], 45 | edges->child[i], edges->left[i], edges->right[i] }); 46 | } 47 | 48 | std::sort(begin(temp), end(temp), 49 | [](const edge_plus_time &lhs, const edge_plus_time &rhs) { 50 | if (lhs.time == rhs.time) { 51 | if (lhs.parent == rhs.parent) { 52 | if (lhs.child == rhs.child) { 53 | return lhs.left < rhs.left; 54 | } 55 | return lhs.child < rhs.child; 56 | } 57 | return lhs.parent < rhs.parent; 58 | } 59 | return lhs.time < rhs.time; 60 | }); 61 | 62 | for (std::size_t i = 0; i < temp.size(); ++i) { 63 | edges->left[i] = temp[i].left; 64 | edges->right[i] = temp[i].right; 65 | edges->parent[i] = temp[i].parent; 66 | edges->child[i] = temp[i].child; 67 | } 68 | 69 | return 0; 70 | } 71 | 72 | int 73 | main(int argc, char **argv) 74 | { 75 | if (argc != 3) { 76 | std::cerr << "Usage: " << argv[0] << " input.trees output.trees\n"; 77 | std::exit(0); 78 | } 79 | const char *infile = argv[1]; 80 | const char *outfile = argv[2]; 81 | 82 | tsk_table_collection_t tables; 
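/* Load the table collection from the input .trees file; handle_tskit_return_code
 * (defined above) turns any non-zero tskit error code into a C++ exception. */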
83 | auto ret = tsk_table_collection_load(&tables, infile, 0); 84 | handle_tskit_return_code(ret); 85 | 86 | tsk_table_sorter_t sorter; 87 | ret = tsk_table_sorter_init(&sorter, &tables, 0); 88 | handle_tskit_return_code(ret); 89 | sorter.sort_edges = sort_edges; 90 | try { 91 | ret = tsk_table_sorter_run(&sorter, NULL); 92 | } catch (std::exception &e) { 93 | std::cerr << e.what() << '\n'; 94 | std::exit(1); 95 | } 96 | handle_tskit_return_code(ret); 97 | ret = tsk_table_collection_dump(&tables, outfile, 0); 98 | handle_tskit_return_code(ret); 99 | ret = tsk_table_collection_free(&tables); 100 | handle_tskit_return_code(ret); 101 | } 102 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/error_handling.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int 6 | main(int argc, char **argv) 7 | { 8 | int ret; 9 | tsk_treeseq_t ts; 10 | 11 | if (argc != 2) { 12 | fprintf(stderr, "usage: "); 13 | exit(EXIT_FAILURE); 14 | } 15 | ret = tsk_treeseq_load(&ts, argv[1], 0); 16 | if (ret < 0) { 17 | /* Error condition. Free and exit */ 18 | tsk_treeseq_free(&ts); 19 | fprintf(stderr, "%s", tsk_strerror(ret)); 20 | exit(EXIT_FAILURE); 21 | } 22 | printf("Loaded tree sequence with %lld nodes and %lld edges from %s\n", 23 | (long long) tsk_treeseq_get_num_nodes(&ts), 24 | (long long) tsk_treeseq_get_num_edges(&ts), argv[1]); 25 | tsk_treeseq_free(&ts); 26 | 27 | return EXIT_SUCCESS; 28 | } 29 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/haploid_wright_fisher.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #define check_tsk_error(val) \ 9 | if (val < 0) { \ 10 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 11 | } 12 | 13 | void 14 | simulate(tsk_table_collection_t *tables, int N, int T, int simplify_interval) 15 | { 16 | tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; 17 | double breakpoint; 18 | int ret, j, t, b; 19 | 20 | assert(simplify_interval != 0); // leads to division by zero 21 | buffer = malloc(2 * N * sizeof(tsk_id_t)); 22 | if (buffer == NULL) { 23 | errx(EXIT_FAILURE, "Out of memory"); 24 | } 25 | tables->sequence_length = 1.0; 26 | parents = buffer; 27 | for (j = 0; j < N; j++) { 28 | parents[j] 29 | = tsk_node_table_add_row(&tables->nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); 30 | check_tsk_error(parents[j]); 31 | } 32 | b = 0; 33 | for (t = T - 1; t >= 0; t--) { 34 | /* Alternate between using the first and last N values in the buffer */ 35 | parents = buffer + (b * N); 36 | b = (b + 1) % 2; 37 | children = buffer + (b * N); 38 | for (j = 0; j < N; j++) { 39 | child = tsk_node_table_add_row( 40 | &tables->nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); 41 | check_tsk_error(child); 42 | /* NOTE: the use of rand() is discouraged for 43 | * research code and proper random number generator 44 | * libraries should be preferred. 45 | */ 46 | left_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; 47 | right_parent = parents[(size_t)((rand() / (1. + RAND_MAX)) * N)]; 48 | do { 49 | breakpoint = rand() / (1. 
+ RAND_MAX); 50 | } while (breakpoint == 0); /* tiny proba of breakpoint being 0 */ 51 | ret = tsk_edge_table_add_row( 52 | &tables->edges, 0, breakpoint, left_parent, child, NULL, 0); 53 | check_tsk_error(ret); 54 | ret = tsk_edge_table_add_row( 55 | &tables->edges, breakpoint, 1, right_parent, child, NULL, 0); 56 | check_tsk_error(ret); 57 | children[j] = child; 58 | } 59 | if (t % simplify_interval == 0) { 60 | printf("Simplify at generation %lld: (%lld nodes %lld edges)", (long long) t, 61 | (long long) tables->nodes.num_rows, (long long) tables->edges.num_rows); 62 | /* Note: Edges must be sorted for simplify to work, and we use a brute force 63 | * approach of sorting each time here for simplicity. This is inefficient. */ 64 | ret = tsk_table_collection_sort(tables, NULL, 0); 65 | check_tsk_error(ret); 66 | ret = tsk_table_collection_simplify(tables, children, N, 0, NULL); 67 | check_tsk_error(ret); 68 | printf(" -> (%lld nodes %lld edges)\n", (long long) tables->nodes.num_rows, 69 | (long long) tables->edges.num_rows); 70 | for (j = 0; j < N; j++) { 71 | children[j] = j; 72 | } 73 | } 74 | } 75 | free(buffer); 76 | } 77 | 78 | int 79 | main(int argc, char **argv) 80 | { 81 | int ret; 82 | tsk_table_collection_t tables; 83 | 84 | if (argc != 6) { 85 | errx(EXIT_FAILURE, "usage: N T simplify-interval output-file seed"); 86 | } 87 | ret = tsk_table_collection_init(&tables, 0); 88 | check_tsk_error(ret); 89 | srand((unsigned) atoi(argv[5])); 90 | simulate(&tables, atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); 91 | ret = tsk_table_collection_dump(&tables, argv[4], 0); 92 | check_tsk_error(ret); 93 | 94 | tsk_table_collection_free(&tables); 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/streaming.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define check_tsk_error(val) \ 6 | if (val < 0) { \ 7 | fprintf(stderr, "Error: line %d: %s\n", __LINE__, tsk_strerror(val)); \ 8 | exit(EXIT_FAILURE); \ 9 | } 10 | 11 | int 12 | main(int argc, char **argv) 13 | { 14 | int ret; 15 | int j = 0; 16 | tsk_table_collection_t tables; 17 | 18 | ret = tsk_table_collection_init(&tables, 0); 19 | check_tsk_error(ret); 20 | 21 | while (true) { 22 | ret = tsk_table_collection_loadf(&tables, stdin, TSK_NO_INIT); 23 | if (ret == TSK_ERR_EOF) { 24 | break; 25 | } 26 | check_tsk_error(ret); 27 | fprintf(stderr, "Tree sequence %d had %lld mutations\n", j, 28 | (long long) tables.mutations.num_rows); 29 | ret = tsk_mutation_table_truncate(&tables.mutations, 0); 30 | check_tsk_error(ret); 31 | ret = tsk_table_collection_dumpf(&tables, stdout, 0); 32 | check_tsk_error(ret); 33 | j++; 34 | } 35 | tsk_table_collection_free(&tables); 36 | return EXIT_SUCCESS; 37 | } 38 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/take_ownership.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define check_tsk_error(val) \ 7 | if (val < 0) { \ 8 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 9 | } 10 | 11 | int 12 | main(int argc, char **argv) 13 | { 14 | tsk_table_collection_t *tables; 15 | tsk_treeseq_t treeseq; 16 | int rv; 17 | 18 | tables = malloc(sizeof(*tables)); 19 | rv = tsk_table_collection_init(tables, 0); 20 | check_tsk_error(rv); 21 | 22 | /* NOTE: you must set sequence 
length AFTER initialization */ 23 | tables->sequence_length = 1.0; 24 | 25 | /* Do your regular table operations */ 26 | rv = tsk_node_table_add_row(&tables->nodes, 0, 0.0, -1, -1, NULL, 0); 27 | check_tsk_error(rv); 28 | 29 | /* Initalize the tree sequence, transferring all responsibility 30 | * for the table collection's memory managment 31 | */ 32 | rv = tsk_treeseq_init( 33 | &treeseq, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); 34 | check_tsk_error(rv); 35 | 36 | /* WARNING: calling tsk_table_collection_free is now a memory error! */ 37 | tsk_treeseq_free(&treeseq); 38 | } 39 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/tree_iteration.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #define check_tsk_error(val) \ 8 | if (val < 0) { \ 9 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 10 | } 11 | 12 | int 13 | main(int argc, char **argv) 14 | { 15 | int ret; 16 | tsk_treeseq_t ts; 17 | tsk_tree_t tree; 18 | 19 | if (argc != 2) { 20 | errx(EXIT_FAILURE, "usage: "); 21 | } 22 | ret = tsk_treeseq_load(&ts, argv[1], 0); 23 | check_tsk_error(ret); 24 | ret = tsk_tree_init(&tree, &ts, 0); 25 | check_tsk_error(ret); 26 | 27 | printf("Iterate forwards\n"); 28 | for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { 29 | printf("\ttree %lld has %lld roots\n", (long long) tree.index, 30 | (long long) tsk_tree_get_num_roots(&tree)); 31 | } 32 | check_tsk_error(ret); 33 | 34 | printf("Iterate backwards\n"); 35 | for (ret = tsk_tree_last(&tree); ret == TSK_TREE_OK; ret = tsk_tree_prev(&tree)) { 36 | printf("\ttree %lld has %lld roots\n", (long long) tree.index, 37 | (long long) tsk_tree_get_num_roots(&tree)); 38 | } 39 | check_tsk_error(ret); 40 | 41 | tsk_tree_free(&tree); 42 | tsk_treeseq_free(&ts); 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/examples/tree_traversal.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #define check_tsk_error(val) \ 8 | if (val < 0) { \ 9 | errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ 10 | } 11 | 12 | static void 13 | traverse_standard(const tsk_tree_t *tree) 14 | { 15 | int ret; 16 | tsk_size_t num_nodes, j; 17 | tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); 18 | 19 | if (nodes == NULL) { 20 | errx(EXIT_FAILURE, "Out of memory"); 21 | } 22 | ret = tsk_tree_preorder(tree, nodes, &num_nodes); 23 | check_tsk_error(ret); 24 | for (j = 0; j < num_nodes; j++) { 25 | printf("Visit preorder %lld\n", (long long) nodes[j]); 26 | } 27 | 28 | ret = tsk_tree_postorder(tree, nodes, &num_nodes); 29 | check_tsk_error(ret); 30 | for (j = 0; j < num_nodes; j++) { 31 | printf("Visit postorder %lld\n", (long long) nodes[j]); 32 | } 33 | 34 | free(nodes); 35 | } 36 | 37 | static void 38 | _traverse(const tsk_tree_t *tree, tsk_id_t u, int depth) 39 | { 40 | tsk_id_t v; 41 | int j; 42 | 43 | for (j = 0; j < depth; j++) { 44 | printf(" "); 45 | } 46 | printf("Visit recursive %lld\n", (long long) u); 47 | for (v = tree->left_child[u]; v != TSK_NULL; v = tree->right_sib[v]) { 48 | _traverse(tree, v, depth + 1); 49 | } 50 | } 51 | 52 | static void 53 | traverse_recursive(const tsk_tree_t *tree) 54 | { 55 | _traverse(tree, 
tree->virtual_root, -1); 56 | } 57 | 58 | static void 59 | traverse_stack(const tsk_tree_t *tree) 60 | { 61 | int stack_top; 62 | tsk_id_t u, v; 63 | tsk_id_t *stack = malloc(tsk_tree_get_size_bound(tree) * sizeof(*stack)); 64 | 65 | if (stack == NULL) { 66 | errx(EXIT_FAILURE, "Out of memory"); 67 | } 68 | stack_top = 0; 69 | stack[stack_top] = tree->virtual_root; 70 | while (stack_top >= 0) { 71 | u = stack[stack_top]; 72 | stack_top--; 73 | printf("Visit stack %lld\n", (long long) u); 74 | /* Put nodes on the stack right-to-left, so we visit in left-to-right */ 75 | for (v = tree->right_child[u]; v != TSK_NULL; v = tree->left_sib[v]) { 76 | stack_top++; 77 | stack[stack_top] = v; 78 | } 79 | } 80 | free(stack); 81 | } 82 | 83 | static void 84 | traverse_upwards(const tsk_tree_t *tree) 85 | { 86 | const tsk_id_t *samples = tsk_treeseq_get_samples(tree->tree_sequence); 87 | tsk_size_t num_samples = tsk_treeseq_get_num_samples(tree->tree_sequence); 88 | tsk_size_t j; 89 | tsk_id_t u; 90 | 91 | for (j = 0; j < num_samples; j++) { 92 | u = samples[j]; 93 | while (u != TSK_NULL) { 94 | printf("Visit upwards: %lld\n", (long long) u); 95 | u = tree->parent[u]; 96 | } 97 | } 98 | } 99 | 100 | int 101 | main(int argc, char **argv) 102 | { 103 | int ret; 104 | tsk_treeseq_t ts; 105 | tsk_tree_t tree; 106 | 107 | if (argc != 2) { 108 | errx(EXIT_FAILURE, "usage: "); 109 | } 110 | ret = tsk_treeseq_load(&ts, argv[1], 0); 111 | check_tsk_error(ret); 112 | ret = tsk_tree_init(&tree, &ts, 0); 113 | check_tsk_error(ret); 114 | ret = tsk_tree_first(&tree); 115 | check_tsk_error(ret); 116 | 117 | traverse_standard(&tree); 118 | 119 | traverse_recursive(&tree); 120 | 121 | traverse_stack(&tree); 122 | 123 | traverse_upwards(&tree); 124 | 125 | tsk_tree_free(&tree); 126 | tsk_treeseq_free(&ts); 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/meson.build: -------------------------------------------------------------------------------- 1 | project('tskit', ['c', 'cpp'], 2 | version: files('VERSION.txt'), 3 | default_options: ['c_std=c99', 'cpp_std=c++11'] 4 | ) 5 | 6 | kastore_proj = subproject('kastore') 7 | kastore_dep = kastore_proj.get_variable('kastore_dep') 8 | kastore_inc = kastore_proj.get_variable('kastore_inc') 9 | 10 | cc = meson.get_compiler('c') 11 | m_dep = cc.find_library('m', required: false) 12 | lib_deps = [m_dep, kastore_dep] 13 | 14 | extra_c_args = [ 15 | '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W', 16 | '-Wmissing-prototypes', '-Wstrict-prototypes', 17 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 18 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 19 | '-fshort-enums', '-fno-common'] 20 | 21 | lib_sources = [ 22 | 'tskit/core.c', 'tskit/tables.c', 'tskit/trees.c', 23 | 'tskit/genotypes.c', 'tskit/stats.c', 'tskit/convert.c', 'tskit/haplotype_matching.c'] 24 | lib_headers = [ 25 | 'tskit/core.h', 'tskit/tables.h', 'tskit/trees.h', 26 | 'tskit/genotypes.h', 'tskit/stats.h', 'tskit/convert.h', 'tskit/haplotype_matching.h'] 27 | 28 | # Subprojects use the static library for simplicity. 29 | tskit_inc = [kastore_inc, include_directories(['.'])] 30 | tskit_lib = static_library('tskit', 31 | sources: lib_sources, dependencies: lib_deps) 32 | tskit_dep = declare_dependency(include_directories:tskit_inc, link_with: tskit_lib) 33 | 34 | if not meson.is_subproject() 35 | 36 | # Shared library install target. 
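# (This install target, and the test executables configured below, are only
# set up when tskit is built as the top-level project; when used as a
# subproject, only the static library declared above is consumed.)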
37 | shared_library('tskit', 38 | sources: lib_sources, dependencies: lib_deps, c_args: extra_c_args, install: true) 39 | install_headers('tskit.h') 40 | install_headers(lib_headers, subdir: 'tskit') 41 | 42 | cunit_dep = dependency('cunit') 43 | # We don't specify extra C args here as CUnit won't pass the checks. 44 | test_lib = static_library('testlib', 45 | sources: ['tests/testlib.c'], dependencies: [cunit_dep, kastore_dep, tskit_dep]) 46 | 47 | test_core = executable('test_core', 48 | sources: ['tests/test_core.c'], 49 | link_with: [tskit_lib, test_lib], 50 | c_args: extra_c_args+['-DMESON_PROJECT_VERSION="@0@"'.format(meson.project_version())], 51 | dependencies: kastore_dep, 52 | ) 53 | test('core', test_core) 54 | 55 | test_tables = executable('test_tables', 56 | sources: ['tests/test_tables.c'], 57 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 58 | test('tables', test_tables) 59 | 60 | test_trees = executable('test_trees', 61 | sources: ['tests/test_trees.c'], 62 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 63 | test('trees', test_trees) 64 | 65 | test_genotypes = executable('test_genotypes', 66 | sources: ['tests/test_genotypes.c'], 67 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 68 | test('genotypes', test_genotypes) 69 | 70 | test_convert = executable('test_convert', 71 | sources: ['tests/test_convert.c'], 72 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 73 | test('convert', test_convert) 74 | 75 | test_stats = executable('test_stats', 76 | sources: ['tests/test_stats.c'], 77 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 78 | test('stats', test_stats) 79 | 80 | test_haplotype_matching = executable('test_haplotype_matching', 81 | sources: ['tests/test_haplotype_matching.c'], 82 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 83 | test('haplotype_matching', test_haplotype_matching) 84 | 85 | test_file_format = executable('test_file_format', 86 | sources: ['tests/test_file_format.c'], 87 | link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep) 88 | test('file_format', test_file_format) 89 | 90 | test_minimal_cpp = executable('test_minimal_cpp', 91 | sources: ['tests/test_minimal_cpp.cpp'], link_with: [tskit_lib], 92 | dependencies: kastore_dep) 93 | test('minimal_cpp', test_minimal_cpp) 94 | 95 | if get_option('build_examples') 96 | # These example programs use less portable features, 97 | # and we don't want to always compile them. 
Use, e.g., 98 | # meson build -Dbuild_examples=false 99 | executable('api_structure', 100 | sources: ['examples/api_structure.c'], 101 | link_with: [tskit_lib], dependencies: lib_deps) 102 | executable('error_handling', 103 | sources: ['examples/error_handling.c'], 104 | link_with: [tskit_lib], dependencies: lib_deps) 105 | executable('tree_iteration', 106 | sources: ['examples/tree_iteration.c'], 107 | link_with: [tskit_lib], dependencies: lib_deps) 108 | executable('tree_traversal', 109 | sources: ['examples/tree_traversal.c'], 110 | link_with: [tskit_lib], dependencies: lib_deps) 111 | executable('streaming', 112 | sources: ['examples/streaming.c'], 113 | link_with: [tskit_lib], dependencies: lib_deps) 114 | executable('cpp_sorting_example', 115 | sources: ['examples/cpp_sorting_example.cpp'], 116 | link_with: [tskit_lib], dependencies: lib_deps) 117 | executable('haploid_wright_fisher', 118 | sources: ['examples/haploid_wright_fisher.c'], 119 | link_with: [tskit_lib], dependencies: lib_deps) 120 | endif 121 | endif 122 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/meson_options.txt: -------------------------------------------------------------------------------- 1 | option('build_examples', type : 'boolean', value : true) 2 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/subprojects/kastore/README.md: -------------------------------------------------------------------------------- 1 | This directory is an abbreviated version of the kastore distribution source. 2 | 3 | All files should be updated when we are updating to a new kastore version. 4 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/subprojects/kastore/VERSION.txt: -------------------------------------------------------------------------------- 1 | 2.1.1 2 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/subprojects/kastore/meson.build: -------------------------------------------------------------------------------- 1 | project('kastore', ['c', 'cpp'], 2 | version: files('VERSION.txt'), 3 | default_options: [ 4 | 'c_std=c99', 5 | 'cpp_std=c++11', 6 | 'warning_level=3', 7 | 'werror=true']) 8 | 9 | if not meson.is_subproject() 10 | add_global_arguments([ 11 | '-W', '-Wmissing-prototypes', '-Wstrict-prototypes', 12 | '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', 13 | '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', 14 | '-fshort-enums', '-fno-common'], language : 'c') 15 | endif 16 | 17 | # Subprojects should compile in the static library for simplicity. 18 | kastore_inc = include_directories('.') 19 | kastore = static_library('kastore', 'kastore.c') 20 | kastore_dep = declare_dependency(link_with : kastore, include_directories: kastore_inc) 21 | 22 | if not meson.is_subproject() 23 | 24 | # The shared library can be installed into the system. 25 | install_headers('kastore.h') 26 | shared_library('kastore', 'kastore.c', install: true) 27 | executable('example', ['example.c'], link_with: kastore) 28 | 29 | # Note: we don't declare these as meson tests because they depend on 30 | # being run from the current working directory because of the paths 31 | # to example files. 
32 | cunit_dep = dependency('cunit') 33 | executable('tests', ['tests.c', 'kastore.c'], dependencies: cunit_dep, 34 | c_args: ['-DMESON_VERSION="@0@"'.format(meson.project_version())]) 35 | 36 | executable('cpp_tests', ['cpp_tests.cpp'], link_with: kastore) 37 | 38 | executable('malloc_tests', ['malloc_tests.c', 'kastore.c'], 39 | dependencies: cunit_dep, 40 | link_args:['-Wl,--wrap=malloc', '-Wl,--wrap=realloc', '-Wl,--wrap=calloc']) 41 | 42 | executable('io_tests', ['io_tests.c', 'kastore.c'], 43 | dependencies: cunit_dep, 44 | link_args:[ 45 | '-Wl,--wrap=fwrite', 46 | '-Wl,--wrap=fread', 47 | '-Wl,--wrap=fclose', 48 | '-Wl,--wrap=ftell', 49 | '-Wl,--wrap=fseek']) 50 | endif 51 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tests/test_convert.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2022 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 23 | */ 24 | 25 | #include "testlib.h" 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | static void 32 | test_single_tree_newick(void) 33 | { 34 | int ret; 35 | tsk_treeseq_t ts; 36 | tsk_tree_t t; 37 | size_t buffer_size = 1024; 38 | char newick[buffer_size]; 39 | 40 | tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, 41 | NULL, NULL, NULL, 0); 42 | 43 | ret = tsk_tree_init(&t, &ts, 0); 44 | CU_ASSERT_EQUAL_FATAL(ret, 0) 45 | ret = tsk_tree_first(&t); 46 | CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK) 47 | 48 | ret = tsk_convert_newick(&t, 0, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick); 49 | CU_ASSERT_EQUAL_FATAL(ret, 0); 50 | /* Seems odd, but this is what a single node newick tree looks like. 
51 | * Newick parsers seems to accept it in any case */ 52 | CU_ASSERT_STRING_EQUAL(newick, "1;"); 53 | 54 | ret = tsk_convert_newick(&t, 0, 0, 0, buffer_size, newick); 55 | CU_ASSERT_EQUAL_FATAL(ret, 0); 56 | CU_ASSERT_STRING_EQUAL(newick, "n0;"); 57 | 58 | ret = tsk_convert_newick(&t, 4, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick); 59 | CU_ASSERT_EQUAL_FATAL(ret, 0); 60 | CU_ASSERT_STRING_EQUAL(newick, "(1:1,2:1);"); 61 | ret = tsk_convert_newick(&t, 4, 0, 0, buffer_size, newick); 62 | CU_ASSERT_EQUAL_FATAL(ret, 0); 63 | CU_ASSERT_STRING_EQUAL(newick, "(n0:1,n1:1);"); 64 | 65 | ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick); 66 | CU_ASSERT_EQUAL_FATAL(ret, 0); 67 | CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);"); 68 | 69 | ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick); 70 | CU_ASSERT_EQUAL_FATAL(ret, 0); 71 | CU_ASSERT_STRING_EQUAL(newick, "((n0:1,n1:1):2,(n2:2,n3:2):1);"); 72 | 73 | tsk_tree_free(&t); 74 | tsk_treeseq_free(&ts); 75 | } 76 | 77 | static void 78 | test_single_tree_newick_errors(void) 79 | { 80 | int ret; 81 | tsk_treeseq_t ts; 82 | tsk_tree_t t; 83 | size_t j, len; 84 | size_t buffer_size = 1024; 85 | char newick[buffer_size]; 86 | 87 | tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, 88 | NULL, NULL, NULL, 0); 89 | 90 | ret = tsk_tree_init(&t, &ts, 0); 91 | CU_ASSERT_EQUAL_FATAL(ret, 0) 92 | ret = tsk_tree_first(&t); 93 | CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK) 94 | 95 | ret = tsk_convert_newick(&t, -1, 1, 0, buffer_size, newick); 96 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); 97 | ret = tsk_convert_newick(&t, 7, 1, 0, buffer_size, newick); 98 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); 99 | 100 | ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, NULL); 101 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); 102 | ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick); 103 | CU_ASSERT_EQUAL_FATAL(ret, 0); 104 | len = 1 + strlen(newick); 105 | for (j = 0; j < len; j++) { 106 | ret = tsk_convert_newick(&t, 6, 0, 0, j, newick); 107 | CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BUFFER_OVERFLOW); 108 | } 109 | ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, len, newick); 110 | 111 | CU_ASSERT_EQUAL_FATAL(ret, 0); 112 | CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);"); 113 | 114 | tsk_tree_free(&t); 115 | tsk_treeseq_free(&ts); 116 | } 117 | 118 | int 119 | main(int argc, char **argv) 120 | { 121 | CU_TestInfo tests[] = { 122 | { "test_single_tree_newick", test_single_tree_newick }, 123 | { "test_single_tree_newick_errors", test_single_tree_newick_errors }, 124 | { NULL, NULL }, 125 | }; 126 | return test_main(tests, argc, argv); 127 | } 128 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tests/test_minimal_cpp.cpp: -------------------------------------------------------------------------------- 1 | /* * MIT License 2 | * 3 | * Copyright (c) 2019-2022 Tskit Developers 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above 
copyright notice and this permission notice shall be included in all 13 | * copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | * SOFTWARE. 22 | */ 23 | 24 | /* Minimal tests to make sure that tskit at least compiles and links 25 | * in a simple C++ program */ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | using namespace std; 37 | 38 | void 39 | test_kas_strerror() 40 | { 41 | std::cout << "test_kas_strerror" << endl; 42 | std::ostringstream o; 43 | o << kas_strerror(KAS_ERR_NO_MEMORY); 44 | assert(std::string("Out of memory").compare(o.str()) == 0); 45 | } 46 | 47 | void 48 | test_strerror() 49 | { 50 | std::cout << "test_strerror" << endl; 51 | std::ostringstream o; 52 | o << tsk_strerror(TSK_ERR_NO_MEMORY); 53 | assert(std::string("Out of memory. (TSK_ERR_NO_MEMORY)").compare(o.str()) == 0); 54 | } 55 | 56 | void 57 | test_load_error() 58 | { 59 | std::cout << "test_open_error" << endl; 60 | tsk_treeseq_t ts; 61 | int ret = tsk_treeseq_load(&ts, "no such file", 0); 62 | assert(ret == TSK_ERR_IO); 63 | tsk_treeseq_free(&ts); 64 | } 65 | 66 | void 67 | test_table_basics() 68 | { 69 | std::cout << "test_table_basics" << endl; 70 | tsk_table_collection_t tables; 71 | int ret = tsk_table_collection_init(&tables, 0); 72 | assert(ret == 0); 73 | 74 | ret = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0); 75 | assert(ret == 0); 76 | ret = tsk_node_table_add_row(&tables.nodes, 0, 2.0, TSK_NULL, TSK_NULL, NULL, 0); 77 | assert(ret == 1); 78 | assert(tables.nodes.num_rows == 2); 79 | 80 | tsk_table_collection_free(&tables); 81 | } 82 | 83 | /* A definition of sort_edges that uses C++ std::sort and inlining of the 84 | * comparison function to achieve significantly better performance than 85 | * the builtin method in tskit. 86 | */ 87 | int 88 | cpp_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) 89 | { 90 | struct _edge { 91 | double left, right; 92 | tsk_id_t parent, child; 93 | 94 | _edge(double l, double r, tsk_id_t p, tsk_id_t c) 95 | : left{ l }, right{ r }, parent{ p }, child{ c } 96 | { 97 | } 98 | }; 99 | tsk_edge_table_t *edges = &sorter->tables->edges; 100 | const double *node_time = sorter->tables->nodes.time; 101 | std::vector<_edge> sorted_edges; 102 | size_t num_edges = edges->num_rows; 103 | size_t j; 104 | 105 | /* This is the comparison function. We cannot define an 106 | * operator < for _edge because we need to bind the node times 107 | * so we have to use a functional method. This is a copy of the cmp 108 | * from fwdpp. Only difference is the final time comparison 109 | * (fwdpp table times go forwards). 
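* The resulting sort key is (parent time, parent id, child id, left
* coordinate), ascending, which is the edge ordering that
* tsk_table_collection_check_integrity verifies under
* TSK_CHECK_EDGE_ORDERING in test_edge_sorting below.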
*/ 110 | const auto cmp = [&node_time](const _edge &lhs, const _edge &rhs) { 111 | auto tl = node_time[lhs.parent]; 112 | auto tr = node_time[rhs.parent]; 113 | if (tl == tr) { 114 | if (lhs.parent == rhs.parent) { 115 | if (lhs.child == rhs.child) { 116 | return lhs.left < rhs.left; 117 | } 118 | return lhs.child < rhs.child; 119 | } 120 | return lhs.parent < rhs.parent; 121 | } 122 | return tl < tr; 123 | }; 124 | 125 | assert(start == 0); 126 | /* Let's not bother with metadata */ 127 | assert(edges->metadata_length == 0); 128 | 129 | sorted_edges.reserve(num_edges); 130 | for (j = 0; j < num_edges; j++) { 131 | sorted_edges.emplace_back( 132 | edges->left[j], edges->right[j], edges->parent[j], edges->child[j]); 133 | } 134 | 135 | std::sort(begin(sorted_edges), end(sorted_edges), cmp); 136 | 137 | for (j = 0; j < num_edges; j++) { 138 | edges->left[j] = sorted_edges[j].left; 139 | edges->right[j] = sorted_edges[j].right; 140 | edges->parent[j] = sorted_edges[j].parent; 141 | edges->child[j] = sorted_edges[j].child; 142 | } 143 | return 0; 144 | } 145 | 146 | void 147 | test_edge_sorting() 148 | { 149 | std::cout << "test_edge_sorting" << endl; 150 | tsk_table_collection_t tables; 151 | tsk_id_t n = 10; 152 | tsk_id_t j; 153 | int ret = tsk_table_collection_init(&tables, 0); 154 | assert(ret == 0); 155 | 156 | tables.sequence_length = 1.0; 157 | /* Make a stick tree */ 158 | /* Add nodes and edges */ 159 | for (j = 0; j < n; j++) { 160 | ret = tsk_node_table_add_row( 161 | &tables.nodes, TSK_NODE_IS_SAMPLE, j + 1, TSK_NULL, TSK_NULL, NULL, 0); 162 | assert(ret == j); 163 | } 164 | for (j = n - 1; j > 0; j--) { 165 | tsk_edge_table_add_row(&tables.edges, 0, 1, j, j - 1, NULL, 0); 166 | } 167 | assert(tables.nodes.num_rows == (tsk_size_t) n); 168 | assert(tables.edges.num_rows == (tsk_size_t) n - 1); 169 | 170 | /* Make sure the edges are unsorted */ 171 | /* Not calling TSK_CHECK_TREES so casting is safe */ 172 | ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING); 173 | assert(ret == TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME); 174 | 175 | /* Sort the tables */ 176 | tsk_table_sorter_t sorter; 177 | ret = tsk_table_sorter_init(&sorter, &tables, 0); 178 | assert(ret == 0); 179 | /* Set the sort_edges to our local C++ version. We could also set some 180 | * persistent state in sorter.params if we wanted to. */ 181 | sorter.sort_edges = cpp_sort_edges; 182 | ret = tsk_table_sorter_run(&sorter, NULL); 183 | assert(ret == 0); 184 | tsk_table_sorter_free(&sorter); 185 | 186 | /* Make sure the edges are now sorted */ 187 | ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING); 188 | assert(ret == 0); 189 | 190 | tsk_table_collection_free(&tables); 191 | } 192 | 193 | int 194 | sort_edges_raises_exception(tsk_table_sorter_t *sorter, tsk_size_t start) 195 | { 196 | throw std::exception(); 197 | return 0; 198 | } 199 | 200 | int 201 | sort_edges_raises_non_exception(tsk_table_sorter_t *sorter, tsk_size_t start) 202 | { 203 | throw 42; 204 | return 0; 205 | } 206 | 207 | int 208 | safe_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) 209 | { 210 | int ret = 0; 211 | if (sorter->user_data == NULL) { 212 | try { 213 | ret = sort_edges_raises_exception(sorter, start); 214 | } catch (...) { 215 | ret = -12345; 216 | } 217 | } else { 218 | try { 219 | ret = sort_edges_raises_non_exception(sorter, start); 220 | } catch (...) 
{ 221 | ret = -12346; 222 | } 223 | } 224 | return ret; 225 | } 226 | 227 | void 228 | test_edge_sorting_errors() 229 | { 230 | /* Some inexplicable error happened here on 32 bit Windows where the 231 | * exceptions were not being caught as expected. This seems much 232 | * more likely to be a platform quirk that a real bug in our code, 233 | * so just disabling the test there. 234 | * 235 | * https://github.com/tskit-dev/tskit/issues/1790 236 | * https://github.com/tskit-dev/tskit/pull/1791 237 | */ 238 | #if !defined(_WIN32) 239 | std::cout << "test_edge_sorting_errors" << endl; 240 | tsk_table_collection_t tables; 241 | tsk_table_sorter_t sorter; 242 | tsk_id_t ret = tsk_table_collection_init(&tables, 0); 243 | 244 | assert(ret == 0); 245 | tables.sequence_length = 1.0; 246 | 247 | ret = tsk_table_sorter_init(&sorter, &tables, 0); 248 | assert(ret == 0); 249 | sorter.sort_edges = safe_sort_edges; 250 | ret = tsk_table_sorter_run(&sorter, NULL); 251 | assert(ret == -12345); 252 | 253 | /* Use the user_data as a way to communicate with the sorter 254 | * function. Here, we want to try out two different types 255 | * of exception that get thrown. */ 256 | sorter.user_data = &tables; 257 | ret = tsk_table_sorter_run(&sorter, NULL); 258 | assert(ret == -12346); 259 | 260 | tsk_table_sorter_free(&sorter); 261 | tsk_table_collection_free(&tables); 262 | #endif 263 | } 264 | 265 | int 266 | main() 267 | { 268 | test_kas_strerror(); 269 | test_strerror(); 270 | test_load_error(); 271 | test_table_basics(); 272 | test_edge_sorting(); 273 | test_edge_sorting_errors(); 274 | return 0; 275 | } 276 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tests/testlib.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2021 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | */ 24 | 25 | #ifndef __TESTLIB_H__ 26 | #define __TESTLIB_H__ 27 | 28 | #define _GNU_SOURCE 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | #include 35 | 36 | /* Global variables used in the test suite */ 37 | 38 | extern char *_tmp_file_name; 39 | extern FILE *_devnull; 40 | 41 | int test_main(CU_TestInfo *tests, int argc, char **argv); 42 | 43 | void tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nodes, 44 | const char *edges, const char *migrations, const char *sites, const char *mutations, 45 | const char *individuals, const char *provenance, tsk_flags_t tc_options); 46 | tsk_treeseq_t *caterpillar_tree( 47 | tsk_size_t num_samples, tsk_size_t num_sites, tsk_size_t num_mutations); 48 | 49 | void parse_nodes(const char *text, tsk_node_table_t *node_table); 50 | void parse_edges(const char *text, tsk_edge_table_t *edge_table); 51 | void parse_sites(const char *text, tsk_site_table_t *site_table); 52 | void parse_mutations(const char *text, tsk_mutation_table_t *mutation_table); 53 | void parse_individuals(const char *text, tsk_individual_table_t *individual_table); 54 | 55 | void unsort_edges(tsk_edge_table_t *edges, size_t start); 56 | 57 | extern const char *single_tree_ex_nodes; 58 | extern const char *single_tree_ex_edges; 59 | extern const char *single_tree_ex_sites; 60 | extern const char *single_tree_ex_mutations; 61 | 62 | extern const char *multiple_tree_ex_nodes; 63 | extern const char *multiple_tree_ex_edges; 64 | 65 | extern const char *odd_tree1_ex_nodes; 66 | extern const char *odd_tree1_ex_edges; 67 | 68 | extern const char *multi_root_tree_ex_nodes; 69 | extern const char *multi_root_tree_ex_edges; 70 | 71 | extern const char *multi_path_tree_ex_nodes; 72 | extern const char *multi_path_tree_ex_edges; 73 | 74 | extern const char *nonbinary_ex_nodes; 75 | extern const char *nonbinary_ex_edges; 76 | extern const char *nonbinary_ex_sites; 77 | extern const char *nonbinary_ex_mutations; 78 | 79 | extern const char *unary_ex_nodes; 80 | extern const char *unary_ex_edges; 81 | extern const char *unary_ex_sites; 82 | extern const char *unary_ex_mutations; 83 | 84 | extern const char *internal_sample_ex_nodes; 85 | extern const char *internal_sample_ex_edges; 86 | extern const char *internal_sample_ex_sites; 87 | extern const char *internal_sample_ex_mutations; 88 | 89 | extern const char *multiroot_ex_nodes; 90 | extern const char *multiroot_ex_edges; 91 | extern const char *multiroot_ex_sites; 92 | extern const char *multiroot_ex_mutations; 93 | 94 | extern const char *empty_ex_nodes; 95 | extern const char *empty_ex_edges; 96 | 97 | extern const char *paper_ex_nodes; 98 | extern const char *paper_ex_edges; 99 | extern const char *paper_ex_sites; 100 | extern const char *paper_ex_mutations; 101 | extern const char *paper_ex_individuals; 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software 
is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 23 | */ 24 | 25 | /** 26 | * @file tskit.h 27 | * @brief Tskit API. 28 | */ 29 | #ifndef __TSKIT_H__ 30 | #define __TSKIT_H__ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/convert.c: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2018-2021 Tskit Developers 5 | * Copyright (c) 2015-2017 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | 34 | /* ======================================================== * 35 | * Newick output. 36 | * ======================================================== */ 37 | 38 | /* This infrastructure is left-over from an earlier more complex version 39 | * of this algorithm that worked over a tree sequence and cached the newick 40 | * subtrees, updating according to diffs. It's unclear whether this complexity 41 | * was of any real-world use, since newick output for large trees is pretty 42 | * pointless. 
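 For reference, a minimal sketch of how the public entry point at the bottom of
 this file is typically called (hypothetical buffer size and root id; error
 handling omitted):

     char buf[8192];
     int ret = tsk_convert_newick(&tree, root, 6, 0, sizeof(buf), buf);

 A TSK_ERR_BUFFER_OVERFLOW return indicates that the caller should retry with a
 larger buffer.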
*/ 43 | 44 | typedef struct { 45 | unsigned int precision; 46 | tsk_flags_t options; 47 | char *newick; 48 | tsk_id_t *traversal_stack; 49 | const tsk_tree_t *tree; 50 | } tsk_newick_converter_t; 51 | 52 | static int 53 | tsk_newick_converter_run( 54 | tsk_newick_converter_t *self, tsk_id_t root, size_t buffer_size, char *buffer) 55 | { 56 | int ret = TSK_ERR_GENERIC; 57 | const tsk_tree_t *tree = self->tree; 58 | tsk_id_t *stack = self->traversal_stack; 59 | const double *time = self->tree->tree_sequence->tables->nodes.time; 60 | const tsk_flags_t *flags = self->tree->tree_sequence->tables->nodes.flags; 61 | int stack_top = 0; 62 | int label; 63 | size_t s = 0; 64 | int r; 65 | tsk_id_t u, v, w, root_parent; 66 | double branch_length; 67 | bool ms_labels = self->options & TSK_NEWICK_LEGACY_MS_LABELS; 68 | const char *label_format = ms_labels ? "%d" : "n%d"; 69 | 70 | if (root < 0 || root >= (tsk_id_t) self->tree->num_nodes) { 71 | ret = TSK_ERR_NODE_OUT_OF_BOUNDS; 72 | goto out; 73 | } 74 | if (buffer == NULL) { 75 | ret = TSK_ERR_BAD_PARAM_VALUE; 76 | goto out; 77 | } 78 | root_parent = tree->parent[root]; 79 | stack[0] = root; 80 | u = root_parent; 81 | while (stack_top >= 0) { 82 | v = stack[stack_top]; 83 | if (tree->left_child[v] != TSK_NULL && v != u) { 84 | if (s >= buffer_size) { 85 | ret = TSK_ERR_BUFFER_OVERFLOW; 86 | goto out; 87 | } 88 | buffer[s] = '('; 89 | s++; 90 | for (w = tree->right_child[v]; w != TSK_NULL; w = tree->left_sib[w]) { 91 | stack_top++; 92 | stack[stack_top] = w; 93 | } 94 | } else { 95 | u = tree->parent[v]; 96 | stack_top--; 97 | label = -1; 98 | if (ms_labels) { 99 | if (tree->left_child[v] == TSK_NULL) { 100 | label = (int) v + 1; 101 | } 102 | } else if (flags[v] & TSK_NODE_IS_SAMPLE) { 103 | label = (int) v; 104 | } 105 | if (label != -1) { 106 | if (s >= buffer_size) { 107 | ret = TSK_ERR_BUFFER_OVERFLOW; 108 | goto out; 109 | } 110 | r = snprintf(buffer + s, buffer_size - s, label_format, label); 111 | if (r < 0) { 112 | ret = TSK_ERR_IO; 113 | goto out; 114 | } 115 | s += (size_t) r; 116 | if (s >= buffer_size) { 117 | ret = TSK_ERR_BUFFER_OVERFLOW; 118 | goto out; 119 | } 120 | } 121 | if (u != root_parent) { 122 | branch_length = (time[u] - time[v]); 123 | r = snprintf(buffer + s, buffer_size - s, ":%.*f", (int) self->precision, 124 | branch_length); 125 | if (r < 0) { 126 | ret = TSK_ERR_IO; 127 | goto out; 128 | } 129 | s += (size_t) r; 130 | if (s >= buffer_size) { 131 | ret = TSK_ERR_BUFFER_OVERFLOW; 132 | goto out; 133 | } 134 | if (v == tree->right_child[u]) { 135 | buffer[s] = ')'; 136 | } else { 137 | buffer[s] = ','; 138 | } 139 | s++; 140 | } 141 | } 142 | } 143 | if ((s + 1) >= buffer_size) { 144 | ret = TSK_ERR_BUFFER_OVERFLOW; 145 | goto out; 146 | } 147 | buffer[s] = ';'; 148 | buffer[s + 1] = '\0'; 149 | ret = 0; 150 | out: 151 | return ret; 152 | } 153 | 154 | static int 155 | tsk_newick_converter_init(tsk_newick_converter_t *self, const tsk_tree_t *tree, 156 | unsigned int precision, tsk_flags_t options) 157 | { 158 | int ret = 0; 159 | 160 | tsk_memset(self, 0, sizeof(tsk_newick_converter_t)); 161 | self->precision = precision; 162 | self->options = options; 163 | self->tree = tree; 164 | self->traversal_stack 165 | = tsk_malloc(tsk_tree_get_size_bound(tree) * sizeof(*self->traversal_stack)); 166 | if (self->traversal_stack == NULL) { 167 | ret = TSK_ERR_NO_MEMORY; 168 | goto out; 169 | } 170 | out: 171 | return ret; 172 | } 173 | 174 | static int 175 | tsk_newick_converter_free(tsk_newick_converter_t *self) 176 | { 177 | 
tsk_safe_free(self->traversal_stack); 178 | return 0; 179 | } 180 | 181 | int 182 | tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, 183 | tsk_flags_t options, size_t buffer_size, char *buffer) 184 | { 185 | int ret = 0; 186 | tsk_newick_converter_t nc; 187 | 188 | ret = tsk_newick_converter_init(&nc, tree, precision, options); 189 | if (ret != 0) { 190 | goto out; 191 | } 192 | ret = tsk_newick_converter_run(&nc, root, buffer_size, buffer); 193 | out: 194 | tsk_newick_converter_free(&nc); 195 | return ret; 196 | } 197 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/convert.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2018-2021 Tskit Developers 5 | * Copyright (c) 2015-2017 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #ifndef TSK_CONVERT_H 27 | #define TSK_CONVERT_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #include 34 | 35 | #define TSK_NEWICK_LEGACY_MS_LABELS (1 << 0) 36 | 37 | int tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, 38 | tsk_flags_t options, size_t buffer_size, char *buffer); 39 | 40 | #ifdef __cplusplus 41 | } 42 | #endif 43 | #endif 44 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/genotypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2022 Tskit Developers 5 | * Copyright (c) 2016-2018 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 
16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #ifndef TSK_GENOTYPES_H 27 | #define TSK_GENOTYPES_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #include 34 | 35 | #define TSK_ISOLATED_NOT_MISSING (1 << 1) 36 | 37 | /** 38 | @brief A variant at a specific site. 39 | 40 | @rst 41 | Used to generate the genotypes for a given set of samples at a given 42 | site. 43 | @endrst 44 | */ 45 | typedef struct { 46 | /** @brief Unowned reference to the tree sequence of the variant */ 47 | const tsk_treeseq_t *tree_sequence; 48 | /** @brief The site this variant is currently decoded at*/ 49 | tsk_site_t site; 50 | tsk_tree_t tree; 51 | /** @brief Array of allele strings that the genotypes of the variant refer to 52 | * These are not NULL terminated - use `allele_lengths` for example:. 53 | * `printf("%.*s", (int) var->allele_lengths[j], var->alleles[j]);` 54 | */ 55 | const char **alleles; 56 | /** @brief Lengths of the allele strings */ 57 | tsk_size_t *allele_lengths; 58 | /** @brief Length of the allele array */ 59 | tsk_size_t num_alleles; 60 | tsk_size_t max_alleles; 61 | /** @brief If True the genotypes of isolated nodes have been decoded to the "missing" 62 | * genotype. If False they are set to the ancestral state (in the absence of 63 | * mutations above them)*/ 64 | bool has_missing_data; 65 | /** @brief Array of genotypes for the current site */ 66 | int32_t *genotypes; 67 | /** @brief Number of samples */ 68 | tsk_size_t num_samples; 69 | /** @brief Array of sample ids used*/ 70 | tsk_id_t *samples; 71 | 72 | const tsk_id_t *sample_index_map; 73 | bool user_alleles; 74 | char *user_alleles_mem; 75 | tsk_id_t *traversal_stack; 76 | tsk_flags_t options; 77 | tsk_id_t *alt_samples; 78 | tsk_id_t *alt_sample_index_map; 79 | 80 | } tsk_variant_t; 81 | 82 | /* All vargen related structs and methods were deprecated in C API v1.0 */ 83 | typedef struct { 84 | const tsk_treeseq_t *tree_sequence; 85 | tsk_id_t site_index; 86 | tsk_variant_t variant; 87 | } tsk_vargen_t; 88 | 89 | /** 90 | @defgroup VARIANT_API_GROUP Variant API for obtaining genotypes. 91 | @{ 92 | */ 93 | 94 | /** 95 | @brief Initialises the variant by allocating the internal memory 96 | 97 | @rst 98 | This must be called before any operations are performed on the variant. 99 | See the :ref:`sec_c_api_overview_structure` for details on how objects 100 | are initialised and freed. 101 | @endrst 102 | 103 | @param self A pointer to an uninitialised tsk_variant_t object. 104 | @param tree_sequence A pointer to the tree sequence from which this variant 105 | will decode genotypes. No copy is taken, so this tree sequence must persist 106 | for the lifetime of the variant. 107 | @param samples Optional. Either `NULL` or an array of node ids of the samples that are to 108 | have their genotypes decoded. A copy of this array will be taken by the variant. If 109 | `NULL` then the samples from the tree sequence will be used. 
110 | @param num_samples The number of ids in the samples array, ignored if `samples` is `NULL` 111 | @param alleles Optional. Either ``NULL`` or an array of string alleles with a terminal 112 | ``NULL`` sentinel value. 113 | If specified, the genotypes will be decoded to match the index in this allele array. 114 | If ``NULL`` then alleles will be automatically determined from the mutations encountered. 115 | @param options Variant options. Either ``0`` or ``TSK_ISOLATED_NOT_MISSING`` which 116 | if specified indicates that isolated sample nodes should not be decoded as the "missing" 117 | state but as the ancestral state (or the state of any mutation above them). 118 | @return Return 0 on success or a negative value on failure. 119 | */ 120 | int tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, 121 | const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, 122 | tsk_flags_t options); 123 | 124 | /** 125 | @brief Copies the state of this variant to another variant 126 | 127 | @rst 128 | Copies the site, genotypes and alleles from this variant to another. Note that 129 | the other variant should be uninitialised as this method does not free any 130 | memory that the other variant owns. After copying `other` is frozen and 131 | this restricts it from being further decoded at any site. `self` remains unchanged. 132 | @endrst 133 | 134 | @param self A pointer to an initialised and decoded tsk_variant_t object. 135 | @param other A pointer to an uninitialised tsk_variant_t object. 136 | @return Return 0 on success or a negative value on failure. 137 | */ 138 | int tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other); 139 | 140 | /** 141 | @brief Decode the genotypes at the given site, storing them in this variant. 142 | 143 | @rst 144 | Decodes the genotypes for this variant's samples, indexed to this variant's alleles, 145 | at the specified site. 146 | This method is most efficient at decoding sites in-order, either forwards or backwards 147 | along the tree sequence. Resulting genotypes are stored in the ``genotypes`` member of 148 | this variant. 149 | @endrst 150 | 151 | @param self A pointer to an initialised tsk_variant_t object. 152 | @param site_id A valid site id for the tree sequence of this variant. 153 | @param options Bitwise option flags. Currently unused; should be 154 | set to zero to ensure compatibility with later versions of `tskit`. 155 | @return Return 0 on success or a negative value on failure. 156 | */ 157 | int tsk_variant_decode(tsk_variant_t *self, tsk_id_t site_id, tsk_flags_t options); 158 | 159 | /** 160 | @brief Free the internal memory for the specified variant. 161 | 162 | @param self A pointer to an initialised tsk_variant_t object. 163 | @return Always returns 0. 164 | */ 165 | int tsk_variant_free(tsk_variant_t *self); 166 | 167 | /** 168 | @brief Print out the state of this variant to the specified stream. 169 | 170 | This method is intended for debugging purposes and should not be used 171 | in production code. The format of the output should **not** be depended 172 | on and may change arbitrarily between versions. 173 | 174 | @param self A pointer to a tsk_variant_t object. 175 | @param out The stream to write the summary to. 
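
A minimal sketch of the overall variant workflow documented in this group
(names such as ts and do_something_with are hypothetical; each return value
should be checked for 0 in real code):

    tsk_variant_t var;
    tsk_id_t j;
    int ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0);
    for (j = 0; j < (tsk_id_t) tsk_treeseq_get_num_sites(&ts); j++) {
        ret = tsk_variant_decode(&var, j, 0);
        do_something_with(var.genotypes, var.num_alleles);
    }
    tsk_variant_free(&var);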
176 | */ 177 | void tsk_variant_print_state(const tsk_variant_t *self, FILE *out); 178 | 179 | /** @} */ 180 | 181 | /* Deprecated vargen methods (since C API v1.0) */ 182 | int tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, 183 | const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, 184 | tsk_flags_t options); 185 | int tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant); 186 | int tsk_vargen_free(tsk_vargen_t *self); 187 | void tsk_vargen_print_state(const tsk_vargen_t *self, FILE *out); 188 | 189 | #ifdef __cplusplus 190 | } 191 | #endif 192 | #endif 193 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/haplotype_matching.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2022 Tskit Developers 5 | * 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy 7 | * of this software and associated documentation files (the "Software"), to deal 8 | * in the Software without restriction, including without limitation the rights 9 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | * copies of the Software, and to permit persons to whom the Software is 11 | * furnished to do so, subject to the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be included in all 14 | * copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | */ 24 | 25 | #ifndef TSK_HAPLOTYPE_MATCHING_H 26 | #define TSK_HAPLOTYPE_MATCHING_H 27 | 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | #include 33 | 34 | /* Seems like we might use this somewhere else as well, so putting it into the middle 35 | * of the flags space */ 36 | #define TSK_ALLELES_ACGT (1 << 16) 37 | 38 | typedef struct { 39 | tsk_id_t tree_node; 40 | tsk_id_t value_index; 41 | double value; 42 | } tsk_value_transition_t; 43 | 44 | typedef struct { 45 | tsk_size_t index; 46 | double value; 47 | } tsk_argsort_t; 48 | 49 | typedef struct { 50 | tsk_id_t tree_node; 51 | tsk_id_t old_state; 52 | tsk_id_t new_state; 53 | tsk_id_t transition_parent; 54 | } tsk_transition_stack_t; 55 | 56 | typedef struct { 57 | double normalisation_factor; 58 | double *value; 59 | tsk_id_t *node; 60 | tsk_size_t num_values; 61 | } tsk_site_probability_t; 62 | 63 | typedef struct { 64 | tsk_treeseq_t *tree_sequence; 65 | tsk_flags_t options; 66 | tsk_size_t num_sites; 67 | tsk_size_t num_samples; 68 | double *normalisation_factor; 69 | tsk_size_t *num_transitions; 70 | double **values; 71 | tsk_id_t **nodes; 72 | tsk_blkalloc_t memory; 73 | } tsk_compressed_matrix_t; 74 | 75 | typedef struct { 76 | tsk_id_t site; 77 | tsk_id_t node; 78 | bool required; 79 | } tsk_recomb_required_record; 80 | 81 | typedef struct { 82 | tsk_compressed_matrix_t matrix; 83 | tsk_recomb_required_record *recombination_required; 84 | tsk_size_t num_recomb_records; 85 | tsk_size_t max_recomb_records; 86 | } tsk_viterbi_matrix_t; 87 | 88 | typedef struct _tsk_ls_hmm_t { 89 | /* input */ 90 | tsk_treeseq_t *tree_sequence; 91 | double *recombination_rate; 92 | double *mutation_rate; 93 | const char ***alleles; 94 | unsigned int precision; 95 | uint32_t *num_alleles; 96 | tsk_size_t num_samples; 97 | tsk_size_t num_sites; 98 | tsk_size_t num_nodes; 99 | /* state */ 100 | tsk_tree_t tree; 101 | tsk_diff_iter_t diffs; 102 | tsk_id_t *parent; 103 | /* The probability value transitions on the tree */ 104 | tsk_value_transition_t *transitions; 105 | tsk_value_transition_t *transitions_copy; 106 | /* Stack used when distributing transitions on the tree */ 107 | tsk_transition_stack_t *transition_stack; 108 | /* Map of node_id to index in the transitions list */ 109 | tsk_id_t *transition_index; 110 | /* Buffer used to argsort the transitions by node time */ 111 | tsk_argsort_t *transition_time_order; 112 | tsk_size_t num_transitions; 113 | tsk_size_t max_transitions; 114 | /* The distinct values in the transitions */ 115 | double *values; 116 | tsk_size_t num_values; 117 | tsk_size_t max_values; 118 | tsk_size_t max_parsimony_words; 119 | /* Number of machine words per node optimal value set. 
*/ 120 | tsk_size_t num_optimal_value_set_words; 121 | uint64_t *optimal_value_sets; 122 | /* The parent transition; used during compression */ 123 | tsk_id_t *transition_parent; 124 | /* The number of samples directly subtended by a transition */ 125 | tsk_size_t *num_transition_samples; 126 | int32_t *allelic_state; 127 | /* Algorithms set these values before they are run */ 128 | int (*next_probability)( 129 | struct _tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *); 130 | double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *); 131 | void *output; 132 | } tsk_ls_hmm_t; 133 | 134 | int tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, 135 | double *recombination_rate, double *mutation_rate, tsk_flags_t options); 136 | int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision); 137 | int tsk_ls_hmm_free(tsk_ls_hmm_t *self); 138 | void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out); 139 | int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, 140 | tsk_compressed_matrix_t *output, tsk_flags_t options); 141 | int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, 142 | tsk_viterbi_matrix_t *output, tsk_flags_t options); 143 | int tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, 144 | int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), 145 | double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output); 146 | 147 | int tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, 148 | tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options); 149 | int tsk_compressed_matrix_free(tsk_compressed_matrix_t *self); 150 | int tsk_compressed_matrix_clear(tsk_compressed_matrix_t *self); 151 | void tsk_compressed_matrix_print_state(tsk_compressed_matrix_t *self, FILE *out); 152 | int tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, 153 | double normalisation_factor, tsk_size_t num_transitions, 154 | const tsk_value_transition_t *transitions); 155 | int tsk_compressed_matrix_decode(tsk_compressed_matrix_t *self, double *values); 156 | 157 | int tsk_viterbi_matrix_init(tsk_viterbi_matrix_t *self, tsk_treeseq_t *tree_sequence, 158 | tsk_size_t block_size, tsk_flags_t options); 159 | int tsk_viterbi_matrix_free(tsk_viterbi_matrix_t *self); 160 | int tsk_viterbi_matrix_clear(tsk_viterbi_matrix_t *self); 161 | void tsk_viterbi_matrix_print_state(tsk_viterbi_matrix_t *self, FILE *out); 162 | int tsk_viterbi_matrix_add_recombination_required( 163 | tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_id_t node, bool required); 164 | int tsk_viterbi_matrix_traceback( 165 | tsk_viterbi_matrix_t *self, tsk_id_t *path, tsk_flags_t options); 166 | 167 | #ifdef __cplusplus 168 | } 169 | #endif 170 | #endif 171 | -------------------------------------------------------------------------------- /lib/subprojects/tskit/tskit/stats.h: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2019-2021 Tskit Developers 5 | * Copyright (c) 2016-2017 University of Oxford 6 | * 7 | * Permission is hereby granted, free of charge, to any person obtaining a copy 8 | * of this software and associated documentation files (the "Software"), to deal 9 | * in the Software without restriction, including without limitation the rights 10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | * copies of the Software, and to permit persons to whom the Software is 12 | * 
furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be included in all 15 | * copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | * SOFTWARE. 24 | */ 25 | 26 | #ifndef TSK_STATS_H 27 | #define TSK_STATS_H 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #include 34 | 35 | typedef struct { 36 | const tsk_treeseq_t *tree_sequence; 37 | tsk_site_t focal_site; 38 | tsk_size_t total_samples; 39 | tsk_size_t focal_samples; 40 | double max_distance; 41 | tsk_size_t max_sites; 42 | tsk_tree_t tree; 43 | tsk_id_t *sample_buffer; 44 | double *result; 45 | tsk_size_t result_length; 46 | } tsk_ld_calc_t; 47 | 48 | int tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence); 49 | int tsk_ld_calc_free(tsk_ld_calc_t *self); 50 | void tsk_ld_calc_print_state(const tsk_ld_calc_t *self, FILE *out); 51 | int tsk_ld_calc_get_r2(tsk_ld_calc_t *self, tsk_id_t a, tsk_id_t b, double *r2); 52 | int tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, 53 | tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values); 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | #endif 59 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "setuptools_scm", 5 | "wheel", 6 | "numpy>=2" 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | [tool.setuptools_scm] 11 | write_to = "tsinfer/_version.py" 12 | 13 | [project] 14 | name = "tsinfer" 15 | dynamic = ["version"] 16 | authors = [ 17 | {name = "Tskit Developers", email = "admin@tskit.dev"}, 18 | ] 19 | description = "Infer tree sequences from genetic variation data." 
20 | readme = "README.md" 21 | requires-python = ">=3.9" 22 | license = {text = "GNU GPLv3+"} 23 | classifiers = [ 24 | "Programming Language :: C", 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3 :: Only", 32 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 33 | "Development Status :: 3 - Alpha", 34 | "Environment :: Other Environment", 35 | "Intended Audience :: Science/Research", 36 | "Operating System :: POSIX", 37 | "Operating System :: MacOS :: MacOS X", 38 | "Operating System :: Microsoft :: Windows", 39 | "Topic :: Scientific/Engineering", 40 | "Topic :: Scientific/Engineering :: Bio-Informatics", 41 | ] 42 | keywords = [ 43 | "population genetics", 44 | "tree sequence", 45 | "ancestral recombination graph", 46 | "evolutionary tree", 47 | "inference", 48 | "tsinfer", 49 | ] 50 | dependencies = [ 51 | "numpy>=1.23.5", 52 | "six", 53 | "tqdm", 54 | "humanize", 55 | "daiquiri", 56 | "tskit>=0.5.3", 57 | "numcodecs>=0.6", 58 | "zarr>=2.2,!=2.11.0,!=2.11.1,!=2.11.2,<3", 59 | "lmdb", 60 | "sortedcontainers", 61 | "attrs>=19.2.0", 62 | "numba", 63 | "psutil>=5.9.0", 64 | ] 65 | 66 | [project.urls] 67 | Homepage = "https://tskit.dev/tsinfer" 68 | Documentation = "https://tskit.dev/tsinfer/docs/stable" 69 | Changelog = "https://tskit.dev/tsinfer/docs/stable/CHANGELOG.html" 70 | "Bug Tracker" = "https://github.com/tskit-dev/tsinfer/issues" 71 | "Source Code" = "https://github.com/tskit-dev/tsinfer/" 72 | 73 | [project.scripts] 74 | tsinfer = "tsinfer.__main__:main" 75 | 76 | [tool.setuptools] 77 | packages = ["tsinfer"] 78 | include-package-data = true 79 | 80 | [tool.pytest.ini_options] 81 | testpaths = ["tests"] 82 | filterwarnings = [ 83 | 'ignore:SampleData' 84 | ] -------------------------------------------------------------------------------- /requirements/CI-docs/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter-book==1.0.4.post1 2 | sphinx-issues==5.0.0 3 | sphinx-argparse==0.5.2 4 | humanize==4.12.1 5 | lmdb==1.6.2 6 | tqdm==4.67.1 7 | daiquiri==3.3.0 8 | msprime==1.3.3 9 | sgkit[vcf]==0.9.0 10 | ipywidgets==8.1.5 11 | Bio==1.7.1 12 | bio2zarr==0.1.4 13 | sphinx-book-theme #Unpinned to allow easy updating. 
14 | pyfaidx==0.8.1.3 -------------------------------------------------------------------------------- /requirements/CI-tests-complete/requirements.txt: -------------------------------------------------------------------------------- 1 | build==1.2.2.post1 2 | colorama==0.4.6 3 | daiquiri==3.2.5.1 4 | humanize==4.12.1 5 | lmdb==1.6.2 6 | matplotlib==3.9.4 7 | meson==1.7.0 8 | msprime==1.3.3 9 | pytest==8.3.5 10 | pytest-cov==6.0.0 11 | seaborn==0.13.2 12 | sgkit[vcf]==0.9.0 13 | tskit==0.6.0 14 | tqdm==4.67.1 15 | twine==6.1.0 16 | -------------------------------------------------------------------------------- /requirements/CI-tests-conda/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==8.3.5 2 | msprime==1.3.3 3 | humanize==4.12.1 4 | python-lmdb==1.4.1 5 | tqdm==4.67.1 6 | daiquiri==3.0.0 # Pinned as conda package not updating 7 | matplotlib==3.9.4 8 | seaborn==0.13.2 9 | colorama==0.4.6 10 | tskit==0.6.0 -------------------------------------------------------------------------------- /requirements/development.txt: -------------------------------------------------------------------------------- 1 | attrs 2 | codecov 3 | coverage 4 | flake8 5 | six 6 | tqdm 7 | humanize 8 | daiquiri 9 | msprime >= 1.0.0 10 | tskit >= 0.5.3 11 | lmdb 12 | pre-commit 13 | pytest 14 | pytest-coverage 15 | # Only for giving nice error messages for incompatible older files 16 | h5py 17 | # Only needed for the Python implementation. 18 | sortedcontainers 19 | # Optional extras for debugging threads - these modules mainly work on linux 20 | python-prctl; sys_platform == 'linux' 21 | numa; sys_platform == 'linux' 22 | # Needed for building docs. 23 | sphinx 24 | sphinx-argparse 25 | sphinx_rtd_theme 26 | setuptools>=45 27 | setuptools_scm 28 | # Needed for evaluation script. 29 | matplotlib 30 | seaborn 31 | colorama 32 | sgkit[vcf] 33 | sphinx-book-theme 34 | jupyter-book 35 | sphinx-issues 36 | ipywidgets 37 | pyfaidx 38 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = tsinfer 3 | author= Tskit Developers 4 | author_email = admin@tskit.dev 5 | license = GNU GPLv3+ 6 | description= Infer tree sequences from genetic variation data. 
7 | long_description_content_type = text/markdown 8 | long_description = file: README.md 9 | url = https://tskit.dev/tsinfer 10 | project_urls = 11 | Documentation = https://tskit.dev/tsinfer/docs/stable 12 | Changelog = https://tskit.dev/tsinfer/docs/stable/CHANGELOG.html 13 | Bug Tracker = https://github.com/tskit-dev/tsinfer/issues 14 | GitHub = https://github.com/tskit-dev/tsinfer/ 15 | classifiers = 16 | Programming Language :: C 17 | Programming Language :: Python 18 | Programming Language :: Python :: 3 19 | Programming Language :: Python :: 3.9 20 | Programming Language :: Python :: 3.10 21 | Programming Language :: Python :: 3.11 22 | Programming Language :: Python :: 3.12 23 | Programming Language :: Python :: 3 :: Only 24 | License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) 25 | Development Status :: 3 - Alpha 26 | Environment :: Other Environment 27 | Intended Audience :: Science/Research 28 | Operating System :: POSIX 29 | Operating System :: MacOS :: MacOS X 30 | Operating System :: Microsoft :: Windows 31 | Topic :: Scientific/Engineering 32 | Topic :: Scientific/Engineering :: Bio-Informatics 33 | keywords = 34 | population genetics 35 | tree sequence 36 | ancestral recombination graph 37 | evolutionary tree 38 | inference 39 | tsinfer 40 | platforms = 41 | POSIX 42 | Windows 43 | MacOS X 44 | 45 | [options] 46 | packages = tsinfer 47 | python_requires = >=3.9 48 | include_package_data = True 49 | install_requires = 50 | numpy>=1.23.5 51 | six 52 | tqdm 53 | humanize 54 | daiquiri 55 | tskit>=0.5.8 56 | numcodecs>=0.6 57 | # issues 965 and 967 at zarr-python prevent usage of 2.11.0 and 2.11.1 58 | zarr>=2.2,!=2.11.0,!=2.11.1,!=2.11.2,<3 59 | lmdb 60 | sortedcontainers 61 | attrs>=19.2.0 62 | numba 63 | 64 | [options.entry_points] 65 | console_scripts = 66 | tsinfer = tsinfer.__main__:main 67 | 68 | [tool:pytest] 69 | testpaths = 70 | tests 71 | 72 | [bdist_wheel] 73 | # This flag says to generate wheels that support both Python 2 and Python 74 | # 3. If your code will not run unchanged on both Python 2 and 3, you will 75 | # need to generate separate wheels for each Python version that you 76 | # support. 
77 | universal=0 78 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | 4 | import numpy 5 | from setuptools import Extension 6 | from setuptools import setup 7 | 8 | IS_WINDOWS = platform.system() == "Windows" 9 | 10 | libdir = "lib" 11 | tskroot = os.path.join(libdir, "subprojects", "tskit") 12 | tskdir = os.path.join(tskroot, "tskit") 13 | kasdir = os.path.join(tskroot, "subprojects", "kastore") 14 | includes = [libdir, tskroot, tskdir, kasdir] 15 | 16 | tsi_source_files = [ 17 | "ancestor_matcher.c", 18 | "ancestor_builder.c", 19 | "object_heap.c", 20 | "tree_sequence_builder.c", 21 | "err.c", 22 | "avl.c", 23 | ] 24 | tsk_source_files = ["core.c"] 25 | kas_source_files = ["kastore.c"] 26 | 27 | sources = ( 28 | ["_tsinfermodule.c"] 29 | + [os.path.join(libdir, f) for f in tsi_source_files] 30 | + [os.path.join(tskdir, f) for f in tsk_source_files] 31 | + [os.path.join(kasdir, f) for f in kas_source_files] 32 | ) 33 | 34 | libraries = ["Advapi32"] if IS_WINDOWS else [] 35 | 36 | _tsinfer_module = Extension( 37 | "_tsinfer", 38 | sources=sources, 39 | extra_compile_args=["-std=c99"], 40 | libraries=libraries, 41 | undef_macros=["NDEBUG"], 42 | include_dirs=includes + [numpy.get_include()], 43 | ) 44 | 45 | setup( 46 | ext_modules=[_tsinfer_module], 47 | ) 48 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2020 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Configuration and fixtures for pytest. Only put test-suite wide fixtures in here. Module 21 | specific fixtures should live in their modules. 22 | 23 | To use a fixture in a test simply refer to it by name as an argument. This is called 24 | dependancy injection. Note that all fixtures should have the suffix "_fixture" to make 25 | it clear in test code. 26 | 27 | For example to use the `ts` fixture (a tree sequence with data in all fields) in a test: 28 | 29 | class TestClass: 30 | def test_something(self, ts_fixture): 31 | assert ts_fixture.some_method() == expected 32 | 33 | Fixtures can be parameterised etc. see https://docs.pytest.org/en/stable/fixture.html 34 | 35 | Note that fixtures have a "scope" for example `ts_fixture` below is only created once 36 | per test session and re-used for subsequent tests. 
37 | """ 38 | import msprime 39 | import numpy as np 40 | import pytest 41 | import tskit 42 | from pytest import fixture 43 | from tsutil import mark_mutation_times_unknown 44 | 45 | import tsinfer 46 | 47 | 48 | def pytest_addoption(parser): 49 | """ 50 | Add an option to skip tests marked with `@pytest.mark.slow` 51 | """ 52 | parser.addoption( 53 | "--skip-slow", action="store_true", default=False, help="Skip slow tests" 54 | ) 55 | 56 | 57 | def pytest_configure(config): 58 | """ 59 | Add docs on the "slow" marker 60 | """ 61 | config.addinivalue_line("markers", "slow: mark test as slow to run") 62 | 63 | 64 | def pytest_collection_modifyitems(config, items): 65 | if config.getoption("--skip-slow"): 66 | skip_slow = pytest.mark.skip(reason="--skip-slow specified") 67 | for item in items: 68 | if "slow" in item.keywords: 69 | item.add_marker(skip_slow) 70 | 71 | 72 | def num_nonsample_muts(ts): 73 | return np.sum(np.logical_not(np.isin(ts.tables.mutations.node, ts.samples()))) 74 | 75 | 76 | def assign_individual_ids(ts): 77 | tables = ts.dump_tables() 78 | ind_md = [{"id": i} for i in range(ts.num_individuals)] 79 | tables.individuals.metadata_schema = tskit.MetadataSchema.permissive_json() 80 | tables.individuals.packset_metadata( 81 | [tables.individuals.metadata_schema.validate_and_encode_row(r) for r in ind_md] 82 | ) 83 | return tables.tree_sequence() 84 | 85 | 86 | @fixture(scope="session") 87 | def small_ts_fixture(): 88 | """ 89 | A simple 1-tree sequence with at least 2 inference sites 90 | (i.e. mutations above a non-sample node), and no mutation times 91 | """ 92 | ts = msprime.sim_ancestry(10, sequence_length=1000, ploidy=1, random_seed=1) 93 | ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) 94 | ts = assign_individual_ids(ts) 95 | assert num_nonsample_muts(ts) > 1 96 | return mark_mutation_times_unknown(ts) 97 | 98 | 99 | @fixture(scope="session") 100 | def small_sd_fixture(small_ts_fixture): 101 | """ 102 | A sample data instance from the small 1-tree sequence 103 | """ 104 | return tsinfer.SampleData.from_tree_sequence(small_ts_fixture) 105 | 106 | 107 | @fixture(scope="session") 108 | def small_sd_anc_fixture(small_ts_fixture): 109 | """ 110 | A sample data and an ancestors instance from the small 1-tree sequence 111 | """ 112 | sd = tsinfer.SampleData.from_tree_sequence(small_ts_fixture) 113 | return sd, tsinfer.generate_ancestors(sd) 114 | 115 | 116 | @fixture(scope="session") 117 | def medium_ts_fixture(): 118 | """ 119 | A medium sized tree sequence with a good number of trees and inference mutations 120 | (i.e. mutations above a non-sample node), and no mutation times. 
Samples are 121 | haploid, so we have one individual per sample, which has metadata for identification 122 | """ 123 | ts = msprime.sim_ancestry( 124 | 10, sequence_length=1000, ploidy=1, recombination_rate=0.01, random_seed=3 125 | ) 126 | ts = msprime.sim_mutations(ts, rate=0.02, random_seed=3) 127 | ts = assign_individual_ids(ts) 128 | assert ts.num_trees > 10 129 | assert num_nonsample_muts(ts) > 50 130 | return mark_mutation_times_unknown(ts) 131 | 132 | 133 | @fixture(scope="session") 134 | def medium_sd_fixture(medium_ts_fixture): 135 | """ 136 | A sample data instance from the medium-sized tree sequence 137 | """ 138 | return tsinfer.SampleData.from_tree_sequence( 139 | medium_ts_fixture, use_sites_time=False 140 | ) 141 | -------------------------------------------------------------------------------- /tests/data/bugs/invalid_pc_ancestor_time.samples: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/tests/data/bugs/invalid_pc_ancestor_time.samples -------------------------------------------------------------------------------- /tests/data/old_formats/medium_sd_fixture_0.2.3.samples: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tsinfer/20788d393b79f0ee8b39d866456533c2d86abbe7/tests/data/old_formats/medium_sd_fixture_0.2.3.samples -------------------------------------------------------------------------------- /tests/test_low_level.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018-2020 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Integrity tests for the low-level module. 21 | """ 22 | import sys 23 | 24 | import pytest 25 | 26 | import _tsinfer 27 | 28 | 29 | IS_WINDOWS = sys.platform == "win32" 30 | 31 | 32 | class TestOutOfMemory: 33 | """ 34 | Make sure we raise the correct error when out of memory occurs in 35 | the library code. 
36 | """ 37 | 38 | @pytest.mark.skipif( 39 | sys.platform == "win32", 40 | reason="windows seems to allow initializing with insane # of nodes" 41 | " (perhaps memory allocation is optimised out at this stage?)", 42 | ) 43 | def test_tree_sequence_builder_too_many_nodes(self): 44 | big = 2**62 45 | with pytest.raises(MemoryError): 46 | _tsinfer.TreeSequenceBuilder([2], max_nodes=big) 47 | 48 | @pytest.mark.skipif( 49 | sys.platform == "win32", 50 | reason="windows raises an assert error not a memory error with 2**62 edges" 51 | " (line 149 of object_heap.c)", 52 | ) 53 | def test_tree_sequence_builder_too_many_edges(self): 54 | big = 2**62 55 | with pytest.raises(MemoryError): 56 | _tsinfer.TreeSequenceBuilder([2], max_edges=big) 57 | 58 | 59 | class TestAncestorMatcher: 60 | """ 61 | Tests for the AncestorMatcher C Python interface. 62 | """ 63 | 64 | def test_init(self): 65 | with pytest.raises(TypeError): 66 | _tsinfer.AncestorMatcher() 67 | with pytest.raises(TypeError): 68 | _tsinfer.AncestorMatcher(None) 69 | tsb = _tsinfer.TreeSequenceBuilder([2]) 70 | with pytest.raises(TypeError): 71 | _tsinfer.AncestorMatcher(tsb) 72 | with pytest.raises(TypeError): 73 | _tsinfer.AncestorMatcher(tsb, [1]) 74 | for bad_type in [None, {}]: 75 | with pytest.raises(TypeError): 76 | _tsinfer.AncestorMatcher(tsb, [1], [1], extended_checks=bad_type) 77 | with pytest.raises(TypeError): 78 | _tsinfer.AncestorMatcher(tsb, [1], [1], precision=bad_type) 79 | for bad_array in [[], [[], []], None, "sdf", [1, 2, 3]]: 80 | with pytest.raises(ValueError): 81 | _tsinfer.AncestorMatcher(tsb, bad_array, [1]) 82 | with pytest.raises(ValueError): 83 | _tsinfer.AncestorMatcher(tsb, [1], bad_array) 84 | 85 | 86 | class TestTreeSequenceBuilder: 87 | """ 88 | Tests for the AncestorMatcher C Python interface. 89 | """ 90 | 91 | def test_init(self): 92 | with pytest.raises(TypeError): 93 | _tsinfer.TreeSequenceBuilder() 94 | for bad_array in [None, "serf", [[], []], ["asdf"], {}]: 95 | with pytest.raises(ValueError): 96 | _tsinfer.TreeSequenceBuilder(bad_array) 97 | 98 | for bad_type in [None, "sdf", {}]: 99 | with pytest.raises(TypeError): 100 | _tsinfer.TreeSequenceBuilder([2], max_nodes=bad_type) 101 | with pytest.raises(TypeError): 102 | _tsinfer.TreeSequenceBuilder([2], max_edges=bad_type) 103 | 104 | 105 | class TestAncestorBuilder: 106 | """ 107 | Tests for the AncestorBuilder C Python interface. 
108 | """ 109 | 110 | def test_init(self): 111 | with pytest.raises(TypeError): 112 | _tsinfer.AncestorBuilder() 113 | for bad_value in [None, "serf", [[], []], ["asdf"], {}]: 114 | with pytest.raises(TypeError): 115 | _tsinfer.AncestorBuilder(num_samples=2, max_sites=bad_value) 116 | with pytest.raises(TypeError): 117 | _tsinfer.AncestorBuilder(num_samples=bad_value, max_sites=2) 118 | with pytest.raises(TypeError): 119 | _tsinfer.AncestorBuilder( 120 | num_samples=2, max_sites=2, genotype_encoding=bad_value 121 | ) 122 | with pytest.raises(TypeError): 123 | _tsinfer.AncestorBuilder(num_samples=2, max_sites=2, mmap_fd=bad_value) 124 | for bad_num_samples in [0, 1]: 125 | with pytest.raises(_tsinfer.LibraryError): 126 | _tsinfer.AncestorBuilder(num_samples=bad_num_samples, max_sites=0) 127 | 128 | @pytest.mark.skipif(IS_WINDOWS, reason="mmap_fd is a no-op on Windows") 129 | def test_bad_fd(self): 130 | with pytest.raises(_tsinfer.LibraryError, match="Bad file desc"): 131 | _tsinfer.AncestorBuilder(num_samples=2, max_sites=2, mmap_fd=-2) 132 | 133 | def test_add_site(self): 134 | ab = _tsinfer.AncestorBuilder(num_samples=2, max_sites=10) 135 | for bad_type in ["sdf", {}, None]: 136 | with pytest.raises(TypeError): 137 | ab.add_site(time=bad_type, genotypes=[0, 0]) 138 | for bad_genotypes in ["asdf", [[], []], [0, 1, 2]]: 139 | with pytest.raises(ValueError): 140 | ab.add_site(time=0, genotypes=bad_genotypes) 141 | 142 | def test_add_too_many_sites(self): 143 | for max_sites in range(10): 144 | ab = _tsinfer.AncestorBuilder(num_samples=2, max_sites=max_sites) 145 | for _ in range(max_sites): 146 | ab.add_site(time=1, genotypes=[0, 1]) 147 | for _ in range(2 * max_sites): 148 | with pytest.raises(_tsinfer.LibraryError) as record: 149 | ab.add_site(time=1, genotypes=[0, 1]) 150 | msg = "Cannot add more sites than the specified maximum." 151 | assert str(record.value) == msg 152 | 153 | # TODO need tester methods for the remaining methonds in the class. 154 | -------------------------------------------------------------------------------- /tsinfer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Tree sequence inference. 21 | """ 22 | import sys 23 | import warnings 24 | 25 | # tsinfer #957. This warning pops up as a result of using fast-math. It bascially means 26 | # that tiny tiny values are being rounded to zero, which should be fine for our purposes. 27 | warnings.filterwarnings( 28 | "ignore", 29 | message="The value of the smallest subnormal for " 30 | "type is zero", 31 | ) 32 | 33 | 34 | if sys.version_info[0] < 3: 35 | raise Exception("Python 3 only") 36 | 37 | __version__ = "undefined" 38 | try: 39 | from . 
import _version 40 | 41 | __version__ = _version.version 42 | except ImportError: 43 | pass 44 | 45 | from .inference import * # NOQA 46 | from .formats import * # NOQA 47 | from .eval_util import * # NOQA 48 | from .exceptions import * # NOQA 49 | from .constants import * # NOQA 50 | from .cli import get_cli_parser # NOQA 51 | -------------------------------------------------------------------------------- /tsinfer/__main__.py: -------------------------------------------------------------------------------- 1 | import tsinfer.cli as cli 2 | 3 | 4 | def main(): 5 | cli.tsinfer_main() 6 | 7 | 8 | if __name__ == "__main__": 9 | main() 10 | -------------------------------------------------------------------------------- /tsinfer/ancestors.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2023 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Ancestor handling routines. 21 | """ 22 | import logging 23 | import time as time_ 24 | 25 | import numba 26 | import numpy as np 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def merge_overlapping_ancestors(start, end, time): 32 | # Merge overlapping, same-time ancestors. We do this by scanning along a single 33 | # time epoch from left to right, detecting breaks. 34 | sort_indices = np.lexsort((start, time)) 35 | start = start[sort_indices] 36 | end = end[sort_indices] 37 | time = time[sort_indices] 38 | old_indexes = {} 39 | # For efficiency, pre-allocate the output arrays to the maximum possible size. 40 | new_start = np.full_like(start, -1) 41 | new_end = np.full_like(end, -1) 42 | new_time = np.full_like(time, -1) 43 | 44 | i = 0 45 | new_index_pos = 0 46 | while i < len(start): 47 | j = i + 1 48 | group_overlap = [i] 49 | max_right = end[i] 50 | # While we're in the same time epoch, and the next ancestor 51 | # overlaps with the group, add this ancestor to the group. 52 | while j < len(start) and time[j] == time[i] and start[j] < max_right: 53 | max_right = max(max_right, end[j]) 54 | group_overlap.append(j) 55 | j += 1 56 | 57 | # Emit the found group 58 | old_indexes[new_index_pos] = group_overlap 59 | new_start[new_index_pos] = start[i] 60 | new_end[new_index_pos] = max_right 61 | new_time[new_index_pos] = time[i] 62 | new_index_pos += 1 63 | i = j 64 | # Trim the output arrays to the actual size. 65 | new_start = new_start[:new_index_pos] 66 | new_end = new_end[:new_index_pos] 67 | new_time = new_time[:new_index_pos] 68 | return new_start, new_end, new_time, old_indexes, sort_indices 69 | 70 | 71 | @numba.njit 72 | def run_linesweep(event_times, event_index, event_type, new_time): 73 | # Run the linesweep over the ancestor start-stop events, 74 | # building up the dependency graph as a count of dependencies for each ancestor, 75 | # and a list of dependant children for each ancestor. 
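    # Illustrative sketch of the dependency direction (hypothetical values, not
    # taken from real data): with two merged ancestors A = [0, 8) at time 2.0
    # and B = [0, 4) at time 1.0, processing B's start event while A is active
    # hits the new_time[A] > new_time[B] branch, so B gains an incoming edge and
    # is appended to children[A]. The symmetric branch handles the opposite
    # processing order. Either way, older overlapping ancestors must be matched
    # before the younger ancestors that overlap them.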
76 | n = len(new_time) 77 | 78 | # numba really likes to know the type of the list elements, so we tell it by adding 79 | # a dummy element to the list and then popping it off. 80 | # `active` is the list of ancestors that overlap with the current linesweep position. 81 | active = [-1] 82 | active.pop() 83 | children = [[-1] for _ in range(n)] 84 | for c in range(n): 85 | children[c].pop() 86 | incoming_edge_count = np.zeros(n, dtype=np.int32) 87 | for i in range(len(event_times)): 88 | index = event_index[i] 89 | e_time = event_times[i] 90 | if event_type[i] == 1: 91 | for j in active: 92 | if new_time[j] > e_time: 93 | incoming_edge_count[index] += 1 94 | children[j].append(index) 95 | elif new_time[j] < e_time: 96 | incoming_edge_count[j] += 1 97 | children[index].append(j) 98 | active.append(index) 99 | else: 100 | active.remove(index) 101 | 102 | # Convert children to ragged array format so we can pass arrays to the 103 | # next numba function, `find_groups`. 104 | children_data = [] 105 | children_indices = [0] 106 | for child_list in children: 107 | children_data.extend(child_list) 108 | children_indices.append(len(children_data)) 109 | children_data = np.array(children_data, dtype=np.int32) 110 | children_indices = np.array(children_indices, dtype=np.int32) 111 | return children_data, children_indices, incoming_edge_count 112 | 113 | 114 | @numba.njit 115 | def find_groups(children_data, children_indices, incoming_edge_count): 116 | # We find groups of ancestors that can be matched in parallel by topologically 117 | # sorting the dependency graph. We do this by deconstructing the graph, removing 118 | # nodes with no incoming edges, and adding them to a group. 119 | n = len(children_indices) - 1 120 | group_id = np.full(n, -1, dtype=np.int32) 121 | current_group = 0 122 | while True: 123 | # Find the nodes with no incoming edges 124 | no_incoming = np.where(incoming_edge_count == 0)[0] 125 | if len(no_incoming) == 0: 126 | break 127 | # Remove them from the graph 128 | for i in no_incoming: 129 | incoming_edge_count[i] = -1 130 | incoming_edge_count[ 131 | children_data[children_indices[i] : children_indices[i + 1]] 132 | ] -= 1 133 | # Add them to the group 134 | group_id[no_incoming] = current_group 135 | current_group += 1 136 | 137 | # Check for unassigned nodes (cycles in dependency graph) 138 | if np.any(group_id == -1): 139 | raise ValueError( 140 | "Erroneous cycle in ancestor dependancies, this is often " 141 | "caused by too many unique site times. This fixed by discretising " 142 | "the site times, for example rounding times to the nearest 0.1." 143 | ) 144 | return group_id 145 | 146 | 147 | def group_ancestors_by_linesweep(start, end, time): 148 | # For a given set of ancestors, we want to group them for matching in parallel. 149 | # For each ancestor, any overlapping, older ancestors must be in an earlier group, 150 | # and any overlapping, younger ancestors in a later group. Any overlapping same-age 151 | # ancestors must be in the same group so they don't match to each other. 152 | # We do this by first merging the overlapping same-age ancestors. Then build a 153 | # dependency graph of the ancestors by linesweep. Then form groups by topological 154 | # sort. Finally, we un-merge the same-age ancestors. 
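    # A small illustrative example (hypothetical input, not from the source):
    #
    #   start = np.array([0, 3, 0])
    #   end   = np.array([5, 8, 4])
    #   time  = np.array([2.0, 2.0, 1.0])
    #
    # Ancestors 0 and 1 are the same age and overlap, so they are merged and
    # end up in the same group; ancestor 2 is younger and overlaps them, so it
    # must be matched in a later group. The expected result is
    # {0: [0, 1], 1: [2]}.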
155 | 156 | assert len(start) == len(end) 157 | assert len(start) == len(time) 158 | t = time_.time() 159 | ( 160 | new_start, 161 | new_end, 162 | new_time, 163 | old_indexes, 164 | sort_indices, 165 | ) = merge_overlapping_ancestors(start, end, time) 166 | logger.info(f"Merged to {len(new_start)} ancestors in {time_.time() - t:.2f}s") 167 | 168 | # Build a list of events for the linesweep 169 | t = time_.time() 170 | n = len(new_time) 171 | # Create events arrays by copying and concatenating inputs 172 | event_times = np.concatenate([new_time, new_time]) 173 | event_pos = np.concatenate([new_start, new_end]) 174 | event_index = np.concatenate([np.arange(n), np.arange(n)]) 175 | event_type = np.concatenate([np.ones(n, dtype=np.int8), np.zeros(n, dtype=np.int8)]) 176 | # Sort events by position, then ends before starts 177 | event_sort_indices = np.lexsort((event_type, event_pos)) 178 | event_times = event_times[event_sort_indices] 179 | event_index = event_index[event_sort_indices] 180 | event_type = event_type[event_sort_indices] 181 | logger.info(f"Built {len(event_times)} events in {time_.time() - t:.2f}s") 182 | 183 | t = time_.time() 184 | children_data, children_indices, incoming_edge_count = run_linesweep( 185 | event_times, event_index, event_type, new_time 186 | ) 187 | logger.info( 188 | f"Linesweep generated {np.sum(incoming_edge_count)} dependencies in" 189 | f" {time_.time() - t:.2f}s" 190 | ) 191 | 192 | t = time_.time() 193 | group_id = find_groups(children_data, children_indices, incoming_edge_count) 194 | logger.info(f"Found groups in {time_.time() - t:.2f}s") 195 | 196 | t = time_.time() 197 | # Convert the group id array to lists of ids for each group 198 | ancestor_grouping = {} 199 | for group in np.unique(group_id): 200 | ancestor_grouping[group] = np.where(group_id == group)[0] 201 | 202 | # Now un-merge the same-age ancestors, simultaneously mapping back to the original, 203 | # unsorted indexes 204 | for group in ancestor_grouping: 205 | ancestor_grouping[group] = sorted( 206 | [ 207 | sort_indices[item] 208 | for i in ancestor_grouping[group] 209 | for item in old_indexes[i] 210 | ] 211 | ) 212 | logger.info(f"Un-merged in {time_.time() - t:.2f}s") 213 | logger.info( 214 | f"{len(ancestor_grouping)} groups with median size " 215 | f"{np.median([len(ancestor_grouping[group]) for group in ancestor_grouping])}" 216 | ) 217 | return ancestor_grouping 218 | -------------------------------------------------------------------------------- /tsinfer/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018-2023 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Collection of constants used in tsinfer. We also make use of constants defined in tskit. 
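
For example, nodes added by path compression can be picked out of an inferred
tree sequence by testing the relevant flag bit (a usage sketch, where ts is an
assumed inferred tskit tree sequence):

    is_pc = (ts.tables.nodes.flags & NODE_IS_PC_ANCESTOR) != 0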
21 | """ 22 | import enum 23 | 24 | import numpy as np 25 | 26 | C_ENGINE = "C" 27 | PY_ENGINE = "P" 28 | 29 | 30 | # TODO Change these to use the enum.IntFlag class 31 | 32 | # Bit 16 is set in node flags when they have been created by path compression. 33 | NODE_IS_PC_ANCESTOR = np.uint32(1 << 16) 34 | # Bit 17 is set in node flags when they have been created by a shared recombination 35 | # breakpoint. 36 | NODE_IS_SRB_ANCESTOR = np.uint32(1 << 17) 37 | # Bit 18 is set in node flags when they are samples inserted to augment existing 38 | # ancestors. 39 | NODE_IS_SAMPLE_ANCESTOR = np.uint32(1 << 18) 40 | # Bit 20 is set in node flags when they are samples not at time zero in the sampledata 41 | # file. 42 | NODE_IS_HISTORICAL_SAMPLE = np.uint32(1 << 20) 43 | 44 | # What type of inference have we done at a site? 45 | INFERENCE_NONE = "none" 46 | INFERENCE_FULL = "full" 47 | INFERENCE_PARSIMONY = "parsimony" 48 | 49 | 50 | class GenotypeEncoding(enum.IntEnum): 51 | """ 52 | The encoding scheme used to store genotypes. 53 | """ 54 | 55 | EIGHT_BIT = 0 56 | """ 57 | The default approach of using one byte per genotype. Supports up to 127 alleles 58 | and missing data. 59 | """ 60 | 61 | ONE_BIT = 1 62 | """ 63 | Encode binary genotype data using a single bit. 64 | """ 65 | -------------------------------------------------------------------------------- /tsinfer/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Exceptions raised by tsinfer. 21 | """ 22 | 23 | 24 | class TsinferException(Exception): 25 | """ 26 | Superclass of all exceptions thrown by tsinfer. 27 | """ 28 | 29 | 30 | class FileError(TsinferException): 31 | """ 32 | Exception raised when some non-specific error happens during file handling. 33 | """ 34 | 35 | 36 | class FileFormatError(FileError): 37 | """ 38 | Exception raised when a malformed file is encountered. 39 | """ 40 | 41 | 42 | class FileFormatTooOld(FileError): 43 | """ 44 | Exception raised when a file with a version too old is detected. 45 | """ 46 | 47 | 48 | class FileFormatTooNew(FileError): 49 | """ 50 | Exception raised when a file written by a newer version 51 | of tsinfer is detected. 52 | """ 53 | -------------------------------------------------------------------------------- /tsinfer/progress.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018-2020 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | A progress monitor class for tsinfer. 21 | """ 22 | from tqdm.auto import tqdm 23 | 24 | 25 | class ProgressMonitor: 26 | """ 27 | Class responsible for managing the tqdm progress monitors. 28 | """ 29 | 30 | def __init__( 31 | self, 32 | enabled=True, 33 | generate_ancestors=False, 34 | match_ancestors=False, 35 | augment_ancestors=False, 36 | match_samples=False, 37 | verify=False, 38 | tqdm_kwargs=None, 39 | ): 40 | self.enabled = enabled 41 | self.num_bars = 0 42 | if generate_ancestors: 43 | self.num_bars += 2 44 | if match_ancestors: 45 | self.num_bars += 1 46 | if match_samples: 47 | self.num_bars += 3 48 | if verify: 49 | assert self.num_bars == 0 50 | self.num_bars += 1 51 | if augment_ancestors: 52 | assert self.num_bars == 0 53 | self.num_bars += 2 54 | self.current_count = 0 55 | self.current_instance = None 56 | if not verify: 57 | # Only show extra detail if we are running match-ancestors by itself. 58 | self.show_detail = self.num_bars == 1 59 | self.descriptions = { 60 | "ga_add_sites": "ga-add", 61 | "ga_generate": "ga-gen", 62 | "ma_match": "ma-match", 63 | "ms_match": "ms-match", 64 | "ms_paths": "ms-paths", 65 | "ms_full_mutations": "ms-muts", 66 | "ms_extra_sites": "ms-xsites", 67 | "verify": "verify", 68 | } 69 | if tqdm_kwargs is None: 70 | tqdm_kwargs = {} 71 | self.tqdm_kwargs = tqdm_kwargs 72 | 73 | def set_detail(self, info): 74 | if self.show_detail: 75 | self.current_instance.set_postfix(info) 76 | 77 | def get(self, key, total): 78 | self.current_count += 1 79 | desc = "{:<8} ({}/{})".format( 80 | self.descriptions[key], self.current_count, self.num_bars 81 | ) 82 | bar_format = ( 83 | "{desc}{percentage:3.0f}%|{bar}" 84 | "| {n_fmt}/{total_fmt} [{elapsed}, {rate_fmt}{postfix}]" 85 | ) 86 | self.current_instance = tqdm( 87 | desc=desc, 88 | total=total, 89 | disable=not self.enabled, 90 | bar_format=bar_format, 91 | dynamic_ncols=True, 92 | smoothing=0.01, 93 | unit_scale=True, 94 | **self.tqdm_kwargs, 95 | ) 96 | return self.current_instance 97 | 98 | 99 | class DummyProgress: 100 | """ 101 | Class that mimics the subset of the tqdm API that we use in this module. 102 | """ 103 | 104 | def update(self, n=None): 105 | pass 106 | 107 | def close(self): 108 | pass 109 | 110 | 111 | class DummyProgressMonitor(ProgressMonitor): 112 | """ 113 | Simple class to mimic the interface of the real progress monitor. 114 | """ 115 | 116 | def get(self, key, total): 117 | return DummyProgress() 118 | 119 | def set_detail(self, info): 120 | pass 121 | -------------------------------------------------------------------------------- /tsinfer/provenance.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Common provenance methods used to determine the state and versions 21 | of various dependencies and the OS. 22 | """ 23 | import dataclasses 24 | import platform 25 | import sys 26 | import time 27 | 28 | import lmdb 29 | import numcodecs 30 | import psutil 31 | import tskit 32 | import zarr 33 | 34 | if sys.platform != "win32": 35 | import resource 36 | 37 | 38 | __version__ = "undefined" 39 | try: 40 | from . import _version 41 | 42 | __version__ = _version.version 43 | except ImportError: 44 | pass 45 | 46 | 47 | @dataclasses.dataclass 48 | class ResourceMetrics: 49 | elapsed_time: float 50 | user_time: float 51 | sys_time: float 52 | max_memory: int 53 | 54 | def asdict(self): 55 | return dataclasses.asdict(self) 56 | 57 | @classmethod 58 | def combine(cls, metrics_list): 59 | if not metrics_list: 60 | raise ValueError("Cannot combine empty list of metrics") 61 | return cls( 62 | elapsed_time=sum(m.elapsed_time for m in metrics_list), 63 | user_time=sum(m.user_time for m in metrics_list), 64 | sys_time=sum(m.sys_time for m in metrics_list), 65 | max_memory=max(m.max_memory for m in metrics_list), 66 | ) 67 | 68 | 69 | def get_environment(): 70 | """ 71 | Returns a dictionary describing the environment in which tsinfer 72 | is currently running. 73 | """ 74 | env = { 75 | "libraries": { 76 | "zarr": {"version": zarr.__version__}, 77 | "numcodecs": {"version": numcodecs.__version__}, 78 | "lmdb": {"version": lmdb.__version__}, 79 | "tskit": {"version": tskit.__version__}, 80 | }, 81 | "os": { 82 | "system": platform.system(), 83 | "node": platform.node(), 84 | "release": platform.release(), 85 | "version": platform.version(), 86 | "machine": platform.machine(), 87 | }, 88 | "python": { 89 | "implementation": platform.python_implementation(), 90 | "version": platform.python_version_tuple(), 91 | }, 92 | } 93 | return env 94 | 95 | 96 | def get_provenance_dict(command=None, resources=None, **kwargs): 97 | """ 98 | Returns a dictionary encoding an execution of tsinfer following the 99 | tskit provenance schema. 
100 | 101 | https://tskit.readthedocs.io/en/stable/provenance.html 102 | """ 103 | if command is None: 104 | raise ValueError("Command must be provided") 105 | parameters = dict(kwargs) 106 | parameters["command"] = command 107 | if "simplify" in parameters: 108 | if parameters["simplify"] is None: 109 | del parameters["simplify"] # simplify is a deprecated version of post_process 110 | else: 111 | del parameters["post_process"] 112 | document = { 113 | "schema_version": "1.0.0", 114 | "software": {"name": "tsinfer", "version": __version__}, 115 | "parameters": parameters, 116 | "environment": get_environment(), 117 | } 118 | if resources is not None: 119 | document["resources"] = resources 120 | return document 121 | 122 | 123 | def get_peak_memory_bytes(): 124 | # peak memory usage in bytes 125 | if sys.platform in ("linux", "darwin"): 126 | usage = resource.getrusage(resource.RUSAGE_SELF) 127 | max_rss = usage.ru_maxrss 128 | 129 | if sys.platform == "linux": 130 | # Linux reports in kilobytes 131 | return max_rss * 1024 # Convert KB to bytes 132 | # macOS reports in bytes 133 | return max_rss 134 | 135 | elif sys.platform == "win32": 136 | return psutil.Process().memory_info().peak_wset 137 | 138 | else: 139 | return None 140 | 141 | 142 | class TimingAndMemory: 143 | # Context manager for tracking timing and memory usage. 144 | def __init__(self): 145 | self.metrics = None 146 | 147 | def __enter__(self): 148 | self.start_process = psutil.Process() 149 | self.start_elapsed = time.perf_counter() 150 | self.start_times = self.start_process.cpu_times() 151 | return self 152 | 153 | def __exit__(self, exc_type, exc_val, exc_tb): 154 | end_times = self.start_process.cpu_times() 155 | self.metrics = ResourceMetrics( 156 | elapsed_time=time.perf_counter() - self.start_elapsed, 157 | user_time=end_times.user - self.start_times.user, 158 | sys_time=end_times.system - self.start_times.system, 159 | max_memory=get_peak_memory_bytes(), 160 | ) 161 | -------------------------------------------------------------------------------- /tsinfer/threads.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (C) 2018 University of Oxford 3 | # 4 | # This file is part of tsinfer. 5 | # 6 | # tsinfer is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # tsinfer is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with tsinfer. If not, see . 18 | # 19 | """ 20 | Utilities for handling threads. 21 | """ 22 | import _thread 23 | import concurrent.futures 24 | import heapq 25 | import logging 26 | import threading 27 | import traceback 28 | 29 | 30 | # prctl is an optional extra; it allows us to assign meaningful names to threads 31 | # for debugging.
32 | _prctl_available = False 33 | try: 34 | import prctl 35 | 36 | _prctl_available = True 37 | except ImportError: 38 | pass 39 | 40 | _numa_available = False 41 | try: 42 | import numa 43 | 44 | _numa_available = True 45 | except ImportError: 46 | pass 47 | 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | 52 | def threaded_map(func, args, num_workers): 53 | results_buffer = [] 54 | with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: 55 | futures = set() 56 | next_index = 0 57 | for i, arg in enumerate(args): 58 | # +1 so that we're not waiting for the args generator to produce the next arg 59 | while len(futures) >= num_workers + 1: 60 | # If there are too many in-progress tasks, wait for one to complete 61 | done, futures = concurrent.futures.wait( 62 | futures, return_when=concurrent.futures.FIRST_COMPLETED 63 | ) 64 | for future in done: 65 | index, result = future.result() 66 | if index == next_index: 67 | # If this result is the next expected one, yield it immediately 68 | yield result 69 | next_index += 1 70 | else: 71 | heapq.heappush(results_buffer, (index, result)) 72 | 73 | # Yield any results from the buffer that are next in line 74 | while results_buffer and results_buffer[0][0] == next_index: 75 | _, result = heapq.heappop(results_buffer) 76 | yield result 77 | next_index += 1 78 | 79 | # Wraps the function so we can track the index of the argument 80 | futures.add(executor.submit(lambda arg, i=i: (i, func(arg)), arg)) 81 | 82 | concurrent.futures.wait(futures) 83 | for future in futures: 84 | index, result = future.result() 85 | if index == next_index: 86 | yield result 87 | next_index += 1 88 | else: 89 | heapq.heappush(results_buffer, (index, result)) 90 | 91 | # Yield any remaining results in the buffer 92 | while results_buffer: 93 | _, result = heapq.heappop(results_buffer) 94 | yield result 95 | 96 | 97 | def _queue_thread(worker, work_queue, name="tsinfer-worker", index=0, consumer=True): 98 | def thread_target(): 99 | try: 100 | logger.debug(f"thread '{name}' starting") 101 | if _prctl_available: 102 | prctl.set_name(name) 103 | if _numa_available and numa.available(): 104 | numa.set_localalloc() 105 | logger.debug(f"Set NUMA local allocation policy on thread {name}") 106 | worker(index) 107 | logger.debug(f"thread '{name}' finishing") 108 | except Exception: 109 | logger.critical("Exception occurred in thread; exiting") 110 | logger.critical(traceback.format_exc()) 111 | # Communicate back to the main thread that something bad has happened. 112 | # This seems to be the only reliable way to do it. 113 | _thread.interrupt_main() 114 | # Now we still need to make sure that the main thread doesn't block 115 | # on the queue.get/join (as it won't be interrupted). This is an attempt 116 | # to make sure that it unblocks. May not be fool-proof though. 117 | # 118 | # TODO This doesn't really work. We can still block on pushing things 119 | # onto the queue. We'll probably have to do something ourselves using 120 | # timeouts and stuff to see if an error has occurred.
121 | if consumer: 122 | while True: 123 | try: 124 | work_queue.task_done() 125 | except ValueError: 126 | break 127 | else: 128 | work_queue.put(None) 129 | 130 | thread = threading.Thread(target=thread_target, daemon=True) 131 | thread.start() 132 | return thread 133 | 134 | 135 | def queue_producer_thread(worker, work_queue, name="tsinfer-worker", index=0): 136 | """ 137 | Returns a started thread that produces items for the specified queue using the 138 | specified worker function. 139 | """ 140 | return _queue_thread(worker, work_queue, name=name, index=index, consumer=False) 141 | 142 | 143 | def queue_consumer_thread(worker, work_queue, name="tsinfer-worker", index=0): 144 | """ 145 | Returns a started thread that consumes items from the specified queue using the 146 | specified worker function. 147 | """ 148 | return _queue_thread(worker, work_queue, name=name, index=index, consumer=True) 149 | --------------------------------------------------------------------------------
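A brief usage sketch of the threaded_map helper defined in tsinfer/threads.py above. This example is not part of the repository: the worker function slow_square, its sleep times, and the argument values are hypothetical, chosen only so that completion order differs from submission order. It illustrates the property the implementation is built around: results are yielded lazily in input order, with out-of-order completions held in a heap-backed buffer until their turn. Note that tsinfer.threads is an internal module rather than public API.

import time

from tsinfer.threads import threaded_map


def slow_square(x):
    # Later inputs sleep less, so they tend to complete before earlier ones.
    time.sleep(0.01 * (5 - x))
    return x * x


# Prints [0, 1, 4, 9, 16]: input order is preserved despite out-of-order completion.
print(list(threaded_map(slow_square, range(5), num_workers=3)))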