├── .clang-format ├── .clang-tidy ├── .github └── workflows │ ├── macos-unit.yml │ ├── python-package.yml │ ├── sync.yml │ ├── ubuntu-asan.yml │ ├── ubuntu-coverage.yml │ ├── ubuntu-no-sse.yml │ ├── ubuntu-python.yml │ ├── ubuntu-regression.yml │ └── ubuntu-unit.yml ├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CMakeLists.txt ├── COPYING ├── LICENSE ├── PyPI_README.md ├── README.md ├── RELEASE_NOTES.md ├── asmc └── asmc ├── cmake ├── AutodetectVcpkgToolchainFile.cmake ├── CheckDataModule.cmake ├── FindGMP.cmake └── SIMD.cmake ├── cpp_example ├── FastSMC_example.sh └── FastSMC_example_multiple_jobs.sh ├── docs ├── Makefile ├── asmc.md ├── asmc_python.md ├── conf.py ├── fastsmc.md ├── fastsmc_python.md ├── index.rst ├── make.bat ├── pages │ ├── asmc.rst │ ├── asmc_python.rst │ ├── fastsmc.rst │ ├── fastsmc_python.rst │ ├── quickstart_developer.rst │ └── quickstart_user.rst ├── quickstart_developer.md ├── quickstart_user.md └── requirements.txt ├── exe ├── main.cpp ├── main_convertBinary.cpp └── main_fastsmc.cpp ├── notebooks ├── asmc-minimal.ipynb ├── asmc.ipynb ├── asmc_w_decodingquant.ipynb ├── fastsmc-minimal.ipynb └── fastsmc.ipynb ├── setup.py ├── src ├── ASMC.cpp ├── ASMC.hpp ├── AvxDefinitions.hpp ├── BinaryDataReader.hpp ├── Data.cpp ├── Data.hpp ├── DecodePairsReturnStruct.hpp ├── DecodingParams.cpp ├── DecodingParams.hpp ├── DecodingQuantities.cpp ├── DecodingQuantities.hpp ├── FastSMC.cpp ├── FastSMC.hpp ├── FileUtils.cpp ├── FileUtils.hpp ├── HMM.cpp ├── HMM.hpp ├── HmmUtils.cpp ├── HmmUtils.hpp ├── Individual.cpp ├── Individual.hpp ├── MemoryUtils.cpp ├── MemoryUtils.hpp ├── StringUtils.cpp ├── StringUtils.hpp ├── Timer.cpp ├── Timer.hpp ├── Types.hpp ├── __init__.py ├── hashing │ ├── ExtendHash.hpp │ ├── Individuals.hpp │ ├── Match.hpp │ ├── SeedHash.hpp │ ├── Utils.cpp │ └── Utils.hpp └── pybind.cpp ├── test ├── catch.hpp ├── cli_interface_test.py ├── test_ASMC.cpp ├── test_HMM.cpp ├── test_binary_data_reader.cpp ├── test_decoding_params.cpp 
├── test_decoding_quantities.cpp ├── test_fastsmc_regression.cpp ├── test_hashing.cpp ├── test_hmm_utils.cpp ├── test_regression.cpp ├── test_regression.py ├── test_unit_asmc.py ├── test_unit_decoding_params.py └── unit_tests.cpp └── vcpkg.json /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | AllowShortFunctionsOnASingleLine: None 4 | AllowShortIfStatementsOnASingleLine: Never 5 | AllowShortLoopsOnASingleLine: false 6 | ColumnLimit: 120 7 | BreakBeforeBraces: Linux 8 | PointerAlignment: Left 9 | 10 | ... 11 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: '*,-android-*,-bugprone-bool-pointer-implicit-conversion,-bugprone-exception-escape,-bugprone-infinite-loop,-bugprone-signed-char-misuse,-cert-dcl16-c,-cert-dcl37-c,-cert-dcl50-cpp,-cert-dcl51-cpp,-cert-dcl54-cpp,-cert-dcl59-cpp,-cert-env33-c,-cert-err09-cpp,-cert-err61-cpp,-cert-fio38-c,-cert-mem57-cpp,-cert-msc30-c,-cert-msc32-c,-cert-oop11-cpp,-cert-oop57-cpp,-cert-oop58-cpp,-cert-pos44-c,-clang-analyzer-*,-cppcoreguidelines-avoid-magic-numbers,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-type-cstyle-cast,-darwin-*,-fuchsia-*,-google-*,google-default-arguments,google-explicit-constructor,google-runtime-operator,-hicpp-*,hicpp-exception-baseclass,hicpp-multiway-paths-covered,hicpp-signed-bitwise,-linuxkernel-*,-llvm-*,-llvmlibc-*,-misc-definitions-in-headers,-misc-non-private-member-variables-in-classes,-misc-unused-alias-decls,-misc-unused-parameters,-misc-unused-using-decls,-modernize-use-trailing-return-type,-objc-*,-openmp-exception-escape,-readability-braces-around-statements,-readability-else-after-return,-readability-function-size,-readability-identifier-naming,-readability-implicit-bool-conversion,-readability-isolate-declaration,-readability-magi
c-numbers,-readability-named-parameter,-readability-qualified-auto,-readability-redundant-access-specifiers,-readability-redundant-member-init,-readability-redundant-preprocessor,-readability-simplify-boolean-expr,-readability-uppercase-literal-suffix,-zircon-*' 3 | WarningsAsErrors: '-*' 4 | HeaderFilterRegex: '' 5 | FormatStyle: none 6 | -------------------------------------------------------------------------------- /.github/workflows/macos-unit.yml: -------------------------------------------------------------------------------- 1 | name: "Unit tests: macOS" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | build-and-test: 15 | name: Unit tests on macOS 16 | runs-on: ${{ matrix.os }} 17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }} 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | include: 22 | - os: macos-13 23 | - os: macos-14 24 | 25 | steps: 26 | 27 | - name: checkout repo & submodules 28 | uses: actions/checkout@v4 29 | with: 30 | submodules: true 31 | fetch-depth: 0 32 | 33 | - name: cache vcpkg installed packages 34 | uses: actions/cache@v4 35 | id: cache 36 | with: 37 | path: | 38 | vcpkg/ 39 | build_dir/vcpkg_installed/ 40 | key: ${{ runner.os }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 41 | 42 | - name: install dependencies 43 | run: | 44 | brew install libomp llvm pkg-config 45 | 46 | - name: make build directory 47 | run: mkdir -p build_dir 48 | 49 | - name: cmake configure 50 | run: cmake .. -DCMAKE_CXX_COMPILER=$(brew --prefix llvm)/bin/clang++ -DCMAKE_C_COMPILER=$(brew --prefix llvm)/bin/clang 51 | working-directory: build_dir 52 | 53 | - name: cmake build 54 | run: cmake --build . 
--parallel 3 --target ASMC_unit_tests 55 | working-directory: build_dir 56 | 57 | - name: cmake test 58 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure 59 | working-directory: build_dir 60 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Build python wheels 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | release: 8 | types: 9 | - published 10 | workflow_dispatch: 11 | 12 | jobs: 13 | build_wheels_cloud: 14 | name: Build wheels on ${{ matrix.os }} 15 | runs-on: ${{ matrix.os }} 16 | if: github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | include: 21 | - os: ubuntu-24.04 22 | arch: auto64 23 | py-vers: cp39-* cp310-* cp311-* cp312-* cp313-* 24 | before-all: | 25 | dnf -y groupinstall "Development Tools" 26 | dnf -y install git 27 | extra-env: "" 28 | mdt: "" 29 | - os: macos-13 30 | arch: x86_64 31 | py-vers: cp39-* cp310-* cp311-* cp312-* cp313-* 32 | before-all: brew install cmake libomp llvm pkg-config 33 | extra-env: CC="$(brew --prefix llvm)/bin/clang" CXX="$(brew --prefix llvm)/bin/clang++" HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1 34 | mdt: 13 35 | - os: macos-14 36 | arch: arm64 37 | py-vers: cp39-* cp310-* cp311-* cp312-* cp313-* 38 | before-all: brew install cmake libomp llvm pkg-config 39 | extra-env: CC="$(brew --prefix llvm)/bin/clang" CXX="$(brew --prefix llvm)/bin/clang++" HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1 40 | mdt: 14 41 | 42 | env: 43 | CIBW_BUILD: ${{ matrix.py-vers }} 44 | CIBW_SKIP: cp3*-musllinux_* 45 | CIBW_ARCHS: ${{ matrix.arch }} 46 | CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 47 | CIBW_BEFORE_ALL: ${{ matrix.before-all }} 48 | CIBW_BEFORE_BUILD: pip install --upgrade pip setuptools wheel ninja numpy cython 49 | CIBW_ENVIRONMENT: ${{ matrix.extra-env }} 50 | MACOSX_DEPLOYMENT_TARGET: 
${{ matrix.mdt }} 51 | 52 | steps: 53 | 54 | - name: checkout repo & submodules 55 | uses: actions/checkout@v4 56 | with: 57 | submodules: true 58 | fetch-depth: 0 59 | 60 | - name: Build wheels 61 | uses: pypa/cibuildwheel@v2.19.2 62 | 63 | - uses: actions/upload-artifact@v4 64 | with: 65 | name: wheels-cloud-${{ matrix.os }} 66 | path: ./wheelhouse/*.whl 67 | retention-days: 1 68 | 69 | upload_all: 70 | name: Upload to PyPI 71 | needs: [build_wheels_cloud] 72 | runs-on: ubuntu-latest 73 | if: ${{ github.event_name == 'release' && github.event.action == 'published' && github.repository == 'PalamaraLab/ASMC' }} 74 | 75 | steps: 76 | - name: Download wheels from cloud runners 77 | uses: actions/download-artifact@v4 78 | with: 79 | pattern: wheels-cloud-* 80 | merge-multiple: true 81 | path: wheels 82 | 83 | - uses: pypa/gh-action-pypi-publish@v1.10.1 84 | with: 85 | user: __token__ 86 | password: ${{ secrets.PYPI_TOKEN }} 87 | packages_dir: wheels/ 88 | -------------------------------------------------------------------------------- /.github/workflows/sync.yml: -------------------------------------------------------------------------------- 1 | name: Sync 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | private-to-public: 10 | if: github.repository == 'PalamaraLab/ASMC_dev' 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout private repo 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | persist-credentials: false 18 | 19 | - name: Mirror main to public repo 20 | run: | 21 | remote_repo="https://fcooper8472:${{ secrets.DEPLOY_ACCESS_TOKEN }}@github.com/PalamaraLab/ASMC.git" 22 | git fetch "${remote_repo}" main 23 | if ! git diff --quiet HEAD FETCH_HEAD; then 24 | git push "${remote_repo}" HEAD:main --follow-tags --force 25 | echo "Changes detected and pushed to public repo." 26 | else 27 | echo "No changes detected. No push needed." 
28 | fi 29 | 30 | public-to-private: 31 | if: github.repository == 'PalamaraLab/ASMC' 32 | runs-on: ubuntu-latest 33 | steps: 34 | - name: Checkout public repo 35 | uses: actions/checkout@v4 36 | with: 37 | fetch-depth: 0 38 | persist-credentials: false 39 | 40 | - name: Mirror main to private repo 41 | run: | 42 | remote_repo="https://fcooper8472:${{ secrets.DEPLOY_ACCESS_TOKEN }}@github.com/PalamaraLab/ASMC_dev.git" 43 | git fetch "${remote_repo}" main 44 | if ! git diff --quiet HEAD FETCH_HEAD; then 45 | git push "${remote_repo}" HEAD:main --follow-tags --force 46 | echo "Changes detected and pushed to private repo." 47 | else 48 | echo "No changes detected. No push needed." 49 | fi 50 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-asan.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu asan 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | build-and-test: 15 | name: Unit tests with address sanitizer on Ubuntu 16 | runs-on: ubuntu-22.04 17 | env: 18 | CXX: clang++-14 19 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }} 20 | 21 | steps: 22 | 23 | - name: checkout repo & submodules 24 | uses: actions/checkout@v3 25 | with: 26 | submodules: true 27 | fetch-depth: 0 28 | 29 | - name: cache vcpkg installed packages 30 | uses: actions/cache@v4 31 | id: cache 32 | with: 33 | path: | 34 | vcpkg/ 35 | build_dir/vcpkg_installed/ 36 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 37 | 38 | - name: install openmp for LLVM compiler 39 | run: sudo apt install libomp-dev 40 | 41 | - name: make build directory 42 | run: mkdir -p build_dir 43 | 44 | - name: cmake configure 45 | run: cmake .. 
-DASMC_MEMCHECK:BOOL=TRUE 46 | working-directory: build_dir 47 | 48 | - name: cmake build 49 | run: cmake --build . --parallel 2 --target ASMC_unit_tests 50 | working-directory: build_dir 51 | 52 | - name: cmake test 53 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure 54 | working-directory: build_dir 55 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-coverage.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu coverage 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | 9 | jobs: 10 | 11 | build-and-test: 12 | name: Coverage on Ubuntu 13 | runs-on: ubuntu-22.04 14 | if: ${{ github.repository == 'PalamaraLab/ASMC' }} 15 | 16 | steps: 17 | 18 | - name: checkout repo & submodules 19 | uses: actions/checkout@v3 20 | with: 21 | submodules: true 22 | fetch-depth: 0 23 | 24 | - name: cache vcpkg installed packages 25 | uses: actions/cache@v4 26 | id: cache 27 | with: 28 | path: | 29 | vcpkg/ 30 | build_dir/vcpkg_installed/ 31 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 32 | 33 | - name: install tools 34 | run: | 35 | sudo apt -y update 36 | sudo apt -y install lcov libcurl4-openssl-dev 37 | 38 | - name: make build directory 39 | run: mkdir -p build_dir 40 | 41 | - name: configure 42 | run: | 43 | cmake .. -DCMAKE_BUILD_TYPE=Debug -DASMC_ENABLE_COVERAGE=ON 44 | working-directory: build_dir 45 | 46 | - name: build 47 | run: | 48 | cmake --build . --parallel 2 --target ASMC_unit_tests 49 | working-directory: build_dir 50 | 51 | - name: test 52 | run: | 53 | ctest -j2 -R Asmc_unit_tests --output-on-failure 54 | working-directory: build_dir 55 | 56 | - name: process coverage 57 | run: | 58 | lcov --directory . 
--capture --output-file coverage.info 59 | lcov --remove coverage.info '/usr/*' '*/test/*' '*/vcpkg_installed/*' --output-file coverage.info 60 | lcov --list coverage.info 61 | working-directory: build_dir 62 | 63 | - name: upload coverage to codecov 64 | run: | 65 | curl -Os https://uploader.codecov.io/latest/linux/codecov 66 | chmod +x codecov 67 | ./codecov 68 | working-directory: build_dir 69 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-no-sse.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu no sse/avx 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | build-and-test: 15 | name: Unit tests with out sse/avx on Ubuntu 16 | runs-on: ubuntu-22.04 17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }} 18 | 19 | steps: 20 | 21 | - name: checkout repo & vcpkg submodule 22 | uses: actions/checkout@v3 23 | with: 24 | submodules: true 25 | fetch-depth: 0 26 | 27 | - name: cache vcpkg installed packages 28 | uses: actions/cache@v4 29 | id: cache 30 | with: 31 | path: | 32 | vcpkg/ 33 | build_dir/vcpkg_installed/ 34 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 35 | 36 | - name: make build directory 37 | run: mkdir -p build_dir 38 | 39 | - name: cmake configure 40 | run: cmake .. -DASMC_FORCE_PURE:BOOL=TRUE 41 | working-directory: build_dir 42 | 43 | - name: cmake build 44 | run: cmake --build . 
--parallel 2 --target ASMC_unit_tests 45 | working-directory: build_dir 46 | 47 | - name: cmake test 48 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure 49 | working-directory: build_dir 50 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-python.yml: -------------------------------------------------------------------------------- 1 | name: Python 3.8 3.11 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | build-and-test: 15 | name: Unit tests via Python on Ubuntu 16 | runs-on: ubuntu-22.04 17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }} 18 | 19 | strategy: 20 | matrix: 21 | python-version: [3.8, 3.11] 22 | 23 | steps: 24 | - name: checkout repo & submodules 25 | uses: actions/checkout@v3 26 | with: 27 | submodules: true 28 | fetch-depth: 0 29 | 30 | - name: cache vcpkg installed packages 31 | uses: actions/cache@v4 32 | id: cache 33 | with: 34 | path: | 35 | vcpkg/ 36 | build_dir/vcpkg_installed/ 37 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 38 | 39 | - name: Set up Python ${{ matrix.python-version }} 40 | uses: actions/setup-python@v2 41 | with: 42 | python-version: ${{ matrix.python-version }} 43 | architecture: x64 44 | 45 | - name: install python bindings 46 | run: | 47 | python -m pip install --upgrade pip setuptools wheel ninja 48 | python -m pip install . 
49 | 50 | - name: python unit tests 51 | run: | 52 | python -m unittest discover test "test_unit*.py" 53 | 54 | - name: python regression tests 55 | run: | 56 | python -m unittest discover test "test_regression.py" 57 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-regression.yml: -------------------------------------------------------------------------------- 1 | name: Regression test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | build-and-test: 15 | name: Regression test on Ubuntu 16 | runs-on: ubuntu-22.04 17 | env: 18 | CXX: g++-10 19 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }} 20 | 21 | steps: 22 | - name: checkout repo & submodules 23 | uses: actions/checkout@v3 24 | with: 25 | submodules: true 26 | fetch-depth: 0 27 | 28 | - name: cache vcpkg installed packages 29 | uses: actions/cache@v4 30 | id: cache 31 | with: 32 | path: | 33 | vcpkg/ 34 | build_dir/vcpkg_installed/ 35 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 36 | 37 | - name: make build directory 38 | run: mkdir -p build_dir 39 | 40 | - name: cmake configure 41 | run: cmake .. -DCMAKE_BUILD_TYPE=Release 42 | working-directory: build_dir 43 | 44 | - name: cmake build 45 | run: cmake --build . 
--parallel 2 --target ASMC_regression 46 | working-directory: build_dir 47 | 48 | - name: cmake test 49 | run: ctest -R regression --output-on-failure 50 | working-directory: build_dir 51 | -------------------------------------------------------------------------------- /.github/workflows/ubuntu-unit.yml: -------------------------------------------------------------------------------- 1 | name: "Unit tests: Ubuntu" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - '**' 10 | workflow_dispatch: 11 | 12 | jobs: 13 | 14 | build-and-test: 15 | name: Unit tests on Ubuntu 16 | runs-on: ubuntu-24.04 17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }} 18 | 19 | steps: 20 | 21 | - name: checkout repo & submodules 22 | uses: actions/checkout@v4 23 | with: 24 | submodules: true 25 | fetch-depth: 0 26 | 27 | - name: cache vcpkg installed packages 28 | uses: actions/cache@v4 29 | id: cache 30 | with: 31 | path: | 32 | vcpkg/ 33 | build_dir/vcpkg_installed/ 34 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }} 35 | 36 | - name: make build directory 37 | run: mkdir -p build_dir 38 | 39 | - name: cmake configure 40 | run: cmake .. 41 | working-directory: build_dir 42 | 43 | - name: cmake build 44 | run: cmake --build . 
--parallel 4 --target ASMC_unit_tests 45 | working-directory: build_dir 46 | 47 | - name: cmake test 48 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure 49 | working-directory: build_dir 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # Byte-compiled / optimized / DLL files 35 | __pycache__/ 36 | *.py[cod] 37 | *$py.class 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | build/ 45 | develop-eggs/ 46 | dist/ 47 | downloads/ 48 | eggs/ 49 | .eggs/ 50 | lib/ 51 | lib64/ 52 | parts/ 53 | sdist/ 54 | var/ 55 | wheels/ 56 | pip-wheel-metadata/ 57 | share/python-wheels/ 58 | *.egg-info/ 59 | .installed.cfg 60 | *.egg 61 | MANIFEST 62 | 63 | # PyInstaller 64 | # Usually these files are written by a python script from a template 65 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
66 | *.manifest 67 | *.spec 68 | 69 | # Installer logs 70 | pip-log.txt 71 | pip-delete-this-directory.txt 72 | 73 | # Unit test / coverage reports 74 | htmlcov/ 75 | .tox/ 76 | .nox/ 77 | .coverage 78 | .coverage.* 79 | .cache 80 | nosetests.xml 81 | coverage.xml 82 | *.cover 83 | *.py,cover 84 | .hypothesis/ 85 | .pytest_cache/ 86 | 87 | # Translations 88 | *.mo 89 | *.pot 90 | 91 | # Django stuff: 92 | *.log 93 | local_settings.py 94 | db.sqlite3 95 | db.sqlite3-journal 96 | 97 | # Flask stuff: 98 | instance/ 99 | .webassets-cache 100 | 101 | # Scrapy stuff: 102 | .scrapy 103 | 104 | # Sphinx documentation 105 | docs/_build/ 106 | 107 | # PyBuilder 108 | target/ 109 | 110 | # Jupyter Notebook 111 | .ipynb_checkpoints 112 | 113 | # IPython 114 | profile_default/ 115 | ipython_config.py 116 | 117 | # pyenv 118 | .python-version 119 | 120 | # pipenv 121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 123 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 124 | # install all needed dependencies. 125 | #Pipfile.lock 126 | 127 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | ####################### 165 | # Custom ASMC-related # 166 | ####################### 167 | 168 | .vscode 169 | .idea 170 | cmake-build-* 171 | build 172 | 173 | **/.uuid 174 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "pybind11"] 2 | path = pybind11 3 | url = https://github.com/pybind/pybind11 4 | [submodule "vcpkg"] 5 | path = vcpkg 6 | url = https://github.com/microsoft/vcpkg 7 | [submodule "ASMC_data"] 8 | path = ASMC_data 9 | url = https://github.com/PalamaraLab/ASMC_data 10 | [submodule "DataModule"] 11 | path = DataModule 12 | url = https://github.com/PalamaraLab/DataModule/ 13 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally build your docs in 
additional formats such as PDF and ePub 19 | # formats: 20 | # - pdf 21 | # - epub 22 | 23 | # Optional but recommended, declare the Python requirements required 24 | # to build your documentation 25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 26 | python: 27 | install: 28 | - requirements: docs/requirements.txt 29 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, University of Oxford. 2 | All rights reserved. 3 | 4 | University of Oxford means the Chancellor, Masters and Scholars of the 5 | University of Oxford, having an administrative office at Wellington 6 | Square, Oxford OX1 2JD, UK. 7 | 8 | This program is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program. If not, see <https://www.gnu.org/licenses/>.
20 | -------------------------------------------------------------------------------- /PyPI_README.md: -------------------------------------------------------------------------------- 1 | [![Unit tests: Ubuntu](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml) 2 | [![Unit tests: macOS](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml) 3 | [![Python 3.8 3.11](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml) 4 | [![Regression test](https://github.com/PalamaraLab/ASMC/workflows/Regression%20test/badge.svg)](https://github.com/PalamaraLab/ASMC/actions) 5 | [![Ubuntu asan](https://github.com/PalamaraLab/ASMC/workflows/Ubuntu%20asan/badge.svg)](https://github.com/PalamaraLab/ASMC/actions) 6 | [![Ubuntu no sse/avx](https://github.com/PalamaraLab/ASMC/workflows/Ubuntu%20no%20sse/avx/badge.svg)](https://github.com/PalamaraLab/ASMC/actions) 7 | [![codecov](https://codecov.io/gh/PalamaraLab/ASMC/branch/main/graph/badge.svg)](https://codecov.io/gh/PalamaraLab/ASMC) 8 | 9 | # ASMC and FastSMC 10 | 11 | This repository contains ASMC and an extension, FastSMC, together with python bindings for both. 
12 | 13 | ## Quickstart 14 | 15 | ### Install the Python module from PyPI 16 | 17 | Most functionality is available through a Python module which can be installed with: 18 | 19 | ```bash 20 | pip install asmc-asmc 21 | ``` 22 | 23 | ### Documentation 24 | 25 | The following pages of documentation contain specific information: 26 | - [Quickstart guide for users](https://github.com/PalamaraLab/ASMC/blob/main/docs/quickstart_user.md) 27 | - [ASMC python docs](https://github.com/PalamaraLab/ASMC/blob/main/docs/asmc_python.md) 28 | - [FastSMC python docs](https://github.com/PalamaraLab/ASMC/blob/main/docs/fastsmc_python.md) 29 | 30 | This Python module is currently available on Linux and macOS. 31 | 32 | Example Jupyter notebooks showcasing basic functionality can be found here: 33 | - [Example notebooks](https://github.com/PalamaraLab/ASMC/tree/main/notebooks) 34 | 35 | ## License 36 | 37 | ASMC and FastSMC are distributed under the GNU General Public License v3.0 (GPLv3). For any questions or comments on ASMC, please contact Pier Palamara using `@stats.ox.ac.uk`. 38 | 39 | ## Reference 40 | 41 | If you use this software, please cite the appropriate reference(s) below. 42 | 43 | The ASMC algorithm and software were developed in 44 | - P. Palamara, J. Terhorst, Y. Song, A. Price. High-throughput inference of pairwise coalescence times identifies signals of selection and enriched disease heritability. *Nature Genetics*, 2018. 45 | 46 | The FastSMC algorithm and software were developed in 47 | - J. Nait Saada, G. Kalantzis, D. Shyr, F. Cooper, M. Robinson, A. Gusev, P. F. Palamara. Identity-by-descent detection across 487,409 British samples reveals fine-scale evolutionary history and trait associations. *Nature Communications*, 2020.
48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Unit tests: Ubuntu](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml) 2 | [![Unit tests: macOS](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml) 3 | [![Python 3.8 3.11](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml) 4 | [![Regression test](https://github.com/PalamaraLab/ASMC/workflows/Regression%20test/badge.svg)](https://github.com/PalamaraLab/ASMC/actions) 5 | [![Ubuntu asan](https://github.com/PalamaraLab/ASMC/workflows/Ubuntu%20asan/badge.svg)](https://github.com/PalamaraLab/ASMC/actions) 6 | [![Ubuntu no sse/avx](https://github.com/PalamaraLab/ASMC/workflows/Ubuntu%20no%20sse/avx/badge.svg)](https://github.com/PalamaraLab/ASMC/actions) 7 | [![codecov](https://codecov.io/gh/PalamaraLab/ASMC/branch/main/graph/badge.svg)](https://codecov.io/gh/PalamaraLab/ASMC) 8 | 9 | # ASMC and FastSMC 10 | 11 | This repository contains ASMC and an extension, FastSMC, together with python bindings for both. 12 | 13 | The following pages of documentation contain specific information: 14 | - [ASMC](./docs/asmc.md) 15 | - [ASMC python bindings](./docs/asmc_python.md) 16 | - [FastSMC](./docs/fastsmc.md) 17 | - [FastSMC python bindings](./docs/fastsmc_python.md) 18 | 19 | ## Installation 20 | 21 | ASMC and FastSMC are regularly built and tested on Ubuntu and macOS. 22 | They consist of a C++ library, C++ executables, and optional Python bindings.
23 | 24 | The C++ libraries and executables require: 25 | 26 | - A C++ compiler (C++17 or later) 27 | - CMake (3.15 or later) 28 | - Boost (1.62 or later) 29 | - Eigen (3.3.4 or later) 30 | - {fmt} 31 | - range-v3 32 | - OpenMP 33 | - zlib 34 | 35 | We recommend installing dependencies using vcpkg, distributed with this repository as a submodule. 36 | Information below. 37 | 38 | Building the optional Python bindings additionally requires: 39 | 40 | - Python (3.6 or later) with development files 41 | - PyBind11 (distributed with this repository as a submodule) 42 | 43 | ## Quickstart guides 44 | 45 | - [For users](./docs/quickstart_user.md) 46 | - [For developers](./docs/quickstart_developer.md) 47 | 48 | ## Decoding Quantities 49 | 50 | Decoding quantities files are required in order to run ASMC and FastSMC. 51 | These can be generated directly from a Python module, and instructions can be found [here](https://github.com/PalamaraLab/PrepareDecoding). 52 | Input and output file formats for the tool used to create decoding quantities are described [here](https://github.com/PalamaraLab/PrepareDecoding/blob/master/docs/file_formats.md). 53 | 54 | Note: the CEU.demo demographic model and the decoding quantities for CEU+UKBB previously provided in [this repository](https://github.com/PalamaraLab/FastSMC) and [this repository](https://github.com/PalamaraLab/ASMC_legacy) were mistakenly encoded as diploid rather than haploid. 55 | The file [CEU.demo](https://github.com/PalamaraLab/ASMC_data/tree/main/demographies) and CEU+UKBB decoding quantities [here](https://github.com/PalamaraLab/ASMC_data/tree/main/decoding_quantities) have now been fixed. 56 | They were generated using [v2.2.1](https://github.com/PalamaraLab/PrepareDecoding/releases/tag/v2.2.1) of the [PrepareDecoding tool](https://github.com/PalamaraLab/PrepareDecoding), which also provides a simpler interface for computing decoding quantities as well as support for additional demographic models. 
57 | Using these new decoding quantities with v1.2 of ASMC will tend to produce more recent estimates for TMRCAs compared to the decoding quantities distributed with v1.0 and v1.1. 58 | This should not have a substantial impact on most downstream analyses. 59 | 60 | ## For developers: making a release 61 | 62 | - Bump the version number in [setup.py](setup.py), [CMakeLists.txt](CMakeLists.txt) and [conf.py](docs/conf.py) 63 | - Update [RELEASE_NOTES.md](RELEASE_NOTES.md) 64 | - Push changes and check that all [GitHub workflows](https://github.com/PalamaraLab/ASMC/actions) pass 65 | - Tag the commit in Git using syntax `vX.Y` 66 | - Make a release on GitHub, which should trigger a new build that will upload Python wheels to PyPI 67 | 68 | ## License 69 | 70 | ASMC and FastSMC are distributed under the GNU General Public License v3.0 (GPLv3). For any questions or comments on ASMC, please contact Pier Palamara using `@stats.ox.ac.uk`. 71 | 72 | ## Reference 73 | 74 | If you use this software, please cite the appropriate reference(s) below. 75 | 76 | The ASMC algorithm and software were developed in 77 | - P. Palamara, J. Terhorst, Y. Song, A. Price. High-throughput inference of pairwise coalescence times identifies signals of selection and enriched disease heritability. *Nature Genetics*, 2018. 78 | 79 | The FastSMC algorithm and software were developed in 80 | - J. Nait Saada, G. Kalantzis, D. Shyr, F. Cooper, M. Robinson, A. Gusev, P. F. Palamara. Identity-by-descent detection across 487,409 British samples reveals fine-scale evolutionary history and trait associations. *Nature Communications*, 2020. 
81 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | # ASMC Release Notes 2 | 3 | ## v1.3.1 (2023-06-30) 4 | 5 | ### Breaking changes 6 | 7 | None 8 | 9 | ### Other changes 10 | 11 | - The location of a `.map` or `.map.gz` file can now be optionally specified explicitly: previously it was assumed to be at the `inFileRoot`. 12 | 13 | 14 | ## v1.3 (2023-03-03) 15 | 16 | ### Breaking changes 17 | 18 | None 19 | 20 | ### Other changes 21 | 22 | - Decoding a batch can now be done in a selected subregion with from / to indices. 23 | A `cm_burn_in` parameter takes into account additional variants on either side of the subregion for HMM burn-in. 24 | - Allow the user to access selected attributes of the DecodingParams and Data from the ASMC object. 25 | - Python continuous integration now uses Python 3.8 and 3.11 (previously 3.6 and 3.9) 26 | - Update Catch to v2.13. 27 | 28 | 29 | ## v1.2 (2021-09-28) 30 | 31 | All functionality for ASMC and FastSMC is now in this repository ([link](https://github.com/PalamaraLab/ASMC)). 32 | 33 | ### Breaking changes 34 | 35 | - Fixed an issue with demographic models. 36 | The `CEU.demo` demographic model and the decoding quantities for CEU+UKBB previously provided in the repository were mistakenly encoded as diploid rather than haploid. 37 | CEU.demo and CEU+UKBB decoding quantities have now been updated and can be found in [this repository](https://github.com/PalamaraLab/ASMC_data). 38 | Also see the manual for a note on how this affects analyses. 39 | 40 | ### Other changes 41 | 42 | - New API for decoding pairs with ASMC. 43 | In addition to running full analyses as described in the ASMC paper, users can now decode specific pairs and get back a variety of summary statistics. 44 | See the [ASMC python documentation](https://github.com/PalamaraLab/ASMC/blob/main/docs/asmc_python.md) for details. 
45 | - New, more extensive, [documentation](https://github.com/PalamaraLab/ASMC/blob/main/docs/) is available. 46 | 47 | 48 | ## v1.1 (2021-01-20) 49 | 50 | [Legacy repository](https://github.com/PalamaraLab/FastSMC/releases/tag/v1.1) 51 | 52 | Improvements to documentation and default use. 53 | No changes to any core functionality. 54 | 55 | ### Breaking changes 56 | 57 | - The hashing functionality, previously named `GERMLINE`, has been renamed to `hashing`. 58 | This includes the command line flag for turning this behaviour on/off, which is now `--hashing`. 59 | 60 | ### Other changes 61 | 62 | - `--hashing` is now ON by default when running the FastSMC executable: previously, `--GERMLINE` was OFF by default. 63 | - Extra output, including the IBD segment length, posterior mean, and MAP, is now on by default. 64 | This behaviour can be toggled with the flags `--segmentLength`, `--perPairPosteriorMeans`, `--perPairMAP`. 65 | - An example script has been added to `cpp_example/FastSMC_example_multiple_jobs.sh` that demonstrates how to run FastSMC with multiple jobs simultaneously. 66 | - The README has been updated to focus on FastSMC functionality. 67 | - More robust checking is now used to verify the decoding quantities file is correct before reading it. 68 | - CMake will now, by default, build in Release mode (giving `-O3` optimisation on Linux). 69 | Previously, Debug was used by default. 70 | 71 | 72 | ## v1.0 (2020-09-18) 73 | 74 | [Legacy repository](https://github.com/PalamaraLab/FastSMC/releases/tag/v1.0) 75 | 76 | First public release of FastSMC, with functionality as described and used in [this paper](https://doi.org/10.1038/s41467-020-19588-x). 
77 | -------------------------------------------------------------------------------- /asmc/asmc: -------------------------------------------------------------------------------- 1 | ../src -------------------------------------------------------------------------------- /cmake/AutodetectVcpkgToolchainFile.cmake: -------------------------------------------------------------------------------- 1 | # This file is part of https://github.com/PalamaraLab/ASMC which is released under the GPL-3.0 license. 2 | # See accompanying LICENSE and COPYING for copyright notice and full details. 3 | 4 | # If a VCPKG toolchain file is not defined, but the expected file exists, use it 5 | if (NOT DEFINED CMAKE_TOOLCHAIN_FILE) 6 | if (EXISTS ${CMAKE_SOURCE_DIR}/vcpkg/scripts/buildsystems/vcpkg.cmake) 7 | set(vcpkg_toolchain_file ${CMAKE_SOURCE_DIR}/vcpkg/scripts/buildsystems/vcpkg.cmake) 8 | message(STATUS "Detected vcpkg toolchain file at ${vcpkg_toolchain_file}") 9 | set(CMAKE_TOOLCHAIN_FILE ${vcpkg_toolchain_file}) 10 | endif () 11 | endif () 12 | -------------------------------------------------------------------------------- /cmake/CheckDataModule.cmake: -------------------------------------------------------------------------------- 1 | # This file is part of https://github.com/PalamaraLab/ASMC which is released under the GPL-3.0 license. 2 | # See accompanying LICENSE and COPYING for copyright notice and full details. 3 | 4 | # Stop configuration with a helpful message if the DataModule submodule has not been checked out 5 | if (NOT EXISTS ${CMAKE_SOURCE_DIR}/DataModule/README.md) 6 | message(FATAL_ERROR " 7 | The data module ${ASMC_data_module_dir} does not exist, and it is required for ASMC. 8 | Please either get all submodules when you clone ASMC: 9 | $ git clone --recurse-submodules https://github.com/PalamaraLab/ASMC.git 10 | or, at minimum, initialise the data module. 
From the ASMC directory: 11 | $ git submodule update --init DataModule 12 | Please see this quickstart guide for further information: 13 | https://github.com/PalamaraLab/ASMC/blob/main/docs/quickstart_user.md 14 | ") 15 | endif () 16 | -------------------------------------------------------------------------------- /cmake/FindGMP.cmake: -------------------------------------------------------------------------------- 1 | # Reproduced from https://github.com/dune-project/dune-common under the terms of version 2 of the GNU General Public 2 | # License (https://github.com/dune-project/dune-common/blob/master/LICENSE.md) 3 | 4 | #[=======================================================================[.rst: 5 | FindGMP 6 | ------- 7 | 8 | Find the GNU MULTI-Precision Bignum (GMP) library 9 | and the corresponding C++ bindings GMPxx. 10 | 11 | This module searches for both libraries and only considers the package 12 | found if both can be located. It then defines separate targets for the C 13 | and the C++ library. 14 | 15 | Imported Targets 16 | ^^^^^^^^^^^^^^^^ 17 | 18 | This module provides the following imported targets, if found: 19 | 20 | ``GMP::gmp`` 21 | Library target of the C library. 22 | ``GMP::gmpxx`` 23 | Library target of the C++ library, which also links to the C library. 24 | 25 | Result Variables 26 | ^^^^^^^^^^^^^^^^ 27 | 28 | This will define the following variables: 29 | 30 | ``GMP_FOUND`` 31 | True if the GMP library, the GMPxx headers and 32 | the GMPxx library were found. 33 | 34 | Cache Variables 35 | ^^^^^^^^^^^^^^^ 36 | 37 | You may set the following variables to modify the behaviour of 38 | this module: 39 | 40 | ``GMP_INCLUDE_DIR`` 41 | The directory containing ``gmp.h``. 42 | ``GMP_LIB`` 43 | The path to the gmp library. 44 | ``GMPXX_INCLUDE_DIR`` 45 | The directory containing ``gmpxx.h``. 46 | ``GMPXX_LIB`` 47 | The path to the gmpxx library. 
48 | 49 | #]=======================================================================] 50 | 51 | # Add a feature summary for this package 52 | include(FeatureSummary) 53 | set_package_properties(GMP PROPERTIES 54 | DESCRIPTION "GNU multi-precision library" 55 | URL "https://gmplib.org" 56 | ) 57 | 58 | # Try finding the package with pkg-config 59 | find_package(PkgConfig QUIET) 60 | pkg_check_modules(PKG QUIET gmp gmpxx) 61 | 62 | # Try to locate the libraries and their headers, using pkg-config hints 63 | find_path(GMP_INCLUDE_DIR gmp.h HINTS ${PKG_gmp_INCLUDEDIR}) 64 | find_library(GMP_LIB gmp HINTS ${PKG_gmp_LIBDIR}) 65 | 66 | find_path(GMPXX_INCLUDE_DIR gmpxx.h HINTS ${PKG_gmpxx_INCLUDEDIR}) 67 | find_library(GMPXX_LIB gmpxx HINTS ${PKG_gmpxx_LIBDIR}) 68 | 69 | # Remove these variables from cache inspector 70 | mark_as_advanced(GMP_INCLUDE_DIR GMP_LIB GMPXX_INCLUDE_DIR GMPXX_LIB) 71 | 72 | # Report if package was found 73 | include(FindPackageHandleStandardArgs) 74 | find_package_handle_standard_args(GMP 75 | DEFAULT_MSG 76 | GMPXX_LIB GMPXX_INCLUDE_DIR GMP_INCLUDE_DIR GMP_LIB 77 | ) 78 | 79 | # Set targets 80 | if(GMP_FOUND) 81 | # C library 82 | if(NOT TARGET GMP::gmp) 83 | add_library(GMP::gmp UNKNOWN IMPORTED) 84 | set_target_properties(GMP::gmp PROPERTIES 85 | IMPORTED_LOCATION ${GMP_LIB} 86 | INTERFACE_INCLUDE_DIRECTORIES ${GMP_INCLUDE_DIR} 87 | ) 88 | endif() 89 | 90 | # C++ library, which requires a link to the C library 91 | if(NOT TARGET GMP::gmpxx) 92 | add_library(GMP::gmpxx UNKNOWN IMPORTED) 93 | set_target_properties(GMP::gmpxx PROPERTIES 94 | IMPORTED_LOCATION ${GMPXX_LIB} 95 | INTERFACE_INCLUDE_DIRECTORIES ${GMPXX_INCLUDE_DIR} 96 | INTERFACE_LINK_LIBRARIES GMP::gmp 97 | ) 98 | endif() 99 | endif() 100 | -------------------------------------------------------------------------------- /cmake/SIMD.cmake: -------------------------------------------------------------------------------- 1 | # fast-math and SIMD instruction settings below has been copied 
and modified from 2 | # the GLM library CMakeLists.txt (MIT license) 3 | # 4 | # https://github.com/g-truc/glm/blob/master/CMakeLists.txt 5 | 6 | option(ASMC_ENABLE_FAST_MATH "Enable fast math optimizations" OFF) 7 | if(ASMC_ENABLE_FAST_MATH) 8 | message(STATUS "Build with fast math optimizations") 9 | 10 | if((CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "GNU")) 11 | add_compile_options(-ffast-math) 12 | 13 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 14 | add_compile_options(/fp:fast) 15 | endif() 16 | else() 17 | if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 18 | add_compile_options(/fp:precise) 19 | endif() 20 | endif() 21 | 22 | option(ASMC_ENABLE_SIMD_SSE3 "Enable SSE 1, 2 & 3 optimizations" OFF) 23 | option(ASMC_ENABLE_SIMD_AVX "Enable AVX optimizations" ON) 24 | option(ASMC_ENABLE_SIMD_AVX512 "Enable AVX512 optimizations" OFF) 25 | option(ASMC_FORCE_PURE "Force 'pure' instructions" OFF) 26 | 27 | if(ASMC_FORCE_PURE) 28 | add_definitions(-DNO_SSE) 29 | 30 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") 31 | add_compile_options(-mfpmath=387) 32 | endif() 33 | message(STATUS "No SIMD instruction set") 34 | 35 | elseif(ASMC_ENABLE_SIMD_AVX) 36 | add_definitions(-DAVX) 37 | add_compile_definitions(EIGEN_MAX_ALIGN_BYTES=64) 38 | 39 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) 40 | add_compile_options(-mavx) 41 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") 42 | add_compile_options(/QxAVX) 43 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 44 | add_compile_options(/arch:AVX) 45 | endif() 46 | message(STATUS "AVX instruction set") 47 | 48 | elseif(ASMC_ENABLE_SIMD_AVX512) 49 | add_definitions(-DAVX) 50 | 51 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) 52 | add_compile_options(-mavx512f) 53 | add_compile_options(-mavx512cd) 54 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") 55 | add_compile_options(-xCOMMON-AVX512) 56 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") 
57 | add_compile_options(/arch:AVX512) 58 | endif() 59 | message(STATUS "AVX-512 instruction set") 60 | 61 | elseif(ASMC_ENABLE_SIMD_SSE3 OR ASMC_ENABLE_SIMD_SSE) # the option is declared as ASMC_ENABLE_SIMD_SSE3; the older ASMC_ENABLE_SIMD_SSE name is still honoured 62 | add_definitions(-DSSE) 63 | 64 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) 65 | add_compile_options(-msse3) 66 | add_compile_options(-msse2) 67 | add_compile_options(-msse) 68 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") 69 | add_compile_options(/QxSSE3) 70 | add_compile_options(/QxSSE2) 71 | add_compile_options(/QxSSE) 72 | elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC")) 73 | add_compile_options(/arch:SSE2) # VC doesn't support SSE3 74 | add_compile_options(/arch:SSE) 75 | endif() 76 | message(STATUS "SSE2 & SSE3 instruction set") 77 | 78 | elseif(ASMC_ENABLE_SIMD_SSE2) 79 | add_definitions(-DGLM_FORCE_INTRINSICS) 80 | 81 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) 82 | add_compile_options(-msse2) 83 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") 84 | add_compile_options(/QxSSE2) 85 | elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64) 86 | add_compile_options(/arch:SSE2) 87 | endif() 88 | message(STATUS "SSE2 instruction set") 89 | endif() 90 | 91 | -------------------------------------------------------------------------------- /cpp_example/FastSMC_example.sh: -------------------------------------------------------------------------------- 1 | # this script will run FastSMC on a simulated data as described in the paper (in FILES/FASTSMC_EXAMPLE/) 2 | # parameters can be changed if desired 3 | 4 | cd ../FASTSMC_BUILD_DIR/ || exit 5 | 6 | ./FastSMC_exe --inFileRoot ../FILES/FASTSMC_EXAMPLE/example \ 7 | --outFileRoot ../cpp_example/FastSMC_output_example \ 8 | --decodingQuantFile ../FILES/FASTSMC_EXAMPLE/example.decodingQuantities.gz \ 9 | --mode array \ 10 | --time 50 \ 11 | --min_m 1.5 \ 12 | --segmentLength \ 13 | --hashing \ 14 | --perPairPosteriorMeans \ 15 | --perPairMAP \ 16 | --noConditionalAgeEstimates \ 17 | 
--bin 18 | 19 | # Binary output file can be converted with the following command line 20 | echo 'Showing first lines of the binary output...' 21 | ./convertBinary_exe ../cpp_example/FastSMC_output_example.1.1.FastSMC.bibd.gz | head 22 | -------------------------------------------------------------------------------- /cpp_example/FastSMC_example_multiple_jobs.sh: -------------------------------------------------------------------------------- 1 | # this script will run FastSMC on a simulated data as described in the paper (in FILES/FASTSMC_EXAMPLE/) 2 | # parameters can be changed if desired 3 | 4 | # This example will run multiple jobs in different threads on the same machine. If you are running FastSMC on a cluster 5 | # then it may be more appropriate to instead use the job scheduler such as `qsub`. 6 | 7 | # The total number of jobs you want to run in parallel (this should be a square number). 8 | # Note that the standard output will be messy as information will be printed from every job simultaneously. 9 | total_num_jobs=4 10 | 11 | cd ../FASTSMC_BUILD_DIR/ || exit 12 | 13 | run_single_job() { 14 | local job=$1 15 | ./FastSMC_exe --inFileRoot ../FILES/FASTSMC_EXAMPLE/example \ 16 | --outFileRoot ../cpp_example/FastSMC_output_example \ 17 | --decodingQuantFile ../FILES/FASTSMC_EXAMPLE/example.decodingQuantities.gz \ 18 | --mode array \ 19 | --time 50 \ 20 | --min_m 1.5 \ 21 | --segmentLength \ 22 | --hashing \ 23 | --jobs "${total_num_jobs}" \ 24 | --jobInd "${job}" \ 25 | --perPairPosteriorMeans \ 26 | --perPairMAP \ 27 | --noConditionalAgeEstimates \ 28 | --bin 29 | echo Finished job "$1" 30 | } 31 | 32 | # Run the jobs in parallel 33 | for ((i = 1; i <= total_num_jobs; i++)); do 34 | # substitute the command below with the appropriate command if running this on a cluster, e.g. 
"qsub run_single_job $i" 35 | run_single_job "$i" & 36 | done 37 | wait 38 | 39 | # Binary output file can be converted with the following command line 40 | echo 'Showing first lines of the binary output...' 41 | ./convertBinary_exe ../cpp_example/FastSMC_output_example.1.4.FastSMC.bibd.gz | head 42 | 43 | # Note that there will be the same number of output files as jobs, which will need to be concatenated. 44 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/asmc_python.md: -------------------------------------------------------------------------------- 1 | # ASMC Python API 2 | 3 | - [Examples using the Python bindings](#examples-using-the-python-bindings) 4 | - [API](#api) 5 | - [ASMC](#asmc) 6 | - [DecodePairsReturnStruct](#decodepairsreturnstruct) 7 | 8 | ASMC includes Python bindings which can be installed using pip: 9 | 10 | ``` 11 | pip install asmc-asmc 12 | ``` 13 | 14 | Before reading further you may wish to read the [ASMC docs](./asmc.md). 
15 | In particular, these sections are directly relevant: 16 | - [Summary (TL;DR)](./asmc.md#summary-tldr) 17 | - [Input/output file formats](./asmc.md#inputoutput-file-formats) 18 | - [Tools, scripts, and analyses](./asmc.md#tools-scripts-and-analyses) 19 | - [Precomputed decoding quantities](./asmc.md#precomputed-decoding-quantities) 20 | 21 | 22 | ## Examples using the Python bindings 23 | 24 | See the `notebooks` directory for examples. 25 | There are two Jupyter notebooks: 26 | - a [minimal working example](../notebooks/asmc-minimal.ipynb), where sensible defaults for parameters are chosen automatically 27 | - a [more detailed example](../notebooks/asmc.ipynb) that demonstrates how to customise parameters 28 | 29 | ## API 30 | 31 | The core Python API for ASMC consists of the following classes: 32 | - `ASMC` 33 | - `DecodingParams` 34 | - `DecodePairsReturnStruct` 35 | 36 | ### ASMC 37 | 38 | The main `ASMC` object can be constructed minimally with an input file root and a decoding quantities file. 39 | Optional parameters are the output file root and the decoding mode. 40 | The full signature (with defaults indicated) is as follows: 41 | 42 | ```python 43 | asmc = ASMC( 44 | in_dir=input_files_root, # path to the input file root 45 | dq_file=dq_file, # path to the decoding quantities file 46 | out_dir="", # location of output files (default is the input file root) 47 | decoding_mode="array" # one of "array" or "sequence" 48 | ) 49 | ``` 50 | 51 | This creates an ASMC object with sensible defaults. 
52 | To fine-tune parameters you can instead create the ASMC object with an instance of `DecodingParams`: 53 | 54 | ```python 55 | # These are the arguments (with defaults indicated) for constructing a decoding parameters object 56 | params = DecodingParams( 57 | in_file_root=input_files_root, 58 | dq_file=dq_file, 59 | map_file="", # Optional override for map|map.gz file, if not in in_file_root 60 | out_file_root="", 61 | jobs=1, # Number of jobs being done in total 62 | job_ind=1, # Job index (1, ..., jobs) 63 | decoding_mode_string="array", # One of {"sequence", "array"} 64 | decoding_sequence=False, 65 | using_CSFS=True, # Whether to use CSFS 66 | compress=False, # Compress emission to binary (no CSFS) 67 | use_ancestral=False, # Assume ancestral alleles are coded as 1 in input (will assume 1 = minor otherwise) 68 | skip_CSFS_distance=0.0, # Genetic distance between two CSFS emissions 69 | no_batches=False, # Decode with no vectorization (do not use without good reason) 70 | do_posterior_sums=False, 71 | do_per_pair_posterior_mean=False, 72 | expected_coal_times_file="", 73 | within_only=False, 74 | do_major_minor_posterior_sums=False, # 75 | do_per_pair_MAP=False 76 | ) 77 | 78 | asmc = ASMC(params) 79 | ``` 80 | 81 | You can specify the outputs that will be calculated with the following methods (with defaults indicated): 82 | 83 | ```python 84 | # Per pair posterior mean, MAP and full posteriors, as well as the sum of posteriors can be stored in matrices 85 | asmc.set_store_per_pair_posterior_mean(True) # <-- true by default; others false by default 86 | asmc.set_store_per_pair_map(False) 87 | asmc.set_store_per_pair_posterior(False) 88 | asmc.set_store_sum_of_posterior(False) 89 | 90 | # Per pair posterior mean and MAP can be written to file. This is typically slow. 91 | asmc.set_write_per_pair_posterior_mean(False) 92 | asmc.set_write_per_pair_map(False) 93 | ``` 94 | 95 | Finally, the ASMC method `decode_pairs` will run the analysis. 
96 | There are three different signatures available: 97 | 98 | ```python 99 | a = [1, 2, 3] 100 | b = [4, 5, 6] 101 | 102 | a_str = [f"1_{x}_1" for x in range(1,149)] 103 | b_str = [f"1_{x}_2" for x in range(1,149)] 104 | 105 | asmc.decode_pairs(a, b) # two lists of haplotype indices 106 | asmc.decode_pairs(a_str, b_str) # two lists of haplotype IDs, with _1 and _2 indicating the haplotype 107 | asmc.decode_pairs() # <-- decode all pairs in the dataset 108 | ``` 109 | 110 | The results can then be accessed either by copy or reference: 111 | 112 | ```python 113 | return_vals = asmc.get_copy_of_results() 114 | return_vals_ref = asmc.get_ref_of_results() 115 | ``` 116 | 117 | Getting the values by reference is safe if you are only planning to call `decode_pairs` once, or if you are performing calculations that do not require the results to persist after the first call to `decode_pairs`. 118 | If you call `decode_pairs` multiple times, the results will be overwritten, so you should ensure you get results by copy. 119 | 120 | ### DecodePairsReturnStruct 121 | 122 | The return structure will contain results based on the options selected on the ASMC object before calling `decode_pairs`. 
123 | ```python 124 | # The index information for the pairs decoded 125 | return_vals.per_pair_indices 126 | 127 | # The `per_pair_posteriors` option gives the largest amount of information: a list of 2D numpy arrays 128 | # The list has length numPairs, and each 2D array has size (numStates x numSites) 129 | return_vals.per_pair_posteriors 130 | 131 | # The sum of posteriors is a single 2D numpy array of size (numStates x numSites) 132 | return_vals.sum_of_posteriors 133 | 134 | # Turning on the per_pair_posterior_means flag gives you the following: 135 | # A 2D numpy array with posterior means, of size (numPairs x numSites) 136 | return_vals.per_pair_posterior_means 137 | # Two 1D numpy arrays with the column-wise min and argmin of this array: 138 | return_vals.min_posterior_means 139 | return_vals.argmin_posterior_means 140 | 141 | # Turning on the per_pair_MAPs flag gives you the following: 142 | # A 2D numpy array with posterior MAPs, of size (numPairs x numSites) 143 | return_vals.per_pair_MAPs 144 | # Two 1D numpy arrays with the column-wise min and argmin of this array: 145 | return_vals.min_MAPs 146 | return_vals.argmin_MAPs 147 | ``` 148 | 149 | Finally, the ASMC object can also return the list of expected coalescent times from the decoding quantities file: 150 | `asmc.get_expected_times()` 151 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ASMC' 21 | copyright = '2023, ASMC Developers' 22 | author = 'ASMC Developers, https://palamaralab.github.io/software/asmc/' 23 | release = 'v1.3.1' 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'sphinx_rtd_theme', 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 41 | 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = 'sphinx_rtd_theme' 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 
53 | html_static_path = ['_static'] 54 | -------------------------------------------------------------------------------- /docs/fastsmc_python.md: -------------------------------------------------------------------------------- 1 | # FastSMC Python API 2 | 3 | - [Examples using the Python bindings](#examples-using-the-python-bindings) 4 | - [API](#api) 5 | - [FastSMC](#fastsmc) 6 | - [DecodingParams](#decodingparams) 7 | - [BinaryDataReader](#binarydatareader) 8 | 9 | FastSMC includes Python bindings which can be installed using pip: 10 | 11 | ``` 12 | pip install asmc-asmc 13 | ``` 14 | 15 | Before reading further you may wish to read the [FastSMC docs](./fastsmc.md). 16 | In particular, these sections are directly relevant: 17 | - [Input file formats](./fastsmc.md#input-file-formats) 18 | - [Output format](./fastsmc.md#output-format) 19 | - [Binary output](./fastsmc.md#binary-output) 20 | - [Relationship to ASMC](./fastsmc.md#relationship-to-asmc) 21 | 22 | And, from the [ASMC docs](./asmc.md): 23 | - [Precomputed decoding quantities](./asmc.md#precomputed-decoding-quantities) 24 | 25 | ## Examples using the Python bindings 26 | 27 | See the `notebooks` directory for examples. 28 | There are two Jupyter notebooks: 29 | - a [minimal working example](../notebooks/fastsmc-minimal.ipynb), where sensible defaults for parameters are chosen automatically 30 | - a [more detailed example](../notebooks/fastsmc.ipynb) that demonstrates how to customise parameters, how to convert the binary file to text format, and how to analyse the output if it is too large to fit in memory. 31 | 32 | ## API 33 | 34 | The core Python API for FastSMC consists of the following classes: 35 | - `FastSMC` 36 | - `DecodingParams` 37 | - `BinaryDataReader` 38 | 39 | ### FastSMC 40 | 41 | The main `FastSMC` object can be constructed minimally with an input file root, a decoding quantities file, and an output directory. 
42 | Simply construct a FastSMC object and call `run()` to generate output in the output file root: 43 | 44 | ```python 45 | fast_smc = FastSMC(in_dir=input_files_root, dq_file=dq_file, out_dir=output_files_root) 46 | fast_smc.run() 47 | ``` 48 | 49 | This creates a FastSMC object with sensible defaults. 50 | To fine-tune parameters you can instead create the FastSMC object with an instance of `DecodingParams`. 51 | 52 | ### DecodingParams 53 | 54 | Create an empty `DecodingParams` object: 55 | 56 | ```python 57 | params = DecodingParams() 58 | ``` 59 | 60 | The following parameters can be set: 61 | 62 | ```python 63 | params.decodingQuantFile = dq_file 64 | params.inFileRoot = input_files_root 65 | params.map_file = map_file # Optional override for .map file, if not in input_files_root 66 | params.outFileRoot = output_files_root 67 | params.decodingModeString = 'array' 68 | params.usingCSFS = True 69 | params.batchSize = 32 70 | params.recallThreshold = 3 71 | params.min_m = 1.5 72 | params.hashing = True 73 | params.FastSMC = True 74 | params.BIN_OUT = True 75 | params.outputIbdSegmentLength = True 76 | params.time = 50 77 | params.noConditionalAgeEstimates = True 78 | params.doPerPairMAP = True 79 | params.doPerPairPosteriorMean = True 80 | params.hashingOnly = False 81 | ``` 82 | 83 | > Note: the `hashingOnly` flag has not been extensively tested. 84 | You may also want to look into [this repository](https://github.com/gusevlab/germline2) for a standalone version. 85 | 86 | Finally, you can validate that the parameters are consistent for running FastSMC: 87 | 88 | ```python 89 | assert params.validateParamsFastSMC() 90 | ``` 91 | 92 | Then, construct and run a `FastSMC` object using these parameters: 93 | 94 | ```python 95 | fast_smc = FastSMC(params) 96 | fast_smc.run() 97 | ``` 98 | 99 | ### BinaryDataReader 100 | 101 | If you turn on `BIN_OUT` in the decoding parameters, the `BinaryDataReader` class can read sequential lines in a file. 
102 | This is useful particularly if the output is too large to process entirely in memory. 103 | 104 | ```python 105 | binary_data_reader = BinaryDataReader(output_files_root + '.1.1.FastSMC.bibd.gz') 106 | 107 | while binary_data_reader.moreLinesInFile(): 108 | line = binary_data_reader.getNextLine() 109 | ``` 110 | 111 | For each line, the following attributes and methods are available: 112 | 113 | ```python 114 | line.ind1FamId 115 | line.ind1Id 116 | line.ind1Hap 117 | line.ind2FamId 118 | line.ind2Id 119 | line.ind2Hap 120 | line.chromosome 121 | line.ibdStart 122 | line.ibdEnd 123 | line.lengthInCentimorgans 124 | line.ibdScore 125 | line.postEst 126 | line.mapEst 127 | 128 | line.toString() 129 | ``` 130 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ASMC documentation master file, created by 2 | sphinx-quickstart on Mon Dec 6 16:23:40 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to the ASMC documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | pages/quickstart_user 14 | pages/quickstart_developer 15 | pages/asmc 16 | pages/asmc_python 17 | pages/fastsmc 18 | pages/fastsmc_python 19 | 20 | 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`search` 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/pages/fastsmc_python.rst: -------------------------------------------------------------------------------- 1 | FastSMC Python API 2 | ================== 3 | 4 | - `Examples using the Python 5 | bindings <#examples-using-the-python-bindings>`__ 6 | - `API <#api>`__ 7 | 8 | - `FastSMC <#fastsmc>`__ 9 | - `DecodingParams <#decodingparams>`__ 10 | - `BinaryDataReader <#binarydatareader>`__ 11 | 12 | FastSMC includes Python bindings which can be installed using pip: 13 | 14 | :: 15 | 16 | pip install asmc-asmc 17 | 18 | Before reading further you may wish to read the `FastSMC 19 | docs <./fastsmc.md>`__. In particular, these sections are directly 20 | relevant: 21 | 22 | - `Input file formats <./fastsmc.md#input-file-formats>`__ 23 | - `Output format <./fastsmc.md#output-format>`__ 24 | - `Binary output <./fastsmc.md#binary-output>`__ 25 | - `Relationship to 26 | ASMC <./fastsmc.md#relationship-to-asmc>`__ 27 | 28 | And, from the `ASMC docs <./asmc.md>`__: 29 | 30 | - `Precomputed decoding 31 | quantities <./asmc.md#precomputed-decoding-quantities>`__ 32 | 33 | Examples using the Python bindings 34 | ---------------------------------- 35 | 36 | See the ``notebooks`` directory for examples. 
There are two Jupyter 37 | notebooks: 38 | 39 | - a `minimal working example <../notebooks/fastsmc-minimal.ipynb>`__, 40 | where sensible defaults for parameters are chosen automatically 41 | - a `more detailed example <../notebooks/fastsmc.ipynb>`__ that 42 | demonstrates how to customise parameters, how to convert the binary 43 | file to text format, and how to analyse the output if it is too large 44 | to fit in memory. 45 | 46 | API 47 | --- 48 | 49 | The core Python API for FastSMC consists of the following classes: 50 | 51 | - ``FastSMC`` 52 | - ``DecodingParams`` 53 | - ``BinaryDataReader`` 54 | 55 | FastSMC 56 | ~~~~~~~ 57 | 58 | The main ``FastSMC`` object can be constructed minimally with an input 59 | file root, a decoding quantities file, and an output directory. Simply 60 | construct a FastSMC object and call ``run()`` to generate output in the 61 | output file root: 62 | 63 | .. code:: python 64 | 65 | fast_smc = FastSMC(in_dir=input_files_root, dq_file=dq_file, out_dir=output_files_root) 66 | fast_smc.run() 67 | 68 | This creates a FastSMC object with sensible defaults. To fine-tune 69 | parameters you can instead create the FastSMC object with an instance of 70 | ``DecodingParams``. 71 | 72 | DecodingParams 73 | ~~~~~~~~~~~~~~ 74 | 75 | Create an empty ``DecodingParams`` object: 76 | 77 | .. code:: python 78 | 79 | params = DecodingParams() 80 | 81 | The following parameters can be set: 82 | 83 | .. 
code:: python 84 | 85 | params.decodingQuantFile = dq_file 86 | params.inFileRoot = input_files_root 87 | params.map_file = map_file # Optional override for .map file, if not in input_files_root 88 | params.outFileRoot = output_files_root 89 | params.decodingModeString = 'array' 90 | params.usingCSFS = True 91 | params.batchSize = 32 92 | params.recallThreshold = 3 93 | params.min_m = 1.5 94 | params.hashing = True 95 | params.FastSMC = True 96 | params.BIN_OUT = True 97 | params.outputIbdSegmentLength = True 98 | params.time = 50 99 | params.noConditionalAgeEstimates = True 100 | params.doPerPairMAP = True 101 | params.doPerPairPosteriorMean = True 102 | params.hashingOnly = False 103 | 104 | .. 105 | 106 | Note: the ``hashingOnly`` flag has not been extensively tested. You 107 | may also want to look into `this 108 | repository `__ for a 109 | standalone version. 110 | 111 | Finally, you can validate that the parameters are consistent for running 112 | FastSMC: 113 | 114 | .. code:: python 115 | 116 | assert params.validateParamsFastSMC() 117 | 118 | Then, construct and run a ``FastSMC`` object using these parameters: 119 | 120 | .. code:: python 121 | 122 | fast_smc = FastSMC(params) 123 | fast_smc.run() 124 | 125 | BinaryDataReader 126 | ~~~~~~~~~~~~~~~~ 127 | 128 | If you turn on ``BIN_OUT`` in the decoding parameters, the 129 | ``BinaryDataReader`` class can read sequential lines in a file. This is 130 | useful particularly if the output is too large to process entirely in 131 | memory. 132 | 133 | .. code:: python 134 | 135 | binary_data_reader = BinaryDataReader(output_files_root + '.1.1.FastSMC.bibd.gz') 136 | 137 | while binary_data_reader.moreLinesInFile(): 138 | line = binary_data_reader.getNextLine() 139 | 140 | For each line, the following attributes and methods are available: 141 | 142 | .. 
code:: python 143 | 144 | line.ind1FamId 145 | line.ind1Id 146 | line.ind1Hap 147 | line.ind2FamId 148 | line.ind2Id 149 | line.ind2Hap 150 | line.chromosome 151 | line.ibdStart 152 | line.ibdEnd 153 | line.lengthInCentimorgans 154 | line.ibdScore 155 | line.postEst 156 | line.mapEst 157 | 158 | line.toString() 159 | -------------------------------------------------------------------------------- /docs/pages/quickstart_developer.rst: -------------------------------------------------------------------------------- 1 | Quickstart guide for developers 2 | =============================== 3 | 4 | - `Linux <#linux>`__ 5 | - `macOS <#macos>`__ 6 | - `ResComp (oxford research 7 | computing) <#rescomp-oxford-research-computing>`__ 8 | - `Python bindings <#python-bindings>`__ 9 | 10 | Linux 11 | ----- 12 | 13 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3 14 | or clang >= 7) and `CMake >= 3.15 <https://cmake.org/install/>`__. 15 | Additionally, to compile the Python bindings you need Python with 16 | development files: 17 | 18 | .. code:: bash 19 | 20 | sudo apt install python3-dev 21 | 22 | Then, follow these steps: 23 | 24 | .. code:: bash 25 | 26 | # Get the source 27 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev 28 | cd ASMC_dev 29 | 30 | # Create a build directory 31 | mkdir build && cd build 32 | 33 | # Configure and build 34 | # On first run, CMake will build the required dependencies 35 | cmake .. 36 | cmake --build . --parallel 4 37 | 38 | macOS 39 | ----- 40 | 41 | This guide assumes you have a recent version of the `Xcode command line 42 | tools <https://developer.apple.com/xcode/features/>`__ and 43 | `Homebrew <https://brew.sh/>`__. Install the following dependencies: 44 | 45 | .. code:: bash 46 | 47 | brew install cmake 48 | brew install libomp 49 | brew install python # for python bindings, if required 50 | 51 | Then, follow these steps: 52 | 53 | .. 
code:: bash 54 | 55 | # Get the source 56 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev 57 | cd ASMC_dev 58 | 59 | # Create a build directory 60 | mkdir build && cd build 61 | 62 | # Configure and build 63 | # On first run, CMake will build the required dependencies 64 | cmake .. 65 | cmake --build . --parallel 4 66 | 67 | ResComp (oxford research computing) 68 | ----------------------------------- 69 | 70 | All necessary dependencies are already installed on ResComp. Simply 71 | follow these steps: 72 | 73 | .. code:: bash 74 | 75 | # Load required modules 76 | module load GCC/10.2.0 77 | module load CMake/3.18.4-GCCcore-10.2.0 78 | module load git/2.28.0-GCCcore-10.2.0-nodocs 79 | module load Python/3.8.6-GCCcore-10.2.0 80 | 81 | # Get the source 82 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev 83 | cd ASMC_dev 84 | 85 | # Create a build directory 86 | mkdir build && cd build 87 | 88 | # Configure and build 89 | # On first run, CMake will build the required dependencies 90 | cmake .. 91 | cmake --build . --parallel 4 92 | 93 | Python bindings 94 | --------------- 95 | 96 | These instructions are platform independent, assuming you have installed 97 | all dependencies (excluding those from vcpkg) according to the 98 | instructions above. From the ``ASMC_dev`` directory: 99 | 100 | .. code:: bash 101 | 102 | python3 -m venv venv 103 | source venv/bin/activate 104 | 105 | pip install --upgrade pip setuptools wheel ninja 106 | pip install . 
107 | -------------------------------------------------------------------------------- /docs/pages/quickstart_user.rst: -------------------------------------------------------------------------------- 1 | Quickstart guide for users 2 | ========================== 3 | 4 | - `Python bindings <#python-bindings>`__ 5 | - `Linux <#linux>`__ 6 | - `macOS <#macos>`__ 7 | - `Without vcpkg <#without-vcpkg>`__ 8 | 9 | Python bindings 10 | --------------- 11 | 12 | If you want to use ASMC or FastSMC via their Python interface, you can 13 | simply install ASMC using pip: 14 | 15 | :: 16 | 17 | pip install asmc-asmc 18 | 19 | For examples, see the `ASMC python documentation <./asmc_python.md>`__ 20 | and `FastSMC python documentation <./fastsmc_python.md>`__. 21 | 22 | If you want to compile the C++ executables, read on. 23 | 24 | Linux 25 | ----- 26 | 27 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3 28 | or clang >= 7) and `CMake >= 3.15 <https://cmake.org/install/>`__. Then, 29 | follow these steps to build the ASMC, FastSMC and binary conversion 30 | executables: 31 | 32 | .. code:: bash 33 | 34 | # Get the source 35 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC 36 | cd ASMC 37 | 38 | # Create a build directory 39 | mkdir build && cd build 40 | 41 | # Configure and build 42 | # On first run, CMake will build the required dependencies 43 | cmake -DASMC_NO_PYTHON=TRUE .. 44 | cmake --build . --parallel 4 45 | 46 | macOS 47 | ----- 48 | 49 | This guide assumes you have a recent version of the `Xcode command line 50 | tools <https://developer.apple.com/xcode/features/>`__ and 51 | `Homebrew <https://brew.sh/>`__. Install the following dependencies: 52 | 53 | .. code:: bash 54 | 55 | brew install cmake 56 | brew install libomp 57 | 58 | Then, follow these steps to build the ASMC, FastSMC and binary 59 | conversion executables: 60 | 61 | .. 
code:: bash 62 | 63 | # Get the source 64 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC 65 | cd ASMC 66 | 67 | # Create a build directory 68 | mkdir build && cd build 69 | 70 | # Configure and build 71 | # On first run, CMake will build the required dependencies 72 | cmake -DASMC_NO_PYTHON=TRUE .. 73 | cmake --build . --parallel 4 74 | 75 | Without vcpkg 76 | ------------- 77 | 78 | If you would like to compile ASMC without using 79 | `vcpkg <https://github.com/microsoft/vcpkg/>`__ to handle dependencies, 80 | you should first ensure all dependencies are installed: 81 | 82 | **Ubuntu** 83 | 84 | .. code:: bash 85 | 86 | sudo apt install libboost-iostreams-dev libboost-math-dev libboost-program-options-dev libeigen3-dev libfmt-dev librange-v3-dev zlib1g-dev 87 | 88 | **macOS** 89 | 90 | .. code:: bash 91 | 92 | brew install boost eigen fmt range-v3 zlib 93 | 94 | Then, when you run CMake, add the following definition: 95 | 96 | .. code:: bash 97 | 98 | cmake -DASMC_AVOID_VCPKG=true .. 99 | 100 | You may additionally choose to not recursively clone all submodules, as 101 | long as you still obtain the ``DataModule`` submodule. From the ASMC 102 | directory: 103 | 104 | .. code:: bash 105 | 106 | git clone https://github.com/PalamaraLab/ASMC 107 | cd ASMC 108 | git submodule update --init DataModule 109 | -------------------------------------------------------------------------------- /docs/quickstart_developer.md: -------------------------------------------------------------------------------- 1 | # Quickstart guide for developers 2 | 3 | - [Linux](#linux) 4 | - [macOS](#macos) 5 | - [ResComp (oxford research computing)](#rescomp-oxford-research-computing) 6 | - [Python bindings](#python-bindings) 7 | 8 | ## Linux 9 | 10 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3 or clang >= 7) and [CMake >= 3.15](https://cmake.org/install/). 
11 | Additionally, to compile the Python bindings you need Python with development files: 12 | 13 | ```bash 14 | sudo apt install python3-dev 15 | ``` 16 | 17 | Then, follow these steps: 18 | 19 | ```bash 20 | # Get the source 21 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev 22 | cd ASMC_dev 23 | 24 | # Create a build directory 25 | mkdir build && cd build 26 | 27 | # Configure and build 28 | # On first run, CMake will build the required dependencies 29 | cmake .. 30 | cmake --build . --parallel 4 31 | ``` 32 | 33 | ## macOS 34 | 35 | This guide assumes you have a recent version of the [Xcode command line tools](https://developer.apple.com/xcode/features/) and [Homebrew](https://brew.sh/). 36 | Install the following dependencies: 37 | 38 | ```bash 39 | brew install cmake 40 | brew install libomp 41 | brew install python # for python bindings, if required 42 | ``` 43 | 44 | Then, follow these steps: 45 | 46 | ```bash 47 | # Get the source 48 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev 49 | cd ASMC_dev 50 | 51 | # Create a build directory 52 | mkdir build && cd build 53 | 54 | # Configure and build 55 | # On first run, CMake will build the required dependencies 56 | cmake .. 57 | cmake --build . --parallel 4 58 | ``` 59 | 60 | ## ResComp (oxford research computing) 61 | 62 | All necessary dependencies are already installed on ResComp. Simply follow these steps: 63 | 64 | ```bash 65 | # Load required modules 66 | module load GCC/10.2.0 67 | module load CMake/3.18.4-GCCcore-10.2.0 68 | module load git/2.28.0-GCCcore-10.2.0-nodocs 69 | module load Python/3.8.6-GCCcore-10.2.0 70 | 71 | # Get the source 72 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev 73 | cd ASMC_dev 74 | 75 | # Create a build directory 76 | mkdir build && cd build 77 | 78 | # Configure and build 79 | # On first run, CMake will build the required dependencies 80 | cmake .. 81 | cmake --build . 
--parallel 4 82 | ``` 83 | 84 | ## Python bindings 85 | 86 | These instructions are platform independent, assuming you have installed all dependencies (excluding those from vcpkg) according to the instructions above. 87 | From the `ASMC_dev` directory: 88 | 89 | ```bash 90 | python3 -m venv venv 91 | source venv/bin/activate 92 | 93 | pip install --upgrade pip setuptools wheel ninja 94 | pip install . 95 | ``` 96 | -------------------------------------------------------------------------------- /docs/quickstart_user.md: -------------------------------------------------------------------------------- 1 | # Quickstart guide for users 2 | 3 | - [Python bindings](#python-bindings) 4 | - [Linux](#linux) 5 | - [macOS](#macos) 6 | - [Without vcpkg](#without-vcpkg) 7 | 8 | ## Python bindings 9 | 10 | If you want to use ASMC or FastSMC via their Python interface, you can simply install ASMC using pip: 11 | 12 | ``` 13 | pip install asmc-asmc 14 | ``` 15 | 16 | For examples, see the [ASMC python documentation](./asmc_python.md) and [FastSMC python documentation](./fastsmc_python.md). 17 | 18 | If you want to compile the C++ executables, read on. 19 | 20 | ## Linux 21 | 22 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3 or clang >= 7) and [CMake >= 3.15](https://cmake.org/install/). 23 | Then, follow these steps to build the ASMC, FastSMC and binary conversion executables: 24 | 25 | ```bash 26 | # Get the source 27 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC 28 | cd ASMC 29 | 30 | # Create a build directory 31 | mkdir build && cd build 32 | 33 | # Configure and build 34 | # On first run, CMake will build the required dependencies 35 | cmake -DASMC_NO_PYTHON=TRUE .. 36 | cmake --build . --parallel 4 37 | ``` 38 | 39 | ## macOS 40 | 41 | This guide assumes you have a recent version of the [Xcode command line tools](https://developer.apple.com/xcode/features/) and [Homebrew](https://brew.sh/). 
42 | Install the following dependencies: 43 | 44 | ```bash 45 | brew install cmake 46 | brew install libomp 47 | ``` 48 | 49 | Then, follow these steps to build the ASMC, FastSMC and binary conversion executables: 50 | 51 | ```bash 52 | # Get the source 53 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC 54 | cd ASMC 55 | 56 | # Create a build directory 57 | mkdir build && cd build 58 | 59 | # Configure and build 60 | # On first run, CMake will build the required dependencies 61 | cmake -DASMC_NO_PYTHON=TRUE .. 62 | cmake --build . --parallel 4 63 | ``` 64 | 65 | ## Without vcpkg 66 | 67 | If you would like to compile ASMC without using [vcpkg](https://github.com/microsoft/vcpkg/) to handle dependencies, you should first ensure all dependencies are installed: 68 | 69 | **Ubuntu** 70 | ```bash 71 | sudo apt install libboost-iostreams-dev libboost-math-dev libboost-program-options-dev libeigen3-dev libfmt-dev librange-v3-dev zlib1g-dev 72 | ``` 73 | 74 | **macOS** 75 | ```bash 76 | brew install boost eigen fmt range-v3 zlib 77 | ``` 78 | 79 | Then, when you run CMake, add the following definition: 80 | 81 | ```bash 82 | cmake -DASMC_AVOID_VCPKG=true .. 83 | ``` 84 | 85 | You may additionally choose to not recursively clone all submodules, as long as you still obtain the `DataModule` submodule. 86 | From the ASMC directory: 87 | 88 | ```bash 89 | git clone https://github.com/PalamaraLab/ASMC 90 | cd ASMC 91 | git submodule update --init DataModule 92 | ``` 93 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx-rtd-theme -------------------------------------------------------------------------------- /exe/main.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 
2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>. 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "Data.hpp" 22 | #include "DecodingParams.hpp" 23 | #include "DecodingQuantities.hpp" 24 | #include "FileUtils.hpp" 25 | 26 | #include "HMM.hpp" 27 | #include "StringUtils.hpp" 28 | #include "Timer.hpp" 29 | #include 30 | 31 | using namespace std; 32 | 33 | int main(int argc, char* argv[]) 34 | { 35 | 36 | srand(1234); 37 | 38 | const char VERSION[] = "1.0"; 39 | const char VERSION_DATE[] = "July 1, 2018"; 40 | const char YEAR[] = "2018"; 41 | const char LICENSE[] = "GNU GPL v3"; 42 | const char WEBSITE[] = "https://palamaralab.github.io/software/fastsmc/"; 43 | const char PROGRAM[] = "Ascertained Sequentially Markovian Coalescent (ASMC)"; 44 | 45 | DecodingParams params; 46 | 47 | // parse input arguments 48 | if (!params.processCommandLineArgs(argc, argv)) { 49 | cerr << "Error processing command line; exiting." 
<< endl; 50 | exit(1); 51 | } 52 | 53 | // Eigen output formatter to match original ASMC output 54 | Eigen::IOFormat TabFmt(Eigen::StreamPrecision, Eigen::DontAlignCols, "\t", "\n"); 55 | 56 | cout << "\n"; 57 | 58 | // cout << " _____ __ __ _____ \n"; 59 | // cout << " /\\ / ____| | \\/ | / ____|\n"; 60 | // cout << " / \\ | (___ | \\ / | | | \n"; 61 | // cout << " / /\\ \\ \\___ \\ | |\\/| | | | \n"; 62 | // cout << " / ____ \\ ____) | | | | | | |____ \n"; 63 | // cout << " /_/ \\_\\ |_____/ |_| |_| \\_____|\n"; 64 | 65 | cout << " █████╗ ███████╗ ███╗ ███╗ ██████╗\n"; 66 | cout << "██╔══██╗ ██╔════╝ ████╗ ████║ ██╔════╝\n"; 67 | cout << "███████║ ███████╗ ██╔████╔██║ ██║ \n"; 68 | cout << "██╔══██║ ╚════██║ ██║╚██╔╝██║ ██║ \n"; 69 | cout << "██║ ██║ ███████║ ██║ ╚═╝ ██║ ╚██████╗\n"; 70 | cout << "╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝ ╚═════╝\n"; 71 | 72 | cout << "\n" << PROGRAM << " v." << VERSION << ", " << VERSION_DATE << "\n"; 73 | cout << LICENSE << ", Copyright (C) " << YEAR << " Pier Palamara" 74 | << "\n"; 75 | cout << "Manual: " << WEBSITE << "\n" 76 | << "\n"; 77 | 78 | cout << "Decoding batch " << params.jobInd << " of " << params.jobs << "\n\n"; 79 | 80 | cout << "Will decode " << params.decodingModeString << " data." << endl; 81 | cout << "Output will have prefix: " << params.outFileRoot << endl; 82 | if (params.compress) 83 | cout << "Will use classic emission model (no CSFS)." << endl; 84 | else 85 | cout << "Minimum marker distance to use CSFS is set to " << params.skipCSFSdistance 86 | << "." << endl; 87 | if (params.useAncestral) 88 | cout << "Assuming ancestral alleles are correctly encoded." << endl; 89 | if (params.doPosteriorSums) 90 | cout << "Will output sum of posterior tables for all pairs." << endl; 91 | if (params.doMajorMinorPosteriorSums) 92 | cout << "Will output sum of posterior tables for all pairs, partitioned by " 93 | "major/minor alleles." 
94 | << endl; 95 | 96 | // if (params.noBatches) 97 | // cout << "Will not process samples in batches (slower)." << endl; 98 | // if (!params.withinOnly) 99 | // cout << "Will only decode maternal vs. paternal haplotypes." << endl; 100 | // if (params.doPerPairMAP) 101 | // cout << "Will output MAP for all haploid pairs (DANGER: huge files)." << endl; 102 | // if (params.doPerPairPosteriorMean) 103 | // cout << "Will output posterior mean for all haploid pairs (DANGER: huge 104 | // files)." << endl; 105 | 106 | // used for benchmarking 107 | Timer timer; 108 | 109 | cout << "Data will be loaded from " << params.inFileRoot << "*\n"; 110 | Data data(params); 111 | printf("Read haps in %.3f seconds.\n", timer.update_time()); 112 | 113 | HMM hmm(data, params); 114 | 115 | hmm.decodeAll(params.jobs, params.jobInd); 116 | const DecodingReturnValues& decodingReturnValues = hmm.getDecodingReturnValues(); 117 | 118 | // output sums over pairs (if requested) 119 | if (params.doPosteriorSums) { 120 | FileUtils::AutoGzOfstream fout; 121 | fout.openOrExit(params.outFileRoot + ".sumOverPairs.gz"); 122 | cout << "Output file: " << params.outFileRoot << ".sumOverPairs.gz" << endl; 123 | fout << decodingReturnValues.sumOverPairs.format(TabFmt) << endl; 124 | fout.close(); 125 | } 126 | if (params.doMajorMinorPosteriorSums) { 127 | // Sum for 00 128 | FileUtils::AutoGzOfstream fout00; 129 | fout00.openOrExit(params.outFileRoot + ".00.sumOverPairs.gz"); 130 | for (int pos = 0; pos < data.sites; pos++) { 131 | for (uint k = 0; k < hmm.getDecodingQuantities().states; k++) { 132 | if (k) 133 | fout00 << "\t"; 134 | if (!data.siteWasFlippedDuringFolding[pos]) { 135 | fout00 << decodingReturnValues.sumOverPairs00(pos,k); 136 | } else { 137 | fout00 << decodingReturnValues.sumOverPairs11(pos,k); 138 | } 139 | } 140 | fout00 << endl; 141 | } 142 | 143 | fout00.close(); 144 | // Sum for 01 145 | FileUtils::AutoGzOfstream fout01; 146 | fout01.openOrExit(params.outFileRoot + 
".01.sumOverPairs.gz"); 147 | fout01 << decodingReturnValues.sumOverPairs01.format(TabFmt) << endl; 148 | fout01.close(); 149 | // Sum for 11 150 | FileUtils::AutoGzOfstream fout11; 151 | fout11.openOrExit(params.outFileRoot + ".11.sumOverPairs.gz"); 152 | for (int pos = 0; pos < data.sites; pos++) { 153 | for (uint k = 0; k < hmm.getDecodingQuantities().states; k++) { 154 | if (k) 155 | fout11 << "\t"; 156 | if (!data.siteWasFlippedDuringFolding[pos]) { 157 | fout11 << decodingReturnValues.sumOverPairs11(pos,k); 158 | } else { 159 | fout11 << decodingReturnValues.sumOverPairs00(pos,k); 160 | } 161 | } 162 | fout11 << endl; 163 | } 164 | fout11.close(); 165 | 166 | cout << "Done.\n\n"; 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /exe/main_convertBinary.cpp: -------------------------------------------------------------------------------- 1 | #include "BinaryDataReader.hpp" 2 | 3 | #include 4 | #include 5 | 6 | int main(int argc, char* argv[]) 7 | { 8 | 9 | // make sure parameters are ok 10 | if (argc != 2) { 11 | std::cout << "Number of parameters is wrong." << std::endl; 12 | std::cout << "Only one parameter (name of binary file) is required." 
<< std::endl; 13 | exit(1); 14 | } 15 | 16 | BinaryDataReader binaryDataReader(argv[1]); 17 | 18 | while (binaryDataReader.moreLinesInFile()) { 19 | std::cout << binaryDataReader.getNextLine().toString() << std::endl; 20 | } 21 | 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /exe/main_fastsmc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "Data.hpp" 5 | #include "DecodingParams.hpp" 6 | #include "DecodingQuantities.hpp" 7 | #include "FastSMC.hpp" 8 | #include "HMM.hpp" 9 | #include "Timer.hpp" 10 | 11 | using namespace std; 12 | 13 | int main(int argc, char* argv[]) 14 | { 15 | // Parse input arguments 16 | DecodingParams params; 17 | if (!params.processCommandLineArgsFastSMC(argc, argv)) { 18 | cerr << "Error processing command line; exiting." << endl; 19 | exit(1); 20 | } 21 | 22 | ASMC::FastSMC fastSMC(params); 23 | fastSMC.run(); 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /notebooks/fastsmc-minimal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# FastSMC minimal working example notebook\n", 8 | "\n", 9 | "This notebook demonstrates a minimal working example of the FastSMC python bindings, where sensible default parameters are set automatically.\n", 10 | "\n", 11 | "Please make sure you have installed the python bindings by following the instructions in `../README.md` before attempting to run this notebook.\n", 12 | "\n", 13 | "The example dataset was simulated using the setup described in the paper, corresponding to SNP data for 150 diploid individuals and a chromosomal region of 30 Mb, with recombination rate from chromosome 2 and under a European demographic model (see 
https://www.nature.com/articles/s41467-020-19588-x for more details)." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "1) Import `asmc` which is installed with the Python bindings" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from asmc.asmc import *\n", 30 | "\n", 31 | "import pathlib\n", 32 | "import tempfile\n", 33 | "\n", 34 | "data_dir = pathlib.Path('.').resolve().parent / 'ASMC_data'" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "2) Specify paths for input (example provided in a submodule of this repository) and output. Input is expected to have the following files (note: make sure the map file is in the right format, as described in https://github.com/PalamaraLab/ASMC/blob/main/docs/fastsmc.md#input-file-formats):\n", 42 | "- `.hap.gz`\n", 43 | "- `.map`\n", 44 | "- `.samples`" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "input_files_root = str(data_dir / 'examples' / 'fastsmc' / 'example')\n", 54 | "dq_file = str(data_dir / 'decoding_quantities' / '30-100-2000_CEU.decodingQuantities.gz')\n", 55 | "output_files_root = tempfile.TemporaryDirectory().name" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "3) Create the Python FastSMC object and run it. This should only take a few seconds." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "fast_smc = FastSMC(in_dir=input_files_root, dq_file=dq_file, out_dir=output_files_root)\n", 72 | "fast_smc.run()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "4) Read data, add column names and filter to remove IBD segments with low IBD score. 
Note that for a large analysis, loading all data into memory is unlikely to be possible. See fastsmc.ipynb for an example that reads the output line-by-line." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "%config InlineBackend.figure_formats = ['svg']\n", 89 | "\n", 90 | "import numpy as np\n", 91 | "import pandas as pd\n", 92 | "import matplotlib.pyplot as plt\n", 93 | "\n", 94 | "data = pd.read_csv(output_files_root + '.1.1.FastSMC.ibd.gz', sep='\\t', header=None)\n", 95 | "\n", 96 | "data.columns = ['ind1_famid', 'ind1_id', 'ind1_hap', 'ind2_famid', 'ind2_id', 'ind2_hap', 'chromosome',\n", 97 | " 'ibd_start', 'ibd_end', 'length_in_cM', 'ibd_score', 'post_est', 'map_est']\n", 98 | "\n", 99 | "filtered = data[data['ibd_score'] > 0.1]\n", 100 | "filtered" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "pycharm": { 107 | "name": "#%% md\n" 108 | } 109 | }, 110 | "source": [ 111 | "5) Visualise data: here we simply bin the MAP age estimates and the IBD segment length" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "plt.xlabel(\"MAP age estimate (in generations)\")\n", 121 | "filtered['map_est'].hist(range=(0, 100))\n", 122 | "plt.gca().set_yscale('linear')" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "pycharm": { 130 | "name": "#%%\n" 131 | } 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "plt.xlabel(\"IBD segments length (in cM)\")\n", 136 | "filtered['length_in_cM'].hist(range=(0, 15))\n", 137 | "plt.gca().set_yscale('log')" 138 | ] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3 (ipykernel)", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": 
"ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.8.10" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 4 162 | } 163 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Based on https://github.com/pybind/cmake_example 2 | 3 | import os 4 | import sys 5 | import subprocess 6 | 7 | from setuptools import setup, Extension, find_namespace_packages 8 | from setuptools.command.build_ext import build_ext 9 | 10 | # Convert distutils Windows platform specifiers to CMake -A arguments 11 | PLAT_TO_CMAKE = { 12 | "win32": "Win32", 13 | "win-amd64": "x64", 14 | "win-arm32": "ARM", 15 | "win-arm64": "ARM64", 16 | } 17 | 18 | 19 | # A CMakeExtension needs a sourcedir instead of a file list. 20 | # The name must be the _single_ output extension from the CMake build. 21 | # If you need multiple extensions, see scikit-build. 22 | class CMakeExtension(Extension): 23 | def __init__(self, name, sourcedir=""): 24 | Extension.__init__(self, name, sources=[]) 25 | self.sourcedir = os.path.abspath(sourcedir) 26 | 27 | 28 | class CMakeBuild(build_ext): 29 | 30 | def build_extension(self, ext): 31 | extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) 32 | 33 | # required for auto-detection of auxiliary "native" libs 34 | if not extdir.endswith(os.path.sep): 35 | extdir += os.path.sep 36 | 37 | cfg = "Debug" if self.debug else "Release" 38 | 39 | # CMake lets you override the generator - we need to check this. 40 | # Can be set with Conda-Build, for example. 
41 | cmake_generator = os.environ.get("CMAKE_GENERATOR", "") 42 | 43 | # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON 44 | # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code 45 | # from Python. 46 | cmake_args = [ 47 | f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", 48 | f"-DPYTHON_EXECUTABLE={sys.executable}", 49 | f"-DCMAKE_BUILD_TYPE={cfg}", 50 | f"-DWARNINGS_AS_ERRORS=OFF", 51 | f"-DASMC_TESTING=OFF", 52 | ] 53 | build_args = [] 54 | 55 | if self.compiler.compiler_type != "msvc": 56 | # Using Ninja-build since it a) is available as a wheel and b) 57 | # multithreads automatically. MSVC would require all variables be 58 | # exported for Ninja to pick it up, which is a little tricky to do. 59 | # Users can override the generator with CMAKE_GENERATOR in CMake 60 | # 3.15+. 61 | if not cmake_generator: 62 | cmake_args += ["-GNinja"] 63 | 64 | else: 65 | 66 | # Single config generators are handled "normally" 67 | single_config = any(x in cmake_generator for x in {"NMake", "Ninja"}) 68 | 69 | # CMake allows an arch-in-generator style for backward compatibility 70 | contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"}) 71 | 72 | # Specify the arch if using MSVC generator, but only if it doesn't 73 | # contain a backward-compatibility arch spec already in the 74 | # generator name. 75 | if not single_config and not contains_arch: 76 | cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]] 77 | 78 | # Multi-config generators have a different way to specify configs 79 | if not single_config: 80 | cmake_args += [ 81 | "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir) 82 | ] 83 | build_args += ["--config", cfg] 84 | 85 | # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level 86 | # across all generators. 
87 | if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: 88 | # self.parallel is a Python 3 only way to set parallel jobs by hand 89 | # using -j in the build_ext call, not supported by pip or PyPA-build. 90 | if hasattr(self, "parallel") and self.parallel: 91 | # CMake 3.12+ only. 92 | build_args += ["-j{}".format(self.parallel)] 93 | 94 | if not os.path.exists(self.build_temp): 95 | os.makedirs(self.build_temp) 96 | 97 | subprocess.check_call( 98 | ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp 99 | ) 100 | subprocess.check_call( 101 | ["cmake", "--build", "."] + build_args, cwd=self.build_temp 102 | ) 103 | 104 | 105 | with open('PyPI_README.md', encoding='utf-8') as f: 106 | long_description = f.read() 107 | 108 | with open('RELEASE_NOTES.md', encoding='utf-8') as f: 109 | release_notes = f.read() 110 | 111 | setup( 112 | name='asmc-asmc', 113 | version='1.3.1', 114 | author='PalamaraLab (https://palamaralab.github.io/)', 115 | description='ASMC is a method to efficiently estimate pairwise coalescence time along the genome', 116 | url='https://github.com/PalamaraLab/ASMC/', 117 | python_requires=">=3.6", 118 | packages=find_namespace_packages(include=['asmc.*']), 119 | long_description='\n'.join([long_description, release_notes]), 120 | long_description_content_type="text/markdown", 121 | install_requires=['jupyter', 'numpy', 'pandas', 'asmc-preparedecoding', 'matplotlib'], 122 | ext_modules=[CMakeExtension('asmc/asmc')], 123 | cmdclass=dict(build_ext=CMakeBuild), 124 | zip_safe=False, 125 | ) 126 | -------------------------------------------------------------------------------- /src/ASMC.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 
2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>. 15 | 16 | #ifndef ASMC_HPP 17 | #define ASMC_HPP 18 | 19 | #include "Data.hpp" 20 | #include "DecodePairsReturnStruct.hpp" 21 | #include "DecodingParams.hpp" 22 | #include "HMM.hpp" 23 | 24 | #include 25 | #include 26 | 27 | namespace ASMC 28 | { 29 | 30 | class ASMC 31 | { 32 | 33 | private: 34 | DecodingParams mParams; 35 | Data mData; 36 | HMM mHmm; 37 | 38 | public: 39 | /** 40 | * ASMC constructor with full control over parameters, by manually specifying a DecodingParams object. 41 | * 42 | * @param params the decoding parameters 43 | */ 44 | explicit ASMC(DecodingParams params); 45 | 46 | /** 47 | * ASMC constructor that will set sensible defaults. If you wish to fine-tune parameters, use the constructor that 48 | * takes a DecodingParams object, which you can configure manually.
49 | * @param inFileRoot the input file root 50 | * @param decodingQuantFile the decoding quantities file 51 | * @param outFileRoot the output file root, defaults to the input file root 52 | * @param decodingMode the decoding mode, given as a string; defaults to "array" 53 | */ 54 | ASMC(const std::string& inFileRoot, const std::string& decodingQuantFile, const std::string& outFileRoot = "", 55 | const std::string& decodingMode = "array"); 56 | 57 | DecodingParams getDecodingParams(); 58 | 59 | unsigned long getDiploidSampleSize(); 60 | 61 | unsigned long getHaploidSampleSize(); 62 | 63 | int getNumSites(); 64 | 65 | std::vector getPhysicalPositions(); 66 | 67 | std::vector getGeneticPositions(); 68 | 69 | DecodingReturnValues decodeAllInJob(); 70 | 71 | void decodePairs(unsigned from = 0u, unsigned to = 0u, float cmBurnIn = 0.5f); 72 | 73 | void decodePairs(const std::vector &hapIndicesA, const std::vector &hapIndicesB, 74 | unsigned from = 0u, unsigned to = 0u, float cmBurnIn = 0.5f); 75 | 76 | void decodePairs(const std::vector& hapIdsA, const std::vector& hapIdsB, 77 | unsigned from = 0u, unsigned to = 0u, float cmBurnIn = 0.5f); 78 | 79 | DecodePairsReturnStruct getCopyOfResults(); 80 | 81 | const DecodePairsReturnStruct& getRefOfResults(); 82 | 83 | const std::vector& getExpectedTimes(); 84 | 85 | /// Set to true to store per pair posterior mean 86 | void setStorePerPairPosteriorMean(bool storePerPairPosteriorMean = true); 87 | 88 | /// Set to true to write per pair posterior mean to file 89 | void setWritePerPairPosteriorMean(bool writePerPairPosteriorMean = true); 90 | 91 | /// Set to true to store per pair MAP 92 | void setStorePerPairMap(bool storePerPairMAP = true); 93 | 94 | /// Set to true to write per pair MAP to file 95 | void setWritePerPairMap(bool writePerPairMAP = true); 96 | 97 | /// Set to true to store per pair posterior 98 | void setStorePerPairPosterior(bool storePerPairPosterior = true); 99 | 100 | /// Set to true to store the sum of posteriors 101 | void setStoreSumOfPosterior(bool storeSumOfPosterior = true); 102 |
103 | }; 104 | } // namespace ASMC 105 | 106 | #endif 107 | -------------------------------------------------------------------------------- /src/AvxDefinitions.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>. 15 | 16 | 17 | #ifndef AVXDEFINITIONS_HPP 18 | #define AVXDEFINITIONS_HPP 19 | 20 | #include 21 | 22 | #ifdef NO_SSE 23 | #define MODE "NO_SSE" 24 | #define VECX 4 25 | #endif 26 | 27 | // SSE vectorization (block size = 4) 28 | #ifdef SSE 29 | #define MODE "SSE" 30 | #define VECX 4 31 | #define FLOAT __m128 32 | #define LOAD _mm_load_ps 33 | #define STORE _mm_store_ps 34 | #define MULT _mm_mul_ps 35 | #define ADD _mm_add_ps 36 | #define RECIPROCAL _mm_rcp_ps 37 | #define LOAD1 _mm_load1_ps 38 | #endif 39 | 40 | // AVX vectorization (block size = 8) 41 | #ifdef AVX 42 | #define MODE "AVX" 43 | #include 44 | #define VECX 8 45 | #define FLOAT __m256 46 | #define LOAD _mm256_load_ps 47 | #define STORE _mm256_store_ps 48 | #define MULT _mm256_mul_ps 49 | #define ADD _mm256_add_ps 50 | #define RECIPROCAL _mm256_rcp_ps 51 | #define LOAD1 _mm256_broadcast_ss 52 | #endif 53 | 54 | // AVX512 vectorization (block size = 16) 55 | #ifdef AVX512 56 | #define MODE "AVX512" 57 | #include 58 | #define VECX 16 59 | #define FLOAT
__m512 60 | #define LOAD _mm512_load_ps 61 | #define STORE _mm512_store_ps 62 | #define MULT _mm512_mul_ps 63 | #define ADD _mm512_add_ps 64 | #define RECIPROCAL _mm512_rcp14_ps 65 | #define LOAD1 _mm512_set1_ps 66 | #endif 67 | 68 | 69 | #endif // AVXDEFINITIONS_HPP 70 | -------------------------------------------------------------------------------- /src/BinaryDataReader.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by fergus on 28/08/2020. 3 | // 4 | 5 | #ifndef ASMC_BINARYDATAREADER_HPP 6 | #define ASMC_BINARYDATAREADER_HPP 7 | 8 | #include 9 | 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace fs = std::filesystem; 23 | 24 | struct IbdPairDataLine { 25 | 26 | std::string ind1FamId = "0_00"; 27 | std::string ind1Id = "0_00"; 28 | int ind1Hap = -1; 29 | 30 | std::string ind2FamId = "0_00"; 31 | std::string ind2Id = "0_00"; 32 | int ind2Hap = -1; 33 | 34 | int chromosome = -1; 35 | 36 | int ibdStart = -1; 37 | int ibdEnd = -1; 38 | 39 | float lengthInCentimorgans = -1.f; 40 | float ibdScore = -1.f; 41 | float postEst = -1.f; 42 | float mapEst = -1.f; 43 | 44 | [[nodiscard]] std::string toString() const 45 | { 46 | std::stringstream line; 47 | line << std::setprecision(std::numeric_limits::digits10 + 1); 48 | 49 | line << ind1FamId << '\t' << ind1Id << '\t' << ind1Hap << '\t' << ind2FamId << '\t' << ind2Id << '\t' << ind2Hap 50 | << '\t' << chromosome << '\t' << ibdStart << '\t' << ibdEnd; 51 | 52 | if (lengthInCentimorgans != -1.f) { 53 | line << '\t' << lengthInCentimorgans; 54 | } 55 | 56 | if (ibdScore != -1.f) { 57 | line << '\t' << ibdScore; 58 | } 59 | 60 | if (postEst != -1.f) { 61 | line << '\t' << postEst; 62 | } 63 | 64 | if (mapEst != -1.f) { 65 | line << '\t' << mapEst; 66 | } 67 | 68 | return line.str(); 69 | } 70 | }; 71 | 72 | class BinaryDataReader 73 | { 74 | 75 | private: 
76 | /** 77 | * Handle to the binary zipped file, opened in the constructor and closed in the destructor 78 | */ 79 | gzFile mGzBinaryFileHandle; 80 | 81 | bool mContainsIbdSegmentLengths = false; 82 | bool mContainsIbdScore = false; 83 | bool mContainsPosteriorAgeEstimates = false; 84 | bool mContainsMapAgeEstimates = false; 85 | 86 | int mChromosomeNumber = -1; 87 | unsigned mNumIds = 0u; 88 | 89 | std::vector mFamIds; 90 | std::vector mIIds; 91 | 92 | unsigned mPreReadStartOfNextLine = {}; 93 | 94 | bool mMoreLinesInFile = true; 95 | 96 | void ReadHeader() 97 | { 98 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsIbdSegmentLengths), sizeof(bool)); 99 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsIbdScore), sizeof(bool)); 100 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsPosteriorAgeEstimates), sizeof(bool)); 101 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsMapAgeEstimates), sizeof(bool)); 102 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mChromosomeNumber), sizeof(int)); 103 | 104 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mNumIds), sizeof(unsigned)); 105 | mFamIds.reserve(mNumIds); 106 | mIIds.reserve(mNumIds); 107 | 108 | for (unsigned i = 0; i < mNumIds; i++) { 109 | 110 | unsigned lengthFamId = {}; 111 | gzread(mGzBinaryFileHandle, reinterpret_cast(&lengthFamId), sizeof(unsigned)); 112 | mFamIds.emplace_back(lengthFamId, 'z'); 113 | gzread(mGzBinaryFileHandle, &mFamIds.at(i).at(0), lengthFamId); 114 | 115 | unsigned lengthIId = {}; 116 | gzread(mGzBinaryFileHandle, reinterpret_cast(&lengthIId), sizeof(unsigned)); 117 | mIIds.emplace_back(lengthIId, 'z'); 118 | gzread(mGzBinaryFileHandle, &mIIds.at(i).at(0), lengthIId); 119 | } 120 | } 121 | 122 | void CheckIfNextLineExists() 123 | { 124 | if (gzread(mGzBinaryFileHandle, reinterpret_cast(&mPreReadStartOfNextLine), sizeof(unsigned)) < 125 | sizeof(unsigned)) { 126 | mMoreLinesInFile = false; 127 | } 128 | } 129 | 130 | public: 131 | explicit 
BinaryDataReader(const std::string& binaryFile) 132 | { 133 | /* Fail early with a clear message rather than reading from an invalid handle */ 134 | if (!fs::is_regular_file(binaryFile)) { 135 | throw std::runtime_error(fmt::format("Provided path to binary file {} is not a file\n", binaryFile)); 136 | } 137 | mGzBinaryFileHandle = gzopen(binaryFile.c_str(), "rb"); 138 | if (mGzBinaryFileHandle == nullptr) { throw std::runtime_error(fmt::format("Unable to open binary file {}\n", binaryFile)); } 139 | ReadHeader(); 140 | CheckIfNextLineExists(); 141 | } 142 | /** Decode one record, starting from the already pre-read first index, then pre-read the start of the following record. Indices outside the ID tables make vector::at throw std::out_of_range. */ 143 | IbdPairDataLine getNextLine() 144 | { 145 | IbdPairDataLine line; 146 | 147 | // We have already read the first number from this line with a call to CheckIfNextLineExists() 148 | unsigned ind1 = mPreReadStartOfNextLine; 149 | unsigned ind2 = -1; 150 | 151 | std::uint_least8_t hap1 = 0; 152 | std::uint_least8_t hap2 = 0; 153 | 154 | gzread(mGzBinaryFileHandle, reinterpret_cast(&hap1), sizeof(std::uint_least8_t)); 155 | gzread(mGzBinaryFileHandle, reinterpret_cast(&ind2), sizeof(unsigned)); 156 | gzread(mGzBinaryFileHandle, reinterpret_cast(&hap2), sizeof(std::uint_least8_t)); 157 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.ibdStart), sizeof(int)); 158 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.ibdEnd), sizeof(int)); 159 | 160 | if (mContainsIbdSegmentLengths) { 161 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.lengthInCentimorgans), sizeof(float)); 162 | } 163 | 164 | if (mContainsIbdScore) { 165 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.ibdScore), sizeof(float)); 166 | } 167 | 168 | if (mContainsPosteriorAgeEstimates) { 169 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.postEst), sizeof(float)); 170 | } 171 | 172 | if (mContainsMapAgeEstimates) { 173 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.mapEst), sizeof(float)); 174 | } 175 | 176 | line.ind1Hap = static_cast(hap1); 177 | line.ind2Hap = static_cast(hap2); 178 | 179 | line.chromosome = mChromosomeNumber; 180 | 181 | line.ind1FamId = mFamIds.at(ind1); 182 | line.ind1Id = mIIds.at(ind1); 183 | 184 | line.ind2FamId = mFamIds.at(ind2); 185 | line.ind2Id = mIIds.at(ind2); 186 | 187 | // Pre-read first number from
next line to check whether the line exists 188 | CheckIfNextLineExists(); 189 | 190 | return line; 191 | } 192 | 193 | [[nodiscard]] bool moreLinesInFile() const 194 | { 195 | return mMoreLinesInFile; 196 | } 197 | 198 | ~BinaryDataReader() 199 | { 200 | gzclose(mGzBinaryFileHandle); 201 | } 202 | }; 203 | 204 | #endif // ASMC_BINARYDATAREADER_HPP 205 | -------------------------------------------------------------------------------- /src/Data.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 | 16 | #ifndef ASMC_DATA_HPP 17 | #define ASMC_DATA_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "Individual.hpp" 25 | #include "DecodingParams.hpp" 26 | #include "DecodingQuantities.hpp" 27 | 28 | class Data 29 | { 30 | 31 | public: 32 | 33 | std::vector FamIDList = {}; 34 | std::vector IIDList = {}; 35 | std::vector famAndIndNameList = {}; 36 | std::vector individuals = {}; 37 | 38 | unsigned long sampleSize = 0ul; 39 | unsigned long haploidSampleSize = 0ul; 40 | int sites = 0; 41 | bool decodingUsesCSFS = false; 42 | bool mJobbing = false; 43 | bool foldToMinorAlleles = false; 44 | std::vector geneticPositions = {}; 45 | std::vector physicalPositions = {}; 46 | std::vector siteWasFlippedDuringFolding = {}; 47 | std::vector recRateAtMarker = {}; 48 | 49 | // Variables relating to FastSMC 50 | int chrNumber = 0; 51 | unsigned int windowSize = 0u; // window size in triangles for each job 52 | unsigned int w_i = 0u; // window id for ind_i for jobs 53 | unsigned int w_j = 0u; // window id for ind_j for jobs 54 | bool is_j_above_diag = false; 55 | std::unordered_map physicalPositionsMap = {}; // map where key=physicalPosition, value=indexPosition 56 | 57 | /** 58 | * Construct the data object, which also constructs the decoding quantities that will be owned by this object 59 | * 60 | * @param params the decoding params 61 | */ 62 | explicit Data(const DecodingParams& params); 63 | 64 | static int countHapLines(std::string inFileRoot); 65 | static int countSamplesLines(std::string inFileRoot); 66 | 67 | /** 68 | * Calculate the undistinguished counts 69 | * 70 | * @param numCsfsSamples the number of CSFS samples 71 | * @return the undistinguished counts 72 | */ 73 | std::vector> calculateUndistinguishedCounts(int numCsfsSamples) const; 74 | 75 | const std::vector& getSnpIDs() const; 76 | 77 | private: 78 | 79 | /** 80 | * Determine whether a sample should be read, based on the jobID, number of jobs, and the number of lines 
processed. 81 | * ASMC will always return true, but FastSMC will determine whether to read a sample. 82 | * 83 | * @param linesProcessed the number of lines processed so far 84 | * @param jobID the jobID, which will be the default value of -1 for ASMC 85 | * @param jobs the number of jobs, which will be the default value of -1 for ASMC 86 | * @return whether to read the sample 87 | */ 88 | bool readSample(unsigned linesProcessed, int jobID, int jobs); 89 | 90 | /** 91 | * Read the samples file and populate members `FamIDList`, `IIDList` and `famAndIndNameList`. 92 | * 93 | * @param inFileRoot location of input files 94 | * @param jobID the jobID which defaults to -1 indicating no jobbing 95 | * @param jobs the number of jobs which defaults to -1 indicating no jobbing 96 | */ 97 | void readSamplesList(const std::string& inFileRoot, int jobID, int jobs); 98 | 99 | void readHaps(std::string inFileRoot, bool foldToMinorAlleles); 100 | void readHaps(std::string inFileRoot, bool foldToMinorAlleles, int jobID, int jobs, 101 | std::vector>& genetic_map); 102 | 103 | /** 104 | * Read Plink-format map file 105 | * @param inFileRoot 106 | * @param mapFile: optional direct path to map file, used if map file is not in inFileRoot 107 | */ 108 | void readMap(const std::string& inFileRoot, const std::string& mapFile = ""); 109 | 110 | /** 111 | * Subsumed functionality from FastSMC to read genetic map as a vector of pairs. 112 | * TODO: can this be harmonised with the other readMap method? 
113 | * @param inFileRoot 114 | * @param mapFile: optional direct path to map file, used if map file is not in inFileRoot 115 | * @return 116 | */ 117 | static std::vector> readMapFastSMC(const std::string& inFileRoot, 118 | const std::string& mapFile = ""); 119 | 120 | std::vector totalSamplesCount; 121 | std::vector derivedAlleleCounts; 122 | std::vector SNP_IDs; 123 | 124 | static int sampleHypergeometric(int populationSize, int numberOfSuccesses, int sampleSize); 125 | 126 | 127 | void readGeneticMap(unsigned long int bp, std::vector>& genetic_map, 128 | unsigned int& cur_g, unsigned int pos); 129 | 130 | void addMarker(unsigned long int physicalPosition, double geneticPosition, unsigned int pos); 131 | 132 | 133 | 134 | 135 | }; 136 | 137 | #endif // ASMC_DATA_HPP 138 | -------------------------------------------------------------------------------- /src/DecodePairsReturnStruct.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 | 16 | #ifndef FASTSMC_DECODE_PAIRS_RETURN_STRUCT_HPP 17 | #define FASTSMC_DECODE_PAIRS_RETURN_STRUCT_HPP 18 | 19 | #pragma clang diagnostic push 20 | #pragma ide diagnostic ignored "cppcoreguidelines-non-private-member-variables-in-classes" 21 | 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | struct DecodePairsReturnStruct { 30 | 31 | private: 32 | 33 | bool m_storeFullPosteriors = false; 34 | bool m_storeSumOfPosteriors = false; 35 | bool m_storePerPairPosteriors = false; 36 | bool m_storePerPairMAPs = false; 37 | 38 | std::size_t numWritten = 0ul; 39 | 40 | public: 41 | void initialise(const std::vector& individualsA, const std::vector& individualsB, 42 | long int numSites, long int numStates, bool _fullPosteriors = false, bool _sumOfPosteriors = false, 43 | bool _perPairPosteriors = false, bool _perPairMAPs = false) 44 | { 45 | numWritten = 0ul; 46 | Eigen::Index numPairsToDecode = individualsA.size(); 47 | 48 | m_storeFullPosteriors = _fullPosteriors; 49 | m_storeSumOfPosteriors = _sumOfPosteriors; 50 | m_storePerPairPosteriors = _perPairPosteriors; 51 | m_storePerPairMAPs = _perPairMAPs; 52 | 53 | perPairIndices.resize(numPairsToDecode); 54 | 55 | if (m_storeFullPosteriors) { 56 | perPairPosteriors.resize(numPairsToDecode); 57 | for (auto& arr : perPairPosteriors) { 58 | arr.resize(numStates, numSites); 59 | } 60 | } 61 | 62 | if (m_storeSumOfPosteriors) { 63 | sumOfPosteriors.resize(numStates, numSites); 64 | sumOfPosteriors.setZero(); 65 | } 66 | 67 | if (m_storePerPairPosteriors) { 68 | perPairPosteriorMeans.resize(numPairsToDecode, numSites); 69 | minPosteriorMeans.resize(numSites); 70 | argminPosteriorMeans.resize(numSites); 71 | } 72 | 73 | if (m_storePerPairMAPs) { 74 | perPairMAPs.resize(numPairsToDecode, numSites); 75 | minMAPs.resize(numSites); 76 | argminMAPs.resize(numSites); 77 | } 78 | } 79 | 80 | /// iHapIdx, iHapId, jHapIdx, jHapId 81 | std::vector> perPairIndices; 82 | 83 | /// The full set of 
posteriors: for each pair this is a (states * numSites) matrix 84 | std::vector> perPairPosteriors; 85 | 86 | /// The sum of all posteriors in perPairPosteriors: a (states * numSites) matrix 87 | Eigen::Array sumOfPosteriors; 88 | 89 | /// Posterior means: each row is an array of length numSites 90 | Eigen::Array perPairPosteriorMeans; 91 | 92 | Eigen::Array minPosteriorMeans; 93 | Eigen::Array argminPosteriorMeans; 94 | 95 | Eigen::Array perPairMAPs; 96 | 97 | Eigen::Array minMAPs; 98 | Eigen::Array argminMAPs; 99 | 100 | void incrementNumWritten() 101 | { 102 | numWritten += 1; 103 | } 104 | /** For every site (column), record the minimum posterior mean and the minimum MAP over all rows, together with the row index at which each minimum occurs. */ 105 | void finaliseCalculations() 106 | { 107 | for (Eigen::Index siteIdx = 0ll; siteIdx < perPairPosteriorMeans.cols(); ++siteIdx) { 108 | Eigen::Index argmin{}; 109 | minPosteriorMeans(siteIdx) = perPairPosteriorMeans.col(siteIdx).minCoeff(&argmin); 110 | argminPosteriorMeans(siteIdx) = static_cast(argmin); 111 | } 112 | 113 | for (Eigen::Index siteIdx = 0ll; siteIdx < perPairMAPs.cols(); ++siteIdx) { 114 | Eigen::Index argmin{}; 115 | minMAPs(siteIdx) = perPairMAPs.col(siteIdx).minCoeff(&argmin); 116 | argminMAPs(siteIdx) = static_cast(argmin); 117 | } 118 | } 119 | 120 | [[nodiscard]] std::size_t getNumWritten() const 121 | { 122 | return numWritten; 123 | } 124 | }; 125 | /* The pop stays inside the include guard so it is only ever emitted together with the matching push */ 126 | #pragma clang diagnostic pop 127 | 128 | #endif // FASTSMC_DECODE_PAIRS_RETURN_STRUCT_HPP -------------------------------------------------------------------------------- /src/DecodingParams.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version.
7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>. 15 | 16 | #ifndef DECODINGPARAMS_HPP 17 | #define DECODINGPARAMS_HPP 18 | 19 | #include 20 | 21 | #include 22 | 23 | enum class DecodingMode { sequenceFolded, arrayFolded, sequence, array }; 24 | 25 | enum class DecodingModeOverall { sequence, array }; 26 | 27 | class DecodingParams 28 | { 29 | 30 | private: 31 | bool fastSmcInvokedWithProgramOptions = false; 32 | 33 | public: 34 | std::string inFileRoot; 35 | std::string decodingQuantFile; 36 | std::string mapFile; 37 | std::string outFileRoot; 38 | int jobs = 1; 39 | int jobInd = 1; 40 | std::string decodingModeString = "array"; 41 | DecodingModeOverall decodingModeOverall; 42 | DecodingMode decodingMode; 43 | bool decodingSequence = false; 44 | bool foldData = false; 45 | bool usingCSFS = false; 46 | bool compress = false; 47 | bool useAncestral = false; 48 | float skipCSFSdistance{}; 49 | bool noBatches = false; 50 | 51 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 52 | // New params from FastSMC that were not originally in ASMC 53 | 54 | int batchSize = 64; 55 | int recallThreshold = 3; 56 | 57 | float skip = 0.f; 58 | int gap = 1; 59 | int max_seeds = 0; 60 | float min_maf = 0; 61 | float min_m = 1; 62 | bool hashing = false; 63 | bool hashingOnly = false; 64 | bool FastSMC = false; 65 | bool BIN_OUT = false; 66 | bool useKnownSeed = false; 67 | 68 | /// Whether to write IBD segment length (in centimorgans) 69 | bool outputIbdSegmentLength = false; 70 | 71 | // Used by FastSMC itself 72 | int hashingWordSize = 64; 73 | int constReadAhead = 10; 74 | bool haploid = true; 75 | 76 | int time = 100; // state threshold for
IBD detection 77 | 78 | 79 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 80 | 81 | // main tasks 82 | bool noConditionalAgeEstimates = false; 83 | bool doPosteriorSums = false; 84 | bool doPerPairPosteriorMean = false; // output posterior mean for each pair 85 | bool doPerPairMAP = false; // output MAP for each pair 86 | std::string expectedCoalTimesFile; // expected coalescence times within each interval 87 | bool withinOnly = false; // only compute decoding within individuals 88 | bool doMajorMinorPosteriorSums = false; 89 | 90 | bool processOptions(); 91 | bool processCommandLineArgs(int argc, char* argv[]); 92 | bool processCommandLineArgsFastSMC(int argc, char* argv[]); 93 | 94 | /** 95 | * Verify that the selected parameters are compatible. Incompatible options will cause FastSMC to exit with a message 96 | * explaining the incompatibility. 97 | * 98 | * @return true if the parameters are compatible 99 | */ 100 | bool validateParamsFastSMC(); 101 | 102 | /** 103 | * Print decoding properties that are currently active. 104 | * 105 | * @return true; 106 | */ 107 | bool printDecodingParams(); 108 | 109 | /** 110 | * Constructor requiring only an input file root with all other parameters set to sensible defaults. 111 | * Decoding quantities will be generated if they are not specified and do not exist in the input file directory. 
112 | */ 113 | DecodingParams(); 114 | explicit DecodingParams(std::string _inFileRoot, std::string _decodingQuantFile = "", std::string _outFileRoot = "", 115 | int _jobs = 1, int _jobInd = 1, std::string _decodingModeString = "array", 116 | bool _decodingSequence = false, bool _usingCSFS = true, bool _compress = false, 117 | bool _useAncestral = false, float _skipCSFSdistance = 0.f, bool _noBatches = false, 118 | bool _doPosteriorSums = false, bool _doPerPairPosteriorMean = false, 119 | std::string _expectedCoalTimesFile = "", bool _withinOnly = false, 120 | bool _doMajorMinorPosteriorSums = false, bool _doPerPairMAP = false, 121 | std::string _mapFile = ""); 122 | 123 | /** 124 | * Minimal constructor that sets defaults for FastSMC. An error will occur if you try to use this constructor for 125 | * FastSMC == false. 126 | * 127 | * @param _inFileRoot the input file root 128 | * @param _decodingQuantFile the decoding quantities file 129 | * @param _outFileRoot the output file root 130 | * @param _fastSMC whether to run in FastSMC: if this is set to false an error will occur 131 | */ 132 | DecodingParams(std::string _inFileRoot, std::string _decodingQuantFile, std::string _outFileRoot, bool _fastSMC = true); 133 | 134 | 135 | }; 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /src/DecodingQuantities.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 
7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>. 15 | 16 | #ifndef DECODINGQUANTITIES_HPP 17 | #define DECODINGQUANTITIES_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | enum class DataType { 24 | TransitionType, 25 | States, 26 | CSFSSamples, 27 | TimeVector, 28 | SizeVector, 29 | Discretization, 30 | ExpectedTimes, 31 | CSFS, 32 | FoldedCSFS, 33 | ClassicEmission, 34 | AscertainedCSFS, 35 | FoldedAscertainedCSFS, 36 | CompressedAscertainedEmission, 37 | initialStateProb, 38 | ColumnRatios, 39 | RowRatios, 40 | Uvectors, 41 | Bvectors, 42 | Dvectors, 43 | HomozygousEmissions, 44 | None 45 | }; 46 | 47 | class DecodingQuantities 48 | { 49 | 50 | public: 51 | unsigned int states = 0u; 52 | int CSFSSamples = 0; 53 | std::vector initialStateProb; 54 | std::vector expectedTimes; 55 | std::vector discretization; 56 | std::vector timeVector; 57 | std::vector columnRatios; 58 | std::vector> classicEmissionTable; 59 | std::vector> compressedEmissionTable; 60 | std::unordered_map> Dvectors; 61 | std::unordered_map> Bvectors; 62 | std::unordered_map> Uvectors; 63 | std::unordered_map> rowRatioVectors; 64 | std::unordered_map> homozygousEmissionMap; 65 | std::vector>> CSFSmap; 66 | std::vector>> foldedCSFSmap; 67 | std::vector>> ascertainedCSFSmap; 68 | std::vector>> foldedAscertainedCSFSmap; 69 | 70 | explicit DecodingQuantities(const std::string& fileName); 71 | 72 | private: 73 | // implemented, but need to update other code 74 | // void createFromBinary(const char *fileName); 75 | void createFromGzippedText(const std::string& fileName); 76 | 77 | /** 78 | * Validate that an appropriate decoding quantities file has been provided.
This is achieved by: 79 | * 80 | * 1. Verifying the file exists 81 | * 2. Verifying the first line of the file contains exactly "TransitionType" 82 | * 83 | * @param fileName the name of the provided decoding quantities file 84 | */ 85 | void validateDecodingQuantitiesFile(const std::string& fileName); 86 | 87 | }; 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /src/FastSMC.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | #ifndef ASMC_FASTSMC_HPP 17 | #define ASMC_FASTSMC_HPP 18 | 19 | #include "Data.hpp" 20 | #include "DecodingParams.hpp" 21 | #include "HMM.hpp" 22 | 23 | namespace ASMC 24 | { 25 | 26 | class FastSMC 27 | { 28 | 29 | private: 30 | 31 | DecodingParams mParams; 32 | Data mData; 33 | HMM mHmm; 34 | 35 | public: 36 | 37 | /** 38 | * FastSMC constructor with full control over parameters, by manually specifying a DecodingParams object. 39 | * 40 | * @param params the decoding parameters 41 | */ 42 | explicit FastSMC(DecodingParams params); 43 | 44 | /** 45 | * FastSMC constructor that will set sensible defaults. If you wish to fine-tune parameters, use the constructor that 46 | * takes a DecodingParams object, which you can configure manually. 
47 | * 48 | * @param inFileRoot the input file root 49 | * @param dqFile the decoding quantities file 50 | * @param outFileRoot the output file root 51 | */ 52 | FastSMC(const std::string& inFileRoot, const std::string& dqFile, const std::string& outFileRoot); 53 | 54 | void run(); 55 | 56 | }; 57 | 58 | } // namespace ASMC 59 | 60 | #endif // ASMC_FASTSMC_HPP 61 | -------------------------------------------------------------------------------- /src/FileUtils.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "StringUtils.hpp" 25 | #include "FileUtils.hpp" 26 | #include "Types.hpp" 27 | 28 | #include 29 | #include 30 | 31 | namespace FileUtils { 32 | 33 | using std::string; 34 | using std::vector; 35 | using std::cerr; 36 | using std::endl; 37 | /* Returns true when a std::ifstream opened on `name` reports good(), i.e. the file could be opened for reading. */ 38 | bool fileExists(const std::string& name) 39 | { 40 | std::ifstream f(name.c_str()); 41 | return f.good(); 42 | } 43 | /* Opens `file` on the caller-supplied input stream with `mode`; on failure prints an error to stderr and terminates the process with exit(1). */ 44 | void openOrExit(std::ifstream &stream, const string &file, 45 | std::ios_base::openmode mode) { 46 | stream.open(file.c_str(), mode); 47 | if (!stream) { 48 | cerr << "ERROR: Unable to open file: " << file << endl; 49 | exit(1); 50 | } 51 | } 52 | /* Output counterpart of openOrExit: opens `file` on the caller-supplied output stream, or prints an error and calls exit(1). */ 53 | void openWritingOrExit(std::ofstream &stream, const string &file, 54 | std::ios_base::openmode mode) { 55 | stream.open(file.c_str(), mode); 56 | if (!stream) { 57 | cerr << "ERROR: Unable to open file for writing: " << file << endl; 58 | exit(1); 59 | } 60 | } 61 | /* Accepts an empty file name without any check; otherwise the file must be openable for reading, else an error is printed and the process exits with status 1. */ 62 | void requireEmptyOrReadable(const std::string &file) { 63 | if (file.empty()) return; 64 | std::ifstream fin; 65 | fin.open(file.c_str()); 66 | if (!fin) { 67 | cerr << "ERROR: Unable to open file: " << file << endl; 68 | exit(1); 69 | } 70 | fin.close(); 71 | } 72 | /* Applies requireEmptyOrReadable to every entry of `fileList`, in order; the first unreadable entry terminates the process. */ 73 | void requireEachEmptyOrReadable(const std::vector &fileList) { 74 | for (uint i = 0; i < fileList.size(); i++) 75 | requireEmptyOrReadable(fileList[i]); 76 | } 77 | /* Accepts an empty file name without any check; otherwise the file must be openable for writing. It is opened in append mode, so existing content is not truncated (NOTE(review): a missing file is presumably created as a side effect of the probe - confirm this is intended). */ 78 | void requireEmptyOrWriteable(const std::string &file) { 79 | if (file.empty()) return; 80 | std::ofstream fout; 81 | fout.open(file.c_str(), std::ios::out | std::ios::app); 82 | if (!fout) { 83 | cerr << "ERROR: Output file is not writeable: " << file << endl; 84 | exit(1); 85 | } 86 | fout.close(); 87 | } 88 | /* Reads the first line of `fileName` through AutoGzIfstream and returns it split on any character contained in `delimiters`. */ 89 | vector parseHeader(const string &fileName, const string &delimiters) { 90 | AutoGzIfstream fin; fin.openOrExit(fileName); 91 | string header; 92 | getline(fin, header); 93 | vector split = 
StringUtils::tokenizeMultipleDelimiters(header, delimiters); 94 | fin.close(); 95 | return split; 96 | } 97 | /* Returns the index of the header token equal to `columnName`, or -1 after printing a warning when there is none. The loop does not stop at the first hit, so with duplicate column names the last matching index is returned. */ 98 | int lookupColumnInd(const string &fileName, const string &delimiters, const string &columnName) { 99 | vector headers = parseHeader(fileName, delimiters); 100 | int columnInd = -1; 101 | for (uint c = 0; c < headers.size(); c++) 102 | if (headers[c] == columnName) 103 | columnInd = c; // first column is snp ID, treated separately 104 | if (columnInd == -1) { 105 | cerr << "WARNING: Column " << columnName << " not found in headers of " << fileName << endl; 106 | //exit(1); 107 | } 108 | return columnInd; 109 | } 110 | /* Extracts one whitespace-delimited token from `stream` and converts it with std::stod; the name suggests "nan"/"inf" tokens are expected input (NOTE(review): std::stod throws on an unparsable or empty token - confirm callers never hit end of stream here). */ 111 | double readDoubleNanInf(std::istream &stream) { 112 | string str; 113 | stream >> str; 114 | return std::stod(str); 115 | } 116 | /* Reads `file` line by line, taking the first two whitespace-delimited tokens of each line as the (FID, IID) pair and discarding the rest of that line. */ 117 | vector < std::pair > readFidIids(const string &file) { 118 | vector < std::pair > ret; 119 | AutoGzIfstream fin; 120 | fin.openOrExit(file); 121 | string FID, IID, line; 122 | while (fin >> FID >> IID) { 123 | if (FID.empty() || IID.empty()) { 124 | cerr << "ERROR: In file " << file << endl; 125 | cerr << " unable to read FID and IID; check format" << endl; 126 | exit(1); 127 | } 128 | ret.push_back(make_pair(FID, IID)); 129 | getline(fin, line); 130 | } 131 | fin.close(); 132 | return ret; 133 | } 134 | /* Counts the lines of `file` by reading it to the end with getline; the file is opened through AutoGzIfstream, so a ".gz" name is decompressed while counting. */ 135 | int AutoGzIfstream::lineCount(const std::string &file) { 136 | AutoGzIfstream fin; fin.openOrExit(file); 137 | int ctr = 0; string line; 138 | while (getline(fin, line)) 139 | ctr++; 140 | return ctr; 141 | } 142 | /* Opens `file` (exit(1) on failure) and builds the filter chain: a gzip_decompressor is pushed first when the name is longer than 3 characters and ends in ".gz", then the file stream itself. The decision is made on the file name only, not on the file content. */ 143 | void AutoGzIfstream::openOrExit(const std::string &file, std::ios_base::openmode mode) { 144 | fin.open(file.c_str(), mode); 145 | if (!fin) { 146 | cerr << "ERROR: Unable to open file: " << file << endl; 147 | exit(1); 148 | } 149 | if ((int) file.length() > 3 && file.substr(file.length() - 3) == ".gz") 150 | boost_in.push(boost::iostreams::gzip_decompressor()); 151 | boost_in.push(fin); 152 | } 153 | /* Closes the underlying file and resets the filter chain. */ 154 | void AutoGzIfstream::close() { 155 | fin.close(); 156 | 
boost_in.reset(); 157 | } 158 | /* True while the filtering input stream has not entered a failed state. */ 159 | AutoGzIfstream::operator bool() const { 160 | return !boost_in.fail(); 161 | } 162 | /* Forwards an unformatted read of `n` bytes into `s` to the filtering stream and returns *this so calls can be chained. */ 163 | AutoGzIfstream& AutoGzIfstream::read(char *s, std::streamsize n) { 164 | boost_in.read(s, n); 165 | return *this; 166 | } 167 | /* Forwards to get() on the filtering stream. */ 168 | int AutoGzIfstream::get() { 169 | return boost_in.get(); 170 | } 171 | /* Member convenience wrapper around FileUtils::readDoubleNanInf applied to the filtering stream. */ 172 | double AutoGzIfstream::readDoubleNanInf() { 173 | return FileUtils::readDoubleNanInf(boost_in); 174 | } 175 | /* Clears the error state flags of the filtering stream. */ 176 | void AutoGzIfstream::clear() { 177 | boost_in.clear(); 178 | } 179 | /* Forwards seekg to the filtering stream (NOTE(review): whether seeking works once a gzip_decompressor is in the chain is not visible here - verify against Boost.Iostreams before relying on it for ".gz" inputs). */ 180 | AutoGzIfstream& AutoGzIfstream::seekg(std::streamoff off, std::ios_base::seekdir way) { 181 | boost_in.seekg(off, way); 182 | return *this; 183 | } 184 | /* Free-function getline for AutoGzIfstream: reads one line from the filtering stream into `s` and returns the wrapper, so it can be used as a loop condition through operator bool. */ 185 | AutoGzIfstream& getline(AutoGzIfstream& in, std::string &s) { 186 | std::getline(in.boost_in, s); 187 | return in; 188 | } 189 | /* Opens `file` for writing (exit(1) on failure) and builds the filter chain: a gzip_compressor is pushed first when the name is longer than 3 characters and ends in ".gz", then the file stream itself. */ 190 | void AutoGzOfstream::openOrExit(const std::string &file, std::ios_base::openmode mode) { 191 | fout.open(file.c_str(), mode); 192 | if (!fout) { 193 | cerr << "ERROR: Unable to open file: " << file << endl; 194 | exit(1); 195 | } 196 | if ((int) file.length() > 3 && file.substr(file.length() - 3) == ".gz") 197 | boost_out.push(boost::iostreams::gzip_compressor()); 198 | boost_out.push(fout); 199 | } 200 | /* Resets the filter chain before closing the file - the reverse of the order used by AutoGzIfstream::close; presumably so that buffered or compressed output reaches the file before it is closed (confirm against Boost.Iostreams). */ 201 | void AutoGzOfstream::close() { 202 | boost_out.reset(); 203 | fout.close(); 204 | } 205 | /* Overload for stream manipulators taking and returning std::ostream& (std::endl has this signature): the manipulator is applied to the filtering stream. */ 206 | AutoGzOfstream& AutoGzOfstream::operator << (std::ostream & (*manip)(std::ostream&)) { 207 | manip(boost_out); 208 | return *this; 209 | } 210 | /* Forwards unsetf(mask) to the filtering output stream. */ 211 | void AutoGzOfstream::unsetf(std::ios_base::fmtflags mask) { 212 | boost_out.unsetf(mask); 213 | } 214 | /* True while the filtering output stream has not entered a failed state. */ 215 | AutoGzOfstream::operator bool() const { 216 | return !boost_out.fail(); 217 | } 218 | 219 | } 220 | -------------------------------------------------------------------------------- /src/FileUtils.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 
2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | 17 | #ifndef FILEUTILS_HPP 18 | #define FILEUTILS_HPP 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "StringUtils.hpp" 25 | 26 | #include 27 | 28 | namespace FileUtils { 29 | 30 | bool fileExists(const std::string& name); 31 | 32 | void openOrExit(std::ifstream &stream, const std::string &file, 33 | std::ios_base::openmode mode = std::ios::in); 34 | 35 | void openWritingOrExit(std::ofstream &stream, const std::string &file, 36 | std::ios_base::openmode mode = std::ios::out); 37 | 38 | void requireEmptyOrReadable(const std::string &file); 39 | 40 | void requireEachEmptyOrReadable(const std::vector &fileList); 41 | 42 | void requireEmptyOrWriteable(const std::string &file); 43 | 44 | std::vector parseHeader(const std::string &fileName, 45 | const std::string &delimiters); 46 | 47 | int lookupColumnInd(const std::string &fileName, const std::string &delimiters, 48 | const std::string &columnName); 49 | 50 | double readDoubleNanInf(std::istream &stream); 51 | 52 | std::vector < std::pair > readFidIids(const std::string &file); 53 | 54 | class AutoGzIfstream { 55 | boost::iostreams::filtering_istream boost_in; 56 | std::ifstream fin; 57 | 58 | public: 59 | 60 | static int lineCount(const std::string &file); 61 | 62 | void openOrExit(const std::string &file, std::ios_base::openmode mode = std::ios::in); 
63 | void close(); 64 | template AutoGzIfstream& operator >> (T &x) { 65 | boost_in >> x; 66 | return *this; 67 | } 68 | 69 | explicit operator bool() const; 70 | AutoGzIfstream& read(char *s, std::streamsize n); 71 | int get(); 72 | double readDoubleNanInf(); 73 | void clear(); 74 | AutoGzIfstream& seekg(std::streamoff off, std::ios_base::seekdir way); 75 | friend AutoGzIfstream& getline(AutoGzIfstream& in, std::string &s); 76 | }; 77 | 78 | AutoGzIfstream& getline(AutoGzIfstream& in, std::string &s); 79 | 80 | class AutoGzOfstream { 81 | boost::iostreams::filtering_ostream boost_out; 82 | std::ofstream fout; 83 | 84 | public: 85 | 86 | void openOrExit(const std::string &file, std::ios_base::openmode mode = std::ios::out); 87 | void close(); 88 | template AutoGzOfstream& operator << (const T &x) { 89 | boost_out << x; 90 | return *this; 91 | } 92 | AutoGzOfstream& operator << (std::ostream & (*manip)(std::ostream&)); 93 | void unsetf(std::ios_base::fmtflags); 94 | explicit operator bool() const; 95 | }; 96 | 97 | } 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /src/Individual.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #include "Individual.hpp" 17 | /* Constructs an individual whose two genotype vectors each hold numOfSites value-initialised entries. */ 18 | Individual::Individual(int numOfSites) 19 | { 20 | genotype1 = std::vector(numOfSites); 21 | genotype2 = std::vector(numOfSites); 22 | } 23 | /* Stores `val` at site `pos`: hap == 1 writes genotype1, every other value of `hap` writes genotype2. `pos` is used as an unchecked index, so the caller must keep it within the number of sites given to the constructor. */ 24 | void Individual::setGenotype(int_least8_t hap, int pos, bool val) 25 | { 26 | if (hap == 1) { 27 | genotype1[pos] = val; 28 | } else { 29 | genotype2[pos] = val; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/Individual.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | 17 | #ifndef INDIVIDUAL_HPP 18 | #define INDIVIDUAL_HPP 19 | 20 | #include 21 | #include 22 | /* Holds the per-site data of one individual as two parallel vectors, one per haplotype, both exposed as public members. */ 23 | class Individual { 24 | 25 | /* **************************** */ 26 | /* **************************** */ 27 | // contains individual data 28 | /* **************************** */ 29 | /* **************************** */ 30 | public: 31 | std::vector genotype1; /* per-site values written by setGenotype when hap == 1 */ 32 | std::vector genotype2; /* per-site values written by setGenotype for any other hap */ 33 | 34 | public: 35 | explicit Individual(int numOfSites = 0); /* sizes both vectors to numOfSites entries */ 36 | void setGenotype(int_least8_t hap, int pos, bool val); /* stores val at index pos of the vector selected by hap; pos is not range-checked */ 37 | 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/MemoryUtils.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | 17 | #include 18 | #include 19 | 20 | #include "MemoryUtils.hpp" 21 | #include "Types.hpp" 22 | 23 | void *ALIGNED_MALLOC(size_t size) { 24 | #ifdef USE_MKL_MALLOC 25 | void *p = mkl_malloc(size, MEM_ALIGNMENT); 26 | #else 27 | void *p = _mm_malloc(size, MEM_ALIGNMENT); 28 | #endif 29 | if (p == NULL) { 30 | std::cerr << "ERROR: Failed to allocate " << size << " bytes" << std::endl; 31 | exit(1); 32 | } else if ((uint64) p & 0xf) { 33 | std::cerr << "ERROR: Memory alignment of " << size << " bytes failed" << std::endl; 34 | exit(1); 35 | } 36 | return p; 37 | } 38 | -------------------------------------------------------------------------------- /src/MemoryUtils.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | 17 | #ifndef MEMORYUTILS_HPP 18 | #define MEMORYUTILS_HPP 19 | 20 | #include "Types.hpp" 21 | 22 | #define MEM_ALIGNMENT 64 23 | 24 | void *ALIGNED_MALLOC(size_t size); 25 | 26 | #ifdef USE_MKL_MALLOC 27 | #include 28 | #define ALIGNED_FREE mkl_free 29 | #else 30 | #include 31 | #define ALIGNED_FREE _mm_free 32 | #endif 33 | 34 | #define ALIGNED_MALLOC_DOUBLES(numDoubles) (double *) ALIGNED_MALLOC((numDoubles)*sizeof(double)) 35 | #define ALIGNED_MALLOC_FLOATS(numFloats) (float *) ALIGNED_MALLOC((numFloats)*sizeof(float)) 36 | #define ALIGNED_MALLOC_UCHARS(numUchars) (uchar *) ALIGNED_MALLOC((numUchars)*sizeof(uchar)) 37 | #define ALIGNED_MALLOC_UINTS(numUints) (uint *) ALIGNED_MALLOC((numUints)*sizeof(uint)) 38 | #define ALIGNED_MALLOC_UINT64S(numUint64s) (uint64 *) ALIGNED_MALLOC((numUint64s)*sizeof(uint64)) 39 | #define ALIGNED_MALLOC_UINT64_MASKS(numUint64_masks) (uint64_masks *) ALIGNED_MALLOC((numUint64_masks)*sizeof(uint64_masks)) 40 | #define ALIGNED_MALLOC_USHORTS(numUshorts) (ushort *) ALIGNED_MALLOC((numUshorts)*sizeof(ushort)) 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /src/StringUtils.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | //#include 25 | 26 | #include "StringUtils.hpp" 27 | #include "Types.hpp" 28 | 29 | namespace StringUtils { 30 | using std::vector; 31 | using std::string; 32 | using std::cout; 33 | using std::cerr; 34 | using std::endl; 35 | /* Parses `str` with std::stold and narrows the result with a static_cast, instead of calling the narrower standard conversion directly. */ 36 | float stof(const std::string &str) 37 | { 38 | return static_cast(std::stold(str)); 39 | } 40 | /* Same approach as stof above: parse with std::stold, then narrow with a static_cast. */ 41 | double stod(const std::string &str) 42 | { 43 | return static_cast(std::stold(str)); 44 | } 45 | /* Returns, in order of appearance, every character of `s` that also occurs in `c` (repeats are kept). */ 46 | string findDelimiters(const string &s, const string &c) { 47 | string delims; 48 | for (uint p = 0; p < s.length(); p++) 49 | if (c.find(s[p], 0) != string::npos) 50 | delims += s[p]; 51 | return delims; 52 | } 53 | // will not return blanks 54 | vector tokenizeMultipleDelimiters(const string &s, const string &c) 55 | { /* Splits `s` on any character contained in `c`. Each pass first skips a run of delimiters and then collects a run of non-delimiters, so adjacent, leading and trailing delimiters never produce empty tokens. */ 56 | uint p = 0; 57 | vector ans; 58 | string tmp; 59 | while (p < s.length()) { 60 | tmp = ""; 61 | while (p < s.length() && c.find(s[p], 0) != string::npos) 62 | p++; 63 | while (p < s.length() && c.find(s[p], 0) == string::npos) { 64 | tmp += s[p]; 65 | p++; 66 | } 67 | if (tmp != "") 68 | ans.push_back(tmp); 69 | } 70 | return ans; 71 | } 72 | /* Reports a malformed range template (the offending string and the delimiter sequence found in it) on stderr and terminates the process with exit(1). */ 73 | void rangeErrorExit(const string &str, const string &delims) { 74 | cerr << "ERROR: Invalid delimiter sequence for specifying range: " << endl; 75 | cerr << " Template string: " << str << endl; 76 | cerr << " Delimiter sequence found: " << delims << endl; 77 | cerr << "Range in must have format {start:end} with no other " << RANGE_DELIMS 78 | << " chars" << endl; 79 | exit(1); 80 | } 81 | /* A string containing none of the RANGE_DELIMS characters is returned unchanged as the only element; otherwise the delimiters found must be exactly RANGE_DELIMS, in that order, or rangeErrorExit is called. */ 82 | // basic range template: expand "{start:end}" to vector with one entry per range element 83 | // if end==start-1, will return empty 84 | vector expandRangeTemplate(const string &str) { 85 | vector ret; 86 | string delims = findDelimiters(str, RANGE_DELIMS); 87 | if (delims.empty()) 88 | ret.push_back(str); 89 | else if (delims == RANGE_DELIMS) { 90 | vector tokens = 
tokenizeMultipleDelimiters(str, RANGE_DELIMS); 91 | for (int i = 0; i < (int) str.size(); i++) 92 | if (str[i] == ':' && (str[i - 1] == '{' || str[i + 1] == '}')) 93 | rangeErrorExit(str, delims); /* a ':' directly after '{' or directly before '}' means the start or end value is missing */ 94 | int startInd = (str[0] != RANGE_DELIMS[0]), endInd = startInd + 1; /* when the string does not begin with '{', tokens[0] is the prefix and the start value is tokens[1]; otherwise the start value is tokens[0] */ 95 | string prefix, suffix; 96 | if (str[0] != RANGE_DELIMS[0]) prefix = tokens[0]; 97 | if (str[str.length() - 1] != RANGE_DELIMS[2]) suffix = tokens.back(); /* text after the closing '}' is the last token */ 98 | int start = std::stoi(tokens[startInd]), end = std::stoi(tokens[endInd]); /* NOTE(review): std::stoi throws on non-numeric bounds and nothing here catches it - confirm that is acceptable */ 99 | if (start > end + 1 || end > start + 1000000) { /* start == end + 1 is accepted and produces an empty expansion; a range reaching more than 1000000 past start is rejected */ 100 | cerr << "ERROR: Invalid range in template string: " << str << endl; 101 | cerr << " Start: " << start << endl; 102 | cerr << " End: " << end << endl; 103 | exit(1); 104 | } 105 | for (int i = start; i <= end; i++) 106 | ret.push_back(prefix + std::to_string(i) + suffix); /* one entry per integer in [start, end], both ends included */ 107 | } 108 | else 109 | rangeErrorExit(str, delims); 110 | return ret; 111 | } 112 | /* Expands every template with expandRangeTemplate and concatenates the results, preserving the input order. */ 113 | vector expandRangeTemplates(const vector &rangeTemplates) { 114 | vector expanded; 115 | for (uint i = 0; i < rangeTemplates.size(); i++) { 116 | vector range = expandRangeTemplate(rangeTemplates[i]); 117 | expanded.insert(expanded.end(), range.begin(), range.end()); 118 | } 119 | return expanded; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/StringUtils.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | 17 | #ifndef STRINGUTILS_HPP 18 | #define STRINGUTILS_HPP 19 | 20 | #include 21 | #include 22 | 23 | namespace StringUtils { 24 | 25 | /** 26 | * Convert string to float, taking account of the fact that inputs may be given 27 | * at a precision too great to be representable as a float. 28 | * 29 | * This function converts first to long double, and then explicitly performs a 30 | * static cast to narrow the output to a float. 31 | * 32 | * @param str the string to convert to a float 33 | * @return the closest float representation of the string 34 | */ 35 | float stof(const std::string &str); 36 | 37 | /** 38 | * Convert string to double, taking account of the fact that inputs may be given 39 | * at a precision too great to be representable as a double. 40 | * 41 | * This function converts first to long double, and then explicitly performs a 42 | * static cast to narrow the output to a double. 43 | * 44 | * @param str the string to convert to a double 45 | * @return the closest double representation of the string 46 | */ 47 | double stod(const std::string &str); 48 | 49 | const std::string RANGE_DELIMS = "{:}"; 50 | 51 | std::string findDelimiters(const std::string &s, const std::string &c); 52 | 53 | std::vector tokenizeMultipleDelimiters(const std::string &s, const std::string &c); 54 | std::vector expandRangeTemplate(const std::string &str); 55 | std::vector expandRangeTemplates(const std::vector &rangeTemplates); 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/Timer.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 
2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | #include "Timer.hpp" 17 | /* Records the construction time by calling update_time() once; the value returned by that first call is discarded, because prevtime is then only the default-constructed time point. */ 18 | Timer::Timer() 19 | { 20 | update_time(); 21 | } 22 | /* Shifts curtime into prevtime, samples the clock again and returns the difference between the two samples (documented in Timer.hpp as seconds). Each call therefore measures the time since the previous call, or since construction for the first explicit call. */ 23 | double Timer::update_time() 24 | { 25 | prevtime = curtime; 26 | curtime = timer_t::now(); 27 | std::chrono::duration diff = curtime - prevtime; 28 | return diff.count(); 29 | } 30 | -------------------------------------------------------------------------------- /src/Timer.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #include 17 | 18 | #ifndef TIMER_HPP 19 | #define TIMER_HPP 20 | /* Interval timer: each update_time() call returns the time elapsed since the previous call (or since construction). */ 21 | class Timer 22 | { 23 | private: 24 | using timer_t = std::chrono::system_clock; /* NOTE(review): system_clock follows the wall clock and may be adjusted while the program runs, so a measured interval can jump or go negative; consider std::chrono::steady_clock if only elapsed time is needed - confirm nothing depends on wall-clock time points */ 25 | using sys_time = std::chrono::time_point; 26 | 27 | sys_time prevtime, curtime; /* the two most recent clock samples; update_time() returns curtime - prevtime */ 28 | 29 | public: 30 | /// constructs a timer, recording the initial time 31 | Timer(); 32 | 33 | /// updates the current time and returns the time since the last update in seconds 34 | double update_time(); 35 | }; 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /src/Types.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | 17 | #ifndef TYPES_HPP 18 | #define TYPES_HPP 19 | 20 | #include 21 | #include 22 | 23 | typedef unsigned char uchar; 24 | typedef unsigned int uint; 25 | typedef unsigned short ushort; 26 | typedef uint64_t uint64; 27 | typedef int64_t int64; 28 | typedef uint64_t hash_size; 29 | 30 | struct uint64_masks { 31 | uint64 is0, is2, is9; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of ASMC, developed by Pier Francesco Palamara. 2 | 3 | # ASMC is free software: you can redistribute it and/or modify 4 | # it under the terms of the GNU General Public License as published by 5 | # the Free Software Foundation, either version 3 of the License, or 6 | # (at your option) any later version. 7 | 8 | # ASMC is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU General Public License for more details. 12 | 13 | # You should have received a copy of the GNU General Public License 14 | # along with ASMC. If not, see . 
15 | 16 | 17 | from asmc.asmc_python_bindings import BinaryDataReader 18 | from asmc.asmc_python_bindings import DecodingModeOverall 19 | from asmc.asmc_python_bindings import DecodingMode 20 | from asmc.asmc_python_bindings import DecodingReturnValues 21 | from asmc.asmc_python_bindings import DecodePairsReturnStruct 22 | from asmc.asmc_python_bindings import IbdPairDataLine 23 | from asmc.asmc_python_bindings import Individual 24 | from asmc.asmc_python_bindings import PairObservations 25 | from asmc.asmc_python_bindings import DecodingQuantities 26 | from asmc.asmc_python_bindings import DecodingParams 27 | from asmc.asmc_python_bindings import Data 28 | from asmc.asmc_python_bindings import HMM 29 | from asmc.asmc_python_bindings import FastSMC 30 | from asmc.asmc_python_bindings import ASMC 31 | 32 | 33 | # 34 | # ASMCReturnValues = collections.namedtuple( 35 | # "ASMCReturnValues", 36 | # "sumOverPairs sumOverPairs00 sumOverPairs01 sumOverPairs11") 37 | 38 | 39 | # def to_array(x): 40 | # a = list(x) 41 | # if a: 42 | # return np.array(a) 43 | # else: 44 | # return None 45 | # 46 | # 47 | # def flip_rows(a1, a2, flips): 48 | # # Swap rows according to boolean flips vector 49 | # if a1 is None or a2 is None: 50 | # return None, None 51 | # a1[flips], a2[flips] = a2[flips], a1[flips] 52 | # return a1, a2 53 | 54 | 55 | # def run(in_file_root, decoding_quant_file, out_file_root="", 56 | # mode=DecodingModeOverall.array, jobs=0, 57 | # job_index=0, skip_csfs_distance=0, 58 | # compress=False, use_ancestral=False, 59 | # posterior_sums=False, major_minor_posterior_sums=False): 60 | # ret = asmc(in_file_root=in_file_root, 61 | # decoding_quant_file=decoding_quant_file, 62 | # mode=mode, jobs=jobs, job_index=job_index, 63 | # skip_csfs_distance=skip_csfs_distance, 64 | # compress=compress, use_ancestral=use_ancestral, 65 | # posterior_sums=posterior_sums, 66 | # major_minor_posterior_sums=major_minor_posterior_sums) 67 | # sumOverPairs00, sumOverPairs11 = flip_rows( 
68 | # to_array(ret.sumOverPairs00), to_array(ret.sumOverPairs11), 69 | # ret.siteWasFlippedDuringFolding) 70 | # return ASMCReturnValues( 71 | # sumOverPairs=to_array(ret.sumOverPairs), 72 | # sumOverPairs00=sumOverPairs00, 73 | # sumOverPairs01=to_array(ret.sumOverPairs01), 74 | # sumOverPairs11=sumOverPairs11) 75 | -------------------------------------------------------------------------------- /src/hashing/ExtendHash.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #ifndef ASMC_HASHING_EXTEND_HASH_HPP 17 | #define ASMC_HASHING_EXTEND_HASH_HPP 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include "hashing/Match.hpp" 24 | 25 | /* Object for storing extension between pairs of Individuals */ 26 | class ExtendHash 27 | { 28 | 29 | boost::unordered_map extend_hash; 30 | 31 | unsigned long mWordSize; 32 | unsigned long num; 33 | 34 | bool mParHaploid; 35 | 36 | // Empty Match to insert into hash 37 | Match m; 38 | 39 | public: 40 | explicit ExtendHash(const unsigned long wordSize, const unsigned long num, const bool PAR_HAPLOID) 41 | : mWordSize(wordSize), num(num), mParHaploid(PAR_HAPLOID), m(wordSize) 42 | { 43 | } 44 | 45 | // Compute pair of Individuals from location indicator 46 | std::pair locationToPair(unsigned long loc) 47 | { 48 | const unsigned second = mParHaploid ? loc % num : 2 * (loc % num); 49 | const unsigned first = mParHaploid ? (loc - second) / num : 2 * ((loc - second / 2) / num); 50 | 51 | return std::make_pair(first, second); 52 | } 53 | 54 | // Compute location from pair of Individuals 55 | unsigned long pairToLocation(unsigned int i, unsigned int j) 56 | { 57 | if (!mParHaploid) { 58 | // round everyone down to the nearest haplotype 59 | i = (i - (i % 2)) / 2; 60 | j = (j - (j % 2)) / 2; 61 | } 62 | unsigned long loc = (i > j) ? 
j * num + i : i * num + j; 63 | return loc; 64 | } 65 | 66 | // Extend or add a given pair in the current hash 67 | // unsigned int i,j : identifiers for the two Individuals 68 | // int w : current word # to extend or add 69 | void extendPair(unsigned int i, unsigned int j, int w, const int GLOBAL_CURRENT_WORD) 70 | { 71 | m.getModifiableInterval()[0] = GLOBAL_CURRENT_WORD; 72 | // Find/extend this location in the hash 73 | auto extend_ret = extend_hash.insert(std::pair(pairToLocation(i, j), m)); 74 | (extend_ret.first->second).extend(w); 75 | } 76 | 77 | // Remove all pairs that were not extended beyond w 78 | // int w : word # to remove prior to 79 | void clearPairsPriorTo(int w, const int GLOBAL_CURRENT_WORD, const double PAR_MIN_MATCH, 80 | const std::vector& geneticPositions, HMM& hmm) 81 | { 82 | for (auto it = extend_hash.begin(); it != extend_hash.end();) { 83 | if (it->second.getInterval()[1] < w) { 84 | it->second.print(locationToPair(it->first), PAR_MIN_MATCH, geneticPositions, hmm); 85 | it = extend_hash.erase(it); 86 | } else { 87 | if (it->second.getInterval()[1] < GLOBAL_CURRENT_WORD) 88 | it->second.addGap(); 89 | it++; 90 | } 91 | } 92 | } 93 | 94 | // Remove all pairs that were not extended beyond w 95 | // int w : word # to remove prior to 96 | void extendAllPairsTo(int w) 97 | { 98 | for (auto it = extend_hash.begin(); it != extend_hash.end(); it++) 99 | it->second.getModifiableInterval()[1] = w; 100 | } 101 | 102 | // Remove all pairs 103 | // int w : word # to remove prior to 104 | void clearAllPairs(const double PAR_MIN_MATCH, const std::vector& geneticPositions, HMM& hmm) 105 | { 106 | for (auto it = extend_hash.begin(); it != extend_hash.end();) { 107 | it->second.print(locationToPair(it->first), PAR_MIN_MATCH, geneticPositions, hmm); 108 | it = extend_hash.erase(it); 109 | } 110 | } 111 | 112 | std::size_t size() const 113 | { 114 | return extend_hash.size(); 115 | } 116 | 117 | unsigned long getWordSize() const 118 | { 119 | return 
mWordSize; 120 | } 121 | 122 | }; 123 | 124 | #endif // ASMC_HASHING_EXTEND_HASH_HPP 125 | -------------------------------------------------------------------------------- /src/hashing/Individuals.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #ifndef ASMC_HASHING_INDIVIDUALS_HPP 17 | #define ASMC_HASHING_INDIVIDUALS_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "boost/dynamic_bitset.hpp" 25 | 26 | class Individuals 27 | { 28 | unsigned mIdNum; 29 | unsigned long mWordSize = 64ul; 30 | unsigned long mNumReadAhead = 10ul; 31 | 32 | std::vector> mHap{mNumReadAhead, boost::dynamic_bitset<>(mWordSize, 0ul)}; 33 | 34 | public: 35 | explicit Individuals(const unsigned long wordSize, const unsigned long numReadAhead, const unsigned idNum) 36 | : mIdNum{idNum}, mWordSize{wordSize}, mNumReadAhead{numReadAhead} 37 | { 38 | assert(wordSize > 0ul); 39 | assert(numReadAhead > 0ul); 40 | 41 | mHap.resize(numReadAhead); 42 | std::fill(mHap.begin(), mHap.end(), boost::dynamic_bitset<>(wordSize, 0ul)); 43 | } 44 | 45 | void clear(const int w) 46 | { 47 | assert(w >= 0); 48 | mHap.at(w % mNumReadAhead).reset(); 49 | } 50 | 51 | void setMarker(const int w, const std::size_t bit) 52 | { 53 | assert(w >= 0); 54 | assert(bit < mWordSize); 55 | mHap.at(w % mNumReadAhead).set(bit); 56 | } 57 | 58 | unsigned long getWordHash(const int w) 59 | { 60 | assert(w >= 0); 61 | return mHap.at(w % mNumReadAhead).to_ulong(); 62 | } 63 | 64 | std::string getWordString(const int w) 65 | { 66 | assert(w >= 0); 67 | std::string buffer; 68 | boost::to_string(mHap.at(w % mNumReadAhead), buffer); 69 | return buffer; 70 | } 71 | 72 | unsigned int getIdNum() const 73 | { 74 | return mIdNum; 75 | } 76 | 77 | unsigned long getWordSize() const 78 | { 79 | return mHap.front().size(); 80 | } 81 | 82 | unsigned long getNumReadAhead() const 83 | { 84 | return mHap.size(); 85 | } 86 | }; 87 | 88 | #endif // ASMC_HASHING_INDIVIDUALS_HPP 89 | -------------------------------------------------------------------------------- /src/hashing/Match.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 
2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | #ifndef ASMC_HASHING_MATCH_HPP 17 | #define ASMC_HASHING_MATCH_HPP 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #include "hashing/Utils.hpp" 24 | #include "HMM.hpp" 25 | 26 | /** 27 | * Match object that does \\todo not clear to me what this does... 28 | */ 29 | class Match 30 | { 31 | private: 32 | std::array mInterval = {0, 0}; 33 | unsigned long mWordSize; 34 | unsigned mGaps = 0u; 35 | 36 | public: 37 | explicit Match(const unsigned long wordSize, const int i = 0) : mInterval{i, i}, mWordSize{wordSize} 38 | { 39 | } 40 | 41 | // pair : identifiers for the corresponding Individuals in all_ind 42 | void print(std::pair p, const double PAR_MIN_MATCH, const std::vector& geneticPositions, 43 | HMM& hmm) 44 | { 45 | const int intWordSize = static_cast(mWordSize); 46 | double mlen = asmc::cmBetween(mInterval[0], mInterval[1], geneticPositions, intWordSize); 47 | if (mlen >= PAR_MIN_MATCH) { 48 | const int from = mInterval[0] * intWordSize; 49 | const int to = mInterval[1] * intWordSize + intWordSize - 1; 50 | 51 | if(hmm.getDecodingParams().hashingOnly){ 52 | unsigned int jInd = p.first / 2; 53 | unsigned int iInd = p.second / 2; 54 | PairObservations observation = hmm.makePairObs(p.first % 2 == 0 ? 1 : 2, jInd, p.second % 2 == 0 ? 
1 : 2, iInd); 55 | hmm.writePairIBD(observation, from, to); 56 | } else { 57 | hmm.decodeFromHashing(p.first, p.second, from, to); 58 | } 59 | } 60 | } 61 | 62 | void extend(const int w) 63 | { 64 | mInterval[1] = std::max(w, mInterval[1]); 65 | } 66 | 67 | void addGap() 68 | { 69 | mGaps++; 70 | } 71 | 72 | const std::array& getInterval() const 73 | { 74 | return mInterval; 75 | } 76 | 77 | std::array& getModifiableInterval() 78 | { 79 | return mInterval; 80 | } 81 | 82 | unsigned int getGaps() const 83 | { 84 | return mGaps; 85 | } 86 | 87 | unsigned long getWordSize() const 88 | { 89 | return mWordSize; 90 | } 91 | }; 92 | 93 | #endif // ASMC_HASHING_MATCH_HPP 94 | -------------------------------------------------------------------------------- /src/hashing/SeedHash.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #ifndef ASMC_HASHING_SEEDHASH_HPP 17 | #define ASMC_HASHING_SEEDHASH_HPP 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "hashing/ExtendHash.hpp" 25 | #include "hashing/Individuals.hpp" 26 | #include "Types.hpp" 27 | 28 | /* Object for storing initial word seeds */ 29 | class SeedHash 30 | { 31 | 32 | using ind_vec = std::vector; 33 | 34 | boost::unordered_map> seed_hash; 35 | // Empty vector to insert into the seed hash 36 | std::vector vec; 37 | // Iterator for testing insertion of elements 38 | // std::pair > > >, bool> seed_ret; 40 | public: 41 | void insertIndividuals(unsigned int i, hash_size word) 42 | { 43 | auto seed_ret = seed_hash.insert(std::pair>(word, vec)); 44 | (seed_ret.first->second).push_back(i); 45 | } 46 | void clear() 47 | { 48 | seed_hash.clear(); 49 | } 50 | int size() 51 | { 52 | return seed_hash.size(); 53 | } 54 | 55 | // Generate a new hash for this vector of Individualss 56 | static unsigned long subHash(ExtendHash* e, std::vector v, int w, ind_vec all_ind, const int MAX_seeds, 57 | const int jobID, const int jobs, const unsigned w_i, const unsigned w_j, 58 | const unsigned windowSize, const int GLOBAL_READ_WORDS, int& GLOBAL_SKIPPED_WORDS, 59 | const int GLOBAL_CURRENT_WORD, const bool is_j_above_diag) 60 | { 61 | SeedHash cur_sh; 62 | // seed the next word from this subset of Individualss 63 | for (unsigned int& i : v) { 64 | cur_sh.insertIndividuals(i, all_ind[i].getWordHash(w)); 65 | } 66 | // recursion: 67 | return cur_sh.extendAllPairs(e, w, all_ind, MAX_seeds, jobID, jobs, w_i, w_j, windowSize, GLOBAL_READ_WORDS, 68 | GLOBAL_SKIPPED_WORDS, GLOBAL_CURRENT_WORD, is_j_above_diag); 69 | } 70 | 71 | // Extend/save all pairs in the current hash 72 | // ExtendHash * e : Pointer to ExtendHash which will be called for each pair 73 | // returns : number of pairs evaluated 74 | unsigned long extendAllPairs(ExtendHash* e, int w, ind_vec all_ind, const int MAX_seeds, const int jobID, 75 | const int jobs, const 
unsigned w_i, const unsigned w_j, const unsigned windowSize, 76 | const int GLOBAL_READ_WORDS, int& GLOBAL_SKIPPED_WORDS, const int GLOBAL_CURRENT_WORD, 77 | const bool is_j_above_diag) 78 | { 79 | unsigned long tot_pairs = 0; 80 | for (auto it = seed_hash.begin(); it != seed_hash.end(); ++it) { 81 | 82 | // *** As long as the # of pairs is high, generate a sub-hash for the next word 83 | // *** Only store pairs of Individuals that have collision in a small hash 84 | // *** Extend only to the haplotypes that seeded here 85 | if (MAX_seeds != 0 && it->second.size() > static_cast(MAX_seeds) && w + 1 < GLOBAL_READ_WORDS) { 86 | // recursively generate a sub-hash 87 | // IMPORTANT: if we run out of buffered words then this seed does not get analyzed 88 | if (w + 1 < GLOBAL_READ_WORDS) { 89 | tot_pairs += subHash(e, it->second, w + 1, all_ind, MAX_seeds, jobID, jobs, w_i, w_j, windowSize, 90 | GLOBAL_READ_WORDS, GLOBAL_SKIPPED_WORDS, GLOBAL_CURRENT_WORD, is_j_above_diag); 91 | } else { 92 | GLOBAL_SKIPPED_WORDS++; 93 | } 94 | } else { 95 | // tot_pairs += it->second.size() * (it->second.size() - 1) / 2; 96 | for (auto i = 0ul; i < it->second.size(); i++) { 97 | for (auto ii = i + 1ul; ii < it->second.size(); ii++) { 98 | 99 | unsigned int ind_i = std::max(it->second[i], it->second[ii]); 100 | unsigned int ind_j = std::min(it->second[i], it->second[ii]); 101 | 102 | // for the last job only 103 | if (jobID == jobs) { 104 | if (all_ind[ind_i].getIdNum() >= (w_i - 1) * windowSize && 105 | all_ind[ind_j].getIdNum() >= (w_j - 1) * windowSize) { 106 | if (all_ind[ind_j].getIdNum() < 107 | (w_j - 1) * windowSize + (all_ind[ind_i].getIdNum() - (w_i - 1) * windowSize)) { 108 | e->extendPair(ind_j, ind_i, w, GLOBAL_CURRENT_WORD); 109 | tot_pairs++; 110 | } 111 | } 112 | } 113 | 114 | // for all other jobs 115 | else if ((all_ind[ind_i].getIdNum() >= (w_i - 1) * windowSize && 116 | all_ind[ind_i].getIdNum() < w_i * windowSize) && 117 | (all_ind[ind_j].getIdNum() >= (w_j - 1) * 
windowSize && 118 | all_ind[ind_j].getIdNum() < w_j * windowSize)) { 119 | if (is_j_above_diag && all_ind[ind_j].getIdNum() < (w_j - 1) * windowSize + (all_ind[ind_i].getIdNum() - 120 | (w_i - 1) * windowSize)) { 121 | e->extendPair(ind_j, ind_i, w, GLOBAL_CURRENT_WORD); 122 | tot_pairs++; 123 | } else if (!is_j_above_diag && 124 | all_ind[ind_j].getIdNum() >= 125 | (w_j - 1) * windowSize + (all_ind[ind_i].getIdNum() - (w_i - 1) * windowSize)) { 126 | e->extendPair(ind_j, ind_i, w, GLOBAL_CURRENT_WORD); 127 | tot_pairs++; 128 | } 129 | } 130 | } 131 | } 132 | } 133 | } 134 | return tot_pairs; 135 | } 136 | }; 137 | 138 | #endif // ASMC_HASHING_SEEDHASH_HPP 139 | -------------------------------------------------------------------------------- /src/hashing/Utils.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #include "hashing/Utils.hpp" 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | double asmc::cmBetween(const int w1, const int w2, const std::vector& geneticPositions, const int wordSize) 23 | { 24 | assert(!geneticPositions.empty()); 25 | assert(wordSize * w1 < geneticPositions.size()); 26 | assert(w1 >= 0); 27 | assert(w2 >= 0); 28 | assert(w2 >= w1); 29 | 30 | const std::size_t start = wordSize * w1; 31 | const std::size_t end = std::min(wordSize * w2 + wordSize - 1, geneticPositions.size() - 1ul); 32 | 33 | return 100.0 * (geneticPositions[end] - geneticPositions[start]); 34 | } 35 | -------------------------------------------------------------------------------- /src/hashing/Utils.hpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #ifndef ASMC_HASHING_UTILS_HPP 17 | #define ASMC_HASHING_UTILS_HPP 18 | 19 | #include 20 | 21 | namespace asmc 22 | { 23 | 24 | /** 25 | * Convenience function to compute genetic distance between two words (start of w1 and end of w2) 26 | * 27 | * @param w1 the first word 28 | * @param w2 the second word 29 | * @param geneticPositions vector of genetic positions 30 | * @param wordSize number of locations per word 31 | * @return the number of centimorgans between start of w1 and end of w2 32 | */ 33 | double cmBetween(int w1, int w2, const std::vector& geneticPositions, int wordSize); 34 | 35 | } // namespace asmc 36 | 37 | #endif // ASMC_HASHING_UTILS_HPP 38 | -------------------------------------------------------------------------------- /test/cli_interface_test.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import subprocess 4 | import sys 5 | 6 | 7 | def list_files(startpath): 8 | for root, dirs, files in os.walk(startpath): 9 | level = root.replace(startpath, '').count(os.sep) 10 | indent = ' ' * 4 * level 11 | print('{}{}/'.format(indent, os.path.basename(root))) 12 | subindent = ' ' * 4 * (level + 1) 13 | for f in files: 14 | print('{}{}'.format(subindent, f)) 15 | 16 | 17 | def test_regession(asmc_exe): 18 | """ 19 | Run the ASMC regression test, which will test the output of an example ASMC run with the cached result in 20 | data/regression_test_original.gz. 
21 | 22 | :param asmc_exe: path to the ASMC executable 23 | """ 24 | 25 | script_dir = os.path.realpath(os.path.dirname(__file__)) 26 | base_dir = os.path.realpath(os.path.join(script_dir, '..', '..')) 27 | old_file = os.path.join(script_dir, 'data', 'regression_test_original.gz') 28 | print('-' * 35) 29 | print('script dir', script_dir) 30 | print('base dir', base_dir) 31 | print('asmc exe', asmc_exe) 32 | print('-' * 35) 33 | assert os.path.isfile(old_file) 34 | 35 | # Old file contents are before OxfordRSE involvement in ASMC 36 | with gzip.open(old_file, 'rt') as gz_f: 37 | old_lines = gz_f.readlines() 38 | 39 | # New file contents are the result of running the example with the current ASMC source 40 | decoding_file = os.path.join(base_dir, 'FILES', 'DECODING_QUANTITIES', '30-100-2000_CEU.decodingQuantities.gz') 41 | in_file_root = os.path.join(base_dir, 'FILES', 'EXAMPLE', 'exampleFile.n300.array') 42 | 43 | subprocess.call([ 44 | asmc_exe, 45 | '--decodingQuantFile', decoding_file, 46 | '--inFileRoot', in_file_root, 47 | '--posteriorSums', 48 | ]) 49 | 50 | new_file = os.path.join(base_dir, 'FILES', 'EXAMPLE', 'exampleFile.n300.array.1-1.sumOverPairs.gz') 51 | assert os.path.isfile(new_file), \ 52 | "No output file found at {}. 
Did the executable run as expected?".format(new_file) 53 | 54 | with gzip.open(new_file, 'rt') as gz_f: 55 | new_lines = gz_f.readlines() 56 | 57 | assert len(old_lines) == len(new_lines), \ 58 | "The outputs have different numbers of lines ({} and {})".format(len(old_lines), len(new_lines)) 59 | 60 | for i, (old, new) in enumerate(zip(old_lines, new_lines)): 61 | assert old == new, "The outputs first differ at line {}".format(i) 62 | 63 | print('\n' + '#' * 35) 64 | print('# Regression test passed #') 65 | print('# All {} output lines identical #'.format(len(old_lines))) 66 | print('#' * 35 + '\n') 67 | 68 | 69 | if __name__ == "__main__": 70 | assert len(sys.argv) == 2, "Usage: {} /path/to/ASMC_exe".format(sys.argv[0]) 71 | 72 | path_to_asmc = sys.argv[1] 73 | assert os.path.isfile(path_to_asmc) and 'ASMC_exe' in path_to_asmc, \ 74 | "Expected path to ASMC executable, but got {}".format(path_to_asmc) 75 | 76 | test_regession(path_to_asmc) 77 | -------------------------------------------------------------------------------- /test/test_ASMC.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
#include "catch.hpp"

// NOTE(review): the header names of the bare #include lines in this file, and the template /
// cast arguments further down (std::vector, static_cast), are missing from this listing --
// restore them from the repository before compiling.
#include
#include

#include "ASMC.hpp"

#include
#include
#include

// Decodes three pairs on the array example data and pins posterior means and MAP values.
TEST_CASE("test ASMC decodePairsArray", "[ASMC]")
{
  ASMC::ASMC asmc(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
                  ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");

  // NOTE(review): setStorePerPairMap() is called twice here; the second call looks like a
  // leftover or a typo for another setter -- confirm.
  asmc.setStorePerPairMap();
  asmc.setStorePerPairPosterior();
  asmc.setStorePerPairPosteriorMean();
  asmc.setStorePerPairMap();

  // presumably indA[k] is paired with indB[k]; the size check below expects three pairs
  std::vector indA = {1, 2, 3};
  std::vector indB = {2, 3, 4};
  asmc.decodePairs(indA, indB);
  // `auto` (not `auto&`) takes a copy if getRefOfResults returns a reference
  auto result = asmc.getRefOfResults();

  SECTION("test decode pair summarize")
  {
    REQUIRE(result.perPairIndices.size() == 3ul);

    // 0.1% margin in this test as the results can vary between pure and avx/sse
    REQUIRE(result.perPairPosteriorMeans(0, 0) == Approx(15968.91016f).margin(15968.91016f * 0.001f));
    REQUIRE(result.perPairPosteriorMeans(1, 8) == Approx(27963.49805f).margin(27963.49805f * 0.001f));
    REQUIRE(result.perPairPosteriorMeans(2, 29) == Approx(48573.32812f).margin(48573.32812f * 0.001f));

    REQUIRE(result.perPairMAPs(0, 0) == 29);
    REQUIRE(result.perPairMAPs(1, 1234) == 65);
    REQUIRE(result.perPairMAPs(2, 7) == 33);

    // Check that the posteriors actually sum to one
    for (Eigen::Index idx = 0ll; idx < result.perPairPosteriors.size(); ++idx) {
      REQUIRE(result.perPairPosteriors.at(idx).colwise().sum().isOnes(1e-2));
    }
  }
}

// Same checks as above on the sequence example data, constructed with the "sequence" mode string.
TEST_CASE("test ASMC decodePairsSequence", "[ASMC]")
{
  ASMC::ASMC asmc(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300",
                  ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz", "", "sequence");

  // NOTE(review): duplicated setStorePerPairMap() call, as in the array test above
  asmc.setStorePerPairMap();
  asmc.setStorePerPairPosterior();
  asmc.setStorePerPairPosteriorMean();
  asmc.setStorePerPairMap();

  std::vector indA = {5, 6};
  std::vector indB = {7, 8};
  asmc.decodePairs(indA, indB);
  auto result = asmc.getRefOfResults();

  SECTION("test decode pair summarize")
  {
    REQUIRE(result.perPairIndices.size() == 2ul);

    // 0.1% margin in this test as the results can vary between pure and avx/sse
    REQUIRE(result.perPairPosteriorMeans(0, 0) == Approx(801.06647f).margin(801.06647f * 0.001f));
    REQUIRE(result.perPairPosteriorMeans(1, 8) == Approx(17953.60938f).margin(17953.60938f * 0.001f));

    REQUIRE(result.perPairMAPs(0, 0) == 16);
    REQUIRE(result.perPairMAPs(1, 1234) == 61);

    // Check that the posteriors actually sum to one
    for (Eigen::Index idx = 0ll; idx < result.perPairPosteriors.size(); ++idx) {
      REQUIRE(result.perPairPosteriors.at(idx).colwise().sum().isOnes(1e-2));
    }
  }
}

// Spot-checks two entries of the expected-times vector exposed by ASMC.
TEST_CASE("test other get methods", "[ASMC]")
{
  ASMC::ASMC asmc(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
                  ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");

  const std::vector& expectedTimes = asmc.getExpectedTimes();
  CHECK(expectedTimes.at(0) == Approx(14.999777896567f).margin(1e-5));
  CHECK(expectedTimes.at(4) == Approx(135.698150766900f).margin(1e-5));
}

// Decodes the same pairs twice -- once over the full data and once restricted to [lo, hi) --
// and checks that the restricted result has hi - lo columns and matches the corresponding
// columns of the full result.
TEST_CASE("test from and to", "[ASMC]")
{
  ASMC::ASMC asmc_full(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
                       ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");

  ASMC::ASMC asmc_part(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
                       ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");

  std::vector indA = {1, 2, 3, 4, 5};
  std::vector indB = {6, 7, 8, 9, 10};

  asmc_full.setStorePerPairMap();
  asmc_full.setStorePerPairPosterior();
  asmc_full.setStorePerPairPosteriorMean();
  asmc_full.setStoreSumOfPosterior();

  asmc_part.setStorePerPairMap();
  asmc_part.setStorePerPairPosterior();
  asmc_part.setStorePerPairPosteriorMean();
  asmc_part.setStoreSumOfPosterior();

  asmc_full.decodePairs(indA, indB);
  auto result_full = asmc_full.getRefOfResults();
  // NOTE(review): setStorePerPairMap() was already called on asmc_part above
  asmc_part.setStorePerPairMap();

  const unsigned lo = 1000;
  const unsigned hi = 1100;
  const unsigned long windowSize = static_cast(hi - lo);
  // NOTE(review): meaning of the trailing 0.5f argument is not visible here -- see ASMC::decodePairs
  asmc_part.decodePairs(indA, indB, lo, hi, 0.5f);
  auto result_part = asmc_part.getRefOfResults();

  SECTION("test part sizes are correct")
  {
    // Row counts are unchanged by the restriction; column counts shrink to the window
    REQUIRE(result_part.perPairPosteriors.front().rows() == result_full.perPairPosteriors.front().rows());
    REQUIRE(result_part.perPairPosteriors.front().cols() == windowSize);

    REQUIRE(result_part.sumOfPosteriors.rows() == result_full.sumOfPosteriors.rows());
    REQUIRE(result_part.sumOfPosteriors.cols() == windowSize);

    REQUIRE(result_part.perPairPosteriorMeans.rows() == result_full.perPairPosteriorMeans.rows());
    REQUIRE(result_part.perPairPosteriorMeans.cols() == windowSize);

    REQUIRE(result_part.minPosteriorMeans.cols() == windowSize);
    REQUIRE(result_part.argminPosteriorMeans.cols() == windowSize);

    REQUIRE(result_part.perPairMAPs.rows() == result_full.perPairMAPs.rows());
    REQUIRE(result_part.perPairMAPs.cols() == windowSize);

    REQUIRE(result_part.minMAPs.cols() == windowSize);
    REQUIRE(result_part.argminMAPs.cols() == windowSize);
  }

  SECTION("test parts match full analysis")
  {
    // Columns [lo, lo + windowSize) of the full posteriors must equal the restricted posteriors
    for(auto i = 0ul; i < indA.size(); ++i) {
      REQUIRE(
        (result_full.perPairPosteriors.at(i).middleCols(static_cast(lo), windowSize)
          - result_part.perPairPosteriors.at(i)).abs().maxCoeff() < 1e-6
      );
    }
  }
}
--------------------------------------------------------------------------------
/test/test_HMM.cpp:
--------------------------------------------------------------------------------
// This file is part of ASMC, developed by Pier Francesco Palamara.
//
// ASMC is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ASMC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ASMC. If not, see <https://www.gnu.org/licenses/>.

#include "catch.hpp"

// NOTE(review): the header names of the two bare #include lines, and the template arguments
// of decodeResult / decodeSummary below, are missing from this listing -- restore them from
// the repository before compiling.
#include
#include

#include "HMM.hpp"

// Exercises HMM decoding on the array example data. Each SECTION starts from a freshly
// constructed HMM (Catch re-runs the test case body for every section).
TEST_CASE("test hmm functions", "[HMM]")
{
  DecodingParams params(
      ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
      ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");

  Data data(params);
  HMM hmm(data, params);

  // The sections below index individuals up to 16, so the data set must be large enough
  REQUIRE(data.individuals.size() > 20);

  SECTION("test decode pair summarize")
  {
    PairObservations pairObs = hmm.makePairObs(1, 0, 2, 0);
    std::vector> decodeResult = hmm.decode(pairObs);
    std::pair, std::vector> decodeSummary = hmm.decodeSummarize(pairObs);
    // check that the MAP and posterior mean are the same length
    REQUIRE(decodeSummary.first.size() == decodeSummary.second.size());
    REQUIRE(decodeSummary.first.size() == decodeResult[0].size());
  }

  SECTION("test decode pair")
  {
    // Per the expectations below, a pair of distinct individuals adds four entries to the
    // batch buffer and an individual paired with itself adds one
    REQUIRE(hmm.getBatchBuffer().size() == 0);
    hmm.decodePair(0, 9);
    REQUIRE(hmm.getBatchBuffer().size() == 4);
    hmm.decodePair(1, 1);
    REQUIRE(hmm.getBatchBuffer().size() == 5);
  }

  SECTION("test decode pairs")
  {
    // Same two pairs as above, submitted through the vector overload
    REQUIRE(hmm.getBatchBuffer().size() == 0);
    hmm.decodePairs({ 0, 1 }, { 9, 1 });
    REQUIRE(hmm.getBatchBuffer().size() == 5);
  }

  SECTION("test finishDecoding")
  {
    // finishDecoding must leave the batch buffer empty
    REQUIRE(hmm.getBatchBuffer().size() == 0);
    hmm.decodePair(0, 9);
    REQUIRE(hmm.getBatchBuffer().size() == 4);
    hmm.finishDecoding();
    REQUIRE(hmm.getBatchBuffer().size() == 0);
  }

  SECTION("test fill up buffer")
  {
    // default batch size is 64
    // 16 pairs at four buffer entries each reach the batch size exactly
    for (int i = 1; i <= 64 / 4; ++i) {
      hmm.decodePair(0, i);
    }

    // buffer should be empty now
    REQUIRE(hmm.getBatchBuffer().size() == 0);
  }
}
--------------------------------------------------------------------------------
/test/test_binary_data_reader.cpp:
--------------------------------------------------------------------------------
// This file is part of ASMC, developed by Pier Francesco Palamara.
//
// ASMC is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ASMC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ASMC. If not, see <https://www.gnu.org/licenses/>.
#include "catch.hpp"

#include "BinaryDataReader.hpp"

// Pins the default field values of IbdPairDataLine and the text produced by toString().
TEST_CASE("IbdPairDataLine default member test", "[BinaryDataReader]")
{
  IbdPairDataLine line;

  REQUIRE(line.ind1FamId == "0_00");
  REQUIRE(line.ind1Id == "0_00");
  REQUIRE(line.ind1Hap == -1);
  REQUIRE(line.ind2FamId == "0_00");
  REQUIRE(line.ind2Id == "0_00");
  REQUIRE(line.ind2Hap == -1);
  REQUIRE(line.chromosome == -1);
  REQUIRE(line.ibdStart == -1);
  REQUIRE(line.ibdEnd == -1);
  REQUIRE(line.lengthInCentimorgans == -1.f);
  REQUIRE(line.ibdScore == -1.f);
  REQUIRE(line.postEst == -1.f);
  REQUIRE(line.mapEst == -1.f);

  // With the four float fields at their defaults, toString() emits only the first nine columns
  REQUIRE(line.toString() == "0_00\t0_00\t-1\t0_00\t0_00\t-1\t-1\t-1\t-1");

  line.ibdScore = 0.1;
  line.lengthInCentimorgans = 1.2;
  line.postEst = 2.3;
  line.mapEst = 3.4;

  // Once set, they are appended in the order length, score, posterior estimate, MAP estimate
  REQUIRE(line.toString() == "0_00\t0_00\t-1\t0_00\t0_00\t-1\t-1\t-1\t-1\t1.2\t0.1\t2.3\t3.4");
}

// Reads a stored binary IBD file that includes decoding output: checks the first two records
// field by field and the total number of records.
TEST_CASE("BinaryDataReader real data test with decoding", "[BinaryDataReader]")
{
  BinaryDataReader dataReader(ASMC_DATA_DIR "/testing/fastsmc/binary_output.bibd.gz");

  IbdPairDataLine line1 = dataReader.getNextLine();
  REQUIRE(line1.ind1FamId == "1_94");
  REQUIRE(line1.ind1Id == "1_94");
  REQUIRE(line1.ind1Hap == 1);
  REQUIRE(line1.ind2FamId == "1_104");
  REQUIRE(line1.ind2Id == "1_104");
  REQUIRE(line1.ind2Hap == 1);
  REQUIRE(line1.chromosome == 1);
  REQUIRE(line1.ibdStart == 8740);
  REQUIRE(line1.ibdEnd == 1660011);
  REQUIRE(line1.lengthInCentimorgans == Approx(1.86962f).epsilon(1e-5));
  REQUIRE(line1.ibdScore == Approx(0.5073708f).epsilon(1e-5));
  REQUIRE(line1.postEst == Approx(215.6709f).epsilon(1e-5));
  REQUIRE(line1.mapEst == Approx(24.99997f).epsilon(1e-5));

  IbdPairDataLine line2 = dataReader.getNextLine();
  REQUIRE(line2.ind1FamId == "1_94");
  REQUIRE(line2.ind1Id == "1_94");
  REQUIRE(line2.ind1Hap == 1);
  REQUIRE(line2.ind2FamId == "1_104");
  REQUIRE(line2.ind2Id == "1_104");
  REQUIRE(line2.ind2Hap == 1);
  REQUIRE(line2.chromosome == 1);
  REQUIRE(line2.ibdStart == 1679626);
  REQUIRE(line2.ibdEnd == 1679626);
  REQUIRE(line2.lengthInCentimorgans == Approx(0.f).epsilon(1e-5));
  REQUIRE(line2.ibdScore == Approx(0.02249517f).epsilon(1e-5));
  REQUIRE(line2.postEst == Approx(25544.65f).epsilon(1e-5));
  REQUIRE(line2.mapEst == Approx(24.99997f).epsilon(1e-5));

  // Two records were consumed above; drain the rest to count the file's total
  int numLinesRead = 2;
  while (dataReader.moreLinesInFile()) {
    IbdPairDataLine line = dataReader.getNextLine();
    numLinesRead++;
  }

  REQUIRE(numLinesRead == 1574);
}

// Reads a stored binary IBD file produced in hashing-only mode: the four float fields are
// absent from the file and must come back as their -1 defaults.
TEST_CASE("BinaryDataReader real data test with only hashing", "[BinaryDataReader]")
{
  BinaryDataReader dataReader(ASMC_DATA_DIR "/testing/fastsmc/binary_output_hashing.bibd.gz");

  // Skip the first four records; the fifth is the one checked
  dataReader.getNextLine();
  dataReader.getNextLine();
  dataReader.getNextLine();
  dataReader.getNextLine();

  IbdPairDataLine line5 = dataReader.getNextLine();
  REQUIRE(line5.ind1FamId == "1_35");
  REQUIRE(line5.ind1Id == "1_35");
  REQUIRE(line5.ind1Hap == 1);
  REQUIRE(line5.ind2FamId == "1_99");
  REQUIRE(line5.ind2Id == "1_99");
  REQUIRE(line5.ind2Hap == 2);
  REQUIRE(line5.chromosome == 1);
  REQUIRE(line5.ibdStart == 8740);
  REQUIRE(line5.ibdEnd == 1572363);
  REQUIRE(line5.lengthInCentimorgans == -1.f); // default value when not in file
  REQUIRE(line5.ibdScore == -1.f);             // default value when not in file
  REQUIRE(line5.postEst == -1.f);              // default value when not in file
  REQUIRE(line5.mapEst == -1.f);               // default value when not in file

  // Five records were consumed above; drain the rest to count the file's total
  int numLinesRead = 5;
  while (dataReader.moreLinesInFile()) {
    IbdPairDataLine line = dataReader.getNextLine();
    numLinesRead++;
  }

  REQUIRE(numLinesRead == 495);
}
-------------------------------------------------------------------------------- /test/test_decoding_params.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | #include "catch.hpp" 17 | 18 | #include 19 | 20 | #include "DecodingParams.hpp" 21 | 22 | TEST_CASE("test DecodingParams", "[DecodingParams]") 23 | { 24 | std::string inFileRoot = ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array"; 25 | std::string decodingQuantFile = ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz"; 26 | 27 | SECTION("test array folded") { 28 | DecodingParams params(inFileRoot, decodingQuantFile); 29 | REQUIRE(params.decodingMode == DecodingMode::arrayFolded); 30 | REQUIRE(params.compress == false); 31 | } 32 | 33 | SECTION("test sequence folded") { 34 | DecodingParams params(inFileRoot, decodingQuantFile, 35 | "", // _outFileRoot 36 | 1, // _jobs 37 | 1, // _jobInd 38 | "sequence", // _decodingModeString, override default 39 | false, // _decodingSequence 40 | true, // _usingCSFS 41 | true, // _compress, override default 42 | false, // _useAncestral 43 | nan("") // _skipCSFSdistance, override default 44 | ); 45 | REQUIRE(params.decodingMode == DecodingMode::sequenceFolded); 46 | REQUIRE(params.compress == 
true); 47 | } 48 | 49 | SECTION("test sequence") { 50 | DecodingParams params(inFileRoot, decodingQuantFile, 51 | "", // _outFileRoot 52 | 1, // _jobs 53 | 1, // _jobInd 54 | "sequence", // _decodingModeString, override default 55 | false, // _decodingSequence 56 | true, // _usingCSFS 57 | false, // _compress 58 | true // _useAncestral, override default 59 | ); 60 | REQUIRE(params.decodingMode == DecodingMode::sequence); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /test/test_decoding_quantities.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 
15 | 16 | #include "catch.hpp" 17 | 18 | #include 19 | 20 | #include "DecodingQuantities.hpp" 21 | 22 | using Catch::Matchers::Contains; 23 | 24 | TEST_CASE("test validate decoding quantities file", "[DecodingQuantities]") 25 | { 26 | std::string nonExistentDecodingQuantitiesFile = ASMC_DATA_DIR "/random_nonexistent_file.txt"; 27 | std::string goodDecodingQuantitiesFile = ASMC_DATA_DIR "/testing/asmc/decoding_quantities_good.txt"; 28 | std::string badDecodingQuantitiesFile = ASMC_DATA_DIR "/testing/asmc/decoding_quantities_bad.txt"; 29 | 30 | SECTION("test nonexistent file") 31 | { 32 | CHECK_THROWS_WITH(DecodingQuantities{nonExistentDecodingQuantitiesFile}, 33 | Contains("random_nonexistent_file.txt does not exist")); 34 | } 35 | 36 | SECTION("test good file") 37 | { 38 | CHECK_NOTHROW(DecodingQuantities{goodDecodingQuantitiesFile}); 39 | } 40 | 41 | SECTION("test bad file") 42 | { 43 | CHECK_THROWS_WITH(DecodingQuantities{badDecodingQuantitiesFile}, 44 | Contains("decoding_quantities_bad.txt does not seem to contain the correct information") && 45 | Contains("but instead found \"this file does not start with")); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /test/test_hashing.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 
12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | #include "catch.hpp" 17 | 18 | #include "hashing/ExtendHash.hpp" 19 | #include "hashing/Individuals.hpp" 20 | #include "hashing/Match.hpp" 21 | #include "hashing/SeedHash.hpp" 22 | #include "hashing/Utils.hpp" 23 | 24 | TEST_CASE("ExtendHash", "[hashing]") 25 | { 26 | ExtendHash e(4ul, 2ul, true); 27 | REQUIRE(e.size() == 0ul); 28 | REQUIRE(e.getWordSize() == 4ul); 29 | 30 | //todo: test ExtendHash 31 | } 32 | 33 | TEST_CASE("individuals", "[hashing]") 34 | { 35 | Individuals ind(8ul, 3ul, 5u); 36 | 37 | REQUIRE(ind.getIdNum() == 5u); 38 | REQUIRE(ind.getWordSize() == 8ul); 39 | REQUIRE(ind.getNumReadAhead() == 3ul); 40 | 41 | // Check up to 10 - but internally we're just going 0-1-2-0-1-2-0-1-2-0 42 | for (auto i = 0; i < 10; ++i) { 43 | REQUIRE(ind.getWordHash(i) == 0ul); 44 | REQUIRE(ind.getWordString(i) == "00000000"); 45 | } 46 | 47 | ind.setMarker(0, 0); 48 | ind.setMarker(1, 2); 49 | 50 | ind.setMarker(2, 2); 51 | ind.setMarker(2, 3); 52 | 53 | REQUIRE(ind.getWordHash(0) == 1ul); 54 | REQUIRE(ind.getWordString(0) == "00000001"); 55 | 56 | REQUIRE(ind.getWordHash(1) == 4ul); 57 | REQUIRE(ind.getWordString(1) == "00000100"); 58 | 59 | REQUIRE(ind.getWordHash(2) == 12ul); 60 | REQUIRE(ind.getWordString(2) == "00001100"); 61 | 62 | // Clear 2 63 | ind.clear(2); 64 | REQUIRE(ind.getWordHash(2) == 0ul); 65 | REQUIRE(ind.getWordString(2) == "00000000"); 66 | 67 | // Clear 1 by clearing 4 68 | ind.clear(4); 69 | REQUIRE(ind.getWordHash(1) == 0ul); 70 | REQUIRE(ind.getWordString(1) == "00000000"); 71 | } 72 | 73 | TEST_CASE("match", "[hashing]") 74 | { 75 | SECTION("default construction") 76 | { 77 | Match m(4ul); 78 | REQUIRE(m.getWordSize() == 4ul); 79 | REQUIRE(m.getGaps() == 0u); 80 | REQUIRE(m.getInterval()[0] == 0); 81 | REQUIRE(m.getInterval()[1] == 0); 82 | 83 | m.addGap(); 84 | m.addGap(); 85 | REQUIRE(m.getGaps() == 2u); 86 | 87 
| m.extend(5); 88 | REQUIRE(m.getInterval()[1] == 5); 89 | } 90 | 91 | SECTION("explicit constructor") 92 | { 93 | Match m(4, 7); 94 | REQUIRE(m.getWordSize() == 4ul); 95 | REQUIRE(m.getGaps() == 0u); 96 | REQUIRE(m.getInterval()[0] == 7); 97 | REQUIRE(m.getInterval()[1] == 7); 98 | 99 | m.extend(5); 100 | REQUIRE(m.getInterval()[1] == 7); 101 | 102 | m.extend(8); 103 | REQUIRE(m.getInterval()[1] == 8); 104 | } 105 | 106 | SECTION("print method") 107 | { 108 | //TODO: this method is harder to test because it requires access to an HMM instance 109 | } 110 | } 111 | 112 | TEST_CASE("SeedHash", "[hashing]") 113 | { 114 | SeedHash s; 115 | REQUIRE(s.size() == 0ul); 116 | 117 | //todo: test SeedHash 118 | } 119 | 120 | TEST_CASE("utils", "[hashing]") 121 | { 122 | SECTION("cmBetween") 123 | { 124 | std::vector genPos = {0.00402186f, 0.0388124f, 0.0567817f, 0.0668489f, 0.0915063f, 0.12783f, 0.198618f, 125 | 0.199045f, 0.250093f, 0.259338f, 0.293267f, 0.294899f, 0.316173f, 0.353332f, 126 | 0.354553f, 0.357123f, 0.359118f, 0.395468f, 0.41749f, 0.421739f, 0.453347f, 127 | 0.471302f, 0.535031f, 0.548733f, 0.574022f, 0.604538f, 0.620419f}; 128 | 129 | SECTION("both words are inside vector") 130 | { 131 | const int wordSize = 4; 132 | const int w1 = 0; 133 | const int w2 = 3; 134 | const int w3 = 5; 135 | 136 | REQUIRE(asmc::cmBetween(w1, w2, genPos, wordSize) == 100.0 * (genPos.at(15) - genPos.at(0))); 137 | REQUIRE(asmc::cmBetween(w1, w3, genPos, wordSize) == 100.0 * (genPos.at(23) - genPos.at(0))); 138 | REQUIRE(asmc::cmBetween(w2, w3, genPos, wordSize) == 100.0 * (genPos.at(23) - genPos.at(12))); 139 | } 140 | 141 | SECTION("second word overflows vector") 142 | { 143 | const int wordSize = 4; 144 | const int w1 = 0; 145 | const int w2 = 1; 146 | const int w3 = 10; 147 | 148 | REQUIRE(asmc::cmBetween(w1, w3, genPos, wordSize) == 100.0 * (genPos.back() - genPos.at(0))); 149 | REQUIRE(asmc::cmBetween(w2, w3, genPos, wordSize) == 100.0 * (genPos.back() - genPos.at(4))); 150 | 
} 151 | } 152 | } -------------------------------------------------------------------------------- /test/test_regression.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | #define CATCH_CONFIG_MAIN 17 | #include "catch.hpp" 18 | 19 | #include "ASMC.hpp" 20 | #include "FileUtils.hpp" 21 | 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | TEST_CASE("test ASMC regression", "[HMM_regression]") 28 | { 29 | // we only needed to set doPosteriorSums to true, but because C++ does 30 | // not have keyword arguments we need to go through everything 31 | DecodingParams params(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array", 32 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz", 33 | "", // _outFileRoot 34 | 1, // _jobs 35 | 1, // _jobInd 36 | "array", // _decodingModeString 37 | false, // _decodingSequence 38 | true, // _usingCSFS 39 | false, // _compress 40 | false, // _useAncestral 41 | 0.f, // _skipCSFSdistance 42 | false, // _noBatches 43 | true // _doPosteriorSums 44 | ); 45 | 46 | std::vector indToDecodeA = {1ul, 2ul, 3ul}; 47 | std::vector indToDecodeB = {2ul, 3ul, 4ul}; 48 | 49 | ASMC::ASMC asmc(params); 50 | asmc.setStorePerPairPosteriorMean(); 51 | asmc.setStorePerPairMap(); 
52 | asmc.decodePairs(indToDecodeA, indToDecodeB); 53 | 54 | auto res = asmc.getRefOfResults(); 55 | 56 | SECTION("regression test per pair posterior means") 57 | { 58 | 59 | CHECK(res.perPairPosteriorMeans.rows() == 3ll); 60 | CHECK(res.perPairPosteriorMeans.cols() == 6760ll); 61 | 62 | std::string regressionFile = ASMC_DATA_DIR "/testing/asmc/regression/regression.perPairPosteriorMeans.gz"; 63 | FileUtils::AutoGzIfstream fin; 64 | fin.openOrExit(regressionFile); 65 | 66 | for (auto rowIdx = 0ul; rowIdx < indToDecodeA.size(); ++rowIdx) { 67 | std::string line; 68 | getline(fin, line); 69 | std::istringstream iss(line); 70 | std::vector rowAsFloats = {std::istream_iterator(iss), std::istream_iterator()}; 71 | 72 | CHECK(rowAsFloats.size() == 6760ul); 73 | for (auto colIdx = 0ul; colIdx < rowAsFloats.size(); ++colIdx) { 74 | CHECK(res.perPairPosteriorMeans(rowIdx, colIdx) == Approx(rowAsFloats.at(colIdx)).epsilon(0.001)); 75 | } 76 | } 77 | } 78 | 79 | SECTION("regression test per pair MAP") 80 | { 81 | CHECK(res.perPairMAPs.rows() == 3ll); 82 | CHECK(res.perPairMAPs.cols() == 6760ll); 83 | 84 | std::string regressionFile = ASMC_DATA_DIR "/testing/asmc/regression/regression.perPairMAP.gz"; 85 | FileUtils::AutoGzIfstream fin; 86 | fin.openOrExit(regressionFile); 87 | 88 | for (auto rowIdx = 0ul; rowIdx < indToDecodeA.size(); ++rowIdx) { 89 | std::string line; 90 | getline(fin, line); 91 | std::istringstream iss(line); 92 | std::vector rowAsInts = {std::istream_iterator(iss), std::istream_iterator()}; 93 | 94 | CHECK(rowAsInts.size() == 6760ul); 95 | for (auto colIdx = 0ul; colIdx < rowAsInts.size(); ++colIdx) { 96 | CHECK(res.perPairMAPs(rowIdx, colIdx) == rowAsInts.at(colIdx)); 97 | } 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /test/test_regression.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | import unittest 4 | 5 | import 
numpy as np 6 | 7 | from asmc.asmc import * 8 | 9 | data_dir = pathlib.Path(__file__).resolve().parent.parent / 'ASMC_data' 10 | if not data_dir.exists(): 11 | print(f'ERROR. {data_dir} does not exist. Did you clone ASMC recursively with submodules?') 12 | sys.exit() 13 | 14 | 15 | class TestASMCRegression(unittest.TestCase): 16 | 17 | def setUp(self): 18 | in_file_root = str(data_dir / 'examples' / 'asmc' / 'exampleFile.n300.array') 19 | decoding_quant_file = str(data_dir / 'decoding_quantities' / '30-100-2000_CEU.decodingQuantities.gz') 20 | 21 | params = DecodingParams(in_file_root, decoding_quant_file, do_posterior_sums=True) 22 | 23 | self.asmc = ASMC(params) 24 | self.asmc.set_store_per_pair_posterior_mean(True) 25 | self.asmc.set_store_per_pair_map(True) 26 | 27 | def test_regression(self): 28 | self.asmc.decode_pairs([1, 2, 3], [2, 3, 4]) 29 | res = self.asmc.get_ref_of_results() 30 | 31 | self.assertEqual(res.per_pair_posterior_means.shape[0], 3) 32 | self.assertEqual(res.per_pair_posterior_means.shape[1], 6760) 33 | 34 | existing_post = np.loadtxt( 35 | str(data_dir / 'testing' / 'asmc' / 'regression' / 'regression.perPairPosteriorMeans.gz')) 36 | self.assertEqual(np.allclose(res.per_pair_posterior_means, existing_post, rtol=0.001), True) 37 | 38 | existing_map = np.loadtxt( 39 | str(data_dir / 'testing' / 'asmc' / 'regression' / 'regression.perPairMAP.gz')) 40 | self.assertEqual(np.allclose(res.per_pair_MAPs, existing_map, rtol=0.001), True) 41 | 42 | 43 | class TestFastSMCRegression(unittest.TestCase): 44 | 45 | def setUp(self): 46 | # Create decoding params object with required options 47 | self.params = DecodingParams() 48 | self.params.decodingQuantFile = str(data_dir / 'decoding_quantities' / '10-20-2000_CEU.decodingQuantities.gz') 49 | self.params.inFileRoot = str(data_dir / 'examples' / 'fastsmc' / 'example') 50 | self.params.outFileRoot = '/tmp/FastSMCresults' 51 | self.params.decodingModeString = 'array' 52 | self.params.usingCSFS = True 53 | 
self.params.batchSize = 32 54 | self.params.recallThreshold = 3 55 | self.params.min_m = 1.5 56 | self.params.hashing = True 57 | self.params.FastSMC = True 58 | self.params.BIN_OUT = False 59 | self.params.outputIbdSegmentLength = True 60 | self.params.time = 50 61 | self.params.noConditionalAgeEstimates = True 62 | self.params.doPerPairMAP = True 63 | self.params.doPerPairPosteriorMean = True 64 | self.params.useKnownSeed = True 65 | 66 | assert self.params.validateParamsFastSMC() 67 | 68 | fast_smc = FastSMC(self.params) 69 | fast_smc.run() 70 | 71 | def test_regression(self): 72 | original_text = np.loadtxt(str(data_dir / 'testing' / 'fastsmc' / 'regression' / 'regression_output.ibd.gz'), 73 | usecols=(7, 8, 9, 10, 11)) 74 | generated_text = np.loadtxt(self.params.outFileRoot + ".1.1.FastSMC.ibd.gz", usecols=(7, 8, 9, 10, 11)) 75 | 76 | self.assertEqual(original_text.shape, generated_text.shape) 77 | self.assertEqual(np.allclose(original_text, generated_text, rtol=0.001), True) 78 | 79 | 80 | class TestFastSMCRegressionWithoutHashing(unittest.TestCase): 81 | 82 | def setUp(self): 83 | # Create decoding params object with required options 84 | self.params = DecodingParams() 85 | self.params.decodingQuantFile = str(data_dir / 'decoding_quantities' / '10-20-2000_CEU.decodingQuantities.gz') 86 | self.params.inFileRoot = str(data_dir / 'examples' / 'fastsmc' / 'example') 87 | self.params.outFileRoot = '/tmp/FastSMCresults' 88 | self.params.decodingModeString = 'array' 89 | self.params.usingCSFS = True 90 | self.params.batchSize = 32 91 | self.params.recallThreshold = 3 92 | self.params.min_m = 1.5 93 | self.params.hashing = False 94 | self.params.FastSMC = True 95 | self.params.BIN_OUT = False 96 | self.params.outputIbdSegmentLength = True 97 | self.params.time = 50 98 | self.params.noConditionalAgeEstimates = True 99 | self.params.doPerPairMAP = True 100 | self.params.doPerPairPosteriorMean = True 101 | self.params.jobInd = 7 102 | self.params.jobs = 25 103 | 
self.params.useKnownSeed = True 104 | 105 | assert self.params.validateParamsFastSMC() 106 | 107 | fast_smc = FastSMC(self.params) 108 | fast_smc.run() 109 | 110 | def test_regression(self): 111 | original_text = np.loadtxt( 112 | str(data_dir / 'testing' / 'fastsmc' / 'regression' / 'regression_output_no_hashing.ibd.gz'), 113 | usecols=(7, 8, 9, 10, 11)) 114 | generated_text = np.loadtxt(self.params.outFileRoot + ".7.25.FastSMC.ibd.gz", usecols=(7, 8, 9, 10, 11)) 115 | 116 | self.assertEqual(original_text.shape, generated_text.shape) 117 | self.assertEqual(np.allclose(original_text, generated_text, rtol=0.001), True) 118 | -------------------------------------------------------------------------------- /test/test_unit_decoding_params.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | import unittest 4 | 5 | from asmc.asmc import DecodingParams, DecodingMode 6 | 7 | data_dir = pathlib.Path(__file__).resolve().parent.parent / 'ASMC_data' 8 | if not data_dir.exists(): 9 | print(f'ERROR. {data_dir} does not exist. 
Did you clone ASMC recursively with submodules?') 10 | sys.exit() 11 | 12 | 13 | class TestDecodingParams(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.inFileRoot = str(data_dir / 'examples' / 'asmc' / 'exampleFile.n300.array') 17 | self.decodingQuantFile = str(data_dir / 'decoding_quantities' / '30-100-2000_CEU.decodingQuantities.gz') 18 | 19 | def test_array_folded(self): 20 | params = DecodingParams(self.inFileRoot, self.decodingQuantFile) 21 | self.assertEqual(params.decodingMode, DecodingMode.arrayFolded) 22 | self.assertEqual(params.compress, False) 23 | 24 | def test_sequence_folded(self): 25 | params = DecodingParams(self.inFileRoot, self.decodingQuantFile, 26 | compress=True, skip_CSFS_distance=float('nan'), 27 | decoding_mode_string="sequence") 28 | self.assertEqual(params.decodingMode, DecodingMode.sequenceFolded) 29 | self.assertEqual(params.compress, True) 30 | 31 | def test_sequence(self): 32 | params = DecodingParams(self.inFileRoot, self.decodingQuantFile, 33 | decoding_mode_string="sequence", 34 | use_ancestral=True) 35 | self.assertEqual(params.decodingMode, DecodingMode.sequence) 36 | 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /test/unit_tests.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of ASMC, developed by Pier Francesco Palamara. 2 | // 3 | // ASMC is free software: you can redistribute it and/or modify 4 | // it under the terms of the GNU General Public License as published by 5 | // the Free Software Foundation, either version 3 of the License, or 6 | // (at your option) any later version. 7 | // 8 | // ASMC is distributed in the hope that it will be useful, 9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | // GNU General Public License for more details. 
12 | // 13 | // You should have received a copy of the GNU General Public License 14 | // along with ASMC. If not, see . 15 | 16 | 17 | // This tells Catch to provide a main() - only do this in one cpp file 18 | #define CATCH_CONFIG_MAIN 19 | #include "catch.hpp" 20 | 21 | #include 22 | #include 23 | 24 | #include "StringUtils.hpp" 25 | 26 | TEST_CASE("test string conversions", "[StringUtils]") { 27 | 28 | std::string str_1_0 = "1.0"; 29 | std::string str_1e0 = "1.0E0"; 30 | std::string str_minus1_0 = "-1.0"; 31 | 32 | // Not representable as float or double 33 | std::string too_small_str = "3.20676899524985E-310"; 34 | long double too_small_ld = 3.20676899524985E-310L; 35 | 36 | // Acceptable non-numbers 37 | std::string nan = "NAN"; 38 | std::string inf = "INF"; 39 | 40 | // Not representable at all 41 | std::string way_too_small = "1.23E-1000000000"; 42 | 43 | // Not convertible 44 | std::string greeting = "hello"; 45 | 46 | // Check stof 47 | CHECK(StringUtils::stof(str_1_0) == 1.f); 48 | CHECK(StringUtils::stof(str_1e0) == 1.f); 49 | CHECK(StringUtils::stof(str_minus1_0) == -1.f); 50 | CHECK(StringUtils::stof(too_small_str) == static_cast(too_small_ld)); 51 | CHECK(std::isnan(StringUtils::stof(nan))); 52 | CHECK(std::isinf(StringUtils::stof(inf))); 53 | CHECK_THROWS_AS(StringUtils::stof(way_too_small), std::out_of_range); 54 | CHECK_THROWS_AS(StringUtils::stof(greeting), std::invalid_argument); 55 | 56 | // Check stod 57 | CHECK(StringUtils::stod(str_1_0) == 1.0); 58 | CHECK(StringUtils::stod(str_1e0) == 1.0); 59 | CHECK(StringUtils::stod(str_minus1_0) == -1.0); 60 | CHECK(StringUtils::stod(too_small_str) == static_cast(too_small_ld)); 61 | CHECK(std::isnan(StringUtils::stod(nan))); 62 | CHECK(std::isinf(StringUtils::stod(inf))); 63 | CHECK_THROWS_AS(StringUtils::stod(way_too_small), std::out_of_range); 64 | CHECK_THROWS_AS(StringUtils::stod(greeting), std::invalid_argument); 65 | } 66 | 67 | TEST_CASE("test SOME coverage", "[coverage]") { 68 | 69 | auto s = 
StringUtils::findDelimiters("this;string;has;five;semi;colons", ";"); 70 | 71 | CHECK(s == ";;;;;"); 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "asmc-fastsmc", 3 | "version": "1.2", 4 | "dependencies": [ 5 | "boost-algorithm", 6 | "boost-dynamic-bitset", 7 | "boost-iostreams", 8 | "boost-math", 9 | "boost-program-options", 10 | "boost-unordered", 11 | "eigen3", 12 | "fmt", 13 | "range-v3", 14 | "zlib" 15 | ], 16 | "builtin-baseline": "f14984af3738e69f197bf0e647a8dca12de92996", 17 | "overrides": [ 18 | { "name": "boost-algorithm", "version": "1.80.0#1" }, 19 | { "name": "boost-dynamic-bitset", "version": "1.80.0#1" }, 20 | { "name": "boost-iostreams", "version": "1.80.0#1" }, 21 | { "name": "boost-math", "version": "1.80.0#1" }, 22 | { "name": "boost-program-options", "version": "1.80.0#1" }, 23 | { "name": "boost-unordered", "version": "1.80.0#1" }, 24 | { "name": "eigen3", "version": "3.3.9" }, 25 | { "name": "fmt", "version": "7.1.3#5" }, 26 | { "name": "range-v3", "version": "0.12.0#1" } 27 | ] 28 | } --------------------------------------------------------------------------------