├── .clang-format
├── .clang-tidy
├── .github
└── workflows
│ ├── macos-unit.yml
│ ├── python-package.yml
│ ├── sync.yml
│ ├── ubuntu-asan.yml
│ ├── ubuntu-coverage.yml
│ ├── ubuntu-no-sse.yml
│ ├── ubuntu-python.yml
│ ├── ubuntu-regression.yml
│ └── ubuntu-unit.yml
├── .gitignore
├── .gitmodules
├── .readthedocs.yaml
├── CMakeLists.txt
├── COPYING
├── LICENSE
├── PyPI_README.md
├── README.md
├── RELEASE_NOTES.md
├── asmc
└── asmc
├── cmake
├── AutodetectVcpkgToolchainFile.cmake
├── CheckDataModule.cmake
├── FindGMP.cmake
└── SIMD.cmake
├── cpp_example
├── FastSMC_example.sh
└── FastSMC_example_multiple_jobs.sh
├── docs
├── Makefile
├── asmc.md
├── asmc_python.md
├── conf.py
├── fastsmc.md
├── fastsmc_python.md
├── index.rst
├── make.bat
├── pages
│ ├── asmc.rst
│ ├── asmc_python.rst
│ ├── fastsmc.rst
│ ├── fastsmc_python.rst
│ ├── quickstart_developer.rst
│ └── quickstart_user.rst
├── quickstart_developer.md
├── quickstart_user.md
└── requirements.txt
├── exe
├── main.cpp
├── main_convertBinary.cpp
└── main_fastsmc.cpp
├── notebooks
├── asmc-minimal.ipynb
├── asmc.ipynb
├── asmc_w_decodingquant.ipynb
├── fastsmc-minimal.ipynb
└── fastsmc.ipynb
├── setup.py
├── src
├── ASMC.cpp
├── ASMC.hpp
├── AvxDefinitions.hpp
├── BinaryDataReader.hpp
├── Data.cpp
├── Data.hpp
├── DecodePairsReturnStruct.hpp
├── DecodingParams.cpp
├── DecodingParams.hpp
├── DecodingQuantities.cpp
├── DecodingQuantities.hpp
├── FastSMC.cpp
├── FastSMC.hpp
├── FileUtils.cpp
├── FileUtils.hpp
├── HMM.cpp
├── HMM.hpp
├── HmmUtils.cpp
├── HmmUtils.hpp
├── Individual.cpp
├── Individual.hpp
├── MemoryUtils.cpp
├── MemoryUtils.hpp
├── StringUtils.cpp
├── StringUtils.hpp
├── Timer.cpp
├── Timer.hpp
├── Types.hpp
├── __init__.py
├── hashing
│ ├── ExtendHash.hpp
│ ├── Individuals.hpp
│ ├── Match.hpp
│ ├── SeedHash.hpp
│ ├── Utils.cpp
│ └── Utils.hpp
└── pybind.cpp
├── test
├── catch.hpp
├── cli_interface_test.py
├── test_ASMC.cpp
├── test_HMM.cpp
├── test_binary_data_reader.cpp
├── test_decoding_params.cpp
├── test_decoding_quantities.cpp
├── test_fastsmc_regression.cpp
├── test_hashing.cpp
├── test_hmm_utils.cpp
├── test_regression.cpp
├── test_regression.py
├── test_unit_asmc.py
├── test_unit_decoding_params.py
└── unit_tests.cpp
└── vcpkg.json
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | BasedOnStyle: LLVM
3 | AllowShortFunctionsOnASingleLine: None
4 | AllowShortIfStatementsOnASingleLine: Never
5 | AllowShortLoopsOnASingleLine: false
6 | ColumnLimit: 120
7 | BreakBeforeBraces: Linux
8 | PointerAlignment: Left
9 |
10 | ...
11 |
--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
1 | ---
2 | Checks: '*,-android-*,-bugprone-bool-pointer-implicit-conversion,-bugprone-exception-escape,-bugprone-infinite-loop,-bugprone-signed-char-misuse,-cert-dcl16-c,-cert-dcl37-c,-cert-dcl50-cpp,-cert-dcl51-cpp,-cert-dcl54-cpp,-cert-dcl59-cpp,-cert-env33-c,-cert-err09-cpp,-cert-err61-cpp,-cert-fio38-c,-cert-mem57-cpp,-cert-msc30-c,-cert-msc32-c,-cert-oop11-cpp,-cert-oop57-cpp,-cert-oop58-cpp,-cert-pos44-c,-clang-analyzer-*,-cppcoreguidelines-avoid-magic-numbers,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-type-cstyle-cast,-darwin-*,-fuchsia-*,-google-*,google-default-arguments,google-explicit-constructor,google-runtime-operator,-hicpp-*,hicpp-exception-baseclass,hicpp-multiway-paths-covered,hicpp-signed-bitwise,-linuxkernel-*,-llvm-*,-llvmlibc-*,-misc-definitions-in-headers,-misc-non-private-member-variables-in-classes,-misc-unused-alias-decls,-misc-unused-parameters,-misc-unused-using-decls,-modernize-use-trailing-return-type,-objc-*,-openmp-exception-escape,-readability-braces-around-statements,-readability-else-after-return,-readability-function-size,-readability-identifier-naming,-readability-implicit-bool-conversion,-readability-isolate-declaration,-readability-magic-numbers,-readability-named-parameter,-readability-qualified-auto,-readability-redundant-access-specifiers,-readability-redundant-member-init,-readability-redundant-preprocessor,-readability-simplify-boolean-expr,-readability-uppercase-literal-suffix,-zircon-*'
3 | WarningsAsErrors: '-*'
4 | HeaderFilterRegex: ''
5 | FormatStyle: none
6 |
--------------------------------------------------------------------------------
/.github/workflows/macos-unit.yml:
--------------------------------------------------------------------------------
1 | name: "Unit tests: macOS"
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - '**'
10 | workflow_dispatch:
11 |
12 | jobs:
13 |
14 | build-and-test:
15 | name: Unit tests on macOS
16 | runs-on: ${{ matrix.os }}
17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }}
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | include:
22 | - os: macos-13
23 | - os: macos-14
24 |
25 | steps:
26 |
27 | - name: checkout repo & submodules
28 | uses: actions/checkout@v4
29 | with:
30 | submodules: true
31 | fetch-depth: 0
32 |
33 | - name: cache vcpkg installed packages
34 | uses: actions/cache@v4
35 | id: cache
36 | with:
37 | path: |
38 | vcpkg/
39 | build_dir/vcpkg_installed/
40 | key: ${{ runner.os }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
41 |
42 | - name: install dependencies
43 | run: |
44 | brew install libomp llvm pkg-config
45 |
46 | - name: make build directory
47 | run: mkdir -p build_dir
48 |
49 | - name: cmake configure
50 | run: cmake .. -DCMAKE_CXX_COMPILER=$(brew --prefix llvm)/bin/clang++ -DCMAKE_C_COMPILER=$(brew --prefix llvm)/bin/clang
51 | working-directory: build_dir
52 |
53 | - name: cmake build
54 | run: cmake --build . --parallel 3 --target ASMC_unit_tests
55 | working-directory: build_dir
56 |
57 | - name: cmake test
58 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure
59 | working-directory: build_dir
60 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | name: Build python wheels
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | release:
8 | types:
9 | - published
10 | workflow_dispatch:
11 |
12 | jobs:
13 | build_wheels_cloud:
14 | name: Build wheels on ${{ matrix.os }}
15 | runs-on: ${{ matrix.os }}
16 | if: github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC'
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | include:
21 | - os: ubuntu-24.04
22 | arch: auto64
23 | py-vers: cp39-* cp310-* cp311-* cp312-* cp313-*
24 | before-all: |
25 | dnf -y groupinstall "Development Tools"
26 | dnf -y install git
27 | extra-env: ""
28 | mdt: ""
29 | - os: macos-13
30 | arch: x86_64
31 | py-vers: cp39-* cp310-* cp311-* cp312-* cp313-*
32 | before-all: brew install cmake libomp llvm pkg-config
33 | extra-env: CC="$(brew --prefix llvm)/bin/clang" CXX="$(brew --prefix llvm)/bin/clang++" HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1
34 | mdt: 13
35 | - os: macos-14
36 | arch: arm64
37 | py-vers: cp39-* cp310-* cp311-* cp312-* cp313-*
38 | before-all: brew install cmake libomp llvm pkg-config
39 | extra-env: CC="$(brew --prefix llvm)/bin/clang" CXX="$(brew --prefix llvm)/bin/clang++" HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK=1
40 | mdt: 14
41 |
42 | env:
43 | CIBW_BUILD: ${{ matrix.py-vers }}
44 | CIBW_SKIP: cp3*-musllinux_*
45 | CIBW_ARCHS: ${{ matrix.arch }}
46 | CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28
47 | CIBW_BEFORE_ALL: ${{ matrix.before-all }}
48 | CIBW_BEFORE_BUILD: pip install --upgrade pip setuptools wheel ninja numpy cython
49 | CIBW_ENVIRONMENT: ${{ matrix.extra-env }}
50 | MACOSX_DEPLOYMENT_TARGET: ${{ matrix.mdt }}
51 |
52 | steps:
53 |
54 | - name: checkout repo & submodules
55 | uses: actions/checkout@v4
56 | with:
57 | submodules: true
58 | fetch-depth: 0
59 |
60 | - name: Build wheels
61 | uses: pypa/cibuildwheel@v2.19.2
62 |
63 | - uses: actions/upload-artifact@v4
64 | with:
65 | name: wheels-cloud-${{ matrix.os }}
66 | path: ./wheelhouse/*.whl
67 | retention-days: 1
68 |
69 | upload_all:
70 | name: Upload to PyPI
71 | needs: [build_wheels_cloud]
72 | runs-on: ubuntu-latest
73 | if: ${{ github.event_name == 'release' && github.event.action == 'published' && github.repository == 'PalamaraLab/ASMC' }}
74 |
75 | steps:
76 | - name: Download wheels from cloud runners
77 | uses: actions/download-artifact@v4
78 | with:
79 | pattern: wheels-cloud-*
80 | merge-multiple: true
81 | path: wheels
82 |
83 | - uses: pypa/gh-action-pypi-publish@v1.10.1
84 | with:
85 | user: __token__
86 | password: ${{ secrets.PYPI_TOKEN }}
87 | packages_dir: wheels/
88 |
--------------------------------------------------------------------------------
/.github/workflows/sync.yml:
--------------------------------------------------------------------------------
1 | name: Sync
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | private-to-public:
10 | if: github.repository == 'PalamaraLab/ASMC_dev'
11 | runs-on: ubuntu-latest
12 | steps:
13 | - name: Checkout private repo
14 | uses: actions/checkout@v4
15 | with:
16 | fetch-depth: 0
17 | persist-credentials: false
18 |
19 | - name: Mirror main to public repo
20 | run: |
21 | remote_repo="https://fcooper8472:${{ secrets.DEPLOY_ACCESS_TOKEN }}@github.com/PalamaraLab/ASMC.git"
22 | git fetch "${remote_repo}" main
23 | if ! git diff --quiet HEAD FETCH_HEAD; then
24 | git push "${remote_repo}" HEAD:main --follow-tags --force
25 | echo "Changes detected and pushed to public repo."
26 | else
27 | echo "No changes detected. No push needed."
28 | fi
29 |
30 | public-to-private:
31 | if: github.repository == 'PalamaraLab/ASMC'
32 | runs-on: ubuntu-latest
33 | steps:
34 | - name: Checkout public repo
35 | uses: actions/checkout@v4
36 | with:
37 | fetch-depth: 0
38 | persist-credentials: false
39 |
40 | - name: Mirror main to private repo
41 | run: |
42 | remote_repo="https://fcooper8472:${{ secrets.DEPLOY_ACCESS_TOKEN }}@github.com/PalamaraLab/ASMC_dev.git"
43 | git fetch "${remote_repo}" main
44 | if ! git diff --quiet HEAD FETCH_HEAD; then
45 | git push "${remote_repo}" HEAD:main --follow-tags --force
46 | echo "Changes detected and pushed to private repo."
47 | else
48 | echo "No changes detected. No push needed."
49 | fi
50 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu-asan.yml:
--------------------------------------------------------------------------------
1 | name: Ubuntu asan
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - '**'
10 | workflow_dispatch:
11 |
12 | jobs:
13 |
14 | build-and-test:
15 | name: Unit tests with address sanitizer on Ubuntu
16 | runs-on: ubuntu-22.04
17 | env:
18 | CXX: clang++-14
19 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }}
20 |
21 | steps:
22 |
23 | - name: checkout repo & submodules
24 | uses: actions/checkout@v3
25 | with:
26 | submodules: true
27 | fetch-depth: 0
28 |
29 | - name: cache vcpkg installed packages
30 | uses: actions/cache@v4
31 | id: cache
32 | with:
33 | path: |
34 | vcpkg/
35 | build_dir/vcpkg_installed/
36 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
37 |
38 | - name: install openmp for LLVM compiler
39 | run: sudo apt install libomp-dev
40 |
41 | - name: make build directory
42 | run: mkdir -p build_dir
43 |
44 | - name: cmake configure
45 | run: cmake .. -DASMC_MEMCHECK:BOOL=TRUE
46 | working-directory: build_dir
47 |
48 | - name: cmake build
49 | run: cmake --build . --parallel 2 --target ASMC_unit_tests
50 | working-directory: build_dir
51 |
52 | - name: cmake test
53 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure
54 | working-directory: build_dir
55 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu-coverage.yml:
--------------------------------------------------------------------------------
1 | name: Ubuntu coverage
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | workflow_dispatch:
8 |
9 | jobs:
10 |
11 | build-and-test:
12 | name: Coverage on Ubuntu
13 | runs-on: ubuntu-22.04
14 | if: ${{ github.repository == 'PalamaraLab/ASMC' }}
15 |
16 | steps:
17 |
18 | - name: checkout repo & submodules
19 | uses: actions/checkout@v3
20 | with:
21 | submodules: true
22 | fetch-depth: 0
23 |
24 | - name: cache vcpkg installed packages
25 | uses: actions/cache@v4
26 | id: cache
27 | with:
28 | path: |
29 | vcpkg/
30 | build_dir/vcpkg_installed/
31 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
32 |
33 | - name: install tools
34 | run: |
35 | sudo apt -y update
36 | sudo apt -y install lcov libcurl4-openssl-dev
37 |
38 | - name: make build directory
39 | run: mkdir -p build_dir
40 |
41 | - name: configure
42 | run: |
43 | cmake .. -DCMAKE_BUILD_TYPE=Debug -DASMC_ENABLE_COVERAGE=ON
44 | working-directory: build_dir
45 |
46 | - name: build
47 | run: |
48 | cmake --build . --parallel 2 --target ASMC_unit_tests
49 | working-directory: build_dir
50 |
51 | - name: test
52 | run: |
53 | ctest -j2 -R Asmc_unit_tests --output-on-failure
54 | working-directory: build_dir
55 |
56 | - name: process coverage
57 | run: |
58 | lcov --directory . --capture --output-file coverage.info
59 | lcov --remove coverage.info '/usr/*' '*/test/*' '*/vcpkg_installed/*' --output-file coverage.info
60 | lcov --list coverage.info
61 | working-directory: build_dir
62 |
63 | - name: upload coverage to codecov
64 | run: |
65 | curl -Os https://uploader.codecov.io/latest/linux/codecov
66 | chmod +x codecov
67 | ./codecov
68 | working-directory: build_dir
69 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu-no-sse.yml:
--------------------------------------------------------------------------------
1 | name: Ubuntu no sse/avx
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - '**'
10 | workflow_dispatch:
11 |
12 | jobs:
13 |
14 | build-and-test:
15 | name: Unit tests with out sse/avx on Ubuntu
16 | runs-on: ubuntu-22.04
17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }}
18 |
19 | steps:
20 |
21 | - name: checkout repo & vcpkg submodule
22 | uses: actions/checkout@v3
23 | with:
24 | submodules: true
25 | fetch-depth: 0
26 |
27 | - name: cache vcpkg installed packages
28 | uses: actions/cache@v4
29 | id: cache
30 | with:
31 | path: |
32 | vcpkg/
33 | build_dir/vcpkg_installed/
34 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
35 |
36 | - name: make build directory
37 | run: mkdir -p build_dir
38 |
39 | - name: cmake configure
40 | run: cmake .. -DASMC_FORCE_PURE:BOOL=TRUE
41 | working-directory: build_dir
42 |
43 | - name: cmake build
44 | run: cmake --build . --parallel 2 --target ASMC_unit_tests
45 | working-directory: build_dir
46 |
47 | - name: cmake test
48 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure
49 | working-directory: build_dir
50 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu-python.yml:
--------------------------------------------------------------------------------
1 | name: Python 3.8 3.11
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - '**'
10 | workflow_dispatch:
11 |
12 | jobs:
13 |
14 | build-and-test:
15 | name: Unit tests via Python on Ubuntu
16 | runs-on: ubuntu-22.04
17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }}
18 |
19 | strategy:
20 | matrix:
21 | python-version: [3.8, 3.11]
22 |
23 | steps:
24 | - name: checkout repo & submodules
25 | uses: actions/checkout@v3
26 | with:
27 | submodules: true
28 | fetch-depth: 0
29 |
30 | - name: cache vcpkg installed packages
31 | uses: actions/cache@v4
32 | id: cache
33 | with:
34 | path: |
35 | vcpkg/
36 | build_dir/vcpkg_installed/
37 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
38 |
39 | - name: Set up Python ${{ matrix.python-version }}
40 | uses: actions/setup-python@v2
41 | with:
42 | python-version: ${{ matrix.python-version }}
43 | architecture: x64
44 |
45 | - name: install python bindings
46 | run: |
47 | python -m pip install --upgrade pip setuptools wheel ninja
48 | python -m pip install .
49 |
50 | - name: python unit tests
51 | run: |
52 | python -m unittest discover test "test_unit*.py"
53 |
54 | - name: python regression tests
55 | run: |
56 | python -m unittest discover test "test_regression.py"
57 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu-regression.yml:
--------------------------------------------------------------------------------
1 | name: Regression test
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - '**'
10 | workflow_dispatch:
11 |
12 | jobs:
13 |
14 | build-and-test:
15 | name: Regression test on Ubuntu
16 | runs-on: ubuntu-22.04
17 | env:
18 | CXX: g++-10
19 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }}
20 |
21 | steps:
22 | - name: checkout repo & submodules
23 | uses: actions/checkout@v3
24 | with:
25 | submodules: true
26 | fetch-depth: 0
27 |
28 | - name: cache vcpkg installed packages
29 | uses: actions/cache@v4
30 | id: cache
31 | with:
32 | path: |
33 | vcpkg/
34 | build_dir/vcpkg_installed/
35 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
36 |
37 | - name: make build directory
38 | run: mkdir -p build_dir
39 |
40 | - name: cmake configure
41 | run: cmake .. -DCMAKE_BUILD_TYPE=Release
42 | working-directory: build_dir
43 |
44 | - name: cmake build
45 | run: cmake --build . --parallel 2 --target ASMC_regression
46 | working-directory: build_dir
47 |
48 | - name: cmake test
49 | run: ctest -R regression --output-on-failure
50 | working-directory: build_dir
51 |
--------------------------------------------------------------------------------
/.github/workflows/ubuntu-unit.yml:
--------------------------------------------------------------------------------
1 | name: "Unit tests: Ubuntu"
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - '**'
10 | workflow_dispatch:
11 |
12 | jobs:
13 |
14 | build-and-test:
15 | name: Unit tests on Ubuntu
16 | runs-on: ubuntu-24.04
17 | if: ${{ github.event_name == 'pull_request' || github.repository == 'PalamaraLab/ASMC' }}
18 |
19 | steps:
20 |
21 | - name: checkout repo & submodules
22 | uses: actions/checkout@v4
23 | with:
24 | submodules: true
25 | fetch-depth: 0
26 |
27 | - name: cache vcpkg installed packages
28 | uses: actions/cache@v4
29 | id: cache
30 | with:
31 | path: |
32 | vcpkg/
33 | build_dir/vcpkg_installed/
34 | key: ${{ runner.os }}-${{ env.CXX }}-${{ hashFiles('vcpkg.json', 'vcpkg/CHANGELOG.md') }}
35 |
36 | - name: make build directory
37 | run: mkdir -p build_dir
38 |
39 | - name: cmake configure
40 | run: cmake ..
41 | working-directory: build_dir
42 |
43 | - name: cmake build
44 | run: cmake --build . --parallel 4 --target ASMC_unit_tests
45 | working-directory: build_dir
46 |
47 | - name: cmake test
48 | run: ctest -j2 -R Asmc_unit_tests --output-on-failure
49 | working-directory: build_dir
50 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 |
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 |
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 |
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 |
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 |
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 |
34 | # Byte-compiled / optimized / DLL files
35 | __pycache__/
36 | *.py[cod]
37 | *$py.class
38 |
39 | # C extensions
40 | *.so
41 |
42 | # Distribution / packaging
43 | .Python
44 | build/
45 | develop-eggs/
46 | dist/
47 | downloads/
48 | eggs/
49 | .eggs/
50 | lib/
51 | lib64/
52 | parts/
53 | sdist/
54 | var/
55 | wheels/
56 | pip-wheel-metadata/
57 | share/python-wheels/
58 | *.egg-info/
59 | .installed.cfg
60 | *.egg
61 | MANIFEST
62 |
63 | # PyInstaller
64 | # Usually these files are written by a python script from a template
65 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
66 | *.manifest
67 | *.spec
68 |
69 | # Installer logs
70 | pip-log.txt
71 | pip-delete-this-directory.txt
72 |
73 | # Unit test / coverage reports
74 | htmlcov/
75 | .tox/
76 | .nox/
77 | .coverage
78 | .coverage.*
79 | .cache
80 | nosetests.xml
81 | coverage.xml
82 | *.cover
83 | *.py,cover
84 | .hypothesis/
85 | .pytest_cache/
86 |
87 | # Translations
88 | *.mo
89 | *.pot
90 |
91 | # Django stuff:
92 | *.log
93 | local_settings.py
94 | db.sqlite3
95 | db.sqlite3-journal
96 |
97 | # Flask stuff:
98 | instance/
99 | .webassets-cache
100 |
101 | # Scrapy stuff:
102 | .scrapy
103 |
104 | # Sphinx documentation
105 | docs/_build/
106 |
107 | # PyBuilder
108 | target/
109 |
110 | # Jupyter Notebook
111 | .ipynb_checkpoints
112 |
113 | # IPython
114 | profile_default/
115 | ipython_config.py
116 |
117 | # pyenv
118 | .python-version
119 |
120 | # pipenv
121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
123 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
124 | # install all needed dependencies.
125 | #Pipfile.lock
126 |
127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
128 | __pypackages__/
129 |
130 | # Celery stuff
131 | celerybeat-schedule
132 | celerybeat.pid
133 |
134 | # SageMath parsed files
135 | *.sage.py
136 |
137 | # Environments
138 | .env
139 | .venv
140 | env/
141 | venv/
142 | ENV/
143 | env.bak/
144 | venv.bak/
145 |
146 | # Spyder project settings
147 | .spyderproject
148 | .spyproject
149 |
150 | # Rope project settings
151 | .ropeproject
152 |
153 | # mkdocs documentation
154 | /site
155 |
156 | # mypy
157 | .mypy_cache/
158 | .dmypy.json
159 | dmypy.json
160 |
161 | # Pyre type checker
162 | .pyre/
163 |
164 | #######################
165 | # Custom ASMC-related #
166 | #######################
167 |
168 | .vscode
169 | .idea
170 | cmake-build-*
171 | build
172 |
173 | **/.uuid
174 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "pybind11"]
2 | path = pybind11
3 | url = https://github.com/pybind/pybind11
4 | [submodule "vcpkg"]
5 | path = vcpkg
6 | url = https://github.com/microsoft/vcpkg
7 | [submodule "ASMC_data"]
8 | path = ASMC_data
9 | url = https://github.com/PalamaraLab/ASMC_data
10 | [submodule "DataModule"]
11 | path = DataModule
12 | url = https://github.com/PalamaraLab/DataModule/
13 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.11"
13 |
14 | # Build documentation in the "docs/" directory with Sphinx
15 | sphinx:
16 | configuration: docs/conf.py
17 |
18 | # Optionally build your docs in additional formats such as PDF and ePub
19 | # formats:
20 | # - pdf
21 | # - epub
22 |
23 | # Optional but recommended, declare the Python requirements required
24 | # to build your documentation
25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
26 | python:
27 | install:
28 | - requirements: docs/requirements.txt
29 |
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | Copyright (c) 2020, University of Oxford.
2 | All rights reserved.
3 |
4 | University of Oxford means the Chancellor, Masters and Scholars of the
5 | University of Oxford, having an administrative office at Wellington
6 | Square, Oxford OX1 2JD, UK.
7 |
8 | This program is free software: you can redistribute it and/or modify
9 | it under the terms of the GNU General Public License as published by
10 | the Free Software Foundation, either version 3 of the License, or
11 | (at your option) any later version.
12 |
13 | This program is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU General Public License for more details.
17 |
18 | You should have received a copy of the GNU General Public License
19 | along with this program. If not, see <https://www.gnu.org/licenses/>.
20 |
--------------------------------------------------------------------------------
/PyPI_README.md:
--------------------------------------------------------------------------------
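1 | [![Unit tests: Ubuntu](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml)
2 | [![Unit tests: macOS](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml)
3 | [![Python 3.8 3.11](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml)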
4 | [](https://github.com/PalamaraLab/ASMC/actions)
5 | [](https://github.com/PalamaraLab/ASMC/actions)
6 | [](https://github.com/PalamaraLab/ASMC/actions)
7 | [](https://codecov.io/gh/PalamaraLab/ASMC)
8 |
9 | # ASMC and FastSMC
10 |
11 | This repository contains ASMC and an extension, FastSMC, together with python bindings for both.
12 |
13 | ## Quickstart
14 |
15 | ### Install the Python module from PyPI
16 |
17 | Most functionality is available through a Python module which can be installed with:
18 |
19 | ```bash
20 | pip install asmc-asmc
21 | ```
22 |
23 | ### Documentation
24 |
25 | The following pages of documentation contain specific information:
26 | - [Quickstart guide for users](https://github.com/PalamaraLab/ASMC/blob/main/docs/quickstart_user.md)
27 | - [ASMC python docs](https://github.com/PalamaraLab/ASMC/blob/main/docs/asmc_python.md)
28 | - [FastSMC python docs](https://github.com/PalamaraLab/ASMC/blob/main/docs/fastsmc_python.md)
29 |
30 | This Python module is currently available on Linux and macOS.
31 |
32 | Example Jupyter notebooks showcasing basic functionality can be found here:
33 | - [Example notebooks](https://github.com/PalamaraLab/ASMC/tree/main/notebooks)
34 |
35 | ## License
36 |
37 | ASMC and FastSMC are distributed under the GNU General Public License v3.0 (GPLv3). For any questions or comments on ASMC, please contact Pier Palamara using `<lastname>@stats.ox.ac.uk`.
38 |
39 | ## Reference
40 |
41 | If you use this software, please cite the appropriate reference(s) below.
42 |
43 | The ASMC algorithm and software were developed in
44 | - P. Palamara, J. Terhorst, Y. Song, A. Price. High-throughput inference of pairwise coalescence times identifies signals of selection and enriched disease heritability. *Nature Genetics*, 2018.
45 |
46 | The FastSMC algorithm and software were developed in
47 | - J. Nait Saada, G. Kalantzis, D. Shyr, F. Cooper, M. Robinson, A. Gusev, P. F. Palamara. Identity-by-descent detection across 487,409 British samples reveals fine-scale evolutionary history and trait associations. *Nature Communications*, 2020.
48 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
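1 | [![Unit tests: Ubuntu](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-unit.yml)
2 | [![Unit tests: macOS](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/macos-unit.yml)
3 | [![Python 3.8 3.11](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml/badge.svg)](https://github.com/PalamaraLab/ASMC/actions/workflows/ubuntu-python.yml)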
4 | [](https://github.com/PalamaraLab/ASMC/actions)
5 | [](https://github.com/PalamaraLab/ASMC/actions)
6 | [](https://github.com/PalamaraLab/ASMC/actions)
7 | [](https://codecov.io/gh/PalamaraLab/ASMC)
8 |
9 | # ASMC and FastSMC
10 |
11 | This repository contains ASMC and an extension, FastSMC, together with python bindings for both.
12 |
13 | The following pages of documentation contain specific information:
14 | - [ASMC](./docs/asmc.md)
15 | - [ASMC python bindings](./docs/asmc_python.md)
16 | - [FastSMC](./docs/fastsmc.md)
17 | - [FastSMC python bindings](./docs/fastsmc_python.md)
18 |
19 | ## Installation
20 |
21 | ASMC and FastSMC are regularly built and tested on Ubuntu and macOS.
22 | They consist of a C++ library, C++ executables, and optional Python bindings.
23 |
24 | The C++ libraries and executables require:
25 |
26 | - A C++ compiler (C++17 or later)
27 | - CMake (3.15 or later)
28 | - Boost (1.62 or later)
29 | - Eigen (3.3.4 or later)
30 | - {fmt}
31 | - range-v3
32 | - OpenMP
33 | - zlib
34 |
35 | We recommend installing dependencies using vcpkg, distributed with this repository as a submodule.
36 | Information below.
37 |
38 | Building the optional Python bindings additionally requires:
39 |
40 | - Python (3.6 or later) with development files
41 | - PyBind11 (distributed with this repository as a submodule)
42 |
43 | ## Quickstart guides
44 |
45 | - [For users](./docs/quickstart_user.md)
46 | - [For developers](./docs/quickstart_developer.md)
47 |
48 | ## Decoding Quantities
49 |
50 | Decoding quantities files are required in order to run ASMC and FastSMC.
51 | These can be generated directly from a Python module, and instructions can be found [here](https://github.com/PalamaraLab/PrepareDecoding).
52 | Input and output file formats for the tool used to create decoding quantities are described [here](https://github.com/PalamaraLab/PrepareDecoding/blob/master/docs/file_formats.md).
53 |
54 | Note: the CEU.demo demographic model and the decoding quantities for CEU+UKBB previously provided in [this repository](https://github.com/PalamaraLab/FastSMC) and [this repository](https://github.com/PalamaraLab/ASMC_legacy) were mistakenly encoded as diploid rather than haploid.
55 | The file [CEU.demo](https://github.com/PalamaraLab/ASMC_data/tree/main/demographies) and CEU+UKBB decoding quantities [here](https://github.com/PalamaraLab/ASMC_data/tree/main/decoding_quantities) have now been fixed.
56 | They were generated using [v2.2.1](https://github.com/PalamaraLab/PrepareDecoding/releases/tag/v2.2.1) of the [PrepareDecoding tool](https://github.com/PalamaraLab/PrepareDecoding), which also provides a simpler interface for computing decoding quantities as well as support for additional demographic models.
57 | Using these new decoding quantities with v1.2 of ASMC will tend to produce more recent estimates for TMRCAs compared to the decoding quantities distributed with v1.0 and v1.1.
58 | This should not have a substantial impact on most downstream analyses.
59 |
60 | ## For developers: making a release
61 |
62 | - Bump the version number in [setup.py](setup.py), [CMakeLists.txt](CMakeLists.txt) and [conf.py](docs/conf.py)
63 | - Update [RELEASE_NOTES.md](RELEASE_NOTES.md)
64 | - Push changes and check that all [GitHub workflows](https://github.com/PalamaraLab/ASMC/actions) pass
65 | - Tag the commit in Git using syntax `vX.Y`
66 | - Make a release on GitHub, which should trigger a new build that will upload Python wheels to PyPI
67 |
68 | ## License
69 |
70 | ASMC and FastSMC are distributed under the GNU General Public License v3.0 (GPLv3). For any questions or comments on ASMC, please contact Pier Palamara using `<lastname>@stats.ox.ac.uk`.
71 |
72 | ## Reference
73 |
74 | If you use this software, please cite the appropriate reference(s) below.
75 |
76 | The ASMC algorithm and software were developed in
77 | - P. Palamara, J. Terhorst, Y. Song, A. Price. High-throughput inference of pairwise coalescence times identifies signals of selection and enriched disease heritability. *Nature Genetics*, 2018.
78 |
79 | The FastSMC algorithm and software were developed in
80 | - J. Nait Saada, G. Kalantzis, D. Shyr, F. Cooper, M. Robinson, A. Gusev, P. F. Palamara. Identity-by-descent detection across 487,409 British samples reveals fine-scale evolutionary history and trait associations. *Nature Communications*, 2020.
81 |
--------------------------------------------------------------------------------
/RELEASE_NOTES.md:
--------------------------------------------------------------------------------
1 | # ASMC Release Notes
2 |
3 | ## v1.3.1 (2023-06-30)
4 |
5 | ### Breaking changes
6 |
7 | None
8 |
9 | ### Other changes
10 |
11 | - The location of a `.map` or `.map.gz` file can now be optionally specified explicitly: previously it was assumed to be at the `inFileRoot`.
12 |
13 |
14 | ## v1.3 (2023-03-03)
15 |
16 | ### Breaking changes
17 |
18 | None
19 |
20 | ### Other changes
21 |
22 | - Decoding a batch can now be done in a selected subregion with from / to indices.
23 | A `cm_burn_in` parameter takes into account additional variants on either side of the subregion for HMM burn-in.
24 | - Allow the user to access selected attributes of the DecodingParams and Data from the ASMC object.
25 | - Python continuous integration now uses Python 3.8 and 3.11 (previously 3.6 and 3.9)
26 | - Update Catch to v2.13.
27 |
28 |
29 | ## v1.2 (2021-09-28)
30 |
31 | All functionality for ASMC and FastSMC is now in this repository ([link](https://github.com/PalamaraLab/ASMC)).
32 |
33 | ### Breaking changes
34 |
35 | - Fixed an issue with demographic models.
36 | The `CEU.demo` demographic model and the decoding quantities for CEU+UKBB previously provided in the repository were mistakenly encoded as diploid rather than haploid.
37 | CEU.demo and CEU+UKBB decoding quantities have now been updated and can be found in [this repository](https://github.com/PalamaraLab/ASMC_data).
38 | Also see the manual for a note on how this affects analyses.
39 |
40 | ### Other changes
41 |
42 | - New API for decoding pairs with ASMC.
43 | In addition to running full analyses as described in the ASMC paper, users can now decode specific pairs and get back a variety of summary statistics.
44 | See the [ASMC python documentation](https://github.com/PalamaraLab/ASMC/blob/main/docs/asmc_python.md) for details.
45 | - New, more extensive, [documentation](https://github.com/PalamaraLab/ASMC/blob/main/docs/) is available.
46 |
47 |
48 | ## v1.1 (2021-01-20)
49 |
50 | [Legacy repository](https://github.com/PalamaraLab/FastSMC/releases/tag/v1.1)
51 |
52 | Improvements to documentation and default use.
53 | No changes to any core functionality.
54 |
55 | ### Breaking changes
56 |
57 | - The hashing functionality, previously named `GERMLINE`, has been renamed to `hashing`.
58 | This includes the command line flag for turning this behaviour on/off, which is now `--hashing`.
59 |
60 | ### Other changes
61 |
62 | - `--hashing` is now ON by default when running the FastSMC executable: previously, `--GERMLINE` was OFF by default.
63 | - Extra output, including the IBD segment length, posterior mean, and MAP, are now on by default.
64 | This behaviour can be toggled with the flags `--segmentLength`, `--perPairPosteriorMeans`, `--perPairMAP`.
65 | - An example script has been added to `cpp_example/FastSMC_example_multiple_jobs.sh` that demonstrates how to run FastSMC with multiple jobs simultaneously.
66 | - The README has been updated to focus on FastSMC functionality.
67 | - More robust checking is now used to verify the decoding quantities file is correct before reading it.
68 | - CMake will now, by default, build in Release mode (giving `-O3` optimisation on Linux).
69 | Previously, Debug was used by default.
70 |
71 |
72 | ## v1.0 (2020-09-18)
73 |
74 | [Legacy repository](https://github.com/PalamaraLab/FastSMC/releases/tag/v1.0)
75 |
76 | First public release of FastSMC, with functionality as described and used in [this paper](https://doi.org/10.1038/s41467-020-19588-x).
77 |
--------------------------------------------------------------------------------
/asmc/asmc:
--------------------------------------------------------------------------------
1 | ../src
--------------------------------------------------------------------------------
/cmake/AutodetectVcpkgToolchainFile.cmake:
--------------------------------------------------------------------------------
1 | # This file is part of https://github.com/PalamaraLab/ASMC which is released under the GPL-3.0 license.
2 | # See accompanying LICENSE and COPYING for copyright notice and full details.
3 |
4 | # If a VCPKG toolchain file is not defined, but the expected file exists, use it
5 | if (NOT DEFINED CMAKE_TOOLCHAIN_FILE)
6 | if (EXISTS ${CMAKE_SOURCE_DIR}/vcpkg/scripts/buildsystems/vcpkg.cmake)
7 | set(vcpkg_toolchain_file ${CMAKE_SOURCE_DIR}/vcpkg/scripts/buildsystems/vcpkg.cmake)
8 | message(STATUS "Detected vcpkg toolchain file at ${vcpkg_toolchain_file}")
9 | set(CMAKE_TOOLCHAIN_FILE ${vcpkg_toolchain_file})
10 | endif ()
11 | endif ()
12 |
--------------------------------------------------------------------------------
/cmake/CheckDataModule.cmake:
--------------------------------------------------------------------------------
1 | # This file is part of https://github.com/PalamaraLab/ASMC which is released under the GPL-3.0 license.
2 | # See accompanying LICENSE and COPYING for copyright notice and full details.
3 |
4 | # Check that the DataModule submodule has been initialised; if not, stop configuration with instructions
5 | if (NOT EXISTS ${CMAKE_SOURCE_DIR}/DataModule/README.md)
6 | message(FATAL_ERROR "
7 | The data module ${ASMC_data_module_dir} does not exist, and it is required for ASMC.
8 | Please either get all submodules when you clone ASMC:
9 | $ git clone --recurse-submodules https://github.com/PalamaraLab/ASMC.git
10 | or, at minimum, initialise the data module. From the ASMC directory:
11 | $ git submodule update --init DataModule
12 | Please see this quickstart guide for further information:
13 | https://github.com/PalamaraLab/ASMC/blob/main/docs/quickstart_user.md
14 | ")
15 | endif ()
16 |
--------------------------------------------------------------------------------
/cmake/FindGMP.cmake:
--------------------------------------------------------------------------------
1 | # Reproduced from https://github.com/dune-project/dune-common under the terms of version 2 of the GNU General Public
2 | # License (https://github.com/dune-project/dune-common/blob/master/LICENSE.md)
3 |
4 | #[=======================================================================[.rst:
5 | FindGMP
6 | -------
7 |
8 | Find the GNU MULTI-Precision Bignum (GMP) library
9 | and the corresponding C++ bindings GMPxx.
10 |
11 | This module searches for both libraries and only considers the package
12 | found if both can be located. It then defines separate targets for the C
13 | and the C++ library.
14 |
15 | Imported Targets
16 | ^^^^^^^^^^^^^^^^
17 |
18 | This module provides the following imported targets, if found:
19 |
20 | ``GMP::gmp``
21 | Library target of the C library.
22 | ``GMP::gmpxx``
23 | Library target of the C++ library, which also links to the C library.
24 |
25 | Result Variables
26 | ^^^^^^^^^^^^^^^^
27 |
28 | This will define the following variables:
29 |
30 | ``GMP_FOUND``
31 | True if the GMP library, the GMPxx headers and
32 | the GMPxx library were found.
33 |
34 | Cache Variables
35 | ^^^^^^^^^^^^^^^
36 |
37 | You may set the following variables to modify the behaviour of
38 | this module:
39 |
40 | ``GMP_INCLUDE_DIR``
41 | The directory containing ``gmp.h``.
42 | ``GMP_LIB``
43 | The path to the gmp library.
44 | ``GMPXX_INCLUDE_DIR``
45 | The directory containing ``gmpxx.h``.
46 | ``GMPXX_LIB``
47 | The path to the gmpxx library.
48 |
49 | #]=======================================================================]
50 |
51 | # Add a feature summary for this package
52 | include(FeatureSummary)
53 | set_package_properties(GMP PROPERTIES
54 | DESCRIPTION "GNU multi-precision library"
55 | URL "https://gmplib.org"
56 | )
57 |
58 | # Try finding the package with pkg-config
59 | find_package(PkgConfig QUIET)
60 | pkg_check_modules(PKG QUIET gmp gmpxx)
61 |
62 | # Try to locate the libraries and their headers, using pkg-config hints
63 | find_path(GMP_INCLUDE_DIR gmp.h HINTS ${PKG_gmp_INCLUDEDIR})
64 | find_library(GMP_LIB gmp HINTS ${PKG_gmp_LIBDIR})
65 |
66 | find_path(GMPXX_INCLUDE_DIR gmpxx.h HINTS ${PKG_gmpxx_INCLUDEDIR})
67 | find_library(GMPXX_LIB gmpxx HINTS ${PKG_gmpxx_LIBDIR})
68 |
69 | # Remove these variables from cache inspector
70 | mark_as_advanced(GMP_INCLUDE_DIR GMP_LIB GMPXX_INCLUDE_DIR GMPXX_LIB)
71 |
72 | # Report if package was found
73 | include(FindPackageHandleStandardArgs)
74 | find_package_handle_standard_args(GMP
75 | DEFAULT_MSG
76 | GMPXX_LIB GMPXX_INCLUDE_DIR GMP_INCLUDE_DIR GMP_LIB
77 | )
78 |
79 | # Set targets
80 | if(GMP_FOUND)
81 | # C library
82 | if(NOT TARGET GMP::gmp)
83 | add_library(GMP::gmp UNKNOWN IMPORTED)
84 | set_target_properties(GMP::gmp PROPERTIES
85 | IMPORTED_LOCATION ${GMP_LIB}
86 | INTERFACE_INCLUDE_DIRECTORIES ${GMP_INCLUDE_DIR}
87 | )
88 | endif()
89 |
90 | # C++ library, which requires a link to the C library
91 | if(NOT TARGET GMP::gmpxx)
92 | add_library(GMP::gmpxx UNKNOWN IMPORTED)
93 | set_target_properties(GMP::gmpxx PROPERTIES
94 | IMPORTED_LOCATION ${GMPXX_LIB}
95 | INTERFACE_INCLUDE_DIRECTORIES ${GMPXX_INCLUDE_DIR}
96 | INTERFACE_LINK_LIBRARIES GMP::gmp
97 | )
98 | endif()
99 | endif()
100 |
--------------------------------------------------------------------------------
/cmake/SIMD.cmake:
--------------------------------------------------------------------------------
1 | # The fast-math and SIMD instruction settings below have been copied and modified from
2 | # the GLM library CMakeLists.txt (MIT license)
3 | #
4 | # https://github.com/g-truc/glm/blob/master/CMakeLists.txt
5 |
6 | option(ASMC_ENABLE_FAST_MATH "Enable fast math optimizations" OFF)
7 | if(ASMC_ENABLE_FAST_MATH)
8 | message(STATUS "Build with fast math optimizations")
9 |
10 | if((CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "GNU"))
11 | add_compile_options(-ffast-math)
12 |
13 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
14 | add_compile_options(/fp:fast)
15 | endif()
16 | else()
17 | if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
18 | add_compile_options(/fp:precise)
19 | endif()
20 | endif()
21 |
22 | option(ASMC_ENABLE_SIMD_SSE3 "Enable SSE 1, 2 & 3 optimizations" OFF)
23 | option(ASMC_ENABLE_SIMD_AVX "Enable AVX optimizations" ON)
24 | option(ASMC_ENABLE_SIMD_AVX512 "Enable AVX512 optimizations" OFF)
25 | option(ASMC_FORCE_PURE "Force 'pure' instructions" OFF)
26 |
27 | if(ASMC_FORCE_PURE)
28 | add_definitions(-DNO_SSE)
29 |
30 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
31 | add_compile_options(-mfpmath=387)
32 | endif()
33 | message(STATUS "No SIMD instruction set")
34 | # AVX is the default configuration: ASMC_ENABLE_SIMD_AVX is declared ON above.
35 | elseif(ASMC_ENABLE_SIMD_AVX)
36 | add_definitions(-DAVX)
37 | add_compile_definitions(EIGEN_MAX_ALIGN_BYTES=64)
38 |
39 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
40 | add_compile_options(-mavx)
41 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
42 | add_compile_options(/QxAVX)
43 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
44 | add_compile_options(/arch:AVX)
45 | endif()
46 | message(STATUS "AVX instruction set")
47 | # Only reached when ASMC_ENABLE_SIMD_AVX is OFF, because the AVX branch above is tested first.
48 | elseif(ASMC_ENABLE_SIMD_AVX512)
49 | add_definitions(-DAVX)
50 |
51 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
52 | add_compile_options(-mavx512f)
53 | add_compile_options(-mavx512cd)
54 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
55 | add_compile_options(-xCOMMON-AVX512)
56 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
57 | add_compile_options(/arch:AVX512)
58 | endif()
59 | message(STATUS "AVX-512 instruction set")
60 | # ASMC_ENABLE_SIMD_SSE3 is the declared option; ASMC_ENABLE_SIMD_SSE is kept so existing command lines still work.
61 | elseif(ASMC_ENABLE_SIMD_SSE3 OR ASMC_ENABLE_SIMD_SSE)
62 | add_definitions(-DSSE)
63 |
64 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
65 | add_compile_options(-msse3)
66 | add_compile_options(-msse2)
67 | add_compile_options(-msse)
68 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
69 | add_compile_options(/QxSSE3)
70 | add_compile_options(/QxSSE2)
71 | add_compile_options(/QxSSE)
72 | elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC"))
73 | add_compile_options(/arch:SSE2) # VC doesn't support SSE3
74 | add_compile_options(/arch:SSE)
75 | endif()
76 | message(STATUS "SSE2 & SSE3 instruction set")
77 | # ASMC_ENABLE_SIMD_SSE2 has no option() declaration in this file, so it must be set explicitly on the command line.
78 | elseif(ASMC_ENABLE_SIMD_SSE2)
79 | add_definitions(-DGLM_FORCE_INTRINSICS)
80 |
81 | if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
82 | add_compile_options(-msse2)
83 | elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
84 | add_compile_options(/QxSSE2)
85 | elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64)
86 | add_compile_options(/arch:SSE2)
87 | endif()
88 | message(STATUS "SSE2 instruction set")
89 | endif()
90 |
91 |
--------------------------------------------------------------------------------
/cpp_example/FastSMC_example.sh:
--------------------------------------------------------------------------------
1 | # this script will run FastSMC on simulated data as described in the paper (in FILES/FASTSMC_EXAMPLE/)
2 | # parameters can be changed if desired
3 |
4 | cd ../FASTSMC_BUILD_DIR/ || exit
5 |
6 | ./FastSMC_exe --inFileRoot ../FILES/FASTSMC_EXAMPLE/example \
7 | --outFileRoot ../cpp_example/FastSMC_output_example \
8 | --decodingQuantFile ../FILES/FASTSMC_EXAMPLE/example.decodingQuantities.gz \
9 | --mode array \
10 | --time 50 \
11 | --min_m 1.5 \
12 | --segmentLength \
13 | --hashing \
14 | --perPairPosteriorMeans \
15 | --perPairMAP \
16 | --noConditionalAgeEstimates \
17 | --bin
18 |
19 | # Binary output file can be converted with the following command line
20 | echo 'Showing first lines of the binary output...'
21 | ./convertBinary_exe ../cpp_example/FastSMC_output_example.1.1.FastSMC.bibd.gz | head
22 |
--------------------------------------------------------------------------------
/cpp_example/FastSMC_example_multiple_jobs.sh:
--------------------------------------------------------------------------------
1 | # this script will run FastSMC on simulated data as described in the paper (in FILES/FASTSMC_EXAMPLE/)
2 | # parameters can be changed if desired
3 |
4 | # This example will run multiple jobs as separate background processes on the same machine. If you are running FastSMC
5 | # on a cluster then it may be more appropriate to instead use a job scheduler such as `qsub`.
6 |
7 | # The total number of jobs you want to run in parallel (this should be a square number).
8 | # Note that the standard output will be messy as information will be printed from every job simultaneously.
9 | total_num_jobs=4
10 |
11 | cd ../FASTSMC_BUILD_DIR/ || exit
12 |
13 | run_single_job() {
14 | local job=$1
15 | ./FastSMC_exe --inFileRoot ../FILES/FASTSMC_EXAMPLE/example \
16 | --outFileRoot ../cpp_example/FastSMC_output_example \
17 | --decodingQuantFile ../FILES/FASTSMC_EXAMPLE/example.decodingQuantities.gz \
18 | --mode array \
19 | --time 50 \
20 | --min_m 1.5 \
21 | --segmentLength \
22 | --hashing \
23 | --jobs "${total_num_jobs}" \
24 | --jobInd "${job}" \
25 | --perPairPosteriorMeans \
26 | --perPairMAP \
27 | --noConditionalAgeEstimates \
28 | --bin
29 | echo Finished job "$1"
30 | }
31 |
32 | # Run the jobs in parallel
33 | for ((i = 1; i <= total_num_jobs; i++)); do
34 | # substitute the command below with the appropriate command if running this on a cluster, e.g. "qsub run_single_job $i"
35 | run_single_job "$i" &
36 | done
37 | wait
38 |
39 | # Binary output file can be converted with the following command line
40 | echo 'Showing first lines of the binary output...'
41 | ./convertBinary_exe ../cpp_example/FastSMC_output_example.1.4.FastSMC.bibd.gz | head
42 |
43 | # Note that there will be the same number of output files as jobs, which will need to be concatenated.
44 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/asmc_python.md:
--------------------------------------------------------------------------------
1 | # ASMC Python API
2 |
3 | - [Examples using the Python bindings](#examples-using-the-python-bindings)
4 | - [API](#api)
5 | - [ASMC](#asmc)
6 | - [DecodePairsReturnStruct](#decodepairsreturnstruct)
7 |
8 | ASMC includes Python bindings which can be installed using pip:
9 |
10 | ```
11 | pip install asmc-asmc
12 | ```
13 |
14 | Before reading further you may wish to read the [ASMC docs](./asmc.md).
15 | In particular, these sections are directly relevant:
16 | - [Summary (TL;DR)](./asmc.md#summary-tldr)
17 | - [Input/output file formats](./asmc.md#inputoutput-file-formats)
18 | - [Tools, scripts, and analyses](./asmc.md#tools-scripts-and-analyses)
19 | - [Precomputed decoding quantities](./asmc.md#precomputed-decoding-quantities)
20 |
21 |
22 | ## Examples using the Python bindings
23 |
24 | See the `notebooks` directory for examples.
25 | There are two Jupyter notebooks:
26 | - a [minimal working example](../notebooks/asmc-minimal.ipynb), where sensible defaults for parameters are chosen automatically
27 | - a [more detailed example](../notebooks/asmc.ipynb) that demonstrates how to customise parameters
28 |
29 | ## API
30 |
31 | The core Python API for ASMC consists of the following classes:
32 | - `ASMC`
33 | - `DecodingParams`
34 | - `DecodePairsReturnStruct`
35 |
36 | ### ASMC
37 |
38 | The main `ASMC` object can be constructed minimally with an input file root and a decoding quantities file.
39 | Optional parameters are the output file root and the decoding mode.
40 | The full signature (with defaults indicated) is as follows:
41 |
42 | ```python
43 | asmc = ASMC(
44 | in_dir=input_files_root, # path to the input file root
45 | dq_file=dq_file, # path to the decoding quantities file
46 | out_dir="", # location of output files (default is the input file root)
47 | decoding_mode="array" # one of "array" or "sequence"
48 | )
49 | ```
50 |
51 | This creates an ASMC object with sensible defaults.
52 | To fine-tune parameters you can instead create the ASMC object with an instance of `DecodingParams`:
53 |
54 | ```python
55 | # These are the arguments (with defaults indicated) for constructing a decoding parameters object
56 | params = DecodingParams(
57 | in_file_root=input_files_root,
58 | dq_file=dq_file,
59 | map_file="", # Optional override for map|map.gz file, if not in in_file_root
60 | out_file_root="",
61 | jobs=1, # Number of jobs being done in total
62 | job_ind=1, # Job index (0, ..., jobs)
63 | decoding_mode_string="array", # One of {"sequence", "array"}
64 | decoding_sequence=False,
65 | using_CSFS=True, # Whether to use CSFS
66 | compress=False, # Compress emission to binary (no CSFS)
67 | use_ancestral=False, # Assume ancestral alleles are coded as 1 in input (will assume 1 = minor otherwise)
68 | skip_CSFS_distance=0.0, # Genetic distance between two CSFS emissions
69 | no_batches=False, # Decode with no vectorization (do not use without good reason)
70 | do_posterior_sums=False,
71 | do_per_pair_posterior_mean=False,
72 | expected_coal_times_file="",
73 | within_only=False,
74 | do_major_minor_posterior_sums=False, #
75 | do_per_pair_MAP=False
76 | )
77 |
78 | asmc = ASMC(params)
79 | ```
80 |
81 | You can specify the outputs that will be calculated with the following methods (with defaults indicated):
82 |
83 | ```python
84 | # Per pair posterior mean, MAP and full posteriors, as well as the sum of posteriors can be stored in matrices
85 | asmc.set_store_per_pair_posterior_mean(True) # <-- true by default; others false by default
86 | asmc.set_store_per_pair_map(False)
87 | asmc.set_store_per_pair_posterior(False)
88 | asmc.set_store_sum_of_posterior(False)
89 |
90 | # Per pair posterior mean and MAP can be written to file. This is typically slow.
91 | asmc.set_write_per_pair_posterior_mean(False)
92 | asmc.set_write_per_pair_map(False)
93 | ```
94 |
95 | Finally, the ASMC method `decode_pairs` will run the analysis.
96 | There are three different signatures available:
97 |
98 | ```python
99 | a = [1, 2, 3]
100 | b = [4, 5, 6]
101 |
102 | a_str = [f"1_{x}_1" for x in range(1,149)]
103 | b_str = [f"1_{x}_2" for x in range(1,149)]
104 |
105 | asmc.decode_pairs(a, b) # two lists of haplotype indices
106 | asmc.decode_pairs(a_str, b_str) # two lists of haplotype IDs, with _1 and _2 indicating the haplotype
107 | asmc.decode_pairs() # <-- decode all pairs in the dataset
108 | ```
109 |
110 | The results can then be accessed either by copy or reference:
111 |
112 | ```python
113 | return_vals = asmc.get_copy_of_results()
114 | return_vals_ref = asmc.get_ref_of_results()
115 | ```
116 |
117 | Getting the values by reference is safe if you are only planning to call `decode_pairs` once, or if you are performing calculations that do not require the results to persist after the first call to `decode_pairs`.
118 | If you call `decode_pairs` multiple times, the results will be overwritten, so you should ensure you get results by copy.
119 |
120 | ### DecodePairsReturnStruct
121 |
122 | The return structure will contain results based on the options selected on the ASMC object before calling `decode_pairs`.
123 | ```python
124 | # The index information for the pairs decoded
125 | return_vals.per_pair_indices
126 |
127 | # The `per_pair_posteriors` option gives the largest amount of information: a list of 2D numpy arrays
128 | # The list has length numPairs, and each 2D array has size (numStates x numSites)
129 | return_vals.per_pair_posteriors
130 |
131 | # The sum of posteriors is a single 2D numpy array of size (numStates x numSites)
132 | return_vals.sum_of_posteriors
133 |
134 | # Turning on the per_pair_posterior_means flag gives you the following:
135 | # A 2D numpy array with posterior means, of size (numPairs x numSites)
136 | return_vals.per_pair_posterior_means
137 | # Two 1D numpy arrays with the column-wise min and argmin of this array:
138 | return_vals.min_posterior_means
139 | return_vals.argmin_posterior_means
140 |
141 | # Turning on the per_pair_MAPs flag gives you the following:
142 | # A 2D numpy array with posterior MAPs, of size (numPairs x numSites)
143 | return_vals.per_pair_MAPs
144 | # Two 1D numpy arrays with the column-wise min and argmin of this array:
145 | return_vals.min_MAPs
146 | return_vals.argmin_MAPs
147 | ```
148 |
149 | Finally, the ASMC object can also return the list of expected coalescent times from the decoding quantities file:
150 | `asmc.get_expected_times()`
151 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'ASMC'
21 | copyright = '2023, ASMC Developers'
22 | author = 'ASMC Developers, https://palamaralab.github.io/software/asmc/'
23 | release = 'v1.3.1'
24 |
25 | # -- General configuration ---------------------------------------------------
26 |
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | extensions = [
31 | 'sphinx_rtd_theme',
32 | ]
33 |
34 | # Add any paths that contain templates here, relative to this directory.
35 | templates_path = ['_templates']
36 |
37 | # List of patterns, relative to source directory, that match files and
38 | # directories to ignore when looking for source files.
39 | # This pattern also affects html_static_path and html_extra_path.
40 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
41 |
42 |
43 | # -- Options for HTML output -------------------------------------------------
44 |
45 | # The theme to use for HTML and HTML Help pages. See the documentation for
46 | # a list of builtin themes.
47 | #
48 | html_theme = 'sphinx_rtd_theme'
49 |
50 | # Add any paths that contain custom static files (such as style sheets) here,
51 | # relative to this directory. They are copied after the builtin static files,
52 | # so a file named "default.css" will overwrite the builtin "default.css".
53 | html_static_path = ['_static']
54 |
--------------------------------------------------------------------------------
/docs/fastsmc_python.md:
--------------------------------------------------------------------------------
1 | # FastSMC Python API
2 |
3 | - [Examples using the Python bindings](#examples-using-the-python-bindings)
4 | - [API](#api)
5 | - [FastSMC](#fastsmc)
6 | - [DecodingParams](#decodingparams)
7 | - [BinaryDataReader](#binarydatareader)
8 |
9 | FastSMC includes Python bindings which can be installed using pip:
10 |
11 | ```
12 | pip install asmc-asmc
13 | ```
14 |
15 | Before reading further you may wish to read the [FastSMC docs](./fastsmc.md).
16 | In particular, these sections are directly relevant:
17 | - [Input file formats](./fastsmc.md#input-file-formats)
18 | - [Output format](./fastsmc.md#output-format)
19 | - [Binary output](./fastsmc.md#binary-output)
20 | - [Relationship to ASMC](./fastsmc.md#relationship-to-asmc)
21 |
22 | And, from the [ASMC docs](./asmc.md):
23 | - [Precomputed decoding quantities](./asmc.md#precomputed-decoding-quantities)
24 |
25 | ## Examples using the Python bindings
26 |
27 | See the `notebooks` directory for examples.
28 | There are two Jupyter notebooks:
29 | - a [minimal working example](../notebooks/fastsmc-minimal.ipynb), where sensible defaults for parameters are chosen automatically
30 | - a [more detailed example](../notebooks/fastsmc.ipynb) that demonstrates how to customise parameters, how to convert the binary file to text format, and how to analyse the output if it is too large to fit in memory.
31 |
32 | ## API
33 |
34 | The core Python API for FastSMC consists of the following classes:
35 | - `FastSMC`
36 | - `DecodingParams`
37 | - `BinaryDataReader`
38 |
39 | ### FastSMC
40 |
41 | The main `FastSMC` object can be constructed minimally with an input file root, a decoding quantities file, and an output directory.
42 | Simply construct a FastSMC object and call `run()` to generate output in the output file root:
43 |
44 | ```python
45 | fast_smc = FastSMC(in_dir=input_files_root, dq_file=dq_file, out_dir=output_files_root)
46 | fast_smc.run()
47 | ```
48 |
49 | This creates a FastSMC object with sensible defaults.
50 | To fine-tune parameters you can instead create the FastSMC object with an instance of `DecodingParams`.
51 |
52 | ### DecodingParams
53 |
54 | Create an empty `DecodingParams` object:
55 |
56 | ```python
57 | params = DecodingParams()
58 | ```
59 |
60 | The following parameters can be set:
61 |
62 | ```python
63 | params.decodingQuantFile = dq_file
64 | params.inFileRoot = input_files_root
65 | params.map_file = map_file # Optional override for .map file, if not in input_files_root
66 | params.outFileRoot = output_files_root
67 | params.decodingModeString = 'array'
68 | params.usingCSFS = True
69 | params.batchSize = 32
70 | params.recallThreshold = 3
71 | params.min_m = 1.5
72 | params.hashing = True
73 | params.FastSMC = True
74 | params.BIN_OUT = True
75 | params.outputIbdSegmentLength = True
76 | params.time = 50
77 | params.noConditionalAgeEstimates = True
78 | params.doPerPairMAP = True
79 | params.doPerPairPosteriorMean = True
80 | params.hashingOnly = False
81 | ```
82 |
83 | > Note: the `hashingOnly` flag has not been extensively tested.
84 | You may also want to look into [this repository](https://github.com/gusevlab/germline2) for a standalone version.
85 |
86 | Finally, you can validate that the parameters are consistent for running FastSMC:
87 |
88 | ```python
89 | assert params.validateParamsFastSMC()
90 | ```
91 |
92 | Then, construct and run a `FastSMC` object using these parameters:
93 |
94 | ```python
95 | fast_smc = FastSMC(params)
96 | fast_smc.run()
97 | ```
98 |
99 | ### BinaryDataReader
100 |
101 | If you turn on `BIN_OUT` in the decoding parameters, the `BinaryDataReader` class can read sequential lines in a file.
102 | This is useful particularly if the output is too large to process entirely in memory.
103 |
104 | ```python
105 | binary_data_reader = BinaryDataReader(output_files_root + '.1.1.FastSMC.bibd.gz')
106 |
107 | while binary_data_reader.moreLinesInFile():
108 | line = binary_data_reader.getNextLine()
109 | ```
110 |
111 | For each line, the following attributes and methods are available:
112 |
113 | ```python
114 | line.ind1FamId
115 | line.ind1Id
116 | line.ind1Hap
117 | line.ind2FamId
118 | line.ind2Id
119 | line.ind2Hap
120 | line.chromosome
121 | line.ibdStart
122 | line.ibdEnd
123 | line.lengthInCentimorgans
124 | line.ibdScore
125 | line.postEst
126 | line.mapEst
127 |
128 | line.toString()
129 | ```
130 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. ASMC documentation master file, created by
2 | sphinx-quickstart on Mon Dec 6 16:23:40 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to the ASMC documentation!
7 | ==================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 | pages/quickstart_user
14 | pages/quickstart_developer
15 | pages/asmc
16 | pages/asmc_python
17 | pages/fastsmc
18 | pages/fastsmc_python
19 |
20 |
21 |
22 | Indices and tables
23 | ==================
24 |
25 | * :ref:`search`
26 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | REM Run from the directory that contains this script; the popd at :end undoes the pushd.
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 | REM Default to the sphinx-build found on PATH unless SPHINXBUILD is already set.
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | REM Without a target argument, print the Sphinx help instead of building.
13 | if "%1" == "" goto help
14 | REM Probe that sphinx-build can be started; errorlevel 9009 or above is treated as not found.
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.https://www.sphinx-doc.org/
25 | exit /b 1
26 | )
27 | REM Hand the requested target to sphinx-build in make mode (-M).
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/pages/fastsmc_python.rst:
--------------------------------------------------------------------------------
1 | FastSMC Python API
2 | ==================
3 |
4 | - `Examples using the Python
5 | bindings <#examples-using-the-python-bindings>`__
6 | - `API <#api>`__
7 |
8 | - `FastSMC <#fastsmc>`__
9 | - `DecodingParams <#decodingparams>`__
10 | - `BinaryDataReader <#binarydatareader>`__
11 |
12 | FastSMC includes Python bindings which can be installed using pip:
13 |
14 | ::
15 |
16 | pip install asmc-asmc
17 |
18 | Before reading further you may wish to read the `FastSMC
19 | docs <./fastsmc.md>`__. In particular, these sections are directly
20 | relevant:
21 |
22 | - `Input file formats <./fastsmc.md#input-file-formats>`__
23 | - `Output format <./fastsmc.md#output-format>`__
24 | - `Binary output <./fastsmc.md#binary-output>`__
25 | - `Relationship to
26 | ASMC <./fastsmc.md#relationship-to-asmc>`__
27 |
28 | And, from the `ASMC docs <./asmc.md>`__:
29 |
30 | - `Precomputed decoding
31 | quantities <./asmc.md#precomputed-decoding-quantities>`__
32 |
33 | Examples using the Python bindings
34 | ----------------------------------
35 |
36 | See the ``notebooks`` directory for examples. There are two Jupyter
37 | notebooks:
38 |
39 | - a `minimal working example <../notebooks/fastsmc-minimal.ipynb>`__,
40 | where sensible defaults for parameters are chosen automatically
41 | - a `more detailed example <../notebooks/fastsmc.ipynb>`__ that
42 | demonstrates how to customise parameters, how to convert the binary
43 | file to text format, and how to analyse the output if it is too large
44 | to fit in memory.
45 |
46 | API
47 | ---
48 |
49 | The core Python API for FastSMC consists of the following classes:
50 |
51 | - ``FastSMC``
52 | - ``DecodingParams``
53 | - ``BinaryDataReader``
54 |
55 | FastSMC
56 | ~~~~~~~
57 |
58 | The main ``FastSMC`` object can be constructed minimally with an input
59 | file root, a decoding quantities file, and an output directory. Simply
60 | construct a FastSMC object and call ``run()`` to generate output in the
61 | output file root:
62 |
63 | .. code:: python
64 |
65 | fast_smc = FastSMC(in_dir=input_files_root, dq_file=dq_file, out_dir=output_files_root)
66 | fast_smc.run()
67 |
68 | This creates a FastSMC object with sensible defaults. To fine-tune
69 | parameters you can instead create the FastSMC object with an instance of
70 | ``DecodingParams``.
71 |
72 | DecodingParams
73 | ~~~~~~~~~~~~~~
74 |
75 | Create an empty ``DecodingParams`` object:
76 |
77 | .. code:: python
78 |
79 | params = DecodingParams()
80 |
81 | The following parameters can be set:
82 |
83 | .. code:: python
84 |
85 | params.decodingQuantFile = dq_file
86 | params.inFileRoot = input_files_root
87 | params.map_file = map_file # Optional override for .map file, if not in input_files_root
88 | params.outFileRoot = output_files_root
89 | params.decodingModeString = 'array'
90 | params.usingCSFS = True
91 | params.batchSize = 32
92 | params.recallThreshold = 3
93 | params.min_m = 1.5
94 | params.hashing = True
95 | params.FastSMC = True
96 | params.BIN_OUT = True
97 | params.outputIbdSegmentLength = True
98 | params.time = 50
99 | params.noConditionalAgeEstimates = True
100 | params.doPerPairMAP = True
101 | params.doPerPairPosteriorMean = True
102 | params.hashingOnly = False
103 |
104 | ..
105 |
106 | Note: the ``hashingOnly`` flag has not been extensively tested. You
107 | may also want to look into `this
108 | repository `__ for a
109 | standalone version.
110 |
111 | Finally, you can validate that the parameters are consistent for running
112 | FastSMC:
113 |
114 | .. code:: python
115 |
116 | assert params.validateParamsFastSMC()
117 |
118 | Then, construct and run a ``FastSMC`` object using these parameters:
119 |
120 | .. code:: python
121 |
122 | fast_smc = FastSMC(params)
123 | fast_smc.run()
124 |
125 | BinaryDataReader
126 | ~~~~~~~~~~~~~~~~
127 |
128 | If you turn on ``BIN_OUT`` in the decoding parameters, the
129 | ``BinaryDataReader`` class can read sequential lines in a file. This is
130 | useful particularly if the output is too large to process entirely in
131 | memory.
132 |
133 | .. code:: python
134 |
135 | binary_data_reader = BinaryDataReader(output_files_root + '.1.1.FastSMC.bibd.gz')
136 |
137 | while binary_data_reader.moreLinesInFile():
138 | line = binary_data_reader.getNextLine()
139 |
140 | For each line, the following attributes and methods are available:
141 |
142 | .. code:: python
143 |
144 | line.ind1FamId
145 | line.ind1Id
146 | line.ind1Hap
147 | line.ind2FamId
148 | line.ind2Id
149 | line.ind2Hap
150 | line.chromosome
151 | line.ibdStart
152 | line.ibdEnd
153 | line.lengthInCentimorgans
154 | line.ibdScore
155 | line.postEst
156 | line.mapEst
157 |
158 | line.toString()
159 |
--------------------------------------------------------------------------------
/docs/pages/quickstart_developer.rst:
--------------------------------------------------------------------------------
1 | Quickstart guide for developers
2 | ===============================
3 |
4 | - `Linux <#linux>`__
5 | - `macOS <#macos>`__
6 | - `ResComp (oxford research
7 | computing) <#rescomp-oxford-research-computing>`__
8 | - `Python bindings <#python-bindings>`__
9 |
10 | Linux
11 | -----
12 |
13 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3
14 | or clang >= 7) and `CMake >= 3.15 <https://cmake.org/install/>`__.
15 | Additionally, to compile the Python bindings you need Python with
16 | development files:
17 |
18 | .. code:: bash
19 |
20 | sudo apt install python3-dev
21 |
22 | Then, follow these steps:
23 |
24 | .. code:: bash
25 |
26 | # Get the source
27 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev
28 | cd ASMC_dev
29 |
30 | # Create a build directory
31 | mkdir build && cd build
32 |
33 | # Configure and build
34 | # On first run, CMake will build the required dependencies
35 | cmake ..
36 | cmake --build . --parallel 4
37 |
38 | macOS
39 | -----
40 |
41 | This guide assumes you have a recent version of the `Xcode command line
42 | tools <https://developer.apple.com/xcode/features/>`__ and
43 | `Homebrew <https://brew.sh/>`__. Install the following dependencies:
44 |
45 | .. code:: bash
46 |
47 | brew install cmake
48 | brew install libomp
49 | brew install python # for python bindings, if required
50 |
51 | Then, follow these steps:
52 |
53 | .. code:: bash
54 |
55 | # Get the source
56 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev
57 | cd ASMC_dev
58 |
59 | # Create a build directory
60 | mkdir build && cd build
61 |
62 | # Configure and build
63 | # On first run, CMake will build the required dependencies
64 | cmake ..
65 | cmake --build . --parallel 4
66 |
67 | ResComp (oxford research computing)
68 | -----------------------------------
69 |
70 | All necessary dependencies are already installed on ResComp. Simply
71 | follow these steps:
72 |
73 | .. code:: bash
74 |
75 | # Load required modules
76 | module load GCC/10.2.0
77 | module load CMake/3.18.4-GCCcore-10.2.0
78 | module load git/2.28.0-GCCcore-10.2.0-nodocs
79 | module load Python/3.8.6-GCCcore-10.2.0
80 |
81 | # Get the source
82 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev
83 | cd ASMC_dev
84 |
85 | # Create a build directory
86 | mkdir build && cd build
87 |
88 | # Configure and build
89 | # On first run, CMake will build the required dependencies
90 | cmake ..
91 | cmake --build . --parallel 4
92 |
93 | Python bindings
94 | ---------------
95 |
96 | These instructions are platform independent, assuming you have installed
97 | all dependencies (excluding those from vcpkg) according to the
98 | instructions above. From the ``ASMC_dev`` directory:
99 |
100 | .. code:: bash
101 |
102 | python3 -m venv venv
103 | source venv/bin/activate
104 |
105 | pip install --upgrade pip setuptools wheel ninja
106 | pip install .
107 |
--------------------------------------------------------------------------------
/docs/pages/quickstart_user.rst:
--------------------------------------------------------------------------------
1 | Quickstart guide for users
2 | ==========================
3 |
4 | - `Python bindings <#python-bindings>`__
5 | - `Linux <#linux>`__
6 | - `macOS <#macos>`__
7 | - `Without vcpkg <#without-vcpkg>`__
8 |
9 | Python bindings
10 | ---------------
11 |
12 | If you want to use ASMC or FastSMC via their Python interface, you can
13 | simply install ASMC using pip:
14 |
15 | ::
16 |
17 | pip install asmc-asmc
18 |
19 | For examples, see the `ASMC python documentation <./asmc_python.md>`__
20 | and `FastSMC python documentation <./fastsmc_python.md>`__.
21 |
22 | If you want to compile the C++ executables, read on.
23 |
24 | Linux
25 | -----
26 |
27 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3
28 | or clang >= 7) and `CMake >= 3.15 <https://cmake.org/install/>`__. Then,
29 | follow these steps to build the ASMC, FastSMC and binary conversion
30 | executables:
31 |
32 | .. code:: bash
33 |
34 | # Get the source
35 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC
36 | cd ASMC
37 |
38 | # Create a build directory
39 | mkdir build && cd build
40 |
41 | # Configure and build
42 | # On first run, CMake will build the required dependencies
43 | cmake -DASMC_NO_PYTHON=TRUE ..
44 | cmake --build . --parallel 4
45 |
46 | macOS
47 | -----
48 |
49 | This guide assumes you have a recent version of the `Xcode command line
50 | tools <https://developer.apple.com/xcode/features/>`__ and
51 | `Homebrew <https://brew.sh/>`__. Install the following dependencies:
52 |
53 | .. code:: bash
54 |
55 | brew install cmake
56 | brew install libomp
57 |
58 | Then, follow these steps to build the ASMC, FastSMC and binary
59 | conversion executables:
60 |
61 | .. code:: bash
62 |
63 | # Get the source
64 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC
65 | cd ASMC
66 |
67 | # Create a build directory
68 | mkdir build && cd build
69 |
70 | # Configure and build
71 | # On first run, CMake will build the required dependencies
72 | cmake -DASMC_NO_PYTHON=TRUE ..
73 | cmake --build . --parallel 4
74 |
75 | Without vcpkg
76 | -------------
77 |
78 | If you would like to compile ASMC without using
79 | `vcpkg <https://github.com/microsoft/vcpkg/>`__ to handle dependencies,
80 | you should first ensure all dependencies are installed:
81 |
82 | **Ubuntu**
83 |
84 | .. code:: bash
85 |
86 | sudo apt install libboost-iostreams-dev libboost-math-dev libboost-program-options-dev libeigen3-dev libfmt-dev librange-v3-dev zlib1g-dev
87 |
88 | **macOS**
89 |
90 | .. code:: bash
91 |
92 | brew install boost eigen fmt range-v3 zlib
93 |
94 | Then, when you run CMake, add the following definition:
95 |
96 | .. code:: bash
97 |
98 | cmake -DASMC_AVOID_VCPKG=true ..
99 |
100 | You may additionally choose to not recursively clone all submodules, as
101 | long as you still obtain the ``DataModule`` submodule. From the ASMC
102 | directory:
103 |
104 | .. code:: bash
105 |
106 | git clone https://github.com/PalamaraLab/ASMC
107 | cd ASMC
108 | git submodule update --init DataModule
109 |
--------------------------------------------------------------------------------
/docs/quickstart_developer.md:
--------------------------------------------------------------------------------
1 | # Quickstart guide for developers
2 |
3 | - [Linux](#linux)
4 | - [macOS](#macos)
5 | - [ResComp (oxford research computing)](#rescomp-oxford-research-computing)
6 | - [Python bindings](#python-bindings)
7 |
8 | ## Linux
9 |
10 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3 or clang >= 7) and [CMake >= 3.15](https://cmake.org/install/).
11 | Additionally, to compile the Python bindings you need Python with development files:
12 |
13 | ```bash
14 | sudo apt install python3-dev
15 | ```
16 |
17 | Then, follow these steps:
18 |
19 | ```bash
20 | # Get the source
21 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev
22 | cd ASMC_dev
23 |
24 | # Create a build directory
25 | mkdir build && cd build
26 |
27 | # Configure and build
28 | # On first run, CMake will build the required dependencies
29 | cmake ..
30 | cmake --build . --parallel 4
31 | ```
32 |
33 | ## macOS
34 |
35 | This guide assumes you have a recent version of the [Xcode command line tools](https://developer.apple.com/xcode/features/) and [Homebrew](https://brew.sh/).
36 | Install the following dependencies:
37 |
38 | ```bash
39 | brew install cmake
40 | brew install libomp
41 | brew install python # for python bindings, if required
42 | ```
43 |
44 | Then, follow these steps:
45 |
46 | ```bash
47 | # Get the source
48 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev
49 | cd ASMC_dev
50 |
51 | # Create a build directory
52 | mkdir build && cd build
53 |
54 | # Configure and build
55 | # On first run, CMake will build the required dependencies
56 | cmake ..
57 | cmake --build . --parallel 4
58 | ```
59 |
60 | ## ResComp (oxford research computing)
61 |
62 | All necessary dependencies are already installed on ResComp. Simply follow these steps:
63 |
64 | ```bash
65 | # Load required modules
66 | module load GCC/10.2.0
67 | module load CMake/3.18.4-GCCcore-10.2.0
68 | module load git/2.28.0-GCCcore-10.2.0-nodocs
69 | module load Python/3.8.6-GCCcore-10.2.0
70 |
71 | # Get the source
72 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC_dev
73 | cd ASMC_dev
74 |
75 | # Create a build directory
76 | mkdir build && cd build
77 |
78 | # Configure and build
79 | # On first run, CMake will build the required dependencies
80 | cmake ..
81 | cmake --build . --parallel 4
82 | ```
83 |
84 | ## Python bindings
85 |
86 | These instructions are platform independent, assuming you have installed all dependencies (excluding those from vcpkg) according to the instructions above.
87 | From the `ASMC_dev` directory:
88 |
89 | ```bash
90 | python3 -m venv venv
91 | source venv/bin/activate
92 |
93 | pip install --upgrade pip setuptools wheel ninja
94 | pip install .
95 | ```
96 |
--------------------------------------------------------------------------------
/docs/quickstart_user.md:
--------------------------------------------------------------------------------
1 | # Quickstart guide for users
2 |
3 | - [Python bindings](#python-bindings)
4 | - [Linux](#linux)
5 | - [macOS](#macos)
6 | - [Without vcpkg](#without-vcpkg)
7 |
8 | ## Python bindings
9 |
10 | If you want to use ASMC or FastSMC via their Python interface, you can simply install ASMC using pip:
11 |
12 | ```
13 | pip install asmc-asmc
14 | ```
15 |
16 | For examples, see the [ASMC python documentation](./asmc_python.md) and [FastSMC python documentation](./fastsmc_python.md).
17 |
18 | If you want to compile the C++ executables, read on.
19 |
20 | ## Linux
21 |
22 | This guide assumes you have a C++17 compatible compiler (e.g. gcc >= 8.3 or clang >= 7) and [CMake >= 3.15](https://cmake.org/install/).
23 | Then, follow these steps to build the ASMC, FastSMC and binary conversion executables:
24 |
25 | ```bash
26 | # Get the source
27 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC
28 | cd ASMC
29 |
30 | # Create a build directory
31 | mkdir build && cd build
32 |
33 | # Configure and build
34 | # On first run, CMake will build the required dependencies
35 | cmake -DASMC_NO_PYTHON=TRUE ..
36 | cmake --build . --parallel 4
37 | ```
38 |
39 | ## macOS
40 |
41 | This guide assumes you have a recent version of the [Xcode command line tools](https://developer.apple.com/xcode/features/) and [Homebrew](https://brew.sh/).
42 | Install the following dependencies:
43 |
44 | ```bash
45 | brew install cmake
46 | brew install libomp
47 | ```
48 |
49 | Then, follow these steps to build the ASMC, FastSMC and binary conversion executables:
50 |
51 | ```bash
52 | # Get the source
53 | git clone --recurse-submodules https://github.com/PalamaraLab/ASMC
54 | cd ASMC
55 |
56 | # Create a build directory
57 | mkdir build && cd build
58 |
59 | # Configure and build
60 | # On first run, CMake will build the required dependencies
61 | cmake -DASMC_NO_PYTHON=TRUE ..
62 | cmake --build . --parallel 4
63 | ```
64 |
65 | ## Without vcpkg
66 |
67 | If you would like to compile ASMC without using [vcpkg](https://github.com/microsoft/vcpkg/) to handle dependencies, you should first ensure all dependencies are installed:
68 |
69 | **Ubuntu**
70 | ```bash
71 | sudo apt install libboost-iostreams-dev libboost-math-dev libboost-program-options-dev libeigen3-dev libfmt-dev librange-v3-dev zlib1g-dev
72 | ```
73 |
74 | **macOS**
75 | ```bash
76 | brew install boost eigen fmt range-v3 zlib
77 | ```
78 |
79 | Then, when you run CMake, add the following definition:
80 |
81 | ```bash
82 | cmake -DASMC_AVOID_VCPKG=true ..
83 | ```
84 |
85 | You may additionally choose to not recursively clone all submodules, as long as you still obtain the `DataModule` submodule.
86 | From the ASMC directory:
87 |
88 | ```bash
89 | git clone https://github.com/PalamaraLab/ASMC
90 | cd ASMC
91 | git submodule update --init DataModule
92 | ```
93 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-rtd-theme
--------------------------------------------------------------------------------
/exe/main.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include
17 | #include
18 | #include
19 | #include
20 |
21 | #include "Data.hpp"
22 | #include "DecodingParams.hpp"
23 | #include "DecodingQuantities.hpp"
24 | #include "FileUtils.hpp"
25 |
26 | #include "HMM.hpp"
27 | #include "StringUtils.hpp"
28 | #include "Timer.hpp"
29 | #include
30 |
31 | using namespace std;
32 |
33 | // Entry point of the ASMC executable. Parses the command line into DecodingParams, prints
34 | // the banner and the options in effect, loads the data, decodes the requested batch with
35 | // the HMM and, when requested, writes the posterior sums to <outFileRoot>*.sumOverPairs.gz.
36 | int main(int argc, char* argv[])
37 | {
38 |   srand(1234);
39 |
40 |   const char PROGRAM[] = "Ascertained Sequentially Markovian Coalescent (ASMC)";
41 |   const char VERSION[] = "1.0";
42 |   const char VERSION_DATE[] = "July 1, 2018";
43 |   const char YEAR[] = "2018";
44 |   const char LICENSE[] = "GNU GPL v3";
45 |   const char WEBSITE[] = "https://palamaralab.github.io/software/fastsmc/";
46 |
47 |   // Parse the command line; stop immediately if it cannot be processed.
48 |   DecodingParams params;
49 |   const bool parsedOk = params.processCommandLineArgs(argc, argv);
50 |   if (!parsedOk) {
51 |     cerr << "Error processing command line; exiting." << endl;
52 |     exit(1);
53 |   }
54 |
55 |   // Plain-ASCII variant of the banner, currently disabled:
56 |   // cout << " _____ __ __ _____ \n";
57 |   // cout << " /\\ / ____| | \\/ | / ____|\n";
58 |   // cout << " / \\ | (___ | \\ / | | | \n";
59 |   // cout << " / /\\ \\ \\___ \\ | |\\/| | | | \n";
60 |   // cout << " / ____ \\ ____) | | | | | | |____ \n";
61 |   // cout << " /_/ \\_\\ |_____/ |_| |_| \\_____|\n";
62 |
63 |   // Banner, version and licence information.
64 |   cout << "\n"
65 |        << " █████╗ ███████╗ ███╗ ███╗ ██████╗\n"
66 |        << "██╔══██╗ ██╔════╝ ████╗ ████║ ██╔════╝\n"
67 |        << "███████║ ███████╗ ██╔████╔██║ ██║ \n"
68 |        << "██╔══██║ ╚════██║ ██║╚██╔╝██║ ██║ \n"
69 |        << "██║ ██║ ███████║ ██║ ╚═╝ ██║ ╚██████╗\n"
70 |        << "╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝ ╚═════╝\n"
71 |        << "\n" << PROGRAM << " v." << VERSION << ", " << VERSION_DATE << "\n"
72 |        << LICENSE << ", Copyright (C) " << YEAR << " Pier Palamara" << "\n"
73 |        << "Manual: " << WEBSITE << "\n" << "\n";
74 |
75 |   cout << "Decoding batch " << params.jobInd << " of " << params.jobs << "\n\n";
76 |
77 |   // Report the options that shape this run.
78 |   cout << "Will decode " << params.decodingModeString << " data." << endl;
79 |   cout << "Output will have prefix: " << params.outFileRoot << endl;
80 |   if (!params.compress) {
81 |     cout << "Minimum marker distance to use CSFS is set to " << params.skipCSFSdistance
82 |          << "." << endl;
83 |   } else {
84 |     cout << "Will use classic emission model (no CSFS)." << endl;
85 |   }
86 |   if (params.useAncestral) {
87 |     cout << "Assuming ancestral alleles are correctly encoded." << endl;
88 |   }
89 |   if (params.doPosteriorSums) {
90 |     cout << "Will output sum of posterior tables for all pairs." << endl;
91 |   }
92 |   if (params.doMajorMinorPosteriorSums) {
93 |     cout << "Will output sum of posterior tables for all pairs, partitioned by "
94 |             "major/minor alleles."
95 |          << endl;
96 |   }
97 |
98 |   // Messages for further options, currently disabled:
99 |   // if (params.noBatches)
100 |   // cout << "Will not process samples in batches (slower)." << endl;
101 |   // if (!params.withinOnly)
102 |   // cout << "Will only decode maternal vs. paternal haplotypes." << endl;
103 |   // if (params.doPerPairMAP)
104 |   // cout << "Will output MAP for all haploid pairs (DANGER: huge files)." << endl;
105 |   // if (params.doPerPairPosteriorMean)
106 |   // cout << "Will output posterior mean for all haploid pairs (DANGER: huge
107 |   // files)." << endl;
108 |
109 |   // Started before loading so that the read time can be reported below.
110 |   Timer timer;
111 |
112 |   cout << "Data will be loaded from " << params.inFileRoot << "*\n";
113 |   Data data(params);
114 |   printf("Read haps in %.3f seconds.\n", timer.update_time());
115 |
116 |   HMM hmm(data, params);
117 |   hmm.decodeAll(params.jobs, params.jobInd);
118 |   const DecodingReturnValues& results = hmm.getDecodingReturnValues();
119 |
120 |   // Eigen output formatter to match original ASMC output
121 |   Eigen::IOFormat tabSeparated(Eigen::StreamPrecision, Eigen::DontAlignCols, "\t", "\n");
122 |
123 |   // Sum over all pairs (if requested).
124 |   if (params.doPosteriorSums) {
125 |     FileUtils::AutoGzOfstream sumsOut;
126 |     sumsOut.openOrExit(params.outFileRoot + ".sumOverPairs.gz");
127 |     cout << "Output file: " << params.outFileRoot << ".sumOverPairs.gz" << endl;
128 |     sumsOut << results.sumOverPairs.format(tabSeparated) << endl;
129 |     sumsOut.close();
130 |   }
131 |
132 |   if (params.doMajorMinorPosteriorSums) {
133 |     // Writes one tab-separated row per site to outFileRoot + suffix, reading a site's values
134 |     // from `flipped` when the site was flipped during folding and from `kept` otherwise.
135 |     const auto writeFoldedSums = [&](const char* suffix, const auto& kept, const auto& flipped) {
136 |       FileUtils::AutoGzOfstream out;
137 |       out.openOrExit(params.outFileRoot + suffix);
138 |       for (int pos = 0; pos < data.sites; pos++) {
139 |         for (uint k = 0; k < hmm.getDecodingQuantities().states; k++) {
140 |           if (k) {
141 |             out << "\t";
142 |           }
143 |           if (data.siteWasFlippedDuringFolding[pos]) {
144 |             out << flipped(pos, k);
145 |           } else {
146 |             out << kept(pos, k);
147 |           }
148 |         }
149 |         out << endl;
150 |       }
151 |       out.close();
152 |     };
153 |
154 |     // Sum for 00
155 |     writeFoldedSums(".00.sumOverPairs.gz", results.sumOverPairs00, results.sumOverPairs11);
156 |
157 |     // Sum for 01
158 |     FileUtils::AutoGzOfstream out01;
159 |     out01.openOrExit(params.outFileRoot + ".01.sumOverPairs.gz");
160 |     out01 << results.sumOverPairs01.format(tabSeparated) << endl;
161 |     out01.close();
162 |
163 |     // Sum for 11
164 |     writeFoldedSums(".11.sumOverPairs.gz", results.sumOverPairs11, results.sumOverPairs00);
165 |
166 |     cout << "Done.\n\n";
167 |   }
168 | }
169 |
--------------------------------------------------------------------------------
/exe/main_convertBinary.cpp:
--------------------------------------------------------------------------------
1 | #include "BinaryDataReader.hpp"
2 |
3 | #include
4 | #include
5 |
6 | // Prints every record of the binary file named by argv[1] as one text line on stdout.
7 | int main(int argc, char* argv[])
8 | {
9 |   // Usage errors go to stderr so they cannot be mistaken for converted records on stdout.
10 |   if (argc != 2) {
11 |     std::cerr << "Number of parameters is wrong." << std::endl;
12 |     std::cerr << "Only one parameter (name of binary file) is required." << std::endl;
13 |     return 1;
14 |   }
15 |
16 |   BinaryDataReader binaryDataReader(argv[1]);
17 |
18 |   // '\n' rather than std::endl: no flush per record; stdout is flushed at normal exit.
19 |   while (binaryDataReader.moreLinesInFile()) {
20 |     std::cout << binaryDataReader.getNextLine().toString() << '\n';
21 |   }
22 |   return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/exe/main_fastsmc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | #include "Data.hpp"
5 | #include "DecodingParams.hpp"
6 | #include "DecodingQuantities.hpp"
7 | #include "FastSMC.hpp"
8 | #include "HMM.hpp"
9 | #include "Timer.hpp"
10 |
11 | using namespace std;
12 |
13 | // FastSMC command-line entry point: parse the arguments, then run FastSMC with them.
14 | int main(int argc, char* argv[])
15 | {
16 |   DecodingParams params;
17 |   const bool argsAccepted = params.processCommandLineArgsFastSMC(argc, argv);
18 |   if (!argsAccepted) {
19 |     std::cerr << "Error processing command line; exiting." << std::endl;
20 |     exit(1);
21 |   }
22 |
23 |   ASMC::FastSMC decoder(params);
24 |   decoder.run();
25 |   return 0;
26 | }
27 |
--------------------------------------------------------------------------------
/notebooks/fastsmc-minimal.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# FastSMC minimal working example notebook\n",
8 | "\n",
9 | "This notebook demonstrates a minimal working example of the FastSMC python bindings, where sensible default parameters are set automatically.\n",
10 | "\n",
11 | "Please make sure you have installed the python bindings by following the instructions in `../README.md` before attempting to run this notebook.\n",
12 | "\n",
13 | "The example dataset was simulated using the setup described in the paper, corresponding to SNP data for 150 diploid individuals and a chromosomal region of 30 Mb, with recombination rate from chromosome 2 and under a European demographic model (see https://www.nature.com/articles/s41467-020-19588-x for more details)."
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "1) Import `asmc` which is installed with the Python bindings"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from asmc.asmc import *\n",
30 | "\n",
31 | "import pathlib\n",
32 | "import tempfile\n",
33 | "\n",
34 | "data_dir = pathlib.Path('.').resolve().parent / 'ASMC_data'"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "2) Specify paths for input (example provided in a submodule of this repository) and output. Input is expected to have the following files (note: make sure the map file is in the right format, as described in https://github.com/PalamaraLab/ASMC/blob/main/docs/fastsmc.md#input-file-formats):\n",
42 | "- `.hap.gz`\n",
43 | "- `.map`\n",
44 | "- `.samples`"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "input_files_root = str(data_dir / 'examples' / 'fastsmc' / 'example')\n",
54 | "dq_file = str(data_dir / 'decoding_quantities' / '30-100-2000_CEU.decodingQuantities.gz')\n",
55 | "output_files_root = tempfile.TemporaryDirectory().name"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "3) Create the Python FastSMC object and run it. This should only take a few seconds."
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "fast_smc = FastSMC(in_dir=input_files_root, dq_file=dq_file, out_dir=output_files_root)\n",
72 | "fast_smc.run()"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "4) Read data, add column names and filter to remove IBD segments with low IBD score. Note that for a large analysis, loading all data into memory is unlikely to be possible. See fastsmc.ipynb for an example that reads the output line-by-line."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "%config InlineBackend.figure_formats = ['svg']\n",
89 | "\n",
90 | "import numpy as np\n",
91 | "import pandas as pd\n",
92 | "import matplotlib.pyplot as plt\n",
93 | "\n",
94 | "data = pd.read_csv(output_files_root + '.1.1.FastSMC.ibd.gz', sep='\\t', header=None)\n",
95 | "\n",
96 | "data.columns = ['ind1_famid', 'ind1_id', 'ind1_hap', 'ind2_famid', 'ind2_id', 'ind2_hap', 'chromosome',\n",
97 | " 'ibd_start', 'ibd_end', 'length_in_cM', 'ibd_score', 'post_est', 'map_est']\n",
98 | "\n",
99 | "filtered = data[data['ibd_score'] > 0.1]\n",
100 | "filtered"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {
106 | "pycharm": {
107 | "name": "#%% md\n"
108 | }
109 | },
110 | "source": [
111 | "5) Visualise data: here we simply bin the MAP age estimates and the IBD segment length"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "plt.xlabel(\"MAP age estimate (in generations)\")\n",
121 | "filtered['map_est'].hist(range=(0, 100))\n",
122 | "plt.gca().set_yscale('linear')"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {
129 | "pycharm": {
130 | "name": "#%%\n"
131 | }
132 | },
133 | "outputs": [],
134 | "source": [
135 | "plt.xlabel(\"IBD segments length (in cM)\")\n",
136 | "filtered['length_in_cM'].hist(range=(0, 15))\n",
137 | "plt.gca().set_yscale('log')"
138 | ]
139 | }
140 | ],
141 | "metadata": {
142 | "kernelspec": {
143 | "display_name": "Python 3 (ipykernel)",
144 | "language": "python",
145 | "name": "python3"
146 | },
147 | "language_info": {
148 | "codemirror_mode": {
149 | "name": "ipython",
150 | "version": 3
151 | },
152 | "file_extension": ".py",
153 | "mimetype": "text/x-python",
154 | "name": "python",
155 | "nbconvert_exporter": "python",
156 | "pygments_lexer": "ipython3",
157 | "version": "3.8.10"
158 | }
159 | },
160 | "nbformat": 4,
161 | "nbformat_minor": 4
162 | }
163 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Based on https://github.com/pybind/cmake_example
2 |
3 | import os
4 | import sys
5 | import subprocess
6 |
7 | from setuptools import setup, Extension, find_namespace_packages
8 | from setuptools.command.build_ext import build_ext
9 |
10 | # Convert distutils Windows platform specifiers to CMake -A arguments
11 | PLAT_TO_CMAKE = {
12 | "win32": "Win32",
13 | "win-amd64": "x64",
14 | "win-arm32": "ARM",
15 | "win-arm64": "ARM64",
16 | }
17 |
18 |
class CMakeExtension(Extension):
    """Extension described by a CMake source directory rather than a file list.

    The name must be the _single_ output extension produced by the CMake
    build. If you need multiple extensions, see scikit-build.
    """

    def __init__(self, name, sourcedir=""):
        # No sources are handed to setuptools: CMake performs the compilation.
        super().__init__(name, sources=[])
        self.sourcedir = os.path.abspath(sourcedir)
26 |
27 |
class CMakeBuild(build_ext):
    """``build_ext`` command that builds a ``CMakeExtension`` by invoking CMake."""

    def build_extension(self, ext):
        """Configure and build ``ext`` with CMake inside ``self.build_temp``.

        Args:
            ext: the ``CMakeExtension`` to build; ``ext.sourcedir`` is passed to
                CMake as the source directory.

        Raises:
            KeyError: if an MSVC multi-config generator is used and
                ``self.plat_name`` has no entry in ``PLAT_TO_CMAKE``.
            subprocess.CalledProcessError: if the CMake configure or build step
                exits with a non-zero status.
        """
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))

        # required for auto-detection of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        cfg = "Debug" if self.debug else "Release"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",
            "-DWARNINGS_AS_ERRORS=OFF",
            "-DASMC_TESTING=OFF",
        ]
        build_args = []

        if self.compiler.compiler_type != "msvc":
            # Using Ninja-build since it a) is available as a wheel and b)
            # multithreads automatically. MSVC would require all variables be
            # exported for Ninja to pick it up, which is a little tricky to do.
            # Users can override the generator with CMAKE_GENERATOR in CMake
            # 3.15+.
            if not cmake_generator:
                cmake_args += ["-GNinja"]

        else:

            # Single config generators are handled "normally"
            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})

            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})

            # Specify the arch if using MSVC generator, but only if it doesn't
            # contain a backward-compatibility arch spec already in the
            # generator name.
            if not single_config and not contains_arch:
                cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]

            # Multi-config generators have a different way to specify configs
            if not single_config:
                cmake_args += [
                    "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)
                ]
                build_args += ["--config", cfg]

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is a Python 3 only way to set parallel jobs by hand
            # using -j in the build_ext call, not supported by pip or PyPA-build.
            if hasattr(self, "parallel") and self.parallel:
                # CMake 3.12+ only.
                build_args += ["-j{}".format(self.parallel)]

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test when several builds share a temp directory.
        os.makedirs(self.build_temp, exist_ok=True)

        subprocess.check_call(
            ["cmake", ext.sourcedir] + cmake_args, cwd=self.build_temp
        )
        subprocess.check_call(
            ["cmake", "--build", "."] + build_args, cwd=self.build_temp
        )
103 |
104 |
def _read_text(path):
    """Return the whole content of the UTF-8 encoded text file at ``path``."""
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# The PyPI project page shows the README followed by the release notes.
long_description = _read_text('PyPI_README.md')
release_notes = _read_text('RELEASE_NOTES.md')

setup(
    name='asmc-asmc',
    version='1.3.1',
    author='PalamaraLab (https://palamaralab.github.io/)',
    description='ASMC is a method to efficiently estimate pairwise coalescence time along the genome',
    url='https://github.com/PalamaraLab/ASMC/',
    python_requires=">=3.6",
    packages=find_namespace_packages(include=['asmc.*']),
    long_description=long_description + '\n' + release_notes,
    long_description_content_type="text/markdown",
    install_requires=['jupyter', 'numpy', 'pandas', 'asmc-preparedecoding', 'matplotlib'],
    # The native module is built by CMake through the custom build_ext command.
    ext_modules=[CMakeExtension('asmc/asmc')],
    cmdclass={'build_ext': CMakeBuild},
    zip_safe=False,
)
126 |
--------------------------------------------------------------------------------
/src/ASMC.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_HPP
17 | #define ASMC_HPP
18 |
19 | #include "Data.hpp"
20 | #include "DecodePairsReturnStruct.hpp"
21 | #include "DecodingParams.hpp"
22 | #include "HMM.hpp"
23 |
24 | #include
25 | #include
26 |
27 | namespace ASMC
28 | {
29 |
30 | class ASMC // Owns a DecodingParams, a Data and an HMM object and exposes decoding entry points over them
31 | {
32 |
33 | private:
34 | DecodingParams mParams; // decoding parameters held by this instance
35 | Data mData; // Data object (type declared in Data.hpp)
36 | HMM mHmm; // HMM object (type declared in HMM.hpp)
37 |
38 | public:
39 | /**
40 | * ASMC constructor with full control over parameters, by manually specifying a DecodingParams object.
41 | *
42 | * @param params the decoding parameters (taken by value)
43 | */
44 | explicit ASMC(DecodingParams params);
45 |
46 | /**
47 | * ASMC constructor that will set sensible defaults. If you wish to fine-tune parameters, use the constructor that
48 | * takes a DecodingParams object, which you can configure manually.
49 | *
50 | * @param inFileRoot the input file root
51 | * @param decodingQuantFile the decoding quantities file
52 | * @param outFileRoot the output file root, default to the input file root
 * @param decodingMode the decoding mode given as a string, defaults to "array"
53 | */
54 | ASMC(const std::string& inFileRoot, const std::string& decodingQuantFile, const std::string& outFileRoot = "",
55 | const std::string& decodingMode = "array");
56 |
57 | DecodingParams getDecodingParams(); // returns a DecodingParams by value
58 |
59 | unsigned long getDiploidSampleSize();
60 |
61 | unsigned long getHaploidSampleSize();
62 |
63 | int getNumSites();
64 |
65 | std::vector getPhysicalPositions(); // returns a vector by value
66 |
67 | std::vector getGeneticPositions(); // returns a vector by value
68 |
69 | DecodingReturnValues decodeAllInJob();
70 |
71 | void decodePairs(unsigned from = 0u, unsigned to = 0u, float cmBurnIn = 0.5f); // NOTE(review): meaning of from == to == 0 is defined in ASMC.cpp - confirm there
72 |
73 | void decodePairs(const std::vector &hapIndicesA, const std::vector &hapIndicesB, // overload taking two vectors named as haplotype indices
74 | unsigned from = 0u, unsigned to = 0u, float cmBurnIn = 0.5f);
75 |
76 | void decodePairs(const std::vector& hapIdsA, const std::vector& hapIdsB, // overload taking two vectors named as haplotype IDs
77 | unsigned from = 0u, unsigned to = 0u, float cmBurnIn = 0.5f);
78 |
79 | DecodePairsReturnStruct getCopyOfResults(); // results returned by value
80 |
81 | const DecodePairsReturnStruct& getRefOfResults(); // results returned by const reference; NOTE(review): presumably valid only while this object is alive - confirm
82 |
83 | const std::vector& getExpectedTimes();
84 |
85 | /// Set to true to store per pair posterior mean
86 | void setStorePerPairPosteriorMean(bool storePerPairPosteriorMean = true);
87 |
88 | /// Set to true to write per pair posterior mean to file
89 | void setWritePerPairPosteriorMean(bool writePerPairPosteriorMean = true);
90 |
91 | /// Set to true to store per pair MAP
92 | void setStorePerPairMap(bool storePerPairMAP = true);
93 |
94 | /// Set to true to write per pair MAP to file
95 | void setWritePerPairMap(bool writePerPairMAP = true);
96 |
97 | /// Set to true to store per pair posterior
98 | void setStorePerPairPosterior(bool storePerPairPosterior = true);
99 |
100 | /// Set to true to store the sum of posteriors
101 | void setStoreSumOfPosterior(bool storeSumOfPosterior = true);
102 |
103 | };
104 | } // namespace ASMC
105 |
106 | #endif
107 |
--------------------------------------------------------------------------------
/src/AvxDefinitions.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #ifndef AVXDEFINITIONS_HPP
18 | #define AVXDEFINITIONS_HPP
19 |
20 | #include
21 |
// Vectorisation macro table. Each of the four #ifdef sections below is independent: the build is
// expected to define exactly one of NO_SSE, SSE, AVX or AVX512. Nothing here selects a default when
// none is defined, and defining more than one makes the later section redefine the same macros.
// Scalar fallback: only MODE and VECX are defined, none of the intrinsic aliases.
22 | #ifdef NO_SSE
23 | #define MODE "NO_SSE"
24 | #define VECX 4
25 | #endif
26 |
27 | // SSE vectorization (block size = 4)
28 | #ifdef SSE
29 | #define MODE "SSE"
30 | #define VECX 4
31 | #define FLOAT __m128
32 | #define LOAD _mm_load_ps
33 | #define STORE _mm_store_ps
34 | #define MULT _mm_mul_ps
35 | #define ADD _mm_add_ps
36 | #define RECIPROCAL _mm_rcp_ps // NOTE(review): rcp intrinsics return an approximate reciprocal - confirm precision is acceptable
37 | #define LOAD1 _mm_load1_ps
38 | #endif
39 |
40 | // AVX vectorization (block size = 8)
41 | #ifdef AVX
42 | #define MODE "AVX"
43 | #include
44 | #define VECX 8
45 | #define FLOAT __m256
46 | #define LOAD _mm256_load_ps
47 | #define STORE _mm256_store_ps
48 | #define MULT _mm256_mul_ps
49 | #define ADD _mm256_add_ps
50 | #define RECIPROCAL _mm256_rcp_ps
51 | #define LOAD1 _mm256_broadcast_ss
52 | #endif
53 |
54 | // AVX512 vectorization (block size = 16)
55 | #ifdef AVX512
56 | #define MODE "AVX512"
57 | #include
58 | #define VECX 16
59 | #define FLOAT __m512
60 | #define LOAD _mm512_load_ps
61 | #define STORE _mm512_store_ps
62 | #define MULT _mm512_mul_ps
63 | #define ADD _mm512_add_ps
64 | #define RECIPROCAL _mm512_rcp14_ps
65 | #define LOAD1 _mm512_set1_ps // NOTE(review): unlike the SSE/AVX LOAD1 aliases this one takes a float value, not a pointer - confirm call sites
66 | #endif
67 |
68 |
69 | #endif // AVXDEFINITIONS_HPP
70 |
--------------------------------------------------------------------------------
/src/BinaryDataReader.hpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by fergus on 28/08/2020.
3 | //
4 |
5 | #ifndef ASMC_BINARYDATAREADER_HPP
6 | #define ASMC_BINARYDATAREADER_HPP
7 |
8 | #include
9 |
10 | #include
11 |
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 |
22 | namespace fs = std::filesystem;
23 |
/**
 * One decoded record (line) of IBD output for a pair of haplotypes.
 *
 * Every field starts at a sentinel value ("0_00" for identifiers, -1 for numbers). The four float
 * fields are optional columns: toString() emits a column only when its value differs from -1.f.
 */
struct IbdPairDataLine {

  std::string ind1FamId = "0_00";
  std::string ind1Id = "0_00";
  int ind1Hap = -1;

  std::string ind2FamId = "0_00";
  std::string ind2Id = "0_00";
  int ind2Hap = -1;

  int chromosome = -1;

  int ibdStart = -1;
  int ibdEnd = -1;

  float lengthInCentimorgans = -1.f;
  float ibdScore = -1.f;
  float postEst = -1.f;
  float mapEst = -1.f;

  /**
   * Format the record as a single tab-separated line without a trailing newline.
   *
   * The nine mandatory columns are always written. Each optional float column is appended, in the order
   * length, score, posterior estimate, MAP estimate, only if it is not the -1.f sentinel. A genuine value of
   * exactly -1 is therefore indistinguishable from "absent" and is omitted.
   *
   * @return the formatted line
   */
  [[nodiscard]] std::string toString() const
  {
    std::stringstream line;
    // The optional columns are floats: digits10 + 1 significant digits are printed for them
    line << std::setprecision(std::numeric_limits<float>::digits10 + 1);

    line << ind1FamId << '\t' << ind1Id << '\t' << ind1Hap << '\t' << ind2FamId << '\t' << ind2Id << '\t' << ind2Hap
         << '\t' << chromosome << '\t' << ibdStart << '\t' << ibdEnd;

    if (lengthInCentimorgans != -1.f) {
      line << '\t' << lengthInCentimorgans;
    }

    if (ibdScore != -1.f) {
      line << '\t' << ibdScore;
    }

    if (postEst != -1.f) {
      line << '\t' << postEst;
    }

    if (mapEst != -1.f) {
      line << '\t' << mapEst;
    }

    return line.str();
  }
};
71 |
72 | class BinaryDataReader
73 | {
74 |
75 | private:
76 | /**
77 | * Handle to the binary zipped file, opened in the constructor and closed in the destructor
78 | */
79 | gzFile mGzBinaryFileHandle;
80 |
81 | bool mContainsIbdSegmentLengths = false;
82 | bool mContainsIbdScore = false;
83 | bool mContainsPosteriorAgeEstimates = false;
84 | bool mContainsMapAgeEstimates = false;
85 |
86 | int mChromosomeNumber = -1;
87 | unsigned mNumIds = 0u;
88 |
89 | std::vector mFamIds;
90 | std::vector mIIds;
91 |
92 | unsigned mPreReadStartOfNextLine = {};
93 |
94 | bool mMoreLinesInFile = true;
95 |
96 | void ReadHeader()
97 | {
98 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsIbdSegmentLengths), sizeof(bool));
99 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsIbdScore), sizeof(bool));
100 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsPosteriorAgeEstimates), sizeof(bool));
101 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mContainsMapAgeEstimates), sizeof(bool));
102 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mChromosomeNumber), sizeof(int));
103 |
104 | gzread(mGzBinaryFileHandle, reinterpret_cast(&mNumIds), sizeof(unsigned));
105 | mFamIds.reserve(mNumIds);
106 | mIIds.reserve(mNumIds);
107 |
108 | for (unsigned i = 0; i < mNumIds; i++) {
109 |
110 | unsigned lengthFamId = {};
111 | gzread(mGzBinaryFileHandle, reinterpret_cast(&lengthFamId), sizeof(unsigned));
112 | mFamIds.emplace_back(lengthFamId, 'z');
113 | gzread(mGzBinaryFileHandle, &mFamIds.at(i).at(0), lengthFamId);
114 |
115 | unsigned lengthIId = {};
116 | gzread(mGzBinaryFileHandle, reinterpret_cast(&lengthIId), sizeof(unsigned));
117 | mIIds.emplace_back(lengthIId, 'z');
118 | gzread(mGzBinaryFileHandle, &mIIds.at(i).at(0), lengthIId);
119 | }
120 | }
121 |
122 | void CheckIfNextLineExists()
123 | {
124 | if (gzread(mGzBinaryFileHandle, reinterpret_cast(&mPreReadStartOfNextLine), sizeof(unsigned)) <
125 | sizeof(unsigned)) {
126 | mMoreLinesInFile = false;
127 | }
128 | }
129 |
130 | public:
131 | explicit BinaryDataReader(const std::string& binaryFile)
132 | {
133 |
134 | if (!fs::is_regular_file(binaryFile)) {
135 | throw std::runtime_error(fmt::format("Provided path to binary file {} is not a file\n", binaryFile));
136 | }
137 |
138 | mGzBinaryFileHandle = gzopen(binaryFile.c_str(), "rb");
139 | ReadHeader();
140 | CheckIfNextLineExists();
141 | }
142 |
143 | IbdPairDataLine getNextLine()
144 | {
145 | IbdPairDataLine line;
146 |
147 | // We have already read the first number from this line with a call to CheckIfNextLineExists()
148 | unsigned ind1 = mPreReadStartOfNextLine;
149 | unsigned ind2 = -1;
150 |
151 | std::uint_least8_t hap1;
152 | std::uint_least8_t hap2;
153 |
154 | gzread(mGzBinaryFileHandle, reinterpret_cast(&hap1), sizeof(std::uint_least8_t));
155 | gzread(mGzBinaryFileHandle, reinterpret_cast(&ind2), sizeof(unsigned));
156 | gzread(mGzBinaryFileHandle, reinterpret_cast(&hap2), sizeof(std::uint_least8_t));
157 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.ibdStart), sizeof(int));
158 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.ibdEnd), sizeof(int));
159 |
160 | if (mContainsIbdSegmentLengths) {
161 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.lengthInCentimorgans), sizeof(float));
162 | }
163 |
164 | if (mContainsIbdScore) {
165 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.ibdScore), sizeof(float));
166 | }
167 |
168 | if (mContainsPosteriorAgeEstimates) {
169 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.postEst), sizeof(float));
170 | }
171 |
172 | if (mContainsMapAgeEstimates) {
173 | gzread(mGzBinaryFileHandle, reinterpret_cast(&line.mapEst), sizeof(float));
174 | }
175 |
176 | line.ind1Hap = static_cast(hap1);
177 | line.ind2Hap = static_cast(hap2);
178 |
179 | line.chromosome = mChromosomeNumber;
180 |
181 | line.ind1FamId = mFamIds.at(ind1);
182 | line.ind1Id = mIIds.at(ind1);
183 |
184 | line.ind2FamId = mFamIds.at(ind2);
185 | line.ind2Id = mIIds.at(ind2);
186 |
187 | // Pre-read first number from next line to check whether the line exists
188 | CheckIfNextLineExists();
189 |
190 | return line;
191 | }
192 |
193 | [[nodiscard]] bool moreLinesInFile() const
194 | {
195 | return mMoreLinesInFile;
196 | }
197 |
198 | ~BinaryDataReader()
199 | {
200 | gzclose(mGzBinaryFileHandle);
201 | }
202 | };
203 |
204 | #endif // ASMC_BINARYDATAREADER_HPP
205 |
--------------------------------------------------------------------------------
/src/Data.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_DATA_HPP
17 | #define ASMC_DATA_HPP
18 |
19 | #include
20 | #include
21 | #include
22 | #include
23 |
24 | #include "Individual.hpp"
25 | #include "DecodingParams.hpp"
26 | #include "DecodingQuantities.hpp"
27 |
28 | class Data // Input data container; all data members in the public section are directly accessible to callers
29 | {
30 |
31 | public:
32 |
33 | std::vector FamIDList = {}; // populated by readSamplesList (see its doc comment below)
34 | std::vector IIDList = {}; // populated by readSamplesList
35 | std::vector famAndIndNameList = {}; // populated by readSamplesList
36 | std::vector individuals = {};
37 |
38 | unsigned long sampleSize = 0ul;
39 | unsigned long haploidSampleSize = 0ul;
40 | int sites = 0;
41 | bool decodingUsesCSFS = false;
42 | bool mJobbing = false;
43 | bool foldToMinorAlleles = false;
44 | std::vector geneticPositions = {};
45 | std::vector physicalPositions = {};
46 | std::vector siteWasFlippedDuringFolding = {};
47 | std::vector recRateAtMarker = {};
48 |
49 | // Variables relating to FastSMC
50 | int chrNumber = 0;
51 | unsigned int windowSize = 0u; // window size in triangles for each job
52 | unsigned int w_i = 0u; // window id for ind_i for jobs
53 | unsigned int w_j = 0u; // window id for ind_j for jobs
54 | bool is_j_above_diag = false;
55 | std::unordered_map physicalPositionsMap = {}; // map where key=physicalPosition, value=indexPosition
56 |
57 | /**
58 | * Construct the data object, which also constructs the decoding quantities that will be owned by this object
59 | *
60 | * @param params the decoding params
61 | */
62 | explicit Data(const DecodingParams& params);
63 |
64 | static int countHapLines(std::string inFileRoot); // NOTE(review): presumably counts lines of the haps file under inFileRoot - confirm in Data.cpp
65 | static int countSamplesLines(std::string inFileRoot); // NOTE(review): presumably counts lines of the samples file under inFileRoot - confirm in Data.cpp
66 |
67 | /**
68 | * Calculate the undistinguished counts
69 | *
70 | * @param numCsfsSamples the number of CSFS samples
71 | * @return the undistinguished counts
72 | */
73 | std::vector> calculateUndistinguishedCounts(int numCsfsSamples) const;
74 |
75 | const std::vector& getSnpIDs() const; // read-only accessor; NOTE(review): presumably returns the private SNP_IDs member - confirm
76 |
77 | private:
78 |
79 | /**
80 | * Determine whether a sample should be read, based on the jobID, number of jobs, and the number of lines processed.
81 | * ASMC will always return true, but FastSMC will determine whether to read a sample.
82 | *
83 | * @param linesProcessed the number of lines processed so far
84 | * @param jobID the jobID, which will be the default value of -1 for ASMC
85 | * @param jobs the number of jobs, which will be the default value of -1 for ASMC
86 | * @return whether to read the sample
87 | */
88 | bool readSample(unsigned linesProcessed, int jobID, int jobs);
89 |
90 | /**
91 | * Read the samples file and populate members `FamIDList`, `IIDList` and `famAndIndNameList`.
92 | *
93 | * @param inFileRoot location of input files
94 | * @param jobID the jobID which defaults to -1 indicating no jobbing
95 | * @param jobs the number of jobs which defaults to -1 indicating no jobbing
96 | */
97 | void readSamplesList(const std::string& inFileRoot, int jobID, int jobs); // the -1 defaults mentioned above are not part of this declaration; callers pass them explicitly
98 |
99 | void readHaps(std::string inFileRoot, bool foldToMinorAlleles); // overload without job information
100 | void readHaps(std::string inFileRoot, bool foldToMinorAlleles, int jobID, int jobs, // overload with job information and a genetic map
101 | std::vector>& genetic_map);
102 |
103 | /**
104 | * Read Plink-format map file
105 | * @param inFileRoot the input file root
106 | * @param mapFile: optional direct path to map file, used if map file is not in inFileRoot
107 | */
108 | void readMap(const std::string& inFileRoot, const std::string& mapFile = "");
109 |
110 | /**
111 | * Subsumed functionality from FastSMC to read genetic map as a vector of pairs.
112 | * TODO: can this be harmonised with the other readMap method?
113 | * @param inFileRoot the input file root
114 | * @param mapFile: optional direct path to map file, used if map file is not in inFileRoot
115 | * @return the genetic map as a vector of pairs
116 | */
117 | static std::vector> readMapFastSMC(const std::string& inFileRoot,
118 | const std::string& mapFile = "");
119 |
120 | std::vector totalSamplesCount;
121 | std::vector derivedAlleleCounts;
122 | std::vector SNP_IDs;
123 |
124 | static int sampleHypergeometric(int populationSize, int numberOfSuccesses, int sampleSize);
125 |
126 |
127 | void readGeneticMap(unsigned long int bp, std::vector>& genetic_map,
128 | unsigned int& cur_g, unsigned int pos); // cur_g is taken by non-const reference, so the call may update it
129 |
130 | void addMarker(unsigned long int physicalPosition, double geneticPosition, unsigned int pos);
131 |
132 |
133 |
134 |
135 | };
136 |
137 | #endif // ASMC_DATA_HPP
138 |
--------------------------------------------------------------------------------
/src/DecodePairsReturnStruct.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef FASTSMC_DECODE_PAIRS_RETURN_STRUCT_HPP
17 | #define FASTSMC_DECODE_PAIRS_RETURN_STRUCT_HPP
18 |
19 | #pragma clang diagnostic push
20 | #pragma ide diagnostic ignored "cppcoreguidelines-non-private-member-variables-in-classes"
21 |
22 | #include
23 | #include
24 |
25 | #include
26 | #include
27 | #include
28 |
29 | struct DecodePairsReturnStruct { // Result storage for pair decoding; which arrays are sized is chosen in initialise()
30 |
31 | private:
32 |
33 | bool m_storeFullPosteriors = false; // gates perPairPosteriors
34 | bool m_storeSumOfPosteriors = false; // gates sumOfPosteriors
35 | bool m_storePerPairPosteriors = false; // gates perPairPosteriorMeans and its min/argmin arrays (not perPairPosteriors)
36 | bool m_storePerPairMAPs = false; // gates perPairMAPs and its min/argmin arrays
37 |
38 | std::size_t numWritten = 0ul; // counter advanced by incrementNumWritten()
39 |
40 | public:
// Reset the written counter, record which outputs to store, and resize the corresponding arrays.
// The number of pairs is individualsA.size(); individualsB is not used in this function.
// perPairIndices is always resized; every other array is resized only if its flag is true.
41 | void initialise(const std::vector& individualsA, const std::vector& individualsB,
42 | long int numSites, long int numStates, bool _fullPosteriors = false, bool _sumOfPosteriors = false,
43 | bool _perPairPosteriors = false, bool _perPairMAPs = false)
44 | {
45 | numWritten = 0ul;
46 | Eigen::Index numPairsToDecode = individualsA.size();
47 |
48 | m_storeFullPosteriors = _fullPosteriors;
49 | m_storeSumOfPosteriors = _sumOfPosteriors;
50 | m_storePerPairPosteriors = _perPairPosteriors;
51 | m_storePerPairMAPs = _perPairMAPs;
52 |
53 | perPairIndices.resize(numPairsToDecode);
54 |
55 | if (m_storeFullPosteriors) {
56 | perPairPosteriors.resize(numPairsToDecode);
57 | for (auto& arr : perPairPosteriors) {
58 | arr.resize(numStates, numSites);
59 | }
60 | }
61 |
62 | if (m_storeSumOfPosteriors) {
63 | sumOfPosteriors.resize(numStates, numSites);
64 | sumOfPosteriors.setZero(); // only the sum is zeroed; the other arrays are resized without being filled here
65 | }
66 |
67 | if (m_storePerPairPosteriors) {
68 | perPairPosteriorMeans.resize(numPairsToDecode, numSites);
69 | minPosteriorMeans.resize(numSites);
70 | argminPosteriorMeans.resize(numSites);
71 | }
72 |
73 | if (m_storePerPairMAPs) {
74 | perPairMAPs.resize(numPairsToDecode, numSites);
75 | minMAPs.resize(numSites);
76 | argminMAPs.resize(numSites);
77 | }
78 | }
79 |
80 | /// iHapIdx, iHapId, jHapIdx, jHapId
81 | std::vector> perPairIndices;
82 |
83 | /// The full set of posteriors: for each pair this is a (states * numSites) matrix
84 | std::vector> perPairPosteriors;
85 |
86 | /// The sum of all posteriors in perPairPosteriors: a (states * numSites) matrix
87 | Eigen::Array sumOfPosteriors;
88 |
89 | /// Posterior means: each row is an array of length numSites
90 | Eigen::Array perPairPosteriorMeans;
91 |
92 | Eigen::Array minPosteriorMeans; // per-site minimum of perPairPosteriorMeans, filled by finaliseCalculations()
93 | Eigen::Array argminPosteriorMeans; // per-site row index of that minimum, filled by finaliseCalculations()
94 |
95 | Eigen::Array perPairMAPs; // resized to (numPairs, numSites) in initialise()
96 |
97 | Eigen::Array minMAPs; // per-site minimum of perPairMAPs, filled by finaliseCalculations()
98 | Eigen::Array argminMAPs; // per-site row index of that minimum, filled by finaliseCalculations()
99 |
100 | void incrementNumWritten()
101 | {
102 | numWritten += 1;
103 | }
104 |
// For every column (site) of perPairPosteriorMeans and of perPairMAPs, store the column minimum and the
// row index at which it occurs. Each loop runs over the column count of its own array.
// NOTE(review): minCoeff on a column with zero rows (no pairs) is presumably invalid - confirm callers never hit this.
105 | void finaliseCalculations()
106 | {
107 | for (Eigen::Index siteIdx = 0ll; siteIdx < perPairPosteriorMeans.cols(); ++siteIdx) {
108 | Eigen::Index argmin{};
109 | minPosteriorMeans(siteIdx) = perPairPosteriorMeans.col(siteIdx).minCoeff(&argmin);
110 | argminPosteriorMeans(siteIdx) = static_cast(argmin);
111 | }
112 |
113 | for (Eigen::Index siteIdx = 0ll; siteIdx < perPairMAPs.cols(); ++siteIdx) {
114 | Eigen::Index argmin{};
115 | minMAPs(siteIdx) = perPairMAPs.col(siteIdx).minCoeff(&argmin);
116 | argminMAPs(siteIdx) = static_cast(argmin);
117 | }
118 | }
119 |
120 | [[nodiscard]] std::size_t getNumWritten() const // current value of the counter reset by initialise()
121 | {
122 | return numWritten;
123 | }
124 | };
125 |
126 | #endif // FASTSMC_DECODE_PAIRS_RETURN_STRUCT_HPP
127 |
128 | #pragma clang diagnostic pop
--------------------------------------------------------------------------------
/src/DecodingParams.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef DECODINGPARAMS_HPP
17 | #define DECODINGPARAMS_HPP
18 |
19 | #include
20 |
21 | #include
22 |
/// Decoding mode, distinguishing sequence from array data and folded from unfolded data.
enum class DecodingMode { sequenceFolded, arrayFolded, sequence, array };

/// Coarse decoding mode: sequence or array data, irrespective of folding.
enum class DecodingModeOverall { sequence, array };

/**
 * Options controlling an ASMC / FastSMC run. All data members are public; the in-class initialisers below are
 * the values a member has unless a constructor or one of the process* functions assigns another.
 */
class DecodingParams
{

private:
  bool fastSmcInvokedWithProgramOptions = false;

public:
  std::string inFileRoot;
  std::string decodingQuantFile;
  std::string mapFile;
  std::string outFileRoot;
  int jobs = 1;
  int jobInd = 1;
  std::string decodingModeString = "array";
  // Initialised to agree with the "array" default of decodingModeString, so that neither enum is ever read
  // in an indeterminate state before it has been assigned.
  DecodingModeOverall decodingModeOverall = DecodingModeOverall::array;
  DecodingMode decodingMode = DecodingMode::array;
  bool decodingSequence = false;
  bool foldData = false;
  bool usingCSFS = false;
  bool compress = false;
  bool useAncestral = false;
  float skipCSFSdistance{};
  bool noBatches = false;

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // New params from FastSMC that were not originally in ASMC

  int batchSize = 64;
  int recallThreshold = 3;

  float skip = 0.f;
  int gap = 1;
  int max_seeds = 0;
  float min_maf = 0;
  float min_m = 1;
  bool hashing = false;
  bool hashingOnly = false;
  bool FastSMC = false;
  bool BIN_OUT = false;
  bool useKnownSeed = false;

  /// Whether to write IBD segment length (in centimorgans)
  bool outputIbdSegmentLength = false;

  // Used by FastSMC itself
  int hashingWordSize = 64;
  int constReadAhead = 10;
  bool haploid = true;

  int time = 100; // state threshold for IBD detection


  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  // main tasks
  bool noConditionalAgeEstimates = false;
  bool doPosteriorSums = false;
  bool doPerPairPosteriorMean = false; // output posterior mean for each pair
  bool doPerPairMAP = false;           // output MAP for each pair
  std::string expectedCoalTimesFile;   // expected coalescence times within each interval
  bool withinOnly = false;             // only compute decoding within individuals
  bool doMajorMinorPosteriorSums = false;

  bool processOptions();
  bool processCommandLineArgs(int argc, char* argv[]);
  bool processCommandLineArgsFastSMC(int argc, char* argv[]);

  /**
   * Verify that the selected parameters are compatible. Incompatible options will cause FastSMC to exit with a message
   * explaining the incompatibility.
   *
   * @return true if the parameters are compatible
   */
  bool validateParamsFastSMC();

  /**
   * Print decoding properties that are currently active.
   *
   * @return true;
   */
  bool printDecodingParams();

  /// Default constructor.
  DecodingParams();

  /**
   * Constructor requiring only an input file root with all other parameters set to sensible defaults.
   * Decoding quantities will be generated if they are not specified and do not exist in the input file directory.
   *
   * Note: a call passing exactly three strings is also viable for the FastSMC constructor below (through its
   * defaulted fourth parameter), so such a call is ambiguous; pass a fourth argument to select an overload.
   */
  explicit DecodingParams(std::string _inFileRoot, std::string _decodingQuantFile = "", std::string _outFileRoot = "",
                          int _jobs = 1, int _jobInd = 1, std::string _decodingModeString = "array",
                          bool _decodingSequence = false, bool _usingCSFS = true, bool _compress = false,
                          bool _useAncestral = false, float _skipCSFSdistance = 0.f, bool _noBatches = false,
                          bool _doPosteriorSums = false, bool _doPerPairPosteriorMean = false,
                          std::string _expectedCoalTimesFile = "", bool _withinOnly = false,
                          bool _doMajorMinorPosteriorSums = false, bool _doPerPairMAP = false,
                          std::string _mapFile = "");

  /**
   * Minimal constructor that sets defaults for FastSMC. An error will occur if you try to use this constructor for
   * FastSMC == false.
   *
   * @param _inFileRoot the input file root
   * @param _decodingQuantFile the decoding quantities file
   * @param _outFileRoot the output file root
   * @param _fastSMC whether to run in FastSMC: if this is set to false an error will occur
   */
  DecodingParams(std::string _inFileRoot, std::string _decodingQuantFile, std::string _outFileRoot, bool _fastSMC = true);


};
136 |
137 | #endif
138 |
--------------------------------------------------------------------------------
/src/DecodingQuantities.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef DECODINGQUANTITIES_HPP
17 | #define DECODINGQUANTITIES_HPP
18 |
19 | #include
20 | #include
21 | #include
22 |
23 | enum class DataType { // One enumerator per named kind of data, plus None; NOTE(review): presumably the section labels of a decoding quantities file - confirm in DecodingQuantities.cpp
24 | TransitionType,
25 | States,
26 | CSFSSamples,
27 | TimeVector,
28 | SizeVector,
29 | Discretization,
30 | ExpectedTimes,
31 | CSFS,
32 | FoldedCSFS,
33 | ClassicEmission,
34 | AscertainedCSFS,
35 | FoldedAscertainedCSFS,
36 | CompressedAscertainedEmission,
37 | initialStateProb, // the only enumerator starting with a lower-case letter
38 | ColumnRatios,
39 | RowRatios,
40 | Uvectors,
41 | Bvectors,
42 | Dvectors,
43 | HomozygousEmissions,
44 | None // last enumerator; NOTE(review): presumably means "no section selected" - confirm
45 | };
46 |
47 | class DecodingQuantities  // Public data members below are presumably filled in from the file given to the constructor -- TODO confirm in DecodingQuantities.cpp
48 | {
49 |
50 | public:
51 | unsigned int states = 0u;  // defaults to 0 until set
52 | int CSFSSamples = 0;  // defaults to 0 until set
53 | std::vector initialStateProb;
54 | std::vector expectedTimes;
55 | std::vector discretization;
56 | std::vector timeVector;
57 | std::vector columnRatios;
58 | std::vector> classicEmissionTable;
59 | std::vector> compressedEmissionTable;
60 | std::unordered_map> Dvectors;
61 | std::unordered_map> Bvectors;
62 | std::unordered_map> Uvectors;
63 | std::unordered_map> rowRatioVectors;
64 | std::unordered_map> homozygousEmissionMap;
65 | std::vector>> CSFSmap;
66 | std::vector>> foldedCSFSmap;
67 | std::vector>> ascertainedCSFSmap;
68 | std::vector>> foldedAscertainedCSFSmap;
69 |
70 | explicit DecodingQuantities(const std::string& fileName);  // fileName: path of the decoding quantities file; explicit, so no implicit conversion from std::string
71 |
72 | private:
73 | // createFromBinary is implemented, but other code must be updated before it can be enabled
74 | // void createFromBinary(const char *fileName);
75 | void createFromGzippedText(const std::string& fileName);  // NOTE(review): by its name this parses a gzipped text file; definition is not visible here
76 |
77 | /**
78 | * Validate that an appropriate decoding quantities file has been provided. This is achieved by:
79 | *
80 | * 1. Verifying the file exists
81 | * 2. Verifying the first line of the file contains exactly "TransitionType"
82 | *
83 | * @param fileName the name of the provided decoding quantities file
84 | */
85 | void validateDecodingQuantitiesFile(const std::string& fileName);
86 |
87 | };
88 |
89 | #endif
90 |
--------------------------------------------------------------------------------
/src/FastSMC.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_FASTSMC_HPP
17 | #define ASMC_FASTSMC_HPP
18 |
19 | #include "Data.hpp"
20 | #include "DecodingParams.hpp"
21 | #include "HMM.hpp"
22 |
23 | namespace ASMC
24 | {
25 |
26 | class FastSMC
27 | {
28 |
29 | private:
30 |
31 | DecodingParams mParams;
32 | Data mData;
33 | HMM mHmm;
34 |
35 | public:
36 |
37 | /**
38 | * FastSMC constructor with full control over parameters, by manually specifying a DecodingParams object.
39 | *
40 | * @param params the decoding parameters
41 | */
42 | explicit FastSMC(DecodingParams params);
43 |
44 | /**
45 | * FastSMC constructor that will set sensible defaults. If you wish to fine-tune parameters, use the constructor that
46 | * takes a DecodingParams object, which you can configure manually.
47 | *
48 | * @param inFileRoot the input file root
49 | * @param dqFile the decoding quantities file
50 | * @param outFileRoot the output file root
51 | */
52 | FastSMC(const std::string& inFileRoot, const std::string& dqFile, const std::string& outFileRoot);
53 |
54 | void run();
55 |
56 | };
57 |
58 | } // namespace ASMC
59 |
60 | #endif // ASMC_FASTSMC_HPP
61 |
--------------------------------------------------------------------------------
/src/FileUtils.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 |
24 | #include "StringUtils.hpp"
25 | #include "FileUtils.hpp"
26 | #include "Types.hpp"
27 |
28 | #include
29 | #include
30 |
31 | namespace FileUtils {
32 |
33 | using std::string;
34 | using std::vector;
35 | using std::cerr;
36 | using std::endl;
37 |
/**
 * Report whether a file can be opened for reading.
 *
 * @param name path of the file to probe
 * @return true if an input stream opened on `name` is in a good state
 */
bool fileExists(const std::string& name)
{
  std::ifstream probe;
  probe.open(name.c_str());
  const bool opened = probe.good();
  return opened;
}
43 |
/**
 * Open a file for reading on the given stream, terminating the program if that fails.
 *
 * @param stream the input stream to open
 * @param file path of the file to open
 * @param mode the open mode passed to std::ifstream::open
 */
void openOrExit(std::ifstream& stream, const std::string& file, std::ios_base::openmode mode)
{
  stream.open(file.c_str(), mode);
  if (stream) {
    return;
  }

  // Opening failed: report and stop with exit status 1.
  std::cerr << "ERROR: Unable to open file: " << file << std::endl;
  exit(1);
}
52 |
/**
 * Open a file for writing on the given stream, terminating the program if that fails.
 *
 * @param stream the output stream to open
 * @param file path of the file to open
 * @param mode the open mode passed to std::ofstream::open
 */
void openWritingOrExit(std::ofstream& stream, const std::string& file, std::ios_base::openmode mode)
{
  stream.open(file.c_str(), mode);
  if (stream) {
    return;
  }

  // Opening failed: report and stop with exit status 1.
  std::cerr << "ERROR: Unable to open file for writing: " << file << std::endl;
  exit(1);
}
61 |
/**
 * Accept an empty path, otherwise insist that the file can be opened for reading.
 *
 * Terminates the program with exit status 1 if a non-empty path cannot be opened.
 *
 * @param file path to check; an empty string is accepted without any check
 */
void requireEmptyOrReadable(const std::string& file)
{
  if (!file.empty()) {
    std::ifstream probe(file.c_str());
    if (!probe) {
      std::cerr << "ERROR: Unable to open file: " << file << std::endl;
      exit(1);
    }
    probe.close();
  }
}
72 |
73 | void requireEachEmptyOrReadable(const std::vector &fileList) {
74 | for (uint i = 0; i < fileList.size(); i++)
75 | requireEmptyOrReadable(fileList[i]);
76 | }
77 |
/**
 * Accept an empty path, otherwise insist that the file can be opened for writing.
 *
 * The file is opened in append mode, so existing content is left untouched. Terminates the program with exit status 1
 * if a non-empty path cannot be opened.
 *
 * @param file path to check; an empty string is accepted without any check
 */
void requireEmptyOrWriteable(const std::string& file)
{
  if (!file.empty()) {
    std::ofstream probe(file.c_str(), std::ios::out | std::ios::app);
    if (!probe) {
      std::cerr << "ERROR: Output file is not writeable: " << file << std::endl;
      exit(1);
    }
    probe.close();
  }
}
88 |
89 | vector parseHeader(const string &fileName, const string &delimiters) {
90 | AutoGzIfstream fin; fin.openOrExit(fileName);
91 | string header;
92 | getline(fin, header);
93 | vector split = StringUtils::tokenizeMultipleDelimiters(header, delimiters);
94 | fin.close();
95 | return split;
96 | }
97 |
98 | int lookupColumnInd(const string &fileName, const string &delimiters, const string &columnName) {
99 | vector headers = parseHeader(fileName, delimiters);
100 | int columnInd = -1;
101 | for (uint c = 0; c < headers.size(); c++)
102 | if (headers[c] == columnName)
103 | columnInd = c; // first column is snp ID, treated separately
104 | if (columnInd == -1) {
105 | cerr << "WARNING: Column " << columnName << " not found in headers of " << fileName << endl;
106 | //exit(1);
107 | }
108 | return columnInd;
109 | }
110 |
/**
 * Read the next whitespace-delimited token from a stream and convert it with std::stod, which also accepts the
 * spellings of NaN and infinity.
 *
 * @param stream the stream to read one token from
 * @return the parsed value
 * @note std::stod throws if the token is empty or not a number.
 */
double readDoubleNanInf(std::istream& stream)
{
  std::string token;
  stream >> token;
  const double value = std::stod(token);
  return value;
}
116 |
117 | vector < std::pair > readFidIids(const string &file) {
118 | vector < std::pair > ret;
119 | AutoGzIfstream fin;
120 | fin.openOrExit(file);
121 | string FID, IID, line;
122 | while (fin >> FID >> IID) {
123 | if (FID.empty() || IID.empty()) {
124 | cerr << "ERROR: In file " << file << endl;
125 | cerr << " unable to read FID and IID; check format" << endl;
126 | exit(1);
127 | }
128 | ret.push_back(make_pair(FID, IID));
129 | getline(fin, line);
130 | }
131 | fin.close();
132 | return ret;
133 | }
134 |
135 | int AutoGzIfstream::lineCount(const std::string &file) {
136 | AutoGzIfstream fin; fin.openOrExit(file);
137 | int ctr = 0; string line;
138 | while (getline(fin, line))
139 | ctr++;
140 | return ctr;
141 | }
142 |
143 | void AutoGzIfstream::openOrExit(const std::string &file, std::ios_base::openmode mode) {
144 | fin.open(file.c_str(), mode);
145 | if (!fin) {
146 | cerr << "ERROR: Unable to open file: " << file << endl;
147 | exit(1);
148 | }
149 | if ((int) file.length() > 3 && file.substr(file.length() - 3) == ".gz")
150 | boost_in.push(boost::iostreams::gzip_decompressor());
151 | boost_in.push(fin);
152 | }
153 |
154 | void AutoGzIfstream::close() {
155 | fin.close();
156 | boost_in.reset();
157 | }
158 |
159 | AutoGzIfstream::operator bool() const {
160 | return !boost_in.fail();
161 | }
162 |
163 | AutoGzIfstream& AutoGzIfstream::read(char *s, std::streamsize n) {
164 | boost_in.read(s, n);
165 | return *this;
166 | }
167 |
168 | int AutoGzIfstream::get() {
169 | return boost_in.get();
170 | }
171 |
172 | double AutoGzIfstream::readDoubleNanInf() {
173 | return FileUtils::readDoubleNanInf(boost_in);
174 | }
175 |
176 | void AutoGzIfstream::clear() {
177 | boost_in.clear();
178 | }
179 |
180 | AutoGzIfstream& AutoGzIfstream::seekg(std::streamoff off, std::ios_base::seekdir way) {
181 | boost_in.seekg(off, way);
182 | return *this;
183 | }
184 |
185 | AutoGzIfstream& getline(AutoGzIfstream& in, std::string &s) {
186 | std::getline(in.boost_in, s);
187 | return in;
188 | }
189 |
190 | void AutoGzOfstream::openOrExit(const std::string &file, std::ios_base::openmode mode) {
191 | fout.open(file.c_str(), mode);
192 | if (!fout) {
193 | cerr << "ERROR: Unable to open file: " << file << endl;
194 | exit(1);
195 | }
196 | if ((int) file.length() > 3 && file.substr(file.length() - 3) == ".gz")
197 | boost_out.push(boost::iostreams::gzip_compressor());
198 | boost_out.push(fout);
199 | }
200 |
201 | void AutoGzOfstream::close() {
202 | boost_out.reset();
203 | fout.close();
204 | }
205 |
206 | AutoGzOfstream& AutoGzOfstream::operator << (std::ostream & (*manip)(std::ostream&)) {
207 | manip(boost_out);
208 | return *this;
209 | }
210 |
211 | void AutoGzOfstream::unsetf(std::ios_base::fmtflags mask) {
212 | boost_out.unsetf(mask);
213 | }
214 |
215 | AutoGzOfstream::operator bool() const {
216 | return !boost_out.fail();
217 | }
218 |
219 | }
220 |
--------------------------------------------------------------------------------
/src/FileUtils.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #ifndef FILEUTILS_HPP
18 | #define FILEUTILS_HPP
19 |
20 | #include
21 | #include
22 | #include
23 |
24 | #include "StringUtils.hpp"
25 |
26 | #include
27 |
28 | namespace FileUtils {
29 |
/// Returns true if `name` can be opened for reading.
bool fileExists(const std::string& name);

/// Opens `file` on `stream`; prints an error and exits with status 1 if that fails.
void openOrExit(std::ifstream& stream, const std::string& file, std::ios_base::openmode mode = std::ios::in);

/// Opens `file` for writing on `stream`; prints an error and exits with status 1 if that fails.
void openWritingOrExit(std::ofstream& stream, const std::string& file, std::ios_base::openmode mode = std::ios::out);

/// Accepts an empty path; otherwise exits with status 1 unless `file` can be opened for reading.
void requireEmptyOrReadable(const std::string& file);

/// Applies requireEmptyOrReadable to every entry of `fileList`.
void requireEachEmptyOrReadable(const std::vector<std::string>& fileList);

/// Accepts an empty path; otherwise exits with status 1 unless `file` can be opened for appending.
void requireEmptyOrWriteable(const std::string& file);

/// Splits the first line of `fileName` into non-empty tokens separated by any character of `delimiters`.
std::vector<std::string> parseHeader(const std::string& fileName, const std::string& delimiters);

/// Returns the index of `columnName` in the header of `fileName`, or -1 (with a warning on stderr) if absent.
int lookupColumnInd(const std::string& fileName, const std::string& delimiters, const std::string& columnName);

/// Reads one whitespace-delimited token from `stream` and converts it with std::stod.
double readDoubleNanInf(std::istream& stream);

/// Reads the first two whitespace-separated fields (FID, IID) of each record in `file`.
std::vector<std::pair<std::string, std::string>> readFidIids(const std::string& file);
53 |
54 | class AutoGzIfstream {
55 | boost::iostreams::filtering_istream boost_in;
56 | std::ifstream fin;
57 |
58 | public:
59 |
60 | static int lineCount(const std::string &file);
61 |
62 | void openOrExit(const std::string &file, std::ios_base::openmode mode = std::ios::in);
63 | void close();
64 | template AutoGzIfstream& operator >> (T &x) {
65 | boost_in >> x;
66 | return *this;
67 | }
68 |
69 | explicit operator bool() const;
70 | AutoGzIfstream& read(char *s, std::streamsize n);
71 | int get();
72 | double readDoubleNanInf();
73 | void clear();
74 | AutoGzIfstream& seekg(std::streamoff off, std::ios_base::seekdir way);
75 | friend AutoGzIfstream& getline(AutoGzIfstream& in, std::string &s);
76 | };
77 |
78 | AutoGzIfstream& getline(AutoGzIfstream& in, std::string &s);
79 |
80 | class AutoGzOfstream {
81 | boost::iostreams::filtering_ostream boost_out;
82 | std::ofstream fout;
83 |
84 | public:
85 |
86 | void openOrExit(const std::string &file, std::ios_base::openmode mode = std::ios::out);
87 | void close();
88 | template AutoGzOfstream& operator << (const T &x) {
89 | boost_out << x;
90 | return *this;
91 | }
92 | AutoGzOfstream& operator << (std::ostream & (*manip)(std::ostream&));
93 | void unsetf(std::ios_base::fmtflags);
94 | explicit operator bool() const;
95 | };
96 |
97 | }
98 |
99 | #endif
100 |
--------------------------------------------------------------------------------
/src/Individual.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "Individual.hpp"
17 |
18 | Individual::Individual(int numOfSites)
19 | {
20 | genotype1 = std::vector(numOfSites);
21 | genotype2 = std::vector(numOfSites);
22 | }
23 |
24 | void Individual::setGenotype(int_least8_t hap, int pos, bool val)
25 | {
26 | if (hap == 1) {
27 | genotype1[pos] = val;
28 | } else {
29 | genotype2[pos] = val;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/Individual.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #ifndef INDIVIDUAL_HPP
18 | #define INDIVIDUAL_HPP
19 |
20 | #include
21 | #include
22 |
23 | class Individual {  // Holds the two genotype vectors (genotype1, genotype2) of one individual
24 |
25 | /* **************************** */
26 | /* **************************** */
27 | // contains individual data
28 | /* **************************** */
29 | /* **************************** */
30 | public:
31 | std::vector genotype1;  // written by setGenotype when hap == 1
32 | std::vector genotype2;  // written by setGenotype for every other hap value
33 |
34 | public:
35 | explicit Individual(int numOfSites = 0);  // sizes both genotype vectors to numOfSites entries
36 | void setGenotype(int_least8_t hap, int pos, bool val);  // stores val at index pos of the selected vector; no bounds check
37 |
38 | };
39 |
40 | #endif
41 |
--------------------------------------------------------------------------------
/src/MemoryUtils.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #include
18 | #include
19 |
20 | #include "MemoryUtils.hpp"
21 | #include "Types.hpp"
22 |
23 | void *ALIGNED_MALLOC(size_t size) {
24 | #ifdef USE_MKL_MALLOC
25 | void *p = mkl_malloc(size, MEM_ALIGNMENT);
26 | #else
27 | void *p = _mm_malloc(size, MEM_ALIGNMENT);
28 | #endif
29 | if (p == NULL) {
30 | std::cerr << "ERROR: Failed to allocate " << size << " bytes" << std::endl;
31 | exit(1);
32 | } else if ((uint64) p & 0xf) {
33 | std::cerr << "ERROR: Memory alignment of " << size << " bytes failed" << std::endl;
34 | exit(1);
35 | }
36 | return p;
37 | }
38 |
--------------------------------------------------------------------------------
/src/MemoryUtils.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #ifndef MEMORYUTILS_HPP
18 | #define MEMORYUTILS_HPP
19 |
20 | #include "Types.hpp"
21 |
22 | #define MEM_ALIGNMENT 64
23 |
24 | void *ALIGNED_MALLOC(size_t size);
25 |
26 | #ifdef USE_MKL_MALLOC
27 | #include
28 | #define ALIGNED_FREE mkl_free
29 | #else
30 | #include
31 | #define ALIGNED_FREE _mm_free
32 | #endif
33 |
// Typed convenience wrappers around ALIGNED_MALLOC: each allocates room for the given number of elements and yields a
// pointer of the matching type. Every expansion is fully parenthesised so it can be used inside larger expressions
// (for example indexed directly) without the cast binding to the wrong operand.
#define ALIGNED_MALLOC_DOUBLES(numDoubles) (static_cast<double*>(ALIGNED_MALLOC((numDoubles) * sizeof(double))))
#define ALIGNED_MALLOC_FLOATS(numFloats) (static_cast<float*>(ALIGNED_MALLOC((numFloats) * sizeof(float))))
#define ALIGNED_MALLOC_UCHARS(numUchars) (static_cast<uchar*>(ALIGNED_MALLOC((numUchars) * sizeof(uchar))))
#define ALIGNED_MALLOC_UINTS(numUints) (static_cast<uint*>(ALIGNED_MALLOC((numUints) * sizeof(uint))))
#define ALIGNED_MALLOC_UINT64S(numUint64s) (static_cast<uint64*>(ALIGNED_MALLOC((numUint64s) * sizeof(uint64))))
#define ALIGNED_MALLOC_UINT64_MASKS(numUint64_masks)                                                                   \
  (static_cast<uint64_masks*>(ALIGNED_MALLOC((numUint64_masks) * sizeof(uint64_masks))))
#define ALIGNED_MALLOC_USHORTS(numUshorts) (static_cast<ushort*>(ALIGNED_MALLOC((numUshorts) * sizeof(ushort))))
41 |
42 | #endif
43 |
--------------------------------------------------------------------------------
/src/StringUtils.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 |
24 | //#include
25 |
26 | #include "StringUtils.hpp"
27 | #include "Types.hpp"
28 |
29 | namespace StringUtils {
30 | using std::vector;
31 | using std::string;
32 | using std::cout;
33 | using std::cerr;
34 | using std::endl;
35 |
/**
 * Convert a string to float by parsing it as long double and narrowing the result, so that inputs written with more
 * precision or range than a float offers are still accepted.
 *
 * @param str the text to convert
 * @return the value narrowed to float
 */
float stof(const std::string& str)
{
  const long double wide = std::stold(str);
  return static_cast<float>(wide);
}
40 |
/**
 * Convert a string to double by parsing it as long double and narrowing the result, so that inputs written with more
 * precision than a double offers are still accepted.
 *
 * @param str the text to convert
 * @return the value narrowed to double
 */
double stod(const std::string& str)
{
  const long double wide = std::stold(str);
  return static_cast<double>(wide);
}
45 |
/**
 * Collect, in order of appearance, every character of `s` that also occurs in `c`.
 *
 * @param s the text to scan
 * @param c the set of delimiter characters
 * @return the delimiter characters found in `s`, duplicates included
 */
std::string findDelimiters(const std::string& s, const std::string& c)
{
  std::string found;
  for (const char ch : s) {
    if (c.find(ch) != std::string::npos) {
      found.push_back(ch);
    }
  }
  return found;
}
/**
 * Split a string on any of several delimiter characters. Empty tokens are never returned, so consecutive, leading
 * and trailing delimiters are skipped.
 *
 * @param s the text to split
 * @param c every character in this string acts as a delimiter
 * @return the non-empty tokens of `s`, in order
 */
std::vector<std::string> tokenizeMultipleDelimiters(const std::string& s, const std::string& c)
{
  std::vector<std::string> tokens;

  std::string::size_type begin = s.find_first_not_of(c);
  while (begin != std::string::npos) {
    const std::string::size_type end = s.find_first_of(c, begin);
    if (end == std::string::npos) {
      // Last token runs to the end of the string.
      tokens.push_back(s.substr(begin));
      break;
    }
    tokens.push_back(s.substr(begin, end - begin));
    begin = s.find_first_not_of(c, end);
  }
  return tokens;
}
72 |
73 | void rangeErrorExit(const string &str, const string &delims) {
74 | cerr << "ERROR: Invalid delimiter sequence for specifying range: " << endl;
75 | cerr << " Template string: " << str << endl;
76 | cerr << " Delimiter sequence found: " << delims << endl;
77 | cerr << "Range in must have format {start:end} with no other " << RANGE_DELIMS
78 | << " chars" << endl;
79 | exit(1);
80 | }
81 |
82 | // basic range template: expand "{start:end}" to vector with one entry per range element
83 | // if end==start-1, will return empty
84 | vector expandRangeTemplate(const string &str) {
85 | vector ret;
86 | string delims = findDelimiters(str, RANGE_DELIMS);
87 | if (delims.empty())
88 | ret.push_back(str);
89 | else if (delims == RANGE_DELIMS) {
90 | vector tokens = tokenizeMultipleDelimiters(str, RANGE_DELIMS);
91 | for (int i = 0; i < (int) str.size(); i++)
92 | if (str[i] == ':' && (str[i - 1] == '{' || str[i + 1] == '}'))
93 | rangeErrorExit(str, delims);
94 | int startInd = (str[0] != RANGE_DELIMS[0]), endInd = startInd + 1;
95 | string prefix, suffix;
96 | if (str[0] != RANGE_DELIMS[0]) prefix = tokens[0];
97 | if (str[str.length() - 1] != RANGE_DELIMS[2]) suffix = tokens.back();
98 | int start = std::stoi(tokens[startInd]), end = std::stoi(tokens[endInd]);
99 | if (start > end + 1 || end > start + 1000000) {
100 | cerr << "ERROR: Invalid range in template string: " << str << endl;
101 | cerr << " Start: " << start << endl;
102 | cerr << " End: " << end << endl;
103 | exit(1);
104 | }
105 | for (int i = start; i <= end; i++)
106 | ret.push_back(prefix + std::to_string(i) + suffix);
107 | }
108 | else
109 | rangeErrorExit(str, delims);
110 | return ret;
111 | }
112 |
113 | vector expandRangeTemplates(const vector &rangeTemplates) {
114 | vector expanded;
115 | for (uint i = 0; i < rangeTemplates.size(); i++) {
116 | vector range = expandRangeTemplate(rangeTemplates[i]);
117 | expanded.insert(expanded.end(), range.begin(), range.end());
118 | }
119 | return expanded;
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/src/StringUtils.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #ifndef STRINGUTILS_HPP
18 | #define STRINGUTILS_HPP
19 |
20 | #include
21 | #include
22 |
namespace StringUtils {

  /**
   * Convert string to float, taking account of the fact that inputs may be given
   * at a precision too great to be representable as a float.
   *
   * This function converts first to long double, and then explicitly performs a
   * static cast to narrow the output to a float.
   *
   * @param str the string to convert to a float
   * @return the closest float representation of the string
   */
  float stof(const std::string &str);

  /**
   * Convert string to double, taking account of the fact that inputs may be given
   * at a precision too great to be representable as a double.
   *
   * This function converts first to long double, and then explicitly performs a
   * static cast to narrow the output to a double.
   *
   * @param str the string to convert to a double
   * @return the closest double representation of the string
   */
  double stod(const std::string &str);

  /// The three characters of a range template, in the order they must appear: '{', ':' and '}'.
  const std::string RANGE_DELIMS = "{:}";

  /**
   * Collect the characters of `s` that also occur in `c`, in order of appearance.
   *
   * @param s the string to scan
   * @param c the set of delimiter characters
   * @return the delimiter characters found in `s`
   */
  std::string findDelimiters(const std::string &s, const std::string &c);

  /**
   * Split `s` on any character of `c`; empty tokens are never returned.
   *
   * @param s the string to split
   * @param c the set of delimiter characters
   * @return the non-empty tokens, in order
   */
  std::vector<std::string> tokenizeMultipleDelimiters(const std::string &s, const std::string &c);

  /**
   * Expand a template of the form "prefix{start:end}suffix" into one string per integer of the range. A string
   * without range delimiters is returned as the only element.
   *
   * @param str the template string
   * @return the expanded strings
   */
  std::vector<std::string> expandRangeTemplate(const std::string &str);

  /**
   * Expand each template of a list and concatenate the results in input order.
   *
   * @param rangeTemplates the template strings
   * @return all expanded strings
   */
  std::vector<std::string> expandRangeTemplates(const std::vector<std::string> &rangeTemplates);
}
57 |
58 | #endif
59 |
--------------------------------------------------------------------------------
/src/Timer.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "Timer.hpp"
17 |
18 | Timer::Timer()
19 | {
20 | update_time();
21 | }
22 |
23 | double Timer::update_time()
24 | {
25 | prevtime = curtime;
26 | curtime = timer_t::now();
27 | std::chrono::duration diff = curtime - prevtime;
28 | return diff.count();
29 | }
30 |
--------------------------------------------------------------------------------
/src/Timer.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include
17 |
18 | #ifndef TIMER_HPP
19 | #define TIMER_HPP
20 |
/**
 * Simple stopwatch reporting the time elapsed between successive calls to update_time().
 */
class Timer
{
private:
  // A monotonic clock is used so that wall-clock adjustments cannot distort or negate the measured interval.
  using timer_t = std::chrono::steady_clock;
  using sys_time = std::chrono::time_point<timer_t>;

  sys_time prevtime, curtime;

public:
  /// constructs a timer, recording the initial time
  Timer();

  /// updates the current time and returns the time since the last update in seconds
  double update_time();
};
36 |
37 | #endif
38 |
--------------------------------------------------------------------------------
/src/Types.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | #ifndef TYPES_HPP
18 | #define TYPES_HPP
19 |
20 | #include
21 | #include
22 |
// Short aliases for the unsigned and fixed-width integer types used throughout the project.
using uchar = unsigned char;
using uint = unsigned int;
using ushort = unsigned short;
using uint64 = uint64_t;
using int64 = int64_t;
using hash_size = uint64_t;
29 |
30 | struct uint64_masks {
31 | uint64 is0, is2, is9;
32 | };
33 |
34 | #endif
35 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is part of ASMC, developed by Pier Francesco Palamara.
2 |
3 | # ASMC is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 |
8 | # ASMC is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 |
13 | # You should have received a copy of the GNU General Public License
14 | # along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | from asmc.asmc_python_bindings import BinaryDataReader
18 | from asmc.asmc_python_bindings import DecodingModeOverall
19 | from asmc.asmc_python_bindings import DecodingMode
20 | from asmc.asmc_python_bindings import DecodingReturnValues
21 | from asmc.asmc_python_bindings import DecodePairsReturnStruct
22 | from asmc.asmc_python_bindings import IbdPairDataLine
23 | from asmc.asmc_python_bindings import Individual
24 | from asmc.asmc_python_bindings import PairObservations
25 | from asmc.asmc_python_bindings import DecodingQuantities
26 | from asmc.asmc_python_bindings import DecodingParams
27 | from asmc.asmc_python_bindings import Data
28 | from asmc.asmc_python_bindings import HMM
29 | from asmc.asmc_python_bindings import FastSMC
30 | from asmc.asmc_python_bindings import ASMC
31 |
32 |
33 | #
34 | # ASMCReturnValues = collections.namedtuple(
35 | # "ASMCReturnValues",
36 | # "sumOverPairs sumOverPairs00 sumOverPairs01 sumOverPairs11")
37 |
38 |
39 | # def to_array(x):
40 | # a = list(x)
41 | # if a:
42 | # return np.array(a)
43 | # else:
44 | # return None
45 | #
46 | #
47 | # def flip_rows(a1, a2, flips):
48 | # # Swap rows according to boolean flips vector
49 | # if a1 is None or a2 is None:
50 | # return None, None
51 | # a1[flips], a2[flips] = a2[flips], a1[flips]
52 | # return a1, a2
53 |
54 |
55 | # def run(in_file_root, decoding_quant_file, out_file_root="",
56 | # mode=DecodingModeOverall.array, jobs=0,
57 | # job_index=0, skip_csfs_distance=0,
58 | # compress=False, use_ancestral=False,
59 | # posterior_sums=False, major_minor_posterior_sums=False):
60 | # ret = asmc(in_file_root=in_file_root,
61 | # decoding_quant_file=decoding_quant_file,
62 | # mode=mode, jobs=jobs, job_index=job_index,
63 | # skip_csfs_distance=skip_csfs_distance,
64 | # compress=compress, use_ancestral=use_ancestral,
65 | # posterior_sums=posterior_sums,
66 | # major_minor_posterior_sums=major_minor_posterior_sums)
67 | # sumOverPairs00, sumOverPairs11 = flip_rows(
68 | # to_array(ret.sumOverPairs00), to_array(ret.sumOverPairs11),
69 | # ret.siteWasFlippedDuringFolding)
70 | # return ASMCReturnValues(
71 | # sumOverPairs=to_array(ret.sumOverPairs),
72 | # sumOverPairs00=sumOverPairs00,
73 | # sumOverPairs01=to_array(ret.sumOverPairs01),
74 | # sumOverPairs11=sumOverPairs11)
75 |
--------------------------------------------------------------------------------
/src/hashing/ExtendHash.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_HASHING_EXTEND_HASH_HPP
17 | #define ASMC_HASHING_EXTEND_HASH_HPP
18 |
19 | #include
20 |
21 | #include
22 |
23 | #include "hashing/Match.hpp"
24 |
25 | /* Object for storing extension between pairs of Individuals */
26 | class ExtendHash
27 | {
28 |
29 | boost::unordered_map extend_hash;
30 |
31 | unsigned long mWordSize;
32 | unsigned long num;
33 |
34 | bool mParHaploid;
35 |
36 | // Empty Match to insert into hash
37 | Match m;
38 |
39 | public:
40 | explicit ExtendHash(const unsigned long wordSize, const unsigned long num, const bool PAR_HAPLOID)
41 | : mWordSize(wordSize), num(num), mParHaploid(PAR_HAPLOID), m(wordSize)
42 | {
43 | }
44 |
45 | // Compute pair of Individuals from location indicator
46 | std::pair locationToPair(unsigned long loc)
47 | {
48 | const unsigned second = mParHaploid ? loc % num : 2 * (loc % num);
49 | const unsigned first = mParHaploid ? (loc - second) / num : 2 * ((loc - second / 2) / num);
50 |
51 | return std::make_pair(first, second);
52 | }
53 |
54 | // Compute location from pair of Individuals
55 | unsigned long pairToLocation(unsigned int i, unsigned int j)
56 | {
57 | if (!mParHaploid) {
58 | // round everyone down to the nearest haplotype
59 | i = (i - (i % 2)) / 2;
60 | j = (j - (j % 2)) / 2;
61 | }
62 | unsigned long loc = (i > j) ? j * num + i : i * num + j;
63 | return loc;
64 | }
65 |
66 | // Extend or add a given pair in the current hash
67 | // unsigned int i,j : identifiers for the two Individuals
68 | // int w : current word # to extend or add
69 | void extendPair(unsigned int i, unsigned int j, int w, const int GLOBAL_CURRENT_WORD)
70 | {
71 | m.getModifiableInterval()[0] = GLOBAL_CURRENT_WORD;
72 | // Find/extend this location in the hash
73 | auto extend_ret = extend_hash.insert(std::pair(pairToLocation(i, j), m));
74 | (extend_ret.first->second).extend(w);
75 | }
76 |
77 | // Remove all pairs that were not extended beyond w
78 | // int w : word # to remove prior to
79 | void clearPairsPriorTo(int w, const int GLOBAL_CURRENT_WORD, const double PAR_MIN_MATCH,
80 | const std::vector& geneticPositions, HMM& hmm)
81 | {
82 | for (auto it = extend_hash.begin(); it != extend_hash.end();) {
83 | if (it->second.getInterval()[1] < w) {
84 | it->second.print(locationToPair(it->first), PAR_MIN_MATCH, geneticPositions, hmm);
85 | it = extend_hash.erase(it);
86 | } else {
87 | if (it->second.getInterval()[1] < GLOBAL_CURRENT_WORD)
88 | it->second.addGap();
89 | it++;
90 | }
91 | }
92 | }
93 |
94 | // Remove all pairs that were not extended beyond w
95 | // int w : word # to remove prior to
96 | void extendAllPairsTo(int w)
97 | {
98 | for (auto it = extend_hash.begin(); it != extend_hash.end(); it++)
99 | it->second.getModifiableInterval()[1] = w;
100 | }
101 |
102 | // Remove all pairs
103 | // int w : word # to remove prior to
104 | void clearAllPairs(const double PAR_MIN_MATCH, const std::vector& geneticPositions, HMM& hmm)
105 | {
106 | for (auto it = extend_hash.begin(); it != extend_hash.end();) {
107 | it->second.print(locationToPair(it->first), PAR_MIN_MATCH, geneticPositions, hmm);
108 | it = extend_hash.erase(it);
109 | }
110 | }
111 |
112 | std::size_t size() const
113 | {
114 | return extend_hash.size();
115 | }
116 |
117 | unsigned long getWordSize() const
118 | {
119 | return mWordSize;
120 | }
121 |
122 | };
123 |
124 | #endif // ASMC_HASHING_EXTEND_HASH_HPP
125 |
--------------------------------------------------------------------------------
/src/hashing/Individuals.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_HASHING_INDIVIDUALS_HPP
17 | #define ASMC_HASHING_INDIVIDUALS_HPP
18 |
19 | #include
20 | #include
21 | #include
22 | #include
23 |
24 | #include "boost/dynamic_bitset.hpp"
25 |
26 | class Individuals
27 | {
28 | unsigned mIdNum;
29 | unsigned long mWordSize = 64ul;
30 | unsigned long mNumReadAhead = 10ul;
31 |
32 | std::vector> mHap{mNumReadAhead, boost::dynamic_bitset<>(mWordSize, 0ul)};
33 |
34 | public:
35 | explicit Individuals(const unsigned long wordSize, const unsigned long numReadAhead, const unsigned idNum)
36 | : mIdNum{idNum}, mWordSize{wordSize}, mNumReadAhead{numReadAhead}
37 | {
38 | assert(wordSize > 0ul);
39 | assert(numReadAhead > 0ul);
40 |
41 | mHap.resize(numReadAhead);
42 | std::fill(mHap.begin(), mHap.end(), boost::dynamic_bitset<>(wordSize, 0ul));
43 | }
44 |
45 | void clear(const int w)
46 | {
47 | assert(w >= 0);
48 | mHap.at(w % mNumReadAhead).reset();
49 | }
50 |
51 | void setMarker(const int w, const std::size_t bit)
52 | {
53 | assert(w >= 0);
54 | assert(bit < mWordSize);
55 | mHap.at(w % mNumReadAhead).set(bit);
56 | }
57 |
58 | unsigned long getWordHash(const int w)
59 | {
60 | assert(w >= 0);
61 | return mHap.at(w % mNumReadAhead).to_ulong();
62 | }
63 |
64 | std::string getWordString(const int w)
65 | {
66 | assert(w >= 0);
67 | std::string buffer;
68 | boost::to_string(mHap.at(w % mNumReadAhead), buffer);
69 | return buffer;
70 | }
71 |
72 | unsigned int getIdNum() const
73 | {
74 | return mIdNum;
75 | }
76 |
77 | unsigned long getWordSize() const
78 | {
79 | return mHap.front().size();
80 | }
81 |
82 | unsigned long getNumReadAhead() const
83 | {
84 | return mHap.size();
85 | }
86 | };
87 |
88 | #endif // ASMC_HASHING_INDIVIDUALS_HPP
89 |
--------------------------------------------------------------------------------
/src/hashing/Match.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_HASHING_MATCH_HPP
17 | #define ASMC_HASHING_MATCH_HPP
18 |
19 | #include
20 | #include
21 | #include
22 |
23 | #include "hashing/Utils.hpp"
24 | #include "HMM.hpp"
25 |
26 | /**
27 | * Match object that does \\todo not clear to me what this does...
28 | */
29 | class Match
30 | {
31 | private:
32 | std::array mInterval = {0, 0};
33 | unsigned long mWordSize;
34 | unsigned mGaps = 0u;
35 |
36 | public:
37 | explicit Match(const unsigned long wordSize, const int i = 0) : mInterval{i, i}, mWordSize{wordSize}
38 | {
39 | }
40 |
41 | // pair : identifiers for the corresponding Individuals in all_ind
42 | void print(std::pair p, const double PAR_MIN_MATCH, const std::vector& geneticPositions,
43 | HMM& hmm)
44 | {
45 | const int intWordSize = static_cast(mWordSize);
46 | double mlen = asmc::cmBetween(mInterval[0], mInterval[1], geneticPositions, intWordSize);
47 | if (mlen >= PAR_MIN_MATCH) {
48 | const int from = mInterval[0] * intWordSize;
49 | const int to = mInterval[1] * intWordSize + intWordSize - 1;
50 |
51 | if(hmm.getDecodingParams().hashingOnly){
52 | unsigned int jInd = p.first / 2;
53 | unsigned int iInd = p.second / 2;
54 | PairObservations observation = hmm.makePairObs(p.first % 2 == 0 ? 1 : 2, jInd, p.second % 2 == 0 ? 1 : 2, iInd);
55 | hmm.writePairIBD(observation, from, to);
56 | } else {
57 | hmm.decodeFromHashing(p.first, p.second, from, to);
58 | }
59 | }
60 | }
61 |
62 | void extend(const int w)
63 | {
64 | mInterval[1] = std::max(w, mInterval[1]);
65 | }
66 |
67 | void addGap()
68 | {
69 | mGaps++;
70 | }
71 |
72 | const std::array& getInterval() const
73 | {
74 | return mInterval;
75 | }
76 |
77 | std::array& getModifiableInterval()
78 | {
79 | return mInterval;
80 | }
81 |
82 | unsigned int getGaps() const
83 | {
84 | return mGaps;
85 | }
86 |
87 | unsigned long getWordSize() const
88 | {
89 | return mWordSize;
90 | }
91 | };
92 |
93 | #endif // ASMC_HASHING_MATCH_HPP
94 |
--------------------------------------------------------------------------------
/src/hashing/SeedHash.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_HASHING_SEEDHASH_HPP
17 | #define ASMC_HASHING_SEEDHASH_HPP
18 |
19 | #include
20 | #include
21 |
22 | #include
23 |
24 | #include "hashing/ExtendHash.hpp"
25 | #include "hashing/Individuals.hpp"
26 | #include "Types.hpp"
27 |
28 | /* Object for storing initial word seeds */
29 | class SeedHash
30 | {
31 |
32 | using ind_vec = std::vector;
33 |
34 | boost::unordered_map> seed_hash;
35 | // Empty vector to insert into the seed hash
36 | std::vector vec;
37 | // Iterator for testing insertion of elements
38 | // std::pair > > >, bool> seed_ret;
40 | public:
41 | void insertIndividuals(unsigned int i, hash_size word)
42 | {
43 | auto seed_ret = seed_hash.insert(std::pair>(word, vec));
44 | (seed_ret.first->second).push_back(i);
45 | }
46 | void clear()
47 | {
48 | seed_hash.clear();
49 | }
50 | int size()
51 | {
52 | return seed_hash.size();
53 | }
54 |
55 | // Generate a new hash for this vector of Individualss
56 | static unsigned long subHash(ExtendHash* e, std::vector v, int w, ind_vec all_ind, const int MAX_seeds,
57 | const int jobID, const int jobs, const unsigned w_i, const unsigned w_j,
58 | const unsigned windowSize, const int GLOBAL_READ_WORDS, int& GLOBAL_SKIPPED_WORDS,
59 | const int GLOBAL_CURRENT_WORD, const bool is_j_above_diag)
60 | {
61 | SeedHash cur_sh;
62 | // seed the next word from this subset of Individualss
63 | for (unsigned int& i : v) {
64 | cur_sh.insertIndividuals(i, all_ind[i].getWordHash(w));
65 | }
66 | // recursion:
67 | return cur_sh.extendAllPairs(e, w, all_ind, MAX_seeds, jobID, jobs, w_i, w_j, windowSize, GLOBAL_READ_WORDS,
68 | GLOBAL_SKIPPED_WORDS, GLOBAL_CURRENT_WORD, is_j_above_diag);
69 | }
70 |
71 | // Extend/save all pairs in the current hash
72 | // ExtendHash * e : Pointer to ExtendHash which will be called for each pair
73 | // returns : number of pairs evaluated
74 | unsigned long extendAllPairs(ExtendHash* e, int w, ind_vec all_ind, const int MAX_seeds, const int jobID,
75 | const int jobs, const unsigned w_i, const unsigned w_j, const unsigned windowSize,
76 | const int GLOBAL_READ_WORDS, int& GLOBAL_SKIPPED_WORDS, const int GLOBAL_CURRENT_WORD,
77 | const bool is_j_above_diag)
78 | {
79 | unsigned long tot_pairs = 0;
80 | for (auto it = seed_hash.begin(); it != seed_hash.end(); ++it) {
81 |
82 | // *** As long as the # of pairs is high, generate a sub-hash for the next word
83 | // *** Only store pairs of Individuals that have collision in a small hash
84 | // *** Extend only to the haplotypes that seeded here
85 | if (MAX_seeds != 0 && it->second.size() > static_cast(MAX_seeds) && w + 1 < GLOBAL_READ_WORDS) {
86 | // recursively generate a sub-hash
87 | // IMPORTANT: if we run out of buffered words then this seed does not get analyzed
88 | if (w + 1 < GLOBAL_READ_WORDS) {
89 | tot_pairs += subHash(e, it->second, w + 1, all_ind, MAX_seeds, jobID, jobs, w_i, w_j, windowSize,
90 | GLOBAL_READ_WORDS, GLOBAL_SKIPPED_WORDS, GLOBAL_CURRENT_WORD, is_j_above_diag);
91 | } else {
92 | GLOBAL_SKIPPED_WORDS++;
93 | }
94 | } else {
95 | // tot_pairs += it->second.size() * (it->second.size() - 1) / 2;
96 | for (auto i = 0ul; i < it->second.size(); i++) {
97 | for (auto ii = i + 1ul; ii < it->second.size(); ii++) {
98 |
99 | unsigned int ind_i = std::max(it->second[i], it->second[ii]);
100 | unsigned int ind_j = std::min(it->second[i], it->second[ii]);
101 |
102 | // for the last job only
103 | if (jobID == jobs) {
104 | if (all_ind[ind_i].getIdNum() >= (w_i - 1) * windowSize &&
105 | all_ind[ind_j].getIdNum() >= (w_j - 1) * windowSize) {
106 | if (all_ind[ind_j].getIdNum() <
107 | (w_j - 1) * windowSize + (all_ind[ind_i].getIdNum() - (w_i - 1) * windowSize)) {
108 | e->extendPair(ind_j, ind_i, w, GLOBAL_CURRENT_WORD);
109 | tot_pairs++;
110 | }
111 | }
112 | }
113 |
114 | // for all other jobs
115 | else if ((all_ind[ind_i].getIdNum() >= (w_i - 1) * windowSize &&
116 | all_ind[ind_i].getIdNum() < w_i * windowSize) &&
117 | (all_ind[ind_j].getIdNum() >= (w_j - 1) * windowSize &&
118 | all_ind[ind_j].getIdNum() < w_j * windowSize)) {
119 | if (is_j_above_diag && all_ind[ind_j].getIdNum() < (w_j - 1) * windowSize + (all_ind[ind_i].getIdNum() -
120 | (w_i - 1) * windowSize)) {
121 | e->extendPair(ind_j, ind_i, w, GLOBAL_CURRENT_WORD);
122 | tot_pairs++;
123 | } else if (!is_j_above_diag &&
124 | all_ind[ind_j].getIdNum() >=
125 | (w_j - 1) * windowSize + (all_ind[ind_i].getIdNum() - (w_i - 1) * windowSize)) {
126 | e->extendPair(ind_j, ind_i, w, GLOBAL_CURRENT_WORD);
127 | tot_pairs++;
128 | }
129 | }
130 | }
131 | }
132 | }
133 | }
134 | return tot_pairs;
135 | }
136 | };
137 |
138 | #endif // ASMC_HASHING_SEEDHASH_HPP
139 |
--------------------------------------------------------------------------------
/src/hashing/Utils.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "hashing/Utils.hpp"
17 |
18 | #include
19 | #include
20 | #include
21 |
namespace asmc
{

// Genetic distance between the first site of word w1 and the last site of word w2, i.e. the
// difference of the two genetic positions multiplied by 100. The end site is clamped to the last
// available position, so w2 may be a final, partially filled word.
// NOTE(review): element type restored as float - confirm against the declaration in Utils.hpp
double cmBetween(const int w1, const int w2, const std::vector<float>& geneticPositions, const int wordSize)
{
  // Check signs first: the index arithmetic below is unsigned
  assert(!geneticPositions.empty());
  assert(w1 >= 0);
  assert(w2 >= 0);
  assert(w2 >= w1);

  // Work in std::size_t throughout so the products cannot overflow int and the bounds comparisons
  // are not mixed signed/unsigned
  const std::size_t word = static_cast<std::size_t>(wordSize);
  const std::size_t start = word * static_cast<std::size_t>(w1);
  assert(start < geneticPositions.size());

  const std::size_t lastSite = geneticPositions.size() - 1ul;
  const std::size_t end = std::min(word * static_cast<std::size_t>(w2) + word - 1ul, lastSite);

  return 100.0 * (geneticPositions[end] - geneticPositions[start]);
}

} // namespace asmc
35 |
--------------------------------------------------------------------------------
/src/hashing/Utils.hpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #ifndef ASMC_HASHING_UTILS_HPP
17 | #define ASMC_HASHING_UTILS_HPP
18 |
19 | #include
20 |
namespace asmc
{

/**
 * Convenience function to compute genetic distance between two words (start of w1 and end of w2).
 *
 * The start is site (wordSize * w1) and the end is site (wordSize * w2 + wordSize - 1), clamped to
 * the last entry of geneticPositions. The difference of the two positions is multiplied by 100
 * (presumably converting Morgans to centimorgans - confirm against how the map is read).
 *
 * @param w1 the first word (non-negative; its first site must exist in geneticPositions)
 * @param w2 the second word (must not be smaller than w1)
 * @param geneticPositions vector of genetic positions, one per site (must not be empty)
 *        NOTE(review): element type restored as float - confirm against the Data class
 * @param wordSize number of locations per word
 * @return the number of centimorgans between start of w1 and end of w2
 */
double cmBetween(int w1, int w2, const std::vector<float>& geneticPositions, int wordSize);

} // namespace asmc
36 |
37 | #endif // ASMC_HASHING_UTILS_HPP
38 |
--------------------------------------------------------------------------------
/test/cli_interface_test.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import os
3 | import subprocess
4 | import sys
5 |
6 |
def list_files(startpath):
    """
    Print the directory tree below ``startpath``, one entry per line.

    Directories are printed with a trailing ``/`` and indented four spaces per level below
    ``startpath``; the files of a directory are printed one level deeper than the directory.

    :param startpath: directory to walk
    """
    for root, _dirs, files in os.walk(startpath):
        # Depth comes from the path relative to startpath. Stripping startpath as a substring is
        # wrong when it ends in a separator or occurs again further down the path.
        rel_root = os.path.relpath(root, startpath)
        level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1

        indent = ' ' * 4 * level
        # normpath drops a trailing separator, which would otherwise give an empty basename
        print('{}{}/'.format(indent, os.path.basename(os.path.normpath(root))))

        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
15 |
16 |
def test_regession(asmc_exe):
    """
    Run the ASMC regression test, which will test the output of an example ASMC run with the cached result in
    data/regression_test_original.gz.

    The executable is run on the example array data with ``--posteriorSums``; the test fails if the
    executable exits with a non-zero status, if no output file is produced, or if any output line
    differs from the cached result.

    :param asmc_exe: path to the ASMC executable
    """

    script_dir = os.path.realpath(os.path.dirname(__file__))
    base_dir = os.path.realpath(os.path.join(script_dir, '..', '..'))
    old_file = os.path.join(script_dir, 'data', 'regression_test_original.gz')
    print('-' * 35)
    print('script dir', script_dir)
    print('base dir', base_dir)
    print('asmc exe', asmc_exe)
    print('-' * 35)
    assert os.path.isfile(old_file)

    # Old file contents are before OxfordRSE involvement in ASMC
    with gzip.open(old_file, 'rt') as gz_f:
        old_lines = gz_f.readlines()

    # New file contents are the result of running the example with the current ASMC source
    decoding_file = os.path.join(base_dir, 'FILES', 'DECODING_QUANTITIES', '30-100-2000_CEU.decodingQuantities.gz')
    in_file_root = os.path.join(base_dir, 'FILES', 'EXAMPLE', 'exampleFile.n300.array')

    return_code = subprocess.call([
        asmc_exe,
        '--decodingQuantFile', decoding_file,
        '--inFileRoot', in_file_root,
        '--posteriorSums',
    ])
    # Without this check an output file left over from an earlier run could make a failed run pass
    assert return_code == 0, \
        "The executable {} exited with status {}".format(asmc_exe, return_code)

    new_file = os.path.join(base_dir, 'FILES', 'EXAMPLE', 'exampleFile.n300.array.1-1.sumOverPairs.gz')
    assert os.path.isfile(new_file), \
        "No output file found at {}. Did the executable run as expected?".format(new_file)

    with gzip.open(new_file, 'rt') as gz_f:
        new_lines = gz_f.readlines()

    assert len(old_lines) == len(new_lines), \
        "The outputs have different numbers of lines ({} and {})".format(len(old_lines), len(new_lines))

    # Report 1-based line numbers so they match what an editor or zcat | sed -n shows
    for line_number, (old, new) in enumerate(zip(old_lines, new_lines), start=1):
        assert old == new, "The outputs first differ at line {}".format(line_number)

    print('\n' + '#' * 35)
    print('# Regression test passed #')
    print('# All {} output lines identical #'.format(len(old_lines)))
    print('#' * 35 + '\n')
67 |
68 |
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the path to the ASMC executable
    assert len(sys.argv) == 2, "Usage: {} /path/to/ASMC_exe".format(sys.argv[0])

    path_to_asmc = sys.argv[-1]

    # The path must be an existing file whose path contains 'ASMC_exe'
    looks_like_asmc = os.path.isfile(path_to_asmc) and 'ASMC_exe' in path_to_asmc
    assert looks_like_asmc, "Expected path to ASMC executable, but got {}".format(path_to_asmc)

    test_regession(path_to_asmc)
77 |
--------------------------------------------------------------------------------
/test/test_ASMC.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "catch.hpp"
17 |
18 | #include
19 | #include
20 |
21 | #include "ASMC.hpp"
22 |
23 | #include
24 | #include
25 | #include
26 |
27 | TEST_CASE("test ASMC decodePairsArray", "[ASMC]")
28 | {
29 | ASMC::ASMC asmc(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
30 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");
31 |
32 | asmc.setStorePerPairMap();
33 | asmc.setStorePerPairPosterior();
34 | asmc.setStorePerPairPosteriorMean();
35 | asmc.setStorePerPairMap();
36 |
37 | std::vector indA = {1, 2, 3};
38 | std::vector indB = {2, 3, 4};
39 | asmc.decodePairs(indA, indB);
40 | auto result = asmc.getRefOfResults();
41 |
42 | SECTION("test decode pair summarize")
43 | {
44 | REQUIRE(result.perPairIndices.size() == 3ul);
45 |
46 | // 0.1% margin in this test as the results can vary between pure and avx/sse
47 | REQUIRE(result.perPairPosteriorMeans(0, 0) == Approx(15968.91016f).margin(15968.91016f * 0.001f));
48 | REQUIRE(result.perPairPosteriorMeans(1, 8) == Approx(27963.49805f).margin(27963.49805f * 0.001f));
49 | REQUIRE(result.perPairPosteriorMeans(2, 29) == Approx(48573.32812f).margin(48573.32812f * 0.001f));
50 |
51 | REQUIRE(result.perPairMAPs(0, 0) == 29);
52 | REQUIRE(result.perPairMAPs(1, 1234) == 65);
53 | REQUIRE(result.perPairMAPs(2, 7) == 33);
54 |
55 | // Check that the posteriors actually sum to one
56 | for (Eigen::Index idx = 0ll; idx < result.perPairPosteriors.size(); ++idx) {
57 | REQUIRE(result.perPairPosteriors.at(idx).colwise().sum().isOnes(1e-2));
58 | }
59 | }
60 | }
61 |
62 | TEST_CASE("test ASMC decodePairsSequence", "[ASMC]")
63 | {
64 | ASMC::ASMC asmc(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300",
65 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz", "", "sequence");
66 |
67 | asmc.setStorePerPairMap();
68 | asmc.setStorePerPairPosterior();
69 | asmc.setStorePerPairPosteriorMean();
70 | asmc.setStorePerPairMap();
71 |
72 | std::vector indA = {5, 6};
73 | std::vector indB = {7, 8};
74 | asmc.decodePairs(indA, indB);
75 | auto result = asmc.getRefOfResults();
76 |
77 | SECTION("test decode pair summarize")
78 | {
79 | REQUIRE(result.perPairIndices.size() == 2ul);
80 |
81 | // 0.1% margin in this test as the results can vary between pure and avx/sse
82 | REQUIRE(result.perPairPosteriorMeans(0, 0) == Approx(801.06647f).margin(801.06647f * 0.001f));
83 | REQUIRE(result.perPairPosteriorMeans(1, 8) == Approx(17953.60938f).margin(17953.60938f * 0.001f));
84 |
85 | REQUIRE(result.perPairMAPs(0, 0) == 16);
86 | REQUIRE(result.perPairMAPs(1, 1234) == 61);
87 |
88 | // Check that the posteriors actually sum to one
89 | for (Eigen::Index idx = 0ll; idx < result.perPairPosteriors.size(); ++idx) {
90 | REQUIRE(result.perPairPosteriors.at(idx).colwise().sum().isOnes(1e-2));
91 | }
92 | }
93 | }
94 |
95 | TEST_CASE("test other get methods", "[ASMC]")
96 | {
97 | ASMC::ASMC asmc(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
98 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");
99 |
100 | const std::vector& expectedTimes = asmc.getExpectedTimes();
101 | CHECK(expectedTimes.at(0) == Approx(14.999777896567f).margin(1e-5));
102 | CHECK(expectedTimes.at(4) == Approx(135.698150766900f).margin(1e-5));
103 | }
104 |
105 | TEST_CASE("test from and to", "[ASMC]")
106 | {
107 | ASMC::ASMC asmc_full(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
108 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");
109 |
110 | ASMC::ASMC asmc_part(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
111 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");
112 |
113 | std::vector indA = {1, 2, 3, 4, 5};
114 | std::vector indB = {6, 7, 8, 9, 10};
115 |
116 | asmc_full.setStorePerPairMap();
117 | asmc_full.setStorePerPairPosterior();
118 | asmc_full.setStorePerPairPosteriorMean();
119 | asmc_full.setStoreSumOfPosterior();
120 |
121 | asmc_part.setStorePerPairMap();
122 | asmc_part.setStorePerPairPosterior();
123 | asmc_part.setStorePerPairPosteriorMean();
124 | asmc_part.setStoreSumOfPosterior();
125 |
126 | asmc_full.decodePairs(indA, indB);
127 | auto result_full = asmc_full.getRefOfResults();
128 | asmc_part.setStorePerPairMap();
129 |
130 | const unsigned lo = 1000;
131 | const unsigned hi = 1100;
132 | const unsigned long windowSize = static_cast(hi - lo);
133 | asmc_part.decodePairs(indA, indB, lo, hi, 0.5f);
134 | auto result_part = asmc_part.getRefOfResults();
135 |
136 | SECTION("test part sizes are correct")
137 | {
138 | REQUIRE(result_part.perPairPosteriors.front().rows() == result_full.perPairPosteriors.front().rows());
139 | REQUIRE(result_part.perPairPosteriors.front().cols() == windowSize);
140 |
141 | REQUIRE(result_part.sumOfPosteriors.rows() == result_full.sumOfPosteriors.rows());
142 | REQUIRE(result_part.sumOfPosteriors.cols() == windowSize);
143 |
144 | REQUIRE(result_part.perPairPosteriorMeans.rows() == result_full.perPairPosteriorMeans.rows());
145 | REQUIRE(result_part.perPairPosteriorMeans.cols() == windowSize);
146 |
147 | REQUIRE(result_part.minPosteriorMeans.cols() == windowSize);
148 | REQUIRE(result_part.argminPosteriorMeans.cols() == windowSize);
149 |
150 | REQUIRE(result_part.perPairMAPs.rows() == result_full.perPairMAPs.rows());
151 | REQUIRE(result_part.perPairMAPs.cols() == windowSize);
152 |
153 | REQUIRE(result_part.minMAPs.cols() == windowSize);
154 | REQUIRE(result_part.argminMAPs.cols() == windowSize);
155 | }
156 |
157 | SECTION("test parts match full analysis")
158 | {
159 | for(auto i = 0ul; i < indA.size(); ++i) {
160 | REQUIRE(
161 | (result_full.perPairPosteriors.at(i).middleCols(static_cast(lo), windowSize)
162 | - result_part.perPairPosteriors.at(i)).abs().maxCoeff() < 1e-6
163 | );
164 | }
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/test/test_HMM.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "catch.hpp"
17 |
18 | #include
19 | #include
20 |
21 | #include "HMM.hpp"
22 |
23 | TEST_CASE("test hmm functions", "[HMM]")
24 | {
25 |   // Fixture shared by all sections; Catch2 re-runs the test case body once per SECTION.
26 |   DecodingParams params(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
27 |                         ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz");
28 |   Data data(params);
29 |   HMM hmm(data, params);
30 |   // NOTE(review): presumably guarantees the individual indices used below (up to 16) exist - confirm
31 |   REQUIRE(data.individuals.size() > 20);
32 |
33 |   SECTION("test decode pair summarize")
34 |   {
35 |     PairObservations pairObs = hmm.makePairObs(1, 0, 2, 0);
36 |     const auto decodeResult = hmm.decode(pairObs);
37 |     const auto decodeSummary = hmm.decodeSummarize(pairObs);
38 |     REQUIRE(!decodeResult.empty()); // decodeResult[0] is read below
39 |     // check that the MAP and posterior mean are the same length
40 |     REQUIRE(decodeSummary.first.size() == decodeSummary.second.size());
41 |     REQUIRE(decodeSummary.first.size() == decodeResult[0].size());
42 |   }
43 |
44 |   SECTION("test decode pair")
45 |   {
46 |     REQUIRE(hmm.getBatchBuffer().size() == 0);
47 |     hmm.decodePair(0, 9);
48 |     REQUIRE(hmm.getBatchBuffer().size() == 4); // pair of distinct individuals: 4 buffered entries
49 |     hmm.decodePair(1, 1);
50 |     REQUIRE(hmm.getBatchBuffer().size() == 5); // individual paired with itself: 1 more entry
51 |   }
52 |
53 |   SECTION("test decode pairs")
54 |   {
55 |     REQUIRE(hmm.getBatchBuffer().size() == 0);
56 |     hmm.decodePairs({ 0, 1 }, { 9, 1 }); // expected to match decodePair(0, 9) then decodePair(1, 1)
57 |     REQUIRE(hmm.getBatchBuffer().size() == 5);
58 |   }
59 |
60 |   SECTION("test finishDecoding")
61 |   {
62 |     REQUIRE(hmm.getBatchBuffer().size() == 0);
63 |     hmm.decodePair(0, 9);
64 |     REQUIRE(hmm.getBatchBuffer().size() == 4);
65 |     hmm.finishDecoding(); // expected to leave nothing behind in the buffer
66 |     REQUIRE(hmm.getBatchBuffer().size() == 0);
67 |   }
68 |
69 |   SECTION("test fill up buffer")
70 |   {
71 |     // default batch size is 64; at 4 entries per distinct pair, 64 / 4 calls fill it exactly
72 |     for (int i = 1; i <= 64 / 4; ++i) {
73 |       hmm.decodePair(0, i);
74 |     }
75 |
76 |     // buffer should be empty now
77 |     REQUIRE(hmm.getBatchBuffer().size() == 0);
78 |   }
79 | }
80 |
--------------------------------------------------------------------------------
/test/test_binary_data_reader.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "catch.hpp"
17 |
18 | #include "BinaryDataReader.hpp"
19 |
20 | TEST_CASE("IbdPairDataLine default member test", "[BinaryDataReader]")
21 | {
22 |   // Every member of a default-constructed line must hold its placeholder value.
23 |   IbdPairDataLine pairLine;
24 |   REQUIRE(pairLine.ind1FamId == "0_00");
25 |   REQUIRE(pairLine.ind1Id == "0_00");
26 |   REQUIRE(pairLine.ind1Hap == -1);
27 |   REQUIRE(pairLine.ind2FamId == "0_00");
28 |   REQUIRE(pairLine.ind2Id == "0_00");
29 |   REQUIRE(pairLine.ind2Hap == -1);
30 |   REQUIRE(pairLine.chromosome == -1);
31 |   REQUIRE(pairLine.ibdStart == -1);
32 |   REQUIRE(pairLine.ibdEnd == -1);
33 |   REQUIRE(pairLine.lengthInCentimorgans == -1.f);
34 |   REQUIRE(pairLine.ibdScore == -1.f);
35 |   REQUIRE(pairLine.postEst == -1.f);
36 |   REQUIRE(pairLine.mapEst == -1.f);
37 |
38 |   // In that state toString() yields nine tab-separated columns and none of the float members.
39 |   REQUIRE(pairLine.toString() == "0_00\t0_00\t-1\t0_00\t0_00\t-1\t-1\t-1\t-1");
40 |   // After the floats are assigned they are appended as length, score, posterior estimate, MAP estimate.
41 |   pairLine.lengthInCentimorgans = 1.2;
42 |   pairLine.ibdScore = 0.1;
43 |   pairLine.postEst = 2.3;
44 |   pairLine.mapEst = 3.4;
45 |   REQUIRE(pairLine.toString() == "0_00\t0_00\t-1\t0_00\t0_00\t-1\t-1\t-1\t-1\t1.2\t0.1\t2.3\t3.4");
46 | }
47 |
48 | TEST_CASE("BinaryDataReader real data test with decoding", "[BinaryDataReader]")
49 | {
50 |   BinaryDataReader dataReader(ASMC_DATA_DIR "/testing/fastsmc/binary_output.bibd.gz");
51 |   // First record: ids, haplotypes, positions and all four float columns are pinned.
52 |   IbdPairDataLine firstLine = dataReader.getNextLine();
53 |   REQUIRE(firstLine.ind1FamId == "1_94");
54 |   REQUIRE(firstLine.ind1Id == "1_94");
55 |   REQUIRE(firstLine.ind1Hap == 1);
56 |   REQUIRE(firstLine.ind2FamId == "1_104");
57 |   REQUIRE(firstLine.ind2Id == "1_104");
58 |   REQUIRE(firstLine.ind2Hap == 1);
59 |   REQUIRE(firstLine.chromosome == 1);
60 |   REQUIRE(firstLine.ibdStart == 8740);
61 |   REQUIRE(firstLine.ibdEnd == 1660011);
62 |   REQUIRE(firstLine.lengthInCentimorgans == Approx(1.86962f).epsilon(1e-5));
63 |   REQUIRE(firstLine.ibdScore == Approx(0.5073708f).epsilon(1e-5));
64 |   REQUIRE(firstLine.postEst == Approx(215.6709f).epsilon(1e-5));
65 |   REQUIRE(firstLine.mapEst == Approx(24.99997f).epsilon(1e-5));
66 |   // Second record: same pair, a segment that starts and ends at the same position (length 0).
67 |   IbdPairDataLine secondLine = dataReader.getNextLine();
68 |   REQUIRE(secondLine.ind1FamId == "1_94");
69 |   REQUIRE(secondLine.ind1Id == "1_94");
70 |   REQUIRE(secondLine.ind1Hap == 1);
71 |   REQUIRE(secondLine.ind2FamId == "1_104");
72 |   REQUIRE(secondLine.ind2Id == "1_104");
73 |   REQUIRE(secondLine.ind2Hap == 1);
74 |   REQUIRE(secondLine.chromosome == 1);
75 |   REQUIRE(secondLine.ibdStart == 1679626);
76 |   REQUIRE(secondLine.ibdEnd == 1679626);
77 |   REQUIRE(secondLine.lengthInCentimorgans == Approx(0.f).epsilon(1e-5));
78 |   REQUIRE(secondLine.ibdScore == Approx(0.02249517f).epsilon(1e-5));
79 |   REQUIRE(secondLine.postEst == Approx(25544.65f).epsilon(1e-5));
80 |   REQUIRE(secondLine.mapEst == Approx(24.99997f).epsilon(1e-5));
81 |   // Drain the remaining records, counting on top of the two already read.
82 |   int lineCount = 2;
83 |   for (; dataReader.moreLinesInFile(); ++lineCount) {
84 |     dataReader.getNextLine();
85 |   }
86 |
87 |   // total number of records expected in the fixture file
88 |   REQUIRE(lineCount == 1574);
89 | }
90 |
91 | TEST_CASE("BinaryDataReader real data test with only hashing", "[BinaryDataReader]")
92 | {
93 |   BinaryDataReader dataReader(ASMC_DATA_DIR "/testing/fastsmc/binary_output_hashing.bibd.gz");
94 |   // Only the fifth record is inspected, so the first four are read and discarded.
95 |   for (int skipped = 0; skipped < 4; ++skipped) {
96 |     dataReader.getNextLine();
97 |   }
98 |
99 |   // Ids and positions come from the file; the float members are expected to keep their -1 default.
100 |   IbdPairDataLine fifthLine = dataReader.getNextLine();
101 |   REQUIRE(fifthLine.ind1FamId == "1_35");
102 |   REQUIRE(fifthLine.ind1Id == "1_35");
103 |   REQUIRE(fifthLine.ind1Hap == 1);
104 |   REQUIRE(fifthLine.ind2FamId == "1_99");
105 |   REQUIRE(fifthLine.ind2Id == "1_99");
106 |   REQUIRE(fifthLine.ind2Hap == 2);
107 |   REQUIRE(fifthLine.chromosome == 1);
108 |   REQUIRE(fifthLine.ibdStart == 8740);
109 |   REQUIRE(fifthLine.ibdEnd == 1572363);
110 |   REQUIRE(fifthLine.lengthInCentimorgans == -1.f); // default value when not in file
111 |   REQUIRE(fifthLine.ibdScore == -1.f); // default value when not in file
112 |   REQUIRE(fifthLine.postEst == -1.f); // default value when not in file
113 |   REQUIRE(fifthLine.mapEst == -1.f); // default value when not in file
114 |   // Drain the remaining records, counting on top of the five already read.
115 |   int lineCount = 5;
116 |   for (; dataReader.moreLinesInFile(); ++lineCount) {
117 |     dataReader.getNextLine();
118 |   }
119 |
120 |   // total number of records expected in the fixture file
121 |   REQUIRE(lineCount == 495);
122 | }
123 |
--------------------------------------------------------------------------------
/test/test_decoding_params.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "catch.hpp"
17 |
18 | #include
19 |
20 | #include "DecodingParams.hpp"
21 |
22 | TEST_CASE("test DecodingParams", "[DecodingParams]")
23 | {
24 |   const std::string dataRoot = ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array";
25 |   const std::string quantitiesFile = ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz";
26 |   // Only the two required arguments: the test expects array-folded mode with compression off.
27 |   SECTION("test array folded") {
28 |     DecodingParams params(dataRoot, quantitiesFile);
29 |     REQUIRE(params.decodingMode == DecodingMode::arrayFolded);
30 |     REQUIRE(params.compress == false);
31 |   }
32 |   // "sequence" with _useAncestral == false is expected to resolve to DecodingMode::sequenceFolded.
33 |   SECTION("test sequence folded") {
34 |     DecodingParams params(dataRoot, quantitiesFile,
35 |                           "",         // _outFileRoot
36 |                           1,          // _jobs
37 |                           1,          // _jobInd
38 |                           "sequence", // _decodingModeString, override default
39 |                           false,      // _decodingSequence
40 |                           true,       // _usingCSFS
41 |                           true,       // _compress, override default
42 |                           false,      // _useAncestral
43 |                           nan("")     // _skipCSFSdistance, override default
44 |     );
45 |     REQUIRE(params.decodingMode == DecodingMode::sequenceFolded);
46 |     REQUIRE(params.compress == true);
47 |   }
48 |   // "sequence" with _useAncestral == true is expected to resolve to DecodingMode::sequence.
49 |   SECTION("test sequence") {
50 |     DecodingParams params(dataRoot, quantitiesFile,
51 |                           "",         // _outFileRoot
52 |                           1,          // _jobs
53 |                           1,          // _jobInd
54 |                           "sequence", // _decodingModeString, override default
55 |                           false,      // _decodingSequence
56 |                           true,       // _usingCSFS
57 |                           false,      // _compress
58 |                           true        // _useAncestral, override default
59 |     );
60 |     REQUIRE(params.decodingMode == DecodingMode::sequence);
61 |   }
62 | }
63 |
--------------------------------------------------------------------------------
/test/test_decoding_quantities.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "catch.hpp"
17 |
18 | #include
19 |
20 | #include "DecodingQuantities.hpp"
21 |
22 | using Catch::Matchers::Contains;
23 |
24 | TEST_CASE("test validate decoding quantities file", "[DecodingQuantities]")
25 | {
26 |   const std::string missingFile = ASMC_DATA_DIR "/random_nonexistent_file.txt";
27 |   const std::string validFile = ASMC_DATA_DIR "/testing/asmc/decoding_quantities_good.txt";
28 |   const std::string malformedFile = ASMC_DATA_DIR "/testing/asmc/decoding_quantities_bad.txt";
29 |   // A path that does not exist must be named in the exception message.
30 |   SECTION("test nonexistent file")
31 |   {
32 |     CHECK_THROWS_WITH(DecodingQuantities{missingFile},
33 |                       Contains("random_nonexistent_file.txt does not exist"));
34 |   }
35 |   // A well-formed file must construct without throwing.
36 |   SECTION("test good file")
37 |   {
38 |     CHECK_NOTHROW(DecodingQuantities{validFile});
39 |   }
40 |   // A malformed file must be rejected with a message that names the file and quotes what was found.
41 |   SECTION("test bad file")
42 |   {
43 |     CHECK_THROWS_WITH(DecodingQuantities{malformedFile},
44 |                       Contains("decoding_quantities_bad.txt does not seem to contain the correct information") &&
45 |                           Contains("but instead found \"this file does not start with"));
46 |   }
47 | }
48 |
--------------------------------------------------------------------------------
/test/test_hashing.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #include "catch.hpp"
17 |
18 | #include "hashing/ExtendHash.hpp"
19 | #include "hashing/Individuals.hpp"
20 | #include "hashing/Match.hpp"
21 | #include "hashing/SeedHash.hpp"
22 | #include "hashing/Utils.hpp"
23 |
24 | TEST_CASE("ExtendHash", "[hashing]")
25 | {
26 |   // A freshly built hash is empty and reports the word size passed as its first argument.
27 |   ExtendHash extendHash(4ul, 2ul, true);
28 |   REQUIRE(extendHash.size() == 0ul);
29 |   REQUIRE(extendHash.getWordSize() == 4ul);
30 |   // TODO: exercise the remaining ExtendHash behaviour
31 | }
32 |
33 | TEST_CASE("individuals", "[hashing]")
34 | {
35 |   Individuals individual(8ul, 3ul, 5u);
36 |   // The constructor arguments are echoed back as word size, read-ahead count and id number.
37 |   REQUIRE(individual.getIdNum() == 5u);
38 |   REQUIRE(individual.getWordSize() == 8ul);
39 |   REQUIRE(individual.getNumReadAhead() == 3ul);
40 |
41 |   // Query ten word indices; internally they cycle 0-1-2-0-1-2-0-1-2-0, and every word starts out empty.
42 |   for (int wordIdx = 0; wordIdx != 10; ++wordIdx) {
43 |     REQUIRE(individual.getWordHash(wordIdx) == 0ul);
44 |     REQUIRE(individual.getWordString(wordIdx) == "00000000");
45 |   }
46 |
47 |   // Set bit 0 of word 0, bit 2 of word 1, and bits 2 and 3 of word 2.
48 |   individual.setMarker(0, 0);
49 |   individual.setMarker(1, 2);
50 |   individual.setMarker(2, 2);
51 |   individual.setMarker(2, 3);
52 |
53 |   REQUIRE(individual.getWordHash(0) == 1ul);
54 |   REQUIRE(individual.getWordString(0) == "00000001");
55 |
56 |   REQUIRE(individual.getWordHash(1) == 4ul);
57 |   REQUIRE(individual.getWordString(1) == "00000100");
58 |
59 |   REQUIRE(individual.getWordHash(2) == 12ul);
60 |   REQUIRE(individual.getWordString(2) == "00001100");
61 |
62 |   // Clearing word 2 resets both its hash and its string.
63 |   individual.clear(2);
64 |   REQUIRE(individual.getWordHash(2) == 0ul);
65 |   REQUIRE(individual.getWordString(2) == "00000000");
66 |
67 |   // Index 4 maps onto word 1 (indices cycle over three words), so clearing 4 clears word 1.
68 |   individual.clear(4);
69 |   REQUIRE(individual.getWordHash(1) == 0ul);
70 |   REQUIRE(individual.getWordString(1) == "00000000");
71 | }
72 |
73 | TEST_CASE("match", "[hashing]")
74 | {
75 |   SECTION("default construction")
76 |   {
77 |     Match candidate(4ul);
78 |     REQUIRE(candidate.getWordSize() == 4ul);
79 |     REQUIRE(candidate.getGaps() == 0u);
80 |     REQUIRE(candidate.getInterval()[0] == 0);
81 |     REQUIRE(candidate.getInterval()[1] == 0);
82 |     // each addGap() call raises the gap count by one
83 |     candidate.addGap();
84 |     candidate.addGap();
85 |     REQUIRE(candidate.getGaps() == 2u);
86 |     // extending beyond the current end (0) moves the interval end
87 |     candidate.extend(5);
88 |     REQUIRE(candidate.getInterval()[1] == 5);
89 |   }
90 |
91 |   SECTION("explicit constructor")
92 |   {
93 |     Match candidate(4, 7);
94 |     REQUIRE(candidate.getWordSize() == 4ul);
95 |     REQUIRE(candidate.getGaps() == 0u);
96 |     REQUIRE(candidate.getInterval()[0] == 7);
97 |     REQUIRE(candidate.getInterval()[1] == 7);
98 |     // extending to a position before the current end (7) must leave the end untouched
99 |     candidate.extend(5);
100 |     REQUIRE(candidate.getInterval()[1] == 7);
101 |     // extending to a later position moves the end forward
102 |     candidate.extend(8);
103 |     REQUIRE(candidate.getInterval()[1] == 8);
104 |   }
105 |
106 |   SECTION("print method")
107 |   {
108 |     // TODO: this method is harder to test because it requires access to an HMM instance
109 |   }
110 | }
111 |
112 | TEST_CASE("SeedHash", "[hashing]")
113 | {
114 |   // A default-constructed SeedHash must report a size of zero.
115 |   SeedHash seedHash;
116 |   REQUIRE(seedHash.size() == 0ul);
117 |   // TODO: exercise the remaining SeedHash behaviour
118 | }
119 |
120 | TEST_CASE("utils", "[hashing]")
121 | {
122 |   SECTION("cmBetween")
123 |   {
124 |     std::vector<float> genPos = {0.00402186f, 0.0388124f, 0.0567817f, 0.0668489f, 0.0915063f, 0.12783f,  0.198618f,
125 |                                  0.199045f,   0.250093f,  0.259338f,  0.293267f,  0.294899f,  0.316173f, 0.353332f,
126 |                                  0.354553f,   0.357123f,  0.359118f,  0.395468f,  0.41749f,   0.421739f, 0.453347f,
127 |                                  0.471302f,   0.535031f,  0.548733f,  0.574022f,  0.604538f,  0.620419f};
128 |     // 27 positions: with a word size of 4, words 0-5 fit entirely inside the vector (indices 0-23)
129 |     SECTION("both words are inside vector")
130 |     {
131 |       const int wordSize = 4;
132 |       const int w1 = 0;
133 |       const int w2 = 3;
134 |       const int w3 = 5;
135 |       // expected span: first position of the first word (w * 4) to last position of the second (w * 4 + 3), times 100
136 |       REQUIRE(asmc::cmBetween(w1, w2, genPos, wordSize) == 100.0 * (genPos.at(15) - genPos.at(0)));
137 |       REQUIRE(asmc::cmBetween(w1, w3, genPos, wordSize) == 100.0 * (genPos.at(23) - genPos.at(0)));
138 |       REQUIRE(asmc::cmBetween(w2, w3, genPos, wordSize) == 100.0 * (genPos.at(23) - genPos.at(12)));
139 |     }
140 |
141 |     SECTION("second word overflows vector")
142 |     {
143 |       const int wordSize = 4;
144 |       const int w1 = 0;
145 |       const int w2 = 1;
146 |       const int w3 = 10;
147 |       // word 10 would end at index 43, past the last element, so the end is expected to fall back to genPos.back()
148 |       REQUIRE(asmc::cmBetween(w1, w3, genPos, wordSize) == 100.0 * (genPos.back() - genPos.at(0)));
149 |       REQUIRE(asmc::cmBetween(w2, w3, genPos, wordSize) == 100.0 * (genPos.back() - genPos.at(4)));
150 |     }
151 |   }
152 | }
--------------------------------------------------------------------------------
/test/test_regression.cpp:
--------------------------------------------------------------------------------
1 | // This file is part of ASMC, developed by Pier Francesco Palamara.
2 | //
3 | // ASMC is free software: you can redistribute it and/or modify
4 | // it under the terms of the GNU General Public License as published by
5 | // the Free Software Foundation, either version 3 of the License, or
6 | // (at your option) any later version.
7 | //
8 | // ASMC is distributed in the hope that it will be useful,
9 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | // GNU General Public License for more details.
12 | //
13 | // You should have received a copy of the GNU General Public License
14 | // along with ASMC. If not, see <https://www.gnu.org/licenses/>.
15 |
16 | #define CATCH_CONFIG_MAIN
17 | #include "catch.hpp"
18 |
19 | #include "ASMC.hpp"
20 | #include "FileUtils.hpp"
21 |
22 | #include
23 | #include
24 |
25 | #include
26 |
27 | TEST_CASE("test ASMC regression", "[HMM_regression]")
28 | {
29 | // we only needed to set doPosteriorSums to true, but because C++ does
30 | // not have keyword arguments we need to go through everything
31 | DecodingParams params(ASMC_DATA_DIR "/examples/asmc/exampleFile.n300.array",
32 | ASMC_DATA_DIR "/decoding_quantities/30-100-2000_CEU.decodingQuantities.gz",
33 | "", // _outFileRoot
34 | 1, // _jobs
35 | 1, // _jobInd
36 | "array", // _decodingModeString
37 | false, // _decodingSequence
38 | true, // _usingCSFS
39 | false, // _compress
40 | false, // _useAncestral
41 | 0.f, // _skipCSFSdistance
42 | false, // _noBatches
43 | true // _doPosteriorSums
44 | );
45 |
46 | std::vector indToDecodeA = {1ul, 2ul, 3ul};
47 | std::vector indToDecodeB = {2ul, 3ul, 4ul};
48 |
49 | ASMC::ASMC asmc(params);
50 | asmc.setStorePerPairPosteriorMean();
51 | asmc.setStorePerPairMap();
52 | asmc.decodePairs(indToDecodeA, indToDecodeB);
53 |
54 | auto res = asmc.getRefOfResults();
55 |
56 | SECTION("regression test per pair posterior means")
57 | {
58 |
59 | CHECK(res.perPairPosteriorMeans.rows() == 3ll);
60 | CHECK(res.perPairPosteriorMeans.cols() == 6760ll);
61 |
62 | std::string regressionFile = ASMC_DATA_DIR "/testing/asmc/regression/regression.perPairPosteriorMeans.gz";
63 | FileUtils::AutoGzIfstream fin;
64 | fin.openOrExit(regressionFile);
65 |
66 | for (auto rowIdx = 0ul; rowIdx < indToDecodeA.size(); ++rowIdx) {
67 | std::string line;
68 | getline(fin, line);
69 | std::istringstream iss(line);
70 | std::vector rowAsFloats = {std::istream_iterator(iss), std::istream_iterator()};
71 |
72 | CHECK(rowAsFloats.size() == 6760ul);
73 | for (auto colIdx = 0ul; colIdx < rowAsFloats.size(); ++colIdx) {
74 | CHECK(res.perPairPosteriorMeans(rowIdx, colIdx) == Approx(rowAsFloats.at(colIdx)).epsilon(0.001));
75 | }
76 | }
77 | }
78 |
79 | SECTION("regression test per pair MAP")
80 | {
81 | CHECK(res.perPairMAPs.rows() == 3ll);
82 | CHECK(res.perPairMAPs.cols() == 6760ll);
83 |
84 | std::string regressionFile = ASMC_DATA_DIR "/testing/asmc/regression/regression.perPairMAP.gz";
85 | FileUtils::AutoGzIfstream fin;
86 | fin.openOrExit(regressionFile);
87 |
88 | for (auto rowIdx = 0ul; rowIdx < indToDecodeA.size(); ++rowIdx) {
89 | std::string line;
90 | getline(fin, line);
91 | std::istringstream iss(line);
92 | std::vector rowAsInts = {std::istream_iterator