├── .github
    └── workflows
    │   ├── cpu.yaml
    │   ├── cuda100.yaml
    │   ├── cuda101.yaml
    │   ├── cuda102.yaml
    │   └── cuda92.yaml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── ci
    ├── README.md
    ├── build.sh
    ├── cpu
    │   └── Dockerfile
    └── gpu
    │   └── Dockerfile
├── doc
    ├── baidu-research-logo-small.png
    └── deep-speech-ctc-small.png
├── include
    ├── contrib
    │   └── moderngpu
    │   │   ├── LICENSE
    │   │   └── include
    │   │       ├── device
    │   │           ├── ctaloadbalance.cuh
    │   │           ├── ctamerge.cuh
    │   │           ├── ctascan.cuh
    │   │           ├── ctasearch.cuh
    │   │           ├── ctasegreduce.cuh
    │   │           ├── ctasegscan.cuh
    │   │           ├── ctasegsort.cuh
    │   │           ├── ctasortedsearch.cuh
    │   │           ├── devicetypes.cuh
    │   │           ├── deviceutil.cuh
    │   │           ├── intrinsics.cuh
    │   │           ├── loadstore.cuh
    │   │           ├── serialsets.cuh
    │   │           └── sortnetwork.cuh
    │   │       ├── mgpudevice.cuh
    │   │       ├── mgpuenums.h
    │   │       └── util
    │   │           └── static.h
    ├── ctc.h
    └── detail
    │   ├── cpu_ctc.h
    │   ├── ctc_helper.h
    │   ├── gpu_ctc.h
    │   ├── gpu_ctc_kernels.h
    │   ├── hostdevice.h
    │   └── reduce.h
├── pytorch_binding
    ├── .gitignore
    ├── .pypirc
    ├── setup.cfg
    ├── setup.py
    ├── src
    │   ├── binding.cpp
    │   ├── cpu_binding.h
    │   └── gpu_binding.h
    ├── tests
    │   ├── test_cpu.py
    │   └── test_gpu.py
    ├── warpctc_pytorch
    │   └── __init__.py
    └── wheel
    │   ├── build_wheels.sh
    │   └── rename_wheels.py
├── src
    ├── ctc_entrypoint.cpp
    ├── ctc_entrypoint.cu
    └── reduce.cu
└── tests
    ├── random.cpp
    ├── test.h
    ├── test_cpu.cpp
    └── test_gpu.cu


/.github/workflows/cpu.yaml:
--------------------------------------------------------------------------------
 1 | name: CPU
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - pytorch_bindings
 6 |   pull_request:
 7 |     branches:
 8 |       - pytorch_bindings
 9 |   release:
10 |     types: [published]
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-20.04
14 |     container:
15 |       image: espnet/warpctc_builder:cpu
16 |     defaults:
17 |       run:
18 |         shell: bash --login -eo pipefail {0}
19 |     strategy:
20 |       matrix:
21 |         python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1]
22 |         pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0]
23 |         exclude:
24 |           - python-version: 3.8.7
25 |             pytorch-version: 1.1.0
26 |           - python-version: 3.8.7
27 |             pytorch-version: 1.2.0
28 |           - python-version: 3.8.7
29 |             pytorch-version: 1.3.1
30 |           - python-version: 3.9.1
31 |             pytorch-version: 1.1.0
32 |           - python-version: 3.9.1
33 |             pytorch-version: 1.2.0
34 |           - python-version: 3.9.1
35 |             pytorch-version: 1.3.1
36 |           - python-version: 3.9.1
37 |             pytorch-version: 1.4.0
38 |           - python-version: 3.9.1
39 |             pytorch-version: 1.5.1
40 |           - python-version: 3.9.1
41 |             pytorch-version: 1.6.0
42 |     steps:
43 |       - uses: actions/checkout@v2
44 |       - name: Copy .bash_profile
45 |         run: cp /root/.bash_profile ~/
46 |       - name: Build warpctc
47 |         run: |
48 |           mkdir build
49 |           cd build
50 |           cmake ..
51 |           make
52 |       - name: Set Python version
53 |         run: pyenv global ${{ matrix.python-version }}
54 |       - name: Install dependencies
55 |         run: |
56 |           pip install -U pip setuptools
57 |           pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }}
58 |       - name: Build wheel
59 |         run: |
60 |           cd pytorch_binding
61 |           python setup.py bdist_wheel
62 |           python wheel/rename_wheels.py
63 |           ls dist
64 |           echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV
65 |       - name: Install wheel
66 |         run: |
67 |           cd pytorch_binding
68 |           pip install dist/warpctc_pytorch*.whl
69 |       - name: Run tests
70 |         run: |
71 |           cd pytorch_binding
72 |           pytest --flakes
73 |           pytest tests
74 |       - name: Set 'upload_url' of the latest release
75 |         if: startsWith(github.ref, 'refs/tags/v')
76 |         run: |
77 |           # https://docs.github.com/en/rest/reference/repos#get-the-latest-release
78 |           cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest"
79 |           output_json=$($cmd)
80 |           echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV
81 |       - name: Upload a wheel to the latest release
82 |         if: startsWith(github.ref, 'refs/tags/v')
83 |         uses: actions/upload-release-asset@v1
84 |         env:
85 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 |         with:
87 |           upload_url: ${{ env.UPLOAD_URL }}
88 |           asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }}
89 |           asset_name: ${{ env.WHEEL_NAME }}
90 |           asset_content_type: application/zip
91 | 


--------------------------------------------------------------------------------
/.github/workflows/cuda100.yaml:
--------------------------------------------------------------------------------
 1 | name: CUDA 10.0
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - pytorch_bindings
 6 |   pull_request:
 7 |     branches:
 8 |       - pytorch_bindings
 9 |   release:
10 |     types: [published]
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-20.04
14 |     container:
15 |       image: espnet/warpctc_builder:cuda100
16 |     defaults:
17 |       run:
18 |         shell: bash --login -eo pipefail {0}
19 |     strategy:
20 |       matrix:
21 |         python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1]
22 |         pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0]
23 |         exclude:
24 |           - python-version: 3.8.7
25 |             pytorch-version: 1.1.0
26 |           - python-version: 3.8.7
27 |             pytorch-version: 1.2.0
28 |           - python-version: 3.8.7
29 |             pytorch-version: 1.3.1
30 |           - python-version: 3.9.1
31 |             pytorch-version: 1.1.0
32 |           - python-version: 3.9.1
33 |             pytorch-version: 1.2.0
34 |           - python-version: 3.9.1
35 |             pytorch-version: 1.3.1
36 |           - python-version: 3.9.1
37 |             pytorch-version: 1.4.0
38 |           - python-version: 3.9.1
39 |             pytorch-version: 1.5.1
40 |           - python-version: 3.9.1
41 |             pytorch-version: 1.6.0
42 |     steps:
43 |       - uses: actions/checkout@v2
44 |       - name: Copy .bash_profile
45 |         run: cp /root/.bash_profile ~/
46 |       - name: Build warpctc
47 |         run: |
48 |           mkdir build
49 |           cd build
50 |           cmake ..
51 |           make
52 |       - name: Set Python version
53 |         run: pyenv global ${{ matrix.python-version }}
54 |       - name: Install dependencies
55 |         run: |
56 |           pip install -U pip setuptools
57 |           pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }}
58 |       - name: Build wheel
59 |         run: |
60 |           cd pytorch_binding
61 |           python setup.py bdist_wheel
62 |           python wheel/rename_wheels.py
63 |           ls dist
64 |           echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV
65 |       - name: Install wheel
66 |         run: |
67 |           cd pytorch_binding
68 |           pip install dist/warpctc_pytorch*.whl
69 |       - name: Run tests
70 |         run: |
71 |           cd pytorch_binding
72 |           pytest --flakes
73 |           pytest tests
74 |       - name: Set 'upload_url' of the latest release
75 |         if: startsWith(github.ref, 'refs/tags/v')
76 |         run: |
77 |           # https://docs.github.com/en/rest/reference/repos#get-the-latest-release
78 |           cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest"
79 |           output_json=$($cmd)
80 |           echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV
81 |       - name: Upload a wheel to the latest release
82 |         if: startsWith(github.ref, 'refs/tags/v')
83 |         uses: actions/upload-release-asset@v1
84 |         env:
85 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 |         with:
87 |           upload_url: ${{ env.UPLOAD_URL }}
88 |           asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }}
89 |           asset_name: ${{ env.WHEEL_NAME }}
90 |           asset_content_type: application/zip
91 | 


--------------------------------------------------------------------------------
/.github/workflows/cuda101.yaml:
--------------------------------------------------------------------------------
 1 | name: CUDA 10.1
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - pytorch_bindings
 6 |   pull_request:
 7 |     branches:
 8 |       - pytorch_bindings
 9 |   release:
10 |     types: [published]
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-20.04
14 |     container:
15 |       image: espnet/warpctc_builder:cuda101
16 |     defaults:
17 |       run:
18 |         shell: bash --login -eo pipefail {0}
19 |     strategy:
20 |       matrix:
21 |         python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1]
22 |         pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0]
23 |         exclude:
24 |           - python-version: 3.8.7
25 |             pytorch-version: 1.1.0
26 |           - python-version: 3.8.7
27 |             pytorch-version: 1.2.0
28 |           - python-version: 3.8.7
29 |             pytorch-version: 1.3.1
30 |           - python-version: 3.9.1
31 |             pytorch-version: 1.1.0
32 |           - python-version: 3.9.1
33 |             pytorch-version: 1.2.0
34 |           - python-version: 3.9.1
35 |             pytorch-version: 1.3.1
36 |           - python-version: 3.9.1
37 |             pytorch-version: 1.4.0
38 |           - python-version: 3.9.1
39 |             pytorch-version: 1.5.1
40 |           - python-version: 3.9.1
41 |             pytorch-version: 1.6.0
42 |     steps:
43 |       - uses: actions/checkout@v2
44 |       - name: Copy .bash_profile
45 |         run: cp /root/.bash_profile ~/
46 |       - name: Build warpctc
47 |         run: |
48 |           mkdir build
49 |           cd build
50 |           cmake ..
51 |           make
52 |       - name: Set Python version
53 |         run: pyenv global ${{ matrix.python-version }}
54 |       - name: Install dependencies
55 |         run: |
56 |           pip install -U pip setuptools
57 |           pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }}
58 |       - name: Build wheel
59 |         run: |
60 |           cd pytorch_binding
61 |           python setup.py bdist_wheel
62 |           python wheel/rename_wheels.py
63 |           ls dist
64 |           echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV
65 |       - name: Install wheel
66 |         run: |
67 |           cd pytorch_binding
68 |           pip install dist/warpctc_pytorch*.whl
69 |       - name: Run tests
70 |         run: |
71 |           cd pytorch_binding
72 |           pytest --flakes
73 |           pytest tests
74 |       - name: Set 'upload_url' of the latest release
75 |         if: startsWith(github.ref, 'refs/tags/v')
76 |         run: |
77 |           # https://docs.github.com/en/rest/reference/repos#get-the-latest-release
78 |           cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest"
79 |           output_json=$($cmd)
80 |           echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV
81 |       - name: Upload a wheel to the latest release
82 |         if: startsWith(github.ref, 'refs/tags/v')
83 |         uses: actions/upload-release-asset@v1
84 |         env:
85 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 |         with:
87 |           upload_url: ${{ env.UPLOAD_URL }}
88 |           asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }}
89 |           asset_name: ${{ env.WHEEL_NAME }}
90 |           asset_content_type: application/zip
91 | 


--------------------------------------------------------------------------------
/.github/workflows/cuda102.yaml:
--------------------------------------------------------------------------------
 1 | name: CUDA 10.2
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - pytorch_bindings
 6 |   pull_request:
 7 |     branches:
 8 |       - pytorch_bindings
 9 |   release:
10 |     types: [published]
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-20.04
14 |     container:
15 |       image: espnet/warpctc_builder:cuda102
16 |     defaults:
17 |       run:
18 |         shell: bash --login -eo pipefail {0}
19 |     strategy:
20 |       matrix:
21 |         python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1]
22 |         pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0]
23 |         exclude:
24 |           - python-version: 3.8.7
25 |             pytorch-version: 1.1.0
26 |           - python-version: 3.8.7
27 |             pytorch-version: 1.2.0
28 |           - python-version: 3.8.7
29 |             pytorch-version: 1.3.1
30 |           - python-version: 3.9.1
31 |             pytorch-version: 1.1.0
32 |           - python-version: 3.9.1
33 |             pytorch-version: 1.2.0
34 |           - python-version: 3.9.1
35 |             pytorch-version: 1.3.1
36 |           - python-version: 3.9.1
37 |             pytorch-version: 1.4.0
38 |           - python-version: 3.9.1
39 |             pytorch-version: 1.5.1
40 |           - python-version: 3.9.1
41 |             pytorch-version: 1.6.0
42 |     steps:
43 |       - uses: actions/checkout@v2
44 |       - name: Copy .bash_profile
45 |         run: cp /root/.bash_profile ~/
46 |       - name: Build warpctc
47 |         run: |
48 |           mkdir build
49 |           cd build
50 |           cmake ..
51 |           make
52 |       - name: Set Python version
53 |         run: pyenv global ${{ matrix.python-version }}
54 |       - name: Install dependencies
55 |         run: |
56 |           pip install -U pip setuptools
57 |           pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }}
58 |       - name: Build wheel
59 |         run: |
60 |           cd pytorch_binding
61 |           python setup.py bdist_wheel
62 |           python wheel/rename_wheels.py
63 |           ls dist
64 |           echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV
65 |       - name: Install wheel
66 |         run: |
67 |           cd pytorch_binding
68 |           pip install dist/warpctc_pytorch*.whl
69 |       - name: Run tests
70 |         run: |
71 |           cd pytorch_binding
72 |           pytest --flakes
73 |           pytest tests
74 |       - name: Set 'upload_url' of the latest release
75 |         if: startsWith(github.ref, 'refs/tags/v')
76 |         run: |
77 |           # https://docs.github.com/en/rest/reference/repos#get-the-latest-release
78 |           cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest"
79 |           output_json=$($cmd)
80 |           echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV
81 |       - name: Upload a wheel to the latest release
82 |         if: startsWith(github.ref, 'refs/tags/v')
83 |         uses: actions/upload-release-asset@v1
84 |         env:
85 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 |         with:
87 |           upload_url: ${{ env.UPLOAD_URL }}
88 |           asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }}
89 |           asset_name: ${{ env.WHEEL_NAME }}
90 |           asset_content_type: application/zip
91 | 


--------------------------------------------------------------------------------
/.github/workflows/cuda92.yaml:
--------------------------------------------------------------------------------
 1 | name: CUDA 9.2
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - pytorch_bindings
 6 |   pull_request:
 7 |     branches:
 8 |       - pytorch_bindings
 9 |   release:
10 |     types: [published]
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-20.04
14 |     container:
15 |       image: espnet/warpctc_builder:cuda92
16 |     defaults:
17 |       run:
18 |         shell: bash --login -eo pipefail {0}
19 |     strategy:
20 |       matrix:
21 |         python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1]
22 |         pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0]
23 |         exclude:
24 |           - python-version: 3.8.7
25 |             pytorch-version: 1.1.0
26 |           - python-version: 3.8.7
27 |             pytorch-version: 1.2.0
28 |           - python-version: 3.8.7
29 |             pytorch-version: 1.3.1
30 |           - python-version: 3.9.1
31 |             pytorch-version: 1.1.0
32 |           - python-version: 3.9.1
33 |             pytorch-version: 1.2.0
34 |           - python-version: 3.9.1
35 |             pytorch-version: 1.3.1
36 |           - python-version: 3.9.1
37 |             pytorch-version: 1.4.0
38 |           - python-version: 3.9.1
39 |             pytorch-version: 1.5.1
40 |           - python-version: 3.9.1
41 |             pytorch-version: 1.6.0
42 |     steps:
43 |       - uses: actions/checkout@v2
44 |       - name: Copy .bash_profile
45 |         run: cp /root/.bash_profile ~/
46 |       - name: Build warpctc
47 |         run: |
48 |           mkdir build
49 |           cd build
50 |           cmake ..
51 |           make
52 |       - name: Set Python version
53 |         run: pyenv global ${{ matrix.python-version }}
54 |       - name: Install dependencies
55 |         run: |
56 |           pip install -U pip setuptools
57 |           pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }}
58 |       - name: Build wheel
59 |         run: |
60 |           cd pytorch_binding
61 |           python setup.py bdist_wheel
62 |           python wheel/rename_wheels.py
63 |           ls dist
64 |           echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV
65 |       - name: Install wheel
66 |         run: |
67 |           cd pytorch_binding
68 |           pip install dist/warpctc_pytorch*.whl
69 |       - name: Run tests
70 |         run: |
71 |           cd pytorch_binding
72 |           pytest --flakes
73 |           pytest tests
74 |       - name: Set 'upload_url' of the latest release
75 |         if: startsWith(github.ref, 'refs/tags/v')
76 |         run: |
77 |           # https://docs.github.com/en/rest/reference/repos#get-the-latest-release
78 |           cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest"
79 |           output_json=$($cmd)
80 |           echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV
81 |       - name: Upload a wheel to the latest release
82 |         if: startsWith(github.ref, 'refs/tags/v')
83 |         uses: actions/upload-release-asset@v1
84 |         env:
85 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86 |         with:
87 |           upload_url: ${{ env.UPLOAD_URL }}
88 |           asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }}
89 |           asset_name: ${{ env.WHEEL_NAME }}
90 |           asset_content_type: application/zip
91 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | Makefile
3 | build


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | IF (APPLE)
  2 |     cmake_minimum_required(VERSION 3.4)
  3 | ELSE()
  4 |     cmake_minimum_required(VERSION 2.8)
  5 | ENDIF()
  6 | 
  7 | project(ctc_release)
  8 | 
  9 | IF (NOT APPLE)
 10 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
 11 | ENDIF()
 12 | 
 13 | IF (APPLE)
 14 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2")
 15 |     add_definitions(-DAPPLE)
 16 | ENDIF()
 17 | 
 18 | include_directories(include)
 19 | 
 20 | FIND_PACKAGE(CUDA 6.5)
 21 | MESSAGE(STATUS "cuda found ${CUDA_FOUND}")
 22 | 
 23 | option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND})
 24 | option(WITH_OMP "compile warp-ctc with openmp." ON)
 25 | 
 26 | if(NOT WITH_OMP)
 27 |     add_definitions(-DCTC_DISABLE_OMP)
 28 | endif()
 29 | if (WITH_OMP)
 30 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
 31 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp")
 32 | endif()
 33 | 
 34 | # need to be at least 30 or __shfl_down in reduce wont compile
 35 | IF (CUDA_VERSION LESS 11.0)
 36 |   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2")
 37 | ENDIF()
 38 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35")
 39 | 
 40 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50")
 41 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
 42 | IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5)
 43 |   SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
 44 | ENDIF()
 45 | 
 46 | IF (CUDA_VERSION GREATER 7.6)
 47 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
 48 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
 49 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
 50 | ENDIF()
 51 | 
 52 | IF (CUDA_VERSION GREATER 8.9)
 53 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_70")
 54 | ENDIF()
 55 | 
 56 | if (NOT APPLE)
 57 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++14")
 58 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
 59 | ENDIF()
 60 | 
 61 | IF (APPLE)
 62 |     EXEC_PROGRAM(uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
 63 |     STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
 64 |     MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}")
 65 | 
 66 |     #for el capitain have to use rpath
 67 | 
 68 |     IF (DARWIN_VERSION LESS 15)
 69 |         set(CMAKE_SKIP_RPATH TRUE)
 70 |     ENDIF ()
 71 | 
 72 | ELSE()
 73 |     #always skip for linux
 74 |     set(CMAKE_SKIP_RPATH TRUE)
 75 | ENDIF()
 76 | 
 77 | 
 78 | IF (WITH_GPU)
 79 | 
 80 |     MESSAGE(STATUS "Building shared library with GPU support")
 81 | 
 82 |     CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu)
 83 |     IF (!Torch_FOUND)
 84 |         TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY})
 85 |     ENDIF()
 86 | 
 87 |     add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp )
 88 |     TARGET_LINK_LIBRARIES(test_cpu warpctc)
 89 |     SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14")
 90 | 
 91 |     cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp )
 92 |     TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY})
 93 |     SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14")
 94 | 
 95 |     INSTALL(TARGETS warpctc
 96 |             RUNTIME DESTINATION "bin"
 97 |             LIBRARY DESTINATION "lib"
 98 |             ARCHIVE DESTINATION "lib")
 99 | 
100 |     INSTALL(FILES include/ctc.h DESTINATION "include")
101 | ELSE()
102 |     MESSAGE(STATUS "Building shared library with no GPU support")
103 | 
104 |     if (NOT APPLE)
105 |         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2")
106 |     ENDIF()
107 | 
108 |     ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp)
109 | 
110 |     add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp )
111 |     TARGET_LINK_LIBRARIES(test_cpu warpctc)
112 |     SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14")
113 | 
114 |     INSTALL(TARGETS warpctc
115 |             RUNTIME DESTINATION "bin"
116 |             LIBRARY DESTINATION "lib"
117 |             ARCHIVE DESTINATION "lib")
118 | 
119 |     INSTALL(FILES include/ctc.h DESTINATION "include")
120 | ENDIF()
121 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Copyright 2015-2016 Baidu USA LLC.  All rights reserved.
  2 | 
  3 |                                  Apache License
  4 |                            Version 2.0, January 2004
  5 |                         http://www.apache.org/licenses/
  6 | 
  7 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  8 | 
  9 |    1. Definitions.
 10 | 
 11 |       "License" shall mean the terms and conditions for use, reproduction,
 12 |       and distribution as defined by Sections 1 through 9 of this document.
 13 | 
 14 |       "Licensor" shall mean the copyright owner or entity authorized by
 15 |       the copyright owner that is granting the License.
 16 | 
 17 |       "Legal Entity" shall mean the union of the acting entity and all
 18 |       other entities that control, are controlled by, or are under common
 19 |       control with that entity. For the purposes of this definition,
 20 |       "control" means (i) the power, direct or indirect, to cause the
 21 |       direction or management of such entity, whether by contract or
 22 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 23 |       outstanding shares, or (iii) beneficial ownership of such entity.
 24 | 
 25 |       "You" (or "Your") shall mean an individual or Legal Entity
 26 |       exercising permissions granted by this License.
 27 | 
 28 |       "Source" form shall mean the preferred form for making modifications,
 29 |       including but not limited to software source code, documentation
 30 |       source, and configuration files.
 31 | 
 32 |       "Object" form shall mean any form resulting from mechanical
 33 |       transformation or translation of a Source form, including but
 34 |       not limited to compiled object code, generated documentation,
 35 |       and conversions to other media types.
 36 | 
 37 |       "Work" shall mean the work of authorship, whether in Source or
 38 |       Object form, made available under the License, as indicated by a
 39 |       copyright notice that is included in or attached to the work
 40 |       (an example is provided in the Appendix below).
 41 | 
 42 |       "Derivative Works" shall mean any work, whether in Source or Object
 43 |       form, that is based on (or derived from) the Work and for which the
 44 |       editorial revisions, annotations, elaborations, or other modifications
 45 |       represent, as a whole, an original work of authorship. For the purposes
 46 |       of this License, Derivative Works shall not include works that remain
 47 |       separable from, or merely link (or bind by name) to the interfaces of,
 48 |       the Work and Derivative Works thereof.
 49 | 
 50 |       "Contribution" shall mean any work of authorship, including
 51 |       the original version of the Work and any modifications or additions
 52 |       to that Work or Derivative Works thereof, that is intentionally
 53 |       submitted to Licensor for inclusion in the Work by the copyright owner
 54 |       or by an individual or Legal Entity authorized to submit on behalf of
 55 |       the copyright owner. For the purposes of this definition, "submitted"
 56 |       means any form of electronic, verbal, or written communication sent
 57 |       to the Licensor or its representatives, including but not limited to
 58 |       communication on electronic mailing lists, source code control systems,
 59 |       and issue tracking systems that are managed by, or on behalf of, the
 60 |       Licensor for the purpose of discussing and improving the Work, but
 61 |       excluding communication that is conspicuously marked or otherwise
 62 |       designated in writing by the copyright owner as "Not a Contribution."
 63 | 
 64 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 65 |       on behalf of whom a Contribution has been received by Licensor and
 66 |       subsequently incorporated within the Work.
 67 | 
 68 |    2. Grant of Copyright License. Subject to the terms and conditions of
 69 |       this License, each Contributor hereby grants to You a perpetual,
 70 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 71 |       copyright license to reproduce, prepare Derivative Works of,
 72 |       publicly display, publicly perform, sublicense, and distribute the
 73 |       Work and such Derivative Works in Source or Object form.
 74 | 
 75 |    3. Grant of Patent License. Subject to the terms and conditions of
 76 |       this License, each Contributor hereby grants to You a perpetual,
 77 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 78 |       (except as stated in this section) patent license to make, have made,
 79 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 80 |       where such license applies only to those patent claims licensable
 81 |       by such Contributor that are necessarily infringed by their
 82 |       Contribution(s) alone or by combination of their Contribution(s)
 83 |       with the Work to which such Contribution(s) was submitted. If You
 84 |       institute patent litigation against any entity (including a
 85 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 86 |       or a Contribution incorporated within the Work constitutes direct
 87 |       or contributory patent infringement, then any patent licenses
 88 |       granted to You under this License for that Work shall terminate
 89 |       as of the date such litigation is filed.
 90 | 
 91 |    4. Redistribution. You may reproduce and distribute copies of the
 92 |       Work or Derivative Works thereof in any medium, with or without
 93 |       modifications, and in Source or Object form, provided that You
 94 |       meet the following conditions:
 95 | 
 96 |       (a) You must give any other recipients of the Work or
 97 |           Derivative Works a copy of this License; and
 98 | 
 99 |       (b) You must cause any modified files to carry prominent notices
100 |           stating that You changed the files; and
101 | 
102 |       (c) You must retain, in the Source form of any Derivative Works
103 |           that You distribute, all copyright, patent, trademark, and
104 |           attribution notices from the Source form of the Work,
105 |           excluding those notices that do not pertain to any part of
106 |           the Derivative Works; and
107 | 
108 |       (d) If the Work includes a "NOTICE" text file as part of its
109 |           distribution, then any Derivative Works that You distribute must
110 |           include a readable copy of the attribution notices contained
111 |           within such NOTICE file, excluding those notices that do not
112 |           pertain to any part of the Derivative Works, in at least one
113 |           of the following places: within a NOTICE text file distributed
114 |           as part of the Derivative Works; within the Source form or
115 |           documentation, if provided along with the Derivative Works; or,
116 |           within a display generated by the Derivative Works, if and
117 |           wherever such third-party notices normally appear. The contents
118 |           of the NOTICE file are for informational purposes only and
119 |           do not modify the License. You may add Your own attribution
120 |           notices within Derivative Works that You distribute, alongside
121 |           or as an addendum to the NOTICE text from the Work, provided
122 |           that such additional attribution notices cannot be construed
123 |           as modifying the License.
124 | 
125 |       You may add Your own copyright statement to Your modifications and
126 |       may provide additional or different license terms and conditions
127 |       for use, reproduction, or distribution of Your modifications, or
128 |       for any such Derivative Works as a whole, provided Your use,
129 |       reproduction, and distribution of the Work otherwise complies with
130 |       the conditions stated in this License.
131 | 
132 |    5. Submission of Contributions. Unless You explicitly state otherwise,
133 |       any Contribution intentionally submitted for inclusion in the Work
134 |       by You to the Licensor shall be under the terms and conditions of
135 |       this License, without any additional terms or conditions.
136 |       Notwithstanding the above, nothing herein shall supersede or modify
137 |       the terms of any separate license agreement you may have executed
138 |       with Licensor regarding such Contributions.
139 | 
140 |    6. Trademarks. This License does not grant permission to use the trade
141 |       names, trademarks, service marks, or product names of the Licensor,
142 |       except as required for reasonable and customary use in describing the
143 |       origin of the Work and reproducing the content of the NOTICE file.
144 | 
145 |    7. Disclaimer of Warranty. Unless required by applicable law or
146 |       agreed to in writing, Licensor provides the Work (and each
147 |       Contributor provides its Contributions) on an "AS IS" BASIS,
148 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149 |       implied, including, without limitation, any warranties or conditions
150 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151 |       PARTICULAR PURPOSE. You are solely responsible for determining the
152 |       appropriateness of using or redistributing the Work and assume any
153 |       risks associated with Your exercise of permissions under this License.
154 | 
155 |    8. Limitation of Liability. In no event and under no legal theory,
156 |       whether in tort (including negligence), contract, or otherwise,
157 |       unless required by applicable law (such as deliberate and grossly
158 |       negligent acts) or agreed to in writing, shall any Contributor be
159 |       liable to You for damages, including any direct, indirect, special,
160 |       incidental, or consequential damages of any character arising as a
161 |       result of this License or out of the use or inability to use the
162 |       Work (including but not limited to damages for loss of goodwill,
163 |       work stoppage, computer failure or malfunction, or any and all
164 |       other commercial damages or losses), even if such Contributor
165 |       has been advised of the possibility of such damages.
166 | 
167 |    9. Accepting Warranty or Additional Liability. While redistributing
168 |       the Work or Derivative Works thereof, You may choose to offer,
169 |       and charge a fee for, acceptance of support, warranty, indemnity,
170 |       or other liability obligations and/or rights consistent with this
171 |       License. However, in accepting such obligations, You may act only
172 |       on Your own behalf and on Your sole responsibility, not on behalf
173 |       of any other Contributor, and only if You agree to indemnify,
174 |       defend, and hold each Contributor harmless for any liability
175 |       incurred by, or claims asserted against, such Contributor by reason
176 |       of your accepting any such warranty or additional liability.
177 | 
178 |    END OF TERMS AND CONDITIONS
179 | 
180 |    APPENDIX: How to apply the Apache License to your work.
181 | 
182 |       To apply the Apache License to your work, attach the following
183 |       boilerplate notice, with the fields enclosed by brackets "[]"
184 |       replaced with your own identifying information. (Don't include
185 |       the brackets!)  The text should be enclosed in the appropriate
186 |       comment syntax for the file format. We also recommend that a
187 |       file or class name and description of purpose be included on the
188 |       same "printed page" as the copyright notice for easier
189 |       identification within third-party archives.
190 | 
191 |    Copyright 2015-2016, Baidu USA LLC.
192 | 
193 |    Licensed under the Apache License, Version 2.0 (the "License");
194 |    you may not use this file except in compliance with the License.
195 |    You may obtain a copy of the License at
196 | 
197 |        http://www.apache.org/licenses/LICENSE-2.0
198 | 
199 |    Unless required by applicable law or agreed to in writing, software
200 |    distributed under the License is distributed on an "AS IS" BASIS,
201 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202 |    See the License for the specific language governing permissions and
203 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PyTorch bindings for Warp-ctc
  2 | 
  3 | |branch|status|
  4 | |:-:|:-:|
  5 | |`pytorch_bindings`|[![Build Status](https://travis-ci.org/espnet/warp-ctc.svg?branch=pytorch_bindings)](https://github.com/espnet/warp-ctc/tree/pytorch_bindings)|
  6 | |`pytorch-0.4`|[![Build Status](https://travis-ci.org/espnet/warp-ctc.svg?branch=pytorch-0.4)](https://github.com/espnet/warp-ctc/tree/pytorch-0.4)|
  7 | |`pytorch-1.0`|[![Build Status](https://travis-ci.org/espnet/warp-ctc.svg?branch=pytorch-1.0)](https://github.com/espnet/warp-ctc/tree/pytorch-1.0)|
  8 | 
  9 | This is an extension onto the original repo found [here](https://github.com/baidu-research/warp-ctc).
 10 | 
 11 | ## Installation
 12 | 
 13 | Install [PyTorch](https://github.com/pytorch/pytorch#installation) first.
 14 | 
 15 | `warpctc-pytorch` wheel uses [local version identifiers](https://www.python.org/dev/peps/pep-0440/#local-version-identifiers),
 16 | which has a restriction that users have to specify the version explicitly.
 17 | 
 18 | ```console
 19 | $ pip install warpctc-pytorch==X.X.X+torchYY.cudaZZ
 20 | ```
 21 | 
 22 | The latest version is 0.2.1 and if you work with PyTorch 1.6 and CUDA 10.2, you can run:
 23 | 
 24 | ```console
 25 | $ pip install warpctc-pytorch==0.2.1+torch16.cuda102
 26 | ```
 27 | 
 28 | ### for PyTorch 1.4 - 1.6
 29 | 
 30 | `warpctc-pytorch` wheels are provided for Python 3.8, 3.7, 3.6 and CUDA 10.2, 10.1, 10.0, 9.2.
 31 | 
 32 | ### for PyTorch 1.1 - 1.3
 33 | 
 34 | `warpctc-pytorch` wheels are provided for Python 3.7, 3.6 and CUDA 10.2, 10.1, 10.0, 9.2.
 35 | 
 36 | ### for PyTorch 1.0
 37 | 
 38 | `warpctc-pytorch10-cudaYY` wheels are provided for Python 3.7, 3.6 and CUDA 10.1, 10.0, 9.2, 9.1, 9.0, 8.0.
 39 | 
 40 | If you work with CUDA 10.1, you can run:
 41 | 
 42 | ```console
 43 | $ pip install warpctc-pytorch10-cuda101
 44 | ```
 45 | 
 46 | ### for PyTorch 0.4.1
 47 | 
 48 | Wheels for PyTorch 0.4.1 are not provided so users have to build from source manually.
 49 | 
 50 | `WARP_CTC_PATH` should be set to the location of a built WarpCTC
 51 | (i.e. `libwarpctc.so`).  This defaults to `../build`, so from within a
 52 | new warp-ctc clone you could build WarpCTC like this:
 53 | 
 54 | ```bash
 55 | $ git clone https://github.com/espnet/warp-ctc.git
 56 | $ cd warp-ctc; git checkout -b pytorch-0.4 remotes/origin/pytorch-0.4
 57 | $ mkdir build; cd build
 58 | $ cmake ..
 59 | $ make
 60 | ```
 61 | 
 62 | Now install the bindings:
 63 | ```bash
 64 | $ cd ../pytorch_binding
 65 | $ pip install numpy cffi
 66 | $ python setup.py install
 67 | ```
 68 | 
 69 | ## Example
 70 | 
 71 | Example to use the bindings below.
 72 | 
 73 | ```python
 74 | import torch
 75 | from warpctc_pytorch import CTCLoss
 76 | ctc_loss = CTCLoss()
 77 | # expected shape of seqLength x batchSize x alphabet_size
 78 | probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
 79 | labels = torch.IntTensor([1, 2])
 80 | label_sizes = torch.IntTensor([2])
 81 | probs_sizes = torch.IntTensor([2])
 82 | probs.requires_grad_(True)  # tells autograd to compute gradients for probs
 83 | cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
 84 | cost.backward()
 85 | ```
 86 | 
 87 | ## Documentation
 88 | 
 89 | ```
 90 | CTCLoss(size_average=False, length_average=False, reduce=True)
 91 |     # size_average (bool): normalize the loss by the batch size (default: False)
 92 |     # length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False)
 93 |     # reduce (bool): average or sum over observation for each minibatch.
 94 |         If `False`, returns a loss per batch element instead and ignores `average` options.
 95 |         (default: `True`)
 96 | 
 97 | forward(acts, labels, act_lens, label_lens)
 98 |     # acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
 99 |     # labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
100 |     # act_lens: Tensor of size (batch) containing size of each output sequence from the network
101 |     # label_lens: Tensor of (batch) containing label length of each example
102 | ```
103 | 


--------------------------------------------------------------------------------
/ci/README.md:
--------------------------------------------------------------------------------
 1 | Docker image builder for Travis CI
 2 | ===
 3 | 
 4 | This directory contains tools to build following Docker images used in Travis CI,
 5 | 
 6 | - `espnet/warpctc_builder:cuda101` for CUDA 10.1
 7 | - `espnet/warpctc_builder:cuda100` for CUDA 10.0
 8 | - `espnet/warpctc_builder:cuda92` for CUDA 9.2
 9 | - `espnet/warpctc_builder:cuda91` for CUDA 9.1
10 | - `espnet/warpctc_builder:cuda90` for CUDA 9.0
11 | - `espnet/warpctc_builder:cuda80` for CUDA 8.0
12 | - `espnet/warpctc_builder:cpu` for no CUDA environment
13 | 
14 | 
15 | ## Building Docker images
16 | 
17 | Run `build.sh`.
18 | 
19 | ```console
20 | $ ./build.sh
21 | ```
22 | 
23 | ## Uploading images to Dockerhub
24 | 
25 | Run `docker push`.
26 | 
27 | ```console
28 | $ docker push espnet/warpctc_builder:TAG
29 | ```
30 | 
31 | Note that your Dockerhub account have write access to [espnet/warpctc_builder](https://hub.docker.com/r/espnet/warpctc_builder) repository.
32 | 


--------------------------------------------------------------------------------
/ci/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | YYYYMMDD=$(date '+%Y%m%d')
 4 | image_repository=espnet/warpctc_builder
 5 | cuda_versions=(9.2 10.0 10.1 10.2)
 6 | for cuda_version in ${cuda_versions[@]}; do
 7 |   # gcc version check exists in /usr/local/cuda/include/crt/host_config.h
 8 |   devtoolset_version=8
 9 |   if [ "$cuda_version" = "10.0" ] || [ "$cuda_version" = "9.2" ]; then
10 |     devtoolset_version=7
11 |   fi
12 |   base_image="nvidia/cuda:$cuda_version-cudnn7-devel-centos7"
13 |   image_tag=cuda${cuda_version/./}
14 |   image_name=$image_repository:$image_tag
15 |   echo "Building $image_name"
16 |   docker build --no-cache --build-arg base_image=$base_image --build-arg devtoolset_version=$devtoolset_version -t $image_name ./gpu
17 |   docker tag $image_name $image_name-$YYYYMMDD
18 |   echo -e "Done.\n"
19 | done
20 | 
21 | image_tag=cpu
22 | image_name=$image_repository:$image_tag
23 | echo "Building $image_name"
24 | docker build --no-cache -t $image_name ./cpu
25 | docker tag $image_name $image_name-$YYYYMMDD
26 | echo Done.
27 | 


--------------------------------------------------------------------------------
/ci/cpu/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM centos:centos7
 2 | 
 3 | RUN yum update -y \
 4 |  && yum install -y epel-release \
 5 |  && yum install -y centos-release-scl \
 6 |  && yum install -y devtoolset-9-gcc-c++ \
 7 |  && echo 'source scl_source enable devtoolset-9' >> ~/.bash_profile \
 8 |  && yum install -y \
 9 |     bzip2-devel \
10 |     cmake \
11 |     git \
12 |     jq \
13 |     libffi-devel \
14 |     make \
15 |     openssl-devel \
16 |     readline-devel \
17 |     sqlite-devel \
18 |     which \
19 |     zlib-devel \
20 |  && yum clean all \
21 |  && rm -rf /var/cache/yum/*
22 | # Install pyenv
23 | RUN git clone https://github.com/pyenv/pyenv.git /opt/pyenv
24 | ENV PYENV_ROOT /opt/pyenv
25 | ENV PATH ${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}
26 | # Install Python
27 | SHELL ["/bin/bash", "-c"]
28 | ENV PYTHON_VERSIONS 3.6.13 3.7.10 3.8.7 3.9.1
29 | RUN source scl_source enable devtoolset-9 \
30 |  && for python_version in ${PYTHON_VERSIONS}; do \
31 |       pyenv install ${python_version}; \
32 |     done
33 | 


--------------------------------------------------------------------------------
/ci/gpu/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG base_image
 2 | FROM ${base_image}
 3 | ARG devtoolset_version
 4 | 
 5 | ENV CUDA_HOME /usr/local/cuda
 6 | 
 7 | RUN yum update -y \
 8 |  && yum install -y epel-release \
 9 |  && yum install -y centos-release-scl \
10 |  && yum install -y devtoolset-${devtoolset_version}-gcc-c++ \
11 |  && echo "source scl_source enable devtoolset-${devtoolset_version}" >> ~/.bash_profile \
12 |  && yum install -y \
13 |     bzip2-devel \
14 |     cmake \
15 |     git \
16 |     jq \
17 |     libffi-devel \
18 |     make \
19 |     openssl-devel \
20 |     readline-devel \
21 |     sqlite-devel \
22 |     which \
23 |     zlib-devel \
24 |  && yum clean all \
25 |  && rm -rf /var/cache/yum/*
26 | # Install pyenv
27 | RUN git clone https://github.com/pyenv/pyenv.git /opt/pyenv
28 | ENV PYENV_ROOT /opt/pyenv
29 | ENV PATH ${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}
30 | # Install Python
31 | ENV PYTHON_VERSIONS 3.6.13 3.7.10 3.8.7 3.9.1
32 | RUN source scl_source enable devtoolset-${devtoolset_version} \
33 |  && for python_version in ${PYTHON_VERSIONS}; do \
34 |       pyenv install ${python_version}; \
35 |     done
36 | 


--------------------------------------------------------------------------------
/doc/baidu-research-logo-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/espnet/warp-ctc/0705437521a1302f38692fb684f4743f8a85d324/doc/baidu-research-logo-small.png


--------------------------------------------------------------------------------
/doc/deep-speech-ctc-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/espnet/warp-ctc/0705437521a1302f38692fb684f4743f8a85d324/doc/deep-speech-ctc-small.png


--------------------------------------------------------------------------------
/include/contrib/moderngpu/LICENSE:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 | * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 3 | * 
 4 | * Redistribution and use in source and binary forms, with or without
 5 | * modification, are permitted provided that the following conditions are met:
 6 | *     * Redistributions of source code must retain the above copyright
 7 | *       notice, this list of conditions and the following disclaimer.
 8 | *     * Redistributions in binary form must reproduce the above copyright
 9 | *       notice, this list of conditions and the following disclaimer in the
10 | *       documentation and/or other materials provided with the distribution.
11 | *     * Neither the name of the NVIDIA CORPORATION nor the
12 | *       names of its contributors may be used to endorse or promote products
13 | *       derived from this software without specific prior written permission.
14 | * 
15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | *
26 | ******************************************************************************/
27 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctaloadbalance.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "ctasearch.cuh"
 38 | #include "loadstore.cuh"
 39 | 
 40 | namespace mgpu {
 41 | 
 42 | ////////////////////////////////////////////////////////////////////////////////
 43 | // DeviceLoadBalancingSearch
 44 | // Upper Bound search from A (needles) into B (haystack). The A values are
 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at
 46 | // bBegin in shared memory.
 47 | 
 48 | template<int VT, bool RangeCheck>
 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin,
 50 | 	int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) {
 51 | 
 52 | 	int bKey = b_shared[bBegin];
 53 | 
 54 | 	#pragma unroll
 55 | 	for(int i = 0; i < VT; ++i) {
 56 | 		bool p;
 57 | 		if(RangeCheck)
 58 | 			p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey));
 59 | 		else
 60 | 			p = aBegin < bKey;
 61 | 
 62 | 		if(p)
 63 | 			// Advance A (the needle).
 64 | 			a_shared[aBegin++] = bFirst + bBegin;
 65 | 		else
 66 | 			// Advance B (the haystack).
 67 | 			bKey = b_shared[++bBegin];
 68 | 	}
 69 | }
 70 | 
 71 | ////////////////////////////////////////////////////////////////////////////////
 72 | // CTALoadBalance
 73 | // Computes upper_bound(counting_iterator<int>(first), b_global) - 1.
 74 | 
 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory.
 76 | // This returns the loaded B elements at the beginning or end of shared memory
 77 | // depending on the aFirst argument.
 78 | 
 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory.
 80 | template<int NT, int VT, typename InputIt>
 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global,
 82 | 	int sourceCount, int block, int tid, const int* mp_global,
 83 | 	int* indices_shared, bool loadPrecedingB) {
 84 | 
 85 | 	int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT,
 86 | 		mp_global);
 87 | 
 88 | 	int a0 = range.x;
 89 | 	int a1 = range.y;
 90 | 	int b0 = range.z;
 91 | 	int b1 = range.w;
 92 | 	if(!b0) loadPrecedingB = false;
 93 | 
 94 | 	// Load one trailing term from B. If we're already at the end, fill the
 95 | 	// end of the buffer with destCount.
 96 | 	int aCount = a1 - a0;
 97 | 	int bCount = b1 - b0;
 98 | 	int extended = b1 < sourceCount;
 99 | 	int loadCount = bCount + extended;
100 | 	int fillCount = NT * VT + 1 - loadCount - aCount;
101 | 
102 | 	int* a_shared = indices_shared;
103 | 	int* b_shared = indices_shared + aCount + (int)loadPrecedingB;
104 | 
105 | 	// Load the B values.
106 | //	DeviceMemToMemLoop<NT>(bCount + extended + (int)loadPrecedingB,
107 | //		b_global + b0 - (int)loadPrecedingB, tid,
108 | //		b_shared - (int)loadPrecedingB);
109 | 
110 | 	for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT)
111 | 		b_shared[i] = b_global[b0 + i];
112 | 
113 | 	// Fill the end of the array with destCount.
114 | 	for(int i = tid + extended; i < fillCount; i += NT)
115 | 		b_shared[bCount + i] = destCount;
116 | 	__syncthreads();
117 | 
118 | 	// Run a merge path to find the start of the serial merge for each thread.
119 | 	int diag = VT * tid;
120 | 	int mp = MergePath<MgpuBoundsUpper>(mgpu::counting_iterator<int>(a0),
121 | 		aCount, b_shared, bCount, diag, mgpu::less<int>());
122 | 
123 | 	int a0tid = a0 + mp;
124 | 	int b0tid = diag - mp;
125 | 
126 | 	// Subtract 1 from b0 because we want to return upper_bound - 1.
127 | 	DeviceSerialLoadBalanceSearch<VT, false>(b_shared, a0tid, a1, b0 - 1,
128 | 		b0tid, bCount, a_shared - a0);
129 | 	__syncthreads();
130 | 
131 | 	b0 -= (int)loadPrecedingB;
132 | 	return make_int4(a0, a1, b0, b1);
133 | }
134 | 
135 | 
136 | } // namespace mgpu
137 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctamerge.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "ctasearch.cuh"
 38 | #include "loadstore.cuh"
 39 | #include "sortnetwork.cuh"
 40 | 
 41 | namespace mgpu {
 42 | 
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | // SerialMerge
 45 | 
 46 | template<int VT, bool RangeCheck, typename T, typename Comp>
 47 | MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd,
 48 | 	int bBegin, int bEnd, T* results, int* indices, Comp comp) {
 49 | 
 50 | 	T aKey = keys_shared[aBegin];
 51 | 	T bKey = keys_shared[bBegin];
 52 | 
 53 | 	#pragma unroll
 54 | 	for(int i = 0; i < VT; ++i) {
 55 | 		bool p;
 56 | 		if(RangeCheck)
 57 | 			p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey));
 58 | 		else
 59 | 			p = !comp(bKey, aKey);
 60 | 
 61 | 		results[i] = p ? aKey : bKey;
 62 | 		indices[i] = p ? aBegin : bBegin - !RangeCheck;
 63 | 
 64 | 		if(p) aKey = keys_shared[++aBegin];
 65 | 		else bKey = keys_shared[++bBegin];
 66 | 	}
 67 | 	__syncthreads();
 68 | }
 69 | 
 70 | ////////////////////////////////////////////////////////////////////////////////
 71 | // FindMergeFrame and FindMergesortInterval help mergesort (both CTA and global
 72 | // merge pass levels) locate lists within the single source array.
 73 | 
 74 | // Returns (offset of a, offset of b, length of list).
 75 | MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) {
 76 | 	// coop is the number of CTAs or threads cooperating to merge two lists into
 77 | 	// one. We round block down to the first CTA's ID that is working on this
 78 | 	// merge.
 79 | 	int start = ~(coop - 1) & block;
 80 | 	int size = nv * (coop>> 1);
 81 | 	return make_int3(nv * start, nv * start + size, size);
 82 | }
 83 | 
 84 | // Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1.
 85 | MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block,
 86 | 	int nv, int count, int mp0, int mp1) {
 87 | 
 88 | 	// Locate diag from the start of the A sublist.
 89 | 	int diag = nv * block - frame.x;
 90 | 	int a0 = frame.x + mp0;
 91 | 	int a1 = min(count, frame.x + mp1);
 92 | 	int b0 = min(count, frame.y + diag - mp0);
 93 | 	int b1 = min(count, frame.y + diag + nv - mp1);
 94 | 
 95 | 	// The end partition of the last block for each merge operation is computed
 96 | 	// and stored as the begin partition for the subsequent merge. i.e. it is
 97 | 	// the same partition but in the wrong coordinate system, so its 0 when it
 98 | 	// should be listSize. Correct that by checking if this is the last block
 99 | 	// in this merge operation.
100 | 	if(coop - 1 == ((coop - 1) & block)) {
101 | 		a1 = min(count, frame.x + frame.z);
102 | 		b1 = min(count, frame.y + frame.z);
103 | 	}
104 | 	return make_int4(a0, a1, b0, b1);
105 | }
106 | 
107 | ////////////////////////////////////////////////////////////////////////////////
108 | // ComputeMergeRange
109 | 
110 | MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block,
111 | 	int coop, int NV, const int* mp_global) {
112 | 
113 | 	// Load the merge paths computed by the partitioning kernel.
114 | 	int mp0 = mp_global[block];
115 | 	int mp1 = mp_global[block + 1];
116 | 	int gid = NV * block;
117 | 
118 | 	// Compute the ranges of the sources in global memory.
119 | 	int4 range;
120 | 	if(coop) {
121 | 		int3 frame = FindMergesortFrame(coop, block, NV);
122 | 		range = FindMergesortInterval(frame, coop, block, NV, aCount, mp0,
123 | 			mp1);
124 | 	} else {
125 | 		range.x = mp0;											// a0
126 | 		range.y = mp1;											// a1
127 | 		range.z = gid - range.x;								// b0
128 | 		range.w = min(aCount + bCount, gid + NV) - range.y;		// b1
129 | 	}
130 | 	return range;
131 | }
132 | 
133 | ////////////////////////////////////////////////////////////////////////////////
134 | // CTA mergesort support
135 | 
136 | template<int NT, int VT, typename T, typename Comp>
137 | MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count,
138 | 	int coop, T* keys, int* indices, Comp comp) {
139 | 
140 | 	int list = ~(coop - 1) & tid;
141 | 	int diag = min(count, VT * ((coop - 1) & tid));
142 | 	int start = VT * list;
143 | 	int a0 = min(count, start);
144 | 	int b0 = min(count, start + VT * (coop / 2));
145 | 	int b1 = min(count, start + VT * coop);
146 | 
147 | 	int p = MergePath<MgpuBoundsLower>(keys_shared + a0, b0 - a0,
148 | 		keys_shared + b0, b1 - b0, diag, comp);
149 | 
150 | 	SerialMerge<VT, true>(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys,
151 | 		indices, comp);
152 | }
153 | 
154 | template<int NT, int VT, bool HasValues, typename KeyType, typename ValType,
155 | 	typename Comp>
156 | MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT],
157 | 	KeyType* keys_shared, ValType* values_shared, int tid, int count,
158 | 	Comp comp) {
159 | 
160 | 	#pragma unroll
161 | 	for(int coop = 2; coop <= NT; coop *= 2) {
162 | 		int indices[VT];
163 | 		KeyType keys[VT];
164 | 		CTABlocksortPass<NT, VT>(keys_shared, tid, count, coop, keys,
165 | 			indices, comp);
166 | 
167 | 		if(HasValues) {
168 | 			// Exchange the values through shared memory.
169 | 			DeviceThreadToShared<VT>(threadValues, tid, values_shared);
170 | 			DeviceGather<NT, VT>(NT * VT, values_shared, indices, tid,
171 | 				threadValues);
172 | 		}
173 | 
174 | 		// Store results in shared memory in sorted order.
175 | 		DeviceThreadToShared<VT>(keys, tid, keys_shared);
176 | 	}
177 | }
178 | 
179 | ////////////////////////////////////////////////////////////////////////////////
180 | // CTAMergesort
181 | // Caller provides the keys in shared memory. This functions sorts the first
182 | // count elements.
183 | 
184 | template<int NT, int VT, bool Stable, bool HasValues, typename KeyType,
185 | 	typename ValType, typename Comp>
186 | MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT],
187 | 	KeyType* keys_shared, ValType* values_shared, int count, int tid,
188 | 	Comp comp) {
189 | 
190 | 	// Stable sort the keys in the thread.
191 | 	if(VT * tid < count) {
192 | 		if(Stable)
193 | 			OddEvenTransposeSort<VT>(threadKeys, threadValues, comp);
194 | 		else
195 | 			OddEvenMergesort<VT>(threadKeys, threadValues, comp);
196 | 	}
197 | 
198 | 	// Store the locally sorted keys into shared memory.
199 | 	DeviceThreadToShared<VT>(threadKeys, tid, keys_shared);
200 | 
201 | 	// Recursively merge lists until the entire CTA is sorted.
202 | 	CTABlocksortLoop<NT, VT, HasValues>(threadValues, keys_shared,
203 | 		values_shared, tid, count, comp);
204 | }
205 | 
206 | template<int NT, int VT, bool Stable, typename KeyType, typename Comp>
207 | MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT],
208 | 	KeyType* keys_shared, int count, int tid, Comp comp) {
209 | 
210 | 	int valuesTemp[VT];
211 | 	CTAMergesort<NT, VT, Stable, false>(threadKeys, valuesTemp, keys_shared,
212 | 		(int*)keys_shared, count, tid, comp);
213 | }
214 | 
215 | template<int NT, int VT, bool Stable, typename KeyType, typename ValType,
216 | 	typename Comp>
217 | MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT],
218 | 	ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared,
219 | 	int count, int tid, Comp comp) {
220 | 
221 | 	CTAMergesort<NT, VT, Stable, true>(threadKeys, threadValues, keys_shared,
222 | 		values_shared, count, tid, comp);
223 | }
224 | 
225 | ////////////////////////////////////////////////////////////////////////////////
226 | // DeviceMergeKeysIndices
227 | 
228 | template<int NT, int VT, bool LoadExtended, typename It1, typename It2,
229 | 	typename T, typename Comp>
230 | MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global,
231 | 	int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices,
232 | 	Comp comp) {
233 | 
234 | 	int a0 = range.x;
235 | 	int a1 = range.y;
236 | 	int b0 = range.z;
237 | 	int b1 = range.w;
238 | 
239 | 	if(LoadExtended) {
240 | 		bool extended = (a1 < aCount) && (b1 < bCount);
241 | 		aCount = a1 - a0;
242 | 		bCount = b1 - b0;
243 | 		int aCount2 = aCount + (int)extended;
244 | 		int bCount2 = bCount + (int)extended;
245 | 
246 | 		// Load one element past the end of each input to avoid having to use
247 | 		// range checking in the merge loop.
248 | 		DeviceLoad2ToShared<NT, VT, VT + 1>(a_global + a0, aCount2,
249 | 			b_global + b0, bCount2, tid, keys_shared);
250 | 
251 | 		// Run a Merge Path search for each thread's starting point.
252 | 		int diag = VT * tid;
253 | 		int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
254 | 			keys_shared + aCount2, bCount, diag, comp);
255 | 
256 | 		// Compute the ranges of the sources in shared memory.
257 | 		int a0tid = mp;
258 | 		int b0tid = aCount2 + diag - mp;
259 | 		if(extended) {
260 | 			SerialMerge<VT, false>(keys_shared, a0tid, 0, b0tid, 0, results,
261 | 				indices, comp);
262 | 		} else {
263 | 			int a1tid = aCount;
264 | 			int b1tid = aCount2 + bCount;
265 | 			SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid,
266 | 				results, indices, comp);
267 | 		}
268 | 	} else {
269 | 		// Use the input intervals from the ranges between the merge path
270 | 		// intersections.
271 | 		aCount = a1 - a0;
272 | 		bCount = b1 - b0;
273 | 
274 | 		// Load the data into shared memory.
275 | 		DeviceLoad2ToShared<NT, VT, VT>(a_global + a0, aCount, b_global + b0,
276 | 			bCount, tid, keys_shared);
277 | 
278 | 		// Run a merge path to find the start of the serial merge for each
279 | 		// thread.
280 | 		int diag = VT * tid;
281 | 		int mp = MergePath<MgpuBoundsLower>(keys_shared, aCount,
282 | 			keys_shared + aCount, bCount, diag, comp);
283 | 
284 | 		// Compute the ranges of the sources in shared memory.
285 | 		int a0tid = mp;
286 | 		int a1tid = aCount;
287 | 		int b0tid = aCount + diag - mp;
288 | 		int b1tid = aCount + bCount;
289 | 
290 | 		// Serial merge into register.
291 | 		SerialMerge<VT, true>(keys_shared, a0tid, a1tid, b0tid, b1tid, results,
292 | 			indices, comp);
293 | 	}
294 | }
295 | 
296 | ////////////////////////////////////////////////////////////////////////////////
297 | // DeviceMerge
298 | // Merge pairs from global memory into global memory. Useful factorization to
299 | // enable calling from merge, mergesort, and locality sort.
300 | 
301 | template<int NT, int VT, bool HasValues, bool LoadExtended, typename KeysIt1,
302 | 	typename KeysIt2, typename KeysIt3, typename ValsIt1, typename ValsIt2,
303 | 	typename KeyType, typename ValsIt3, typename Comp>
304 | MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global,
305 | 	int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
306 | 	int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared,
307 | 	KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) {
308 | 
309 | 	KeyType results[VT];
310 | 	int indices[VT];
311 | 	DeviceMergeKeysIndices<NT, VT, LoadExtended>(aKeys_global, aCount,
312 | 		bKeys_global, bCount, range, tid, keys_shared, results, indices, comp);
313 | 
314 | 	// Store merge results back to shared memory.
315 | 	DeviceThreadToShared<VT>(results, tid, keys_shared);
316 | 
317 | 	// Store merged keys to global memory.
318 | 	aCount = range.y - range.x;
319 | 	bCount = range.w - range.z;
320 | 	DeviceSharedToGlobal<NT, VT>(aCount + bCount, keys_shared, tid,
321 | 		keys_global + NT * VT * block);
322 | 
323 | 	// Copy the values.
324 | 	if(HasValues) {
325 | 		DeviceThreadToShared<VT>(indices, tid, indices_shared);
326 | 
327 | 		DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
328 | 			aVals_global + range.x, bVals_global + range.z, aCount,
329 | 			indices_shared, tid, vals_global + NT * VT * block);
330 | 	}
331 | }
332 | 
333 | } // namespace mgpu
334 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctascan.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "../mgpuenums.h"
 38 | #include "deviceutil.cuh"
 39 | #include "intrinsics.cuh"
 40 | 
 41 | namespace mgpu {
 42 | 
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | // CTAReduce
 45 | 
 46 | template<int NT, typename Op = mgpu::plus<int> >
 47 | struct CTAReduce {
 48 | 	typedef typename Op::first_argument_type T;
 49 | 	enum { Size = NT, Capacity = NT };
 50 | 	struct Storage { T shared[Capacity]; };
 51 | 
 52 | 	MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) {
 53 | 		storage.shared[tid] = x;
 54 | 		__syncthreads();
 55 | 
 56 | 		// Fold the data in half with each pass.
 57 | 		#pragma unroll
 58 | 		for(int destCount = NT / 2; destCount >= 1; destCount /= 2) {
 59 | 			if(tid < destCount) {
 60 | 				// Read from the right half and store to the left half.
 61 | 				x = op(x, storage.shared[destCount + tid]);
 62 | 				storage.shared[tid] = x;
 63 | 			}
 64 | 			__syncthreads();
 65 | 		}
 66 | 		T total = storage.shared[0];
 67 | 		__syncthreads();
 68 | 		return total;
 69 | 	}
 70 | };
 71 | 
 72 | #if __CUDA_ARCH__ >= 300
 73 | 
 74 | template<int NT>
 75 | struct CTAReduce<NT, mgpu::plus<int> > {
 76 | 	typedef mgpu::plus<int> Op;
 77 | 	typedef int T;
 78 | 	enum { Size = NT, Capacity = WARP_SIZE };
 79 | 	struct Storage { int shared[Capacity]; };
 80 | 
 81 | 	MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
 82 | 		Op op = Op()) {
 83 | 
 84 | 		const int NumSections = WARP_SIZE;
 85 | 		const int SecSize = NT / NumSections;
 86 | 		int lane = (SecSize - 1) & tid;
 87 | 		int sec = tid / SecSize;
 88 | 
 89 | 		// In the first phase, threads cooperatively find the reduction within
 90 | 		// their segment. The segments are SecSize threads (NT / WARP_SIZE)
 91 | 		// wide.
 92 | 		#pragma unroll
 93 | 		for(int offset = 1; offset < SecSize; offset *= 2)
 94 | 			x = shfl_add(x, offset, SecSize);
 95 | 
 96 | 		// The last thread in each segment stores the local reduction to shared
 97 | 		// memory.
 98 | 		if(SecSize - 1 == lane) storage.shared[sec] = x;
 99 | 		__syncthreads();
100 | 
101 | 		// Reduce the totals of each input segment. The spine is WARP_SIZE
102 | 		// threads wide.
103 | 		if(tid < NumSections) {
104 | 			x = storage.shared[tid];
105 | 			#pragma unroll
106 | 			for(int offset = 1; offset < NumSections; offset *= 2)
107 | 				x = shfl_add(x, offset, NumSections);
108 | 			storage.shared[tid] = x;
109 | 		}
110 | 		__syncthreads();
111 | 
112 | 		int reduction = storage.shared[NumSections - 1];
113 | 		__syncthreads();
114 | 
115 | 		return reduction;
116 | 	}
117 | };
118 | 
119 | template<int NT>
120 | struct CTAReduce<NT, mgpu::maximum<int> > {
121 | 	typedef mgpu::maximum<int> Op;
122 | 	enum { Size = NT, Capacity = WARP_SIZE };
123 | 	struct Storage { int shared[Capacity]; };
124 | 
125 | 	MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
126 | 		Op op = Op()) {
127 | 
128 | 		const int NumSections = WARP_SIZE;
129 | 		const int SecSize = NT / NumSections;
130 | 		int lane = (SecSize - 1) & tid;
131 | 		int sec = tid / SecSize;
132 | 
133 | 		#pragma unroll
134 | 		for(int offset = 1; offset < SecSize; offset *= 2)
135 | 			x = shfl_max(x, offset, SecSize);
136 | 
137 | 		if(SecSize - 1 == lane) storage.shared[sec] = x;
138 | 		__syncthreads();
139 | 
140 | 		if(tid < NumSections) {
141 | 			x = storage.shared[tid];
142 | 			#pragma unroll
143 | 			for(int offset = 1; offset < NumSections; offset *= 2)
144 | 				x = shfl_max(x, offset, NumSections);
145 | 			storage.shared[tid] = x;
146 | 		}
147 | 		__syncthreads();
148 | 
149 | 		int reduction = storage.shared[NumSections - 1];
150 | 		__syncthreads();
151 | 
152 | 		return reduction;
153 | 	}
154 | };
155 | 
156 | #endif // __CUDA_ARCH__ >= 300
157 | 
158 | ////////////////////////////////////////////////////////////////////////////////
159 | // CTAScan
160 | 
161 | template<int NT, typename Op = mgpu::plus<int> >
162 | struct CTAScan {
163 | 	typedef typename Op::result_type T;
164 | 	enum { Size = NT, Capacity = 2 * NT + 1 };
165 | 	struct Storage { T shared[Capacity]; };
166 | 
167 | 	MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
168 | 		MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {
169 | 
170 | 		storage.shared[tid] = x;
171 | 		int first = 0;
172 | 		__syncthreads();
173 | 
174 | 		#pragma unroll
175 | 		for(int offset = 1; offset < NT; offset += offset) {
176 | 			if(tid >= offset)
177 | 				x = op(storage.shared[first + tid - offset], x);
178 | 			first = NT - first;
179 | 			storage.shared[first + tid] = x;
180 | 			__syncthreads();
181 | 		}
182 | 		*total = storage.shared[first + NT - 1];
183 | 
184 | 		if(MgpuScanTypeExc == type)
185 | 			x = tid ? storage.shared[first + tid - 1] : identity;
186 | 
187 | 		__syncthreads();
188 | 		return x;
189 | 	}
190 | 	MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
191 | 		T total;
192 | 		return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
193 | 	}
194 | };
195 | 
196 | ////////////////////////////////////////////////////////////////////////////////
197 | // Special partial specialization for CTAScan<NT, ScanOpAdd> on Kepler.
198 | // This uses the shfl intrinsic to reduce scan latency.
199 | 
200 | #if __CUDA_ARCH__ >= 300
201 | 
202 | template<int NT>
203 | struct CTAScan<NT, mgpu::plus<int> > {
204 | 	typedef mgpu::plus<int> Op;
205 | 	enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
206 | 	enum { Capacity = NumSegments + 1 };
207 | 	struct Storage { int shared[Capacity + 1]; };
208 | 
209 | 	MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
210 | 		MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {
211 | 
212 | 		// Define WARP_SIZE segments that are NT / WARP_SIZE large.
213 | 		// Each warp makes log(SegSize) shfl_add calls.
214 | 		// The spine makes log(WARP_SIZE) shfl_add calls.
215 | 		int lane = (SegSize - 1) & tid;
216 | 		int segment = tid / SegSize;
217 | 
218 | 		// Scan each segment using shfl_add.
219 | 		int scan = x;
220 | 		#pragma unroll
221 | 		for(int offset = 1; offset < SegSize; offset *= 2)
222 | 			scan = shfl_add(scan, offset, SegSize);
223 | 
224 | 		// Store the reduction (last element) of each segment into storage.
225 | 		if(SegSize - 1 == lane) storage.shared[segment] = scan;
226 | 		__syncthreads();
227 | 
228 | 		// Warp 0 does a full shfl warp scan on the partials. The total is
229 | 		// stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
230 | 		if(tid < NumSegments) {
231 | 			int y = storage.shared[tid];
232 | 			int scan = y;
233 | 			#pragma unroll
234 | 			for(int offset = 1; offset < NumSegments; offset *= 2)
235 | 				scan = shfl_add(scan, offset, NumSegments);
236 | 			storage.shared[tid] = scan - y;
237 | 			if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
238 | 		}
239 | 		__syncthreads();
240 | 
241 | 		// Add the scanned partials back in and convert to exclusive scan.
242 | 		scan += storage.shared[segment];
243 | 		if(MgpuScanTypeExc == type) {
244 | 			scan -= x;
245 | 			if(identity && !tid) scan = identity;
246 | 		}
247 | 		*total = storage.shared[NumSegments];
248 | 		__syncthreads();
249 | 
250 | 		return scan;
251 | 	}
252 | 	MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
253 | 		int total;
254 | 		return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
255 | 	}
256 | };
257 | 
258 | #endif // __CUDA_ARCH__ >= 300
259 | 
260 | ////////////////////////////////////////////////////////////////////////////////
261 | // CTABinaryScan
262 | 
263 | template<int NT>
264 | MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
265 | 	const int NumWarps = NT / WARP_SIZE;
266 | 	int warp = tid / WARP_SIZE;
267 | 	int lane = (WARP_SIZE - 1);
268 | 
269 | 	// Store the bit totals for each warp.
270 | 	uint bits = __ballot(x);
271 | 	shared[warp] = popc(bits);
272 | 	__syncthreads();
273 | 
274 | #if __CUDA_ARCH__ >= 300
275 | 	if(tid < NumWarps) {
276 | 		int x = shared[tid];
277 | 		int scan = x;
278 | 		#pragma unroll
279 | 		for(int offset = 1; offset < NumWarps; offset *= 2)
280 | 			scan = shfl_add(scan, offset, NumWarps);
281 | 		shared[tid] = scan - x;
282 | 	}
283 | 	__syncthreads();
284 | 
285 | #else
286 | 	// Thread 0 scans warp totals.
287 | 	if(!tid) {
288 | 		int scan = 0;
289 | 		#pragma unroll
290 | 		for(int i = 0; i < NumWarps; ++i) {
291 | 			int y = shared[i];
292 | 			shared[i] = scan;
293 | 			scan += y;
294 | 		}
295 | 		shared[NumWarps] = scan;
296 | 	}
297 | 	__syncthreads();
298 | 
299 | #endif // __CUDA_ARCH__ >= 300
300 | 
301 | 	// Add the warp scan back into the partials.
302 | 	int scan = shared[warp] + __popc(bfe(bits, 0, lane));
303 | 	*total = shared[NumWarps];
304 | 	__syncthreads();
305 | 	return scan;
306 | }
307 | 
308 | } // namespace mgpu
309 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctasearch.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "deviceutil.cuh"
 38 | #include "../mgpudevice.cuh"
 39 | 
 40 | namespace mgpu {
 41 | 
 42 | template<MgpuBounds Bounds, typename IntT, typename It, typename T,
 43 | 	typename Comp>
 44 | MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
 45 | 	int shift, Comp comp) {
 46 | 
 47 | 	IntT scale = (1<< shift) - 1;
 48 | 	int mid = (int)((begin + scale * end)>> shift);
 49 | 
 50 | 	T key2 = data[mid];
 51 | 	bool pred = (MgpuBoundsUpper == Bounds) ?
 52 | 		!comp(key, key2) :
 53 | 		comp(key2, key);
 54 | 	if(pred) begin = mid + 1;
 55 | 	else end = mid;
 56 | }
 57 | 
 58 | template<MgpuBounds Bounds, typename IntT, typename T, typename It,
 59 | 	typename Comp>
 60 | MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
 61 | 	Comp comp) {
 62 | 
 63 | 	int begin = 0;
 64 | 	int end = count;
 65 | 
 66 | 	if(levels >= 4 && begin < end)
 67 | 		BinarySearchIt<Bounds, IntT>(data, begin, end, key, 9, comp);
 68 | 	if(levels >= 3 && begin < end)
 69 | 		BinarySearchIt<Bounds, IntT>(data, begin, end, key, 7, comp);
 70 | 	if(levels >= 2 && begin < end)
 71 | 		BinarySearchIt<Bounds, IntT>(data, begin, end, key, 5, comp);
 72 | 	if(levels >= 1 && begin < end)
 73 | 		BinarySearchIt<Bounds, IntT>(data, begin, end, key, 4, comp);
 74 | 
 75 | 	while(begin < end)
 76 | 		BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
 77 | 	return begin;
 78 | }
 79 | 
 80 | template<MgpuBounds Bounds, typename T, typename It, typename Comp>
 81 | MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
 82 | 	int begin = 0;
 83 | 	int end = count;
 84 | 	while(begin < end)
 85 | 		BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
 86 | 	return begin;
 87 | }
 88 | 
 89 | ////////////////////////////////////////////////////////////////////////////////
 90 | // MergePath search
 91 | 
 92 | template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
 93 | MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
 94 | 	Comp comp) {
 95 | 
 96 | 	typedef typename std::iterator_traits<It1>::value_type T;
 97 | 	int begin = max(0, diag - bCount);
 98 | 	int end = min(diag, aCount);
 99 | 
100 | 	while(begin < end) {
101 | 		int mid = (begin + end)>> 1;
102 | 		T aKey = a[mid];
103 | 		T bKey = b[diag - 1 - mid];
104 | 		bool pred = (MgpuBoundsUpper == Bounds) ?
105 | 			comp(aKey, bKey) :
106 | 			!comp(bKey, aKey);
107 | 		if(pred) begin = mid + 1;
108 | 		else end = mid;
109 | 	}
110 | 	return begin;
111 | }
112 | 
113 | 
114 | ////////////////////////////////////////////////////////////////////////////////
115 | // SegmentedMergePath search
116 | 
117 | template<typename InputIt, typename Comp>
118 | MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
119 | 	int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {
120 | 
121 | 	// leftEnd and rightStart are defined from the origin, and diag is defined
122 | 	// from aOffset.
123 | 	// We only need to run a Merge Path search if the diagonal intersects the
124 | 	// segment that strides the left and right halves (i.e. is between leftEnd
125 | 	// and rightStart).
126 | 	if(aOffset + diag <= leftEnd) return diag;
127 | 	if(aOffset + diag >= rightStart) return aCount;
128 | 
129 | 	bCount = min(bCount, rightStart - bOffset);
130 | 	int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
131 | 	int end = min(diag, aCount);
132 | 
133 | 	while(begin < end) {
134 | 		int mid = (begin + end)>> 1;
135 | 		int ai = aOffset + mid;
136 | 		int bi = bOffset + diag - 1 - mid;
137 | 
138 | 		bool pred = !comp(keys[bi], keys[ai]);
139 | 		if(pred) begin = mid + 1;
140 | 		else end = mid;
141 | 	}
142 | 	return begin;
143 | }
144 | 
145 | ////////////////////////////////////////////////////////////////////////////////
146 | // BalancedPath search
147 | 
148 | template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
149 | 	typename Comp>
150 | MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
151 | 	int bCount, int diag, int levels, Comp comp) {
152 | 
153 | 	typedef typename std::iterator_traits<InputIt1>::value_type T;
154 | 
155 | 	int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
156 | 	int aIndex = p;
157 | 	int bIndex = diag - p;
158 | 
159 | 	bool star = false;
160 | 	if(bIndex < bCount) {
161 | 		if(Duplicates) {
162 | 			T x = b[bIndex];
163 | 
164 | 			// Search for the beginning of the duplicate run in both A and B.
165 | 			// Because
166 | 			int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
167 | 				levels, comp);
168 | 			int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
169 | 				levels, comp);
170 | 
171 | 			// The distance between the merge path and the lower_bound is the
172 | 			// 'run'. We add up the a- and b- runs and evenly distribute them to
173 | 			// get a stairstep path.
174 | 			int aRun = aIndex - aStart;
175 | 			int bRun = bIndex - bStart;
176 | 			int xCount = aRun + bRun;
177 | 
178 | 			// Attempt to advance b and regress a.
179 | 			int bAdvance = max(xCount>> 1, bRun);
180 | 			int bEnd = min(bCount, bStart + bAdvance + 1);
181 | 			int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
182 | 				bEnd - bIndex, x, comp) + bIndex;
183 | 			bRun = bRunEnd - bStart;
184 | 
185 | 			bAdvance = min(bAdvance, bRun);
186 | 			int aAdvance = xCount - bAdvance;
187 | 
188 | 			bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
189 | 			aIndex = aStart + aAdvance;
190 | 
191 | 			if(roundUp) star = true;
192 | 		} else {
193 | 			if(aIndex && aCount) {
194 | 				T aKey = a[aIndex - 1];
195 | 				T bKey = b[bIndex];
196 | 
197 | 				// If the last consumed element in A (aIndex - 1) is the same as
198 | 				// the next element in B (bIndex), we're sitting at a starred
199 | 				// partition.
200 | 				if(!comp(aKey, bKey)) star = true;
201 | 			}
202 | 		}
203 | 	}
204 | 	return make_int2(aIndex, star);
205 | }
206 | 
207 | } // namespace mgpu
208 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctasegreduce.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "ctasegscan.cuh"
 38 | #include "ctasearch.cuh"
 39 | 
 40 | namespace mgpu {
 41 | 
 42 | ////////////////////////////////////////////////////////////////////////////////
 43 | // Segmented reduce utility functions.
 44 | 
 45 | // Extract the upper-bound indices from the coded ranges. Decrement to include
 46 | // the first addressed row/segment.
 47 | 
 48 | struct SegReduceRange {
 49 | 	int begin;
 50 | 	int end;
 51 | 	int total;
 52 | 	bool flushLast;
 53 | };
 54 | 
 55 | MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) {
 56 | 	SegReduceRange range;
 57 | 	range.begin = 0x7fffffff & limit0;
 58 | 	range.end = 0x7fffffff & limit1;
 59 | 	range.total = range.end - range.begin;
 60 | 	range.flushLast = 0 == (0x80000000 & limit1);
 61 | 	range.end += !range.flushLast;
 62 | 	return range;
 63 | }
 64 | 
 65 | // Reconstitute row/segment indices from a starting row index and packed end
 66 | // flags. Used for pre-processed versions of interval reduce and interval Spmv.
 67 | template<int VT>
 68 | MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags,
 69 | 	int rows[VT + 1]) {
 70 | 
 71 | 	rows[0] = first;
 72 | 	#pragma unroll
 73 | 	for(int i = 0; i < VT; ++i) {
 74 | 		if((1<< i) & endFlags) ++first;
 75 | 		rows[i + 1] = first;
 76 | 	}
 77 | }
 78 | 
 79 | ////////////////////////////////////////////////////////////////////////////////
 80 | // After loading CSR terms into shared memory, each thread binary searches
 81 | // (upper-bound) to find its starting point. Each thread then walks forward,
 82 | // emitting the csr0-relative row indices to register.
 83 | 
 84 | template<int NT, int VT>
 85 | MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared,
 86 | 	int numRows, int end, int rows[VT + 1], int rowStarts[VT]) {
 87 | 
 88 | 	// Each thread binary searches for its starting row.
 89 | 	int row = BinarySearch<MgpuBoundsUpper>(csr_shared, numRows, tidOffset,
 90 | 		mgpu::less<int>()) - 1;
 91 | 
 92 | 	// Each thread starts at row and scans forward, emitting row IDs into
 93 | 	// register. Store the CTA-local row index (starts at 0) to rows and the
 94 | 	// start of the row (globally) to rowStarts.
 95 | 	int curOffset = csr_shared[row];
 96 | 	int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
 97 | 
 98 | 	rows[0] = row;
 99 | 	rowStarts[0] = curOffset;
100 | 	int endFlags = 0;
101 | 
102 | 	#pragma unroll
103 | 	for(int i = 1; i <= VT; ++i) {
104 | 		// Advance the row cursor when the iterator hits the next row offset.
105 | 		if(tidOffset + i == nextOffset) {
106 | 			// Set an end flag when the cursor advances to the next row.
107 | 			endFlags |= 1<< (i - 1);
108 | 
109 | 			// Advance the cursor and load the next row offset.
110 | 			++row;
111 | 			curOffset = nextOffset;
112 | 			nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end;
113 | 		}
114 | 		rows[i] = row;
115 | 		if(i < VT) rowStarts[i] = curOffset;
116 | 	}
117 | 	__syncthreads();
118 | 
119 | 	return endFlags;
120 | }
121 | 
122 | ////////////////////////////////////////////////////////////////////////////////
123 | // DeviceSegReducePrepare
124 | // Expand non-empty interval of CSR elements into row indices. Compute end-flags
125 | // by comparing adjacent row IDs.
126 | 
127 | // DeviceSegReducePrepare may be called either by a pre-processing kernel or by
128 | // the kernel that actually evaluates the segmented reduction if no preprocesing
129 | // is desired.
130 | struct SegReduceTerms {
131 | 	int endFlags;
132 | 	int tidDelta;
133 | };
134 | 
135 | template<int NT, int VT>
136 | MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows,
137 | 	int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) {
138 | 
139 | 	// Pass a sentinel (end) to point to the next segment start. If we flush,
140 | 	// this is the end of this tile. Otherwise it is INT_MAX
141 | 	int endFlags = DeviceExpandCsrRows<NT, VT>(gid + VT * tid, csr_shared,
142 | 		numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts);
143 | 
144 | 	// Find the distance to to scan to compute carry-in for each thread. Use the
145 | 	// existance of an end flag anywhere in the thread to determine if carry-out
146 | 	// values from the left should propagate through to the right.
147 | 	int tidDelta = DeviceFindSegScanDelta<NT>(tid, rows[0] != rows[VT],
148 | 		csr_shared);
149 | 
150 | 	SegReduceTerms terms = { endFlags, tidDelta };
151 | 	return terms;
152 | }
153 | 
154 | ////////////////////////////////////////////////////////////////////////////////
155 | // CTASegReduce
156 | // Core segmented reduction code. Supports fast-path and slow-path for intra-CTA
157 | // segmented reduction. Stores partials to global memory.
158 | // Callers feed CTASegReduce::ReduceToGlobal values in thread order.
159 | template<int NT, int VT, bool HalfCapacity, typename T, typename Op>
160 | struct CTASegReduce {
161 | 	typedef CTASegScan<NT, Op> SegScan;
162 | 
163 | 	enum {
164 | 		NV = NT * VT,
165 | 		Capacity = HalfCapacity ? (NV / 2) : NV
166 | 	};
167 | 
168 | 	union Storage {
169 | 		typename SegScan::Storage segScanStorage;
170 | 		T values[Capacity];
171 | 	};
172 | 
173 | 	template<typename DestIt>
174 | 	MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total,
175 | 		int tidDelta, int startRow, int block, int tid, T data[VT],
176 | 		DestIt dest_global, T* carryOut_global, T identity, Op op,
177 | 		Storage& storage) {
178 | 
179 | 		// Run a segmented scan within the thread.
180 | 		T x, localScan[VT];
181 | 		#pragma unroll
182 | 		for(int i = 0; i < VT; ++i) {
183 | 			x = i ? op(x, data[i]) : data[i];
184 | 			localScan[i] = x;
185 | 			if(rows[i] != rows[i + 1]) x = identity;
186 | 		}
187 | 
188 | 		// Run a parallel segmented scan over the carry-out values to compute
189 | 		// carry-in.
190 | 		T carryOut;
191 | 		T carryIn = SegScan::SegScanDelta(tid, tidDelta, x,
192 | 			storage.segScanStorage, &carryOut, identity, op);
193 | 
194 | 		// Store the carry-out for the entire CTA to global memory.
195 | 		if(!tid) carryOut_global[block] = carryOut;
196 | 
197 | 		dest_global += startRow;
198 | 		if(HalfCapacity && total > Capacity) {
199 | 			// Add carry-in to each thread-local scan value. Store directly
200 | 			// to global.
201 | 			#pragma unroll
202 | 			for(int i = 0; i < VT; ++i) {
203 | 				// Add the carry-in to the local scan.
204 | 				T x2 = op(carryIn, localScan[i]);
205 | 
206 | 				// Store on the end flag and clear the carry-in.
207 | 				if(rows[i] != rows[i + 1]) {
208 | 					carryIn = identity;
209 | 					dest_global[rows[i]] = x2;
210 | 				}
211 | 			}
212 | 		} else {
213 | 			// All partials fit in shared memory. Add carry-in to each thread-
214 | 			// local scan value.
215 | 			#pragma unroll
216 | 			for(int i = 0; i < VT; ++i) {
217 | 				// Add the carry-in to the local scan.
218 | 				T x2 = op(carryIn, localScan[i]);
219 | 
220 | 				// Store reduction when the segment changes and clear the
221 | 				// carry-in.
222 | 				if(rows[i] != rows[i + 1]) {
223 | 					storage.values[rows[i]] = x2;
224 | 					carryIn = identity;
225 | 				}
226 | 			}
227 | 			__syncthreads();
228 | 
229 | 			// Cooperatively store reductions to global memory.
230 | 			for(int index = tid; index < total; index += NT)
231 | 				dest_global[index] = storage.values[index];
232 | 			__syncthreads();
233 | 		}
234 | 	}
235 | };
236 | 
237 | } // namespace mgpu
238 | 
239 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctasegscan.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "ctascan.cuh"
 38 | 
 39 | namespace mgpu {
 40 | 
 41 | ////////////////////////////////////////////////////////////////////////////////
 42 | // DeviceFindSegScanDelta
 43 | // Runs an inclusive max-index scan over binary inputs.
 44 | 
 45 | template<int NT>
 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) {
 47 | 	const int NumWarps = NT / 32;
 48 | 
 49 | 	int warp = tid / 32;
 50 | 	int lane = 31 & tid;
 51 | 	uint warpMask = 0xffffffff>> (31 - lane);		// inclusive search
 52 | 	uint ctaMask = 0x7fffffff>> (31 - lane);		// exclusive search
 53 | 
 54 | 	uint warpBits = __ballot(flag);
 55 | 	delta_shared[warp] = warpBits;
 56 | 	__syncthreads();
 57 | 
 58 | 	if(tid < NumWarps) {
 59 | 		uint ctaBits = __ballot(0 != delta_shared[tid]);
 60 | 		int warpSegment = 31 - clz(ctaMask & ctaBits);
 61 | 		int start = (-1 != warpSegment) ?
 62 | 			(31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0;
 63 | 		delta_shared[NumWarps + tid] = start;
 64 | 	}
 65 | 	__syncthreads();
 66 | 
 67 | 	// Find the closest flag to the left of this thread within the warp.
 68 | 	// Include the flag for this thread.
 69 | 	int start = 31 - clz(warpMask & warpBits);
 70 | 	if(-1 != start) start += ~31 & tid;
 71 | 	else start = delta_shared[NumWarps + warp];
 72 | 	__syncthreads();
 73 | 
 74 | 	return tid - start;
 75 | }
 76 | 
 77 | ////////////////////////////////////////////////////////////////////////////////
 78 | // CTASegScan
 79 | 
 80 | template<int NT, typename _Op = mgpu::plus<int> >
 81 | struct CTASegScan {
 82 | 	typedef _Op Op;
 83 | 	typedef typename Op::result_type T;
 84 | 	enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT };
 85 | 	union Storage {
 86 | 		int delta[NumWarps];
 87 | 		T values[Capacity];
 88 | 	};
 89 | 
 90 | 	// Each thread passes the reduction of the LAST SEGMENT that it covers.
 91 | 	// flag is set to true if there's at least one segment flag in the thread.
 92 | 	// SegScan returns the reduction of values for the first segment in this
 93 | 	// thread over the preceding threads.
 94 | 	// Return the value init for the first thread.
 95 | 
 96 | 	// When scanning single elements per thread, interpret the flag as a BEGIN
 97 | 	// FLAG. If tid's flag is set, its value belongs to thread tid + 1, not
 98 | 	// thread tid.
 99 | 
100 | 	// The function returns the reduction of the last segment in the CTA.
101 | 
102 | 	MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x,
103 | 		Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) {
104 | 
105 | 		// Run an inclusive scan
106 | 		int first = 0;
107 | 		storage.values[first + tid] = x;
108 | 		__syncthreads();
109 | 
110 | 		#pragma unroll
111 | 		for(int offset = 1; offset < NT; offset += offset) {
112 | 			if(tidDelta >= offset)
113 | 				x = op(storage.values[first + tid - offset], x);
114 | 			first = NT - first;
115 | 			storage.values[first + tid] = x;
116 | 			__syncthreads();
117 | 		}
118 | 
119 | 		// Get the exclusive scan.
120 | 		x = tid ? storage.values[first + tid - 1] : identity;
121 | 		*carryOut = storage.values[first + NT - 1];
122 | 		__syncthreads();
123 | 		return x;
124 | 	}
125 | 
126 | 	MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage,
127 | 		T* carryOut, T identity = (T)0, Op op = Op()) {
128 | 
129 | 		// Find the left-most thread that covers the first segment of this
130 | 		// thread.
131 | 		int tidDelta = DeviceFindSegScanDelta<NT>(tid, flag, storage.delta);
132 | 
133 | 		return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op);
134 | 	}
135 | };
136 | 
137 | } // namespace mgpu
138 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/ctasortedsearch.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "../mgpudevice.cuh"
 38 | #include "ctasearch.cuh"
 39 | 
 40 | namespace mgpu {
 41 | 
 42 | 
 43 | ////////////////////////////////////////////////////////////////////////////////
 44 | // DeviceSerialSearch
 45 | 
 46 | template<int VT, MgpuBounds Bounds, bool RangeCheck, bool IndexA, bool MatchA,
 47 | 	bool IndexB, bool MatchB, typename T, typename Comp>
 48 | MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin,
 49 | 	int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices,
 50 | 	Comp comp) {
 51 | 
 52 | 	const int FlagA = IndexA ? 0x80000000 : 1;
 53 | 	const int FlagB = IndexB ? 0x80000000 : 1;
 54 | 
 55 | 	T aKey = keys_shared[aBegin];
 56 | 	T bKey = keys_shared[bBegin];
 57 | 	T aPrev, bPrev;
 58 | 	if(aBegin > 0) aPrev = keys_shared[aBegin - 1];
 59 | 	if(bBegin > 0) bPrev = keys_shared[bBegin - 1];
 60 | 	int decisions = 0;
 61 | 	int matchCountA = 0;
 62 | 	int matchCountB = 0;
 63 | 
 64 | 	#pragma unroll
 65 | 	for(int i = 0; i < VT; ++i) {
 66 | 		bool p;
 67 | 		if(RangeCheck && aBegin >= aEnd) p = false;
 68 | 		else if(RangeCheck && bBegin >= bEnd) p = true;
 69 | 		else p = (MgpuBoundsUpper == Bounds) ?
 70 | 			comp(aKey, bKey) :
 71 | 			!comp(bKey, aKey);
 72 | 
 73 | 		if(p) {
 74 | 			// aKey is smaller than bKey, so it is inserted before bKey.
 75 | 			// Save bKey's index (bBegin + first) as the result of the search
 76 | 			// and advance to the next needle in A.
 77 | 			bool match = false;
 78 | 			if(MatchA) {
 79 | 				// Test if there is an element in B that matches aKey.
 80 | 				if(MgpuBoundsUpper == Bounds) {
 81 | 					// Upper Bound: We're inserting aKey after bKey. If there
 82 | 					// is a match for aKey it must be bPrev. Check that bPrev
 83 | 					// is in range and equal to aKey.
 84 | 					// The predicate test result !comp(aKey, bPrev) was
 85 | 					// established on the previous A-advancing iteration (it
 86 | 					// failed the comp(aKey, bKey) test to get us to this
 87 | 					// point). Check the other half of the equality condition
 88 | 					// with a second comparison.
 89 | 					bool inRange = !RangeCheck || (bBegin > aEnd);
 90 | 					match = inRange && !comp(bPrev, aKey);
 91 | 				} else {
 92 | 					// Lower Bound: We're inserting aKey before bKey. If there
 93 | 					// is a match for aKey, it must be bKey. Check that bKey
 94 | 					// is in range and equal to aKey.
 95 | 					// The predicate test !comp(bKey, aKey) has established one
 96 | 					// half of the equality condition. We establish the other
 97 | 					// half with a second comparison.
 98 | 					bool inRange = !RangeCheck || (bBegin < bEnd);
 99 | 					match = inRange && !comp(aKey, bKey);
100 | 				}
101 | 			}
102 | 
103 | 			int index = 0;
104 | 		 	if(IndexA) index = bOffset + bBegin;
105 | 			if(match) index |= FlagA;
106 | 			if(IndexA || MatchA) indices[i] = index;
107 | 			matchCountA += match;
108 | 
109 | 			// Mark the decision bit to indicate that this iteration has
110 | 			// progressed A (the needles).
111 | 			decisions |= 1<< i;
112 | 			aPrev = aKey;
113 | 			aKey = keys_shared[++aBegin];
114 | 		} else {
115 | 			// aKey is larger than bKey, so it is inserted after bKey (but we
116 | 			// don't know where yet). Advance the B index to the next element in
117 | 			// the haystack to continue the search for the current needle.
118 | 			bool match = false;
119 | 			if(MatchB) {
120 | 				if(MgpuBoundsUpper == Bounds) {
121 | 					// Upper Bound: aKey is not smaller than bKey. We advance to
122 | 					// the next haystack element in B. If there is a match in A
123 | 					// for bKey it must be aKey. By entering this branch we've
124 | 					// verified that !comp(aKey, bKey). Making the reciprocal
125 | 					// comparison !comp(bKey, aKey) establishes aKey == bKey.
126 | 					bool inRange = !RangeCheck ||
127 | 						((bBegin < bEnd) && (aBegin < aEnd));
128 | 					match = inRange && !comp(bKey, aKey);
129 | 				} else {
130 | 					// Lower Bound: bKey is smaller than aKey. We advance to the
131 | 					// next element in B. If there is a match for bKey, it must
132 | 					// be aPrev. The previous A-advancing iteration proved that
133 | 					// !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the
134 | 					// other half of the equality condition.
135 | 					bool inRange = !RangeCheck ||
136 | 						((bBegin < bEnd) && (aBegin > 0));
137 | 					match = inRange && !comp(aPrev, bKey);
138 | 				}
139 | 			}
140 | 
141 | 			int index = 0;
142 | 			if(IndexB) index = aOffset + aBegin;
143 | 			if(match) index |= FlagB;
144 | 			if(IndexB || MatchB) indices[i] = index;
145 | 			matchCountB += match;
146 | 
147 | 			// Keep the decision bit cleared to indicate that this iteration
148 | 			// has progressed B (the haystack).
149 | 			bPrev = bKey;
150 | 			bKey = keys_shared[++bBegin];
151 | 		}
152 | 	}
153 | 	return make_int3(decisions, matchCountA, matchCountB);
154 | }
155 | 
156 | ////////////////////////////////////////////////////////////////////////////////
157 | // CTASortedSearch
158 | // Take keys in shared memory and return indices and b-match flags in shared
159 | // memory.
160 | // NOTE: This function doesn't do any strided-to-thread order transposes so
161 | // using an even number of values per thread will incur no additional bank
162 | // conflicts.
163 | 
164 | template<int NT, int VT, MgpuBounds Bounds, bool IndexA, bool MatchA,
165 | 	bool IndexB, bool MatchB, typename T, typename Comp>
166 | MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount,
167 | 	int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended,
168 | 	int tid, int* indices_shared, Comp comp) {
169 | 
170 | 	// Run a merge path to find the start of the serial search for each thread.
171 | 	int diag = VT * tid;
172 | 	int mp = MergePath<Bounds>(keys_shared + aStart, aCount,
173 | 		keys_shared + bStart, bCount, diag, comp);
174 | 	int a0tid = mp;
175 | 	int b0tid = diag - mp;
176 | 
177 | 	// Serial search into register.
178 | 	int3 results;
179 | 	int indices[VT];
180 | 	if(extended)
181 | 		results = DeviceSerialSearch<VT, Bounds, false, IndexA, MatchA, IndexB,
182 | 			MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
183 | 			a0 - aStart, b0 - bStart, indices, comp);
184 | 	else
185 | 		results = DeviceSerialSearch<VT, Bounds, true, IndexA, MatchA, IndexB,
186 | 			MatchB>(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd,
187 | 			a0 - aStart, b0 - bStart, indices, comp);
188 | 	__syncthreads();
189 | 
190 | 	// Compact the indices into shared memory. Use the decision bits (set is A,
191 | 	// cleared is B) to select the destination.
192 | 	int decisions = results.x;
193 | 	b0tid += aCount;
194 | 	#pragma unroll
195 | 	for(int i = 0; i < VT; ++i) {
196 | 		if((1<< i) & decisions) {
197 | 			if(IndexA || MatchA) indices_shared[a0tid++] = indices[i];
198 | 		} else {
199 | 			if(IndexB || MatchB) indices_shared[b0tid++] = indices[i];
200 | 		}
201 | 	}
202 | 	__syncthreads();
203 | 
204 | 	// Return the match counts for A and B keys.
205 | 	return make_int2(results.y, results.z);
206 | }
207 | 
208 | } // namespace mgpu
209 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/devicetypes.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  * 
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  * 
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #if __CUDA_ARCH__ == 100
 38 | 	#error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!"
 39 | #endif 
 40 | 
 41 | #include <climits>
 42 | #include "../util/static.h"
 43 | 
 44 | #ifdef _MSC_VER
 45 | #define INLINESYMBOL __forceinline__
 46 | #else
 47 | #define INLINESYMBOL inline
 48 | #endif
 49 | 
 50 | namespace mgpu {
 51 | 
 52 | #define MGPU_HOST __host__ INLINESYMBOL
 53 | #define MGPU_DEVICE __device__ INLINESYMBOL
 54 | #define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL
 55 | 
 56 | const int WARP_SIZE = 32;
 57 | const int LOG_WARP_SIZE = 5;
 58 | 
 59 | ////////////////////////////////////////////////////////////////////////////////
 60 | // Device-side comparison operators
 61 | 
 62 | template<typename T>
 63 | struct less : public std::binary_function<T, T, bool> {
 64 | 	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; }
 65 | };
 66 | template<typename T>
 67 | struct less_equal : public std::binary_function<T, T, bool> {
 68 | 	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; }
 69 | };
 70 | template<typename T>
 71 | struct greater : public std::binary_function<T, T, bool> {
 72 | 	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; }
 73 | };
 74 | template<typename T>
 75 | struct greater_equal : public std::binary_function<T, T, bool> {
 76 | 	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; }
 77 | };
 78 | template<typename T>
 79 | struct equal_to : public std::binary_function<T, T, bool> {
 80 | 	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; }
 81 | };
 82 | template<typename T>
 83 | struct not_equal_to : public std::binary_function<T, T, bool> {
 84 | 	MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; }
 85 | };
 86 | 
 87 | ////////////////////////////////////////////////////////////////////////////////
 88 | // Device-side arithmetic operators
 89 | 
 90 | template<typename T>
 91 | struct plus : public std::binary_function<T, T, T> {
 92 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; }
 93 | };
 94 | 
 95 | template<typename T>
 96 | struct minus : public std::binary_function<T, T, T> {
 97 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; }
 98 | };
 99 | 
100 | template<typename T>
101 | struct multiplies : public std::binary_function<T, T, T> {
102 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; }
103 | };
104 | 
105 | template<typename T>
106 | struct modulus : public std::binary_function<T, T, T> {
107 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; }
108 | };
109 | 
110 | template<typename T>
111 | struct bit_or : public std::binary_function<T, T, T> {
112 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; }
113 | };
114 | 
115 | template<typename T>
116 | struct bit_and : public std::binary_function<T, T, T> {
117 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; }
118 | };
119 | 
120 | template<typename T>
121 | struct bit_xor : public std::binary_function<T, T, T> {
122 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; }
123 | };
124 | 
125 | template<typename T>
126 | struct maximum : public std::binary_function<T, T, T> {
127 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); }
128 | };
129 | 
130 | template<typename T>
131 | struct minimum : public std::binary_function<T, T, T> {
132 | 	MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); }
133 | };
134 | 
135 | ////////////////////////////////////////////////////////////////////////////////
136 | 
137 | template<typename T>
138 | MGPU_HOST_DEVICE void swap(T& a, T& b) {
139 | 	T c = a;
140 | 	a = b;
141 | 	b = c;
142 | }
143 | 
144 | template<typename T>
145 | struct DevicePair {
146 | 	T x, y;
147 | };
148 | 
149 | template<typename T>
150 | MGPU_HOST_DEVICE DevicePair<T> MakeDevicePair(T x, T y) {
151 | 	DevicePair<T> p = { x, y };
152 | 	return p;
153 | }
154 | 
155 | template<typename T> struct numeric_limits;
156 | template<> struct numeric_limits<int> {
157 | 	MGPU_HOST_DEVICE static int min() { return INT_MIN; }
158 | 	MGPU_HOST_DEVICE static int max() { return INT_MAX; }
159 | 	MGPU_HOST_DEVICE static int lowest() { return INT_MIN; }
160 | 	MGPU_HOST_DEVICE static int AddIdent() { return 0; }
161 | 	MGPU_HOST_DEVICE static int MulIdent() { return 1; }
162 | };
163 | template<> struct numeric_limits<long long> {
164 | 	MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; }
165 | 	MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; }
166 | 	MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; }
167 | 	MGPU_HOST_DEVICE static long long AddIdent() { return 0; }
168 | 	MGPU_HOST_DEVICE static long long MulIdent() { return 1; }
169 | };
170 | template<> struct numeric_limits<uint> {
171 | 	MGPU_HOST_DEVICE static uint min() { return 0; }
172 | 	MGPU_HOST_DEVICE static uint max() { return UINT_MAX; }
173 | 	MGPU_HOST_DEVICE static uint lowest() { return 0; }
174 | 	MGPU_HOST_DEVICE static uint AddIdent() { return 0; }
175 | 	MGPU_HOST_DEVICE static uint MulIdent() { return 1; }
176 | };
177 | template<> struct numeric_limits<unsigned long long> {
178 | 	MGPU_HOST_DEVICE static unsigned long long min() { return 0; }
179 | 	MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; }
180 | 	MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; }
181 | 	MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; }
182 | 	MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; }
183 | };
184 | template<> struct numeric_limits<float> {
185 | 	MGPU_HOST_DEVICE static float min() { return FLT_MIN; }
186 | 	MGPU_HOST_DEVICE static float max() { return FLT_MAX; }
187 | 	MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; }
188 | 	MGPU_HOST_DEVICE static float AddIdent() { return 0; }
189 | 	MGPU_HOST_DEVICE static float MulIdent() { return 1; }
190 | };
191 | template<> struct numeric_limits<double> {
192 | 	MGPU_HOST_DEVICE static double min() { return DBL_MIN; }
193 | 	MGPU_HOST_DEVICE static double max() { return DBL_MAX; }
194 | 	MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; }
195 | 	MGPU_HOST_DEVICE static double AddIdent() { return 0; }
196 | 	MGPU_HOST_DEVICE static double MulIdent() { return 1; }
197 | };
198 | 
199 | 
200 | MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) {
201 | 	return make_int2(a.x + b.x, a.y + b.y); 
202 | }
203 | MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) {
204 | 	a = a + b;
205 | 	return a;
206 | }
207 | MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) {
208 | 	return make_int2(a.x * b.x, a.y * b.y);
209 | }
210 | MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) {
211 | 	a = a * b;
212 | 	return a;
213 | }
214 | 
215 | template<typename T>
216 | MGPU_HOST_DEVICE T max(T a, T b) {
217 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
218 | 	return std::max(a, b);
219 | #else
220 | 	return (a < b) ? b : a;
221 | #endif
222 | }
223 | template<typename T>
224 | MGPU_HOST_DEVICE T min(T a, T b) {
225 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100)
226 | 	return std::min(a, b);
227 | #else
228 | 	return (b < a) ? b : a;
229 | #endif
230 | }
231 | 
232 | MGPU_HOST_DEVICE int2 max(int2 a, int2 b) {
233 | 	return make_int2(max(a.x, b.x), max(a.y, b.y));
234 | }
235 | 
236 | MGPU_HOST_DEVICE int2 min(int2 a, int2 b) {
237 | 	return make_int2(min(a.x, b.x), min(a.y, b.y));
238 | }
239 | 
240 | template<> struct numeric_limits<int2> {
241 | 	MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); }
242 | 	MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); }
243 | 	MGPU_HOST_DEVICE static int2 lowest() { 
244 | 		return make_int2(INT_MIN, INT_MIN); 
245 | 	}
246 | 	MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); }
247 | 	MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); }
248 | };
249 | 
250 | template<typename T>
251 | class constant_iterator : public std::iterator_traits<const T*> {
252 | public:
253 | 	MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { }
254 | 
255 | 	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const { 
256 | 		return _value;
257 | 	}
258 | 	MGPU_HOST_DEVICE T operator*() const {
259 | 		return _value;
260 | 	}
261 | 	MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const {
262 | 		return constant_iterator(_value);
263 | 	}
264 | 	MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const {
265 | 		return constant_iterator(_value);
266 | 	}
267 | 	MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) {
268 | 		return *this;
269 | 	}
270 | 	MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) {
271 | 		return *this;
272 | 	}
273 | private:
274 | 	T _value;
275 | };
276 | 
277 | template<typename T>
278 | class counting_iterator : public std::iterator_traits<const T*> {
279 | public:
280 | 	MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { }
281 | 
282 | 	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) { 
283 | 		return _value + i;
284 | 	}
285 | 	MGPU_HOST_DEVICE T operator*() {
286 | 		return _value;
287 | 	}
288 | 	MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) {
289 | 		return counting_iterator(_value + diff);
290 | 	}
291 | 	MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) {
292 | 		return counting_iterator(_value - diff);
293 | 	}
294 | 	MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) {
295 | 		_value += diff;
296 | 		return *this;
297 | 	}
298 | 	MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) {
299 | 		_value -= diff;
300 | 		return *this;
301 | 	}
302 | private:
303 | 	T _value;
304 | };
305 | 
306 | template<typename T>
307 | class step_iterator : public std::iterator_traits<const T*> {
308 | public:
309 | 	MGPU_HOST_DEVICE step_iterator(T base, T step) :
310 | 		_base(base), _step(step), _offset(0) { }
311 | 
312 | 	MGPU_HOST_DEVICE T operator[](ptrdiff_t i) { 
313 | 		return _base + (_offset + i) * _step; 
314 | 	}
315 | 	MGPU_HOST_DEVICE T operator*() { 
316 | 		return _base + _offset * _step; 
317 | 	} 
318 | 	MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) {
319 | 		step_iterator it = *this;
320 | 		it._offset += diff;
321 | 		return it;
322 | 	}
323 | 	MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) {
324 | 		step_iterator it = *this;
325 | 		it._offset -= diff;
326 | 		return it;
327 | 	}
328 | 	MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) { 
329 | 		_offset += diff;
330 | 		return *this;
331 | 	}
332 | 	MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) { 
333 | 		_offset -= diff;
334 | 		return *this;
335 | 	}
336 | private:
337 | 	ptrdiff_t _offset;
338 | 	T _base, _step;	
339 | };
340 | 
341 | } // namespace mgpu
342 | 
343 | 
344 | template<typename T>
345 | MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator+(ptrdiff_t diff,
346 | 	mgpu::counting_iterator<T> it) {
347 | 	return it + diff;
348 | }
349 | template<typename T>
350 | MGPU_HOST_DEVICE mgpu::counting_iterator<T> operator-(ptrdiff_t diff,
351 | 	mgpu::counting_iterator<T> it) {
352 | 	return it + (-diff);
353 | }
354 | template<typename T>
355 | MGPU_HOST_DEVICE mgpu::step_iterator<T> operator+(ptrdiff_t diff, 
356 | 	mgpu::step_iterator<T> it) {
357 | 	return it + diff;
358 | }
359 | template<typename T>
360 | MGPU_HOST_DEVICE mgpu::step_iterator<T> operator-(ptrdiff_t diff, 
361 | 	mgpu::step_iterator<T> it) {
362 | 	return it + (-diff);
363 | }
364 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/deviceutil.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "intrinsics.cuh"
 38 | 
 39 | namespace mgpu {
 40 | 
 41 | // Get the difference between two pointers in bytes.
 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) {
 43 | 	return (const byte*)b - (const byte*)a;
 44 | }
 45 | 
 46 | // Offset a pointer by i bytes.
 47 | template<typename T>
 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) {
 49 | 	return (const T*)((const byte*)p + i);
 50 | }
 51 | template<typename T>
 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) {
 53 | 	return (T*)((byte*)p + i);
 54 | }
 55 | 
 56 | ////////////////////////////////////////////////////////////////////////////////
 57 | // Task range support
 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs.
 59 | 
 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) {
 61 | 	div_t d = div(numItems, numWorkers);
 62 | 	return make_int2(d.quot, d.rem);
 63 | }
 64 | 
 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) {
 66 | 	int2 range;
 67 | 	range.x = task.x * block;
 68 | 	range.x += min(block, task.y);
 69 | 	range.y = range.x + task.x + (block < task.y);
 70 | 	return range;
 71 | }
 72 | 
 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize,
 74 | 	int count) {
 75 | 	int2 range = ComputeTaskRange(block, task);
 76 | 	range.x *= blockSize;
 77 | 	range.y = min(count, range.y * blockSize);
 78 | 	return range;
 79 | }
 80 | 
 81 | ////////////////////////////////////////////////////////////////////////////////
 82 | // DeviceExtractHeadFlags
 83 | // Input array flags is a bit array with 32 head flags per word.
 84 | // ExtractThreadHeadFlags returns numBits flags starting at bit index.
 85 | 
 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index,
 87 | 	int numBits) {
 88 | 
 89 | 	int index2 = index>> 5;
 90 | 	int shift = 31 & index;
 91 | 	uint headFlags = flags[index2]>> shift;
 92 | 	int shifted = 32 - shift;
 93 | 
 94 | 	if(shifted < numBits)
 95 | 		// We also need to shift in the next set of bits.
 96 | 		headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift);
 97 | 	headFlags &= (1<< numBits) - 1;
 98 | 	return headFlags;
 99 | }
100 | 
101 | ////////////////////////////////////////////////////////////////////////////////
102 | // DevicePackHeadFlags
103 | // Pack VT bits per thread at 32 bits/thread. Will consume an integer number of
104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads
105 | // return packed words.
106 | 
107 | template<int NT, int VT>
108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid,
109 | 	uint* flags_shared) {
110 | 
111 | 	const int WordCount = NT * VT / 32;
112 | 
113 | 	// Each thread stores its thread bits to flags_shared[tid].
114 | 	flags_shared[tid] = threadBits;
115 | 	__syncthreads();
116 | 
117 | 	uint packed = 0;
118 | 	if(tid < WordCount) {
119 | 		const int Items = MGPU_DIV_UP(32, VT);
120 | 		int index = 32 * tid;
121 | 		int first = index / VT;
122 | 		int bit = 0;
123 | 
124 | 		int rem = index - VT * first;
125 | 		packed = flags_shared[first]>> rem;
126 | 		bit = VT - rem;
127 | 		++first;
128 | 
129 | 		#pragma unroll
130 | 		for(int i = 0; i < Items; ++i) {
131 | 			if(i < Items - 1 || bit < 32) {
132 | 				uint x = flags_shared[first + i];
133 | 				if(bit < 32) packed |= x<< bit;
134 | 				bit += VT;
135 | 			}
136 | 		}
137 | 	}
138 | 	__syncthreads();
139 | 
140 | 	return packed;
141 | }
142 | 
143 | } // namespace mgpu
144 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/intrinsics.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #include "devicetypes.cuh"
 36 | 
 37 | #pragma once
 38 | 
 39 | #pragma GCC diagnostic push
 40 | #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 41 | 
 42 | namespace mgpu {
 43 | 
 44 | MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
 45 | 	return *reinterpret_cast<uint2*>(&x);
 46 | }
 47 | MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
 48 | 	return *reinterpret_cast<uint64*>(&x);
 49 | }
 50 | 
 51 | MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
 52 | 	return *reinterpret_cast<int2*>(&x);
 53 | }
 54 | MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
 55 | 	return *reinterpret_cast<int64*>(&x);
 56 | }
 57 | 
 58 | MGPU_HOST_DEVICE int2 double_as_int2(double x) {
 59 | 	return *reinterpret_cast<int2*>(&x);
 60 | }
 61 | MGPU_HOST_DEVICE double int2_as_double(int2 x) {
 62 | 	return *reinterpret_cast<double*>(&x);
 63 | }
 64 | 
 65 | MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
 66 | 	reinterpret_cast<int*>(&d)[0] = x;
 67 | }
 68 | MGPU_HOST_DEVICE int GetDoubleX(double d) {
 69 | 	return double_as_int2(d).x;
 70 | }
 71 | MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
 72 | 	reinterpret_cast<int*>(&d)[1] = y;
 73 | }
 74 | MGPU_HOST_DEVICE int GetDoubleY(double d) {
 75 | 	return double_as_int2(d).y;
 76 | }
 77 | 
 78 | 
 79 | ////////////////////////////////////////////////////////////////////////////////
 80 | // PTX for bfe and bfi
 81 | 
 82 | #if __CUDA_ARCH__ >= 200
 83 | 
 84 | MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
 85 | 	uint result;
 86 | 	asm("bfe.u32 %0, %1, %2, %3;" :
 87 | 		"=r"(result) : "r"(x), "r"(bit), "r"(numBits));
 88 | 	return result;
 89 | }
 90 | 
 91 | 
 92 | MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
 93 | 	uint result;
 94 | 	asm("bfi.b32 %0, %1, %2, %3, %4;" :
 95 | 		"=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
 96 | 	return result;
 97 | }
 98 | 
 99 | MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
100 | 	uint ret;
101 | 	asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
102 | 	return ret;
103 | }
104 | 
105 | #endif // __CUDA_ARCH__ >= 200
106 | 
107 | 
108 | ////////////////////////////////////////////////////////////////////////////////
109 | // shfl_up
110 | 
111 | __device__ __forceinline__ float shfl_up(float var,
112 | 	unsigned int delta, int width = 32) {
113 | 
114 | #if __CUDA_ARCH__ >= 300
115 | 	var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
116 | #endif
117 | 	return var;
118 | }
119 | 
120 | __device__ __forceinline__ double shfl_up(double var,
121 | 	unsigned int delta, int width = 32) {
122 | 
123 | #if __CUDA_ARCH__ >= 300
124 | 	int2 p = mgpu::double_as_int2(var);
125 | 	p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
126 | 	p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
127 | 	var = mgpu::int2_as_double(p);
128 | #endif
129 | 
130 | 	return var;
131 | }
132 | 
133 | ////////////////////////////////////////////////////////////////////////////////
134 | // shfl_add
135 | 
136 | MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
137 | 	int result = 0;
138 | #if __CUDA_ARCH__ >= 300
139 | 	int mask = (WARP_SIZE - width)<< 8;
140 | 	asm(
141 | 		"{.reg .s32 r0;"
142 | 		".reg .pred p;"
143 | 		"shfl.up.sync.b32 r0|p, %1, %2, %3, %4;"
144 | 		"@p add.s32 r0, r0, %4;"
145 | 		"mov.s32 %0, r0; }"
146 | 		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
147 | #endif
148 | 	return result;
149 | }
150 | 
151 | MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) {
152 | 	int result = 0;
153 | #if __CUDA_ARCH__ >= 300
154 | 	int mask = (WARP_SIZE - width)<< 8;
155 | 	asm(
156 | 		"{.reg .s32 r0;"
157 | 		".reg .pred p;"
158 | 		"shfl.up.sync..b32 r0|p, %1, %2, %3, %4;"
159 | 		"@p max.s32 r0, r0, %4;"
160 | 		"mov.s32 %0, r0; }"
161 | 		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
162 | #endif
163 | 	return result;
164 | }
165 | 
166 | ////////////////////////////////////////////////////////////////////////////////
167 | // brev, popc, clz, bfe, bfi, prmt
168 | 
169 | // Reverse the bits in an integer.
170 | MGPU_HOST_DEVICE uint brev(uint x) {
171 | #if __CUDA_ARCH__ >= 200
172 | 	uint y = __brev(x);
173 | #else
174 | 	uint y = 0;
175 | 	for(int i = 0; i < 32; ++i)
176 | 		y |= (1 & (x>> i))<< (31 - i);
177 | #endif
178 | 	return y;
179 | }
180 | 
181 | // Count number of bits in a register.
182 | MGPU_HOST_DEVICE int popc(uint x) {
183 | #if __CUDA_ARCH__ >= 200
184 | 	return __popc(x);
185 | #else
186 | 	int c;
187 | 	for(c = 0; x; ++c)
188 | 		x &= x - 1;
189 | 	return c;
190 | #endif
191 | }
192 | 
193 | // Count leading zeros - start from most significant bit.
194 | MGPU_HOST_DEVICE int clz(int x) {
195 | #if __CUDA_ARCH__ >= 200
196 | 	return __clz(x);
197 | #else
198 | 	for(int i = 31; i >= 0; --i)
199 | 		if((1<< i) & x) return 31 - i;
200 | 	return 32;
201 | #endif
202 | }
203 | 
204 | // Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
205 | MGPU_HOST_DEVICE int ffs(int x) {
206 | #if __CUDA_ARCH__ >= 200
207 | 	return __ffs(x);
208 | #else
209 | 	for(int i = 0; i < 32; ++i)
210 | 		if((1<< i) & x) return i + 1;
211 | 	return 0;
212 | #endif
213 | }
214 | 
215 | MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) {
216 | #if __CUDA_ARCH__ >= 200
217 | 	return bfe_ptx(x, bit, numBits);
218 | #else
219 | 	return ((1<< numBits) - 1) & (x>> bit);
220 | #endif
221 | }
222 | 
223 | MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) {
224 | 	uint result;
225 | #if __CUDA_ARCH__ >= 200
226 | 	result = bfi_ptx(x, y, bit, numBits);
227 | #else
228 | 	if(bit + numBits > 32) numBits = 32 - bit;
229 | 	uint mask = ((1<< numBits) - 1)<< bit;
230 | 	result = y & ~mask;
231 | 	result |= mask & (x<< bit);
232 | #endif
233 | 	return result;
234 | }
235 | 
236 | MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) {
237 | 	uint result;
238 | #if __CUDA_ARCH__ >= 200
239 | 	result = prmt_ptx(a, b, index);
240 | #else
241 | 	result = 0;
242 | 	for(int i = 0; i < 4; ++i) {
243 | 		uint sel = 0xf & (index>> (4 * i));
244 | 		uint x = ((7 & sel) > 3) ? b : a;
245 | 		x = 0xff & (x>> (8 * (3 & sel)));
246 | 		if(8 & sel) x = (128 & x) ? 0xff : 0;
247 | 		result |= x<< (8 * i);
248 | 	}
249 | #endif
250 | 	return result;
251 | }
252 | 
253 | // Find log2(x) and optionally round up to the next integer logarithm.
254 | MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) {
255 | 	int a = 31 - clz(x);
256 | 	if(roundUp) a += !MGPU_IS_POW_2(x);
257 | 	return a;
258 | }
259 | 
260 | ////////////////////////////////////////////////////////////////////////////////
261 | // vset4
262 | 
263 | #if __CUDA_ARCH__ >= 300
264 | 
265 | // Performs four byte-wise comparisons and returns 1 for each byte that
266 | // satisfies the conditional, and zero otherwise.
267 | MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
268 | 	uint result;
269 | 	asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
270 | 		"=r"(result) : "r"(a), "r"(b), "r"(c));
271 | 	return result;
272 | }
273 | MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
274 | 	uint result;
275 | 	asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
276 | 		"=r"(result) : "r"(a), "r"(b), "r"(0));
277 | 	return result;
278 | }
279 | #endif // __CUDA_ARCH__ >= 300
280 | 
281 | MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
282 | 	uint result;
283 | #if __CUDA_ARCH__ >= 300
284 | 	result = vset4_lt_add_ptx(a, b, c);
285 | #else
286 | 	result = c;
287 | 	if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
288 | 	if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
289 | 	if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
290 | 	if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
291 | #endif
292 | 	return result;
293 | }
294 | 
295 | MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
296 | 	uint result;
297 | #if __CUDA_ARCH__ >= 300
298 | 	result = vset4_eq_ptx(a, b);
299 | #else
300 | 	result = 0;
301 | 	if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
302 | 	if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
303 | 	if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
304 | 	if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
305 | #endif
306 | 	return result;
307 | }
308 | 
309 | ////////////////////////////////////////////////////////////////////////////////
310 | //
311 | 
312 | MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
313 | #if __CUDA_ARCH__ >= 100
314 | 	return __umulhi(x, y);
315 | #else
316 | 	uint64 product = (uint64)x * y;
317 | 	return (uint)(product>> 32);
318 | #endif
319 | }
320 | 
321 | ////////////////////////////////////////////////////////////////////////////////
322 | // ldg() function defined for all devices and all types. Only compiles to __ldg
323 | // intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
324 | // by __ldg in sm_32_intrinsics.h
325 | 
326 | template<typename T>
327 | struct IsLdgType {
328 | 	enum { value = false };
329 | };
330 | #define DEFINE_LDG_TYPE(T) \
331 | 	template<> struct IsLdgType<T> { enum { value = true }; };
332 | 
333 | template<typename T, bool UseLDG = IsLdgType<T>::value>
334 | struct LdgShim {
335 | 	MGPU_DEVICE static T Ldg(const T* p) {
336 | 		return *p;
337 | 	}
338 | };
339 | 
340 | #if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
341 | 
342 | 	// List of __ldg-compatible types from sm_32_intrinsics.h.
343 | 	DEFINE_LDG_TYPE(char)
344 | 	DEFINE_LDG_TYPE(short)
345 | 	DEFINE_LDG_TYPE(int)
346 | 	DEFINE_LDG_TYPE(long long)
347 | 	DEFINE_LDG_TYPE(char2)
348 | 	DEFINE_LDG_TYPE(char4)
349 | 	DEFINE_LDG_TYPE(short2)
350 | 	DEFINE_LDG_TYPE(short4)
351 | 	DEFINE_LDG_TYPE(int2)
352 | 	DEFINE_LDG_TYPE(int4)
353 | 	DEFINE_LDG_TYPE(longlong2)
354 | 
355 | 	DEFINE_LDG_TYPE(unsigned char)
356 | 	DEFINE_LDG_TYPE(unsigned short)
357 | 	DEFINE_LDG_TYPE(unsigned int)
358 | 	DEFINE_LDG_TYPE(unsigned long long)
359 | 	DEFINE_LDG_TYPE(uchar2)
360 | 	DEFINE_LDG_TYPE(uchar4)
361 | 	DEFINE_LDG_TYPE(ushort2)
362 | 	DEFINE_LDG_TYPE(ushort4)
363 | 	DEFINE_LDG_TYPE(uint2)
364 | 	DEFINE_LDG_TYPE(uint4)
365 | 	DEFINE_LDG_TYPE(ulonglong2)
366 | 
367 | 	DEFINE_LDG_TYPE(float)
368 | 	DEFINE_LDG_TYPE(double)
369 | 	DEFINE_LDG_TYPE(float2)
370 | 	DEFINE_LDG_TYPE(float4)
371 | 	DEFINE_LDG_TYPE(double2)
372 | 
373 | 	template<typename T> struct LdgShim<T, true> {
374 | 		MGPU_DEVICE static T Ldg(const T* p) {
375 | 			return __ldg(p);
376 | 		}
377 | 	};
378 | #endif
379 | 
380 | template<typename T>
381 | MGPU_DEVICE T ldg(const T* p) {
382 | 	return LdgShim<T>::Ldg(p);
383 | }
384 | 
385 | ////////////////////////////////////////////////////////////////////////////////
386 | 
387 | // Fast division for 31-bit integers.
388 | // Uses the method in Hacker's Delight (2nd edition) page 228.
389 | // Evaluates for denom > 1 and x < 2^31.
390 | struct FastDivide {
391 | 	uint denom;
392 | 	uint coef;
393 | 	uint shift;
394 | 
395 | 	MGPU_HOST_DEVICE uint Divide(uint x) {
396 | 		return umulhi(x, coef)>> shift;
397 | 	}
398 | 	MGPU_HOST_DEVICE uint Modulus(uint x) {
399 | 		return x - Divide(x) * denom;
400 | 	}
401 | 
402 | 	explicit FastDivide(uint denom_) {
403 | 		denom = denom_;
404 | 		uint p = 31 + FindLog2(denom, true);
405 | 		coef = (uint)(((1ull<< p) + denom - 1) / denom);
406 | 		shift = p - 32;
407 | 	}
408 | };
409 | 
410 | #pragma GCC diagnostic pop
411 | 
412 | } // namespace mgpu
413 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/serialsets.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "deviceutil.cuh"
 38 | 
 39 | namespace mgpu {
 40 | 
 41 | ////////////////////////////////////////////////////////////////////////////////
 42 | // SerialSetIntersection
 43 | // Emit A if A and B are in range and equal.
 44 | 
 45 | template<int VT, bool RangeCheck, typename T, typename Comp>
 46 | MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd,
 47 | 	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
 48 | 
 49 | 	const int MinIterations = VT / 2;
 50 | 	int commit = 0;
 51 | 
 52 | 	#pragma unroll
 53 | 	for(int i = 0; i < VT; ++i) {
 54 | 		bool test = RangeCheck ?
 55 | 			((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) :
 56 | 			(i < MinIterations || (aBegin + bBegin < end));
 57 | 
 58 | 		if(test) {
 59 | 			T aKey = data[aBegin];
 60 | 			T bKey = data[bBegin];
 61 | 
 62 | 			bool pA = comp(aKey, bKey);
 63 | 			bool pB = comp(bKey, aKey);
 64 | 
 65 | 			// The outputs must come from A by definition of set interection.
 66 | 			results[i] = aKey;
 67 | 			indices[i] = aBegin;
 68 | 
 69 | 			if(!pB) ++aBegin;
 70 | 			if(!pA) ++bBegin;
 71 | 			if(pA == pB) commit |= 1<< i;
 72 | 		}
 73 | 	}
 74 | 	return commit;
 75 | }
 76 | 
 77 | ////////////////////////////////////////////////////////////////////////////////
 78 | // SerialSetUnion
 79 | // Emit A if A <= B. Emit B if B < A.
 80 | 
 81 | template<int VT, bool RangeCheck, typename T, typename Comp>
 82 | MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd,
 83 | 	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
 84 | 
 85 | 	const int MinIterations = VT / 2;
 86 | 	int commit = 0;
 87 | 
 88 | 	#pragma unroll
 89 | 	for(int i = 0; i < VT; ++i) {
 90 | 		bool test = RangeCheck ?
 91 | 			(aBegin + bBegin < end) :
 92 | 			(i < MinIterations || (aBegin + bBegin < end));
 93 | 
 94 | 		if(test) {
 95 | 			T aKey = data[aBegin];
 96 | 			T bKey = data[bBegin];
 97 | 
 98 | 			bool pA = false, pB = false;
 99 | 			if(RangeCheck && aBegin >= aEnd)
100 | 				pB = true;
101 | 			else if(RangeCheck && bBegin >= bEnd)
102 | 				pA = true;
103 | 			else {
104 | 				// Both are in range.
105 | 				pA = comp(aKey, bKey);
106 | 				pB = comp(bKey, aKey);
107 | 			}
108 | 
109 | 			// Output A in case of a tie, so check if b < a.
110 | 			results[i] = pB ? bKey : aKey;
111 | 			indices[i] = pB ? bBegin : aBegin;
112 | 			if(!pB) ++aBegin;
113 | 			if(!pA) ++bBegin;
114 | 			commit |= 1<< i;
115 | 		}
116 | 	}
117 | 	return commit;
118 | }
119 | 
120 | ////////////////////////////////////////////////////////////////////////////////
121 | // SerialSetDifference
122 | // Emit A if A < B.
123 | 
124 | template<int VT, bool RangeCheck, typename T, typename Comp>
125 | MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd,
126 | 	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
127 | 
128 | 	const int MinIterations = VT / 2;
129 | 	int commit = 0;
130 | 
131 | 	#pragma unroll
132 | 	for(int i = 0; i < VT; ++i) {
133 | 		bool test = RangeCheck ?
134 | 			(aBegin + bBegin < end) :
135 | 			(i < MinIterations || (aBegin + bBegin < end));
136 | 		if(test) {
137 | 			T aKey = data[aBegin];
138 | 			T bKey = data[bBegin];
139 | 
140 | 			bool pA = false, pB = false;
141 | 			if(RangeCheck && aBegin >= aEnd)
142 | 				pB = true;
143 | 			else if(RangeCheck && bBegin >= bEnd)
144 | 				pA = true;
145 | 			else {
146 | 				pA = comp(aKey, bKey);
147 | 				pB = comp(bKey, aKey);
148 | 			}
149 | 
150 | 			// The outputs must come from A by definition of set difference.
151 | 			results[i] = aKey;
152 | 			indices[i] = aBegin;
153 | 			if(!pB) ++aBegin;
154 | 			if(!pA) ++bBegin;
155 | 			if(pA) commit |= 1<< i;
156 | 		}
157 | 	}
158 | 	return commit;
159 | }
160 | 
161 | ////////////////////////////////////////////////////////////////////////////////
162 | // SerialSetSymDiff
163 | // Emit A if A < B and emit B if B < A.
164 | 
165 | template<int VT, bool RangeCheck, typename T, typename Comp>
166 | MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd,
167 | 	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {
168 | 
169 | 	const int MinIterations = VT / 2;
170 | 	int commit = 0;
171 | 
172 | 	#pragma unroll
173 | 	for(int i = 0; i < VT; ++i) {
174 | 		bool test = RangeCheck ?
175 | 			(aBegin + bBegin < end) :
176 | 			(i < MinIterations || (aBegin + bBegin < end));
177 | 		if(test) {
178 | 			T aKey = data[aBegin];
179 | 			T bKey = data[bBegin];
180 | 
181 | 			bool pA = false, pB = false;
182 | 			if(RangeCheck && (bBegin >= bEnd))
183 | 				pA = true;
184 | 			else if(RangeCheck && (aBegin >= aEnd))
185 | 				pB = true;
186 | 			else {
187 | 				pA = comp(aKey, bKey);
188 | 				pB = comp(bKey, aKey);
189 | 			}
190 | 
191 | 			results[i] = pA ? aKey : bKey;
192 | 			indices[i] = pA ? aBegin : bBegin;
193 | 			if(!pA) ++bBegin;
194 | 			if(!pB) ++aBegin;
195 | 			if(pA != pB) commit |= 1<< i;
196 | 		}
197 | 	}
198 | 	return commit;
199 | }
200 | 
201 | ////////////////////////////////////////////////////////////////////////////////
202 | // SerialSetOp
203 | // Uses the MgpuSetOp enum to statically select one of the four serial ops
204 | // above.
205 | 
206 | template<int VT, bool RangeCheck, MgpuSetOp Op, typename T, typename Comp>
207 | MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd,
208 | 	int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) {
209 | 
210 | 	int end = aBegin + bBegin + VT - star;
211 | 	if(RangeCheck) end = min(end, aEnd + bEnd);
212 | 	int commit;
213 | 	switch(Op) {
214 | 		case MgpuSetOpIntersection:
215 | 			commit = SerialSetIntersection<VT, RangeCheck>(data, aBegin,
216 | 				aEnd, bBegin, bEnd, end, results, indices, comp);
217 | 			break;
218 | 		case MgpuSetOpUnion:
219 | 			commit = SerialSetUnion<VT, RangeCheck>(data, aBegin, aEnd,
220 | 				bBegin, bEnd, end, results, indices, comp);
221 | 			break;
222 | 		case MgpuSetOpDiff:
223 | 			commit = SerialSetDifference<VT, RangeCheck>(data, aBegin, aEnd,
224 | 				bBegin, bEnd, end, results, indices, comp);
225 | 			break;
226 | 		case MgpuSetOpSymDiff:
227 | 			commit = SerialSetSymDiff<VT, RangeCheck>(data, aBegin, aEnd,
228 | 				bBegin, bEnd, end, results, indices, comp);
229 | 			break;
230 | 	}
231 | 	__syncthreads();
232 | 	return commit;
233 | }
234 | 
235 | } // namespace mgpu
236 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/device/sortnetwork.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "deviceutil.cuh"
 38 | 
 39 | namespace mgpu {
 40 | 
 41 | ////////////////////////////////////////////////////////////////////////////////
 42 | // Odd-even transposition sorting network. Sorts keys and values in-place in
 43 | // register.
 44 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
 45 | 
 46 | // CUDA Compiler does not currently unroll these loops correctly. Write using
 47 | // template loop unrolling.
 48 | /*
 49 | template<int VT, typename T, typename V, typename Comp>
 50 | MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) {
 51 | 	#pragma unroll
 52 | 	for(int level = 0; level < VT; ++level) {
 53 | 
 54 | 		#pragma unroll
 55 | 		for(int i = 1 & level; i < VT - 1; i += 2) {
 56 | 			if(comp(keys[i + 1], keys[i])) {
 57 | 				mgpu::swap(keys[i], keys[i + 1]);
 58 | 				mgpu::swap(values[i], values[i + 1]);
 59 | 			}
 60 | 		}
 61 | 	}
 62 | }*/
 63 | 
 64 | template<int I, int VT>
 65 | struct OddEvenTransposeSortT {
 66 | 	// Sort segments marked by head flags. If the head flag between i and i + 1
 67 | 	// is set (so that (2<< i) & flags is true), the values belong to different
 68 | 	// segments and are not swapped.
 69 | 	template<typename K, typename V, typename Comp>
 70 | 	static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) {
 71 | 		#pragma unroll
 72 | 		for(int i = 1 & I; i < VT - 1; i += 2)
 73 | 			if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) {
 74 | 				mgpu::swap(keys[i], keys[i + 1]);
 75 | 				mgpu::swap(values[i], values[i + 1]);
 76 | 			}
 77 | 		OddEvenTransposeSortT<I + 1, VT>::Sort(keys, values, flags, comp);
 78 | 	}
 79 | };
 80 | template<int I> struct OddEvenTransposeSortT<I, I> {
 81 | 	template<typename K, typename V, typename Comp>
 82 | 	static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { }
 83 | };
 84 | 
 85 | template<int VT, typename K, typename V, typename Comp>
 86 | MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) {
 87 | 	OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp);
 88 | }
 89 | template<int VT, typename K, typename V, typename Comp>
 90 | MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags,
 91 | 	Comp comp) {
 92 | 	OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp);
 93 | }
 94 | 
 95 | ////////////////////////////////////////////////////////////////////////////////
 96 | // Batcher Odd-Even Mergesort network
 97 | // Unstable but executes much faster than the transposition sort.
 98 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
 99 | 
100 | template<int Width, int Low, int Count>
101 | struct OddEvenMergesortT {
102 | 	template<typename K, typename V, typename Comp>
103 | 	MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags,
104 | 		int a, int b, Comp comp) {
105 | 		if(b < Count) {
106 | 			// Mask the bits between a and b. Any head flags in this interval
107 | 			// means the keys are in different segments and must not be swapped.
108 | 			const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1);
109 | 			if(!(Mask & flags) && comp(keys[b], keys[a])) {
110 | 				mgpu::swap(keys[b], keys[a]);
111 | 				mgpu::swap(values[b], values[a]);
112 | 			}
113 | 		}
114 | 	}
115 | 
116 | 	template<int R, int Low2, bool Recurse = 2 * R < Width>
117 | 	struct OddEvenMerge {
118 | 		template<typename K, typename V, typename Comp>
119 | 		MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
120 | 			Comp comp) {
121 | 			// Compare and swap
122 | 			const int M = 2 * R;
123 | 			OddEvenMerge<M, Low2>::Merge(keys, values, flags, comp);
124 | 			OddEvenMerge<M, Low2 + R>::Merge(keys, values, flags, comp);
125 | 
126 | 			#pragma unroll
127 | 			for(int i = Low2 + R; i + R < Low2 + Width; i += M)
128 | 				CompareAndSwap(keys, values, flags, i, i + R, comp);
129 | 		}
130 | 	};
131 | 	template<int R, int Low2>
132 | 	struct OddEvenMerge<R, Low2, false> {
133 | 		template<typename K, typename V, typename Comp>
134 | 		MGPU_DEVICE static void Merge(K* keys, V* values, int flags,
135 | 			Comp comp) {
136 | 			CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp);
137 | 		}
138 | 	};
139 | 
140 | 	template<typename K, typename V, typename Comp>
141 | 	MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
142 | 		Comp comp) {
143 | 
144 | 		const int M = Width / 2;
145 | 		OddEvenMergesortT<M, Low, Count>::Sort(keys, values, flags, comp);
146 | 		OddEvenMergesortT<M, Low + M, Count>::Sort(keys, values, flags, comp);
147 | 		OddEvenMerge<1, Low>::Merge(keys, values, flags, comp);
148 | 	}
149 | };
150 | template<int Low, int Count> struct OddEvenMergesortT<1, Low, Count> {
151 | 	template<typename K, typename V, typename Comp>
152 | 	MGPU_DEVICE static void Sort(K* keys, V* values, int flags,
153 | 		Comp comp) { }
154 | };
155 | 
156 | template<int VT, typename K, typename V, typename Comp>
157 | MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) {
158 | 	const int Width = 1<< sLogPow2<VT, true>::value;
159 | 	OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, 0, comp);
160 | }
161 | template<int VT, typename K, typename V, typename Comp>
162 | MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags,
163 | 	Comp comp) {
164 | 	const int Width = 1<< sLogPow2<VT, true>::value;
165 | 	OddEvenMergesortT<Width, 0, VT>::Sort(keys, values, flags, comp);
166 | }
167 | 
168 | } // namespace mgpu
169 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/mgpudevice.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "mgpuenums.h"
 38 | #include "device/deviceutil.cuh"
 39 | 
 40 | namespace mgpu {
 41 | 
 42 | ////////////////////////////////////////////////////////////////////////////////
 43 | // device/loadstore.cuh
 44 | 
 45 | // For 0 <= i < VT:
 46 | //		index = NT * i + tid;
 47 | //		reg[i] = data[index];
 48 | // Synchronize after load.
 49 | template<int NT, int VT, typename InputIt, typename T>
 50 | MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
 51 | 	bool sync = true);
 52 | 
 53 | // For 0 <= i < VT:
 54 | //		index = NT * i + tid;
 55 | //		if(index < count) reg[i] = data[index];
 56 | // No synchronize after load.
 57 | template<int NT, int VT, typename InputIt, typename T>
 58 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
 59 | 	T* reg, bool sync = false);
 60 | 
 61 | template<int NT, int VT, typename InputIt, typename T>
 62 | MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
 63 | 	T* reg, T init, bool sync = false);
 64 | 
 65 | // For 0 <= i < VT:
 66 | //		index = NT * i + tid;
 67 | //		if(index < count) reg[i] = data[index];
 68 | // No synchronize after load.
 69 | template<int NT, int VT0, int VT1, typename InputIt, typename T>
 70 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
 71 | 	T* reg, bool sync = false);
 72 | 
 73 | // For 0 <= i < VT:
 74 | //		index = NT * i + tid;
 75 | //		if(index < count) reg[i] = data[index];
 76 | // No synchronize after load.
 77 | template<int NT, int VT0, int VT1, typename InputIt, typename T>
 78 | MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
 79 | 	T* reg, T init, bool sync = false);
 80 | 
 81 | // For 0 <= i < VT:
 82 | //		index = NT * i + tid;
 83 | //		if(index < count) reg[i] = data[index];
 84 | // No synchronize after load.
 85 | // No optimized code path for count < NV (smaller generated code).
 86 | template<int NT, int VT, typename InputIt, typename T>
 87 | MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
 88 | 	T* reg, bool sync = false);
 89 | 
 90 | 
 91 | // For 0 <= i < VT:
 92 | //		index = VT * tid + i.
 93 | //		if(index < count) reg[i] = data[index];
 94 | // No synchronize after load.
 95 | template<int NT, int VT, typename InputIt, typename T>
 96 | MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
 97 | 	T* reg);
 98 | 
 99 | template<int NT, int VT, typename InputIt, typename T>
100 | MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
101 | 	T* reg, T init);
102 | 
103 | // For 0 <= i < VT:
104 | //		index = NT * i + tid;
105 | //		if(index < count) data[index] = reg[i];
106 | // Synchronize after load.
107 | template<int NT, int VT, typename OutputIt, typename T>
108 | MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
109 | 	bool sync = true);
110 | 
111 | // For 0 <= i < VT:
112 | //		index = NT * i + tid;
113 | //		if(index < count) data[index] = reg[i];
114 | // No synchronize after load.
115 | template<int NT, int VT, typename OutputIt, typename T>
116 | MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
117 | 	OutputIt dest, bool sync = false);
118 | 
119 | // For 0 <= index < count:
120 | //		dest[index] = source[index];
121 | // This function is intended to replace DeviceGlobalToShared in cases where
122 | // count is much less than NT * VT.
123 | template<int NT, typename InputIt, typename OutputIt>
124 | MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
125 | 	OutputIt dest, bool sync = true);
126 | 
127 | // For 0 <= index < count:
128 | //		dest[index] = source[index];
129 | // Synchronize after store.
130 | template<int NT, int VT, typename T, typename OutputIt>
131 | MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
132 | 	OutputIt dest, bool sync = true);
133 | 
134 | // For 0 <= index < count:
135 | //		dest[index] = source[index];
136 | // Synchronize after store.
137 | template<int NT, int VT, typename InputIt, typename T>
138 | MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
139 | 	T* dest, bool sync = true);
140 | 
141 | template<int NT, int VT0, int VT1, typename InputIt, typename T>
142 | MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
143 | 	T* dest, bool sync = true);
144 | 
145 | // For 0 <= index < count:
146 | //		dest[index] = source[index];
147 | // Synchronize after store.
148 | // No optimized code path for count < NV (smaller generated code).
149 | template<int NT, int VT, typename InputIt, typename T>
150 | MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
151 | 	T* dest, bool sync = true);
152 | 
153 | template<int NT, int VT, typename InputIt, typename T>
154 | MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
155 | 	T* dest, T init, bool sync = true);
156 | 
157 | template<int NT, int VT0, int VT1, typename InputIt, typename T>
158 | MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
159 | 	int tid, T* dest, T init, bool sync = true);
160 | 
161 | // For 0 <= index < count:
162 | //		dest[index] = source[index];
163 | // No synchronize.
164 | template<int NT, int VT, typename InputIt, typename OutputIt>
165 | MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
166 | 	OutputIt dest, bool sync = false);
167 | 
168 | // Transponse VT elements in NT threads (x) into thread-order registers (y)
169 | // using only NT * VT / 2 elements of shared memory.
170 | template<int NT, int VT, typename T>
171 | MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
172 | 
173 | // For 0 <= i < VT:
174 | //		index = NT * i + tid;
175 | //		if(index < count)
176 | //			gather = indices[index];
177 | //			reg[i] = data[gather];
178 | // Synchronize after load.
179 | template<int NT, int VT, typename InputIt, typename T>
180 | MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
181 | 	int tid, T* reg, bool sync = true);
182 | 
183 | template<int NT, int VT, typename InputIt, typename T>
184 | MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
185 | 	int tid, T* reg, T identity, bool sync = true);
186 | 
187 | // For 0 <= i < VT:
188 | //		index = NT * i + tid;
189 | //		if(index < count)
190 | //			scatter = indices[index];
191 | //			data[scatter] = reg[i];
192 | // Synchronize after store.
193 | template<int NT, int VT, typename T, typename OutputIt>
194 | MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
195 | 	int indices[VT], OutputIt data, bool sync = true);
196 | 
197 | // For 0 <= i < VT:
198 | //		shared[VT * tid + i] = threadReg[i];
199 | // Synchronize after store.
200 | // Note this function moves data in THREAD ORDER.
201 | // (DeviceRegToShared moves data in STRIDED ORDER).
202 | template<int VT, typename T>
203 | MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
204 | 	bool sync = true);
205 | 
206 | // For 0 <= i < VT:
207 | //		threadReg[i] = shared[VT * tid + i];
208 | // Synchronize after load.
209 | // Note this function moves data in THREAD ORDER.
210 | // (DeviceSharedToReg moves data in STRIDED ORDER).
211 | template<int VT, typename T>
212 | MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
213 | 	bool sync = true);
214 | 
215 | // For 0 <= index < aCount:
216 | //		shared[index] = a_global[index];
217 | // For 0 <= index < bCount:
218 | //		shared[aCount + index] = b_global[index];
219 | // VT0 is the lower-bound for predication-free execution:
220 | //		If count >= NT * VT0, a predication-free branch is taken.
221 | // VT1 is the upper-bound for loads:
222 | //		NT * VT1 must >= aCount + bCount.
223 | 
224 | template<int NT, int VT0, int VT1, typename T>
225 | MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
226 | 	const T* b_global, int bCount, int tid, T* reg, bool sync = false);
227 | 
228 | template<int NT, int VT0, int VT1, typename T>
229 | MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
230 | 	const T* b_global, int bCount, int tid, T* shared, bool sync = true);
231 | 
232 | template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
233 | 	typename T>
234 | MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
235 | 	InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
236 | 
237 | template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
238 | 	typename T>
239 | MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
240 | 	InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
241 | 
242 | // For 0 <= i < VT
243 | //		index = NT * i + tid;
244 | //		if(index < count)
245 | //			gather = indices_shared[index];
246 | //			dest_global[index] = data_global[gather];
247 | // Synchronize after load.
248 | template<int NT, int VT, typename InputIt, typename OutputIt>
249 | MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
250 | 	const int* indices_shared, int tid, OutputIt dest_global,
251 | 	bool sync = true);
252 | 
253 | // For 0 <= i < VT
254 | //		index = NT * i + tid
255 | //		if(index < count)
256 | //			gather = indices[index];
257 | //			if(gather < aCount) data = a_global[gather];
258 | //			else data = b_global[gather - aCount];
259 | //			dest_global[index] = data;
260 | // Synchronize after load.
261 | template<int NT, int VT, typename InputIt1, typename InputIt2,
262 | 	typename T>
263 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
264 | 	InputIt2 b_global, int bStart, const int* indices, int tid,
265 | 	T* reg, bool sync = false);
266 | 
267 | template<int NT, int VT, typename InputIt1, typename InputIt2,
268 | 	typename OutputIt>
269 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
270 | 	InputIt2 b_global, int bStart, const int* indices_shared, int tid,
271 | 	OutputIt dest_global, bool sync = true);
272 | 
273 | template<int NT, int VT, typename T>
274 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
275 | 	const T* b_global, int bStart, const int* indices, int tid,
276 | 	T* reg, bool sync = false);
277 | 
278 | template<int NT, int VT, typename T, typename OutputIt>
279 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
280 | 	const T* b_global, int bStart, const int* indices_shared, int tid,
281 | 	OutputIt dest_global, bool sync = true);
282 | 
283 | 
284 | 
285 | } // namespace mgpu
286 | 
287 | 
288 | #include "device/loadstore.cuh"
289 | #include "device/ctasegscan.cuh"
290 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/mgpuenums.h:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 3 |  * 
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  * 
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /******************************************************************************
29 |  *
30 |  * Code and text by Sean Baxter, NVIDIA Research
31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
32 |  *
33 |  ******************************************************************************/
34 | 
35 | #pragma once 
36 | 
37 | namespace mgpu {
38 | 
39 | enum MgpuBounds {
40 | 	MgpuBoundsLower,
41 | 	MgpuBoundsUpper
42 | };
43 | 
44 | enum MgpuScanType {
45 | 	MgpuScanTypeExc,
46 | 	MgpuScanTypeInc
47 | };
48 | 
49 | enum MgpuSearchType {
50 | 	MgpuSearchTypeNone,
51 | 	MgpuSearchTypeIndex,
52 | 	MgpuSearchTypeMatch,
53 | 	MgpuSearchTypeIndexMatch
54 | };
55 | 
56 | enum MgpuJoinKind {
57 | 	MgpuJoinKindInner,
58 | 	MgpuJoinKindLeft,
59 | 	MgpuJoinKindRight,
60 | 	MgpuJoinKindOuter
61 | };
62 | 
63 | enum MgpuSetOp {
64 | 	MgpuSetOpIntersection,
65 | 	MgpuSetOpUnion,
66 | 	MgpuSetOpDiff,
67 | 	MgpuSetOpSymDiff
68 | };
69 | 
70 | } // namespace mgpu
71 | 


--------------------------------------------------------------------------------
/include/contrib/moderngpu/include/util/static.h:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
  3 |  * 
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  * 
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /******************************************************************************
 29 |  *
 30 |  * Code and text by Sean Baxter, NVIDIA Research
 31 |  * See http://nvlabs.github.io/moderngpu for repository and documentation.
 32 |  *
 33 |  ******************************************************************************/
 34 | 
 35 | #pragma once
 36 | 
 37 | #include <functional>
 38 | #include <iterator>
 39 | #include <cfloat>
 40 | #include <typeinfo>
 41 | #include <vector>
 42 | #include <list>
 43 | #include <map>
 44 | #include <algorithm>
 45 | #include <cassert>
 46 | #include <memory>
 47 | #include <cmath>
 48 | #include <cstdio>
 49 | #include <cstdlib>
 50 | 
 51 | #ifndef MGPU_MIN
 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y))
 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y))
 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0)
 55 | #define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x))
 56 | 
 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y))
 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y))
 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y)
 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1))
 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1))
 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1)))
 64 | 
 65 | #endif // MGPU_MIN
 66 | 
 67 | namespace mgpu {
 68 | 
 69 | 
 70 | typedef unsigned char byte;
 71 | 
 72 | typedef unsigned int uint;
 73 | typedef signed short int16;
 74 | 
 75 | typedef unsigned short ushort;
 76 | typedef unsigned short uint16;
 77 | 
 78 | typedef long long int64;
 79 | typedef unsigned long long uint64;
 80 | 
 81 | // IsPow2<X>::value is true if X is a power of 2.
 82 | template<int X> struct sIsPow2 {
 83 | 	enum { value = 0 == (X & (X - 1)) };
 84 | };
 85 | 
 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2.
 87 | template<int X, bool roundUp = true> struct sLogPow2 { 
 88 | 	enum { extra = sIsPow2<X>::value ? 0 : (roundUp ? 1 : 0) };
 89 | 	enum { inner = sLogPow2<X / 2>::inner + 1 };
 90 | 	enum { value = inner + extra };
 91 | };
 92 | template<bool roundUp> struct sLogPow2<0, roundUp> {
 93 | 	enum { inner = 0 };
 94 | 	enum { value = 0 };
 95 | };
 96 | template<bool roundUp> struct sLogPow2<1, roundUp> { 
 97 | 	enum { inner = 0 };
 98 | 	enum { value = 0 };
 99 | };
100 | 
101 | template<int X, int Y>
102 | struct sDivUp {
103 | 	enum { value = (X + Y - 1) / Y };
104 | };
105 | 
106 | template<int count, int levels> struct sDiv2RoundUp {
107 | 	enum { value = sDiv2RoundUp<sDivUp<count, 2>::value, levels - 1>::value };
108 | };
109 | template<int count> struct sDiv2RoundUp<count, 0> {
110 | 	enum { value = count };
111 | };
112 | 
113 | template<int X, int Y>
114 | struct sDivSafe {
115 | 	enum { value = X / Y };
116 | };
117 | template<int X>
118 | struct sDivSafe<X, 0> {
119 | 	enum { value = 0 };
120 | };
121 | 
122 | template<int X, int Y>
123 | struct sRoundUp {
124 | 	enum { rem = X % Y };
125 | 	enum { value = X + (rem ? (Y - rem) : 0) };
126 | };
127 | 
128 | template<int X, int Y>
129 | struct sRoundDown {
130 | 	enum { rem = X % Y };
131 | 	enum { value = X - rem };
132 | };
133 | 
134 | // IntegerDiv is a template for avoiding divisions by zero in template 
135 | // evaluation. Templates always evaluate both b and c in an expression like
136 | // a ? b : c, and will error if either rhs contains an illegal expression,
137 | // even if the ternary is explictly designed to guard against that.
138 | template<int X, int Y>
139 | struct sIntegerDiv {
140 | 	enum { value = X / (Y ? Y : (X + 1)) };
141 | };
142 | 
143 | template<int X, int Y>
144 | struct sMax {
145 | 	enum { value = (X >= Y) ? X : Y };
146 | };
147 | template<int X, int Y>
148 | struct sMin {
149 | 	enum { value = (X <= Y) ? X : Y };
150 | };
151 | 
152 | template<int X>
153 | struct sAbs {
154 | 	enum { value = (X >= 0) ? X : -X };
155 | };
156 | 
157 | 
158 | // Finds the number of powers of 2 in the prime factorization of X.
159 | template<int X, int LSB = 1 & X> struct sNumFactorsOf2 {
160 | 	enum { shifted = X >> 1 };
161 | 	enum { value = 1 + sNumFactorsOf2<shifted>::value };
162 | };
163 | template<int X> struct sNumFactorsOf2<X, 1> {
164 | 	enum { value = 0 };
165 | };
166 | 
167 | // Returns the divisor for a conflict-free transpose.
168 | template<int X, int NumBanks = 32> struct sBankConflictDivisor {
169 | 	enum { value = 
170 | 		(1 & X) ? 0 : 
171 | 		(sIsPow2<X>::value ? NumBanks :
172 | 		(1<< sNumFactorsOf2<X>::value)) }; 
173 | 	enum { log_value = sLogPow2<value>::value };
174 | };
175 | 
176 | template<int NT, int X, int NumBanks = 32> struct sConflictFreeStorage {
177 | 	enum { count = NT * X };
178 | 	enum { divisor = sBankConflictDivisor<X, NumBanks>::value };
179 | 	enum { padding = sDivSafe<count, divisor>::value };
180 | 	enum { value = count + padding };
181 | };
182 | 
183 | } // namespace mgpu
184 | 


--------------------------------------------------------------------------------
/include/ctc.h:
--------------------------------------------------------------------------------
  1 | /** \file ctc.h
  2 |  * Contains a simple C interface to call fast CPU and GPU based computation
  3 |  * of the CTC loss.
  4 |  */
  5 | 
  6 | #pragma once
  7 | 
  8 | #ifdef __cplusplus
  9 | #include <cstddef>
 10 | extern "C" {
 11 | #endif
 12 | 
 13 | //forward declare of CUDA typedef to avoid needing to pull in CUDA headers
 14 | typedef struct CUstream_st* CUstream;
 15 | 
 16 | typedef enum {
 17 |     CTC_STATUS_SUCCESS = 0,
 18 |     CTC_STATUS_MEMOPS_FAILED = 1,
 19 |     CTC_STATUS_INVALID_VALUE = 2,
 20 |     CTC_STATUS_EXECUTION_FAILED = 3,
 21 |     CTC_STATUS_UNKNOWN_ERROR = 4
 22 | } ctcStatus_t;
 23 | 
 24 | /** Returns a single integer which specifies the API version of the warpctc library */
 25 | int get_warpctc_version();
 26 | 
 27 | /** Returns a string containing a description of status that was passed in
 28 |  *  \param[in] status identifies which string should be returned
 29 |  *  \return C style string containing the text description
 30 |  *  */
 31 | const char* ctcGetStatusString(ctcStatus_t status);
 32 | 
 33 | typedef enum {
 34 |     CTC_CPU = 0,
 35 |     CTC_GPU = 1
 36 | } ctcComputeLocation;
 37 | 
 38 | /** Structure used for options to the CTC compution.  Applications
 39 |  *  should zero out the array using memset and sizeof(struct
 40 |  *  ctcOptions) in C or default initialization (e.g. 'ctcOptions
 41 |  *  options{};' or 'auto options = ctcOptions{}') in C++ to ensure
 42 |  *  forward compatibility with added options. */
 43 | struct ctcOptions {
 44 |     /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU}
 45 |     ctcComputeLocation loc;
 46 |     union {
 47 |         /// used when loc == CTC_CPU, the maximum number of threads that can be used
 48 |         unsigned int num_threads;
 49 | 
 50 |         /// used when loc == CTC_GPU, which stream the kernels should be launched in
 51 |         CUstream stream;
 52 |     };
 53 | 
 54 |     /// the label value/index that the CTC calculation should use as the blank label
 55 |     int blank_label;
 56 | };
 57 | 
 58 | /** Compute the connectionist temporal classification loss between a sequence
 59 |  *  of probabilities and a ground truth labeling.  Optionally compute the
 60 |  *  gradient with respect to the inputs.
 61 |  * \param [in] activations pointer to the activations in either CPU or GPU
 62 |  *             addressable memory, depending on info.  We assume a fixed
 63 |  *             memory layout for this 3 dimensional tensor, which has dimension
 64 |  *             (t, n, p), where t is the time index, n is the minibatch index,
 65 |  *             and p indexes over probabilities of each symbol in the alphabet.
 66 |  *             The memory layout is (t, n, p) in C order (slowest to fastest changing
 67 |  *             index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest
 68 |  *             changing index, aka column-major). We also assume strides are equal to
 69 |  *             dimensions - there is no padding between dimensions.
 70 |  *             More precisely, element (t, n, p), for a problem with mini_batch examples
 71 |  *             in the mini batch, and alphabet_size symbols in the alphabet, is located at:
 72 |  *             activations[(t * mini_batch + n) * alphabet_size + p]
 73 |  * \param [out] gradients if not NULL, then gradients are computed.  Should be
 74 |  *              allocated in the same memory space as probs and memory
 75 |  *              ordering is identical.
 76 |  * \param [in]  flat_labels Always in CPU memory.  A concatenation
 77 |  *              of all the labels for the minibatch.
 78 |  * \param [in]  label_lengths Always in CPU memory. The length of each label
 79 |  *              for each example in the minibatch.
 80 |  * \param [in]  input_lengths Always in CPU memory.  The number of time steps
 81 |  *              for each sequence in the minibatch.
 82 |  * \param [in]  alphabet_size The number of possible output symbols.  There
 83 |  *              should be this many probabilities for each time step.
 84 |  * \param [in]  mini_batch How many examples in a minibatch.
 85 |  * \param [out] costs Always in CPU memory.  The cost of each example in the
 86 |  *              minibatch.
 87 |  * \param [in,out] workspace In same memory space as probs. Should be of
 88 |  *                 size requested by get_workspace_size.
 89 |  * \param [in]  options see struct ctcOptions
 90 |  *
 91 |  *  \return Status information
 92 |  *
 93 |  * */
 94 | ctcStatus_t compute_ctc_loss(const float* const activations,
 95 |                              float* gradients,
 96 |                              const int* const flat_labels,
 97 |                              const int* const label_lengths,
 98 |                              const int* const input_lengths,
 99 |                              int alphabet_size,
100 |                              int minibatch,
101 |                              float *costs,
102 |                              void *workspace,
103 |                              ctcOptions options);
104 | 
105 | 
106 | /** For a given set of labels and minibatch size return the required workspace
107 |  *  size.  This will need to be allocated in the same memory space as your
108 |  *  probabilities.
109 |  * \param [in]  label_lengths Always in CPU memory. The length of each label
110 |  *              for each example in the minibatch.
111 |  * \param [in]  input_lengths Always in CPU memory.  The number of time steps
112 |  *              for each sequence in the minibatch.
113 |  * \param [in]  alphabet_size How many symbols in the alphabet or, equivalently,
114 |  *              the number of probabilities at each time step
115 |  * \param [in]  mini_batch How many examples in a minibatch.
116 |  * \param [in]  info see struct ctcOptions
117 |  * \param [out] size_bytes is pointer to a scalar where the memory
118 |  *              requirement in bytes will be placed. This memory should be allocated
119 |  *              at the same place, CPU or GPU, that the probs are in
120 |  *
121 |  *  \return Status information
122 |  **/
123 | ctcStatus_t get_workspace_size(const int* const label_lengths,
124 |                                const int* const input_lengths,
125 |                                int alphabet_size, int minibatch,
126 |                                ctcOptions info,
127 |                                size_t* size_bytes);
128 | 
129 | #ifdef __cplusplus
130 | }
131 | #endif
132 | 


--------------------------------------------------------------------------------
/include/detail/ctc_helper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <limits>
 4 | #include <algorithm>
 5 | #include <cmath>
 6 | 
 7 | #include "hostdevice.h"
 8 | 
 9 | namespace ctc_helper {
10 | 
11 | static const float threshold = 1e-1;
12 | 
13 | template<typename T>
14 | HOSTDEVICE
15 | T neg_inf() { return -T(INFINITY); }
16 | 
17 | inline int div_up(int x, int y) {
18 |     return (x + y - 1) / y;
19 | }
20 | 
21 | template <typename Arg, typename Res = Arg> struct maximum {
22 |     HOSTDEVICE
23 |     Res operator()(const Arg& x, const Arg& y) const {
24 |         return x < y ? y : x;
25 |     }
26 | };
27 | 
28 | template <typename Arg, typename Res = Arg> struct add {
29 |     HOSTDEVICE
30 |     Res operator()(const Arg& x, const Arg& y) const {
31 |         return x + y;
32 |     }
33 | };
34 | 
35 | template <typename Arg, typename Res = Arg> struct identity {
36 |     HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);}
37 | };
38 | 
39 | template <typename Arg, typename Res = Arg> struct negate {
40 |     HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);}
41 | };
42 | 
43 | template <typename Arg, typename Res = Arg> struct exponential {
44 |     HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);}
45 | };
46 | 
47 | template<typename Arg1, typename Arg2 = Arg1, typename Res=Arg1>
48 | struct log_plus {
49 |     typedef Res result_type;
50 |     HOSTDEVICE
51 |     Res operator()(const Arg1& p1, const Arg2& p2) {
52 |         if (p1 == neg_inf<Arg1>())
53 |             return p2;
54 |         if (p2 == neg_inf<Arg2>())
55 |             return p1;
56 |         Res result = log1p(exp(-fabs(p1 - p2))) + maximum<Res>()(p1, p2);
57 |         return result;
58 |     }
59 | };
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/include/detail/hostdevice.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #ifdef __CUDACC__
4 |     #define HOSTDEVICE __host__ __device__
5 | #else
6 |     #define HOSTDEVICE
7 | #endif
8 | 


--------------------------------------------------------------------------------
/include/detail/reduce.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
4 | ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
5 | ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream);
6 | 


--------------------------------------------------------------------------------
/pytorch_binding/.gitignore:
--------------------------------------------------------------------------------
  1 | # Ignore compiled FFI location
  2 | warpctc_pytorch/_warp_ctc
  3 | 
  4 | # Created by https://www.gitignore.io/api/python
  5 | 
  6 | ### Python ###
  7 | # Byte-compiled / optimized / DLL files
  8 | __pycache__/
  9 | *.py[cod]
 10 | *$py.class
 11 | 
 12 | # C extensions
 13 | *.so
 14 | 
 15 | # Distribution / packaging
 16 | .Python
 17 | build/
 18 | develop-eggs/
 19 | dist/
 20 | downloads/
 21 | eggs/
 22 | .eggs/
 23 | lib/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | wheels/
 29 | *.egg-info/
 30 | .installed.cfg
 31 | *.egg
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | .hypothesis/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | 
 72 | # PyBuilder
 73 | target/
 74 | 
 75 | # Jupyter Notebook
 76 | .ipynb_checkpoints
 77 | 
 78 | # pyenv
 79 | .python-version
 80 | 
 81 | # celery beat schedule file
 82 | celerybeat-schedule.*
 83 | 
 84 | # SageMath parsed files
 85 | *.sage.py
 86 | 
 87 | # Environments
 88 | .env
 89 | .venv
 90 | env/
 91 | venv/
 92 | ENV/
 93 | env.bak/
 94 | venv.bak/
 95 | 
 96 | # Spyder project settings
 97 | .spyderproject
 98 | .spyproject
 99 | 
100 | # Rope project settings
101 | .ropeproject
102 | 
103 | # mkdocs documentation
104 | /site
105 | 
106 | # mypy
107 | .mypy_cache/
108 | 
109 | 
110 | # End of https://www.gitignore.io/api/python
111 | 


--------------------------------------------------------------------------------
/pytorch_binding/.pypirc:
--------------------------------------------------------------------------------
1 | [distutils]
2 | index-servers =
3 |   pypi
4 | 
5 | [pypi]
6 | repository: https://upload.pypi.org/legacy/
7 | username: __token__
8 | 


--------------------------------------------------------------------------------
/pytorch_binding/setup.cfg:
--------------------------------------------------------------------------------
1 | [tool:pytest]
2 | 


--------------------------------------------------------------------------------
/pytorch_binding/setup.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import platform
  3 | import shutil
  4 | import sys
  5 | from setuptools import setup, find_packages
  6 | from subprocess import Popen, PIPE
  7 | 
  8 | from torch.utils.cpp_extension import BuildExtension, CppExtension
  9 | import torch
 10 | 
 11 | extra_compile_args = ['-std=c++14', '-fPIC', '-fopenmp']
 12 | warp_ctc_path = "../build"
 13 | 
 14 | if platform.system() == 'Darwin':
 15 |     lib_ext = ".dylib"
 16 | else:
 17 |     lib_ext = ".so"
 18 | warp_ctc_libname = 'libwarpctc{}'.format(lib_ext)
 19 | 
 20 | if "WARP_CTC_PATH" in os.environ:
 21 |     warp_ctc_path = os.environ["WARP_CTC_PATH"]
 22 | if not os.path.exists(os.path.join(warp_ctc_path, warp_ctc_libname)):
 23 |     print(("Could not find {libname} in {build_path}.\n"
 24 |            "Build warp-ctc and set WARP_CTC_PATH to the location of"
 25 |            " {libname} (default is '../build')").format(
 26 |                libname=warp_ctc_libname, build_path=warp_ctc_path))
 27 |     sys.exit(1)
 28 | 
 29 | include_dirs = [os.path.realpath('../include')]
 30 | 
 31 | warp_ctc_libpath = "./warpctc_pytorch/lib"
 32 | if not os.path.isdir(warp_ctc_libpath):
 33 |     os.mkdir(warp_ctc_libpath)
 34 | shutil.copyfile(
 35 |     '{}/{}'.format(warp_ctc_path, warp_ctc_libname),
 36 |     '{}/{}'.format(warp_ctc_libpath, warp_ctc_libname)
 37 | )
 38 | 
 39 | 
 40 | def get_cuda_version():
 41 |     proc = Popen(['nvcc', '--version'], stdout=PIPE, stderr=PIPE)
 42 |     out, err = proc.communicate()
 43 |     out.decode('utf-8').split('\n')[-2].split(', ')[-2].split(' ')
 44 |     return out.decode('utf-8').split()[-2][:-1].replace('.', '')
 45 | 
 46 | 
 47 | def get_torch_version():
 48 |     major_ver, minor_ver = torch.__version__.split('.')[:2]
 49 |     return major_ver + minor_ver
 50 | 
 51 | 
 52 | def get_local_version_identifier(enable_gpu):
 53 |     local_version_identifier = '+torch{}'.format(get_torch_version())
 54 |     if enable_gpu:
 55 |         local_version_identifier += ".cuda{}".format(get_cuda_version())
 56 |     else:
 57 |         local_version_identifier += ".cpu"
 58 |     return local_version_identifier
 59 | 
 60 | 
 61 | if torch.cuda.is_available() or "CUDA_HOME" in os.environ:
 62 |     enable_gpu = True
 63 |     # For CUDA10.1, libcublas-10-2 is installed
 64 |     # and we have to add /usr/local/cuda-10.2 to search paths
 65 |     if get_cuda_version() == "101":
 66 |         include_dirs.append("/usr/local/cuda-10.2/include")
 67 | else:
 68 |     print("Torch was not built with CUDA support, not building warp-ctc GPU extensions.")
 69 |     enable_gpu = False
 70 | 
 71 | if enable_gpu:
 72 |     from torch.utils.cpp_extension import CUDAExtension
 73 | 
 74 |     build_extension = CUDAExtension
 75 |     extra_compile_args += ['-DWARPCTC_ENABLE_GPU']
 76 | else:
 77 |     build_extension = CppExtension
 78 | 
 79 | ext_modules = [
 80 |     build_extension(
 81 |         name='warpctc_pytorch._warp_ctc',
 82 |         language='c++',
 83 |         sources=['src/binding.cpp'],
 84 |         include_dirs=include_dirs,
 85 |         library_dirs=[os.path.realpath(warp_ctc_libpath)],
 86 |         libraries=['warpctc'],
 87 |         extra_link_args=['-Wl,-rpath,{}'.format('$ORIGIN/lib')],
 88 |         extra_compile_args=extra_compile_args
 89 |     )
 90 | ]
 91 | 
 92 | public_version_identifier = "0.2.2"
 93 | setup(
 94 |     name="warpctc_pytorch",
 95 |     version=public_version_identifier + get_local_version_identifier(enable_gpu),
 96 |     description="Pytorch Bindings for warp-ctc maintained by ESPnet",
 97 |     url="https://github.com/espnet/warp-ctc",
 98 |     author=','.join([
 99 |         "Jared Casper",
100 |         "Sean Naren",
101 |         "Shinji Watanabe",
102 |         "Jiro Nishitoba",
103 |         "Yusuke Nishioka"
104 |     ]),
105 |     author_email=','.join([
106 |         "jared.casper@baidu.com",
107 |         "sean.narenthiran@digitalreasoning.com",
108 |         "sw005320@gmail.com",
109 |         "j.nshtb+github@gmail.com",
110 |         "yusuke.nishioka.0713@gmail.com"
111 |     ]),
112 |     license="Apache",
113 |     packages=find_packages(),
114 |     package_data={'': ['lib/{}'.format(warp_ctc_libname)]},
115 |     ext_modules=ext_modules,
116 |     cmdclass={'build_ext': BuildExtension}
117 | )
118 | 


--------------------------------------------------------------------------------
/pytorch_binding/src/binding.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <vector>
  3 | 
  4 | #include <numeric>
  5 | 
  6 | #include <torch/extension.h>
  7 | 
  8 | #ifdef WARPCTC_ENABLE_GPU
  9 | 	#include "ATen/cuda/CUDAContext.h"
 10 | 	#include <c10/cuda/CUDAGuard.h>
 11 | 	#include "ATen/cuda/CUDAEvent.h"
 12 | 
 13 |     #include "THC.h"
 14 |     extern THCState* state;
 15 | #endif
 16 | 
 17 | #include "ctc.h"
 18 | 
 19 | int cpu_ctc(torch::Tensor probs,
 20 |             torch::Tensor grads,
 21 |             torch::Tensor labels,
 22 |             torch::Tensor label_sizes,
 23 |             torch::Tensor sizes,
 24 |             int minibatch_size,
 25 |             torch::Tensor costs,
 26 |             int blank_label)
 27 | {
 28 |     float* probs_ptr       = (float*)probs.data_ptr();
 29 |     float* grads_ptr       = grads.storage() ? (float*)grads.data_ptr() : NULL;
 30 |     int*   sizes_ptr       = (int*)sizes.data_ptr();
 31 |     int*   labels_ptr      = (int*)labels.data_ptr();
 32 |     int*   label_sizes_ptr = (int*)label_sizes.data_ptr();
 33 |     float* costs_ptr       = (float*)costs.data_ptr();
 34 | 
 35 |     const int probs_size = probs.size(2);
 36 | 
 37 |     ctcOptions options;
 38 |     memset(&options, 0, sizeof(options));
 39 |     options.loc = CTC_CPU;
 40 |     options.num_threads = 0; // will use default number of threads
 41 |     options.blank_label = blank_label;
 42 | 
 43 | #if defined(CTC_DISABLE_OMP) || defined(APPLE)
 44 |     // have to use at least one
 45 |     options.num_threads = std::max(options.num_threads, (unsigned int) 1);
 46 | #endif
 47 | 
 48 |     size_t cpu_size_bytes;
 49 |     get_workspace_size(label_sizes_ptr, sizes_ptr,
 50 |                        probs_size, minibatch_size,
 51 |                        options, &cpu_size_bytes);
 52 | 
 53 |     float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)];
 54 | 
 55 |     compute_ctc_loss(probs_ptr, grads_ptr,
 56 |                      labels_ptr, label_sizes_ptr,
 57 |                      sizes_ptr, probs_size,
 58 |                      minibatch_size, costs_ptr,
 59 |                      cpu_workspace, options);
 60 | 
 61 |     delete[] cpu_workspace;
 62 |     return 1;
 63 | }
 64 | 
 65 | #ifdef WARPCTC_ENABLE_GPU
 66 | int gpu_ctc(torch::Tensor probs,
 67 |             torch::Tensor grads,
 68 |             torch::Tensor labels,
 69 |             torch::Tensor label_sizes,
 70 |             torch::Tensor sizes,
 71 |             int minibatch_size,
 72 |             torch::Tensor costs,
 73 |             int blank_label)
 74 | {
 75 |     float* probs_ptr       = (float*)probs.data_ptr();
 76 |     float* grads_ptr       = grads.storage() ? (float*)grads.data_ptr() : NULL;
 77 |     int*   sizes_ptr       = (int*)sizes.data_ptr();
 78 |     int*   labels_ptr      = (int*)labels.data_ptr();
 79 |     int*   label_sizes_ptr = (int*)label_sizes.data_ptr();
 80 |     float* costs_ptr       = (float*)costs.data_ptr();
 81 | 
 82 |     const int probs_size = probs.size(2);
 83 | 
 84 |     ctcOptions options;
 85 |     memset(&options, 0, sizeof(options));
 86 |     options.loc = CTC_GPU;
 87 |     options.blank_label = blank_label;
 88 |     options.stream = at::cuda::getCurrentCUDAStream();
 89 | 
 90 |     size_t gpu_size_bytes;
 91 |     get_workspace_size(label_sizes_ptr, sizes_ptr,
 92 |                        probs_size, minibatch_size,
 93 |                        options, &gpu_size_bytes);
 94 | 
 95 |     void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes);
 96 | 
 97 |     compute_ctc_loss(probs_ptr, grads_ptr,
 98 |                      labels_ptr, label_sizes_ptr,
 99 |                      sizes_ptr, probs_size,
100 |                      minibatch_size, costs_ptr,
101 |                      gpu_workspace, options);
102 | 
103 |     THCudaFree(state, (void *) gpu_workspace);
104 |     return 1;
105 | }
106 | #endif
107 | 
108 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
109 |   m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu");
110 | #ifdef WARPCTC_ENABLE_GPU
111 |   m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu");
112 | #endif
113 | }
114 | 


--------------------------------------------------------------------------------
/pytorch_binding/src/cpu_binding.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | int cpu_ctc(THFloatTensor *probs,
 3 |                         THFloatTensor *grads,
 4 |                         THIntTensor *labels_ptr,
 5 |                         THIntTensor *label_sizes_ptr,
 6 |                         THIntTensor *sizes,
 7 |                         int minibatch_size,
 8 |                         THFloatTensor *costs,
 9 |                         int blank_label);
10 | */
11 | 
12 | int cpu_ctc(torch::Tensor probs,
13 |             torch::Tensor grads,
14 |             torch::Tensor labels,
15 |             torch::Tensor label_sizes,
16 |             torch::Tensor sizes,
17 |             int minibatch_size,
18 |             torch::Tensor costs,
19 |             int blank_label); 
20 | 


--------------------------------------------------------------------------------
/pytorch_binding/src/gpu_binding.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | int gpu_ctc(THCudaTensor *probs,
 3 |                         THCudaTensor *grads,
 4 |                         THIntTensor *labels_ptr,
 5 |                         THIntTensor *label_sizes_ptr,
 6 |                         THIntTensor *sizes,
 7 |                         int minibatch_size,
 8 |                         THFloatTensor *costs,
 9 |                         int blank_label);
10 | */
11 | 
12 | int gpu_ctc(torch::Tensor probs,
13 |             torch::Tensor grads,
14 |             torch::Tensor labels,
15 |             torch::Tensor label_sizes,
16 |             torch::Tensor sizes,
17 |             int minibatch_size,
18 |             torch::Tensor costs,
19 |             int blank_label);
20 | 


--------------------------------------------------------------------------------
/pytorch_binding/tests/test_cpu.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import warpctc_pytorch as warp_ctc
 3 | import pytest
 4 | 
 5 | 
 6 | def test_simple():
 7 |     probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
 8 |     grads = torch.zeros(probs.size())
 9 |     labels = torch.IntTensor([1, 2])
10 |     label_sizes = torch.IntTensor([2])
11 |     sizes = torch.IntTensor(probs.size(1)).fill_(probs.size(0))
12 |     minibatch_size = probs.size(1)
13 |     costs = torch.zeros(minibatch_size)
14 |     warp_ctc.cpu_ctc(probs,
15 |                      grads,
16 |                      labels,
17 |                      label_sizes,
18 |                      sizes,
19 |                      minibatch_size,
20 |                      costs,
21 |                      0)
22 |     print('CPU_cost: %f' % costs.sum())
23 | 
24 | 
25 | @pytest.mark.parametrize("multiplier", [1.0, 200.0])
26 | def test_medium(multiplier):
27 |     probs = torch.FloatTensor([
28 |         [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
29 |         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
30 |     ]).contiguous() * multiplier
31 | 
32 |     grads = torch.zeros(probs.size())
33 |     labels = torch.IntTensor([1, 2, 1, 2])
34 |     label_sizes = torch.IntTensor([2, 2])
35 |     sizes = torch.IntTensor([2, 2])
36 |     minibatch_size = probs.size(1)
37 |     costs = torch.zeros(minibatch_size)
38 |     warp_ctc.cpu_ctc(probs,
39 |                      grads,
40 |                      labels,
41 |                      label_sizes,
42 |                      sizes,
43 |                      minibatch_size,
44 |                      costs,
45 |                      0)
46 |     print('CPU_cost: %f' % costs.sum())
47 | 
48 | 
49 | def test_empty_label():
50 |     probs = torch.FloatTensor([
51 |         [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
52 |         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
53 |     ]).contiguous()
54 | 
55 |     grads = torch.zeros(probs.size())
56 |     labels = torch.IntTensor([1, 2])
57 |     label_sizes = torch.IntTensor([2, 0])
58 |     sizes = torch.IntTensor([2, 2])
59 |     minibatch_size = probs.size(1)
60 |     costs = torch.zeros(minibatch_size)
61 |     warp_ctc.cpu_ctc(probs,
62 |                      grads,
63 |                      labels,
64 |                      label_sizes,
65 |                      sizes,
66 |                      minibatch_size,
67 |                      costs,
68 |                      0)
69 |     print('CPU_cost: %f' % costs.sum())
70 | 
71 | 
72 | def test_CTCLoss():
73 |     probs = torch.FloatTensor([[
74 |         [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]
75 |     ]]).transpose(0, 1).contiguous()
76 |     labels = torch.IntTensor([1, 2])
77 |     label_sizes = torch.IntTensor([2])
78 |     probs_sizes = torch.IntTensor([2])
79 |     probs.requires_grad_(True)
80 | 
81 |     ctc_loss = warp_ctc.CTCLoss()
82 |     cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
83 |     cost.backward()
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     pytest.main([__file__])
88 | 


--------------------------------------------------------------------------------
/pytorch_binding/tests/test_gpu.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import warpctc_pytorch as warp_ctc
  3 | import pytest
  4 | 
  5 | 
  6 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU")
  7 | def test_simple():
  8 |     probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
  9 |     grads = torch.zeros(probs.size())
 10 |     labels = torch.IntTensor([1, 2])
 11 |     label_sizes = torch.IntTensor([2])
 12 |     sizes = torch.IntTensor(probs.size(1)).fill_(probs.size(0))
 13 |     minibatch_size = probs.size(1)
 14 |     costs = torch.zeros(minibatch_size)
 15 |     warp_ctc.cpu_ctc(probs,
 16 |                      grads,
 17 |                      labels,
 18 |                      label_sizes,
 19 |                      sizes,
 20 |                      minibatch_size,
 21 |                      costs,
 22 |                      0)
 23 |     print('CPU_cost: %f' % costs.sum())
 24 |     probs = probs.clone().cuda()
 25 |     grads = torch.zeros(probs.size()).cuda()
 26 |     costs = torch.zeros(minibatch_size)
 27 |     warp_ctc.gpu_ctc(probs,
 28 |                      grads,
 29 |                      labels,
 30 |                      label_sizes,
 31 |                      sizes,
 32 |                      minibatch_size,
 33 |                      costs,
 34 |                      0)
 35 |     print('GPU_cost: %f' % costs.sum())
 36 |     print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
 37 | 
 38 | 
 39 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU")
 40 | @pytest.mark.parametrize("multiplier", [1.0, 200.0])
 41 | def test_medium(multiplier):
 42 |     probs = torch.FloatTensor([
 43 |         [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
 44 |         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
 45 |     ]).contiguous() * multiplier
 46 | 
 47 |     grads = torch.zeros(probs.size())
 48 |     labels = torch.IntTensor([1, 2, 1, 2])
 49 |     label_sizes = torch.IntTensor([2, 2])
 50 |     sizes = torch.IntTensor([2, 2])
 51 |     minibatch_size = probs.size(1)
 52 |     costs = torch.zeros(minibatch_size)
 53 |     warp_ctc.cpu_ctc(probs,
 54 |                      grads,
 55 |                      labels,
 56 |                      label_sizes,
 57 |                      sizes,
 58 |                      minibatch_size,
 59 |                      costs,
 60 |                      0)
 61 |     print('CPU_cost: %f' % costs.sum())
 62 |     probs = probs.clone().cuda()
 63 |     grads = torch.zeros(probs.size()).cuda()
 64 |     costs = torch.zeros(minibatch_size)
 65 |     warp_ctc.gpu_ctc(probs,
 66 |                      grads,
 67 |                      labels,
 68 |                      label_sizes,
 69 |                      sizes,
 70 |                      minibatch_size,
 71 |                      costs,
 72 |                      0)
 73 |     print('GPU_cost: %f' % costs.sum())
 74 |     print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
 75 | 
 76 | 
 77 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU")
 78 | def test_empty_label():
 79 |     probs = torch.FloatTensor([
 80 |         [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
 81 |         [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
 82 |     ]).contiguous()
 83 | 
 84 |     grads = torch.zeros(probs.size())
 85 |     labels = torch.IntTensor([1, 2])
 86 |     label_sizes = torch.IntTensor([2, 0])
 87 |     sizes = torch.IntTensor([2, 2])
 88 |     minibatch_size = probs.size(1)
 89 |     costs = torch.zeros(minibatch_size)
 90 |     warp_ctc.cpu_ctc(probs,
 91 |                      grads,
 92 |                      labels,
 93 |                      label_sizes,
 94 |                      sizes,
 95 |                      minibatch_size,
 96 |                      costs,
 97 |                      0)
 98 |     print('CPU_cost: %f' % costs.sum())
 99 |     probs = probs.clone().cuda()
100 |     grads = torch.zeros(probs.size()).cuda()
101 |     costs = torch.zeros(minibatch_size)
102 |     warp_ctc.gpu_ctc(probs,
103 |                      grads,
104 |                      labels,
105 |                      label_sizes,
106 |                      sizes,
107 |                      minibatch_size,
108 |                      costs,
109 |                      0)
110 |     print('GPU_cost: %f' % costs.sum())
111 |     print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
112 | 
113 | 
114 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU")
115 | def test_CTCLoss():
116 |     probs = torch.FloatTensor([[
117 |         [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]
118 |     ]]).transpose(0, 1).contiguous().cuda()
119 |     labels = torch.IntTensor([1, 2])
120 |     label_sizes = torch.IntTensor([2])
121 |     probs_sizes = torch.IntTensor([2])
122 |     probs.requires_grad_(True)
123 | 
124 |     ctc_loss = warp_ctc.CTCLoss()
125 |     cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
126 |     cost.backward()
127 | 
128 | 
129 | if __name__ == '__main__':
130 |     pytest.main([__file__])
131 | 


--------------------------------------------------------------------------------
/pytorch_binding/warpctc_pytorch/__init__.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import warpctc_pytorch as warp_ctc
 3 | from torch.autograd import Function
 4 | from torch.nn import Module
 5 | 
 6 | from ._warp_ctc import *  # noqa
 7 | 
 8 | __version__ = '0.2.2'
 9 | 
10 | 
11 | def _assert_no_grad(tensor):
12 |     assert not tensor.requires_grad, \
13 |         "gradients only computed for acts - please " \
14 |         "mark other tensors as not requiring gradients"
15 | 
16 | 
17 | class _CTC(Function):
18 |     @staticmethod
19 |     def forward(ctx, acts, labels, act_lens, label_lens, size_average=False,
20 |                 length_average=False, blank=0, reduce=True):
21 |         is_cuda = True if acts.is_cuda else False
22 |         acts = acts.contiguous()
23 |         loss_func = warp_ctc.gpu_ctc if is_cuda else warp_ctc.cpu_ctc
24 |         grads = torch.zeros(acts.size()).type_as(acts)
25 |         minibatch_size = acts.size(1)
26 |         costs = torch.zeros(minibatch_size).cpu()
27 |         loss_func(acts,
28 |                   grads,
29 |                   labels,
30 |                   label_lens,
31 |                   act_lens,
32 |                   minibatch_size,
33 |                   costs,
34 |                   blank)
35 | 
36 |         if reduce:
37 |             costs = torch.FloatTensor([costs.sum()])
38 | 
39 |             if length_average:
40 |                 # Compute the avg. log-probability per batch sample and frame.
41 |                 total_length = torch.sum(act_lens).item()
42 |                 grads = grads / total_length
43 |                 costs = costs / total_length
44 |             elif size_average:
45 |                 # Compute the avg. log-probability per batch sample.
46 |                 grads = grads / minibatch_size
47 |                 costs = costs / minibatch_size
48 |         else:
49 |             # Make the costs size be B x 1, then grad_output is also B x 1
50 |             # Thus the `grad_output' in backward() is broadcastable
51 |             costs = costs.unsqueeze(1)
52 | 
53 |         ctx.grads = grads
54 |         return costs
55 | 
56 |     @staticmethod
57 |     def backward(ctx, grad_output):
58 |         _grad_output = grad_output.to(ctx.grads.device)
59 |         return ctx.grads.mul_(_grad_output), None, None, None, None, None, None, None
60 | 
61 | class CTCLoss(Module):
62 |     """
63 |     Parameters:
64 |         size_average (bool): normalize the loss by the batch size
65 |             (default: `False`)
66 |         length_average (bool): normalize the loss by the total number of frames
67 |             in the batch. If `True`, supersedes `size_average`
68 |             (default: `False`)
69 |         reduce (bool): average or sum over observation for each minibatch.
70 |             If `False`, returns a loss per batch element instead and ignores `average` options.
71 |             (default: `True`)
72 |     """
73 |     def __init__(self, blank=0, size_average=False, length_average=False, reduce=True):
74 |         super(CTCLoss, self).__init__()
75 |         self.ctc = _CTC.apply
76 |         self.blank = blank
77 |         self.size_average = size_average
78 |         self.length_average = length_average
79 |         self.reduce = reduce
80 | 
81 |     def forward(self, acts, labels, act_lens, label_lens):
82 |         """
83 |         acts: Tensor of (seqLength x batch x outputDim) containing output from network
84 |         labels: 1 dimensional Tensor containing all the targets of the batch in one sequence
85 |         act_lens: Tensor of size (batch) containing size of each output sequence from the network
86 |         label_lens: Tensor of (batch) containing label length of each example
87 |         """
88 |         assert len(labels.size()) == 1  # labels must be 1 dimensional
89 |         _assert_no_grad(labels)
90 |         _assert_no_grad(act_lens)
91 |         _assert_no_grad(label_lens)
92 |         return self.ctc(acts, labels, act_lens, label_lens, self.size_average,
93 |                         self.length_average, self.blank, self.reduce)
94 | 


--------------------------------------------------------------------------------
/pytorch_binding/wheel/build_wheels.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -eu
 4 | 
 5 | function install_torch_of_specified_version() {
 6 |   version=$1
 7 |   pip install torch==$1
 8 | }
 9 | 
10 | function build_wheel() {
11 |   python setup.py bdist_wheel
12 |   python wheel/rename_wheels.py
13 | }
14 | 
15 | function install_wheel() {
16 |   torch_version=$1
17 | 
18 |   torch_vers=(${torch_version//./ })
19 |   torch_major_ver=${torch_vers[0]}
20 |   torch_minor_ver=${torch_vers[1]}
21 |   pip install dist/warpctc_pytorch-*+torch${torch_major_ver}${torch_minor_ver}*.whl
22 | }
23 | 
24 | function run_tests() {
25 |   pytest tests
26 |   pytest --flakes
27 | }
28 | 
29 | function post_process() {
30 |   python setup.py clean
31 |   pip uninstall -y warpctc-pytorch torch
32 |   rm -rf build warpctc_pytorch.egg-info
33 | }
34 | 
35 | torch_versions=(${TORCH_VERSIONS//:/ })
36 | for torch_version in ${torch_versions[@]}; do
37 |   install_torch_of_specified_version $torch_version
38 |   build_wheel
39 |   install_wheel $torch_version
40 |   run_tests
41 |   post_process
42 | done
43 | 


--------------------------------------------------------------------------------
/pytorch_binding/wheel/rename_wheels.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import shutil
 4 | 
 5 | 
 6 | for whl_path in glob.glob(os.path.join(os.getcwd(), 'dist', '*.whl')):
 7 |     whl_name = os.path.basename(whl_path)
 8 |     dist, version, python_tag, abi_tag, platform_tag = whl_name.split('-')
 9 |     if 'manylinux' in platform_tag:
10 |         continue
11 |     platform_tag = platform_tag.replace('linux', 'manylinux1')
12 |     new_whl_name = '-'.join([dist, version, python_tag, abi_tag, platform_tag])
13 |     new_whl_path = os.path.join(os.path.dirname(whl_path), new_whl_name)
14 |     shutil.move(whl_path, new_whl_path)
15 | 


--------------------------------------------------------------------------------
/src/ctc_entrypoint.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstddef>
  2 | #include <iostream>
  3 | #include <algorithm>
  4 | 
  5 | #include <ctc.h>
  6 | 
  7 | #include "detail/cpu_ctc.h"
  8 | #ifdef __CUDACC__
  9 |     #include "detail/gpu_ctc.h"
 10 | #endif
 11 | 
 12 | 
 13 | extern "C" {
 14 | 
 15 | int get_warpctc_version() {
 16 |     return 2;
 17 | }
 18 | 
 19 | const char* ctcGetStatusString(ctcStatus_t status) {
 20 |     switch (status) {
 21 |     case CTC_STATUS_SUCCESS:
 22 |         return "no error";
 23 |     case CTC_STATUS_MEMOPS_FAILED:
 24 |         return "cuda memcpy or memset failed";
 25 |     case CTC_STATUS_INVALID_VALUE:
 26 |         return "invalid value";
 27 |     case CTC_STATUS_EXECUTION_FAILED:
 28 |         return "execution failed";
 29 | 
 30 |     case CTC_STATUS_UNKNOWN_ERROR:
 31 |     default:
 32 |         return "unknown error";
 33 | 
 34 |     }
 35 | 
 36 | }
 37 | 
 38 | 
 39 | ctcStatus_t compute_ctc_loss(const float* const activations,
 40 |                              float* gradients,
 41 |                              const int* const flat_labels,
 42 |                              const int* const label_lengths,
 43 |                              const int* const input_lengths,
 44 |                              int alphabet_size,
 45 |                              int minibatch,
 46 |                              float *costs,
 47 |                              void *workspace,
 48 |                              ctcOptions options) {
 49 | 
 50 |     if (activations == nullptr ||
 51 |         flat_labels == nullptr ||
 52 |         label_lengths == nullptr ||
 53 |         input_lengths == nullptr ||
 54 |         costs == nullptr ||
 55 |         workspace == nullptr ||
 56 |         alphabet_size <= 0 ||
 57 |         minibatch <= 0)
 58 |         return CTC_STATUS_INVALID_VALUE;
 59 | 
 60 |     if (options.loc == CTC_CPU) {
 61 |         CpuCTC<float> ctc(alphabet_size, minibatch, workspace, options.num_threads,
 62 |                           options.blank_label);
 63 | 
 64 |         if (gradients != NULL)
 65 |             return ctc.cost_and_grad(activations, gradients,
 66 |                                      costs,
 67 |                                      flat_labels, label_lengths,
 68 |                                      input_lengths);
 69 |         else
 70 |             return ctc.score_forward(activations, costs, flat_labels,
 71 |                                      label_lengths, input_lengths);
 72 |     } else if (options.loc == CTC_GPU) {
 73 | #ifdef __CUDACC__
 74 |         GpuCTC<float> ctc(alphabet_size, minibatch, workspace, options.stream,
 75 |                           options.blank_label);
 76 | 
 77 |         if (gradients != NULL)
 78 |             return ctc.cost_and_grad(activations, gradients, costs,
 79 |                                      flat_labels, label_lengths,
 80 |                                      input_lengths);
 81 |         else
 82 |             return ctc.score_forward(activations, costs, flat_labels,
 83 |                                      label_lengths, input_lengths);
 84 | #else
 85 |         std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl;
 86 |         return CTC_STATUS_EXECUTION_FAILED;
 87 | #endif
 88 |     } else {
 89 |         return CTC_STATUS_INVALID_VALUE;
 90 |     }
 91 | }
 92 | 
 93 | 
 94 | ctcStatus_t get_workspace_size(const int* const label_lengths,
 95 |                                const int* const input_lengths,
 96 |                                int alphabet_size, int minibatch,
 97 |                                ctcOptions options,
 98 |                                size_t* size_bytes)
 99 | {
100 |     if (label_lengths == nullptr ||
101 |         input_lengths == nullptr ||
102 |         size_bytes == nullptr ||
103 |         alphabet_size <= 0 ||
104 |         minibatch <= 0)
105 |         return CTC_STATUS_INVALID_VALUE;
106 | 
107 |     // This is the max of all S and T for all examples in the minibatch.
108 |     int maxL = *std::max_element(label_lengths, label_lengths + minibatch);
109 |     int maxT = *std::max_element(input_lengths, input_lengths + minibatch);
110 | 
111 |     const int S = 2 * maxL + 1;
112 | 
113 |     *size_bytes = 0;
114 | 
115 |     if (options.loc == CTC_GPU) {
116 |         // GPU storage
117 |         //nll_forward, nll_backward
118 |         *size_bytes += 2 * sizeof(float) * minibatch;
119 | 
120 |         //repeats
121 |         *size_bytes += sizeof(int) * minibatch;
122 | 
123 |         //label offsets
124 |         *size_bytes += sizeof(int) * minibatch;
125 | 
126 |         //utt_length
127 |         *size_bytes += sizeof(int) * minibatch;
128 | 
129 |         //label lengths
130 |         *size_bytes += sizeof(int) * minibatch;
131 | 
132 |         //labels without blanks - overallocate for now
133 |         *size_bytes += sizeof(int) * maxL * minibatch;
134 | 
135 |         //labels with blanks
136 |         *size_bytes += sizeof(int) * S * minibatch;
137 | 
138 |         //alphas
139 |         *size_bytes += sizeof(float) * S * maxT * minibatch;
140 | 
141 |         //denoms
142 |         *size_bytes += sizeof(float) * maxT * minibatch;
143 | 
144 |         //probs (since we will pass in activations)
145 |         *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
146 | 
147 |     } else {
148 |         //cpu can eventually replace all minibatch with
149 |         //max number of concurrent threads if memory is
150 |         //really tight
151 | 
152 |         //per minibatch memory
153 |         size_t per_minibatch_bytes = 0;
154 | 
155 |         //output
156 |         per_minibatch_bytes += sizeof(float) * alphabet_size ;
157 | 
158 |         //alphas
159 |         per_minibatch_bytes += sizeof(float) * S * maxT;
160 | 
161 |         //betas
162 |         per_minibatch_bytes += sizeof(float) * S;
163 | 
164 |         //labels w/blanks, e_inc, s_inc
165 |         per_minibatch_bytes += 3 * sizeof(int) * S;
166 | 
167 |         *size_bytes = per_minibatch_bytes * minibatch;
168 | 
169 |         //probs
170 |         *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch;
171 |     }
172 | 
173 |     return CTC_STATUS_SUCCESS;
174 | }
175 | 
176 | }
177 | 


--------------------------------------------------------------------------------
/src/ctc_entrypoint.cu:
--------------------------------------------------------------------------------
1 | ctc_entrypoint.cpp


--------------------------------------------------------------------------------
/src/reduce.cu:
--------------------------------------------------------------------------------
  1 | // Includes, system
  2 | // #include <stdio.h>
  3 | // #include <stdlib.h>
  4 | 
  5 | // Includes, cuda
  6 | // #include <cuda_runtime.h>
  7 | // #include <cublas_v2.h>
  8 | 
  9 | // Includes, cuda helper functions
 10 | // #include <helper_cuda.h>
 11 | 
 12 | // For the functors
 13 | #include "detail/ctc_helper.h"
 14 | #include "ctc.h"
 15 | 
 16 | const int warp_size = 32;
 17 | 
 18 | template<int NT, typename T, typename Rop>
 19 | struct CTAReduce;
 20 | 
 21 | template<int NT, typename T, typename Rop>
 22 | struct CTAReduce {
 23 |     enum { Size = NT, Capacity = NT };
 24 |     struct Storage { T shared[Capacity]; };
 25 | 
 26 |     __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) {
 27 |         T* s = storage.shared;
 28 |         s[tid] = x;
 29 |         __syncthreads();
 30 | 
 31 |         // Fold the data in half with each pass.
 32 | #pragma unroll
 33 |         for(int offset = NT / 2; offset >= warp_size; offset /= 2) {
 34 |             if(tid + offset < count && tid < offset) {
 35 |                 // Read from the right half and store to the left half.
 36 |                 x = g(x, s[offset + tid]);
 37 |                 s[tid] = x;
 38 |             }
 39 |             __syncthreads();
 40 |         }
 41 | 
 42 |         T shuff;
 43 |         for (int offset = warp_size / 2; offset > 0; offset /= 2) {
 44 |             shuff = __shfl_down_sync(0xFFFFFFFF, x, offset);
 45 |             if (tid + offset < count && tid < offset)
 46 |                 x = g(x, shuff);
 47 |         }
 48 |         return x;
 49 |     }
 50 | };
 51 | 
 52 | template <int NT, typename Iop, typename Rop, typename T>
 53 | __global__ void reduce_rows(Iop f, Rop g, const T* input, T* output,
 54 |                             int num_rows, int num_cols) {
 55 | 
 56 |     typedef CTAReduce<NT, T, Rop> R;
 57 |     __shared__ typename R::Storage storage;
 58 | 
 59 |     int tid = threadIdx.x;
 60 |     int idx = tid;
 61 |     int col = blockIdx.x;
 62 |     T curr;
 63 | 
 64 |     // Each block works on a column
 65 |     if (idx < num_rows)
 66 |         curr = f(input[idx + col*num_rows]);
 67 |     idx += NT;
 68 | 
 69 | 
 70 |     while (idx < num_rows) {
 71 |         curr = g(curr, f(input[idx + col*num_rows]));
 72 |         idx += NT;
 73 |     }
 74 | 
 75 |     // Sum thread-totals over the CTA.
 76 |     curr = R::reduce(tid, curr, storage, num_rows, g);
 77 | 
 78 |     // Store result in out
 79 |     if (tid == 0)
 80 |         output[col] = curr;
 81 | }
 82 | 
 83 | template <int NT, typename Iop, typename Rop, typename T>
 84 | __global__ void reduce_cols(Iop f, Rop g, const T* input, T* output,
 85 |                             int num_rows, int num_cols) {
 86 | 
 87 |     __shared__ T s[NT];
 88 | 
 89 |     int warps_per_block = NT / warp_size;
 90 |     int row = blockDim.x * blockIdx.x + threadIdx.x;
 91 |     int col = threadIdx.y;
 92 |     T curr;
 93 | 
 94 |     if (row < num_rows && col < num_cols) {
 95 |         curr = f(input[row + col*num_rows]);
 96 |         col += blockDim.y;
 97 |         while (col < num_cols) {
 98 |             curr = g(curr, f(input[row + col*num_rows]));
 99 |             col += blockDim.y;
100 |         }
101 |     }
102 |     s[threadIdx.x * warps_per_block + threadIdx.y] = curr;
103 |     __syncthreads();
104 | 
105 |     // Reduce
106 |     if (threadIdx.y == 0 && row < num_rows) {
107 | #pragma unroll
108 |         for (int i = 1; i < warps_per_block && i < num_cols; ++i)
109 |             curr = g(curr, s[i + threadIdx.x * warps_per_block]);
110 |         output[row] = curr;
111 |     }
112 | }
113 | 
114 | struct ReduceHelper {
115 | 
116 |     template<typename T, typename Iof, typename Rof>
117 |     static void impl(Iof f, Rof g, const T* input, T* output, int num_rows, int num_cols, bool axis, cudaStream_t stream) {
118 | 
119 |         int grid_size;
120 | 
121 |         if (axis) {
122 |             grid_size = num_cols;
123 |             reduce_rows<128><<<grid_size, 128, 0, stream>>>
124 |                (f, g, input, output, num_rows, num_cols);
125 | 
126 |         } else {
127 |             dim3 tpb(warp_size, 128 / warp_size);
128 |             grid_size = (num_cols + warp_size - 1)/warp_size;
129 |             reduce_cols<128><<<grid_size, tpb, 0, stream>>>
130 |                 (f, g, input, output, num_rows, num_cols);
131 | 
132 |         }
133 |     }
134 | };
135 | 
136 | 
137 | template<typename T, typename Iof, typename  Rof>
138 | ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, bool axis, cudaStream_t stream) {
139 |     ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream);
140 |     cudaStreamSynchronize(stream);
141 |     cudaError_t err = cudaGetLastError();
142 |     if (err != cudaSuccess)
143 |         return CTC_STATUS_EXECUTION_FAILED;
144 | 
145 |     return CTC_STATUS_SUCCESS;
146 | }
147 | 
148 | ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) {
149 |     return reduce(ctc_helper::negate<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
150 | }
151 | 
152 | ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) {
153 |     return reduce(ctc_helper::exponential<float>(), ctc_helper::add<float>(), input, output, rows, cols, axis, stream);
154 | }
155 | 
156 | ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) {
157 |     return reduce(ctc_helper::identity<float>(), ctc_helper::maximum<float>(),input, output, rows, cols, axis, stream);
158 | }
159 | 


--------------------------------------------------------------------------------
/tests/random.cpp:
--------------------------------------------------------------------------------
 1 | #include <vector>
 2 | #include <random>
 3 | 
 4 | 
 5 | std::vector<float>
 6 | genActs(int size) {
 7 |     std::vector<float> arr(size);
 8 |     std::mt19937 gen(0);
 9 |     std::uniform_real_distribution<> dis(0, 1);
10 |     for(int i = 0; i < size; ++i)
11 |         arr[i] = dis(gen);
12 |     return arr;
13 | }
14 | 
15 | std::vector<int>
16 | genLabels(int alphabet_size, int L) {
17 |     std::vector<int> label(L);
18 | 
19 |     std::mt19937 gen(1);
20 |     std::uniform_int_distribution<> dis(1, alphabet_size - 1);
21 | 
22 |     for(int i = 0; i < L; ++i) {
23 |         label[i] = dis(gen);
24 |     }
25 |     // guarantee repeats for testing
26 |     if (L >= 3) {
27 |         label[L / 2] = label[L / 2 + 1];
28 |         label[L / 2 - 1] = label[L / 2];
29 |     }
30 |     return label;
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/tests/test.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdexcept>
 4 | #include <vector>
 5 | #include <limits>
 6 | #include <numeric>
 7 | 
 8 | 
 9 | #include <ctc.h>
10 | 
11 | inline void throw_on_error(ctcStatus_t status, const char* message) {
12 |     if (status != CTC_STATUS_SUCCESS) {
13 |         throw std::runtime_error(message + (", stat = " + 
14 |                                             std::string(ctcGetStatusString(status))));
15 |     }
16 | }
17 | 
18 | #ifdef __CUDACC__
19 | #include <thrust/system_error.h>
20 | #include <thrust/system/cuda/error.h>
21 | 
22 | inline void throw_on_error(cudaError_t error, const char* message) {
23 |     if (error) {
24 |         throw thrust::system_error(error, thrust::cuda_category(), message);
25 |     }
26 | }
27 | 
28 | #endif
29 | 
30 | std::vector<float> genActs(int size);
31 | std::vector<int> genLabels(int alphabet_size, int L);
32 | 
33 | float rel_diff(const std::vector<float>& grad,
34 |                const std::vector<float>& num_grad) {
35 |     float diff = 0.;
36 |     float tot = 0.;
37 |     for(size_t idx = 0; idx < grad.size(); ++idx) {
38 |         diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]);
39 |         tot += grad[idx] * grad[idx];
40 |     }
41 | 
42 |     return diff / tot;
43 | }
44 | 
45 | // Numerically stable softmax for a minibatch of 1
46 | void softmax(const float* const acts,
47 |              int alphabet_size, int T,
48 |              float *probs) {
49 | 
50 |     for (int t = 0; t < T; ++t) {
51 | 
52 |         float max_activation =
53 |             -std::numeric_limits<float>::infinity();
54 | 
55 |         for (int a = 0; a < alphabet_size; ++a)
56 |             max_activation =
57 |                std::max(max_activation, acts[t*alphabet_size + a]);
58 | 
59 |         float denom = 0;
60 |         for (int a = 0; a < alphabet_size; ++a)
61 |             denom += std::exp(acts[t*alphabet_size + a] - max_activation);
62 | 
63 |         for (int a = 0; a < alphabet_size; ++a)
64 |             probs[t*alphabet_size + a] =
65 |                std::exp(acts[t*alphabet_size + a] - max_activation) / denom;
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------