├── .github └── workflows │ ├── cpu.yaml │ ├── cuda100.yaml │ ├── cuda101.yaml │ ├── cuda102.yaml │ └── cuda92.yaml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── ci ├── README.md ├── build.sh ├── cpu │ └── Dockerfile └── gpu │ └── Dockerfile ├── doc ├── baidu-research-logo-small.png └── deep-speech-ctc-small.png ├── include ├── contrib │ └── moderngpu │ │ ├── LICENSE │ │ └── include │ │ ├── device │ │ ├── ctaloadbalance.cuh │ │ ├── ctamerge.cuh │ │ ├── ctascan.cuh │ │ ├── ctasearch.cuh │ │ ├── ctasegreduce.cuh │ │ ├── ctasegscan.cuh │ │ ├── ctasegsort.cuh │ │ ├── ctasortedsearch.cuh │ │ ├── devicetypes.cuh │ │ ├── deviceutil.cuh │ │ ├── intrinsics.cuh │ │ ├── loadstore.cuh │ │ ├── serialsets.cuh │ │ └── sortnetwork.cuh │ │ ├── mgpudevice.cuh │ │ ├── mgpuenums.h │ │ └── util │ │ └── static.h ├── ctc.h └── detail │ ├── cpu_ctc.h │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ ├── hostdevice.h │ └── reduce.h ├── pytorch_binding ├── .gitignore ├── .pypirc ├── setup.cfg ├── setup.py ├── src │ ├── binding.cpp │ ├── cpu_binding.h │ └── gpu_binding.h ├── tests │ ├── test_cpu.py │ └── test_gpu.py ├── warpctc_pytorch │ └── __init__.py └── wheel │ ├── build_wheels.sh │ └── rename_wheels.py ├── src ├── ctc_entrypoint.cpp ├── ctc_entrypoint.cu └── reduce.cu └── tests ├── random.cpp ├── test.h ├── test_cpu.cpp └── test_gpu.cu /.github/workflows/cpu.yaml: -------------------------------------------------------------------------------- 1 | name: CPU 2 | on: 3 | push: 4 | branches: 5 | - pytorch_bindings 6 | pull_request: 7 | branches: 8 | - pytorch_bindings 9 | release: 10 | types: [published] 11 | jobs: 12 | build: 13 | runs-on: ubuntu-20.04 14 | container: 15 | image: espnet/warpctc_builder:cpu 16 | defaults: 17 | run: 18 | shell: bash --login -eo pipefail {0} 19 | strategy: 20 | matrix: 21 | python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1] 22 | pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0] 23 | exclude: 24 | - python-version: 3.8.7 25 | pytorch-version: 1.1.0 26 | - python-version: 3.8.7 27 | pytorch-version: 1.2.0 28 | - python-version: 3.8.7 29 | pytorch-version: 1.3.1 30 | - python-version: 3.9.1 31 | pytorch-version: 1.1.0 32 | - python-version: 3.9.1 33 | pytorch-version: 1.2.0 34 | - python-version: 3.9.1 35 | pytorch-version: 1.3.1 36 | - python-version: 3.9.1 37 | pytorch-version: 1.4.0 38 | - python-version: 3.9.1 39 | pytorch-version: 1.5.1 40 | - python-version: 3.9.1 41 | pytorch-version: 1.6.0 42 | steps: 43 | - uses: actions/checkout@v2 44 | - name: Copy .bash_profile 45 | run: cp /root/.bash_profile ~/ 46 | - name: Build warpctc 47 | run: | 48 | mkdir build 49 | cd build 50 | cmake .. 
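# Out-of-tree build: cmake configures the project in ./build and make
# produces libwarpctc.so, which the pytorch_binding wheel built later in
# this job links against. The four CUDA workflows below repeat this same
# step inside their respective CUDA containers.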
51 | make 52 | - name: Set Python version 53 | run: pyenv global ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | pip install -U pip setuptools 57 | pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }} 58 | - name: Build wheel 59 | run: | 60 | cd pytorch_binding 61 | python setup.py bdist_wheel 62 | python wheel/rename_wheels.py 63 | ls dist 64 | echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV 65 | - name: Install wheel 66 | run: | 67 | cd pytorch_binding 68 | pip install dist/warpctc_pytorch*.whl 69 | - name: Run tests 70 | run: | 71 | cd pytorch_binding 72 | pytest --flakes 73 | pytest tests 74 | - name: Set 'upload_url' of the latest release 75 | if: startsWith(github.ref, 'refs/tags/v') 76 | run: | 77 | # https://docs.github.com/en/rest/reference/repos#get-the-latest-release 78 | cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest" 79 | output_json=$($cmd) 80 | echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV 81 | - name: Upload a wheel to the latest release 82 | if: startsWith(github.ref, 'refs/tags/v') 83 | uses: actions/upload-release-asset@v1 84 | env: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | with: 87 | upload_url: ${{ env.UPLOAD_URL }} 88 | asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }} 89 | asset_name: ${{ env.WHEEL_NAME }} 90 | asset_content_type: application/zip 91 | -------------------------------------------------------------------------------- /.github/workflows/cuda100.yaml: -------------------------------------------------------------------------------- 1 | name: CUDA 10.0 2 | on: 3 | push: 4 | branches: 5 | - pytorch_bindings 6 | pull_request: 7 | branches: 8 | - pytorch_bindings 9 | release: 10 | types: [published] 11 | jobs: 12 | build: 13 | runs-on: ubuntu-20.04 14 | container: 15 | image: espnet/warpctc_builder:cuda100 16 | defaults: 17 | run: 18 | shell: bash --login -eo pipefail {0} 19 | strategy: 20 | matrix: 21 | python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1] 22 | pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0] 23 | exclude: 24 | - python-version: 3.8.7 25 | pytorch-version: 1.1.0 26 | - python-version: 3.8.7 27 | pytorch-version: 1.2.0 28 | - python-version: 3.8.7 29 | pytorch-version: 1.3.1 30 | - python-version: 3.9.1 31 | pytorch-version: 1.1.0 32 | - python-version: 3.9.1 33 | pytorch-version: 1.2.0 34 | - python-version: 3.9.1 35 | pytorch-version: 1.3.1 36 | - python-version: 3.9.1 37 | pytorch-version: 1.4.0 38 | - python-version: 3.9.1 39 | pytorch-version: 1.5.1 40 | - python-version: 3.9.1 41 | pytorch-version: 1.6.0 42 | steps: 43 | - uses: actions/checkout@v2 44 | - name: Copy .bash_profile 45 | run: cp /root/.bash_profile ~/ 46 | - name: Build warpctc 47 | run: | 48 | mkdir build 49 | cd build 50 | cmake .. 
51 | make 52 | - name: Set Python version 53 | run: pyenv global ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | pip install -U pip setuptools 57 | pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }} 58 | - name: Build wheel 59 | run: | 60 | cd pytorch_binding 61 | python setup.py bdist_wheel 62 | python wheel/rename_wheels.py 63 | ls dist 64 | echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV 65 | - name: Install wheel 66 | run: | 67 | cd pytorch_binding 68 | pip install dist/warpctc_pytorch*.whl 69 | - name: Run tests 70 | run: | 71 | cd pytorch_binding 72 | pytest --flakes 73 | pytest tests 74 | - name: Set 'upload_url' of the latest release 75 | if: startsWith(github.ref, 'refs/tags/v') 76 | run: | 77 | # https://docs.github.com/en/rest/reference/repos#get-the-latest-release 78 | cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest" 79 | output_json=$($cmd) 80 | echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV 81 | - name: Upload a wheel to the latest release 82 | if: startsWith(github.ref, 'refs/tags/v') 83 | uses: actions/upload-release-asset@v1 84 | env: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | with: 87 | upload_url: ${{ env.UPLOAD_URL }} 88 | asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }} 89 | asset_name: ${{ env.WHEEL_NAME }} 90 | asset_content_type: application/zip 91 | -------------------------------------------------------------------------------- /.github/workflows/cuda101.yaml: -------------------------------------------------------------------------------- 1 | name: CUDA 10.1 2 | on: 3 | push: 4 | branches: 5 | - pytorch_bindings 6 | pull_request: 7 | branches: 8 | - pytorch_bindings 9 | release: 10 | types: [published] 11 | jobs: 12 | build: 13 | runs-on: ubuntu-20.04 14 | container: 15 | image: espnet/warpctc_builder:cuda101 16 | defaults: 17 | run: 18 | shell: bash --login -eo pipefail {0} 19 | strategy: 20 | matrix: 21 | python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1] 22 | pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0] 23 | exclude: 24 | - python-version: 3.8.7 25 | pytorch-version: 1.1.0 26 | - python-version: 3.8.7 27 | pytorch-version: 1.2.0 28 | - python-version: 3.8.7 29 | pytorch-version: 1.3.1 30 | - python-version: 3.9.1 31 | pytorch-version: 1.1.0 32 | - python-version: 3.9.1 33 | pytorch-version: 1.2.0 34 | - python-version: 3.9.1 35 | pytorch-version: 1.3.1 36 | - python-version: 3.9.1 37 | pytorch-version: 1.4.0 38 | - python-version: 3.9.1 39 | pytorch-version: 1.5.1 40 | - python-version: 3.9.1 41 | pytorch-version: 1.6.0 42 | steps: 43 | - uses: actions/checkout@v2 44 | - name: Copy .bash_profile 45 | run: cp /root/.bash_profile ~/ 46 | - name: Build warpctc 47 | run: | 48 | mkdir build 49 | cd build 50 | cmake .. 
51 | make 52 | - name: Set Python version 53 | run: pyenv global ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | pip install -U pip setuptools 57 | pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }} 58 | - name: Build wheel 59 | run: | 60 | cd pytorch_binding 61 | python setup.py bdist_wheel 62 | python wheel/rename_wheels.py 63 | ls dist 64 | echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV 65 | - name: Install wheel 66 | run: | 67 | cd pytorch_binding 68 | pip install dist/warpctc_pytorch*.whl 69 | - name: Run tests 70 | run: | 71 | cd pytorch_binding 72 | pytest --flakes 73 | pytest tests 74 | - name: Set 'upload_url' of the latest release 75 | if: startsWith(github.ref, 'refs/tags/v') 76 | run: | 77 | # https://docs.github.com/en/rest/reference/repos#get-the-latest-release 78 | cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest" 79 | output_json=$($cmd) 80 | echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV 81 | - name: Upload a wheel to the latest release 82 | if: startsWith(github.ref, 'refs/tags/v') 83 | uses: actions/upload-release-asset@v1 84 | env: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | with: 87 | upload_url: ${{ env.UPLOAD_URL }} 88 | asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }} 89 | asset_name: ${{ env.WHEEL_NAME }} 90 | asset_content_type: application/zip 91 | -------------------------------------------------------------------------------- /.github/workflows/cuda102.yaml: -------------------------------------------------------------------------------- 1 | name: CUDA 10.2 2 | on: 3 | push: 4 | branches: 5 | - pytorch_bindings 6 | pull_request: 7 | branches: 8 | - pytorch_bindings 9 | release: 10 | types: [published] 11 | jobs: 12 | build: 13 | runs-on: ubuntu-20.04 14 | container: 15 | image: espnet/warpctc_builder:cuda102 16 | defaults: 17 | run: 18 | shell: bash --login -eo pipefail {0} 19 | strategy: 20 | matrix: 21 | python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1] 22 | pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0] 23 | exclude: 24 | - python-version: 3.8.7 25 | pytorch-version: 1.1.0 26 | - python-version: 3.8.7 27 | pytorch-version: 1.2.0 28 | - python-version: 3.8.7 29 | pytorch-version: 1.3.1 30 | - python-version: 3.9.1 31 | pytorch-version: 1.1.0 32 | - python-version: 3.9.1 33 | pytorch-version: 1.2.0 34 | - python-version: 3.9.1 35 | pytorch-version: 1.3.1 36 | - python-version: 3.9.1 37 | pytorch-version: 1.4.0 38 | - python-version: 3.9.1 39 | pytorch-version: 1.5.1 40 | - python-version: 3.9.1 41 | pytorch-version: 1.6.0 42 | steps: 43 | - uses: actions/checkout@v2 44 | - name: Copy .bash_profile 45 | run: cp /root/.bash_profile ~/ 46 | - name: Build warpctc 47 | run: | 48 | mkdir build 49 | cd build 50 | cmake .. 
51 | make 52 | - name: Set Python version 53 | run: pyenv global ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | pip install -U pip setuptools 57 | pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }} 58 | - name: Build wheel 59 | run: | 60 | cd pytorch_binding 61 | python setup.py bdist_wheel 62 | python wheel/rename_wheels.py 63 | ls dist 64 | echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV 65 | - name: Install wheel 66 | run: | 67 | cd pytorch_binding 68 | pip install dist/warpctc_pytorch*.whl 69 | - name: Run tests 70 | run: | 71 | cd pytorch_binding 72 | pytest --flakes 73 | pytest tests 74 | - name: Set 'upload_url' of the latest release 75 | if: startsWith(github.ref, 'refs/tags/v') 76 | run: | 77 | # https://docs.github.com/en/rest/reference/repos#get-the-latest-release 78 | cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest" 79 | output_json=$($cmd) 80 | echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV 81 | - name: Upload a wheel to the latest release 82 | if: startsWith(github.ref, 'refs/tags/v') 83 | uses: actions/upload-release-asset@v1 84 | env: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | with: 87 | upload_url: ${{ env.UPLOAD_URL }} 88 | asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }} 89 | asset_name: ${{ env.WHEEL_NAME }} 90 | asset_content_type: application/zip 91 | -------------------------------------------------------------------------------- /.github/workflows/cuda92.yaml: -------------------------------------------------------------------------------- 1 | name: CUDA 9.2 2 | on: 3 | push: 4 | branches: 5 | - pytorch_bindings 6 | pull_request: 7 | branches: 8 | - pytorch_bindings 9 | release: 10 | types: [published] 11 | jobs: 12 | build: 13 | runs-on: ubuntu-20.04 14 | container: 15 | image: espnet/warpctc_builder:cuda92 16 | defaults: 17 | run: 18 | shell: bash --login -eo pipefail {0} 19 | strategy: 20 | matrix: 21 | python-version: [3.6.13, 3.7.10, 3.8.7, 3.9.1] 22 | pytorch-version: [1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0] 23 | exclude: 24 | - python-version: 3.8.7 25 | pytorch-version: 1.1.0 26 | - python-version: 3.8.7 27 | pytorch-version: 1.2.0 28 | - python-version: 3.8.7 29 | pytorch-version: 1.3.1 30 | - python-version: 3.9.1 31 | pytorch-version: 1.1.0 32 | - python-version: 3.9.1 33 | pytorch-version: 1.2.0 34 | - python-version: 3.9.1 35 | pytorch-version: 1.3.1 36 | - python-version: 3.9.1 37 | pytorch-version: 1.4.0 38 | - python-version: 3.9.1 39 | pytorch-version: 1.5.1 40 | - python-version: 3.9.1 41 | pytorch-version: 1.6.0 42 | steps: 43 | - uses: actions/checkout@v2 44 | - name: Copy .bash_profile 45 | run: cp /root/.bash_profile ~/ 46 | - name: Build warpctc 47 | run: | 48 | mkdir build 49 | cd build 50 | cmake .. 
51 | make 52 | - name: Set Python version 53 | run: pyenv global ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | pip install -U pip setuptools 57 | pip install numpy pytest pytest-flakes wheel torch==${{ matrix.pytorch-version }} 58 | - name: Build wheel 59 | run: | 60 | cd pytorch_binding 61 | python setup.py bdist_wheel 62 | python wheel/rename_wheels.py 63 | ls dist 64 | echo "WHEEL_NAME=$(basename dist/*.whl)" >> $GITHUB_ENV 65 | - name: Install wheel 66 | run: | 67 | cd pytorch_binding 68 | pip install dist/warpctc_pytorch*.whl 69 | - name: Run tests 70 | run: | 71 | cd pytorch_binding 72 | pytest --flakes 73 | pytest tests 74 | - name: Set 'upload_url' of the latest release 75 | if: startsWith(github.ref, 'refs/tags/v') 76 | run: | 77 | # https://docs.github.com/en/rest/reference/repos#get-the-latest-release 78 | cmd="curl -s -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/espnet/warp-ctc/releases/latest" 79 | output_json=$($cmd) 80 | echo "UPLOAD_URL=$(echo $output_json | jq .upload_url | sed 's/"//g')" >> $GITHUB_ENV 81 | - name: Upload a wheel to the latest release 82 | if: startsWith(github.ref, 'refs/tags/v') 83 | uses: actions/upload-release-asset@v1 84 | env: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | with: 87 | upload_url: ${{ env.UPLOAD_URL }} 88 | asset_path: pytorch_binding/dist/${{ env.WHEEL_NAME }} 89 | asset_name: ${{ env.WHEEL_NAME }} 90 | asset_content_type: application/zip 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | Makefile 3 | build -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | IF (APPLE) 2 | cmake_minimum_required(VERSION 3.4) 3 | ELSE() 4 | cmake_minimum_required(VERSION 2.8) 5 | ENDIF() 6 | 7 | project(ctc_release) 8 | 9 | IF (NOT APPLE) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 11 | ENDIF() 12 | 13 | IF (APPLE) 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2") 15 | add_definitions(-DAPPLE) 16 | ENDIF() 17 | 18 | include_directories(include) 19 | 20 | FIND_PACKAGE(CUDA 6.5) 21 | MESSAGE(STATUS "cuda found ${CUDA_FOUND}") 22 | 23 | option(WITH_GPU "compile warp-ctc with cuda." ${CUDA_FOUND}) 24 | option(WITH_OMP "compile warp-ctc with openmp." 
ON) 25 | 26 | if(NOT WITH_OMP) 27 | add_definitions(-DCTC_DISABLE_OMP) 28 | endif() 29 | if (WITH_OMP) 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") 31 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") 32 | endif() 33 | 34 | # need to be at least 30 or __shfl_down in reduce wont compile 35 | IF (CUDA_VERSION LESS 11.0) 36 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") 37 | ENDIF() 38 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") 39 | 40 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") 41 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") 42 | IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5) 43 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES") 44 | ENDIF() 45 | 46 | IF (CUDA_VERSION GREATER 7.6) 47 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") 48 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") 49 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") 50 | ENDIF() 51 | 52 | IF (CUDA_VERSION GREATER 8.9) 53 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_70") 54 | ENDIF() 55 | 56 | if (NOT APPLE) 57 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++14") 58 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") 59 | ENDIF() 60 | 61 | IF (APPLE) 62 | EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 63 | STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 64 | MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") 65 | 66 | #for el capitain have to use rpath 67 | 68 | IF (DARWIN_VERSION LESS 15) 69 | set(CMAKE_SKIP_RPATH TRUE) 70 | ENDIF () 71 | 72 | ELSE() 73 | #always skip for linux 74 | set(CMAKE_SKIP_RPATH TRUE) 75 | ENDIF() 76 | 77 | 78 | IF (WITH_GPU) 79 | 80 | MESSAGE(STATUS "Building shared library with GPU support") 81 | 82 | CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu) 83 | IF (!Torch_FOUND) 84 | TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) 85 | ENDIF() 86 | 87 | add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp ) 88 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 89 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14") 90 | 91 | cuda_add_executable(test_gpu tests/test_gpu.cu tests/random.cpp ) 92 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) 93 | SET_TARGET_PROPERTIES(test_gpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14") 94 | 95 | INSTALL(TARGETS warpctc 96 | RUNTIME DESTINATION "bin" 97 | LIBRARY DESTINATION "lib" 98 | ARCHIVE DESTINATION "lib") 99 | 100 | INSTALL(FILES include/ctc.h DESTINATION "include") 101 | ELSE() 102 | MESSAGE(STATUS "Building shared library with no GPU support") 103 | 104 | if (NOT APPLE) 105 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2") 106 | ENDIF() 107 | 108 | ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) 109 | 110 | add_executable(test_cpu tests/test_cpu.cpp tests/random.cpp ) 111 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 112 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14") 113 | 114 | INSTALL(TARGETS warpctc 115 | RUNTIME DESTINATION "bin" 116 | LIBRARY DESTINATION "lib" 117 | ARCHIVE DESTINATION "lib") 118 | 119 | INSTALL(FILES include/ctc.h DESTINATION "include") 120 | ENDIF() 121 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright 2015-2016 Baidu USA LLC. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 
63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 
181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2015-2016, Baidu USA LLC. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch bindings for Warp-ctc 2 | 3 | |branch|status| 4 | |:-:|:-:| 5 | |`pytorch_bindings`|[![Build Status](https://travis-ci.org/espnet/warp-ctc.svg?branch=pytorch_bindings)](https://github.com/espnet/warp-ctc/tree/pytorch_bindings)| 6 | |`pytorch-0.4`|[![Build Status](https://travis-ci.org/espnet/warp-ctc.svg?branch=pytorch-0.4)](https://github.com/espnet/warp-ctc/tree/pytorch-0.4)| 7 | |`pytorch-1.0`|[![Build Status](https://travis-ci.org/espnet/warp-ctc.svg?branch=pytorch-1.0)](https://github.com/espnet/warp-ctc/tree/pytorch-1.0)| 8 | 9 | This is an extension onto the original repo found [here](https://github.com/baidu-research/warp-ctc). 10 | 11 | ## Installation 12 | 13 | Install [PyTorch](https://github.com/pytorch/pytorch#installation) first. 14 | 15 | `warpctc-pytorch` wheel uses [local version identifiers](https://www.python.org/dev/peps/pep-0440/#local-version-identifiers), 16 | which has a restriction that users have to specify the version explicitly. 17 | 18 | ```console 19 | $ pip install warpctc-pytorch==X.X.X+torchYY.cudaZZ 20 | ``` 21 | 22 | The latest version is 0.2.1 and if you work with PyTorch 1.6 and CUDA 10.2, you can run: 23 | 24 | ```console 25 | $ pip install warpctc-pytorch==0.2.1+torch16.cuda102 26 | ``` 27 | 28 | ### for PyTorch 1.4 - 1.6 29 | 30 | `warpctc-pytorch` wheels are provided for Python 3.8, 3.7, 3.6 and CUDA 10.2, 10.1, 10.0, 9.2. 31 | 32 | ### for PyTorch 1.1 - 1.3 33 | 34 | `warpctc-pytorch` wheels are provided for Python 3.7, 3.6 and CUDA 10.2, 10.1, 10.0, 9.2. 35 | 36 | ### for PyTorch 1.0 37 | 38 | `warpctc-pytorch10-cudaYY` wheels are provided for Python 3.7, 3.6 and CUDA 10.1, 10.0, 9.2, 9.1, 9.0, 8.0. 39 | 40 | If you work with CUDA 10.1, you can run: 41 | 42 | ```console 43 | $ pip install warpctc-pytorch10-cuda101 44 | ``` 45 | 46 | ### for PyTorch 0.4.1 47 | 48 | Wheels for PyTorch 0.4.1 are not provided so users have to build from source manually. 49 | 50 | `WARP_CTC_PATH` should be set to the location of a built WarpCTC 51 | (i.e. `libwarpctc.so`). 
This defaults to `../build`, so from within a 52 | new warp-ctc clone you could build WarpCTC like this: 53 | 54 | ```bash 55 | $ git clone https://github.com/espnet/warp-ctc.git 56 | $ cd warp-ctc; git checkout -b pytorch-0.4 remotes/origin/pytorch-0.4 57 | $ mkdir build; cd build 58 | $ cmake .. 59 | $ make 60 | ``` 61 | 62 | Now install the bindings: 63 | ```bash 64 | $ cd ../pytorch_binding 65 | $ pip install numpy cffi 66 | $ python setup.py install 67 | ``` 68 | 69 | ## Example 70 | 71 | Example to use the bindings below. 72 | 73 | ```python 74 | import torch 75 | from warpctc_pytorch import CTCLoss 76 | ctc_loss = CTCLoss() 77 | # expected shape of seqLength x batchSize x alphabet_size 78 | probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous() 79 | labels = torch.IntTensor([1, 2]) 80 | label_sizes = torch.IntTensor([2]) 81 | probs_sizes = torch.IntTensor([2]) 82 | probs.requires_grad_(True) # tells autograd to compute gradients for probs 83 | cost = ctc_loss(probs, labels, probs_sizes, label_sizes) 84 | cost.backward() 85 | ``` 86 | 87 | ## Documentation 88 | 89 | ``` 90 | CTCLoss(size_average=False, length_average=False, reduce=True) 91 | # size_average (bool): normalize the loss by the batch size (default: False) 92 | # length_average (bool): normalize the loss by the total number of frames in the batch. If True, supersedes size_average (default: False) 93 | # reduce (bool): average or sum over observation for each minibatch. 94 | If `False`, returns a loss per batch element instead and ignores `average` options. 95 | (default: `True`) 96 | 97 | forward(acts, labels, act_lens, label_lens) 98 | # acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax) 99 | # labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence 100 | # act_lens: Tensor of size (batch) containing size of each output sequence from the network 101 | # label_lens: Tensor of (batch) containing label length of each example 102 | ``` 103 | -------------------------------------------------------------------------------- /ci/README.md: -------------------------------------------------------------------------------- 1 | Docker image builder for Travis CI 2 | === 3 | 4 | This directory contains tools to build following Docker images used in Travis CI, 5 | 6 | - `espnet/warpctc_builder:cuda101` for CUDA 10.1 7 | - `espnet/warpctc_builder:cuda100` for CUDA 10.0 8 | - `espnet/warpctc_builder:cuda92` for CUDA 9.2 9 | - `espnet/warpctc_builder:cuda91` for CUDA 9.1 10 | - `espnet/warpctc_builder:cuda90` for CUDA 9.0 11 | - `espnet/warpctc_builder:cuda80` for CUDA 8.0 12 | - `espnet/warpctc_builder:cpu` for no CUDA environment 13 | 14 | 15 | ## Building Docker images 16 | 17 | Run `build.sh`. 18 | 19 | ```console 20 | $ ./build.sh 21 | ``` 22 | 23 | ## Uploading images to Dockerhub 24 | 25 | Run `docker push`. 26 | 27 | ```console 28 | $ docker push espnet/warpctc_builder:TAG 29 | ``` 30 | 31 | Note that your Dockerhub account have write access to [espnet/warpctc_builder](https://hub.docker.com/r/espnet/warpctc_builder) repository. 
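For example, pushing the CUDA 10.2 image together with the date-stamped alias that `build.sh` also tags (the date below is only illustrative) would look like:

```console
$ docker push espnet/warpctc_builder:cuda102
$ docker push espnet/warpctc_builder:cuda102-20210301
```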
32 | -------------------------------------------------------------------------------- /ci/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | YYYYMMDD=$(date '+%Y%m%d') 4 | image_repository=espnet/warpctc_builder 5 | cuda_versions=(9.2 10.0 10.1 10.2) 6 | for cuda_version in ${cuda_versions[@]}; do 7 | # gcc version check exists in /usr/local/cuda/include/crt/host_config.h 8 | devtoolset_version=8 9 | if [ "$cuda_version" = "10.0" ] || [ "$cuda_version" = "9.2" ]; then 10 | devtoolset_version=7 11 | fi 12 | base_image="nvidia/cuda:$cuda_version-cudnn7-devel-centos7" 13 | image_tag=cuda${cuda_version/./} 14 | image_name=$image_repository:$image_tag 15 | echo "Building $image_name" 16 | docker build --no-cache --build-arg base_image=$base_image --build-arg devtoolset_version=$devtoolset_version -t $image_name ./gpu 17 | docker tag $image_name $image_name-$YYYYMMDD 18 | echo -e "Done.\n" 19 | done 20 | 21 | image_tag=cpu 22 | image_name=$image_repository:$image_tag 23 | echo "Building $image_name" 24 | docker build --no-cache -t $image_name ./cpu 25 | docker tag $image_name $image_name-$YYYYMMDD 26 | echo Done. 27 | -------------------------------------------------------------------------------- /ci/cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM centos:centos7 2 | 3 | RUN yum update -y \ 4 | && yum install -y epel-release \ 5 | && yum install -y centos-release-scl \ 6 | && yum install -y devtoolset-9-gcc-c++ \ 7 | && echo 'source scl_source enable devtoolset-9' >> ~/.bash_profile \ 8 | && yum install -y \ 9 | bzip2-devel \ 10 | cmake \ 11 | git \ 12 | jq \ 13 | libffi-devel \ 14 | make \ 15 | openssl-devel \ 16 | readline-devel \ 17 | sqlite-devel \ 18 | which \ 19 | zlib-devel \ 20 | && yum clean all \ 21 | && rm -rf /var/cache/yum/* 22 | # Install pyenv 23 | RUN git clone https://github.com/pyenv/pyenv.git /opt/pyenv 24 | ENV PYENV_ROOT /opt/pyenv 25 | ENV PATH ${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH} 26 | # Install Python 27 | SHELL ["/bin/bash", "-c"] 28 | ENV PYTHON_VERSIONS 3.6.13 3.7.10 3.8.7 3.9.1 29 | RUN source scl_source enable devtoolset-9 \ 30 | && for python_version in ${PYTHON_VERSIONS}; do \ 31 | pyenv install ${python_version}; \ 32 | done 33 | -------------------------------------------------------------------------------- /ci/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG base_image 2 | FROM ${base_image} 3 | ARG devtoolset_version 4 | 5 | ENV CUDA_HOME /usr/local/cuda 6 | 7 | RUN yum update -y \ 8 | && yum install -y epel-release \ 9 | && yum install -y centos-release-scl \ 10 | && yum install -y devtoolset-${devtoolset_version}-gcc-c++ \ 11 | && echo "source scl_source enable devtoolset-${devtoolset_version}" >> ~/.bash_profile \ 12 | && yum install -y \ 13 | bzip2-devel \ 14 | cmake \ 15 | git \ 16 | jq \ 17 | libffi-devel \ 18 | make \ 19 | openssl-devel \ 20 | readline-devel \ 21 | sqlite-devel \ 22 | which \ 23 | zlib-devel \ 24 | && yum clean all \ 25 | && rm -rf /var/cache/yum/* 26 | # Install pyenv 27 | RUN git clone https://github.com/pyenv/pyenv.git /opt/pyenv 28 | ENV PYENV_ROOT /opt/pyenv 29 | ENV PATH ${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH} 30 | # Install Python 31 | ENV PYTHON_VERSIONS 3.6.13 3.7.10 3.8.7 3.9.1 32 | RUN source scl_source enable devtoolset-${devtoolset_version} \ 33 | && for python_version in ${PYTHON_VERSIONS}; do \ 34 | pyenv install ${python_version}; \ 
35 | done 36 | -------------------------------------------------------------------------------- /doc/baidu-research-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/espnet/warp-ctc/0705437521a1302f38692fb684f4743f8a85d324/doc/baidu-research-logo-small.png -------------------------------------------------------------------------------- /doc/deep-speech-ctc-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/espnet/warp-ctc/0705437521a1302f38692fb684f4743f8a85d324/doc/deep-speech-ctc-small.png -------------------------------------------------------------------------------- /include/contrib/moderngpu/LICENSE: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctaloadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasearch.cuh" 38 | #include "loadstore.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // DeviceLoadBalancingSearch 44 | // Upper Bound search from A (needles) into B (haystack). The A values are 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at 46 | // bBegin in shared memory. 47 | 48 | template 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin, 50 | int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) { 51 | 52 | int bKey = b_shared[bBegin]; 53 | 54 | #pragma unroll 55 | for(int i = 0; i < VT; ++i) { 56 | bool p; 57 | if(RangeCheck) 58 | p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey)); 59 | else 60 | p = aBegin < bKey; 61 | 62 | if(p) 63 | // Advance A (the needle). 64 | a_shared[aBegin++] = bFirst + bBegin; 65 | else 66 | // Advance B (the haystack). 67 | bKey = b_shared[++bBegin]; 68 | } 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | // CTALoadBalance 73 | // Computes upper_bound(counting_iterator(first), b_global) - 1. 74 | 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory. 76 | // This returns the loaded B elements at the beginning or end of shared memory 77 | // depending on the aFirst argument. 78 | 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory. 80 | template 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global, 82 | int sourceCount, int block, int tid, const int* mp_global, 83 | int* indices_shared, bool loadPrecedingB) { 84 | 85 | int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT, 86 | mp_global); 87 | 88 | int a0 = range.x; 89 | int a1 = range.y; 90 | int b0 = range.z; 91 | int b1 = range.w; 92 | if(!b0) loadPrecedingB = false; 93 | 94 | // Load one trailing term from B. If we're already at the end, fill the 95 | // end of the buffer with destCount. 
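    // indices_shared is partitioned below: the first aCount slots receive the
    // load-balance results for A, and the B keys (plus the optional preceding
    // and trailing terms) are staged immediately after them, which is why the
    // function requires the NT * VT + 2 shared-memory slots noted above.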
96 | int aCount = a1 - a0; 97 | int bCount = b1 - b0; 98 | int extended = b1 < sourceCount; 99 | int loadCount = bCount + extended; 100 | int fillCount = NT * VT + 1 - loadCount - aCount; 101 | 102 | int* a_shared = indices_shared; 103 | int* b_shared = indices_shared + aCount + (int)loadPrecedingB; 104 | 105 | // Load the B values. 106 | // DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB, 107 | // b_global + b0 - (int)loadPrecedingB, tid, 108 | // b_shared - (int)loadPrecedingB); 109 | 110 | for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT) 111 | b_shared[i] = b_global[b0 + i]; 112 | 113 | // Fill the end of the array with destCount. 114 | for(int i = tid + extended; i < fillCount; i += NT) 115 | b_shared[bCount + i] = destCount; 116 | __syncthreads(); 117 | 118 | // Run a merge path to find the start of the serial merge for each thread. 119 | int diag = VT * tid; 120 | int mp = MergePath(mgpu::counting_iterator(a0), 121 | aCount, b_shared, bCount, diag, mgpu::less()); 122 | 123 | int a0tid = a0 + mp; 124 | int b0tid = diag - mp; 125 | 126 | // Subtract 1 from b0 because we want to return upper_bound - 1. 127 | DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1, 128 | b0tid, bCount, a_shared - a0); 129 | __syncthreads(); 130 | 131 | b0 -= (int)loadPrecedingB; 132 | return make_int4(a0, a1, b0, b1); 133 | } 134 | 135 | 136 | } // namespace mgpu 137 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctamerge.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasearch.cuh" 38 | #include "loadstore.cuh" 39 | #include "sortnetwork.cuh" 40 | 41 | namespace mgpu { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // SerialMerge 45 | 46 | template 47 | MGPU_DEVICE void SerialMerge(const T* keys_shared, int aBegin, int aEnd, 48 | int bBegin, int bEnd, T* results, int* indices, Comp comp) { 49 | 50 | T aKey = keys_shared[aBegin]; 51 | T bKey = keys_shared[bBegin]; 52 | 53 | #pragma unroll 54 | for(int i = 0; i < VT; ++i) { 55 | bool p; 56 | if(RangeCheck) 57 | p = (bBegin >= bEnd) || ((aBegin < aEnd) && !comp(bKey, aKey)); 58 | else 59 | p = !comp(bKey, aKey); 60 | 61 | results[i] = p ? aKey : bKey; 62 | indices[i] = p ? aBegin : bBegin - !RangeCheck; 63 | 64 | if(p) aKey = keys_shared[++aBegin]; 65 | else bKey = keys_shared[++bBegin]; 66 | } 67 | __syncthreads(); 68 | } 69 | 70 | //////////////////////////////////////////////////////////////////////////////// 71 | // FindMergeFrame and FindMergesortInterval help mergesort (both CTA and global 72 | // merge pass levels) locate lists within the single source array. 73 | 74 | // Returns (offset of a, offset of b, length of list). 75 | MGPU_HOST_DEVICE int3 FindMergesortFrame(int coop, int block, int nv) { 76 | // coop is the number of CTAs or threads cooperating to merge two lists into 77 | // one. We round block down to the first CTA's ID that is working on this 78 | // merge. 79 | int start = ~(coop - 1) & block; 80 | int size = nv * (coop>> 1); 81 | return make_int3(nv * start, nv * start + size, size); 82 | } 83 | 84 | // Returns (a0, a1, b0, b1) into mergesort input lists between mp0 and mp1. 85 | MGPU_HOST_DEVICE int4 FindMergesortInterval(int3 frame, int coop, int block, 86 | int nv, int count, int mp0, int mp1) { 87 | 88 | // Locate diag from the start of the A sublist. 89 | int diag = nv * block - frame.x; 90 | int a0 = frame.x + mp0; 91 | int a1 = min(count, frame.x + mp1); 92 | int b0 = min(count, frame.y + diag - mp0); 93 | int b1 = min(count, frame.y + diag + nv - mp1); 94 | 95 | // The end partition of the last block for each merge operation is computed 96 | // and stored as the begin partition for the subsequent merge. i.e. it is 97 | // the same partition but in the wrong coordinate system, so its 0 when it 98 | // should be listSize. Correct that by checking if this is the last block 99 | // in this merge operation. 100 | if(coop - 1 == ((coop - 1) & block)) { 101 | a1 = min(count, frame.x + frame.z); 102 | b1 = min(count, frame.y + frame.z); 103 | } 104 | return make_int4(a0, a1, b0, b1); 105 | } 106 | 107 | //////////////////////////////////////////////////////////////////////////////// 108 | // ComputeMergeRange 109 | 110 | MGPU_HOST_DEVICE int4 ComputeMergeRange(int aCount, int bCount, int block, 111 | int coop, int NV, const int* mp_global) { 112 | 113 | // Load the merge paths computed by the partitioning kernel. 114 | int mp0 = mp_global[block]; 115 | int mp1 = mp_global[block + 1]; 116 | int gid = NV * block; 117 | 118 | // Compute the ranges of the sources in global memory. 
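    // When coop is non-zero this tile is one of `coop` CTAs cooperating on a
    // single mergesort merge, so the A/B intervals come from the mergesort
    // frame; when coop is zero this is a plain two-array merge and the
    // intervals follow directly from the two merge-path splits.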
119 | int4 range; 120 | if(coop) { 121 | int3 frame = FindMergesortFrame(coop, block, NV); 122 | range = FindMergesortInterval(frame, coop, block, NV, aCount, mp0, 123 | mp1); 124 | } else { 125 | range.x = mp0; // a0 126 | range.y = mp1; // a1 127 | range.z = gid - range.x; // b0 128 | range.w = min(aCount + bCount, gid + NV) - range.y; // b1 129 | } 130 | return range; 131 | } 132 | 133 | //////////////////////////////////////////////////////////////////////////////// 134 | // CTA mergesort support 135 | 136 | template 137 | MGPU_DEVICE void CTABlocksortPass(T* keys_shared, int tid, int count, 138 | int coop, T* keys, int* indices, Comp comp) { 139 | 140 | int list = ~(coop - 1) & tid; 141 | int diag = min(count, VT * ((coop - 1) & tid)); 142 | int start = VT * list; 143 | int a0 = min(count, start); 144 | int b0 = min(count, start + VT * (coop / 2)); 145 | int b1 = min(count, start + VT * coop); 146 | 147 | int p = MergePath(keys_shared + a0, b0 - a0, 148 | keys_shared + b0, b1 - b0, diag, comp); 149 | 150 | SerialMerge(keys_shared, a0 + p, b0, b0 + diag - p, b1, keys, 151 | indices, comp); 152 | } 153 | 154 | template 156 | MGPU_DEVICE void CTABlocksortLoop(ValType threadValues[VT], 157 | KeyType* keys_shared, ValType* values_shared, int tid, int count, 158 | Comp comp) { 159 | 160 | #pragma unroll 161 | for(int coop = 2; coop <= NT; coop *= 2) { 162 | int indices[VT]; 163 | KeyType keys[VT]; 164 | CTABlocksortPass(keys_shared, tid, count, coop, keys, 165 | indices, comp); 166 | 167 | if(HasValues) { 168 | // Exchange the values through shared memory. 169 | DeviceThreadToShared(threadValues, tid, values_shared); 170 | DeviceGather(NT * VT, values_shared, indices, tid, 171 | threadValues); 172 | } 173 | 174 | // Store results in shared memory in sorted order. 175 | DeviceThreadToShared(keys, tid, keys_shared); 176 | } 177 | } 178 | 179 | //////////////////////////////////////////////////////////////////////////////// 180 | // CTAMergesort 181 | // Caller provides the keys in shared memory. This functions sorts the first 182 | // count elements. 183 | 184 | template 186 | MGPU_DEVICE void CTAMergesort(KeyType threadKeys[VT], ValType threadValues[VT], 187 | KeyType* keys_shared, ValType* values_shared, int count, int tid, 188 | Comp comp) { 189 | 190 | // Stable sort the keys in the thread. 191 | if(VT * tid < count) { 192 | if(Stable) 193 | OddEvenTransposeSort(threadKeys, threadValues, comp); 194 | else 195 | OddEvenMergesort(threadKeys, threadValues, comp); 196 | } 197 | 198 | // Store the locally sorted keys into shared memory. 199 | DeviceThreadToShared(threadKeys, tid, keys_shared); 200 | 201 | // Recursively merge lists until the entire CTA is sorted. 
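    // Each pass of CTABlocksortLoop doubles the cooperation width
    // (2, 4, ..., NT threads per merged list), so after log2(NT) passes the
    // whole tile (the first count of its NT * VT keys) is sorted in shared
    // memory.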
202 | CTABlocksortLoop(threadValues, keys_shared, 203 | values_shared, tid, count, comp); 204 | } 205 | 206 | template 207 | MGPU_DEVICE void CTAMergesortKeys(KeyType threadKeys[VT], 208 | KeyType* keys_shared, int count, int tid, Comp comp) { 209 | 210 | int valuesTemp[VT]; 211 | CTAMergesort(threadKeys, valuesTemp, keys_shared, 212 | (int*)keys_shared, count, tid, comp); 213 | } 214 | 215 | template 217 | MGPU_DEVICE void CTAMergesortPairs(KeyType threadKeys[VT], 218 | ValType threadValues[VT], KeyType* keys_shared, ValType* values_shared, 219 | int count, int tid, Comp comp) { 220 | 221 | CTAMergesort(threadKeys, threadValues, keys_shared, 222 | values_shared, count, tid, comp); 223 | } 224 | 225 | //////////////////////////////////////////////////////////////////////////////// 226 | // DeviceMergeKeysIndices 227 | 228 | template 230 | MGPU_DEVICE void DeviceMergeKeysIndices(It1 a_global, int aCount, It2 b_global, 231 | int bCount, int4 range, int tid, T* keys_shared, T* results, int* indices, 232 | Comp comp) { 233 | 234 | int a0 = range.x; 235 | int a1 = range.y; 236 | int b0 = range.z; 237 | int b1 = range.w; 238 | 239 | if(LoadExtended) { 240 | bool extended = (a1 < aCount) && (b1 < bCount); 241 | aCount = a1 - a0; 242 | bCount = b1 - b0; 243 | int aCount2 = aCount + (int)extended; 244 | int bCount2 = bCount + (int)extended; 245 | 246 | // Load one element past the end of each input to avoid having to use 247 | // range checking in the merge loop. 248 | DeviceLoad2ToShared(a_global + a0, aCount2, 249 | b_global + b0, bCount2, tid, keys_shared); 250 | 251 | // Run a Merge Path search for each thread's starting point. 252 | int diag = VT * tid; 253 | int mp = MergePath(keys_shared, aCount, 254 | keys_shared + aCount2, bCount, diag, comp); 255 | 256 | // Compute the ranges of the sources in shared memory. 257 | int a0tid = mp; 258 | int b0tid = aCount2 + diag - mp; 259 | if(extended) { 260 | SerialMerge(keys_shared, a0tid, 0, b0tid, 0, results, 261 | indices, comp); 262 | } else { 263 | int a1tid = aCount; 264 | int b1tid = aCount2 + bCount; 265 | SerialMerge(keys_shared, a0tid, a1tid, b0tid, b1tid, 266 | results, indices, comp); 267 | } 268 | } else { 269 | // Use the input intervals from the ranges between the merge path 270 | // intersections. 271 | aCount = a1 - a0; 272 | bCount = b1 - b0; 273 | 274 | // Load the data into shared memory. 275 | DeviceLoad2ToShared(a_global + a0, aCount, b_global + b0, 276 | bCount, tid, keys_shared); 277 | 278 | // Run a merge path to find the start of the serial merge for each 279 | // thread. 280 | int diag = VT * tid; 281 | int mp = MergePath(keys_shared, aCount, 282 | keys_shared + aCount, bCount, diag, comp); 283 | 284 | // Compute the ranges of the sources in shared memory. 285 | int a0tid = mp; 286 | int a1tid = aCount; 287 | int b0tid = aCount + diag - mp; 288 | int b1tid = aCount + bCount; 289 | 290 | // Serial merge into register. 291 | SerialMerge(keys_shared, a0tid, a1tid, b0tid, b1tid, results, 292 | indices, comp); 293 | } 294 | } 295 | 296 | //////////////////////////////////////////////////////////////////////////////// 297 | // DeviceMerge 298 | // Merge pairs from global memory into global memory. Useful factorization to 299 | // enable calling from merge, mergesort, and locality sort. 
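// The caller supplies the tile's range (typically from ComputeMergeRange), a
// shared-memory staging buffer for keys and one for gather indices.
// DeviceMergeKeysIndices leaves the merged keys and their source indices in
// register; the keys are round-tripped through shared memory and written to
// keys_global, and when HasValues is set the indices drive a gather of the
// associated values into vals_global. A merge kernel therefore calls it
// roughly like this (sketch only; buffer names are illustrative):
//
//   int4 range = ComputeMergeRange(aCount, bCount, block, coop, NT * VT,
//       mp_global);
//   DeviceMerge<NT, VT, HasValues>(aKeys_global, aVals_global, aCount,
//       bKeys_global, bVals_global, bCount, tid, block, range, shared.keys,
//       shared.indices, keysDest_global, valsDest_global, comp);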
300 | 301 | template 304 | MGPU_DEVICE void DeviceMerge(KeysIt1 aKeys_global, ValsIt1 aVals_global, 305 | int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount, 306 | int tid, int block, int4 range, KeyType* keys_shared, int* indices_shared, 307 | KeysIt3 keys_global, ValsIt3 vals_global, Comp comp) { 308 | 309 | KeyType results[VT]; 310 | int indices[VT]; 311 | DeviceMergeKeysIndices(aKeys_global, aCount, 312 | bKeys_global, bCount, range, tid, keys_shared, results, indices, comp); 313 | 314 | // Store merge results back to shared memory. 315 | DeviceThreadToShared(results, tid, keys_shared); 316 | 317 | // Store merged keys to global memory. 318 | aCount = range.y - range.x; 319 | bCount = range.w - range.z; 320 | DeviceSharedToGlobal(aCount + bCount, keys_shared, tid, 321 | keys_global + NT * VT * block); 322 | 323 | // Copy the values. 324 | if(HasValues) { 325 | DeviceThreadToShared(indices, tid, indices_shared); 326 | 327 | DeviceTransferMergeValuesShared(aCount + bCount, 328 | aVals_global + range.x, bVals_global + range.z, aCount, 329 | indices_shared, tid, vals_global + NT * VT * block); 330 | } 331 | } 332 | 333 | } // namespace mgpu 334 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctascan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuenums.h" 38 | #include "deviceutil.cuh" 39 | #include "intrinsics.cuh" 40 | 41 | namespace mgpu { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // CTAReduce 45 | 46 | template > 47 | struct CTAReduce { 48 | typedef typename Op::first_argument_type T; 49 | enum { Size = NT, Capacity = NT }; 50 | struct Storage { T shared[Capacity]; }; 51 | 52 | MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) { 53 | storage.shared[tid] = x; 54 | __syncthreads(); 55 | 56 | // Fold the data in half with each pass. 57 | #pragma unroll 58 | for(int destCount = NT / 2; destCount >= 1; destCount /= 2) { 59 | if(tid < destCount) { 60 | // Read from the right half and store to the left half. 61 | x = op(x, storage.shared[destCount + tid]); 62 | storage.shared[tid] = x; 63 | } 64 | __syncthreads(); 65 | } 66 | T total = storage.shared[0]; 67 | __syncthreads(); 68 | return total; 69 | } 70 | }; 71 | 72 | #if __CUDA_ARCH__ >= 300 73 | 74 | template 75 | struct CTAReduce > { 76 | typedef mgpu::plus Op; 77 | typedef int T; 78 | enum { Size = NT, Capacity = WARP_SIZE }; 79 | struct Storage { int shared[Capacity]; }; 80 | 81 | MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage, 82 | Op op = Op()) { 83 | 84 | const int NumSections = WARP_SIZE; 85 | const int SecSize = NT / NumSections; 86 | int lane = (SecSize - 1) & tid; 87 | int sec = tid / SecSize; 88 | 89 | // In the first phase, threads cooperatively find the reduction within 90 | // their segment. The segments are SecSize threads (NT / WARP_SIZE) 91 | // wide. 92 | #pragma unroll 93 | for(int offset = 1; offset < SecSize; offset *= 2) 94 | x = shfl_add(x, offset, SecSize); 95 | 96 | // The last thread in each segment stores the local reduction to shared 97 | // memory. 98 | if(SecSize - 1 == lane) storage.shared[sec] = x; 99 | __syncthreads(); 100 | 101 | // Reduce the totals of each input segment. The spine is WARP_SIZE 102 | // threads wide. 
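// Only the first WARP_SIZE threads take part in this spine step: each one
// picks up a per-segment partial from shared memory, and the warp-wide
// shfl_add sequence leaves the grand total in the last lane.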
103 | if(tid < NumSections) { 104 | x = storage.shared[tid]; 105 | #pragma unroll 106 | for(int offset = 1; offset < NumSections; offset *= 2) 107 | x = shfl_add(x, offset, NumSections); 108 | storage.shared[tid] = x; 109 | } 110 | __syncthreads(); 111 | 112 | int reduction = storage.shared[NumSections - 1]; 113 | __syncthreads(); 114 | 115 | return reduction; 116 | } 117 | }; 118 | 119 | template 120 | struct CTAReduce > { 121 | typedef mgpu::maximum Op; 122 | enum { Size = NT, Capacity = WARP_SIZE }; 123 | struct Storage { int shared[Capacity]; }; 124 | 125 | MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage, 126 | Op op = Op()) { 127 | 128 | const int NumSections = WARP_SIZE; 129 | const int SecSize = NT / NumSections; 130 | int lane = (SecSize - 1) & tid; 131 | int sec = tid / SecSize; 132 | 133 | #pragma unroll 134 | for(int offset = 1; offset < SecSize; offset *= 2) 135 | x = shfl_max(x, offset, SecSize); 136 | 137 | if(SecSize - 1 == lane) storage.shared[sec] = x; 138 | __syncthreads(); 139 | 140 | if(tid < NumSections) { 141 | x = storage.shared[tid]; 142 | #pragma unroll 143 | for(int offset = 1; offset < NumSections; offset *= 2) 144 | x = shfl_max(x, offset, NumSections); 145 | storage.shared[tid] = x; 146 | } 147 | __syncthreads(); 148 | 149 | int reduction = storage.shared[NumSections - 1]; 150 | __syncthreads(); 151 | 152 | return reduction; 153 | } 154 | }; 155 | 156 | #endif // __CUDA_ARCH__ >= 300 157 | 158 | //////////////////////////////////////////////////////////////////////////////// 159 | // CTAScan 160 | 161 | template > 162 | struct CTAScan { 163 | typedef typename Op::result_type T; 164 | enum { Size = NT, Capacity = 2 * NT + 1 }; 165 | struct Storage { T shared[Capacity]; }; 166 | 167 | MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total, 168 | MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) { 169 | 170 | storage.shared[tid] = x; 171 | int first = 0; 172 | __syncthreads(); 173 | 174 | #pragma unroll 175 | for(int offset = 1; offset < NT; offset += offset) { 176 | if(tid >= offset) 177 | x = op(storage.shared[first + tid - offset], x); 178 | first = NT - first; 179 | storage.shared[first + tid] = x; 180 | __syncthreads(); 181 | } 182 | *total = storage.shared[first + NT - 1]; 183 | 184 | if(MgpuScanTypeExc == type) 185 | x = tid ? storage.shared[first + tid - 1] : identity; 186 | 187 | __syncthreads(); 188 | return x; 189 | } 190 | MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) { 191 | T total; 192 | return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op()); 193 | } 194 | }; 195 | 196 | //////////////////////////////////////////////////////////////////////////////// 197 | // Special partial specialization for CTAScan on Kepler. 198 | // This uses the shfl intrinsic to reduce scan latency. 199 | 200 | #if __CUDA_ARCH__ >= 300 201 | 202 | template 203 | struct CTAScan > { 204 | typedef mgpu::plus Op; 205 | enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments }; 206 | enum { Capacity = NumSegments + 1 }; 207 | struct Storage { int shared[Capacity + 1]; }; 208 | 209 | MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total, 210 | MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) { 211 | 212 | // Define WARP_SIZE segments that are NT / WARP_SIZE large. 213 | // Each warp makes log(SegSize) shfl_add calls. 214 | // The spine makes log(WARP_SIZE) shfl_add calls. 
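// Example: with NT = 256 and WARP_SIZE = 32, SegSize = 8, so each segment
// needs 3 shfl_add steps and the 32-thread spine needs 5 more, versus the
// 8 shared-memory passes the generic CTAScan above would make.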
215 | int lane = (SegSize - 1) & tid; 216 | int segment = tid / SegSize; 217 | 218 | // Scan each segment using shfl_add. 219 | int scan = x; 220 | #pragma unroll 221 | for(int offset = 1; offset < SegSize; offset *= 2) 222 | scan = shfl_add(scan, offset, SegSize); 223 | 224 | // Store the reduction (last element) of each segment into storage. 225 | if(SegSize - 1 == lane) storage.shared[segment] = scan; 226 | __syncthreads(); 227 | 228 | // Warp 0 does a full shfl warp scan on the partials. The total is 229 | // stored to shared[NumSegments]. (NumSegments = WARP_SIZE) 230 | if(tid < NumSegments) { 231 | int y = storage.shared[tid]; 232 | int scan = y; 233 | #pragma unroll 234 | for(int offset = 1; offset < NumSegments; offset *= 2) 235 | scan = shfl_add(scan, offset, NumSegments); 236 | storage.shared[tid] = scan - y; 237 | if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan; 238 | } 239 | __syncthreads(); 240 | 241 | // Add the scanned partials back in and convert to exclusive scan. 242 | scan += storage.shared[segment]; 243 | if(MgpuScanTypeExc == type) { 244 | scan -= x; 245 | if(identity && !tid) scan = identity; 246 | } 247 | *total = storage.shared[NumSegments]; 248 | __syncthreads(); 249 | 250 | return scan; 251 | } 252 | MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) { 253 | int total; 254 | return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0); 255 | } 256 | }; 257 | 258 | #endif // __CUDA_ARCH__ >= 300 259 | 260 | //////////////////////////////////////////////////////////////////////////////// 261 | // CTABinaryScan 262 | 263 | template 264 | MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) { 265 | const int NumWarps = NT / WARP_SIZE; 266 | int warp = tid / WARP_SIZE; 267 | int lane = (WARP_SIZE - 1); 268 | 269 | // Store the bit totals for each warp. 270 | uint bits = __ballot(x); 271 | shared[warp] = popc(bits); 272 | __syncthreads(); 273 | 274 | #if __CUDA_ARCH__ >= 300 275 | if(tid < NumWarps) { 276 | int x = shared[tid]; 277 | int scan = x; 278 | #pragma unroll 279 | for(int offset = 1; offset < NumWarps; offset *= 2) 280 | scan = shfl_add(scan, offset, NumWarps); 281 | shared[tid] = scan - x; 282 | } 283 | __syncthreads(); 284 | 285 | #else 286 | // Thread 0 scans warp totals. 287 | if(!tid) { 288 | int scan = 0; 289 | #pragma unroll 290 | for(int i = 0; i < NumWarps; ++i) { 291 | int y = shared[i]; 292 | shared[i] = scan; 293 | scan += y; 294 | } 295 | shared[NumWarps] = scan; 296 | } 297 | __syncthreads(); 298 | 299 | #endif // __CUDA_ARCH__ >= 300 300 | 301 | // Add the warp scan back into the partials. 302 | int scan = shared[warp] + __popc(bfe(bits, 0, lane)); 303 | *total = shared[NumWarps]; 304 | __syncthreads(); 305 | return scan; 306 | } 307 | 308 | } // namespace mgpu 309 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | #include "../mgpudevice.cuh" 39 | 40 | namespace mgpu { 41 | 42 | template 44 | MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key, 45 | int shift, Comp comp) { 46 | 47 | IntT scale = (1<< shift) - 1; 48 | int mid = (int)((begin + scale * end)>> shift); 49 | 50 | T key2 = data[mid]; 51 | bool pred = (MgpuBoundsUpper == Bounds) ? 
52 | !comp(key, key2) : 53 | comp(key2, key); 54 | if(pred) begin = mid + 1; 55 | else end = mid; 56 | } 57 | 58 | template 60 | MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels, 61 | Comp comp) { 62 | 63 | int begin = 0; 64 | int end = count; 65 | 66 | if(levels >= 4 && begin < end) 67 | BinarySearchIt(data, begin, end, key, 9, comp); 68 | if(levels >= 3 && begin < end) 69 | BinarySearchIt(data, begin, end, key, 7, comp); 70 | if(levels >= 2 && begin < end) 71 | BinarySearchIt(data, begin, end, key, 5, comp); 72 | if(levels >= 1 && begin < end) 73 | BinarySearchIt(data, begin, end, key, 4, comp); 74 | 75 | while(begin < end) 76 | BinarySearchIt(data, begin, end, key, 1, comp); 77 | return begin; 78 | } 79 | 80 | template 81 | MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) { 82 | int begin = 0; 83 | int end = count; 84 | while(begin < end) 85 | BinarySearchIt(data, begin, end, key, 1, comp); 86 | return begin; 87 | } 88 | 89 | //////////////////////////////////////////////////////////////////////////////// 90 | // MergePath search 91 | 92 | template 93 | MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag, 94 | Comp comp) { 95 | 96 | typedef typename std::iterator_traits::value_type T; 97 | int begin = max(0, diag - bCount); 98 | int end = min(diag, aCount); 99 | 100 | while(begin < end) { 101 | int mid = (begin + end)>> 1; 102 | T aKey = a[mid]; 103 | T bKey = b[diag - 1 - mid]; 104 | bool pred = (MgpuBoundsUpper == Bounds) ? 105 | comp(aKey, bKey) : 106 | !comp(bKey, aKey); 107 | if(pred) begin = mid + 1; 108 | else end = mid; 109 | } 110 | return begin; 111 | } 112 | 113 | 114 | //////////////////////////////////////////////////////////////////////////////// 115 | // SegmentedMergePath search 116 | 117 | template 118 | MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount, 119 | int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) { 120 | 121 | // leftEnd and rightStart are defined from the origin, and diag is defined 122 | // from aOffset. 123 | // We only need to run a Merge Path search if the diagonal intersects the 124 | // segment that strides the left and right halves (i.e. is between leftEnd 125 | // and rightStart). 126 | if(aOffset + diag <= leftEnd) return diag; 127 | if(aOffset + diag >= rightStart) return aCount; 128 | 129 | bCount = min(bCount, rightStart - bOffset); 130 | int begin = max(max(leftEnd - aOffset, 0), diag - bCount); 131 | int end = min(diag, aCount); 132 | 133 | while(begin < end) { 134 | int mid = (begin + end)>> 1; 135 | int ai = aOffset + mid; 136 | int bi = bOffset + diag - 1 - mid; 137 | 138 | bool pred = !comp(keys[bi], keys[ai]); 139 | if(pred) begin = mid + 1; 140 | else end = mid; 141 | } 142 | return begin; 143 | } 144 | 145 | //////////////////////////////////////////////////////////////////////////////// 146 | // BalancedPath search 147 | 148 | template 150 | MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b, 151 | int bCount, int diag, int levels, Comp comp) { 152 | 153 | typedef typename std::iterator_traits::value_type T; 154 | 155 | int p = MergePath(a, aCount, b, bCount, diag, comp); 156 | int aIndex = p; 157 | int bIndex = diag - p; 158 | 159 | bool star = false; 160 | if(bIndex < bCount) { 161 | if(Duplicates) { 162 | T x = b[bIndex]; 163 | 164 | // Search for the beginning of the duplicate run in both A and B. 
165 | // Because 166 | int aStart = BiasedBinarySearch(a, aIndex, x, 167 | levels, comp); 168 | int bStart = BiasedBinarySearch(b, bIndex, x, 169 | levels, comp); 170 | 171 | // The distance between the merge path and the lower_bound is the 172 | // 'run'. We add up the a- and b- runs and evenly distribute them to 173 | // get a stairstep path. 174 | int aRun = aIndex - aStart; 175 | int bRun = bIndex - bStart; 176 | int xCount = aRun + bRun; 177 | 178 | // Attempt to advance b and regress a. 179 | int bAdvance = max(xCount>> 1, bRun); 180 | int bEnd = min(bCount, bStart + bAdvance + 1); 181 | int bRunEnd = BinarySearch(b + bIndex, 182 | bEnd - bIndex, x, comp) + bIndex; 183 | bRun = bRunEnd - bStart; 184 | 185 | bAdvance = min(bAdvance, bRun); 186 | int aAdvance = xCount - bAdvance; 187 | 188 | bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun); 189 | aIndex = aStart + aAdvance; 190 | 191 | if(roundUp) star = true; 192 | } else { 193 | if(aIndex && aCount) { 194 | T aKey = a[aIndex - 1]; 195 | T bKey = b[bIndex]; 196 | 197 | // If the last consumed element in A (aIndex - 1) is the same as 198 | // the next element in B (bIndex), we're sitting at a starred 199 | // partition. 200 | if(!comp(aKey, bKey)) star = true; 201 | } 202 | } 203 | } 204 | return make_int2(aIndex, star); 205 | } 206 | 207 | } // namespace mgpu 208 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasegreduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasegscan.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Segmented reduce utility functions. 44 | 45 | // Extract the upper-bound indices from the coded ranges. Decrement to include 46 | // the first addressed row/segment. 47 | 48 | struct SegReduceRange { 49 | int begin; 50 | int end; 51 | int total; 52 | bool flushLast; 53 | }; 54 | 55 | MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) { 56 | SegReduceRange range; 57 | range.begin = 0x7fffffff & limit0; 58 | range.end = 0x7fffffff & limit1; 59 | range.total = range.end - range.begin; 60 | range.flushLast = 0 == (0x80000000 & limit1); 61 | range.end += !range.flushLast; 62 | return range; 63 | } 64 | 65 | // Reconstitute row/segment indices from a starting row index and packed end 66 | // flags. Used for pre-processed versions of interval reduce and interval Spmv. 67 | template 68 | MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags, 69 | int rows[VT + 1]) { 70 | 71 | rows[0] = first; 72 | #pragma unroll 73 | for(int i = 0; i < VT; ++i) { 74 | if((1<< i) & endFlags) ++first; 75 | rows[i + 1] = first; 76 | } 77 | } 78 | 79 | //////////////////////////////////////////////////////////////////////////////// 80 | // After loading CSR terms into shared memory, each thread binary searches 81 | // (upper-bound) to find its starting point. Each thread then walks forward, 82 | // emitting the csr0-relative row indices to register. 83 | 84 | template 85 | MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared, 86 | int numRows, int end, int rows[VT + 1], int rowStarts[VT]) { 87 | 88 | // Each thread binary searches for its starting row. 89 | int row = BinarySearch(csr_shared, numRows, tidOffset, 90 | mgpu::less()) - 1; 91 | 92 | // Each thread starts at row and scans forward, emitting row IDs into 93 | // register. Store the CTA-local row index (starts at 0) to rows and the 94 | // start of the row (globally) to rowStarts. 95 | int curOffset = csr_shared[row]; 96 | int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 97 | 98 | rows[0] = row; 99 | rowStarts[0] = curOffset; 100 | int endFlags = 0; 101 | 102 | #pragma unroll 103 | for(int i = 1; i <= VT; ++i) { 104 | // Advance the row cursor when the iterator hits the next row offset. 105 | if(tidOffset + i == nextOffset) { 106 | // Set an end flag when the cursor advances to the next row. 107 | endFlags |= 1<< (i - 1); 108 | 109 | // Advance the cursor and load the next row offset. 110 | ++row; 111 | curOffset = nextOffset; 112 | nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 113 | } 114 | rows[i] = row; 115 | if(i < VT) rowStarts[i] = curOffset; 116 | } 117 | __syncthreads(); 118 | 119 | return endFlags; 120 | } 121 | 122 | //////////////////////////////////////////////////////////////////////////////// 123 | // DeviceSegReducePrepare 124 | // Expand non-empty interval of CSR elements into row indices. Compute end-flags 125 | // by comparing adjacent row IDs. 
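// Each thread comes away with rows[VT + 1] (the segment index of each of its
// VT elements plus one look-ahead entry) and a packed endFlags word holding
// one bit per element that marks the last element of its segment.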
126 | 127 | // DeviceSegReducePrepare may be called either by a pre-processing kernel or by 128 | // the kernel that actually evaluates the segmented reduction if no preprocesing 129 | // is desired. 130 | struct SegReduceTerms { 131 | int endFlags; 132 | int tidDelta; 133 | }; 134 | 135 | template 136 | MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows, 137 | int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) { 138 | 139 | // Pass a sentinel (end) to point to the next segment start. If we flush, 140 | // this is the end of this tile. Otherwise it is INT_MAX 141 | int endFlags = DeviceExpandCsrRows(gid + VT * tid, csr_shared, 142 | numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts); 143 | 144 | // Find the distance to to scan to compute carry-in for each thread. Use the 145 | // existance of an end flag anywhere in the thread to determine if carry-out 146 | // values from the left should propagate through to the right. 147 | int tidDelta = DeviceFindSegScanDelta(tid, rows[0] != rows[VT], 148 | csr_shared); 149 | 150 | SegReduceTerms terms = { endFlags, tidDelta }; 151 | return terms; 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | // CTASegReduce 156 | // Core segmented reduction code. Supports fast-path and slow-path for intra-CTA 157 | // segmented reduction. Stores partials to global memory. 158 | // Callers feed CTASegReduce::ReduceToGlobal values in thread order. 159 | template 160 | struct CTASegReduce { 161 | typedef CTASegScan SegScan; 162 | 163 | enum { 164 | NV = NT * VT, 165 | Capacity = HalfCapacity ? (NV / 2) : NV 166 | }; 167 | 168 | union Storage { 169 | typename SegScan::Storage segScanStorage; 170 | T values[Capacity]; 171 | }; 172 | 173 | template 174 | MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total, 175 | int tidDelta, int startRow, int block, int tid, T data[VT], 176 | DestIt dest_global, T* carryOut_global, T identity, Op op, 177 | Storage& storage) { 178 | 179 | // Run a segmented scan within the thread. 180 | T x, localScan[VT]; 181 | #pragma unroll 182 | for(int i = 0; i < VT; ++i) { 183 | x = i ? op(x, data[i]) : data[i]; 184 | localScan[i] = x; 185 | if(rows[i] != rows[i + 1]) x = identity; 186 | } 187 | 188 | // Run a parallel segmented scan over the carry-out values to compute 189 | // carry-in. 190 | T carryOut; 191 | T carryIn = SegScan::SegScanDelta(tid, tidDelta, x, 192 | storage.segScanStorage, &carryOut, identity, op); 193 | 194 | // Store the carry-out for the entire CTA to global memory. 195 | if(!tid) carryOut_global[block] = carryOut; 196 | 197 | dest_global += startRow; 198 | if(HalfCapacity && total > Capacity) { 199 | // Add carry-in to each thread-local scan value. Store directly 200 | // to global. 201 | #pragma unroll 202 | for(int i = 0; i < VT; ++i) { 203 | // Add the carry-in to the local scan. 204 | T x2 = op(carryIn, localScan[i]); 205 | 206 | // Store on the end flag and clear the carry-in. 207 | if(rows[i] != rows[i + 1]) { 208 | carryIn = identity; 209 | dest_global[rows[i]] = x2; 210 | } 211 | } 212 | } else { 213 | // All partials fit in shared memory. Add carry-in to each thread- 214 | // local scan value. 215 | #pragma unroll 216 | for(int i = 0; i < VT; ++i) { 217 | // Add the carry-in to the local scan. 218 | T x2 = op(carryIn, localScan[i]); 219 | 220 | // Store reduction when the segment changes and clear the 221 | // carry-in. 
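// Staging the partials in shared memory lets the CTA write them out with the
// coalesced, contiguous loop below instead of scattering stores from
// individual threads.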
222 | if(rows[i] != rows[i + 1]) { 223 | storage.values[rows[i]] = x2; 224 | carryIn = identity; 225 | } 226 | } 227 | __syncthreads(); 228 | 229 | // Cooperatively store reductions to global memory. 230 | for(int index = tid; index < total; index += NT) 231 | dest_global[index] = storage.values[index]; 232 | __syncthreads(); 233 | } 234 | } 235 | }; 236 | 237 | } // namespace mgpu 238 | 239 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasegscan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctascan.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // DeviceFindSegScanDelta 43 | // Runs an inclusive max-index scan over binary inputs. 
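// Given a per-thread flag (set when a segment boundary falls inside the
// thread's range of values), the return value is the distance back, in
// threads and counting this one, to the nearest flagged thread. CTASegScan
// uses this delta to limit how far carry-in values propagate from the left.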
44 | 45 | template 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) { 47 | const int NumWarps = NT / 32; 48 | 49 | int warp = tid / 32; 50 | int lane = 31 & tid; 51 | uint warpMask = 0xffffffff>> (31 - lane); // inclusive search 52 | uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search 53 | 54 | uint warpBits = __ballot(flag); 55 | delta_shared[warp] = warpBits; 56 | __syncthreads(); 57 | 58 | if(tid < NumWarps) { 59 | uint ctaBits = __ballot(0 != delta_shared[tid]); 60 | int warpSegment = 31 - clz(ctaMask & ctaBits); 61 | int start = (-1 != warpSegment) ? 62 | (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0; 63 | delta_shared[NumWarps + tid] = start; 64 | } 65 | __syncthreads(); 66 | 67 | // Find the closest flag to the left of this thread within the warp. 68 | // Include the flag for this thread. 69 | int start = 31 - clz(warpMask & warpBits); 70 | if(-1 != start) start += ~31 & tid; 71 | else start = delta_shared[NumWarps + warp]; 72 | __syncthreads(); 73 | 74 | return tid - start; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // CTASegScan 79 | 80 | template > 81 | struct CTASegScan { 82 | typedef _Op Op; 83 | typedef typename Op::result_type T; 84 | enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT }; 85 | union Storage { 86 | int delta[NumWarps]; 87 | T values[Capacity]; 88 | }; 89 | 90 | // Each thread passes the reduction of the LAST SEGMENT that it covers. 91 | // flag is set to true if there's at least one segment flag in the thread. 92 | // SegScan returns the reduction of values for the first segment in this 93 | // thread over the preceding threads. 94 | // Return the value init for the first thread. 95 | 96 | // When scanning single elements per thread, interpret the flag as a BEGIN 97 | // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not 98 | // thread tid. 99 | 100 | // The function returns the reduction of the last segment in the CTA. 101 | 102 | MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x, 103 | Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) { 104 | 105 | // Run an inclusive scan 106 | int first = 0; 107 | storage.values[first + tid] = x; 108 | __syncthreads(); 109 | 110 | #pragma unroll 111 | for(int offset = 1; offset < NT; offset += offset) { 112 | if(tidDelta >= offset) 113 | x = op(storage.values[first + tid - offset], x); 114 | first = NT - first; 115 | storage.values[first + tid] = x; 116 | __syncthreads(); 117 | } 118 | 119 | // Get the exclusive scan. 120 | x = tid ? storage.values[first + tid - 1] : identity; 121 | *carryOut = storage.values[first + NT - 1]; 122 | __syncthreads(); 123 | return x; 124 | } 125 | 126 | MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage, 127 | T* carryOut, T identity = (T)0, Op op = Op()) { 128 | 129 | // Find the left-most thread that covers the first segment of this 130 | // thread. 131 | int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta); 132 | 133 | return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op); 134 | } 135 | }; 136 | 137 | } // namespace mgpu 138 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasortedsearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpudevice.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // DeviceSerialSearch 45 | 46 | template 48 | MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin, 49 | int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices, 50 | Comp comp) { 51 | 52 | const int FlagA = IndexA ? 0x80000000 : 1; 53 | const int FlagB = IndexB ? 0x80000000 : 1; 54 | 55 | T aKey = keys_shared[aBegin]; 56 | T bKey = keys_shared[bBegin]; 57 | T aPrev, bPrev; 58 | if(aBegin > 0) aPrev = keys_shared[aBegin - 1]; 59 | if(bBegin > 0) bPrev = keys_shared[bBegin - 1]; 60 | int decisions = 0; 61 | int matchCountA = 0; 62 | int matchCountB = 0; 63 | 64 | #pragma unroll 65 | for(int i = 0; i < VT; ++i) { 66 | bool p; 67 | if(RangeCheck && aBegin >= aEnd) p = false; 68 | else if(RangeCheck && bBegin >= bEnd) p = true; 69 | else p = (MgpuBoundsUpper == Bounds) ? 70 | comp(aKey, bKey) : 71 | !comp(bKey, aKey); 72 | 73 | if(p) { 74 | // aKey is smaller than bKey, so it is inserted before bKey. 75 | // Save bKey's index (bBegin + first) as the result of the search 76 | // and advance to the next needle in A. 77 | bool match = false; 78 | if(MatchA) { 79 | // Test if there is an element in B that matches aKey. 80 | if(MgpuBoundsUpper == Bounds) { 81 | // Upper Bound: We're inserting aKey after bKey. If there 82 | // is a match for aKey it must be bPrev. 
Check that bPrev 83 | // is in range and equal to aKey. 84 | // The predicate test result !comp(aKey, bPrev) was 85 | // established on the previous A-advancing iteration (it 86 | // failed the comp(aKey, bKey) test to get us to this 87 | // point). Check the other half of the equality condition 88 | // with a second comparison. 89 | bool inRange = !RangeCheck || (bBegin > aEnd); 90 | match = inRange && !comp(bPrev, aKey); 91 | } else { 92 | // Lower Bound: We're inserting aKey before bKey. If there 93 | // is a match for aKey, it must be bKey. Check that bKey 94 | // is in range and equal to aKey. 95 | // The predicate test !comp(bKey, aKey) has established one 96 | // half of the equality condition. We establish the other 97 | // half with a second comparison. 98 | bool inRange = !RangeCheck || (bBegin < bEnd); 99 | match = inRange && !comp(aKey, bKey); 100 | } 101 | } 102 | 103 | int index = 0; 104 | if(IndexA) index = bOffset + bBegin; 105 | if(match) index |= FlagA; 106 | if(IndexA || MatchA) indices[i] = index; 107 | matchCountA += match; 108 | 109 | // Mark the decision bit to indicate that this iteration has 110 | // progressed A (the needles). 111 | decisions |= 1<< i; 112 | aPrev = aKey; 113 | aKey = keys_shared[++aBegin]; 114 | } else { 115 | // aKey is larger than bKey, so it is inserted after bKey (but we 116 | // don't know where yet). Advance the B index to the next element in 117 | // the haystack to continue the search for the current needle. 118 | bool match = false; 119 | if(MatchB) { 120 | if(MgpuBoundsUpper == Bounds) { 121 | // Upper Bound: aKey is not smaller than bKey. We advance to 122 | // the next haystack element in B. If there is a match in A 123 | // for bKey it must be aKey. By entering this branch we've 124 | // verified that !comp(aKey, bKey). Making the reciprocal 125 | // comparison !comp(bKey, aKey) establishes aKey == bKey. 126 | bool inRange = !RangeCheck || 127 | ((bBegin < bEnd) && (aBegin < aEnd)); 128 | match = inRange && !comp(bKey, aKey); 129 | } else { 130 | // Lower Bound: bKey is smaller than aKey. We advance to the 131 | // next element in B. If there is a match for bKey, it must 132 | // be aPrev. The previous A-advancing iteration proved that 133 | // !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the 134 | // other half of the equality condition. 135 | bool inRange = !RangeCheck || 136 | ((bBegin < bEnd) && (aBegin > 0)); 137 | match = inRange && !comp(aPrev, bKey); 138 | } 139 | } 140 | 141 | int index = 0; 142 | if(IndexB) index = aOffset + aBegin; 143 | if(match) index |= FlagB; 144 | if(IndexB || MatchB) indices[i] = index; 145 | matchCountB += match; 146 | 147 | // Keep the decision bit cleared to indicate that this iteration 148 | // has progressed B (the haystack). 149 | bPrev = bKey; 150 | bKey = keys_shared[++bBegin]; 151 | } 152 | } 153 | return make_int3(decisions, matchCountA, matchCountB); 154 | } 155 | 156 | //////////////////////////////////////////////////////////////////////////////// 157 | // CTASortedSearch 158 | // Take keys in shared memory and return indices and b-match flags in shared 159 | // memory. 160 | // NOTE: This function doesn't do any strided-to-thread order transposes so 161 | // using an even number of values per thread will incur no additional bank 162 | // conflicts. 
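// The return value packs the match counts: .x is the number of A (needle)
// keys that found an equal key in B, .y is the number of B (haystack) keys
// matched by some A key. The per-key insertion indices and match flags are
// left compacted in indices_shared.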
163 | 164 | template 166 | MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount, 167 | int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended, 168 | int tid, int* indices_shared, Comp comp) { 169 | 170 | // Run a merge path to find the start of the serial search for each thread. 171 | int diag = VT * tid; 172 | int mp = MergePath(keys_shared + aStart, aCount, 173 | keys_shared + bStart, bCount, diag, comp); 174 | int a0tid = mp; 175 | int b0tid = diag - mp; 176 | 177 | // Serial search into register. 178 | int3 results; 179 | int indices[VT]; 180 | if(extended) 181 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 183 | a0 - aStart, b0 - bStart, indices, comp); 184 | else 185 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 187 | a0 - aStart, b0 - bStart, indices, comp); 188 | __syncthreads(); 189 | 190 | // Compact the indices into shared memory. Use the decision bits (set is A, 191 | // cleared is B) to select the destination. 192 | int decisions = results.x; 193 | b0tid += aCount; 194 | #pragma unroll 195 | for(int i = 0; i < VT; ++i) { 196 | if((1<< i) & decisions) { 197 | if(IndexA || MatchA) indices_shared[a0tid++] = indices[i]; 198 | } else { 199 | if(IndexB || MatchB) indices_shared[b0tid++] = indices[i]; 200 | } 201 | } 202 | __syncthreads(); 203 | 204 | // Return the match counts for A and B keys. 205 | return make_int2(results.y, results.z); 206 | } 207 | 208 | } // namespace mgpu 209 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/devicetypes.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #if __CUDA_ARCH__ == 100 38 | #error "COMPUTE CAPABILITY 1.0 NOT SUPPORTED BY MPGU. TRY 2.0!" 39 | #endif 40 | 41 | #include 42 | #include "../util/static.h" 43 | 44 | #ifdef _MSC_VER 45 | #define INLINESYMBOL __forceinline__ 46 | #else 47 | #define INLINESYMBOL inline 48 | #endif 49 | 50 | namespace mgpu { 51 | 52 | #define MGPU_HOST __host__ INLINESYMBOL 53 | #define MGPU_DEVICE __device__ INLINESYMBOL 54 | #define MGPU_HOST_DEVICE __host__ __device__ INLINESYMBOL 55 | 56 | const int WARP_SIZE = 32; 57 | const int LOG_WARP_SIZE = 5; 58 | 59 | //////////////////////////////////////////////////////////////////////////////// 60 | // Device-side comparison operators 61 | 62 | template 63 | struct less : public std::binary_function { 64 | MGPU_HOST_DEVICE bool operator()(T a, T b) { return a < b; } 65 | }; 66 | template 67 | struct less_equal : public std::binary_function { 68 | MGPU_HOST_DEVICE bool operator()(T a, T b) { return a <= b; } 69 | }; 70 | template 71 | struct greater : public std::binary_function { 72 | MGPU_HOST_DEVICE bool operator()(T a, T b) { return a > b; } 73 | }; 74 | template 75 | struct greater_equal : public std::binary_function { 76 | MGPU_HOST_DEVICE bool operator()(T a, T b) { return a >= b; } 77 | }; 78 | template 79 | struct equal_to : public std::binary_function { 80 | MGPU_HOST_DEVICE bool operator()(T a, T b) { return a == b; } 81 | }; 82 | template 83 | struct not_equal_to : public std::binary_function { 84 | MGPU_HOST_DEVICE bool operator()(T a, T b) { return a != b; } 85 | }; 86 | 87 | //////////////////////////////////////////////////////////////////////////////// 88 | // Device-side arithmetic operators 89 | 90 | template 91 | struct plus : public std::binary_function { 92 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a + b; } 93 | }; 94 | 95 | template 96 | struct minus : public std::binary_function { 97 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a - b; } 98 | }; 99 | 100 | template 101 | struct multiplies : public std::binary_function { 102 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a * b; } 103 | }; 104 | 105 | template 106 | struct modulus : public std::binary_function { 107 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a % b; } 108 | }; 109 | 110 | template 111 | struct bit_or : public std::binary_function { 112 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a | b; } 113 | }; 114 | 115 | template 116 | struct bit_and : public std::binary_function { 117 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a & b; } 118 | }; 119 | 120 | template 121 | struct bit_xor : public std::binary_function { 122 | MGPU_HOST_DEVICE T operator()(T a, T b) { return a ^ b; } 123 | }; 124 | 125 | template 126 | struct maximum : public std::binary_function { 127 | MGPU_HOST_DEVICE T operator()(T a, T b) { return max(a, b); } 128 | }; 129 | 130 | template 131 | struct minimum : public std::binary_function { 132 | MGPU_HOST_DEVICE T operator()(T a, T b) { return min(a, b); } 133 | }; 134 | 135 | //////////////////////////////////////////////////////////////////////////////// 136 | 137 | template 138 | 
MGPU_HOST_DEVICE void swap(T& a, T& b) { 139 | T c = a; 140 | a = b; 141 | b = c; 142 | } 143 | 144 | template 145 | struct DevicePair { 146 | T x, y; 147 | }; 148 | 149 | template 150 | MGPU_HOST_DEVICE DevicePair MakeDevicePair(T x, T y) { 151 | DevicePair p = { x, y }; 152 | return p; 153 | } 154 | 155 | template struct numeric_limits; 156 | template<> struct numeric_limits { 157 | MGPU_HOST_DEVICE static int min() { return INT_MIN; } 158 | MGPU_HOST_DEVICE static int max() { return INT_MAX; } 159 | MGPU_HOST_DEVICE static int lowest() { return INT_MIN; } 160 | MGPU_HOST_DEVICE static int AddIdent() { return 0; } 161 | MGPU_HOST_DEVICE static int MulIdent() { return 1; } 162 | }; 163 | template<> struct numeric_limits { 164 | MGPU_HOST_DEVICE static long long min() { return LLONG_MIN; } 165 | MGPU_HOST_DEVICE static long long max() { return LLONG_MAX; } 166 | MGPU_HOST_DEVICE static long long lowest() { return LLONG_MIN; } 167 | MGPU_HOST_DEVICE static long long AddIdent() { return 0; } 168 | MGPU_HOST_DEVICE static long long MulIdent() { return 1; } 169 | }; 170 | template<> struct numeric_limits { 171 | MGPU_HOST_DEVICE static uint min() { return 0; } 172 | MGPU_HOST_DEVICE static uint max() { return UINT_MAX; } 173 | MGPU_HOST_DEVICE static uint lowest() { return 0; } 174 | MGPU_HOST_DEVICE static uint AddIdent() { return 0; } 175 | MGPU_HOST_DEVICE static uint MulIdent() { return 1; } 176 | }; 177 | template<> struct numeric_limits { 178 | MGPU_HOST_DEVICE static unsigned long long min() { return 0; } 179 | MGPU_HOST_DEVICE static unsigned long long max() { return ULLONG_MAX; } 180 | MGPU_HOST_DEVICE static unsigned long long lowest() { return 0; } 181 | MGPU_HOST_DEVICE static unsigned long long AddIdent() { return 0; } 182 | MGPU_HOST_DEVICE static unsigned long long MulIdent() { return 1; } 183 | }; 184 | template<> struct numeric_limits { 185 | MGPU_HOST_DEVICE static float min() { return FLT_MIN; } 186 | MGPU_HOST_DEVICE static float max() { return FLT_MAX; } 187 | MGPU_HOST_DEVICE static float lowest() { return -FLT_MAX; } 188 | MGPU_HOST_DEVICE static float AddIdent() { return 0; } 189 | MGPU_HOST_DEVICE static float MulIdent() { return 1; } 190 | }; 191 | template<> struct numeric_limits { 192 | MGPU_HOST_DEVICE static double min() { return DBL_MIN; } 193 | MGPU_HOST_DEVICE static double max() { return DBL_MAX; } 194 | MGPU_HOST_DEVICE static double lowest() { return -DBL_MAX; } 195 | MGPU_HOST_DEVICE static double AddIdent() { return 0; } 196 | MGPU_HOST_DEVICE static double MulIdent() { return 1; } 197 | }; 198 | 199 | 200 | MGPU_HOST_DEVICE int2 operator+(int2 a, int2 b) { 201 | return make_int2(a.x + b.x, a.y + b.y); 202 | } 203 | MGPU_HOST_DEVICE int2& operator+=(int2& a, int2 b) { 204 | a = a + b; 205 | return a; 206 | } 207 | MGPU_HOST_DEVICE int2 operator*(int2 a, int2 b) { 208 | return make_int2(a.x * b.x, a.y * b.y); 209 | } 210 | MGPU_HOST_DEVICE int2& operator*=(int2& a, int2 b) { 211 | a = a * b; 212 | return a; 213 | } 214 | 215 | template 216 | MGPU_HOST_DEVICE T max(T a, T b) { 217 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100) 218 | return std::max(a, b); 219 | #else 220 | return (a < b) ? b : a; 221 | #endif 222 | } 223 | template 224 | MGPU_HOST_DEVICE T min(T a, T b) { 225 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 100) 226 | return std::min(a, b); 227 | #else 228 | return (b < a) ? 
b : a; 229 | #endif 230 | } 231 | 232 | MGPU_HOST_DEVICE int2 max(int2 a, int2 b) { 233 | return make_int2(max(a.x, b.x), max(a.y, b.y)); 234 | } 235 | 236 | MGPU_HOST_DEVICE int2 min(int2 a, int2 b) { 237 | return make_int2(min(a.x, b.x), min(a.y, b.y)); 238 | } 239 | 240 | template<> struct numeric_limits { 241 | MGPU_HOST_DEVICE static int2 min() { return make_int2(INT_MIN, INT_MIN); } 242 | MGPU_HOST_DEVICE static int2 max() { return make_int2(INT_MAX, INT_MAX); } 243 | MGPU_HOST_DEVICE static int2 lowest() { 244 | return make_int2(INT_MIN, INT_MIN); 245 | } 246 | MGPU_HOST_DEVICE static int2 AddIdent() { return make_int2(0, 0); } 247 | MGPU_HOST_DEVICE static int2 MulIdent() { return make_int2(1, 1); } 248 | }; 249 | 250 | template 251 | class constant_iterator : public std::iterator_traits { 252 | public: 253 | MGPU_HOST_DEVICE constant_iterator(T value) : _value(value) { } 254 | 255 | MGPU_HOST_DEVICE T operator[](ptrdiff_t i) const { 256 | return _value; 257 | } 258 | MGPU_HOST_DEVICE T operator*() const { 259 | return _value; 260 | } 261 | MGPU_HOST_DEVICE constant_iterator operator+(ptrdiff_t diff) const { 262 | return constant_iterator(_value); 263 | } 264 | MGPU_HOST_DEVICE constant_iterator operator-(ptrdiff_t diff) const { 265 | return constant_iterator(_value); 266 | } 267 | MGPU_HOST_DEVICE constant_iterator& operator+=(ptrdiff_t diff) { 268 | return *this; 269 | } 270 | MGPU_HOST_DEVICE constant_iterator& operator-=(ptrdiff_t diff) { 271 | return *this; 272 | } 273 | private: 274 | T _value; 275 | }; 276 | 277 | template 278 | class counting_iterator : public std::iterator_traits { 279 | public: 280 | MGPU_HOST_DEVICE counting_iterator(T value) : _value(value) { } 281 | 282 | MGPU_HOST_DEVICE T operator[](ptrdiff_t i) { 283 | return _value + i; 284 | } 285 | MGPU_HOST_DEVICE T operator*() { 286 | return _value; 287 | } 288 | MGPU_HOST_DEVICE counting_iterator operator+(ptrdiff_t diff) { 289 | return counting_iterator(_value + diff); 290 | } 291 | MGPU_HOST_DEVICE counting_iterator operator-(ptrdiff_t diff) { 292 | return counting_iterator(_value - diff); 293 | } 294 | MGPU_HOST_DEVICE counting_iterator& operator+=(ptrdiff_t diff) { 295 | _value += diff; 296 | return *this; 297 | } 298 | MGPU_HOST_DEVICE counting_iterator& operator-=(ptrdiff_t diff) { 299 | _value -= diff; 300 | return *this; 301 | } 302 | private: 303 | T _value; 304 | }; 305 | 306 | template 307 | class step_iterator : public std::iterator_traits { 308 | public: 309 | MGPU_HOST_DEVICE step_iterator(T base, T step) : 310 | _base(base), _step(step), _offset(0) { } 311 | 312 | MGPU_HOST_DEVICE T operator[](ptrdiff_t i) { 313 | return _base + (_offset + i) * _step; 314 | } 315 | MGPU_HOST_DEVICE T operator*() { 316 | return _base + _offset * _step; 317 | } 318 | MGPU_HOST_DEVICE step_iterator operator+(ptrdiff_t diff) { 319 | step_iterator it = *this; 320 | it._offset += diff; 321 | return it; 322 | } 323 | MGPU_HOST_DEVICE step_iterator operator-(ptrdiff_t diff) { 324 | step_iterator it = *this; 325 | it._offset -= diff; 326 | return it; 327 | } 328 | MGPU_HOST_DEVICE step_iterator& operator+=(ptrdiff_t diff) { 329 | _offset += diff; 330 | return *this; 331 | } 332 | MGPU_HOST_DEVICE step_iterator& operator-=(ptrdiff_t diff) { 333 | _offset -= diff; 334 | return *this; 335 | } 336 | private: 337 | ptrdiff_t _offset; 338 | T _base, _step; 339 | }; 340 | 341 | } // namespace mgpu 342 | 343 | 344 | template 345 | MGPU_HOST_DEVICE mgpu::counting_iterator operator+(ptrdiff_t diff, 346 | mgpu::counting_iterator 
it) { 347 | return it + diff; 348 | } 349 | template 350 | MGPU_HOST_DEVICE mgpu::counting_iterator operator-(ptrdiff_t diff, 351 | mgpu::counting_iterator it) { 352 | return it + (-diff); 353 | } 354 | template 355 | MGPU_HOST_DEVICE mgpu::step_iterator operator+(ptrdiff_t diff, 356 | mgpu::step_iterator it) { 357 | return it + diff; 358 | } 359 | template 360 | MGPU_HOST_DEVICE mgpu::step_iterator operator-(ptrdiff_t diff, 361 | mgpu::step_iterator it) { 362 | return it + (-diff); 363 | } 364 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/deviceutil.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "intrinsics.cuh" 38 | 39 | namespace mgpu { 40 | 41 | // Get the difference between two pointers in bytes. 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) { 43 | return (const byte*)b - (const byte*)a; 44 | } 45 | 46 | // Offset a pointer by i bytes. 
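// (Editor's note) A minimal sketch of how these byte-based helpers are
// typically used, e.g. to carve a raw workspace buffer into typed sections.
// PtrDiff is defined above and PtrOffset is declared just below; the buffer
// layout and names here are illustrative only.
MGPU_HOST_DEVICE void ExampleCarveWorkspace(void* workspace, int count) {
	float* alphas = (float*)workspace;
	// Place a second section right after count floats, offsetting in bytes.
	float* betas = PtrOffset(alphas, sizeof(float) * count);
	// Distance from the start of the workspace to betas, in bytes.
	ptrdiff_t used = PtrDiff(workspace, betas);
	(void)used;
}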
47 | template 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) { 49 | return (const T*)((const byte*)p + i); 50 | } 51 | template 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) { 53 | return (T*)((byte*)p + i); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | // Task range support 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs. 59 | 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) { 61 | div_t d = div(numItems, numWorkers); 62 | return make_int2(d.quot, d.rem); 63 | } 64 | 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) { 66 | int2 range; 67 | range.x = task.x * block; 68 | range.x += min(block, task.y); 69 | range.y = range.x + task.x + (block < task.y); 70 | return range; 71 | } 72 | 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize, 74 | int count) { 75 | int2 range = ComputeTaskRange(block, task); 76 | range.x *= blockSize; 77 | range.y = min(count, range.y * blockSize); 78 | return range; 79 | } 80 | 81 | //////////////////////////////////////////////////////////////////////////////// 82 | // DeviceExtractHeadFlags 83 | // Input array flags is a bit array with 32 head flags per word. 84 | // ExtractThreadHeadFlags returns numBits flags starting at bit index. 85 | 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index, 87 | int numBits) { 88 | 89 | int index2 = index>> 5; 90 | int shift = 31 & index; 91 | uint headFlags = flags[index2]>> shift; 92 | int shifted = 32 - shift; 93 | 94 | if(shifted < numBits) 95 | // We also need to shift in the next set of bits. 96 | headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift); 97 | headFlags &= (1<< numBits) - 1; 98 | return headFlags; 99 | } 100 | 101 | //////////////////////////////////////////////////////////////////////////////// 102 | // DevicePackHeadFlags 103 | // Pack VT bits per thread at 32 bits/thread. Will consume an integer number of 104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads 105 | // return packed words. 106 | 107 | template 108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid, 109 | uint* flags_shared) { 110 | 111 | const int WordCount = NT * VT / 32; 112 | 113 | // Each thread stores its thread bits to flags_shared[tid]. 114 | flags_shared[tid] = threadBits; 115 | __syncthreads(); 116 | 117 | uint packed = 0; 118 | if(tid < WordCount) { 119 | const int Items = MGPU_DIV_UP(32, VT); 120 | int index = 32 * tid; 121 | int first = index / VT; 122 | int bit = 0; 123 | 124 | int rem = index - VT * first; 125 | packed = flags_shared[first]>> rem; 126 | bit = VT - rem; 127 | ++first; 128 | 129 | #pragma unroll 130 | for(int i = 0; i < Items; ++i) { 131 | if(i < Items - 1 || bit < 32) { 132 | uint x = flags_shared[first + i]; 133 | if(bit < 32) packed |= x<< bit; 134 | bit += VT; 135 | } 136 | } 137 | } 138 | __syncthreads(); 139 | 140 | return packed; 141 | } 142 | 143 | } // namespace mgpu 144 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/intrinsics.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #include "devicetypes.cuh" 36 | 37 | #pragma once 38 | 39 | #pragma GCC diagnostic push 40 | #pragma GCC diagnostic ignored "-Wstrict-aliasing" 41 | 42 | namespace mgpu { 43 | 44 | MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) { 45 | return *reinterpret_cast(&x); 46 | } 47 | MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) { 48 | return *reinterpret_cast(&x); 49 | } 50 | 51 | MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) { 52 | return *reinterpret_cast(&x); 53 | } 54 | MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) { 55 | return *reinterpret_cast(&x); 56 | } 57 | 58 | MGPU_HOST_DEVICE int2 double_as_int2(double x) { 59 | return *reinterpret_cast(&x); 60 | } 61 | MGPU_HOST_DEVICE double int2_as_double(int2 x) { 62 | return *reinterpret_cast(&x); 63 | } 64 | 65 | MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) { 66 | reinterpret_cast(&d)[0] = x; 67 | } 68 | MGPU_HOST_DEVICE int GetDoubleX(double d) { 69 | return double_as_int2(d).x; 70 | } 71 | MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) { 72 | reinterpret_cast(&d)[1] = y; 73 | } 74 | MGPU_HOST_DEVICE int GetDoubleY(double d) { 75 | return double_as_int2(d).y; 76 | } 77 | 78 | 79 | //////////////////////////////////////////////////////////////////////////////// 80 | // PTX for bfe and bfi 81 | 82 | #if __CUDA_ARCH__ >= 200 83 | 84 | MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) { 85 | uint result; 86 | asm("bfe.u32 %0, %1, %2, %3;" : 87 | "=r"(result) : "r"(x), "r"(bit), "r"(numBits)); 88 | return result; 89 | } 90 | 91 | 92 | MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) { 93 | uint result; 94 | asm("bfi.b32 %0, %1, %2, %3, %4;" : 95 | "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits)); 96 | return result; 97 | } 98 | 99 | MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) { 100 | uint ret; 101 | asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); 102 | return ret; 103 | } 104 | 105 | #endif // __CUDA_ARCH__ >= 200 106 | 107 | 108 | //////////////////////////////////////////////////////////////////////////////// 109 | // shfl_up 110 | 111 | __device__ __forceinline__ float shfl_up(float var, 112 | unsigned int delta, int width = 32) { 113 | 114 | #if __CUDA_ARCH__ >= 300 115 | var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); 116 | #endif 117 | return var; 118 | } 119 | 120 | __device__ __forceinline__ double shfl_up(double var, 121 | unsigned int delta, int width = 32) { 122 | 123 | #if __CUDA_ARCH__ >= 300 124 | int2 p = mgpu::double_as_int2(var); 125 | p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); 126 | p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); 127 | var = mgpu::int2_as_double(p); 128 | #endif 129 | 130 | return var; 131 | } 132 | 133 | //////////////////////////////////////////////////////////////////////////////// 134 | // shfl_add 135 | 136 | MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { 137 | int result = 0; 138 | #if __CUDA_ARCH__ >= 300 139 | int mask = (WARP_SIZE - width)<< 8; 140 | asm( 141 | "{.reg .s32 r0;" 142 | ".reg .pred p;" 143 | "shfl.up.sync.b32 r0|p, %1, %2, %3, %4;" 144 | "@p add.s32 r0, r0, %4;" 145 | "mov.s32 %0, r0; }" 146 | : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); 147 | #endif 148 | return result; 149 | } 150 | 151 | MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { 152 | int result = 0; 153 | #if __CUDA_ARCH__ >= 300 154 | int mask = (WARP_SIZE - width)<< 
8; 155 | asm( 156 | "{.reg .s32 r0;" 157 | ".reg .pred p;" 158 | "shfl.up.sync..b32 r0|p, %1, %2, %3, %4;" 159 | "@p max.s32 r0, r0, %4;" 160 | "mov.s32 %0, r0; }" 161 | : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); 162 | #endif 163 | return result; 164 | } 165 | 166 | //////////////////////////////////////////////////////////////////////////////// 167 | // brev, popc, clz, bfe, bfi, prmt 168 | 169 | // Reverse the bits in an integer. 170 | MGPU_HOST_DEVICE uint brev(uint x) { 171 | #if __CUDA_ARCH__ >= 200 172 | uint y = __brev(x); 173 | #else 174 | uint y = 0; 175 | for(int i = 0; i < 32; ++i) 176 | y |= (1 & (x>> i))<< (31 - i); 177 | #endif 178 | return y; 179 | } 180 | 181 | // Count number of bits in a register. 182 | MGPU_HOST_DEVICE int popc(uint x) { 183 | #if __CUDA_ARCH__ >= 200 184 | return __popc(x); 185 | #else 186 | int c; 187 | for(c = 0; x; ++c) 188 | x &= x - 1; 189 | return c; 190 | #endif 191 | } 192 | 193 | // Count leading zeros - start from most significant bit. 194 | MGPU_HOST_DEVICE int clz(int x) { 195 | #if __CUDA_ARCH__ >= 200 196 | return __clz(x); 197 | #else 198 | for(int i = 31; i >= 0; --i) 199 | if((1<< i) & x) return 31 - i; 200 | return 32; 201 | #endif 202 | } 203 | 204 | // Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. 205 | MGPU_HOST_DEVICE int ffs(int x) { 206 | #if __CUDA_ARCH__ >= 200 207 | return __ffs(x); 208 | #else 209 | for(int i = 0; i < 32; ++i) 210 | if((1<< i) & x) return i + 1; 211 | return 0; 212 | #endif 213 | } 214 | 215 | MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { 216 | #if __CUDA_ARCH__ >= 200 217 | return bfe_ptx(x, bit, numBits); 218 | #else 219 | return ((1<< numBits) - 1) & (x>> bit); 220 | #endif 221 | } 222 | 223 | MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { 224 | uint result; 225 | #if __CUDA_ARCH__ >= 200 226 | result = bfi_ptx(x, y, bit, numBits); 227 | #else 228 | if(bit + numBits > 32) numBits = 32 - bit; 229 | uint mask = ((1<< numBits) - 1)<< bit; 230 | result = y & ~mask; 231 | result |= mask & (x<< bit); 232 | #endif 233 | return result; 234 | } 235 | 236 | MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { 237 | uint result; 238 | #if __CUDA_ARCH__ >= 200 239 | result = prmt_ptx(a, b, index); 240 | #else 241 | result = 0; 242 | for(int i = 0; i < 4; ++i) { 243 | uint sel = 0xf & (index>> (4 * i)); 244 | uint x = ((7 & sel) > 3) ? b : a; 245 | x = 0xff & (x>> (8 * (3 & sel))); 246 | if(8 & sel) x = (128 & x) ? 0xff : 0; 247 | result |= x<< (8 * i); 248 | } 249 | #endif 250 | return result; 251 | } 252 | 253 | // Find log2(x) and optionally round up to the next integer logarithm. 254 | MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { 255 | int a = 31 - clz(x); 256 | if(roundUp) a += !MGPU_IS_POW_2(x); 257 | return a; 258 | } 259 | 260 | //////////////////////////////////////////////////////////////////////////////// 261 | // vset4 262 | 263 | #if __CUDA_ARCH__ >= 300 264 | 265 | // Performs four byte-wise comparisons and returns 1 for each byte that 266 | // satisfies the conditional, and zero otherwise. 
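// (Editor's note) A worked example of the byte-wise semantics described
// above, using illustrative values: vset4_eq(0x01020304, 0x01FF0304) yields
// 0x01000101, since bytes 0, 1 and 3 match and byte 2 does not; likewise
// vset4_lt_add(0x00000001, 0x00000002, 0) yields 0x00000001, because only
// byte 0 of the first operand is less than the corresponding byte of the
// second.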
267 | MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) { 268 | uint result; 269 | asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" : 270 | "=r"(result) : "r"(a), "r"(b), "r"(c)); 271 | return result; 272 | } 273 | MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) { 274 | uint result; 275 | asm("vset4.u32.u32.eq %0, %1, %2, %3;" : 276 | "=r"(result) : "r"(a), "r"(b), "r"(0)); 277 | return result; 278 | } 279 | #endif // __CUDA_ARCH__ >= 300 280 | 281 | MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) { 282 | uint result; 283 | #if __CUDA_ARCH__ >= 300 284 | result = vset4_lt_add_ptx(a, b, c); 285 | #else 286 | result = c; 287 | if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001; 288 | if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100; 289 | if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000; 290 | if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000; 291 | #endif 292 | return result; 293 | } 294 | 295 | MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) { 296 | uint result; 297 | #if __CUDA_ARCH__ >= 300 298 | result = vset4_eq_ptx(a, b); 299 | #else 300 | result = 0; 301 | if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001; 302 | if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100; 303 | if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000; 304 | if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000; 305 | #endif 306 | return result; 307 | } 308 | 309 | //////////////////////////////////////////////////////////////////////////////// 310 | // 311 | 312 | MGPU_HOST_DEVICE uint umulhi(uint x, uint y) { 313 | #if __CUDA_ARCH__ >= 100 314 | return __umulhi(x, y); 315 | #else 316 | uint64 product = (uint64)x * y; 317 | return (uint)(product>> 32); 318 | #endif 319 | } 320 | 321 | //////////////////////////////////////////////////////////////////////////////// 322 | // ldg() function defined for all devices and all types. Only compiles to __ldg 323 | // intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported 324 | // by __ldg in sm_32_intrinsics.h 325 | 326 | template 327 | struct IsLdgType { 328 | enum { value = false }; 329 | }; 330 | #define DEFINE_LDG_TYPE(T) \ 331 | template<> struct IsLdgType { enum { value = true }; }; 332 | 333 | template::value> 334 | struct LdgShim { 335 | MGPU_DEVICE static T Ldg(const T* p) { 336 | return *p; 337 | } 338 | }; 339 | 340 | #if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 341 | 342 | // List of __ldg-compatible types from sm_32_intrinsics.h. 
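// (Editor's note) Each DEFINE_LDG_TYPE(T) below specializes IsLdgType<T> so
// that ldg(), defined after this list, routes loads of T through the
// read-only data cache via __ldg; types not listed fall back to a plain
// dereference through the primary LdgShim template above.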
343 | DEFINE_LDG_TYPE(char) 344 | DEFINE_LDG_TYPE(short) 345 | DEFINE_LDG_TYPE(int) 346 | DEFINE_LDG_TYPE(long long) 347 | DEFINE_LDG_TYPE(char2) 348 | DEFINE_LDG_TYPE(char4) 349 | DEFINE_LDG_TYPE(short2) 350 | DEFINE_LDG_TYPE(short4) 351 | DEFINE_LDG_TYPE(int2) 352 | DEFINE_LDG_TYPE(int4) 353 | DEFINE_LDG_TYPE(longlong2) 354 | 355 | DEFINE_LDG_TYPE(unsigned char) 356 | DEFINE_LDG_TYPE(unsigned short) 357 | DEFINE_LDG_TYPE(unsigned int) 358 | DEFINE_LDG_TYPE(unsigned long long) 359 | DEFINE_LDG_TYPE(uchar2) 360 | DEFINE_LDG_TYPE(uchar4) 361 | DEFINE_LDG_TYPE(ushort2) 362 | DEFINE_LDG_TYPE(ushort4) 363 | DEFINE_LDG_TYPE(uint2) 364 | DEFINE_LDG_TYPE(uint4) 365 | DEFINE_LDG_TYPE(ulonglong2) 366 | 367 | DEFINE_LDG_TYPE(float) 368 | DEFINE_LDG_TYPE(double) 369 | DEFINE_LDG_TYPE(float2) 370 | DEFINE_LDG_TYPE(float4) 371 | DEFINE_LDG_TYPE(double2) 372 | 373 | template struct LdgShim { 374 | MGPU_DEVICE static T Ldg(const T* p) { 375 | return __ldg(p); 376 | } 377 | }; 378 | #endif 379 | 380 | template 381 | MGPU_DEVICE T ldg(const T* p) { 382 | return LdgShim::Ldg(p); 383 | } 384 | 385 | //////////////////////////////////////////////////////////////////////////////// 386 | 387 | // Fast division for 31-bit integers. 388 | // Uses the method in Hacker's Delight (2nd edition) page 228. 389 | // Evaluates for denom > 1 and x < 2^31. 390 | struct FastDivide { 391 | uint denom; 392 | uint coef; 393 | uint shift; 394 | 395 | MGPU_HOST_DEVICE uint Divide(uint x) { 396 | return umulhi(x, coef)>> shift; 397 | } 398 | MGPU_HOST_DEVICE uint Modulus(uint x) { 399 | return x - Divide(x) * denom; 400 | } 401 | 402 | explicit FastDivide(uint denom_) { 403 | denom = denom_; 404 | uint p = 31 + FindLog2(denom, true); 405 | coef = (uint)(((1ull<< p) + denom - 1) / denom); 406 | shift = p - 32; 407 | } 408 | }; 409 | 410 | #pragma GCC diagnostic pop 411 | 412 | } // namespace mgpu 413 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/serialsets.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // SerialSetIntersection 43 | // Emit A if A and B are in range and equal. 44 | 45 | template 46 | MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd, 47 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 48 | 49 | const int MinIterations = VT / 2; 50 | int commit = 0; 51 | 52 | #pragma unroll 53 | for(int i = 0; i < VT; ++i) { 54 | bool test = RangeCheck ? 55 | ((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) : 56 | (i < MinIterations || (aBegin + bBegin < end)); 57 | 58 | if(test) { 59 | T aKey = data[aBegin]; 60 | T bKey = data[bBegin]; 61 | 62 | bool pA = comp(aKey, bKey); 63 | bool pB = comp(bKey, aKey); 64 | 65 | // The outputs must come from A by definition of set interection. 66 | results[i] = aKey; 67 | indices[i] = aBegin; 68 | 69 | if(!pB) ++aBegin; 70 | if(!pA) ++bBegin; 71 | if(pA == pB) commit |= 1<< i; 72 | } 73 | } 74 | return commit; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // SerialSetUnion 79 | // Emit A if A <= B. Emit B if B < A. 80 | 81 | template 82 | MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd, 83 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 84 | 85 | const int MinIterations = VT / 2; 86 | int commit = 0; 87 | 88 | #pragma unroll 89 | for(int i = 0; i < VT; ++i) { 90 | bool test = RangeCheck ? 91 | (aBegin + bBegin < end) : 92 | (i < MinIterations || (aBegin + bBegin < end)); 93 | 94 | if(test) { 95 | T aKey = data[aBegin]; 96 | T bKey = data[bBegin]; 97 | 98 | bool pA = false, pB = false; 99 | if(RangeCheck && aBegin >= aEnd) 100 | pB = true; 101 | else if(RangeCheck && bBegin >= bEnd) 102 | pA = true; 103 | else { 104 | // Both are in range. 105 | pA = comp(aKey, bKey); 106 | pB = comp(bKey, aKey); 107 | } 108 | 109 | // Output A in case of a tie, so check if b < a. 110 | results[i] = pB ? bKey : aKey; 111 | indices[i] = pB ? bBegin : aBegin; 112 | if(!pB) ++aBegin; 113 | if(!pA) ++bBegin; 114 | commit |= 1<< i; 115 | } 116 | } 117 | return commit; 118 | } 119 | 120 | //////////////////////////////////////////////////////////////////////////////// 121 | // SerialSetDifference 122 | // Emit A if A < B. 
123 | 124 | template 125 | MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd, 126 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 127 | 128 | const int MinIterations = VT / 2; 129 | int commit = 0; 130 | 131 | #pragma unroll 132 | for(int i = 0; i < VT; ++i) { 133 | bool test = RangeCheck ? 134 | (aBegin + bBegin < end) : 135 | (i < MinIterations || (aBegin + bBegin < end)); 136 | if(test) { 137 | T aKey = data[aBegin]; 138 | T bKey = data[bBegin]; 139 | 140 | bool pA = false, pB = false; 141 | if(RangeCheck && aBegin >= aEnd) 142 | pB = true; 143 | else if(RangeCheck && bBegin >= bEnd) 144 | pA = true; 145 | else { 146 | pA = comp(aKey, bKey); 147 | pB = comp(bKey, aKey); 148 | } 149 | 150 | // The outputs must come from A by definition of set difference. 151 | results[i] = aKey; 152 | indices[i] = aBegin; 153 | if(!pB) ++aBegin; 154 | if(!pA) ++bBegin; 155 | if(pA) commit |= 1<< i; 156 | } 157 | } 158 | return commit; 159 | } 160 | 161 | //////////////////////////////////////////////////////////////////////////////// 162 | // SerialSetSymDiff 163 | // Emit A if A < B and emit B if B < A. 164 | 165 | template 166 | MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd, 167 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 168 | 169 | const int MinIterations = VT / 2; 170 | int commit = 0; 171 | 172 | #pragma unroll 173 | for(int i = 0; i < VT; ++i) { 174 | bool test = RangeCheck ? 175 | (aBegin + bBegin < end) : 176 | (i < MinIterations || (aBegin + bBegin < end)); 177 | if(test) { 178 | T aKey = data[aBegin]; 179 | T bKey = data[bBegin]; 180 | 181 | bool pA = false, pB = false; 182 | if(RangeCheck && (bBegin >= bEnd)) 183 | pA = true; 184 | else if(RangeCheck && (aBegin >= aEnd)) 185 | pB = true; 186 | else { 187 | pA = comp(aKey, bKey); 188 | pB = comp(bKey, aKey); 189 | } 190 | 191 | results[i] = pA ? aKey : bKey; 192 | indices[i] = pA ? aBegin : bBegin; 193 | if(!pA) ++bBegin; 194 | if(!pB) ++aBegin; 195 | if(pA != pB) commit |= 1<< i; 196 | } 197 | } 198 | return commit; 199 | } 200 | 201 | //////////////////////////////////////////////////////////////////////////////// 202 | // SerialSetOp 203 | // Uses the MgpuSetOp enum to statically select one of the four serial ops 204 | // above. 
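// (Editor's note) Before the dispatcher below, a sketch of how the commit
// bitmask returned by these serial set routines is typically consumed: bit i
// is set when results[i]/indices[i] holds a real output, so popc (from
// intrinsics.cuh) gives the number of emitted elements. The <VT, true>
// template arguments are written out on the assumption that VT and the
// RangeCheck flag are the two leading template parameters, matching the call
// made inside SerialSetOp; the function name is illustrative only.
template<int VT, typename T, typename Comp>
MGPU_DEVICE int ExampleCountIntersection(const T* data, int aBegin, int aEnd,
	int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) {

	int commit = SerialSetIntersection<VT, true>(data, aBegin, aEnd,
		bBegin, bEnd, end, results, indices, comp);

	// Count the set bits to find how many of the VT slots were committed.
	return popc((uint)commit);
}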
205 | 206 | template 207 | MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd, 208 | int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) { 209 | 210 | int end = aBegin + bBegin + VT - star; 211 | if(RangeCheck) end = min(end, aEnd + bEnd); 212 | int commit; 213 | switch(Op) { 214 | case MgpuSetOpIntersection: 215 | commit = SerialSetIntersection(data, aBegin, 216 | aEnd, bBegin, bEnd, end, results, indices, comp); 217 | break; 218 | case MgpuSetOpUnion: 219 | commit = SerialSetUnion(data, aBegin, aEnd, 220 | bBegin, bEnd, end, results, indices, comp); 221 | break; 222 | case MgpuSetOpDiff: 223 | commit = SerialSetDifference(data, aBegin, aEnd, 224 | bBegin, bEnd, end, results, indices, comp); 225 | break; 226 | case MgpuSetOpSymDiff: 227 | commit = SerialSetSymDiff(data, aBegin, aEnd, 228 | bBegin, bEnd, end, results, indices, comp); 229 | break; 230 | } 231 | __syncthreads(); 232 | return commit; 233 | } 234 | 235 | } // namespace mgpu 236 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/sortnetwork.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Odd-even transposition sorting network. 
Sorts keys and values in-place in 43 | // register. 44 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 45 | 46 | // CUDA Compiler does not currently unroll these loops correctly. Write using 47 | // template loop unrolling. 48 | /* 49 | template 50 | MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) { 51 | #pragma unroll 52 | for(int level = 0; level < VT; ++level) { 53 | 54 | #pragma unroll 55 | for(int i = 1 & level; i < VT - 1; i += 2) { 56 | if(comp(keys[i + 1], keys[i])) { 57 | mgpu::swap(keys[i], keys[i + 1]); 58 | mgpu::swap(values[i], values[i + 1]); 59 | } 60 | } 61 | } 62 | }*/ 63 | 64 | template 65 | struct OddEvenTransposeSortT { 66 | // Sort segments marked by head flags. If the head flag between i and i + 1 67 | // is set (so that (2<< i) & flags is true), the values belong to different 68 | // segments and are not swapped. 69 | template 70 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { 71 | #pragma unroll 72 | for(int i = 1 & I; i < VT - 1; i += 2) 73 | if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) { 74 | mgpu::swap(keys[i], keys[i + 1]); 75 | mgpu::swap(values[i], values[i + 1]); 76 | } 77 | OddEvenTransposeSortT::Sort(keys, values, flags, comp); 78 | } 79 | }; 80 | template struct OddEvenTransposeSortT { 81 | template 82 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { } 83 | }; 84 | 85 | template 86 | MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) { 87 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp); 88 | } 89 | template 90 | MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags, 91 | Comp comp) { 92 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp); 93 | } 94 | 95 | //////////////////////////////////////////////////////////////////////////////// 96 | // Batcher Odd-Even Mergesort network 97 | // Unstable but executes much faster than the transposition sort. 98 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 99 | 100 | template 101 | struct OddEvenMergesortT { 102 | template 103 | MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags, 104 | int a, int b, Comp comp) { 105 | if(b < Count) { 106 | // Mask the bits between a and b. Any head flags in this interval 107 | // means the keys are in different segments and must not be swapped. 
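// (Editor's note) For example, a = 2 and b = 4 give
// Mask = ((2<<4)-1) ^ ((2<<2)-1) = 0b11111 ^ 0b00111 = 0b11000,
// i.e. the head-flag bits for positions 3 and 4, so any segment
// boundary between the two keys blocks the swap.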
108 | const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1); 109 | if(!(Mask & flags) && comp(keys[b], keys[a])) { 110 | mgpu::swap(keys[b], keys[a]); 111 | mgpu::swap(values[b], values[a]); 112 | } 113 | } 114 | } 115 | 116 | template 117 | struct OddEvenMerge { 118 | template 119 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 120 | Comp comp) { 121 | // Compare and swap 122 | const int M = 2 * R; 123 | OddEvenMerge::Merge(keys, values, flags, comp); 124 | OddEvenMerge::Merge(keys, values, flags, comp); 125 | 126 | #pragma unroll 127 | for(int i = Low2 + R; i + R < Low2 + Width; i += M) 128 | CompareAndSwap(keys, values, flags, i, i + R, comp); 129 | } 130 | }; 131 | template 132 | struct OddEvenMerge { 133 | template 134 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 135 | Comp comp) { 136 | CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp); 137 | } 138 | }; 139 | 140 | template 141 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 142 | Comp comp) { 143 | 144 | const int M = Width / 2; 145 | OddEvenMergesortT::Sort(keys, values, flags, comp); 146 | OddEvenMergesortT::Sort(keys, values, flags, comp); 147 | OddEvenMerge<1, Low>::Merge(keys, values, flags, comp); 148 | } 149 | }; 150 | template struct OddEvenMergesortT<1, Low, Count> { 151 | template 152 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 153 | Comp comp) { } 154 | }; 155 | 156 | template 157 | MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) { 158 | const int Width = 1<< sLogPow2::value; 159 | OddEvenMergesortT::Sort(keys, values, 0, comp); 160 | } 161 | template 162 | MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags, 163 | Comp comp) { 164 | const int Width = 1<< sLogPow2::value; 165 | OddEvenMergesortT::Sort(keys, values, flags, comp); 166 | } 167 | 168 | } // namespace mgpu 169 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/mgpudevice.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "mgpuenums.h" 38 | #include "device/deviceutil.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // device/loadstore.cuh 44 | 45 | // For 0 <= i < VT: 46 | // index = NT * i + tid; 47 | // reg[i] = data[index]; 48 | // Synchronize after load. 49 | template 50 | MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg, 51 | bool sync = true); 52 | 53 | // For 0 <= i < VT: 54 | // index = NT * i + tid; 55 | // if(index < count) reg[i] = data[index]; 56 | // No synchronize after load. 57 | template 58 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 59 | T* reg, bool sync = false); 60 | 61 | template 62 | MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid, 63 | T* reg, T init, bool sync = false); 64 | 65 | // For 0 <= i < VT: 66 | // index = NT * i + tid; 67 | // if(index < count) reg[i] = data[index]; 68 | // No synchronize after load. 69 | template 70 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 71 | T* reg, bool sync = false); 72 | 73 | // For 0 <= i < VT: 74 | // index = NT * i + tid; 75 | // if(index < count) reg[i] = data[index]; 76 | // No synchronize after load. 77 | template 78 | MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid, 79 | T* reg, T init, bool sync = false); 80 | 81 | // For 0 <= i < VT: 82 | // index = NT * i + tid; 83 | // if(index < count) reg[i] = data[index]; 84 | // No synchronize after load. 85 | // No optimized code path for count < NV (smaller generated code). 86 | template 87 | MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid, 88 | T* reg, bool sync = false); 89 | 90 | 91 | // For 0 <= i < VT: 92 | // index = VT * tid + i. 93 | // if(index < count) reg[i] = data[index]; 94 | // No synchronize after load. 95 | template 96 | MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid, 97 | T* reg); 98 | 99 | template 100 | MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid, 101 | T* reg, T init); 102 | 103 | // For 0 <= i < VT: 104 | // index = NT * i + tid; 105 | // if(index < count) data[index] = reg[i]; 106 | // Synchronize after load. 107 | template 108 | MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest, 109 | bool sync = true); 110 | 111 | // For 0 <= i < VT: 112 | // index = NT * i + tid; 113 | // if(index < count) data[index] = reg[i]; 114 | // No synchronize after load. 
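// (Editor's note) A minimal sketch of the strided-order idiom these
// declarations describe, using only helpers declared above (a full kernel
// would then write the tile back out with the store functions declared
// below). The function name, the doubling step and the explicit <NT, VT>
// template arguments are illustrative assumptions about the usual convention
// of passing the CTA shape as the leading template parameters.
template<int NT, int VT>
MGPU_DEVICE void ExampleStridedTile(int count, const int* input_global,
	int tid, int* shared) {

	// reg[i] = input_global[NT * i + tid] for in-range indices; out-of-range
	// slots are filled with 0 so the transform below never reads garbage.
	int reg[VT];
	DeviceGlobalToRegDefault<NT, VT>(count, input_global, tid, reg, 0);

	#pragma unroll
	for(int i = 0; i < VT; ++i)
		reg[i] *= 2;

	// Publish the transformed values to shared memory (synchronizes).
	DeviceRegToShared<NT, VT>(reg, tid, shared);
}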
115 | template 116 | MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid, 117 | OutputIt dest, bool sync = false); 118 | 119 | // For 0 <= index < count: 120 | // dest[index] = source[index]; 121 | // This function is intended to replace DeviceGlobalToShared in cases where 122 | // count is much less than NT * VT. 123 | template 124 | MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid, 125 | OutputIt dest, bool sync = true); 126 | 127 | // For 0 <= index < count: 128 | // dest[index] = source[index]; 129 | // Synchronize after store. 130 | template 131 | MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid, 132 | OutputIt dest, bool sync = true); 133 | 134 | // For 0 <= index < count: 135 | // dest[index] = source[index]; 136 | // Synchronize after store. 137 | template 138 | MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid, 139 | T* dest, bool sync = true); 140 | 141 | template 142 | MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid, 143 | T* dest, bool sync = true); 144 | 145 | // For 0 <= index < count: 146 | // dest[index] = source[index]; 147 | // Synchronize after store. 148 | // No optimized code path for count < NV (smaller generated code). 149 | template 150 | MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid, 151 | T* dest, bool sync = true); 152 | 153 | template 154 | MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid, 155 | T* dest, T init, bool sync = true); 156 | 157 | template 158 | MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source, 159 | int tid, T* dest, T init, bool sync = true); 160 | 161 | // For 0 <= index < count: 162 | // dest[index] = source[index]; 163 | // No synchronize. 164 | template 165 | MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid, 166 | OutputIt dest, bool sync = false); 167 | 168 | // Transponse VT elements in NT threads (x) into thread-order registers (y) 169 | // using only NT * VT / 2 elements of shared memory. 170 | template 171 | MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y); 172 | 173 | // For 0 <= i < VT: 174 | // index = NT * i + tid; 175 | // if(index < count) 176 | // gather = indices[index]; 177 | // reg[i] = data[gather]; 178 | // Synchronize after load. 179 | template 180 | MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT], 181 | int tid, T* reg, bool sync = true); 182 | 183 | template 184 | MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT], 185 | int tid, T* reg, T identity, bool sync = true); 186 | 187 | // For 0 <= i < VT: 188 | // index = NT * i + tid; 189 | // if(index < count) 190 | // scatter = indices[index]; 191 | // data[scatter] = reg[i]; 192 | // Synchronize after store. 193 | template 194 | MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid, 195 | int indices[VT], OutputIt data, bool sync = true); 196 | 197 | // For 0 <= i < VT: 198 | // shared[VT * tid + i] = threadReg[i]; 199 | // Synchronize after store. 200 | // Note this function moves data in THREAD ORDER. 201 | // (DeviceRegToShared moves data in STRIDED ORDER). 202 | template 203 | MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared, 204 | bool sync = true); 205 | 206 | // For 0 <= i < VT: 207 | // threadReg[i] = shared[VT * tid + i]; 208 | // Synchronize after load. 209 | // Note this function moves data in THREAD ORDER. 
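// (Editor's note) Concretely, with NT = 4 and VT = 2, thread order gives
// thread 0 the elements {0, 1} (shared[VT * tid + i]), while strided order
// gives it {0, 4} (shared[NT * i + tid]).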
210 | // (DeviceSharedToReg moves data in STRIDED ORDER). 211 | template 212 | MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg, 213 | bool sync = true); 214 | 215 | // For 0 <= index < aCount: 216 | // shared[index] = a_global[index]; 217 | // For 0 <= index < bCount: 218 | // shared[aCount + index] = b_global[index]; 219 | // VT0 is the lower-bound for predication-free execution: 220 | // If count >= NT * VT0, a predication-free branch is taken. 221 | // VT1 is the upper-bound for loads: 222 | // NT * VT1 must >= aCount + bCount. 223 | 224 | template 225 | MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount, 226 | const T* b_global, int bCount, int tid, T* reg, bool sync = false); 227 | 228 | template 229 | MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount, 230 | const T* b_global, int bCount, int tid, T* shared, bool sync = true); 231 | 232 | template 234 | MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount, 235 | InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false); 236 | 237 | template 239 | MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount, 240 | InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true); 241 | 242 | // For 0 <= i < VT 243 | // index = NT * i + tid; 244 | // if(index < count) 245 | // gather = indices_shared[index]; 246 | // dest_global[index] = data_global[gather]; 247 | // Synchronize after load. 248 | template 249 | MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global, 250 | const int* indices_shared, int tid, OutputIt dest_global, 251 | bool sync = true); 252 | 253 | // For 0 <= i < VT 254 | // index = NT * i + tid 255 | // if(index < count) 256 | // gather = indices[index]; 257 | // if(gather < aCount) data = a_global[gather]; 258 | // else data = b_global[gather - aCount]; 259 | // dest_global[index] = data; 260 | // Synchronize after load. 261 | template 263 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global, 264 | InputIt2 b_global, int bStart, const int* indices, int tid, 265 | T* reg, bool sync = false); 266 | 267 | template 269 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global, 270 | InputIt2 b_global, int bStart, const int* indices_shared, int tid, 271 | OutputIt dest_global, bool sync = true); 272 | 273 | template 274 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global, 275 | const T* b_global, int bStart, const int* indices, int tid, 276 | T* reg, bool sync = false); 277 | 278 | template 279 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global, 280 | const T* b_global, int bStart, const int* indices_shared, int tid, 281 | OutputIt dest_global, bool sync = true); 282 | 283 | 284 | 285 | } // namespace mgpu 286 | 287 | 288 | #include "device/loadstore.cuh" 289 | #include "device/ctasegscan.cuh" 290 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/mgpuenums.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | namespace mgpu { 38 | 39 | enum MgpuBounds { 40 | MgpuBoundsLower, 41 | MgpuBoundsUpper 42 | }; 43 | 44 | enum MgpuScanType { 45 | MgpuScanTypeExc, 46 | MgpuScanTypeInc 47 | }; 48 | 49 | enum MgpuSearchType { 50 | MgpuSearchTypeNone, 51 | MgpuSearchTypeIndex, 52 | MgpuSearchTypeMatch, 53 | MgpuSearchTypeIndexMatch 54 | }; 55 | 56 | enum MgpuJoinKind { 57 | MgpuJoinKindInner, 58 | MgpuJoinKindLeft, 59 | MgpuJoinKindRight, 60 | MgpuJoinKindOuter 61 | }; 62 | 63 | enum MgpuSetOp { 64 | MgpuSetOpIntersection, 65 | MgpuSetOpUnion, 66 | MgpuSetOpDiff, 67 | MgpuSetOpSymDiff 68 | }; 69 | 70 | } // namespace mgpu 71 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/util/static.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifndef MGPU_MIN 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y)) 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y)) 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0) 55 | #define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x)) 56 | 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y)) 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y)) 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y) 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1)) 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1)) 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1))) 64 | 65 | #endif // MGPU_MIN 66 | 67 | namespace mgpu { 68 | 69 | 70 | typedef unsigned char byte; 71 | 72 | typedef unsigned int uint; 73 | typedef signed short int16; 74 | 75 | typedef unsigned short ushort; 76 | typedef unsigned short uint16; 77 | 78 | typedef long long int64; 79 | typedef unsigned long long uint64; 80 | 81 | // IsPow2::value is true if X is a power of 2. 82 | template struct sIsPow2 { 83 | enum { value = 0 == (X & (X - 1)) }; 84 | }; 85 | 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2. 87 | template struct sLogPow2 { 88 | enum { extra = sIsPow2::value ? 0 : (roundUp ? 
1 : 0) }; 89 | enum { inner = sLogPow2::inner + 1 }; 90 | enum { value = inner + extra }; 91 | }; 92 | template struct sLogPow2<0, roundUp> { 93 | enum { inner = 0 }; 94 | enum { value = 0 }; 95 | }; 96 | template struct sLogPow2<1, roundUp> { 97 | enum { inner = 0 }; 98 | enum { value = 0 }; 99 | }; 100 | 101 | template 102 | struct sDivUp { 103 | enum { value = (X + Y - 1) / Y }; 104 | }; 105 | 106 | template struct sDiv2RoundUp { 107 | enum { value = sDiv2RoundUp::value, levels - 1>::value }; 108 | }; 109 | template struct sDiv2RoundUp { 110 | enum { value = count }; 111 | }; 112 | 113 | template 114 | struct sDivSafe { 115 | enum { value = X / Y }; 116 | }; 117 | template 118 | struct sDivSafe { 119 | enum { value = 0 }; 120 | }; 121 | 122 | template 123 | struct sRoundUp { 124 | enum { rem = X % Y }; 125 | enum { value = X + (rem ? (Y - rem) : 0) }; 126 | }; 127 | 128 | template 129 | struct sRoundDown { 130 | enum { rem = X % Y }; 131 | enum { value = X - rem }; 132 | }; 133 | 134 | // IntegerDiv is a template for avoiding divisions by zero in template 135 | // evaluation. Templates always evaluate both b and c in an expression like 136 | // a ? b : c, and will error if either rhs contains an illegal expression, 137 | // even if the ternary is explictly designed to guard against that. 138 | template 139 | struct sIntegerDiv { 140 | enum { value = X / (Y ? Y : (X + 1)) }; 141 | }; 142 | 143 | template 144 | struct sMax { 145 | enum { value = (X >= Y) ? X : Y }; 146 | }; 147 | template 148 | struct sMin { 149 | enum { value = (X <= Y) ? X : Y }; 150 | }; 151 | 152 | template 153 | struct sAbs { 154 | enum { value = (X >= 0) ? X : -X }; 155 | }; 156 | 157 | 158 | // Finds the number of powers of 2 in the prime factorization of X. 159 | template struct sNumFactorsOf2 { 160 | enum { shifted = X >> 1 }; 161 | enum { value = 1 + sNumFactorsOf2::value }; 162 | }; 163 | template struct sNumFactorsOf2 { 164 | enum { value = 0 }; 165 | }; 166 | 167 | // Returns the divisor for a conflict-free transpose. 168 | template struct sBankConflictDivisor { 169 | enum { value = 170 | (1 & X) ? 0 : 171 | (sIsPow2::value ? NumBanks : 172 | (1<< sNumFactorsOf2::value)) }; 173 | enum { log_value = sLogPow2::value }; 174 | }; 175 | 176 | template struct sConflictFreeStorage { 177 | enum { count = NT * X }; 178 | enum { divisor = sBankConflictDivisor::value }; 179 | enum { padding = sDivSafe::value }; 180 | enum { value = count + padding }; 181 | }; 182 | 183 | } // namespace mgpu 184 | -------------------------------------------------------------------------------- /include/ctc.h: -------------------------------------------------------------------------------- 1 | /** \file ctc.h 2 | * Contains a simple C interface to call fast CPU and GPU based computation 3 | * of the CTC loss. 
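 *
 * (Editor's note) A minimal, hypothetical CPU calling sequence, sketched from
 * the declarations below; the activation, label and length buffers are assumed
 * to have been filled in by the caller and are illustrative only:
 *
 *     ctcOptions options{};
 *     options.loc = CTC_CPU;
 *     options.num_threads = 1;
 *     options.blank_label = 0;
 *
 *     size_t workspace_bytes;
 *     get_workspace_size(label_lengths, input_lengths,
 *                        alphabet_size, minibatch, options, &workspace_bytes);
 *
 *     void* workspace = malloc(workspace_bytes);
 *     ctcStatus_t status = compute_ctc_loss(activations, gradients,
 *         flat_labels, label_lengths, input_lengths, alphabet_size,
 *         minibatch, costs, workspace, options);
 *     free(workspace);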
4 | */ 5 | 6 | #pragma once 7 | 8 | #ifdef __cplusplus 9 | #include 10 | extern "C" { 11 | #endif 12 | 13 | //forward declare of CUDA typedef to avoid needing to pull in CUDA headers 14 | typedef struct CUstream_st* CUstream; 15 | 16 | typedef enum { 17 | CTC_STATUS_SUCCESS = 0, 18 | CTC_STATUS_MEMOPS_FAILED = 1, 19 | CTC_STATUS_INVALID_VALUE = 2, 20 | CTC_STATUS_EXECUTION_FAILED = 3, 21 | CTC_STATUS_UNKNOWN_ERROR = 4 22 | } ctcStatus_t; 23 | 24 | /** Returns a single integer which specifies the API version of the warpctc library */ 25 | int get_warpctc_version(); 26 | 27 | /** Returns a string containing a description of status that was passed in 28 | * \param[in] status identifies which string should be returned 29 | * \return C style string containing the text description 30 | * */ 31 | const char* ctcGetStatusString(ctcStatus_t status); 32 | 33 | typedef enum { 34 | CTC_CPU = 0, 35 | CTC_GPU = 1 36 | } ctcComputeLocation; 37 | 38 | /** Structure used for options to the CTC compution. Applications 39 | * should zero out the array using memset and sizeof(struct 40 | * ctcOptions) in C or default initialization (e.g. 'ctcOptions 41 | * options{};' or 'auto options = ctcOptions{}') in C++ to ensure 42 | * forward compatibility with added options. */ 43 | struct ctcOptions { 44 | /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU} 45 | ctcComputeLocation loc; 46 | union { 47 | /// used when loc == CTC_CPU, the maximum number of threads that can be used 48 | unsigned int num_threads; 49 | 50 | /// used when loc == CTC_GPU, which stream the kernels should be launched in 51 | CUstream stream; 52 | }; 53 | 54 | /// the label value/index that the CTC calculation should use as the blank label 55 | int blank_label; 56 | }; 57 | 58 | /** Compute the connectionist temporal classification loss between a sequence 59 | * of probabilities and a ground truth labeling. Optionally compute the 60 | * gradient with respect to the inputs. 61 | * \param [in] activations pointer to the activations in either CPU or GPU 62 | * addressable memory, depending on info. We assume a fixed 63 | * memory layout for this 3 dimensional tensor, which has dimension 64 | * (t, n, p), where t is the time index, n is the minibatch index, 65 | * and p indexes over probabilities of each symbol in the alphabet. 66 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 67 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 68 | * changing index, aka column-major). We also assume strides are equal to 69 | * dimensions - there is no padding between dimensions. 70 | * More precisely, element (t, n, p), for a problem with mini_batch examples 71 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 72 | * activations[(t * mini_batch + n) * alphabet_size + p] 73 | * \param [out] gradients if not NULL, then gradients are computed. Should be 74 | * allocated in the same memory space as probs and memory 75 | * ordering is identical. 76 | * \param [in] flat_labels Always in CPU memory. A concatenation 77 | * of all the labels for the minibatch. 78 | * \param [in] label_lengths Always in CPU memory. The length of each label 79 | * for each example in the minibatch. 80 | * \param [in] input_lengths Always in CPU memory. The number of time steps 81 | * for each sequence in the minibatch. 82 | * \param [in] alphabet_size The number of possible output symbols. There 83 | * should be this many probabilities for each time step. 
84 | * \param [in] mini_batch How many examples in a minibatch. 85 | * \param [out] costs Always in CPU memory. The cost of each example in the 86 | * minibatch. 87 | * \param [in,out] workspace In same memory space as probs. Should be of 88 | * size requested by get_workspace_size. 89 | * \param [in] options see struct ctcOptions 90 | * 91 | * \return Status information 92 | * 93 | * */ 94 | ctcStatus_t compute_ctc_loss(const float* const activations, 95 | float* gradients, 96 | const int* const flat_labels, 97 | const int* const label_lengths, 98 | const int* const input_lengths, 99 | int alphabet_size, 100 | int minibatch, 101 | float *costs, 102 | void *workspace, 103 | ctcOptions options); 104 | 105 | 106 | /** For a given set of labels and minibatch size return the required workspace 107 | * size. This will need to be allocated in the same memory space as your 108 | * probabilities. 109 | * \param [in] label_lengths Always in CPU memory. The length of each label 110 | * for each example in the minibatch. 111 | * \param [in] input_lengths Always in CPU memory. The number of time steps 112 | * for each sequence in the minibatch. 113 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 114 | * the number of probabilities at each time step 115 | * \param [in] mini_batch How many examples in a minibatch. 116 | * \param [in] info see struct ctcOptions 117 | * \param [out] size_bytes is pointer to a scalar where the memory 118 | * requirement in bytes will be placed. This memory should be allocated 119 | * at the same place, CPU or GPU, that the probs are in 120 | * 121 | * \return Status information 122 | **/ 123 | ctcStatus_t get_workspace_size(const int* const label_lengths, 124 | const int* const input_lengths, 125 | int alphabet_size, int minibatch, 126 | ctcOptions info, 127 | size_t* size_bytes); 128 | 129 | #ifdef __cplusplus 130 | } 131 | #endif 132 | -------------------------------------------------------------------------------- /include/detail/ctc_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "hostdevice.h" 8 | 9 | namespace ctc_helper { 10 | 11 | static const float threshold = 1e-1; 12 | 13 | template 14 | HOSTDEVICE 15 | T neg_inf() { return -T(INFINITY); } 16 | 17 | inline int div_up(int x, int y) { 18 | return (x + y - 1) / y; 19 | } 20 | 21 | template struct maximum { 22 | HOSTDEVICE 23 | Res operator()(const Arg& x, const Arg& y) const { 24 | return x < y ? 
y : x; 25 | } 26 | }; 27 | 28 | template struct add { 29 | HOSTDEVICE 30 | Res operator()(const Arg& x, const Arg& y) const { 31 | return x + y; 32 | } 33 | }; 34 | 35 | template struct identity { 36 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);} 37 | }; 38 | 39 | template struct negate { 40 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);} 41 | }; 42 | 43 | template struct exponential { 44 | HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);} 45 | }; 46 | 47 | template 48 | struct log_plus { 49 | typedef Res result_type; 50 | HOSTDEVICE 51 | Res operator()(const Arg1& p1, const Arg2& p2) { 52 | if (p1 == neg_inf()) 53 | return p2; 54 | if (p2 == neg_inf()) 55 | return p1; 56 | Res result = log1p(exp(-fabs(p1 - p2))) + maximum()(p1, p2); 57 | return result; 58 | } 59 | }; 60 | 61 | } 62 | -------------------------------------------------------------------------------- /include/detail/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | -------------------------------------------------------------------------------- /include/detail/reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 4 | ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 5 | ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 6 | -------------------------------------------------------------------------------- /pytorch_binding/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore compiled FFI location 2 | warpctc_pytorch/_warp_ctc 3 | 4 | # Created by https://www.gitignore.io/api/python 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule.* 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | 110 | # End of https://www.gitignore.io/api/python 111 | -------------------------------------------------------------------------------- /pytorch_binding/.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | 5 | [pypi] 6 | repository: https://upload.pypi.org/legacy/ 7 | username: __token__ 8 | -------------------------------------------------------------------------------- /pytorch_binding/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | -------------------------------------------------------------------------------- /pytorch_binding/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import shutil 4 | import sys 5 | from setuptools import setup, find_packages 6 | from subprocess import Popen, PIPE 7 | 8 | from torch.utils.cpp_extension import BuildExtension, CppExtension 9 | import torch 10 | 11 | extra_compile_args = ['-std=c++14', '-fPIC', '-fopenmp'] 12 | warp_ctc_path = "../build" 13 | 14 | if platform.system() == 'Darwin': 15 | lib_ext = ".dylib" 16 | else: 17 | lib_ext = ".so" 18 | warp_ctc_libname = 'libwarpctc{}'.format(lib_ext) 19 | 20 | if "WARP_CTC_PATH" in os.environ: 21 | warp_ctc_path = os.environ["WARP_CTC_PATH"] 22 | if not os.path.exists(os.path.join(warp_ctc_path, warp_ctc_libname)): 23 | print(("Could not find {libname} in {build_path}.\n" 24 | "Build warp-ctc and set WARP_CTC_PATH to the location of" 25 | " {libname} (default is '../build')").format( 26 | libname=warp_ctc_libname, build_path=warp_ctc_path)) 27 | sys.exit(1) 28 | 29 | include_dirs = [os.path.realpath('../include')] 30 | 31 | warp_ctc_libpath = "./warpctc_pytorch/lib" 32 | if not os.path.isdir(warp_ctc_libpath): 33 | os.mkdir(warp_ctc_libpath) 34 | shutil.copyfile( 35 | '{}/{}'.format(warp_ctc_path, warp_ctc_libname), 36 | '{}/{}'.format(warp_ctc_libpath, warp_ctc_libname) 37 | ) 38 | 39 | 40 | def get_cuda_version(): 41 | proc = Popen(['nvcc', '--version'], stdout=PIPE, stderr=PIPE) 42 | out, err = proc.communicate() 43 | out.decode('utf-8').split('\n')[-2].split(', ')[-2].split(' ') 44 | return out.decode('utf-8').split()[-2][:-1].replace('.', '') 45 | 46 | 47 | def get_torch_version(): 48 | major_ver, minor_ver = torch.__version__.split('.')[:2] 49 | return major_ver + minor_ver 50 | 
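# (Editor's note, not part of the original setup.py.) Worked example of the two
# helpers above, assuming torch.__version__ == "1.6.0" and `nvcc --version`
# reporting "release 10.1": get_torch_version() returns "16" and
# get_cuda_version() returns "101", so get_local_version_identifier() defined
# below produces "+torch16.cuda101" for a GPU build or "+torch16.cpu" for a
# CPU-only build, yielding package versions such as "0.2.2+torch16.cuda101".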
51 | 52 | def get_local_version_identifier(enable_gpu): 53 | local_version_identifier = '+torch{}'.format(get_torch_version()) 54 | if enable_gpu: 55 | local_version_identifier += ".cuda{}".format(get_cuda_version()) 56 | else: 57 | local_version_identifier += ".cpu" 58 | return local_version_identifier 59 | 60 | 61 | if torch.cuda.is_available() or "CUDA_HOME" in os.environ: 62 | enable_gpu = True 63 | # For CUDA10.1, libcublas-10-2 is installed 64 | # and we have to add /usr/local/cuda-10.2 to search paths 65 | if get_cuda_version() == "101": 66 | include_dirs.append("/usr/local/cuda-10.2/include") 67 | else: 68 | print("Torch was not built with CUDA support, not building warp-ctc GPU extensions.") 69 | enable_gpu = False 70 | 71 | if enable_gpu: 72 | from torch.utils.cpp_extension import CUDAExtension 73 | 74 | build_extension = CUDAExtension 75 | extra_compile_args += ['-DWARPCTC_ENABLE_GPU'] 76 | else: 77 | build_extension = CppExtension 78 | 79 | ext_modules = [ 80 | build_extension( 81 | name='warpctc_pytorch._warp_ctc', 82 | language='c++', 83 | sources=['src/binding.cpp'], 84 | include_dirs=include_dirs, 85 | library_dirs=[os.path.realpath(warp_ctc_libpath)], 86 | libraries=['warpctc'], 87 | extra_link_args=['-Wl,-rpath,{}'.format('$ORIGIN/lib')], 88 | extra_compile_args=extra_compile_args 89 | ) 90 | ] 91 | 92 | public_version_identifier = "0.2.2" 93 | setup( 94 | name="warpctc_pytorch", 95 | version=public_version_identifier + get_local_version_identifier(enable_gpu), 96 | description="Pytorch Bindings for warp-ctc maintained by ESPnet", 97 | url="https://github.com/espnet/warp-ctc", 98 | author=','.join([ 99 | "Jared Casper", 100 | "Sean Naren", 101 | "Shinji Watanabe", 102 | "Jiro Nishitoba", 103 | "Yusuke Nishioka" 104 | ]), 105 | author_email=','.join([ 106 | "jared.casper@baidu.com", 107 | "sean.narenthiran@digitalreasoning.com", 108 | "sw005320@gmail.com", 109 | "j.nshtb+github@gmail.com", 110 | "yusuke.nishioka.0713@gmail.com" 111 | ]), 112 | license="Apache", 113 | packages=find_packages(), 114 | package_data={'': ['lib/{}'.format(warp_ctc_libname)]}, 115 | ext_modules=ext_modules, 116 | cmdclass={'build_ext': BuildExtension} 117 | ) 118 | -------------------------------------------------------------------------------- /pytorch_binding/src/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | #ifdef WARPCTC_ENABLE_GPU 9 | #include "ATen/cuda/CUDAContext.h" 10 | #include 11 | #include "ATen/cuda/CUDAEvent.h" 12 | 13 | #include "THC.h" 14 | extern THCState* state; 15 | #endif 16 | 17 | #include "ctc.h" 18 | 19 | int cpu_ctc(torch::Tensor probs, 20 | torch::Tensor grads, 21 | torch::Tensor labels, 22 | torch::Tensor label_sizes, 23 | torch::Tensor sizes, 24 | int minibatch_size, 25 | torch::Tensor costs, 26 | int blank_label) 27 | { 28 | float* probs_ptr = (float*)probs.data_ptr(); 29 | float* grads_ptr = grads.storage() ? 
(float*)grads.data_ptr() : NULL; 30 | int* sizes_ptr = (int*)sizes.data_ptr(); 31 | int* labels_ptr = (int*)labels.data_ptr(); 32 | int* label_sizes_ptr = (int*)label_sizes.data_ptr(); 33 | float* costs_ptr = (float*)costs.data_ptr(); 34 | 35 | const int probs_size = probs.size(2); 36 | 37 | ctcOptions options; 38 | memset(&options, 0, sizeof(options)); 39 | options.loc = CTC_CPU; 40 | options.num_threads = 0; // will use default number of threads 41 | options.blank_label = blank_label; 42 | 43 | #if defined(CTC_DISABLE_OMP) || defined(APPLE) 44 | // have to use at least one 45 | options.num_threads = std::max(options.num_threads, (unsigned int) 1); 46 | #endif 47 | 48 | size_t cpu_size_bytes; 49 | get_workspace_size(label_sizes_ptr, sizes_ptr, 50 | probs_size, minibatch_size, 51 | options, &cpu_size_bytes); 52 | 53 | float* cpu_workspace = new float[cpu_size_bytes / sizeof(float)]; 54 | 55 | compute_ctc_loss(probs_ptr, grads_ptr, 56 | labels_ptr, label_sizes_ptr, 57 | sizes_ptr, probs_size, 58 | minibatch_size, costs_ptr, 59 | cpu_workspace, options); 60 | 61 | delete[] cpu_workspace; 62 | return 1; 63 | } 64 | 65 | #ifdef WARPCTC_ENABLE_GPU 66 | int gpu_ctc(torch::Tensor probs, 67 | torch::Tensor grads, 68 | torch::Tensor labels, 69 | torch::Tensor label_sizes, 70 | torch::Tensor sizes, 71 | int minibatch_size, 72 | torch::Tensor costs, 73 | int blank_label) 74 | { 75 | float* probs_ptr = (float*)probs.data_ptr(); 76 | float* grads_ptr = grads.storage() ? (float*)grads.data_ptr() : NULL; 77 | int* sizes_ptr = (int*)sizes.data_ptr(); 78 | int* labels_ptr = (int*)labels.data_ptr(); 79 | int* label_sizes_ptr = (int*)label_sizes.data_ptr(); 80 | float* costs_ptr = (float*)costs.data_ptr(); 81 | 82 | const int probs_size = probs.size(2); 83 | 84 | ctcOptions options; 85 | memset(&options, 0, sizeof(options)); 86 | options.loc = CTC_GPU; 87 | options.blank_label = blank_label; 88 | options.stream = at::cuda::getCurrentCUDAStream(); 89 | 90 | size_t gpu_size_bytes; 91 | get_workspace_size(label_sizes_ptr, sizes_ptr, 92 | probs_size, minibatch_size, 93 | options, &gpu_size_bytes); 94 | 95 | void* gpu_workspace = THCudaMalloc(state, gpu_size_bytes); 96 | 97 | compute_ctc_loss(probs_ptr, grads_ptr, 98 | labels_ptr, label_sizes_ptr, 99 | sizes_ptr, probs_size, 100 | minibatch_size, costs_ptr, 101 | gpu_workspace, options); 102 | 103 | THCudaFree(state, (void *) gpu_workspace); 104 | return 1; 105 | } 106 | #endif 107 | 108 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 109 | m.def("cpu_ctc", &cpu_ctc, "CTC Loss function with cpu"); 110 | #ifdef WARPCTC_ENABLE_GPU 111 | m.def("gpu_ctc", &gpu_ctc, "CTC Loss function with gpu"); 112 | #endif 113 | } 114 | -------------------------------------------------------------------------------- /pytorch_binding/src/cpu_binding.h: -------------------------------------------------------------------------------- 1 | /* 2 | int cpu_ctc(THFloatTensor *probs, 3 | THFloatTensor *grads, 4 | THIntTensor *labels_ptr, 5 | THIntTensor *label_sizes_ptr, 6 | THIntTensor *sizes, 7 | int minibatch_size, 8 | THFloatTensor *costs, 9 | int blank_label); 10 | */ 11 | 12 | int cpu_ctc(torch::Tensor probs, 13 | torch::Tensor grads, 14 | torch::Tensor labels, 15 | torch::Tensor label_sizes, 16 | torch::Tensor sizes, 17 | int minibatch_size, 18 | torch::Tensor costs, 19 | int blank_label); 20 | -------------------------------------------------------------------------------- /pytorch_binding/src/gpu_binding.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | int gpu_ctc(THCudaTensor *probs, 3 | THCudaTensor *grads, 4 | THIntTensor *labels_ptr, 5 | THIntTensor *label_sizes_ptr, 6 | THIntTensor *sizes, 7 | int minibatch_size, 8 | THFloatTensor *costs, 9 | int blank_label); 10 | */ 11 | 12 | int gpu_ctc(torch::Tensor probs, 13 | torch::Tensor grads, 14 | torch::Tensor labels, 15 | torch::Tensor label_sizes, 16 | torch::Tensor sizes, 17 | int minibatch_size, 18 | torch::Tensor costs, 19 | int blank_label); 20 | -------------------------------------------------------------------------------- /pytorch_binding/tests/test_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warpctc_pytorch as warp_ctc 3 | import pytest 4 | 5 | 6 | def test_simple(): 7 | probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous() 8 | grads = torch.zeros(probs.size()) 9 | labels = torch.IntTensor([1, 2]) 10 | label_sizes = torch.IntTensor([2]) 11 | sizes = torch.IntTensor(probs.size(1)).fill_(probs.size(0)) 12 | minibatch_size = probs.size(1) 13 | costs = torch.zeros(minibatch_size) 14 | warp_ctc.cpu_ctc(probs, 15 | grads, 16 | labels, 17 | label_sizes, 18 | sizes, 19 | minibatch_size, 20 | costs, 21 | 0) 22 | print('CPU_cost: %f' % costs.sum()) 23 | 24 | 25 | @pytest.mark.parametrize("multiplier", [1.0, 200.0]) 26 | def test_medium(multiplier): 27 | probs = torch.FloatTensor([ 28 | [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]], 29 | [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]] 30 | ]).contiguous() * multiplier 31 | 32 | grads = torch.zeros(probs.size()) 33 | labels = torch.IntTensor([1, 2, 1, 2]) 34 | label_sizes = torch.IntTensor([2, 2]) 35 | sizes = torch.IntTensor([2, 2]) 36 | minibatch_size = probs.size(1) 37 | costs = torch.zeros(minibatch_size) 38 | warp_ctc.cpu_ctc(probs, 39 | grads, 40 | labels, 41 | label_sizes, 42 | sizes, 43 | minibatch_size, 44 | costs, 45 | 0) 46 | print('CPU_cost: %f' % costs.sum()) 47 | 48 | 49 | def test_empty_label(): 50 | probs = torch.FloatTensor([ 51 | [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]], 52 | [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]] 53 | ]).contiguous() 54 | 55 | grads = torch.zeros(probs.size()) 56 | labels = torch.IntTensor([1, 2]) 57 | label_sizes = torch.IntTensor([2, 0]) 58 | sizes = torch.IntTensor([2, 2]) 59 | minibatch_size = probs.size(1) 60 | costs = torch.zeros(minibatch_size) 61 | warp_ctc.cpu_ctc(probs, 62 | grads, 63 | labels, 64 | label_sizes, 65 | sizes, 66 | minibatch_size, 67 | costs, 68 | 0) 69 | print('CPU_cost: %f' % costs.sum()) 70 | 71 | 72 | def test_CTCLoss(): 73 | probs = torch.FloatTensor([[ 74 | [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1] 75 | ]]).transpose(0, 1).contiguous() 76 | labels = torch.IntTensor([1, 2]) 77 | label_sizes = torch.IntTensor([2]) 78 | probs_sizes = torch.IntTensor([2]) 79 | probs.requires_grad_(True) 80 | 81 | ctc_loss = warp_ctc.CTCLoss() 82 | cost = ctc_loss(probs, labels, probs_sizes, label_sizes) 83 | cost.backward() 84 | 85 | 86 | if __name__ == '__main__': 87 | pytest.main([__file__]) 88 | -------------------------------------------------------------------------------- /pytorch_binding/tests/test_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warpctc_pytorch as warp_ctc 3 | import pytest 4 | 5 | 6 | @pytest.mark.skipif(not 
torch.cuda.is_available(), reason="requires GPU") 7 | def test_simple(): 8 | probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous() 9 | grads = torch.zeros(probs.size()) 10 | labels = torch.IntTensor([1, 2]) 11 | label_sizes = torch.IntTensor([2]) 12 | sizes = torch.IntTensor(probs.size(1)).fill_(probs.size(0)) 13 | minibatch_size = probs.size(1) 14 | costs = torch.zeros(minibatch_size) 15 | warp_ctc.cpu_ctc(probs, 16 | grads, 17 | labels, 18 | label_sizes, 19 | sizes, 20 | minibatch_size, 21 | costs, 22 | 0) 23 | print('CPU_cost: %f' % costs.sum()) 24 | probs = probs.clone().cuda() 25 | grads = torch.zeros(probs.size()).cuda() 26 | costs = torch.zeros(minibatch_size) 27 | warp_ctc.gpu_ctc(probs, 28 | grads, 29 | labels, 30 | label_sizes, 31 | sizes, 32 | minibatch_size, 33 | costs, 34 | 0) 35 | print('GPU_cost: %f' % costs.sum()) 36 | print(grads.view(grads.size(0) * grads.size(1), grads.size(2))) 37 | 38 | 39 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU") 40 | @pytest.mark.parametrize("multiplier", [1.0, 200.0]) 41 | def test_medium(multiplier): 42 | probs = torch.FloatTensor([ 43 | [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]], 44 | [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]] 45 | ]).contiguous() * multiplier 46 | 47 | grads = torch.zeros(probs.size()) 48 | labels = torch.IntTensor([1, 2, 1, 2]) 49 | label_sizes = torch.IntTensor([2, 2]) 50 | sizes = torch.IntTensor([2, 2]) 51 | minibatch_size = probs.size(1) 52 | costs = torch.zeros(minibatch_size) 53 | warp_ctc.cpu_ctc(probs, 54 | grads, 55 | labels, 56 | label_sizes, 57 | sizes, 58 | minibatch_size, 59 | costs, 60 | 0) 61 | print('CPU_cost: %f' % costs.sum()) 62 | probs = probs.clone().cuda() 63 | grads = torch.zeros(probs.size()).cuda() 64 | costs = torch.zeros(minibatch_size) 65 | warp_ctc.gpu_ctc(probs, 66 | grads, 67 | labels, 68 | label_sizes, 69 | sizes, 70 | minibatch_size, 71 | costs, 72 | 0) 73 | print('GPU_cost: %f' % costs.sum()) 74 | print(grads.view(grads.size(0) * grads.size(1), grads.size(2))) 75 | 76 | 77 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU") 78 | def test_empty_label(): 79 | probs = torch.FloatTensor([ 80 | [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]], 81 | [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]] 82 | ]).contiguous() 83 | 84 | grads = torch.zeros(probs.size()) 85 | labels = torch.IntTensor([1, 2]) 86 | label_sizes = torch.IntTensor([2, 0]) 87 | sizes = torch.IntTensor([2, 2]) 88 | minibatch_size = probs.size(1) 89 | costs = torch.zeros(minibatch_size) 90 | warp_ctc.cpu_ctc(probs, 91 | grads, 92 | labels, 93 | label_sizes, 94 | sizes, 95 | minibatch_size, 96 | costs, 97 | 0) 98 | print('CPU_cost: %f' % costs.sum()) 99 | probs = probs.clone().cuda() 100 | grads = torch.zeros(probs.size()).cuda() 101 | costs = torch.zeros(minibatch_size) 102 | warp_ctc.gpu_ctc(probs, 103 | grads, 104 | labels, 105 | label_sizes, 106 | sizes, 107 | minibatch_size, 108 | costs, 109 | 0) 110 | print('GPU_cost: %f' % costs.sum()) 111 | print(grads.view(grads.size(0) * grads.size(1), grads.size(2))) 112 | 113 | 114 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU") 115 | def test_CTCLoss(): 116 | probs = torch.FloatTensor([[ 117 | [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1] 118 | ]]).transpose(0, 1).contiguous().cuda() 119 | labels = torch.IntTensor([1, 2]) 120 | label_sizes = torch.IntTensor([2]) 121 | probs_sizes = torch.IntTensor([2]) 122 
| probs.requires_grad_(True) 123 | 124 | ctc_loss = warp_ctc.CTCLoss() 125 | cost = ctc_loss(probs, labels, probs_sizes, label_sizes) 126 | cost.backward() 127 | 128 | 129 | if __name__ == '__main__': 130 | pytest.main([__file__]) 131 | -------------------------------------------------------------------------------- /pytorch_binding/warpctc_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warpctc_pytorch as warp_ctc 3 | from torch.autograd import Function 4 | from torch.nn import Module 5 | 6 | from ._warp_ctc import * # noqa 7 | 8 | __version__ = '0.2.2' 9 | 10 | 11 | def _assert_no_grad(tensor): 12 | assert not tensor.requires_grad, \ 13 | "gradients only computed for acts - please " \ 14 | "mark other tensors as not requiring gradients" 15 | 16 | 17 | class _CTC(Function): 18 | @staticmethod 19 | def forward(ctx, acts, labels, act_lens, label_lens, size_average=False, 20 | length_average=False, blank=0, reduce=True): 21 | is_cuda = True if acts.is_cuda else False 22 | acts = acts.contiguous() 23 | loss_func = warp_ctc.gpu_ctc if is_cuda else warp_ctc.cpu_ctc 24 | grads = torch.zeros(acts.size()).type_as(acts) 25 | minibatch_size = acts.size(1) 26 | costs = torch.zeros(minibatch_size).cpu() 27 | loss_func(acts, 28 | grads, 29 | labels, 30 | label_lens, 31 | act_lens, 32 | minibatch_size, 33 | costs, 34 | blank) 35 | 36 | if reduce: 37 | costs = torch.FloatTensor([costs.sum()]) 38 | 39 | if length_average: 40 | # Compute the avg. log-probability per batch sample and frame. 41 | total_length = torch.sum(act_lens).item() 42 | grads = grads / total_length 43 | costs = costs / total_length 44 | elif size_average: 45 | # Compute the avg. log-probability per batch sample. 46 | grads = grads / minibatch_size 47 | costs = costs / minibatch_size 48 | else: 49 | # Make the costs size be B x 1, then grad_output is also B x 1 50 | # Thus the `grad_output' in backward() is broadcastable 51 | costs = costs.unsqueeze(1) 52 | 53 | ctx.grads = grads 54 | return costs 55 | 56 | @staticmethod 57 | def backward(ctx, grad_output): 58 | _grad_output = grad_output.to(ctx.grads.device) 59 | return ctx.grads.mul_(_grad_output), None, None, None, None, None, None, None 60 | 61 | class CTCLoss(Module): 62 | """ 63 | Parameters: 64 | size_average (bool): normalize the loss by the batch size 65 | (default: `False`) 66 | length_average (bool): normalize the loss by the total number of frames 67 | in the batch. If `True`, supersedes `size_average` 68 | (default: `False`) 69 | reduce (bool): average or sum over observation for each minibatch. 70 | If `False`, returns a loss per batch element instead and ignores `average` options. 
71 | (default: `True`) 72 | """ 73 | def __init__(self, blank=0, size_average=False, length_average=False, reduce=True): 74 | super(CTCLoss, self).__init__() 75 | self.ctc = _CTC.apply 76 | self.blank = blank 77 | self.size_average = size_average 78 | self.length_average = length_average 79 | self.reduce = reduce 80 | 81 | def forward(self, acts, labels, act_lens, label_lens): 82 | """ 83 | acts: Tensor of (seqLength x batch x outputDim) containing output from network 84 | labels: 1 dimensional Tensor containing all the targets of the batch in one sequence 85 | act_lens: Tensor of size (batch) containing size of each output sequence from the network 86 | label_lens: Tensor of (batch) containing label length of each example 87 | """ 88 | assert len(labels.size()) == 1 # labels must be 1 dimensional 89 | _assert_no_grad(labels) 90 | _assert_no_grad(act_lens) 91 | _assert_no_grad(label_lens) 92 | return self.ctc(acts, labels, act_lens, label_lens, self.size_average, 93 | self.length_average, self.blank, self.reduce) 94 | -------------------------------------------------------------------------------- /pytorch_binding/wheel/build_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | 5 | function install_torch_of_specified_version() { 6 | version=$1 7 | pip install torch==$1 8 | } 9 | 10 | function build_wheel() { 11 | python setup.py bdist_wheel 12 | python wheel/rename_wheels.py 13 | } 14 | 15 | function install_wheel() { 16 | torch_version=$1 17 | 18 | torch_vers=(${torch_version//./ }) 19 | torch_major_ver=${torch_vers[0]} 20 | torch_minor_ver=${torch_vers[1]} 21 | pip install dist/warpctc_pytorch-*+torch${torch_major_ver}${torch_minor_ver}*.whl 22 | } 23 | 24 | function run_tests() { 25 | pytest tests 26 | pytest --flakes 27 | } 28 | 29 | function post_process() { 30 | python setup.py clean 31 | pip uninstall -y warpctc-pytorch torch 32 | rm -rf build warpctc_pytorch.egg-info 33 | } 34 | 35 | torch_versions=(${TORCH_VERSIONS//:/ }) 36 | for torch_version in ${torch_versions[@]}; do 37 | install_torch_of_specified_version $torch_version 38 | build_wheel 39 | install_wheel $torch_version 40 | run_tests 41 | post_process 42 | done 43 | -------------------------------------------------------------------------------- /pytorch_binding/wheel/rename_wheels.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | 5 | 6 | for whl_path in glob.glob(os.path.join(os.getcwd(), 'dist', '*.whl')): 7 | whl_name = os.path.basename(whl_path) 8 | dist, version, python_tag, abi_tag, platform_tag = whl_name.split('-') 9 | if 'manylinux' in platform_tag: 10 | continue 11 | platform_tag = platform_tag.replace('linux', 'manylinux1') 12 | new_whl_name = '-'.join([dist, version, python_tag, abi_tag, platform_tag]) 13 | new_whl_path = os.path.join(os.path.dirname(whl_path), new_whl_name) 14 | shutil.move(whl_path, new_whl_path) 15 | -------------------------------------------------------------------------------- /src/ctc_entrypoint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include "detail/cpu_ctc.h" 8 | #ifdef __CUDACC__ 9 | #include "detail/gpu_ctc.h" 10 | #endif 11 | 12 | 13 | extern "C" { 14 | 15 | int get_warpctc_version() { 16 | return 2; 17 | } 18 | 19 | const char* ctcGetStatusString(ctcStatus_t status) { 20 | switch (status) { 21 | case 
CTC_STATUS_SUCCESS: 22 | return "no error"; 23 | case CTC_STATUS_MEMOPS_FAILED: 24 | return "cuda memcpy or memset failed"; 25 | case CTC_STATUS_INVALID_VALUE: 26 | return "invalid value"; 27 | case CTC_STATUS_EXECUTION_FAILED: 28 | return "execution failed"; 29 | 30 | case CTC_STATUS_UNKNOWN_ERROR: 31 | default: 32 | return "unknown error"; 33 | 34 | } 35 | 36 | } 37 | 38 | 39 | ctcStatus_t compute_ctc_loss(const float* const activations, 40 | float* gradients, 41 | const int* const flat_labels, 42 | const int* const label_lengths, 43 | const int* const input_lengths, 44 | int alphabet_size, 45 | int minibatch, 46 | float *costs, 47 | void *workspace, 48 | ctcOptions options) { 49 | 50 | if (activations == nullptr || 51 | flat_labels == nullptr || 52 | label_lengths == nullptr || 53 | input_lengths == nullptr || 54 | costs == nullptr || 55 | workspace == nullptr || 56 | alphabet_size <= 0 || 57 | minibatch <= 0) 58 | return CTC_STATUS_INVALID_VALUE; 59 | 60 | if (options.loc == CTC_CPU) { 61 | CpuCTC ctc(alphabet_size, minibatch, workspace, options.num_threads, 62 | options.blank_label); 63 | 64 | if (gradients != NULL) 65 | return ctc.cost_and_grad(activations, gradients, 66 | costs, 67 | flat_labels, label_lengths, 68 | input_lengths); 69 | else 70 | return ctc.score_forward(activations, costs, flat_labels, 71 | label_lengths, input_lengths); 72 | } else if (options.loc == CTC_GPU) { 73 | #ifdef __CUDACC__ 74 | GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, 75 | options.blank_label); 76 | 77 | if (gradients != NULL) 78 | return ctc.cost_and_grad(activations, gradients, costs, 79 | flat_labels, label_lengths, 80 | input_lengths); 81 | else 82 | return ctc.score_forward(activations, costs, flat_labels, 83 | label_lengths, input_lengths); 84 | #else 85 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 86 | return CTC_STATUS_EXECUTION_FAILED; 87 | #endif 88 | } else { 89 | return CTC_STATUS_INVALID_VALUE; 90 | } 91 | } 92 | 93 | 94 | ctcStatus_t get_workspace_size(const int* const label_lengths, 95 | const int* const input_lengths, 96 | int alphabet_size, int minibatch, 97 | ctcOptions options, 98 | size_t* size_bytes) 99 | { 100 | if (label_lengths == nullptr || 101 | input_lengths == nullptr || 102 | size_bytes == nullptr || 103 | alphabet_size <= 0 || 104 | minibatch <= 0) 105 | return CTC_STATUS_INVALID_VALUE; 106 | 107 | // This is the max of all S and T for all examples in the minibatch. 
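    // (Editor's note, not in the original source.) maxL below is the longest
    // label and maxT the longest utterance in the minibatch. The padded label
    // length S = 2 * maxL + 1 interleaves a blank around every label symbol,
    // and S and maxT bound the per-example alpha and probability buffers (and
    // related scratch arrays) whose sizes are accumulated in the branches that
    // follow.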
108 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 109 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 110 | 111 | const int S = 2 * maxL + 1; 112 | 113 | *size_bytes = 0; 114 | 115 | if (options.loc == CTC_GPU) { 116 | // GPU storage 117 | //nll_forward, nll_backward 118 | *size_bytes += 2 * sizeof(float) * minibatch; 119 | 120 | //repeats 121 | *size_bytes += sizeof(int) * minibatch; 122 | 123 | //label offsets 124 | *size_bytes += sizeof(int) * minibatch; 125 | 126 | //utt_length 127 | *size_bytes += sizeof(int) * minibatch; 128 | 129 | //label lengths 130 | *size_bytes += sizeof(int) * minibatch; 131 | 132 | //labels without blanks - overallocate for now 133 | *size_bytes += sizeof(int) * maxL * minibatch; 134 | 135 | //labels with blanks 136 | *size_bytes += sizeof(int) * S * minibatch; 137 | 138 | //alphas 139 | *size_bytes += sizeof(float) * S * maxT * minibatch; 140 | 141 | //denoms 142 | *size_bytes += sizeof(float) * maxT * minibatch; 143 | 144 | //probs (since we will pass in activations) 145 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 146 | 147 | } else { 148 | //cpu can eventually replace all minibatch with 149 | //max number of concurrent threads if memory is 150 | //really tight 151 | 152 | //per minibatch memory 153 | size_t per_minibatch_bytes = 0; 154 | 155 | //output 156 | per_minibatch_bytes += sizeof(float) * alphabet_size ; 157 | 158 | //alphas 159 | per_minibatch_bytes += sizeof(float) * S * maxT; 160 | 161 | //betas 162 | per_minibatch_bytes += sizeof(float) * S; 163 | 164 | //labels w/blanks, e_inc, s_inc 165 | per_minibatch_bytes += 3 * sizeof(int) * S; 166 | 167 | *size_bytes = per_minibatch_bytes * minibatch; 168 | 169 | //probs 170 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 171 | } 172 | 173 | return CTC_STATUS_SUCCESS; 174 | } 175 | 176 | } 177 | -------------------------------------------------------------------------------- /src/ctc_entrypoint.cu: -------------------------------------------------------------------------------- 1 | ctc_entrypoint.cpp -------------------------------------------------------------------------------- /src/reduce.cu: -------------------------------------------------------------------------------- 1 | // Includes, system 2 | // #include 3 | // #include 4 | 5 | // Includes, cuda 6 | // #include 7 | // #include 8 | 9 | // Includes, cuda helper functions 10 | // #include 11 | 12 | // For the functors 13 | #include "detail/ctc_helper.h" 14 | #include "ctc.h" 15 | 16 | const int warp_size = 32; 17 | 18 | template 19 | struct CTAReduce; 20 | 21 | template 22 | struct CTAReduce { 23 | enum { Size = NT, Capacity = NT }; 24 | struct Storage { T shared[Capacity]; }; 25 | 26 | __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { 27 | T* s = storage.shared; 28 | s[tid] = x; 29 | __syncthreads(); 30 | 31 | // Fold the data in half with each pass. 32 | #pragma unroll 33 | for(int offset = NT / 2; offset >= warp_size; offset /= 2) { 34 | if(tid + offset < count && tid < offset) { 35 | // Read from the right half and store to the left half. 
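            // (Editor's note, not in the original source.) Each pass of this loop
            // halves the number of live partials held in shared memory; once only
            // warp_size values remain, the tail below finishes the reduction with
            // __shfl_down_sync register shuffles instead of shared memory.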
36 | x = g(x, s[offset + tid]); 37 | s[tid] = x; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | T shuff; 43 | for (int offset = warp_size / 2; offset > 0; offset /= 2) { 44 | shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); 45 | if (tid + offset < count && tid < offset) 46 | x = g(x, shuff); 47 | } 48 | return x; 49 | } 50 | }; 51 | 52 | template 53 | __global__ void reduce_rows(Iop f, Rop g, const T* input, T* output, 54 | int num_rows, int num_cols) { 55 | 56 | typedef CTAReduce R; 57 | __shared__ typename R::Storage storage; 58 | 59 | int tid = threadIdx.x; 60 | int idx = tid; 61 | int col = blockIdx.x; 62 | T curr; 63 | 64 | // Each block works on a column 65 | if (idx < num_rows) 66 | curr = f(input[idx + col*num_rows]); 67 | idx += NT; 68 | 69 | 70 | while (idx < num_rows) { 71 | curr = g(curr, f(input[idx + col*num_rows])); 72 | idx += NT; 73 | } 74 | 75 | // Sum thread-totals over the CTA. 76 | curr = R::reduce(tid, curr, storage, num_rows, g); 77 | 78 | // Store result in out 79 | if (tid == 0) 80 | output[col] = curr; 81 | } 82 | 83 | template 84 | __global__ void reduce_cols(Iop f, Rop g, const T* input, T* output, 85 | int num_rows, int num_cols) { 86 | 87 | __shared__ T s[NT]; 88 | 89 | int warps_per_block = NT / warp_size; 90 | int row = blockDim.x * blockIdx.x + threadIdx.x; 91 | int col = threadIdx.y; 92 | T curr; 93 | 94 | if (row < num_rows && col < num_cols) { 95 | curr = f(input[row + col*num_rows]); 96 | col += blockDim.y; 97 | while (col < num_cols) { 98 | curr = g(curr, f(input[row + col*num_rows])); 99 | col += blockDim.y; 100 | } 101 | } 102 | s[threadIdx.x * warps_per_block + threadIdx.y] = curr; 103 | __syncthreads(); 104 | 105 | // Reduce 106 | if (threadIdx.y == 0 && row < num_rows) { 107 | #pragma unroll 108 | for (int i = 1; i < warps_per_block && i < num_cols; ++i) 109 | curr = g(curr, s[i + threadIdx.x * warps_per_block]); 110 | output[row] = curr; 111 | } 112 | } 113 | 114 | struct ReduceHelper { 115 | 116 | template 117 | static void impl(Iof f, Rof g, const T* input, T* output, int num_rows, int num_cols, bool axis, cudaStream_t stream) { 118 | 119 | int grid_size; 120 | 121 | if (axis) { 122 | grid_size = num_cols; 123 | reduce_rows<128><<>> 124 | (f, g, input, output, num_rows, num_cols); 125 | 126 | } else { 127 | dim3 tpb(warp_size, 128 / warp_size); 128 | grid_size = (num_cols + warp_size - 1)/warp_size; 129 | reduce_cols<128><<>> 130 | (f, g, input, output, num_rows, num_cols); 131 | 132 | } 133 | } 134 | }; 135 | 136 | 137 | template 138 | ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, bool axis, cudaStream_t stream) { 139 | ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); 140 | cudaStreamSynchronize(stream); 141 | cudaError_t err = cudaGetLastError(); 142 | if (err != cudaSuccess) 143 | return CTC_STATUS_EXECUTION_FAILED; 144 | 145 | return CTC_STATUS_SUCCESS; 146 | } 147 | 148 | ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 149 | return reduce(ctc_helper::negate(), ctc_helper::add(), input, output, rows, cols, axis, stream); 150 | } 151 | 152 | ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 153 | return reduce(ctc_helper::exponential(), ctc_helper::add(), input, output, rows, cols, axis, stream); 154 | } 155 | 156 | ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 157 | return 
reduce(ctc_helper::identity(), ctc_helper::maximum(),input, output, rows, cols, axis, stream); 158 | } 159 | -------------------------------------------------------------------------------- /tests/random.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | std::vector 6 | genActs(int size) { 7 | std::vector arr(size); 8 | std::mt19937 gen(0); 9 | std::uniform_real_distribution<> dis(0, 1); 10 | for(int i = 0; i < size; ++i) 11 | arr[i] = dis(gen); 12 | return arr; 13 | } 14 | 15 | std::vector 16 | genLabels(int alphabet_size, int L) { 17 | std::vector label(L); 18 | 19 | std::mt19937 gen(1); 20 | std::uniform_int_distribution<> dis(1, alphabet_size - 1); 21 | 22 | for(int i = 0; i < L; ++i) { 23 | label[i] = dis(gen); 24 | } 25 | // guarantee repeats for testing 26 | if (L >= 3) { 27 | label[L / 2] = label[L / 2 + 1]; 28 | label[L / 2 - 1] = label[L / 2]; 29 | } 30 | return label; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include 10 | 11 | inline void throw_on_error(ctcStatus_t status, const char* message) { 12 | if (status != CTC_STATUS_SUCCESS) { 13 | throw std::runtime_error(message + (", stat = " + 14 | std::string(ctcGetStatusString(status)))); 15 | } 16 | } 17 | 18 | #ifdef __CUDACC__ 19 | #include 20 | #include 21 | 22 | inline void throw_on_error(cudaError_t error, const char* message) { 23 | if (error) { 24 | throw thrust::system_error(error, thrust::cuda_category(), message); 25 | } 26 | } 27 | 28 | #endif 29 | 30 | std::vector genActs(int size); 31 | std::vector genLabels(int alphabet_size, int L); 32 | 33 | float rel_diff(const std::vector& grad, 34 | const std::vector& num_grad) { 35 | float diff = 0.; 36 | float tot = 0.; 37 | for(size_t idx = 0; idx < grad.size(); ++idx) { 38 | diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]); 39 | tot += grad[idx] * grad[idx]; 40 | } 41 | 42 | return diff / tot; 43 | } 44 | 45 | // Numerically stable softmax for a minibatch of 1 46 | void softmax(const float* const acts, 47 | int alphabet_size, int T, 48 | float *probs) { 49 | 50 | for (int t = 0; t < T; ++t) { 51 | 52 | float max_activation = 53 | -std::numeric_limits::infinity(); 54 | 55 | for (int a = 0; a < alphabet_size; ++a) 56 | max_activation = 57 | std::max(max_activation, acts[t*alphabet_size + a]); 58 | 59 | float denom = 0; 60 | for (int a = 0; a < alphabet_size; ++a) 61 | denom += std::exp(acts[t*alphabet_size + a] - max_activation); 62 | 63 | for (int a = 0; a < alphabet_size; ++a) 64 | probs[t*alphabet_size + a] = 65 | std::exp(acts[t*alphabet_size + a] - max_activation) / denom; 66 | } 67 | } 68 | --------------------------------------------------------------------------------