├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── README.zh_cn.md ├── cmake └── hip.cmake ├── doc ├── baidu-research-logo-small.png └── deep-speech-ctc-small.png ├── include ├── contrib │ └── moderngpu │ │ ├── LICENSE │ │ └── include │ │ ├── device │ │ ├── ctaloadbalance.cuh │ │ ├── ctamerge.cuh │ │ ├── ctascan.cuh │ │ ├── ctasearch.cuh │ │ ├── ctasegreduce.cuh │ │ ├── ctasegscan.cuh │ │ ├── ctasegsort.cuh │ │ ├── ctasortedsearch.cuh │ │ ├── devicetypes.cuh │ │ ├── deviceutil.cuh │ │ ├── intrinsics.cuh │ │ ├── loadstore.cuh │ │ ├── serialsets.cuh │ │ └── sortnetwork.cuh │ │ ├── mgpudevice.cuh │ │ ├── mgpuenums.h │ │ └── util │ │ └── static.h ├── ctc.h └── detail │ ├── cpu_ctc.h │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ ├── hostdevice.h │ ├── reduce.h │ └── type_defs.h ├── src ├── ctc_entrypoint.cpp ├── ctc_entrypoint.cu └── reduce.cu ├── tensorflow_binding ├── .gitignore ├── README.md ├── setup.py ├── src │ ├── ctc_op_kernel.cc │ └── warpctc_op.cc ├── tests │ ├── __init__.py │ ├── test_ctc_loss_op.py │ └── test_warpctc_op.py └── warpctc_tensorflow │ └── __init__.py ├── tests ├── test.h ├── test_cpu.cpp └── test_gpu.cu └── torch_binding ├── TUTORIAL.md ├── TUTORIAL.zh_cn.md ├── binding.cpp ├── init.lua ├── rocks └── warp-ctc-scm-1.rockspec ├── tests ├── data │ ├── chars.txt │ ├── labels.txt │ └── sizes.txt └── test.lua ├── utils.c └── utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | Makefile 3 | build -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | IF (APPLE) 2 | cmake_minimum_required(VERSION 3.4) 3 | ELSE() 4 | cmake_minimum_required(VERSION 3.10) 5 | ENDIF() 6 | 7 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 8 | 9 | project(ctc_release LANGUAGES CXX CUDA) 10 | set(CMAKE_CXX_STANDARD 11) 11 | set(CMAKE_CUDA_STANDARD 11) 12 | 13 | include_directories(include) 14 | 15 | FIND_PACKAGE(CUDA 6.5) 16 | FIND_PACKAGE(Torch) 17 | 18 | MESSAGE(STATUS "cuda found ${CUDA_FOUND}") 19 | MESSAGE(STATUS "Torch found ${Torch_DIR}") 20 | 21 | option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) 22 | option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) 23 | option(WITH_OMP "compile warp-ctc with OpenMP." ON) 24 | option(BUILD_TESTS "build warp-ctc unit tests." ON) 25 | option(BUILD_SHARED "build warp-ctc shared library." 
ON) 26 | option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) 27 | 28 | if(WITH_ROCM) 29 | add_definitions(-DWARPCTC_WITH_HIP) 30 | include(hip) 31 | endif(WITH_ROCM) 32 | 33 | if(BUILD_SHARED) 34 | set(WARPCTC_SHARED "SHARED") 35 | else(BUILD_SHARED) 36 | set(WARPCTC_SHARED "STATIC") 37 | endif(BUILD_SHARED) 38 | 39 | if(WIN32) 40 | set(CMAKE_STATIC_LIBRARY_PREFIX lib) 41 | set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") 42 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") 43 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") 44 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") 45 | foreach(flag_var 46 | CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) 47 | if(${flag_var} MATCHES "/MD") 48 | string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") 49 | endif(${flag_var} MATCHES "/MD") 50 | endforeach(flag_var) 51 | else(WIN32) 52 | # Set c++ flags 53 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 54 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") 55 | endif(WIN32) 56 | 57 | if(APPLE) 58 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 59 | add_definitions(-DAPPLE) 60 | endif() 61 | 62 | if(WITH_OMP AND NOT APPLE) 63 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") 64 | else() 65 | add_definitions(-DCTC_DISABLE_OMP) 66 | endif() 67 | 68 | # need to be at least 30 or __shfl_down in reduce wont compile 69 | IF (CUDA_VERSION VERSION_LESS "11.0") 70 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") 71 | ENDIF() 72 | 73 | # sm35 is deprecated after cuda 12.0 74 | IF (CUDA_VERSION VERSION_LESS "12.0") 75 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") 76 | ENDIF() 77 | 78 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") 79 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") 80 | 81 | IF (CUDA_VERSION VERSION_GREATER "7.6") 82 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") 83 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") 84 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") 85 | ENDIF() 86 | 87 | IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) 88 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") 89 | ENDIF() 90 | 91 | IF ((CUDA_VERSION VERSION_GREATER "10.0") OR (CUDA_VERSION VERSION_EQUAL "10.0")) 92 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") 93 | ENDIF() 94 | 95 | IF ((CUDA_VERSION VERSION_GREATER "11.0") OR (CUDA_VERSION VERSION_EQUAL "11.0")) 96 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80") 97 | ENDIF() 98 | 99 | IF ((CUDA_VERSION VERSION_GREATER "11.2") OR (CUDA_VERSION VERSION_EQUAL "11.2")) 100 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86") 101 | ENDIF() 102 | 103 | IF ((CUDA_VERSION VERSION_GREATER "11.8") OR (CUDA_VERSION VERSION_EQUAL "11.8")) 104 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_90,code=sm_90") 105 | ENDIF() 106 | 107 | IF(NOT APPLE AND NOT WIN32) 108 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") 109 | if(WITH_OMP) 110 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") 111 | endif() 112 | ENDIF() 113 | 114 | IF (APPLE) 115 | EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 116 | STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 117 | 
MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") 118 | 119 | # for El Capitan we have to use rpath 120 | 121 | IF (DARWIN_VERSION LESS 15) 122 | set(CMAKE_SKIP_RPATH TRUE) 123 | ENDIF () 124 | 125 | ELSE() 126 | # always skip for linux 127 | set(CMAKE_SKIP_RPATH TRUE) 128 | ENDIF() 129 | 130 | # Windows treats a symbolic file as a real file, which is different from Unix. 131 | # We create a hidden file and compile it instead of the original source file. 132 | function(windows_symbolic TARGET) 133 | set(oneValueArgs "") 134 | set(multiValueArgs SRCS PATH DEPS) 135 | cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 136 | set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) 137 | foreach(src ${windows_symbolic_SRCS}) 138 | get_filename_component(src ${src} NAME_WE) 139 | if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu) 140 | message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.") 141 | endif() 142 | 143 | # only copy the xx.cu to .xx.cu when the content is modified 144 | add_custom_command(OUTPUT ${final_path}/.${src}.cu 145 | COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cpp" "${final_path}/.${src}.cu" 146 | COMMENT "create hidden file of ${src}.cu") 147 | add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) 148 | endforeach() 149 | endfunction() 150 | 151 | IF (WITH_GPU OR WITH_ROCM) 152 | 153 | MESSAGE(STATUS "Building shared library with GPU support") 154 | 155 | IF (WITH_GPU) 156 | MESSAGE(STATUS "NVCC_ARCH_FLAGS: ${CUDA_NVCC_FLAGS}") 157 | ENDIF() 158 | 159 | if (WIN32) 160 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"") 161 | windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src) 162 | CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu) 163 | else() 164 | IF (WITH_GPU) 165 | ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) 166 | ELSE() 167 | HIP_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) 168 | TARGET_LINK_LIBRARIES(warpctc PUBLIC ${ROCM_HIPRTC_LIB}) 169 | ENDIF() 170 | endif(WIN32) 171 | 172 | IF (NOT WITH_TORCH) 173 | MESSAGE(STATUS "Link rand library") 174 | 175 | IF (WITH_GPU) 176 | MESSAGE(STATUS "Link cuda rand library: ${CUDA_curand_LIBRARY}") 177 | TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) 178 | ELSE() 179 | MESSAGE(STATUS "Link hip rand library: ${hiprand_LIBRARY_DIRS}") 180 | TARGET_LINK_LIBRARIES(warpctc ${hiprand_LIBRARY_DIRS}/libhiprand.so) 181 | ENDIF() 182 | ENDIF() 183 | 184 | if(BUILD_TESTS) 185 | MESSAGE(STATUS "Build tests") 186 | 187 | IF (WITH_GPU) 188 | add_executable(test_cpu tests/test_cpu.cpp) 189 | ELSE() 190 | add_executable(test_cpu tests/test_cpu.cpp) 191 | ENDIF() 192 | 193 | 194 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 195 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS}") 196 | 197 | IF (WITH_GPU) 198 | cuda_add_executable(test_gpu tests/test_gpu.cu) 199 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) 200 | ELSE() 201 | hip_add_executable(test_gpu tests/test_gpu.cu) 202 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${hiprand_LIBRARY_DIRS}/libhiprand.so) 203 | ENDIF() 204 | endif(BUILD_TESTS) 205 | 206 | INSTALL(TARGETS warpctc 207 | RUNTIME DESTINATION "bin" 208 | LIBRARY DESTINATION "lib" 209 | ARCHIVE DESTINATION "lib") 210 | 211 | INSTALL(FILES include/ctc.h
DESTINATION "include") 212 | 213 | IF (WITH_TORCH AND WITH_GPU) 214 | MESSAGE(STATUS "Building Torch Bindings with GPU support") 215 | INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc") 216 | INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC) 217 | 218 | TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) 219 | INSTALL(TARGETS warpctc 220 | RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" 221 | LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" 222 | ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") 223 | 224 | SET(src torch_binding/binding.cpp torch_binding/utils.c) 225 | SET(luasrc torch_binding/init.lua) 226 | 227 | ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") 228 | IF (APPLE) 229 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) 230 | ELSE() 231 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp) 232 | ENDIF() 233 | ENDIF() 234 | 235 | ELSE() 236 | MESSAGE(STATUS "Building shared library with no GPU support") 237 | 238 | if (NOT APPLE AND NOT WIN32) 239 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 240 | ENDIF() 241 | 242 | ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) 243 | 244 | if(BUILD_TESTS) 245 | add_executable(test_cpu tests/test_cpu.cpp ) 246 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 247 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS}") 248 | endif(BUILD_TESTS) 249 | 250 | INSTALL(TARGETS warpctc 251 | RUNTIME DESTINATION "bin" 252 | LIBRARY DESTINATION "lib" 253 | ARCHIVE DESTINATION "lib") 254 | 255 | INSTALL(FILES include/ctc.h DESTINATION "include") 256 | 257 | IF (WITH_TORCH) 258 | MESSAGE(STATUS "Building Torch Bindings with no GPU support") 259 | add_definitions(-DTORCH_NOGPU) 260 | INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) 261 | 262 | TARGET_LINK_LIBRARIES(warpctc luajit luaT TH) 263 | 264 | INSTALL(TARGETS warpctc 265 | RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" 266 | LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" 267 | ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") 268 | 269 | SET(src torch_binding/binding.cpp torch_binding/utils.c) 270 | SET(luasrc torch_binding/init.lua) 271 | 272 | ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") 273 | IF (APPLE) 274 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH) 275 | ELSE() 276 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp) 277 | ENDIF() 278 | ENDIF() 279 | 280 | ENDIF() 281 | 282 | 283 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Baidu Logo](/doc/baidu-research-logo-small.png) 2 | 3 | [In Chinese 中文版](README.zh_cn.md) 4 | 5 | # warp-ctc 6 | 7 | A fast parallel implementation of CTC, on both CPU and GPU. 8 | 9 | ## Introduction 10 | 11 | [Connectionist Temporal Classification](http://www.cs.toronto.edu/~graves/icml_2006.pdf) 12 | is a loss function useful for performing supervised learning on sequence data, 13 | without needing an alignment between input data and labels. For example, CTC 14 | can be used to train 15 | [end-to-end](http://www.jmlr.org/proceedings/papers/v32/graves14.pdf) 16 | [systems](http://arxiv.org/pdf/1408.2873v2.pdf) for 17 | [speech recognition](http://arxiv.org/abs/1512.02595), 18 | which is how we have been using it at Baidu's Silicon Valley AI Lab. 
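For reference, the quantity CTC computes can be written down compactly. In the notation of the Graves et al. paper linked above (this is the standard formulation, not anything specific to this repository), the network emits per-timestep label probabilities, and the probability of a labelling is a sum over every alignment (path) that collapses to that labelling once repeated labels and blanks are removed:

```latex
% Standard CTC objective (Graves et al., 2006). B is the collapsing map that
% removes repeated labels and blanks from a path \pi; the training loss is the
% negative log-likelihood of the target labelling \ell given the input x.
P(\ell \mid x) = \sum_{\pi \in \mathcal{B}^{-1}(\ell)} \prod_{t=1}^{T} y^{t}_{\pi_t},
\qquad
\mathcal{L}_{\mathrm{CTC}}(x, \ell) = -\log P(\ell \mid x)
```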
19 | 20 | ![DSCTC](/doc/deep-speech-ctc-small.png) 21 | 22 | The illustration above shows CTC computing the probability of an output 23 | sequence "THE CAT ", as a sum over all possible alignments of input sequences 24 | that could map to "THE CAT ", taking into account that labels may be duplicated 25 | because they may stretch over several time steps of the input data (represented by 26 | the spectrogram at the bottom of the image). 27 | Computing the sum of all such probabilities explicitly would be prohibitively costly due to the 28 | combinatorics involved, but CTC uses dynamic programming to dramatically 29 | reduce the complexity of the computation. Because CTC is a differentiable function, 30 | it can be used during standard SGD training of deep neural networks. 31 | 32 | In our lab, we focus on scaling up recurrent neural networks, and CTC loss is an 33 | important component. To make our system efficient, we parallelized the CTC 34 | algorithm, as described in [this paper](http://arxiv.org/abs/1512.02595). 35 | This project contains our high performance CPU and CUDA versions of the CTC loss, 36 | along with bindings for [Torch](http://torch.ch/). 37 | The library provides a simple C interface, so that it is easy to 38 | integrate into deep learning frameworks. 39 | 40 | This implementation has improved training scalability beyond the 41 | performance improvement from a faster parallel CTC implementation. For 42 | GPU-focused training pipelines, the ability to keep all data local to 43 | GPU memory allows us to spend interconnect bandwidth on increased data 44 | parallelism. 45 | 46 | ## Performance 47 | 48 | Our CTC implementation is efficient compared with many of the other publicly available implementations. It is 49 | also written to be as numerically stable as possible. The algorithm is numerically sensitive, and we have observed 50 | catastrophic underflow even in double precision with the standard calculation: the division of 51 | two numbers on the order of 1e-324, whose quotient should have been approximately one, instead became infinity 52 | when the denominator underflowed to 0. By performing the calculation in log space instead, it is numerically 53 | stable even in single precision floating point, at the cost of significantly more expensive operations. Instead of 54 | one machine instruction, addition requires the evaluation of multiple transcendental functions. Because of this, 55 | the speed of CTC implementations can only be fairly compared if they are performing the calculation the same 56 | way. 57 | 58 | We compare our performance with [Eesen](https://github.com/srvk/eesen/commit/68f2bc2d46a5513cce3c232a645292632a1b08f9), 59 | a CTC implementation built on 60 | [Theano](https://github.com/mohammadpz/CTC-Connectionist-Temporal-Classification/commit/904e8c72e15334887609d399254cf05a591d570f), 61 | and a Cython CPU-only implementation, [Stanford-CTC](https://github.com/amaas/stanford-ctc/commit/c8859897336a349b6c561d2bf2d179fae90b4d67). 62 | We benchmark the Theano implementation operating on 32-bit floating-point numbers and doing the calculation in log-space, 63 | in order to match the other implementations we compare against. Stanford-CTC was modified to perform the calculation 64 | in log-space, as it did not support it natively. It also does not support minibatches larger than 1, so it would require 65 | an awkward memory layout to use in a real training pipeline; we assume a linear increase in cost with minibatch size.
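To make the cost of the log-space arithmetic described above concrete, here is a minimal sketch of the log-space addition ("log-sum-exp") that replaces a single hardware add; it is illustrative only, not the exact helper this codebase uses:

```cpp
#include <algorithm>
#include <cmath>
#include <limits>

// Adds two probabilities stored in log space: returns log(exp(a) + exp(b)).
// Factoring out the larger argument keeps the exponential in a safe range,
// which is why the computation stays stable even in single precision, at the
// price of an exp and a log1p for every "addition".
float log_add(float a, float b) {
    const float neg_inf = -std::numeric_limits<float>::infinity();
    if (a == neg_inf) return b;  // adding a zero probability
    if (b == neg_inf) return a;
    float hi = std::max(a, b);
    float lo = std::min(a, b);
    return hi + std::log1p(std::exp(lo - hi));
}
```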
66 | 67 | We show results on two problem sizes relevant to our English and Mandarin end-to-end models, respectively, where *T* represents the number of timesteps in the input to CTC, *L* represents the length of the labels for each example, and *A* represents the alphabet size. 68 | 69 | On the GPU, our performance at a minibatch of 64 examples ranges from 7x faster to 155x faster than Eesen, and 46x to 68x faster than the Theano implementation. 70 | 71 | ### GPU Performance 72 | Benchmarked on a single NVIDIA Titan X GPU. 73 | 74 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Eesen | Theano | 75 | |-----------------------------------|-------|---------|---------| 76 | | *N*=1 | 3.1 ms| .5 ms | 67 ms | 77 | | *N*=16 | 3.2 ms| 6 ms | 94 ms | 78 | | *N*=32 | 3.2 ms| 12 ms | 119 ms | 79 | | *N*=64 | 3.3 ms| 24 ms | 153 ms | 80 | | *N*=128 | 3.5 ms| 49 ms | 231 ms | 81 | 82 | 83 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Eesen | Theano | 84 | |-----------------------------------|-------|---------|---------| 85 | | *N*=1 | 7 ms | 40 ms | 120 ms | 86 | | *N*=16 | 9 ms | 619 ms | 385 ms | 87 | | *N*=32 | 11 ms | 1238 ms | 665 ms | 88 | | *N*=64 | 16 ms | 2475 ms | 1100 ms | 89 | | *N*=128 | 23 ms | 4950 ms | 2100 ms | 90 | 91 | ### CPU Performance 92 | 93 | Benchmarked on a dual-socket machine with two Intel E5-2660 v3 94 | processors - warp-ctc used 40 threads to maximally take advantage of the CPU resources. 95 | Eesen doesn't provide a CPU implementation. We noticed that the Theano implementation was not 96 | parallelizing computation across multiple threads. Stanford-CTC provides no mechanism 97 | for parallelization across threads. 98 | 99 | 100 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Stanford-CTC | Theano | 101 | |-----------------------------------|-------|---------|---------| 102 | | *N*=1 | 2.6 ms| 13 ms | 15 ms | 103 | | *N*=16 | 3.4 ms| 208 ms | 180 ms | 104 | | *N*=32 | 3.9 ms| 416 ms | 375 ms | 105 | | *N*=64 | 6.6 ms| 832 ms | 700 ms | 106 | | *N*=128 |12.2 ms| 1684 ms | 1340 ms | 107 | 108 | 109 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Stanford-CTC | Theano | 110 | |-----------------------------------|-------|---------|---------| 111 | | *N*=1 | 21 ms | 31 ms | 850 ms | 112 | | *N*=16 | 37 ms | 496 ms | 10800 ms| 113 | | *N*=32 | 54 ms | 992 ms | 22000 ms| 114 | | *N*=64 | 101 ms| 1984 ms | 42000 ms| 115 | | *N*=128 | 184 ms| 3968 ms | 86000 ms| 116 | 117 | 118 | 119 | 120 | 121 | ## Interface 122 | 123 | The interface is in [`include/ctc.h`](include/ctc.h). 124 | It supports CPU or GPU execution, and you can specify OpenMP parallelism 125 | if running on the CPU, or the CUDA stream if running on the GPU. We 126 | took care to ensure that the library does not perform memory 127 | allocation internally, in order to avoid synchronizations and 128 | overheads caused by memory allocation. 129 | 130 | ## Compilation 131 | 132 | warp-ctc has been tested on Ubuntu 14.04 and OSX 10.10. Windows is not supported 133 | at this time. 
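Before walking through the build, here is a rough sketch of how the C interface described in the Interface section above is typically driven: query the workspace size, allocate everything yourself, then compute costs and gradients in a single call. The names below (`ctcOptions`, `CTC_CPU`, `get_workspace_size`, `compute_ctc_loss`) reflect our reading of `include/ctc.h`; treat the exact signatures as assumptions and defer to the header.

```cpp
#include <vector>
#include "ctc.h"  // public warp-ctc header installed by this project

// Sketch of the usual call sequence on the CPU path. Every buffer is owned by
// the caller, matching the no-internal-allocation design described above.
void ctc_loss_example(const float* activations, float* gradients,
                      const int* flat_labels, const int* label_lengths,
                      const int* input_lengths, int alphabet_size,
                      int minibatch, float* costs) {
    ctcOptions options{};
    options.loc = CTC_CPU;    // or CTC_GPU, with options.stream set instead
    options.num_threads = 4;  // OpenMP threads used by the CPU implementation

    size_t workspace_bytes = 0;
    get_workspace_size(label_lengths, input_lengths, alphabet_size,
                       minibatch, options, &workspace_bytes);

    std::vector<char> workspace(workspace_bytes);  // caller-owned scratch space
    compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
                     input_lengths, alphabet_size, minibatch, costs,
                     workspace.data(), options);
}
```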
134 | 135 | First get the code: 136 | 137 | ``` 138 | git clone https://github.com/baidu-research/warp-ctc.git 139 | cd warp-ctc 140 | ``` 141 | 142 | create a build directory: 143 | 144 | ``` 145 | mkdir build 146 | cd build 147 | ``` 148 | 149 | if you have a non standard CUDA install `export CUDA_BIN_PATH=/path_to_cuda` so that CMake detects CUDA and 150 | to ensure Torch is detected, make sure `th` is in `$PATH` 151 | 152 | run cmake and build: 153 | 154 | ``` 155 | cmake ../ 156 | make 157 | ``` 158 | 159 | The C library and torch shared libraries should now be built along with test 160 | executables. If CUDA was detected, then `test_gpu` will be built; `test_cpu` 161 | will always be built. 162 | 163 | ## Tests 164 | 165 | To run the tests, make sure the CUDA libraries are in `LD_LIBRARY_PATH` (`DYLD_LIBRARY_PATH` for OSX). 166 | 167 | The Torch tests must be run from the `torch_binding/tests/` directory. 168 | 169 | ## Torch Installation 170 | 171 | ```luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec``` 172 | 173 | You can also install without cloning the repository using 174 | 175 | ```luarocks install http://raw.githubusercontent.com/baidu-research/warp-ctc/master/torch_binding/rocks/warp-ctc-scm-1.rockspec``` 176 | 177 | There is a Torch CTC [tutorial](torch_binding/TUTORIAL.md). 178 | 179 | ## Contributing 180 | 181 | We welcome improvements from the community, please feel free to submit pull 182 | requests. 183 | 184 | ## Known Issues / Limitations 185 | 186 | The CUDA implementation requires a device of at least compute capability 3.0. 187 | 188 | The CUDA implementation supports a maximum label length of 639 (timesteps are 189 | unlimited). 190 | -------------------------------------------------------------------------------- /README.zh_cn.md: -------------------------------------------------------------------------------- 1 | ![Baidu Logo](/doc/baidu-research-logo-small.png) 2 | 3 | [In English](README.md) 4 | 5 | # warp-ctc 6 | 7 | Warp-CTC是一个可以应用在CPU和GPU上高效并行的CTC代码库 (library) 8 | 介绍 9 | CTC[Connectionist Temporal Classification](http://www.cs.toronto.edu/~graves/icml_2006.pdf)作为一个损失函数,用于在序列数据上进行监督式学习,不需要对齐输入数据及标签。比如,CTC可以被用来训练端对端的语音识别系统,这正是我们在百度硅谷试验室所使用的方法。 10 | [端到端](http://www.jmlr.org/proceedings/papers/v32/graves14.pdf) 11 | [系统](http://arxiv.org/pdf/1408.2873v2.pdf) 12 | [语音识别](http://arxiv.org/abs/1512.02595) 13 | 14 | ![DSCTC](/doc/deep-speech-ctc-small.png) 15 | 16 | 上图展示了CTC计算输出序列(“THE CAT”)概率的过程,是对可能映射成“THE CAT”的所有可能输入序列对齐的和。这一过程考虑了标签会被复制的可能性,因为标签有可能在输入数据的几个时间步(time steps)时被拉伸 (请见上图底部的声谱图)。由于涉及到了组合学,计算所有可能概率的和的成本会很高,但是CTC运用了动态规划以大幅降低计算的复杂性。作为一个可微函数,CTC可以被用于深度神经网络的标准SGD训练。 17 | 我们实验室专注于递归神经网络(RNN)的可扩展性 (scalibility), 而CTC损失函数是其中很重要的一部分。为了让我们的系统更有效率,我们并行处理了CTC算法,正如这篇文章中所描述的 。这个项目包含了我们的高性能CPU以及CUDA版本的CTC损失函数, 以及绑定的Torch. 
该代码库提供了简单的C接口,易于与深度学习框架整合。 18 | 19 | 这种执行方式提高了训练的的可扩展性,超过了并行CTC的实现方式。对于以GPU为核心的训练, 我们可用所有的的网络带宽来增加数据的可并行性。 20 | 性能 21 | 相比其他的开源工具,Warp-CTC的实现方式相对高效,且代码的数值稳定性也较好。因为CTC本身对数值较为敏感,因此即使使用双精度标准计算,也会出现下溢 (underflow)的情况。 具体来说,两个数值趋近于无穷小且相近的数字相除的结果应该大约为1,却因为分母接近为0而变成无穷。 然而,如果直接取对数执行运算,CTC会在数值上较为稳定,虽然会在单精度浮点中以高成本运算为代价。 22 | 我们将Warp-CTC和[Eesen](https://github.com/srvk/eesen/commit/68f2bc2d46a5513cce3c232a645292632a1b08f9) (建立在[Theano](https://github.com/mohammadpz/CTC-Connectionist-Temporal-Classification/commit/904e8c72e15334887609d399254cf05a591d570f)上的CTC)以及仅运行[Stanford-CTC](https://github.com/amaas/stanford-ctc/commit/c8859897336a349b6c561d2bf2d179fae90b4d67)的Cython CPU进行了比较。为了进行比较,我们对在32位浮点数上运行的Theano进行了基准测试,并且取对数计算。 而Stanford-CTC由于本身不支持对数运算,因此需要被修改。而且它也不支持大于1的迷你批处理 (minibatches), 所以需要在真正的训练流水线上布局非常规内存(我们假设成本与迷你批处理的规模是成正线性关系)。 23 | 我们在Deep Speech 2中分别展示了英文及中文端对端模型的结果, 其中T代表输入CTC的时间步数量,L代表每个例子的标签长度,A代表字母数量。 24 | 在GPU上,Warp-CTC对64个例子迷你批处理的表现比Eesen快7倍到155倍,比Theano快46倍到68倍 25 | ### GPU性能 26 | 单核NVIDIA Titan X GPU基准测试 27 | 28 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Eesen | Theano | 29 | |-----------------------------------|-------|---------|---------| 30 | | *N*=1 | 3.1 ms| .5 ms | 67 ms | 31 | | *N*=16 | 3.2 ms| 6 ms | 94 ms | 32 | | *N*=32 | 3.2 ms| 12 ms | 119 ms | 33 | | *N*=64 | 3.3 ms| 24 ms | 153 ms | 34 | | *N*=128 | 3.5 ms| 49 ms | 231 ms | 35 | 36 | 37 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Eesen | Theano | 38 | |-----------------------------------|-------|---------|---------| 39 | | *N*=1 | 7 ms | 40 ms | 120 ms | 40 | | *N*=16 | 9 ms | 619 ms | 385 ms | 41 | | *N*=32 | 11 ms | 1238 ms | 665 ms | 42 | | *N*=64 | 16 ms | 2475 ms | 1100 ms | 43 | | *N*=128 | 23 ms | 4950 ms | 2100 ms | 44 | 45 | ### CPU性能 46 | 在一台有两个Intel E5-2660 v3处理器的双槽机上进行基准测试。Warp-CTC用了40个线程从而最大化了对CPU资源的利用。Eesen没有提供CPU实现方式。我们注意到Theano没有在多线程上进行并行计算。同样,Stanford-CTC没有提供多线程并行计算的机制。 47 | 48 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Stanford-CTC | Theano | 49 | |-----------------------------------|-------|---------|---------| 50 | | *N*=1 | 2.6 ms| 13 ms | 15 ms | 51 | | *N*=16 | 3.4 ms| 208 ms | 180 ms | 52 | | *N*=32 | 3.9 ms| 416 ms | 375 ms | 53 | | *N*=64 | 6.6 ms| 832 ms | 700 ms | 54 | | *N*=128 |12.2 ms| 1684 ms | 1340 ms | 55 | 56 | 57 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Stanford-CTC | Theano | 58 | |-----------------------------------|-------|---------|---------| 59 | | *N*=1 | 21 ms | 31 ms | 850 ms | 60 | | *N*=16 | 37 ms | 496 ms | 10800 ms| 61 | | *N*=32 | 54 ms | 992 ms | 22000 ms| 62 | | *N*=64 | 101 ms| 1984 ms | 42000 ms| 63 | | *N*=128 | 184 ms| 3968 ms | 86000 ms| 64 | 65 | ## 接口 66 | 接口在[`include/ctc.h`](include/ctc.h)中,它支持在CPU或者GPU上执行。 如果是在CPU上运行,可以指定OpenMP并行计算; 如果是在GPU上运行,请用CUDA stream。 为避免内存分配而导致的竞争及间接成本,我们会确保代码库不会在内部进行内存分配。 67 | ## 编译器 68 | Warp-CTC已经在Ubuntu 14.04以及OSX 10.10进行了测试,现不支持Windows. 
69 | 首先，请获取代码 70 | 71 | ``` 72 | git clone https://github.com/baidu-research/warp-ctc.git 73 | cd warp-ctc 74 | ``` 75 | 76 | 创建构建目录 77 | 78 | ``` 79 | mkdir build 80 | cd build 81 | ``` 82 | 83 | 假如CUDA安装在非标准路径，请设置 `export CUDA_BIN_PATH=/path_to_cuda` 以便CMake检测到CUDA；为确保Torch被检测到，请保证 `th` 在 `$PATH` 中。 84 | 运行cmake并编译 85 | 86 | ``` 87 | cmake ../ 88 | make 89 | ``` 90 | 91 | 现在，C库和Torch共享库应当已与测试可执行文件一同构建完成。假如检测到CUDA，则会构建 `test_gpu`；`test_cpu` 总是会被构建。 92 | ## 测试 93 | 为了运行测试，请确保CUDA库位于 `LD_LIBRARY_PATH`（OSX下为 `DYLD_LIBRARY_PATH`）中。 94 | Torch测试必须在 `torch_binding/tests/` 目录中运行。 95 | ## Torch安装 96 | 97 | ```luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec``` 98 | 99 | 即使不克隆代码仓库 (repository)，你也可以安装 100 | 101 | ```luarocks install http://raw.githubusercontent.com/baidu-research/warp-ctc/master/torch_binding/rocks/warp-ctc-scm-1.rockspec``` 102 | 103 | [请见Torch CTC教程](torch_binding/TUTORIAL.zh_cn.md)。 104 | 105 | ## 限制 106 | CUDA实现需要至少3.0的计算能力，所支持的标签长度最大值为639（时间步数不受限制）。 107 | 108 | 最后我们欢迎大家提出宝贵的意见及建议以改进我们的开源服务。 109 | 110 | 在此鸣谢新智元编译 [http://chuansong.me/account/AI_era](http://chuansong.me/account/AI_era) 允许我们参考部分译文：[http://chuansong.me/n/2168385](http://chuansong.me/n/2168385) 111 | -------------------------------------------------------------------------------- /cmake/hip.cmake: -------------------------------------------------------------------------------- 1 | if(NOT WITH_ROCM) 2 | return() 3 | endif() 4 | 5 | if(NOT DEFINED ENV{ROCM_PATH}) 6 | set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") 7 | set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") 8 | set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") 9 | else() 10 | set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") 11 | set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") 12 | set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") 13 | endif() 14 | set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) 15 | 16 | find_package(HIP REQUIRED) 17 | include_directories(${ROCM_PATH}/include) 18 | message(STATUS "HIP version: ${HIP_VERSION}") 19 | message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") 20 | MESSAGE(STATUS "HIP_ROOT_DIR: ${HIP_ROOT_DIR}") 21 | 22 | macro(find_package_and_include PACKAGE_NAME) 23 | find_package("${PACKAGE_NAME}" REQUIRED) 24 | include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") 25 | message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}") 26 | endmacro() 27 | 28 | find_package_and_include(hiprand) 29 | find_package_and_include(rocrand) 30 | find_package_and_include(rocthrust) 31 | 32 | # set CXX flags for HIP 33 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") 34 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") 35 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") 36 | set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) 37 | 38 | # define HIP_CXX_FLAGS 39 | list(APPEND HIP_CXX_FLAGS -fPIC) 40 | list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1) 41 | # Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer 42 | list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) 43 | list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined) 44 | list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override) 45 | list(APPEND HIP_CXX_FLAGS -Wno-exceptions) 46 | list(APPEND HIP_CXX_FLAGS
-Wno-shift-count-negative) 47 | list(APPEND HIP_CXX_FLAGS -Wno-shift-count-overflow) 48 | list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument) 49 | list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) 50 | list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion) 51 | list(APPEND HIP_CXX_FLAGS -Wno-pass-failed) 52 | list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) 53 | list(APPEND HIP_CXX_FLAGS -std=c++14) 54 | 55 | if(CMAKE_BUILD_TYPE MATCHES Debug) 56 | list(APPEND HIP_CXX_FLAGS -g2) 57 | list(APPEND HIP_CXX_FLAGS -O0) 58 | list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) 59 | endif(CMAKE_BUILD_TYPE MATCHES Debug) 60 | 61 | set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) 62 | set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) 63 | # Ask hcc to generate device code during compilation so we can use 64 | # host linker to link. 65 | list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) 66 | list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906) 67 | list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) 68 | list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) 69 | 70 | 71 | if(HIP_COMPILER STREQUAL clang) 72 | set(hip_library_name amdhip64) 73 | else() 74 | set(hip_library_name hip_hcc) 75 | endif() 76 | message(STATUS "HIP library name: ${hip_library_name}") 77 | 78 | # set HIP link libs 79 | find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib) 80 | message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}") 81 | -------------------------------------------------------------------------------- /doc/baidu-research-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu-research/warp-ctc/94b2fa178347cf02757bdc7329dc2f1b46f5d094/doc/baidu-research-logo-small.png -------------------------------------------------------------------------------- /doc/deep-speech-ctc-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu-research/warp-ctc/94b2fa178347cf02757bdc7329dc2f1b46f5d094/doc/deep-speech-ctc-small.png -------------------------------------------------------------------------------- /include/contrib/moderngpu/LICENSE: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctaloadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasearch.cuh" 38 | #include "loadstore.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // DeviceLoadBalancingSearch 44 | // Upper Bound search from A (needles) into B (haystack). The A values are 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at 46 | // bBegin in shared memory. 
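// Editor's note (illustration added for this write-up, not part of the
// upstream moderngpu source): a concrete example of what the load-balancing
// search produces. Suppose three source segments generate 2, 0, and 3
// work-items, so the exclusive scan held in b_shared is {0, 2, 2}. Walking the
// natural numbers 0..4 (the needles) against that scan assigns each work-item
// the index of the segment that produced it:
//
//   work-item: 0  1  2  3  4
//   a_shared : 0  0  2  2  2
//
// Items 0-1 belong to segment 0, segment 1 contributed nothing, and items 2-4
// belong to segment 2 -- i.e. upper_bound(work-item, scan) - 1, which the
// serial routine below evaluates VT items at a time per thread.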
47 | 48 | template 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin, 50 | int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) { 51 | 52 | int bKey = b_shared[bBegin]; 53 | 54 | #pragma unroll 55 | for(int i = 0; i < VT; ++i) { 56 | bool p; 57 | if(RangeCheck) 58 | p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey)); 59 | else 60 | p = aBegin < bKey; 61 | 62 | if(p) 63 | // Advance A (the needle). 64 | a_shared[aBegin++] = bFirst + bBegin; 65 | else 66 | // Advance B (the haystack). 67 | bKey = b_shared[++bBegin]; 68 | } 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | // CTALoadBalance 73 | // Computes upper_bound(counting_iterator(first), b_global) - 1. 74 | 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory. 76 | // This returns the loaded B elements at the beginning or end of shared memory 77 | // depending on the aFirst argument. 78 | 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory. 80 | template 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global, 82 | int sourceCount, int block, int tid, const int* mp_global, 83 | int* indices_shared, bool loadPrecedingB) { 84 | 85 | int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT, 86 | mp_global); 87 | 88 | int a0 = range.x; 89 | int a1 = range.y; 90 | int b0 = range.z; 91 | int b1 = range.w; 92 | if(!b0) loadPrecedingB = false; 93 | 94 | // Load one trailing term from B. If we're already at the end, fill the 95 | // end of the buffer with destCount. 96 | int aCount = a1 - a0; 97 | int bCount = b1 - b0; 98 | int extended = b1 < sourceCount; 99 | int loadCount = bCount + extended; 100 | int fillCount = NT * VT + 1 - loadCount - aCount; 101 | 102 | int* a_shared = indices_shared; 103 | int* b_shared = indices_shared + aCount + (int)loadPrecedingB; 104 | 105 | // Load the B values. 106 | // DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB, 107 | // b_global + b0 - (int)loadPrecedingB, tid, 108 | // b_shared - (int)loadPrecedingB); 109 | 110 | for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT) 111 | b_shared[i] = b_global[b0 + i]; 112 | 113 | // Fill the end of the array with destCount. 114 | for(int i = tid + extended; i < fillCount; i += NT) 115 | b_shared[bCount + i] = destCount; 116 | __syncthreads(); 117 | 118 | // Run a merge path to find the start of the serial merge for each thread. 119 | int diag = VT * tid; 120 | int mp = MergePath(mgpu::counting_iterator(a0), 121 | aCount, b_shared, bCount, diag, mgpu::less()); 122 | 123 | int a0tid = a0 + mp; 124 | int b0tid = diag - mp; 125 | 126 | // Subtract 1 from b0 because we want to return upper_bound - 1. 127 | DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1, 128 | b0tid, bCount, a_shared - a0); 129 | __syncthreads(); 130 | 131 | b0 -= (int)loadPrecedingB; 132 | return make_int4(a0, a1, b0, b1); 133 | } 134 | 135 | 136 | } // namespace mgpu 137 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctascan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuenums.h" 38 | #include "deviceutil.cuh" 39 | #include "intrinsics.cuh" 40 | 41 | namespace mgpu { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // CTAReduce 45 | 46 | template > 47 | struct CTAReduce { 48 | typedef typename Op::first_argument_type T; 49 | enum { Size = NT, Capacity = NT }; 50 | struct Storage { T shared[Capacity]; }; 51 | 52 | MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) { 53 | storage.shared[tid] = x; 54 | __syncthreads(); 55 | 56 | // Fold the data in half with each pass. 57 | #pragma unroll 58 | for(int destCount = NT / 2; destCount >= 1; destCount /= 2) { 59 | if(tid < destCount) { 60 | // Read from the right half and store to the left half. 61 | x = op(x, storage.shared[destCount + tid]); 62 | storage.shared[tid] = x; 63 | } 64 | __syncthreads(); 65 | } 66 | T total = storage.shared[0]; 67 | __syncthreads(); 68 | return total; 69 | } 70 | }; 71 | 72 | #if __CUDA_ARCH__ >= 300 73 | 74 | template 75 | struct CTAReduce > { 76 | typedef mgpu::plus Op; 77 | typedef int T; 78 | enum { Size = NT, Capacity = WARP_SIZE }; 79 | struct Storage { int shared[Capacity]; }; 80 | 81 | MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage, 82 | Op op = Op()) { 83 | 84 | const int NumSections = WARP_SIZE; 85 | const int SecSize = NT / NumSections; 86 | int lane = (SecSize - 1) & tid; 87 | int sec = tid / SecSize; 88 | 89 | // In the first phase, threads cooperatively find the reduction within 90 | // their segment. 
The segments are SecSize threads (NT / WARP_SIZE) 91 | // wide. 92 | #pragma unroll 93 | for(int offset = 1; offset < SecSize; offset *= 2) 94 | x = shfl_add(x, offset, SecSize); 95 | 96 | // The last thread in each segment stores the local reduction to shared 97 | // memory. 98 | if(SecSize - 1 == lane) storage.shared[sec] = x; 99 | __syncthreads(); 100 | 101 | // Reduce the totals of each input segment. The spine is WARP_SIZE 102 | // threads wide. 103 | if(tid < NumSections) { 104 | x = storage.shared[tid]; 105 | #pragma unroll 106 | for(int offset = 1; offset < NumSections; offset *= 2) 107 | x = shfl_add(x, offset, NumSections); 108 | storage.shared[tid] = x; 109 | } 110 | __syncthreads(); 111 | 112 | int reduction = storage.shared[NumSections - 1]; 113 | __syncthreads(); 114 | 115 | return reduction; 116 | } 117 | }; 118 | 119 | template 120 | struct CTAReduce > { 121 | typedef mgpu::maximum Op; 122 | enum { Size = NT, Capacity = WARP_SIZE }; 123 | struct Storage { int shared[Capacity]; }; 124 | 125 | MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage, 126 | Op op = Op()) { 127 | 128 | const int NumSections = WARP_SIZE; 129 | const int SecSize = NT / NumSections; 130 | int lane = (SecSize - 1) & tid; 131 | int sec = tid / SecSize; 132 | 133 | #pragma unroll 134 | for(int offset = 1; offset < SecSize; offset *= 2) 135 | x = shfl_max(x, offset, SecSize); 136 | 137 | if(SecSize - 1 == lane) storage.shared[sec] = x; 138 | __syncthreads(); 139 | 140 | if(tid < NumSections) { 141 | x = storage.shared[tid]; 142 | #pragma unroll 143 | for(int offset = 1; offset < NumSections; offset *= 2) 144 | x = shfl_max(x, offset, NumSections); 145 | storage.shared[tid] = x; 146 | } 147 | __syncthreads(); 148 | 149 | int reduction = storage.shared[NumSections - 1]; 150 | __syncthreads(); 151 | 152 | return reduction; 153 | } 154 | }; 155 | 156 | #endif // __CUDA_ARCH__ >= 300 157 | 158 | //////////////////////////////////////////////////////////////////////////////// 159 | // CTAScan 160 | 161 | template > 162 | struct CTAScan { 163 | typedef typename Op::result_type T; 164 | enum { Size = NT, Capacity = 2 * NT + 1 }; 165 | struct Storage { T shared[Capacity]; }; 166 | 167 | MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total, 168 | MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) { 169 | 170 | storage.shared[tid] = x; 171 | int first = 0; 172 | __syncthreads(); 173 | 174 | #pragma unroll 175 | for(int offset = 1; offset < NT; offset += offset) { 176 | if(tid >= offset) 177 | x = op(storage.shared[first + tid - offset], x); 178 | first = NT - first; 179 | storage.shared[first + tid] = x; 180 | __syncthreads(); 181 | } 182 | *total = storage.shared[first + NT - 1]; 183 | 184 | if(MgpuScanTypeExc == type) 185 | x = tid ? storage.shared[first + tid - 1] : identity; 186 | 187 | __syncthreads(); 188 | return x; 189 | } 190 | MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) { 191 | T total; 192 | return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op()); 193 | } 194 | }; 195 | 196 | //////////////////////////////////////////////////////////////////////////////// 197 | // Special partial specialization for CTAScan on Kepler. 198 | // This uses the shfl intrinsic to reduce scan latency. 
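// Editor's note (illustration added for this write-up, not part of the
// upstream moderngpu source): the shfl-based path below swaps the
// shared-memory ping-pong scan above for register-to-register exchanges.
// For a 4-lane inclusive plus-scan of {3, 1, 4, 1}:
//
//   offset 1: each lane adds the value one lane below  -> {3, 4, 5, 5}
//   offset 2: each lane adds the value two lanes below -> {3, 4, 8, 9}
//
// which reaches the inclusive scan {3, 4, 8, 9} in log2(width) steps. Lanes
// whose source would fall off the left edge keep their value unchanged; that
// predication is what the shfl_add helper wraps around the shuffle-up
// instruction.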
199 | 200 | #if __CUDA_ARCH__ >= 300 201 | 202 | template 203 | struct CTAScan > { 204 | typedef mgpu::plus Op; 205 | enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments }; 206 | enum { Capacity = NumSegments + 1 }; 207 | struct Storage { int shared[Capacity + 1]; }; 208 | 209 | MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total, 210 | MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) { 211 | 212 | // Define WARP_SIZE segments that are NT / WARP_SIZE large. 213 | // Each warp makes log(SegSize) shfl_add calls. 214 | // The spine makes log(WARP_SIZE) shfl_add calls. 215 | int lane = (SegSize - 1) & tid; 216 | int segment = tid / SegSize; 217 | 218 | // Scan each segment using shfl_add. 219 | int scan = x; 220 | #pragma unroll 221 | for(int offset = 1; offset < SegSize; offset *= 2) 222 | scan = shfl_add(scan, offset, SegSize); 223 | 224 | // Store the reduction (last element) of each segment into storage. 225 | if(SegSize - 1 == lane) storage.shared[segment] = scan; 226 | __syncthreads(); 227 | 228 | // Warp 0 does a full shfl warp scan on the partials. The total is 229 | // stored to shared[NumSegments]. (NumSegments = WARP_SIZE) 230 | if(tid < NumSegments) { 231 | int y = storage.shared[tid]; 232 | int scan = y; 233 | #pragma unroll 234 | for(int offset = 1; offset < NumSegments; offset *= 2) 235 | scan = shfl_add(scan, offset, NumSegments); 236 | storage.shared[tid] = scan - y; 237 | if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan; 238 | } 239 | __syncthreads(); 240 | 241 | // Add the scanned partials back in and convert to exclusive scan. 242 | scan += storage.shared[segment]; 243 | if(MgpuScanTypeExc == type) { 244 | scan -= x; 245 | if(identity && !tid) scan = identity; 246 | } 247 | *total = storage.shared[NumSegments]; 248 | __syncthreads(); 249 | 250 | return scan; 251 | } 252 | MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) { 253 | int total; 254 | return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0); 255 | } 256 | }; 257 | 258 | #endif // __CUDA_ARCH__ >= 300 259 | 260 | //////////////////////////////////////////////////////////////////////////////// 261 | // CTABinaryScan 262 | 263 | template 264 | MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) { 265 | const int NumWarps = NT / WARP_SIZE; 266 | int warp = tid / WARP_SIZE; 267 | int lane = (WARP_SIZE - 1); 268 | 269 | // Store the bit totals for each warp. 270 | uint bits = __ballot(x); 271 | shared[warp] = popc(bits); 272 | __syncthreads(); 273 | 274 | #if __CUDA_ARCH__ >= 300 275 | if(tid < NumWarps) { 276 | int x = shared[tid]; 277 | int scan = x; 278 | #pragma unroll 279 | for(int offset = 1; offset < NumWarps; offset *= 2) 280 | scan = shfl_add(scan, offset, NumWarps); 281 | shared[tid] = scan - x; 282 | } 283 | __syncthreads(); 284 | 285 | #else 286 | // Thread 0 scans warp totals. 287 | if(!tid) { 288 | int scan = 0; 289 | #pragma unroll 290 | for(int i = 0; i < NumWarps; ++i) { 291 | int y = shared[i]; 292 | shared[i] = scan; 293 | scan += y; 294 | } 295 | shared[NumWarps] = scan; 296 | } 297 | __syncthreads(); 298 | 299 | #endif // __CUDA_ARCH__ >= 300 300 | 301 | // Add the warp scan back into the partials. 
302 | int scan = shared[warp] + __popc(bfe(bits, 0, lane)); 303 | *total = shared[NumWarps]; 304 | __syncthreads(); 305 | return scan; 306 | } 307 | 308 | } // namespace mgpu 309 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | #include "../mgpudevice.cuh" 39 | 40 | namespace mgpu { 41 | 42 | template 44 | MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key, 45 | int shift, Comp comp) { 46 | 47 | IntT scale = (1<< shift) - 1; 48 | int mid = (int)((begin + scale * end)>> shift); 49 | 50 | T key2 = data[mid]; 51 | bool pred = (MgpuBoundsUpper == Bounds) ? 
52 | !comp(key, key2) : 53 | comp(key2, key); 54 | if(pred) begin = mid + 1; 55 | else end = mid; 56 | } 57 | 58 | template 60 | MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels, 61 | Comp comp) { 62 | 63 | int begin = 0; 64 | int end = count; 65 | 66 | if(levels >= 4 && begin < end) 67 | BinarySearchIt(data, begin, end, key, 9, comp); 68 | if(levels >= 3 && begin < end) 69 | BinarySearchIt(data, begin, end, key, 7, comp); 70 | if(levels >= 2 && begin < end) 71 | BinarySearchIt(data, begin, end, key, 5, comp); 72 | if(levels >= 1 && begin < end) 73 | BinarySearchIt(data, begin, end, key, 4, comp); 74 | 75 | while(begin < end) 76 | BinarySearchIt(data, begin, end, key, 1, comp); 77 | return begin; 78 | } 79 | 80 | template 81 | MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) { 82 | int begin = 0; 83 | int end = count; 84 | while(begin < end) 85 | BinarySearchIt(data, begin, end, key, 1, comp); 86 | return begin; 87 | } 88 | 89 | //////////////////////////////////////////////////////////////////////////////// 90 | // MergePath search 91 | 92 | template 93 | MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag, 94 | Comp comp) { 95 | 96 | typedef typename std::iterator_traits::value_type T; 97 | int begin = max(0, diag - bCount); 98 | int end = min(diag, aCount); 99 | 100 | while(begin < end) { 101 | int mid = (begin + end)>> 1; 102 | T aKey = a[mid]; 103 | T bKey = b[diag - 1 - mid]; 104 | bool pred = (MgpuBoundsUpper == Bounds) ? 105 | comp(aKey, bKey) : 106 | !comp(bKey, aKey); 107 | if(pred) begin = mid + 1; 108 | else end = mid; 109 | } 110 | return begin; 111 | } 112 | 113 | 114 | //////////////////////////////////////////////////////////////////////////////// 115 | // SegmentedMergePath search 116 | 117 | template 118 | MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount, 119 | int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) { 120 | 121 | // leftEnd and rightStart are defined from the origin, and diag is defined 122 | // from aOffset. 123 | // We only need to run a Merge Path search if the diagonal intersects the 124 | // segment that strides the left and right halves (i.e. is between leftEnd 125 | // and rightStart). 126 | if(aOffset + diag <= leftEnd) return diag; 127 | if(aOffset + diag >= rightStart) return aCount; 128 | 129 | bCount = min(bCount, rightStart - bOffset); 130 | int begin = max(max(leftEnd - aOffset, 0), diag - bCount); 131 | int end = min(diag, aCount); 132 | 133 | while(begin < end) { 134 | int mid = (begin + end)>> 1; 135 | int ai = aOffset + mid; 136 | int bi = bOffset + diag - 1 - mid; 137 | 138 | bool pred = !comp(keys[bi], keys[ai]); 139 | if(pred) begin = mid + 1; 140 | else end = mid; 141 | } 142 | return begin; 143 | } 144 | 145 | //////////////////////////////////////////////////////////////////////////////// 146 | // BalancedPath search 147 | 148 | template 150 | MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b, 151 | int bCount, int diag, int levels, Comp comp) { 152 | 153 | typedef typename std::iterator_traits::value_type T; 154 | 155 | int p = MergePath(a, aCount, b, bCount, diag, comp); 156 | int aIndex = p; 157 | int bIndex = diag - p; 158 | 159 | bool star = false; 160 | if(bIndex < bCount) { 161 | if(Duplicates) { 162 | T x = b[bIndex]; 163 | 164 | // Search for the beginning of the duplicate run in both A and B. 
165 | // Because 166 | int aStart = BiasedBinarySearch(a, aIndex, x, 167 | levels, comp); 168 | int bStart = BiasedBinarySearch(b, bIndex, x, 169 | levels, comp); 170 | 171 | // The distance between the merge path and the lower_bound is the 172 | // 'run'. We add up the a- and b- runs and evenly distribute them to 173 | // get a stairstep path. 174 | int aRun = aIndex - aStart; 175 | int bRun = bIndex - bStart; 176 | int xCount = aRun + bRun; 177 | 178 | // Attempt to advance b and regress a. 179 | int bAdvance = max(xCount>> 1, bRun); 180 | int bEnd = min(bCount, bStart + bAdvance + 1); 181 | int bRunEnd = BinarySearch(b + bIndex, 182 | bEnd - bIndex, x, comp) + bIndex; 183 | bRun = bRunEnd - bStart; 184 | 185 | bAdvance = min(bAdvance, bRun); 186 | int aAdvance = xCount - bAdvance; 187 | 188 | bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun); 189 | aIndex = aStart + aAdvance; 190 | 191 | if(roundUp) star = true; 192 | } else { 193 | if(aIndex && aCount) { 194 | T aKey = a[aIndex - 1]; 195 | T bKey = b[bIndex]; 196 | 197 | // If the last consumed element in A (aIndex - 1) is the same as 198 | // the next element in B (bIndex), we're sitting at a starred 199 | // partition. 200 | if(!comp(aKey, bKey)) star = true; 201 | } 202 | } 203 | } 204 | return make_int2(aIndex, star); 205 | } 206 | 207 | } // namespace mgpu 208 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasegreduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasegscan.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Segmented reduce utility functions. 44 | 45 | // Extract the upper-bound indices from the coded ranges. Decrement to include 46 | // the first addressed row/segment. 47 | 48 | struct SegReduceRange { 49 | int begin; 50 | int end; 51 | int total; 52 | bool flushLast; 53 | }; 54 | 55 | MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) { 56 | SegReduceRange range; 57 | range.begin = 0x7fffffff & limit0; 58 | range.end = 0x7fffffff & limit1; 59 | range.total = range.end - range.begin; 60 | range.flushLast = 0 == (0x80000000 & limit1); 61 | range.end += !range.flushLast; 62 | return range; 63 | } 64 | 65 | // Reconstitute row/segment indices from a starting row index and packed end 66 | // flags. Used for pre-processed versions of interval reduce and interval Spmv. 67 | template 68 | MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags, 69 | int rows[VT + 1]) { 70 | 71 | rows[0] = first; 72 | #pragma unroll 73 | for(int i = 0; i < VT; ++i) { 74 | if((1<< i) & endFlags) ++first; 75 | rows[i + 1] = first; 76 | } 77 | } 78 | 79 | //////////////////////////////////////////////////////////////////////////////// 80 | // After loading CSR terms into shared memory, each thread binary searches 81 | // (upper-bound) to find its starting point. Each thread then walks forward, 82 | // emitting the csr0-relative row indices to register. 83 | 84 | template 85 | MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared, 86 | int numRows, int end, int rows[VT + 1], int rowStarts[VT]) { 87 | 88 | // Each thread binary searches for its starting row. 89 | int row = BinarySearch(csr_shared, numRows, tidOffset, 90 | mgpu::less()) - 1; 91 | 92 | // Each thread starts at row and scans forward, emitting row IDs into 93 | // register. Store the CTA-local row index (starts at 0) to rows and the 94 | // start of the row (globally) to rowStarts. 95 | int curOffset = csr_shared[row]; 96 | int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 97 | 98 | rows[0] = row; 99 | rowStarts[0] = curOffset; 100 | int endFlags = 0; 101 | 102 | #pragma unroll 103 | for(int i = 1; i <= VT; ++i) { 104 | // Advance the row cursor when the iterator hits the next row offset. 105 | if(tidOffset + i == nextOffset) { 106 | // Set an end flag when the cursor advances to the next row. 107 | endFlags |= 1<< (i - 1); 108 | 109 | // Advance the cursor and load the next row offset. 110 | ++row; 111 | curOffset = nextOffset; 112 | nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 113 | } 114 | rows[i] = row; 115 | if(i < VT) rowStarts[i] = curOffset; 116 | } 117 | __syncthreads(); 118 | 119 | return endFlags; 120 | } 121 | 122 | //////////////////////////////////////////////////////////////////////////////// 123 | // DeviceSegReducePrepare 124 | // Expand non-empty interval of CSR elements into row indices. Compute end-flags 125 | // by comparing adjacent row IDs. 
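// Illustrative example (assuming VT == 4): with csr_shared = {0, 2, 3}
// (numRows == 3), end = 6 and a thread whose tidOffset is 0,
// DeviceExpandCsrRows above produces
//   rows      = {0, 0, 1, 2, 2}
//   rowStarts = {0, 0, 2, 3}
//   endFlags  = 0b110   (elements 1 and 2 are the last of their rows).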
126 | 127 | // DeviceSegReducePrepare may be called either by a pre-processing kernel or by 128 | // the kernel that actually evaluates the segmented reduction if no preprocesing 129 | // is desired. 130 | struct SegReduceTerms { 131 | int endFlags; 132 | int tidDelta; 133 | }; 134 | 135 | template 136 | MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows, 137 | int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) { 138 | 139 | // Pass a sentinel (end) to point to the next segment start. If we flush, 140 | // this is the end of this tile. Otherwise it is INT_MAX 141 | int endFlags = DeviceExpandCsrRows(gid + VT * tid, csr_shared, 142 | numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts); 143 | 144 | // Find the distance to to scan to compute carry-in for each thread. Use the 145 | // existance of an end flag anywhere in the thread to determine if carry-out 146 | // values from the left should propagate through to the right. 147 | int tidDelta = DeviceFindSegScanDelta(tid, rows[0] != rows[VT], 148 | csr_shared); 149 | 150 | SegReduceTerms terms = { endFlags, tidDelta }; 151 | return terms; 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | // CTASegReduce 156 | // Core segmented reduction code. Supports fast-path and slow-path for intra-CTA 157 | // segmented reduction. Stores partials to global memory. 158 | // Callers feed CTASegReduce::ReduceToGlobal values in thread order. 159 | template 160 | struct CTASegReduce { 161 | typedef CTASegScan SegScan; 162 | 163 | enum { 164 | NV = NT * VT, 165 | Capacity = HalfCapacity ? (NV / 2) : NV 166 | }; 167 | 168 | union Storage { 169 | typename SegScan::Storage segScanStorage; 170 | T values[Capacity]; 171 | }; 172 | 173 | template 174 | MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total, 175 | int tidDelta, int startRow, int block, int tid, T data[VT], 176 | DestIt dest_global, T* carryOut_global, T identity, Op op, 177 | Storage& storage) { 178 | 179 | // Run a segmented scan within the thread. 180 | T x, localScan[VT]; 181 | #pragma unroll 182 | for(int i = 0; i < VT; ++i) { 183 | x = i ? op(x, data[i]) : data[i]; 184 | localScan[i] = x; 185 | if(rows[i] != rows[i + 1]) x = identity; 186 | } 187 | 188 | // Run a parallel segmented scan over the carry-out values to compute 189 | // carry-in. 190 | T carryOut; 191 | T carryIn = SegScan::SegScanDelta(tid, tidDelta, x, 192 | storage.segScanStorage, &carryOut, identity, op); 193 | 194 | // Store the carry-out for the entire CTA to global memory. 195 | if(!tid) carryOut_global[block] = carryOut; 196 | 197 | dest_global += startRow; 198 | if(HalfCapacity && total > Capacity) { 199 | // Add carry-in to each thread-local scan value. Store directly 200 | // to global. 201 | #pragma unroll 202 | for(int i = 0; i < VT; ++i) { 203 | // Add the carry-in to the local scan. 204 | T x2 = op(carryIn, localScan[i]); 205 | 206 | // Store on the end flag and clear the carry-in. 207 | if(rows[i] != rows[i + 1]) { 208 | carryIn = identity; 209 | dest_global[rows[i]] = x2; 210 | } 211 | } 212 | } else { 213 | // All partials fit in shared memory. Add carry-in to each thread- 214 | // local scan value. 215 | #pragma unroll 216 | for(int i = 0; i < VT; ++i) { 217 | // Add the carry-in to the local scan. 218 | T x2 = op(carryIn, localScan[i]); 219 | 220 | // Store reduction when the segment changes and clear the 221 | // carry-in. 
222 | if(rows[i] != rows[i + 1]) { 223 | storage.values[rows[i]] = x2; 224 | carryIn = identity; 225 | } 226 | } 227 | __syncthreads(); 228 | 229 | // Cooperatively store reductions to global memory. 230 | for(int index = tid; index < total; index += NT) 231 | dest_global[index] = storage.values[index]; 232 | __syncthreads(); 233 | } 234 | } 235 | }; 236 | 237 | } // namespace mgpu 238 | 239 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasegscan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctascan.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // DeviceFindSegScanDelta 43 | // Runs an inclusive max-index scan over binary inputs. 
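// For each thread, the return value is the distance back to the closest
// thread whose flag is set, counting the thread's own flag (a delta of 0
// means this thread's flag is set). For example, flags {1, 0, 0, 1, 0} on
// threads 0..4 yield deltas {0, 1, 2, 0, 1}. CTASegScan below uses this
// delta to bound how far carry-in values may propagate across threads.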
44 | 45 | template 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) { 47 | const int NumWarps = NT / 32; 48 | 49 | int warp = tid / 32; 50 | int lane = 31 & tid; 51 | uint warpMask = 0xffffffff>> (31 - lane); // inclusive search 52 | uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search 53 | 54 | uint warpBits = __ballot(flag); 55 | delta_shared[warp] = warpBits; 56 | __syncthreads(); 57 | 58 | if(tid < NumWarps) { 59 | uint ctaBits = __ballot(0 != delta_shared[tid]); 60 | int warpSegment = 31 - clz(ctaMask & ctaBits); 61 | int start = (-1 != warpSegment) ? 62 | (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0; 63 | delta_shared[NumWarps + tid] = start; 64 | } 65 | __syncthreads(); 66 | 67 | // Find the closest flag to the left of this thread within the warp. 68 | // Include the flag for this thread. 69 | int start = 31 - clz(warpMask & warpBits); 70 | if(-1 != start) start += ~31 & tid; 71 | else start = delta_shared[NumWarps + warp]; 72 | __syncthreads(); 73 | 74 | return tid - start; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // CTASegScan 79 | 80 | template > 81 | struct CTASegScan { 82 | typedef _Op Op; 83 | typedef typename Op::result_type T; 84 | enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT }; 85 | union Storage { 86 | int delta[NumWarps]; 87 | T values[Capacity]; 88 | }; 89 | 90 | // Each thread passes the reduction of the LAST SEGMENT that it covers. 91 | // flag is set to true if there's at least one segment flag in the thread. 92 | // SegScan returns the reduction of values for the first segment in this 93 | // thread over the preceding threads. 94 | // Return the value init for the first thread. 95 | 96 | // When scanning single elements per thread, interpret the flag as a BEGIN 97 | // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not 98 | // thread tid. 99 | 100 | // The function returns the reduction of the last segment in the CTA. 101 | 102 | MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x, 103 | Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) { 104 | 105 | // Run an inclusive scan 106 | int first = 0; 107 | storage.values[first + tid] = x; 108 | __syncthreads(); 109 | 110 | #pragma unroll 111 | for(int offset = 1; offset < NT; offset += offset) { 112 | if(tidDelta >= offset) 113 | x = op(storage.values[first + tid - offset], x); 114 | first = NT - first; 115 | storage.values[first + tid] = x; 116 | __syncthreads(); 117 | } 118 | 119 | // Get the exclusive scan. 120 | x = tid ? storage.values[first + tid - 1] : identity; 121 | *carryOut = storage.values[first + NT - 1]; 122 | __syncthreads(); 123 | return x; 124 | } 125 | 126 | MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage, 127 | T* carryOut, T identity = (T)0, Op op = Op()) { 128 | 129 | // Find the left-most thread that covers the first segment of this 130 | // thread. 131 | int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta); 132 | 133 | return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op); 134 | } 135 | }; 136 | 137 | } // namespace mgpu 138 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasortedsearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpudevice.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // DeviceSerialSearch 45 | 46 | template 48 | MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin, 49 | int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices, 50 | Comp comp) { 51 | 52 | const int FlagA = IndexA ? 0x80000000 : 1; 53 | const int FlagB = IndexB ? 0x80000000 : 1; 54 | 55 | T aKey = keys_shared[aBegin]; 56 | T bKey = keys_shared[bBegin]; 57 | T aPrev, bPrev; 58 | if(aBegin > 0) aPrev = keys_shared[aBegin - 1]; 59 | if(bBegin > 0) bPrev = keys_shared[bBegin - 1]; 60 | int decisions = 0; 61 | int matchCountA = 0; 62 | int matchCountB = 0; 63 | 64 | #pragma unroll 65 | for(int i = 0; i < VT; ++i) { 66 | bool p; 67 | if(RangeCheck && aBegin >= aEnd) p = false; 68 | else if(RangeCheck && bBegin >= bEnd) p = true; 69 | else p = (MgpuBoundsUpper == Bounds) ? 70 | comp(aKey, bKey) : 71 | !comp(bKey, aKey); 72 | 73 | if(p) { 74 | // aKey is smaller than bKey, so it is inserted before bKey. 75 | // Save bKey's index (bBegin + first) as the result of the search 76 | // and advance to the next needle in A. 77 | bool match = false; 78 | if(MatchA) { 79 | // Test if there is an element in B that matches aKey. 80 | if(MgpuBoundsUpper == Bounds) { 81 | // Upper Bound: We're inserting aKey after bKey. If there 82 | // is a match for aKey it must be bPrev. 
Check that bPrev 83 | // is in range and equal to aKey. 84 | // The predicate test result !comp(aKey, bPrev) was 85 | // established on the previous A-advancing iteration (it 86 | // failed the comp(aKey, bKey) test to get us to this 87 | // point). Check the other half of the equality condition 88 | // with a second comparison. 89 | bool inRange = !RangeCheck || (bBegin > aEnd); 90 | match = inRange && !comp(bPrev, aKey); 91 | } else { 92 | // Lower Bound: We're inserting aKey before bKey. If there 93 | // is a match for aKey, it must be bKey. Check that bKey 94 | // is in range and equal to aKey. 95 | // The predicate test !comp(bKey, aKey) has established one 96 | // half of the equality condition. We establish the other 97 | // half with a second comparison. 98 | bool inRange = !RangeCheck || (bBegin < bEnd); 99 | match = inRange && !comp(aKey, bKey); 100 | } 101 | } 102 | 103 | int index = 0; 104 | if(IndexA) index = bOffset + bBegin; 105 | if(match) index |= FlagA; 106 | if(IndexA || MatchA) indices[i] = index; 107 | matchCountA += match; 108 | 109 | // Mark the decision bit to indicate that this iteration has 110 | // progressed A (the needles). 111 | decisions |= 1<< i; 112 | aPrev = aKey; 113 | aKey = keys_shared[++aBegin]; 114 | } else { 115 | // aKey is larger than bKey, so it is inserted after bKey (but we 116 | // don't know where yet). Advance the B index to the next element in 117 | // the haystack to continue the search for the current needle. 118 | bool match = false; 119 | if(MatchB) { 120 | if(MgpuBoundsUpper == Bounds) { 121 | // Upper Bound: aKey is not smaller than bKey. We advance to 122 | // the next haystack element in B. If there is a match in A 123 | // for bKey it must be aKey. By entering this branch we've 124 | // verified that !comp(aKey, bKey). Making the reciprocal 125 | // comparison !comp(bKey, aKey) establishes aKey == bKey. 126 | bool inRange = !RangeCheck || 127 | ((bBegin < bEnd) && (aBegin < aEnd)); 128 | match = inRange && !comp(bKey, aKey); 129 | } else { 130 | // Lower Bound: bKey is smaller than aKey. We advance to the 131 | // next element in B. If there is a match for bKey, it must 132 | // be aPrev. The previous A-advancing iteration proved that 133 | // !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the 134 | // other half of the equality condition. 135 | bool inRange = !RangeCheck || 136 | ((bBegin < bEnd) && (aBegin > 0)); 137 | match = inRange && !comp(aPrev, bKey); 138 | } 139 | } 140 | 141 | int index = 0; 142 | if(IndexB) index = aOffset + aBegin; 143 | if(match) index |= FlagB; 144 | if(IndexB || MatchB) indices[i] = index; 145 | matchCountB += match; 146 | 147 | // Keep the decision bit cleared to indicate that this iteration 148 | // has progressed B (the haystack). 149 | bPrev = bKey; 150 | bKey = keys_shared[++bBegin]; 151 | } 152 | } 153 | return make_int3(decisions, matchCountA, matchCountB); 154 | } 155 | 156 | //////////////////////////////////////////////////////////////////////////////// 157 | // CTASortedSearch 158 | // Take keys in shared memory and return indices and b-match flags in shared 159 | // memory. 160 | // NOTE: This function doesn't do any strided-to-thread order transposes so 161 | // using an even number of values per thread will incur no additional bank 162 | // conflicts. 
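// The decision bits returned by DeviceSerialSearch record which iterations
// consumed an A key (needle) and which consumed a B key (haystack); they are
// used below to compact the results so that A results occupy the first
// aCount slots of indices_shared and B results the following bCount slots.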
163 | 164 | template 166 | MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount, 167 | int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended, 168 | int tid, int* indices_shared, Comp comp) { 169 | 170 | // Run a merge path to find the start of the serial search for each thread. 171 | int diag = VT * tid; 172 | int mp = MergePath(keys_shared + aStart, aCount, 173 | keys_shared + bStart, bCount, diag, comp); 174 | int a0tid = mp; 175 | int b0tid = diag - mp; 176 | 177 | // Serial search into register. 178 | int3 results; 179 | int indices[VT]; 180 | if(extended) 181 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 183 | a0 - aStart, b0 - bStart, indices, comp); 184 | else 185 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 187 | a0 - aStart, b0 - bStart, indices, comp); 188 | __syncthreads(); 189 | 190 | // Compact the indices into shared memory. Use the decision bits (set is A, 191 | // cleared is B) to select the destination. 192 | int decisions = results.x; 193 | b0tid += aCount; 194 | #pragma unroll 195 | for(int i = 0; i < VT; ++i) { 196 | if((1<< i) & decisions) { 197 | if(IndexA || MatchA) indices_shared[a0tid++] = indices[i]; 198 | } else { 199 | if(IndexB || MatchB) indices_shared[b0tid++] = indices[i]; 200 | } 201 | } 202 | __syncthreads(); 203 | 204 | // Return the match counts for A and B keys. 205 | return make_int2(results.y, results.z); 206 | } 207 | 208 | } // namespace mgpu 209 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/deviceutil.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "intrinsics.cuh" 38 | 39 | namespace mgpu { 40 | 41 | // Get the difference between two pointers in bytes. 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) { 43 | return (const byte*)b - (const byte*)a; 44 | } 45 | 46 | // Offset a pointer by i bytes. 47 | template 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) { 49 | return (const T*)((const byte*)p + i); 50 | } 51 | template 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) { 53 | return (T*)((byte*)p + i); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | // Task range support 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs. 59 | 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) { 61 | div_t d = div(numItems, numWorkers); 62 | return make_int2(d.quot, d.rem); 63 | } 64 | 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) { 66 | int2 range; 67 | range.x = task.x * block; 68 | range.x += min(block, task.y); 69 | range.y = range.x + task.x + (block < task.y); 70 | return range; 71 | } 72 | 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize, 74 | int count) { 75 | int2 range = ComputeTaskRange(block, task); 76 | range.x *= blockSize; 77 | range.y = min(count, range.y * blockSize); 78 | return range; 79 | } 80 | 81 | //////////////////////////////////////////////////////////////////////////////// 82 | // DeviceExtractHeadFlags 83 | // Input array flags is a bit array with 32 head flags per word. 84 | // ExtractThreadHeadFlags returns numBits flags starting at bit index. 85 | 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index, 87 | int numBits) { 88 | 89 | int index2 = index>> 5; 90 | int shift = 31 & index; 91 | uint headFlags = flags[index2]>> shift; 92 | int shifted = 32 - shift; 93 | 94 | if(shifted < numBits) 95 | // We also need to shift in the next set of bits. 96 | headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift); 97 | headFlags &= (1<< numBits) - 1; 98 | return headFlags; 99 | } 100 | 101 | //////////////////////////////////////////////////////////////////////////////// 102 | // DevicePackHeadFlags 103 | // Pack VT bits per thread at 32 bits/thread. Will consume an integer number of 104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads 105 | // return packed words. 106 | 107 | template 108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid, 109 | uint* flags_shared) { 110 | 111 | const int WordCount = NT * VT / 32; 112 | 113 | // Each thread stores its thread bits to flags_shared[tid]. 
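// The first NT * VT / 32 threads will then each reassemble one 32-bit word
// of flags (bit positions 32 * tid .. 32 * tid + 31) from this
// VT-bits-per-thread layout.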
114 | flags_shared[tid] = threadBits; 115 | __syncthreads(); 116 | 117 | uint packed = 0; 118 | if(tid < WordCount) { 119 | const int Items = MGPU_DIV_UP(32, VT); 120 | int index = 32 * tid; 121 | int first = index / VT; 122 | int bit = 0; 123 | 124 | int rem = index - VT * first; 125 | packed = flags_shared[first]>> rem; 126 | bit = VT - rem; 127 | ++first; 128 | 129 | #pragma unroll 130 | for(int i = 0; i < Items; ++i) { 131 | if(i < Items - 1 || bit < 32) { 132 | uint x = flags_shared[first + i]; 133 | if(bit < 32) packed |= x<< bit; 134 | bit += VT; 135 | } 136 | } 137 | } 138 | __syncthreads(); 139 | 140 | return packed; 141 | } 142 | 143 | } // namespace mgpu 144 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/serialsets.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // SerialSetIntersection 43 | // Emit A if A and B are in range and equal. 
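// The return value is a commit bitmask: bit i is set when iteration i
// produced an output the caller should keep. For set intersection that is
// exactly when neither comp(aKey, bKey) nor comp(bKey, aKey) holds, i.e.
// the two keys are equivalent; both cursors then advance past the match.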
44 | 45 | template 46 | MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd, 47 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 48 | 49 | const int MinIterations = VT / 2; 50 | int commit = 0; 51 | 52 | #pragma unroll 53 | for(int i = 0; i < VT; ++i) { 54 | bool test = RangeCheck ? 55 | ((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) : 56 | (i < MinIterations || (aBegin + bBegin < end)); 57 | 58 | if(test) { 59 | T aKey = data[aBegin]; 60 | T bKey = data[bBegin]; 61 | 62 | bool pA = comp(aKey, bKey); 63 | bool pB = comp(bKey, aKey); 64 | 65 | // The outputs must come from A by definition of set interection. 66 | results[i] = aKey; 67 | indices[i] = aBegin; 68 | 69 | if(!pB) ++aBegin; 70 | if(!pA) ++bBegin; 71 | if(pA == pB) commit |= 1<< i; 72 | } 73 | } 74 | return commit; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // SerialSetUnion 79 | // Emit A if A <= B. Emit B if B < A. 80 | 81 | template 82 | MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd, 83 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 84 | 85 | const int MinIterations = VT / 2; 86 | int commit = 0; 87 | 88 | #pragma unroll 89 | for(int i = 0; i < VT; ++i) { 90 | bool test = RangeCheck ? 91 | (aBegin + bBegin < end) : 92 | (i < MinIterations || (aBegin + bBegin < end)); 93 | 94 | if(test) { 95 | T aKey = data[aBegin]; 96 | T bKey = data[bBegin]; 97 | 98 | bool pA = false, pB = false; 99 | if(RangeCheck && aBegin >= aEnd) 100 | pB = true; 101 | else if(RangeCheck && bBegin >= bEnd) 102 | pA = true; 103 | else { 104 | // Both are in range. 105 | pA = comp(aKey, bKey); 106 | pB = comp(bKey, aKey); 107 | } 108 | 109 | // Output A in case of a tie, so check if b < a. 110 | results[i] = pB ? bKey : aKey; 111 | indices[i] = pB ? bBegin : aBegin; 112 | if(!pB) ++aBegin; 113 | if(!pA) ++bBegin; 114 | commit |= 1<< i; 115 | } 116 | } 117 | return commit; 118 | } 119 | 120 | //////////////////////////////////////////////////////////////////////////////// 121 | // SerialSetDifference 122 | // Emit A if A < B. 123 | 124 | template 125 | MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd, 126 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 127 | 128 | const int MinIterations = VT / 2; 129 | int commit = 0; 130 | 131 | #pragma unroll 132 | for(int i = 0; i < VT; ++i) { 133 | bool test = RangeCheck ? 134 | (aBegin + bBegin < end) : 135 | (i < MinIterations || (aBegin + bBegin < end)); 136 | if(test) { 137 | T aKey = data[aBegin]; 138 | T bKey = data[bBegin]; 139 | 140 | bool pA = false, pB = false; 141 | if(RangeCheck && aBegin >= aEnd) 142 | pB = true; 143 | else if(RangeCheck && bBegin >= bEnd) 144 | pA = true; 145 | else { 146 | pA = comp(aKey, bKey); 147 | pB = comp(bKey, aKey); 148 | } 149 | 150 | // The outputs must come from A by definition of set difference. 151 | results[i] = aKey; 152 | indices[i] = aBegin; 153 | if(!pB) ++aBegin; 154 | if(!pA) ++bBegin; 155 | if(pA) commit |= 1<< i; 156 | } 157 | } 158 | return commit; 159 | } 160 | 161 | //////////////////////////////////////////////////////////////////////////////// 162 | // SerialSetSymDiff 163 | // Emit A if A < B and emit B if B < A. 
164 | 165 | template 166 | MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd, 167 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 168 | 169 | const int MinIterations = VT / 2; 170 | int commit = 0; 171 | 172 | #pragma unroll 173 | for(int i = 0; i < VT; ++i) { 174 | bool test = RangeCheck ? 175 | (aBegin + bBegin < end) : 176 | (i < MinIterations || (aBegin + bBegin < end)); 177 | if(test) { 178 | T aKey = data[aBegin]; 179 | T bKey = data[bBegin]; 180 | 181 | bool pA = false, pB = false; 182 | if(RangeCheck && (bBegin >= bEnd)) 183 | pA = true; 184 | else if(RangeCheck && (aBegin >= aEnd)) 185 | pB = true; 186 | else { 187 | pA = comp(aKey, bKey); 188 | pB = comp(bKey, aKey); 189 | } 190 | 191 | results[i] = pA ? aKey : bKey; 192 | indices[i] = pA ? aBegin : bBegin; 193 | if(!pA) ++bBegin; 194 | if(!pB) ++aBegin; 195 | if(pA != pB) commit |= 1<< i; 196 | } 197 | } 198 | return commit; 199 | } 200 | 201 | //////////////////////////////////////////////////////////////////////////////// 202 | // SerialSetOp 203 | // Uses the MgpuSetOp enum to statically select one of the four serial ops 204 | // above. 205 | 206 | template 207 | MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd, 208 | int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) { 209 | 210 | int end = aBegin + bBegin + VT - star; 211 | if(RangeCheck) end = min(end, aEnd + bEnd); 212 | int commit; 213 | switch(Op) { 214 | case MgpuSetOpIntersection: 215 | commit = SerialSetIntersection(data, aBegin, 216 | aEnd, bBegin, bEnd, end, results, indices, comp); 217 | break; 218 | case MgpuSetOpUnion: 219 | commit = SerialSetUnion(data, aBegin, aEnd, 220 | bBegin, bEnd, end, results, indices, comp); 221 | break; 222 | case MgpuSetOpDiff: 223 | commit = SerialSetDifference(data, aBegin, aEnd, 224 | bBegin, bEnd, end, results, indices, comp); 225 | break; 226 | case MgpuSetOpSymDiff: 227 | commit = SerialSetSymDiff(data, aBegin, aEnd, 228 | bBegin, bEnd, end, results, indices, comp); 229 | break; 230 | } 231 | __syncthreads(); 232 | return commit; 233 | } 234 | 235 | } // namespace mgpu 236 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/sortnetwork.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Odd-even transposition sorting network. Sorts keys and values in-place in 43 | // register. 44 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 45 | 46 | // CUDA Compiler does not currently unroll these loops correctly. Write using 47 | // template loop unrolling. 48 | /* 49 | template 50 | MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) { 51 | #pragma unroll 52 | for(int level = 0; level < VT; ++level) { 53 | 54 | #pragma unroll 55 | for(int i = 1 & level; i < VT - 1; i += 2) { 56 | if(comp(keys[i + 1], keys[i])) { 57 | mgpu::swap(keys[i], keys[i + 1]); 58 | mgpu::swap(values[i], values[i + 1]); 59 | } 60 | } 61 | } 62 | }*/ 63 | 64 | template 65 | struct OddEvenTransposeSortT { 66 | // Sort segments marked by head flags. If the head flag between i and i + 1 67 | // is set (so that (2<< i) & flags is true), the values belong to different 68 | // segments and are not swapped. 69 | template 70 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { 71 | #pragma unroll 72 | for(int i = 1 & I; i < VT - 1; i += 2) 73 | if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) { 74 | mgpu::swap(keys[i], keys[i + 1]); 75 | mgpu::swap(values[i], values[i + 1]); 76 | } 77 | OddEvenTransposeSortT::Sort(keys, values, flags, comp); 78 | } 79 | }; 80 | template struct OddEvenTransposeSortT { 81 | template 82 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { } 83 | }; 84 | 85 | template 86 | MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) { 87 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp); 88 | } 89 | template 90 | MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags, 91 | Comp comp) { 92 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp); 93 | } 94 | 95 | //////////////////////////////////////////////////////////////////////////////// 96 | // Batcher Odd-Even Mergesort network 97 | // Unstable but executes much faster than the transposition sort. 98 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 99 | 100 | template 101 | struct OddEvenMergesortT { 102 | template 103 | MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags, 104 | int a, int b, Comp comp) { 105 | if(b < Count) { 106 | // Mask the bits between a and b. 
Any head flags in this interval 107 | // means the keys are in different segments and must not be swapped. 108 | const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1); 109 | if(!(Mask & flags) && comp(keys[b], keys[a])) { 110 | mgpu::swap(keys[b], keys[a]); 111 | mgpu::swap(values[b], values[a]); 112 | } 113 | } 114 | } 115 | 116 | template 117 | struct OddEvenMerge { 118 | template 119 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 120 | Comp comp) { 121 | // Compare and swap 122 | const int M = 2 * R; 123 | OddEvenMerge::Merge(keys, values, flags, comp); 124 | OddEvenMerge::Merge(keys, values, flags, comp); 125 | 126 | #pragma unroll 127 | for(int i = Low2 + R; i + R < Low2 + Width; i += M) 128 | CompareAndSwap(keys, values, flags, i, i + R, comp); 129 | } 130 | }; 131 | template 132 | struct OddEvenMerge { 133 | template 134 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 135 | Comp comp) { 136 | CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp); 137 | } 138 | }; 139 | 140 | template 141 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 142 | Comp comp) { 143 | 144 | const int M = Width / 2; 145 | OddEvenMergesortT::Sort(keys, values, flags, comp); 146 | OddEvenMergesortT::Sort(keys, values, flags, comp); 147 | OddEvenMerge<1, Low>::Merge(keys, values, flags, comp); 148 | } 149 | }; 150 | template struct OddEvenMergesortT<1, Low, Count> { 151 | template 152 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 153 | Comp comp) { } 154 | }; 155 | 156 | template 157 | MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) { 158 | const int Width = 1<< sLogPow2::value; 159 | OddEvenMergesortT::Sort(keys, values, 0, comp); 160 | } 161 | template 162 | MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags, 163 | Comp comp) { 164 | const int Width = 1<< sLogPow2::value; 165 | OddEvenMergesortT::Sort(keys, values, flags, comp); 166 | } 167 | 168 | } // namespace mgpu 169 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/mgpuenums.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | namespace mgpu { 38 | 39 | enum MgpuBounds { 40 | MgpuBoundsLower, 41 | MgpuBoundsUpper 42 | }; 43 | 44 | enum MgpuScanType { 45 | MgpuScanTypeExc, 46 | MgpuScanTypeInc 47 | }; 48 | 49 | enum MgpuSearchType { 50 | MgpuSearchTypeNone, 51 | MgpuSearchTypeIndex, 52 | MgpuSearchTypeMatch, 53 | MgpuSearchTypeIndexMatch 54 | }; 55 | 56 | enum MgpuJoinKind { 57 | MgpuJoinKindInner, 58 | MgpuJoinKindLeft, 59 | MgpuJoinKindRight, 60 | MgpuJoinKindOuter 61 | }; 62 | 63 | enum MgpuSetOp { 64 | MgpuSetOpIntersection, 65 | MgpuSetOpUnion, 66 | MgpuSetOpDiff, 67 | MgpuSetOpSymDiff 68 | }; 69 | 70 | } // namespace mgpu 71 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/util/static.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifndef MGPU_MIN 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y)) 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y)) 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0) 55 | #define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x)) 56 | 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y)) 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y)) 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y) 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1)) 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1)) 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1))) 64 | 65 | #endif // MGPU_MIN 66 | 67 | namespace mgpu { 68 | 69 | 70 | typedef unsigned char byte; 71 | 72 | typedef unsigned int uint; 73 | typedef signed short int16; 74 | 75 | typedef unsigned short ushort; 76 | typedef unsigned short uint16; 77 | 78 | typedef long long int64; 79 | typedef unsigned long long uint64; 80 | 81 | // IsPow2::value is true if X is a power of 2. 82 | template struct sIsPow2 { 83 | enum { value = 0 == (X & (X - 1)) }; 84 | }; 85 | 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2. 87 | template struct sLogPow2 { 88 | enum { extra = sIsPow2::value ? 0 : (roundUp ? 1 : 0) }; 89 | enum { inner = sLogPow2::inner + 1 }; 90 | enum { value = inner + extra }; 91 | }; 92 | template struct sLogPow2<0, roundUp> { 93 | enum { inner = 0 }; 94 | enum { value = 0 }; 95 | }; 96 | template struct sLogPow2<1, roundUp> { 97 | enum { inner = 0 }; 98 | enum { value = 0 }; 99 | }; 100 | 101 | template 102 | struct sDivUp { 103 | enum { value = (X + Y - 1) / Y }; 104 | }; 105 | 106 | template struct sDiv2RoundUp { 107 | enum { value = sDiv2RoundUp::value, levels - 1>::value }; 108 | }; 109 | template struct sDiv2RoundUp { 110 | enum { value = count }; 111 | }; 112 | 113 | template 114 | struct sDivSafe { 115 | enum { value = X / Y }; 116 | }; 117 | template 118 | struct sDivSafe { 119 | enum { value = 0 }; 120 | }; 121 | 122 | template 123 | struct sRoundUp { 124 | enum { rem = X % Y }; 125 | enum { value = X + (rem ? (Y - rem) : 0) }; 126 | }; 127 | 128 | template 129 | struct sRoundDown { 130 | enum { rem = X % Y }; 131 | enum { value = X - rem }; 132 | }; 133 | 134 | // IntegerDiv is a template for avoiding divisions by zero in template 135 | // evaluation. Templates always evaluate both b and c in an expression like 136 | // a ? b : c, and will error if either rhs contains an illegal expression, 137 | // even if the ternary is explictly designed to guard against that. 138 | template 139 | struct sIntegerDiv { 140 | enum { value = X / (Y ? Y : (X + 1)) }; 141 | }; 142 | 143 | template 144 | struct sMax { 145 | enum { value = (X >= Y) ? X : Y }; 146 | }; 147 | template 148 | struct sMin { 149 | enum { value = (X <= Y) ? 
X : Y }; 150 | }; 151 | 152 | template 153 | struct sAbs { 154 | enum { value = (X >= 0) ? X : -X }; 155 | }; 156 | 157 | 158 | // Finds the number of powers of 2 in the prime factorization of X. 159 | template struct sNumFactorsOf2 { 160 | enum { shifted = X >> 1 }; 161 | enum { value = 1 + sNumFactorsOf2::value }; 162 | }; 163 | template struct sNumFactorsOf2 { 164 | enum { value = 0 }; 165 | }; 166 | 167 | // Returns the divisor for a conflict-free transpose. 168 | template struct sBankConflictDivisor { 169 | enum { value = 170 | (1 & X) ? 0 : 171 | (sIsPow2::value ? NumBanks : 172 | (1<< sNumFactorsOf2::value)) }; 173 | enum { log_value = sLogPow2::value }; 174 | }; 175 | 176 | template struct sConflictFreeStorage { 177 | enum { count = NT * X }; 178 | enum { divisor = sBankConflictDivisor::value }; 179 | enum { padding = sDivSafe::value }; 180 | enum { value = count + padding }; 181 | }; 182 | 183 | } // namespace mgpu 184 | -------------------------------------------------------------------------------- /include/ctc.h: -------------------------------------------------------------------------------- 1 | /** \file ctc.h 2 | * Contains a simple C interface to call fast CPU and GPU based computation 3 | * of the CTC loss. 4 | */ 5 | 6 | #pragma once 7 | 8 | #ifdef _WIN32 9 | #ifdef warpctc_EXPORTS 10 | #define API_REFERENCE extern "C" __declspec(dllexport) 11 | #else 12 | #define API_REFERENCE extern "C" __declspec(dllimport) 13 | #endif 14 | #else 15 | #define API_REFERENCE 16 | #endif 17 | 18 | #include 19 | 20 | #ifdef __cplusplus 21 | #include 22 | extern "C" { 23 | #endif 24 | 25 | #ifdef WARPCTC_WITH_HIP 26 | //forward declare of HIP typedef to avoid needing to pull in HIP headers 27 | typedef struct ihipStream_t* GPUstream; 28 | #else 29 | //forward declare of CUDA typedef to avoid needing to pull in CUDA headers 30 | typedef struct CUstream_st* GPUstream; 31 | #endif 32 | 33 | typedef enum { 34 | CTC_STATUS_SUCCESS = 0, 35 | CTC_STATUS_MEMOPS_FAILED = 1, 36 | CTC_STATUS_INVALID_VALUE = 2, 37 | CTC_STATUS_EXECUTION_FAILED = 3, 38 | CTC_STATUS_UNKNOWN_ERROR = 4 39 | } ctcStatus_t; 40 | 41 | /** Returns a single integer which specifies the API version of the warpctc library */ 42 | API_REFERENCE int get_warpctc_version(); 43 | 44 | /** Returns a string containing a description of status that was passed in 45 | * \param[in] status identifies which string should be returned 46 | * \return C style string containing the text description 47 | * */ 48 | API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); 49 | 50 | typedef enum { 51 | CTC_CPU = 0, 52 | CTC_GPU = 1 53 | } ctcComputeLocation; 54 | 55 | /** Structure used for options to the CTC compution. Applications 56 | * should zero out the array using memset and sizeof(struct 57 | * ctcOptions) in C or default initialization (e.g. 'ctcOptions 58 | * options{};' or 'auto options = ctcOptions{}') in C++ to ensure 59 | * forward compatibility with added options. 
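 *
 * A minimal illustrative sketch of the intended call sequence (CPU path;
 * the buffer variables are placeholders for application-provided data):
 *
 *   ctcOptions options{};          // zero-initialised, as described above
 *   options.loc = CTC_CPU;
 *   options.blank_label = 0;
 *
 *   size_t workspace_bytes;
 *   get_workspace_size(label_lengths, input_lengths,
 *                      alphabet_size, minibatch, options, &workspace_bytes);
 *   void* workspace = malloc(workspace_bytes);
 *
 *   compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
 *                    input_lengths, alphabet_size, minibatch, costs,
 *                    workspace, options);
 *
 * Both calls return a ctcStatus_t; anything other than CTC_STATUS_SUCCESS
 * indicates an error.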
*/ 60 | struct ctcOptions { 61 | /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU} 62 | ctcComputeLocation loc; 63 | union { 64 | /// used when loc == CTC_CPU, the maximum number of threads that can be used 65 | unsigned int num_threads; 66 | 67 | /// used when loc == CTC_GPU, which stream the kernels should be launched in 68 | GPUstream stream; 69 | }; 70 | 71 | /// the label value/index that the CTC calculation should use as the blank label 72 | int blank_label; 73 | }; 74 | 75 | /** Compute the connectionist temporal classification loss between 76 | * a probability sequence with dtype float and a ground truth labeling. 77 | * Optionally compute the gradient with respect to the inputs. 78 | * \param [in] activations pointer to the activations in either CPU or GPU 79 | * addressable memory, depending on info. We assume a fixed 80 | * memory layout for this 3 dimensional tensor, which has dimension 81 | * (t, n, p), where t is the time index, n is the minibatch index, 82 | * and p indexes over probabilities of each symbol in the alphabet. 83 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 84 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 85 | * changing index, aka column-major). We also assume strides are equal to 86 | * dimensions - there is no padding between dimensions. 87 | * More precisely, element (t, n, p), for a problem with mini_batch examples 88 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 89 | * activations[(t * mini_batch + n) * alphabet_size + p] 90 | * \param [out] gradients if not NULL, then gradients are computed. Should be 91 | * allocated in the same memory space as probs and memory 92 | * ordering is identical. 93 | * \param [in] flat_labels Always in CPU memory. A concatenation 94 | * of all the labels for the minibatch. 95 | * \param [in] label_lengths Always in CPU memory. The length of each label 96 | * for each example in the minibatch. 97 | * \param [in] input_lengths Always in CPU memory. The number of time steps 98 | * for each sequence in the minibatch. 99 | * \param [in] alphabet_size The number of possible output symbols. There 100 | * should be this many probabilities for each time step. 101 | * \param [in] mini_batch How many examples in a minibatch. 102 | * \param [out] costs Always in CPU memory. The cost of each example in the 103 | * minibatch. 104 | * \param [in,out] workspace In same memory space as probs. Should be of 105 | * size requested by get_workspace_size. 106 | * \param [in] options see struct ctcOptions 107 | * 108 | * \return Status information 109 | * 110 | * */ 111 | API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, 112 | float* gradients, 113 | const int* const flat_labels, 114 | const int* const label_lengths, 115 | const int* const input_lengths, 116 | int alphabet_size, 117 | int minibatch, 118 | float *costs, 119 | void *workspace, 120 | ctcOptions options); 121 | 122 | /** Compute the connectionist temporal classification loss between 123 | * a probability sequence of dtype double and a ground truth labeling. 124 | * Optionally compute the gradient with respect to the inputs. 125 | * \param [in] activations pointer to the activations in either CPU or GPU 126 | * addressable memory, depending on info. 
We assume a fixed 127 | * memory layout for this 3 dimensional tensor, which has dimension 128 | * (t, n, p), where t is the time index, n is the minibatch index, 129 | * and p indexes over probabilities of each symbol in the alphabet. 130 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 131 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 132 | * changing index, aka column-major). We also assume strides are equal to 133 | * dimensions - there is no padding between dimensions. 134 | * More precisely, element (t, n, p), for a problem with mini_batch examples 135 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 136 | * activations[(t * mini_batch + n) * alphabet_size + p] 137 | * \param [out] gradients if not NULL, then gradients are computed. Should be 138 | * allocated in the same memory space as probs and memory 139 | * ordering is identical. 140 | * \param [in] flat_labels Always in CPU memory. A concatenation 141 | * of all the labels for the minibatch. 142 | * \param [in] label_lengths Always in CPU memory. The length of each label 143 | * for each example in the minibatch. 144 | * \param [in] input_lengths Always in CPU memory. The number of time steps 145 | * for each sequence in the minibatch. 146 | * \param [in] alphabet_size The number of possible output symbols. There 147 | * should be this many probabilities for each time step. 148 | * \param [in] mini_batch How many examples in a minibatch. 149 | * \param [out] costs Always in CPU memory. The cost of each example in the 150 | * minibatch. 151 | * \param [in,out] workspace In same memory space as probs. Should be of 152 | * size requested by get_workspace_size. 153 | * \param [in] options see struct ctcOptions 154 | * 155 | * \return Status information 156 | * 157 | * */ 158 | API_REFERENCE ctcStatus_t compute_ctc_loss_double(const double* const activations, 159 | double* gradients, 160 | const int* const flat_labels, 161 | const int* const label_lengths, 162 | const int* const input_lengths, 163 | int alphabet_size, 164 | int minibatch, 165 | double *costs, 166 | void *workspace, 167 | ctcOptions options); 168 | 169 | 170 | /** For a given set of labels and minibatch size return the required workspace 171 | * size when the dtype of your probabilities is float. This will need to be allocated 172 | * in the same memory space as your probabilities. 173 | * \param [in] label_lengths Always in CPU memory. The length of each label 174 | * for each example in the minibatch. 175 | * \param [in] input_lengths Always in CPU memory. The number of time steps 176 | * for each sequence in the minibatch. 177 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 178 | * the number of probabilities at each time step 179 | * \param [in] mini_batch How many examples in a minibatch. 180 | * \param [in] info see struct ctcOptions 181 | * \param [out] size_bytes is pointer to a scalar where the memory 182 | * requirement in bytes will be placed. 
This memory should be allocated 183 | * at the same place, CPU or GPU, that the probs are in 184 | * 185 | * \return Status information 186 | **/ 187 | API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, 188 | const int* const input_lengths, 189 | int alphabet_size, int minibatch, 190 | ctcOptions info, 191 | size_t* size_bytes); 192 | 193 | /** For a given set of labels and minibatch size return the required workspace 194 | * size when the dtype of your probabilities is double. This will need to be allocated 195 | * in the same memory space as your probabilities. 196 | * \param [in] label_lengths Always in CPU memory. The length of each label 197 | * for each example in the minibatch. 198 | * \param [in] input_lengths Always in CPU memory. The number of time steps 199 | * for each sequence in the minibatch. 200 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 201 | * the number of probabilities at each time step 202 | * \param [in] mini_batch How many examples in a minibatch. 203 | * \param [in] info see struct ctcOptions 204 | * \param [out] size_bytes is pointer to a scalar where the memory 205 | * requirement in bytes will be placed. This memory should be allocated 206 | * at the same place, CPU or GPU, that the probs are in 207 | * 208 | * \return Status information 209 | **/ 210 | API_REFERENCE ctcStatus_t get_workspace_size_double(const int* const label_lengths, 211 | const int* const input_lengths, 212 | int alphabet_size, int minibatch, 213 | ctcOptions info, 214 | size_t* size_bytes); 215 | 216 | #ifdef __cplusplus 217 | } 218 | #endif 219 | -------------------------------------------------------------------------------- /include/detail/ctc_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "hostdevice.h" 8 | #include "type_defs.h" 9 | 10 | namespace ctc_helper { 11 | 12 | static const float threshold = 1e-1; 13 | 14 | template 15 | HOSTDEVICE 16 | T neg_inf() { return -T(INFINITY); } 17 | 18 | inline int div_up(int x, int y) { 19 | return (x + y - 1) / y; 20 | } 21 | 22 | template struct maximum { 23 | HOSTDEVICE 24 | Res operator()(const Arg& x, const Arg& y) const { 25 | return x < y ? 
y : x; 26 | } 27 | }; 28 | 29 | template struct add { 30 | HOSTDEVICE 31 | Res operator()(const Arg& x, const Arg& y) const { 32 | return x + y; 33 | } 34 | }; 35 | 36 | template struct identity { 37 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);} 38 | }; 39 | 40 | template struct negate { 41 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);} 42 | }; 43 | 44 | template struct exponential { 45 | HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);} 46 | }; 47 | 48 | template 49 | struct log_plus { 50 | typedef Res result_type; 51 | HOSTDEVICE 52 | Res operator()(const Arg1& p1, const Arg2& p2) { 53 | if (p1 == neg_inf()) 54 | return p2; 55 | if (p2 == neg_inf()) 56 | return p1; 57 | Res result = log1p(exp(-fabs(p1 - p2))) + maximum()(p1, p2); 58 | return result; 59 | } 60 | }; 61 | 62 | } 63 | -------------------------------------------------------------------------------- /include/detail/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | 9 | // NOTE(dzhwinter) 10 | // the warp primitive is different in cuda9(Volta) GPU. 11 | // add a wrapper to compatible with cuda7 to cuda9 12 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 13 | #define DEFAULT_MASK 0u 14 | template 15 | __forceinline__ __device__ T __shfl_down(T input, int delta) { 16 | return __shfl_down_sync(DEFAULT_MASK, input, delta); 17 | } 18 | 19 | template 20 | __forceinline__ __device__ T __shfl_up(T input, int delta) { 21 | return __shfl_up_sync(DEFAULT_MASK, input, delta); 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /include/detail/reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | ctcStatus_t reduce_negate(const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream); 5 | template 6 | ctcStatus_t reduce_exp(const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream); 7 | template 8 | ctcStatus_t reduce_max(const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream); 9 | -------------------------------------------------------------------------------- /include/detail/type_defs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 4 | 5 | #ifdef __HIPCC__ 6 | #include 7 | #else 8 | #include 9 | #endif 10 | 11 | #ifdef __HIPCC__ 12 | #define gpuSuccess hipSuccess 13 | using gpuStream_t = hipStream_t; 14 | using gpuError_t = hipError_t; 15 | using gpuEvent_t = hipEvent_t; 16 | #else 17 | #define gpuSuccess cudaSuccess 18 | using gpuStream_t = cudaStream_t; 19 | using gpuError_t = cudaError_t; 20 | using gpuEvent_t = cudaEvent_t; 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/ctc_entrypoint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "detail/cpu_ctc.h" 9 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 10 | #include "detail/gpu_ctc.h" 11 | #endif 12 | 13 | 14 | extern "C" { 15 | 16 | int get_warpctc_version() { 17 | return 2; 18 | } 19 | 20 | const char* ctcGetStatusString(ctcStatus_t 
status) { 21 | switch (status) { 22 | case CTC_STATUS_SUCCESS: 23 | return "no error"; 24 | case CTC_STATUS_MEMOPS_FAILED: 25 | return "cuda memcpy or memset failed"; 26 | case CTC_STATUS_INVALID_VALUE: 27 | return "invalid value"; 28 | case CTC_STATUS_EXECUTION_FAILED: 29 | return "execution failed"; 30 | 31 | case CTC_STATUS_UNKNOWN_ERROR: 32 | default: 33 | return "unknown error"; 34 | 35 | } 36 | 37 | } 38 | 39 | 40 | ctcStatus_t compute_ctc_loss(const float* const activations, 41 | float* gradients, 42 | const int* const flat_labels, 43 | const int* const label_lengths, 44 | const int* const input_lengths, 45 | int alphabet_size, 46 | int minibatch, 47 | float *costs, 48 | void *workspace, 49 | ctcOptions options) { 50 | if (activations == nullptr || 51 | flat_labels == nullptr || 52 | label_lengths == nullptr || 53 | input_lengths == nullptr || 54 | costs == nullptr || 55 | workspace == nullptr || 56 | alphabet_size <= 0 || 57 | minibatch <= 0) 58 | return CTC_STATUS_INVALID_VALUE; 59 | 60 | if (options.loc == CTC_CPU) { 61 | CpuCTC ctc(alphabet_size, minibatch, workspace, options.num_threads, 62 | options.blank_label); 63 | 64 | if (gradients != NULL) 65 | return ctc.cost_and_grad(activations, gradients, 66 | costs, 67 | flat_labels, label_lengths, 68 | input_lengths); 69 | else 70 | return ctc.score_forward(activations, costs, flat_labels, 71 | label_lengths, input_lengths); 72 | } else if (options.loc == CTC_GPU) { 73 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 74 | GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, 75 | options.blank_label); 76 | 77 | if (gradients != NULL) 78 | return ctc.cost_and_grad(activations, gradients, costs, 79 | flat_labels, label_lengths, 80 | input_lengths); 81 | else 82 | return ctc.score_forward(activations, costs, flat_labels, 83 | label_lengths, input_lengths); 84 | #else 85 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 86 | return CTC_STATUS_EXECUTION_FAILED; 87 | #endif 88 | } else { 89 | return CTC_STATUS_INVALID_VALUE; 90 | } 91 | } 92 | 93 | ctcStatus_t compute_ctc_loss_double(const double* const activations, 94 | double* gradients, 95 | const int* const flat_labels, 96 | const int* const label_lengths, 97 | const int* const input_lengths, 98 | int alphabet_size, 99 | int minibatch, 100 | double *costs, 101 | void *workspace, 102 | ctcOptions options) { 103 | if (activations == nullptr || 104 | flat_labels == nullptr || 105 | label_lengths == nullptr || 106 | input_lengths == nullptr || 107 | costs == nullptr || 108 | workspace == nullptr || 109 | alphabet_size <= 0 || 110 | minibatch <= 0) 111 | return CTC_STATUS_INVALID_VALUE; 112 | 113 | if (options.loc == CTC_CPU) { 114 | CpuCTC ctc(alphabet_size, minibatch, workspace, options.num_threads, 115 | options.blank_label); 116 | 117 | if (gradients != NULL) 118 | return ctc.cost_and_grad(activations, gradients, 119 | costs, 120 | flat_labels, label_lengths, 121 | input_lengths); 122 | else 123 | return ctc.score_forward(activations, costs, flat_labels, 124 | label_lengths, input_lengths); 125 | } else if (options.loc == CTC_GPU) { 126 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 127 | GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, 128 | options.blank_label); 129 | 130 | if (gradients != NULL) 131 | return ctc.cost_and_grad(activations, gradients, costs, 132 | flat_labels, label_lengths, 133 | input_lengths); 134 | else 135 | return ctc.score_forward(activations, costs, flat_labels, 136 | label_lengths, 
input_lengths); 137 | #else 138 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 139 | return CTC_STATUS_EXECUTION_FAILED; 140 | #endif 141 | } else { 142 | return CTC_STATUS_INVALID_VALUE; 143 | } 144 | } 145 | 146 | 147 | ctcStatus_t get_workspace_size(const int* const label_lengths, 148 | const int* const input_lengths, 149 | int alphabet_size, int minibatch, 150 | ctcOptions options, 151 | size_t* size_bytes) 152 | { 153 | if (label_lengths == nullptr || 154 | input_lengths == nullptr || 155 | size_bytes == nullptr || 156 | alphabet_size <= 0 || 157 | minibatch <= 0) 158 | return CTC_STATUS_INVALID_VALUE; 159 | 160 | // This is the max of all S and T for all examples in the minibatch. 161 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 162 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 163 | 164 | const int S = 2 * maxL + 1; 165 | 166 | *size_bytes = 0; 167 | 168 | if (options.loc == CTC_GPU) { 169 | // GPU storage 170 | //nll_forward, nll_backward 171 | *size_bytes += 2 * sizeof(float) * minibatch; 172 | 173 | //repeats 174 | *size_bytes += sizeof(int) * minibatch; 175 | 176 | //label offsets 177 | *size_bytes += sizeof(int) * minibatch; 178 | 179 | //utt_length 180 | *size_bytes += sizeof(int) * minibatch; 181 | 182 | //label lengths 183 | *size_bytes += sizeof(int) * minibatch; 184 | 185 | //labels without blanks - overallocate for now 186 | *size_bytes += sizeof(int) * maxL * minibatch; 187 | 188 | //labels with blanks 189 | *size_bytes += sizeof(int) * S * minibatch; 190 | 191 | //alphas 192 | *size_bytes += sizeof(float) * S * maxT * minibatch; 193 | 194 | //denoms 195 | *size_bytes += sizeof(float) * maxT * minibatch; 196 | 197 | //probs (since we will pass in activations) 198 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 199 | 200 | } else { 201 | //cpu can eventually replace all minibatch with 202 | //max number of concurrent threads if memory is 203 | //really tight 204 | 205 | //per minibatch memory 206 | size_t per_minibatch_bytes = 0; 207 | 208 | //output 209 | per_minibatch_bytes += sizeof(float) * alphabet_size ; 210 | 211 | //alphas 212 | per_minibatch_bytes += sizeof(float) * S * maxT; 213 | 214 | //betas 215 | per_minibatch_bytes += sizeof(float) * S; 216 | 217 | //labels w/blanks, e_inc, s_inc 218 | per_minibatch_bytes += 3 * sizeof(int) * S; 219 | 220 | *size_bytes = per_minibatch_bytes * minibatch; 221 | 222 | //probs 223 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 224 | } 225 | 226 | return CTC_STATUS_SUCCESS; 227 | } 228 | 229 | ctcStatus_t get_workspace_size_double(const int* const label_lengths, 230 | const int* const input_lengths, 231 | int alphabet_size, int minibatch, 232 | ctcOptions options, 233 | size_t* size_bytes) 234 | { 235 | if (label_lengths == nullptr || 236 | input_lengths == nullptr || 237 | size_bytes == nullptr || 238 | alphabet_size <= 0 || 239 | minibatch <= 0) 240 | return CTC_STATUS_INVALID_VALUE; 241 | 242 | // This is the max of all S and T for all examples in the minibatch. 
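    // maxL is the longest label sequence and maxT the longest input sequence;
    // the padded label length S = 2 * maxL + 1 (a blank between every pair of
    // labels plus one at each end) sizes the alpha/beta buffers reserved below.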
243 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 244 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 245 | 246 | const int S = 2 * maxL + 1; 247 | 248 | *size_bytes = 0; 249 | 250 | if (options.loc == CTC_GPU) { 251 | // GPU storage 252 | //nll_forward, nll_backward 253 | *size_bytes += 2 * sizeof(double) * minibatch; 254 | 255 | //repeats 256 | *size_bytes += sizeof(int) * minibatch; 257 | 258 | //label offsets 259 | *size_bytes += sizeof(int) * minibatch; 260 | 261 | //utt_length 262 | *size_bytes += sizeof(int) * minibatch; 263 | 264 | //label lengths 265 | *size_bytes += sizeof(int) * minibatch; 266 | 267 | //labels without blanks - overallocate for now 268 | *size_bytes += sizeof(int) * maxL * minibatch; 269 | 270 | //labels with blanks 271 | *size_bytes += sizeof(int) * S * minibatch; 272 | 273 | //alphas 274 | *size_bytes += sizeof(double) * S * maxT * minibatch; 275 | 276 | //denoms 277 | *size_bytes += sizeof(double) * maxT * minibatch; 278 | 279 | //probs (since we will pass in activations) 280 | *size_bytes += sizeof(double) * alphabet_size * maxT * minibatch; 281 | 282 | } else { 283 | //cpu can eventually replace all minibatch with 284 | //max number of concurrent threads if memory is 285 | //really tight 286 | 287 | //per minibatch memory 288 | size_t per_minibatch_bytes = 0; 289 | 290 | //output 291 | per_minibatch_bytes += sizeof(double) * alphabet_size ; 292 | 293 | //alphas 294 | per_minibatch_bytes += sizeof(double) * S * maxT; 295 | 296 | //betas 297 | per_minibatch_bytes += sizeof(double) * S; 298 | 299 | //labels w/blanks, e_inc, s_inc 300 | per_minibatch_bytes += 3 * sizeof(int) * S; 301 | 302 | *size_bytes = per_minibatch_bytes * minibatch; 303 | 304 | //probs 305 | *size_bytes += sizeof(double) * alphabet_size * maxT * minibatch; 306 | } 307 | 308 | return CTC_STATUS_SUCCESS; 309 | } 310 | 311 | } 312 | -------------------------------------------------------------------------------- /src/ctc_entrypoint.cu: -------------------------------------------------------------------------------- 1 | ctc_entrypoint.cpp -------------------------------------------------------------------------------- /src/reduce.cu: -------------------------------------------------------------------------------- 1 | // Includes, system 2 | // #include 3 | // #include 4 | 5 | // Includes, cuda 6 | // #include 7 | // #include 8 | 9 | // Includes, cuda helper functions 10 | // #include 11 | 12 | // For the functors 13 | #include "detail/ctc_helper.h" 14 | #include "ctc.h" 15 | 16 | const int warp_size = 32; 17 | 18 | template 19 | struct CTAReduce; 20 | 21 | template 22 | struct CTAReduce { 23 | enum { Size = NT, Capacity = NT }; 24 | struct Storage { T shared[Capacity]; }; 25 | 26 | __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { 27 | T* s = storage.shared; 28 | s[tid] = x; 29 | __syncthreads(); 30 | 31 | // Fold the data in half with each pass. 32 | #pragma unroll 33 | for(int offset = NT / 2; offset >= warp_size; offset /= 2) { 34 | if(tid + offset < count && tid < offset) { 35 | // Read from the right half and store to the left half. 
36 | x = g(x, s[offset + tid]); 37 | s[tid] = x; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | T shuff; 43 | for (int offset = warp_size / 2; offset > 0; offset /= 2) { 44 | #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) 45 | shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); 46 | #else 47 | shuff = __shfl_down(x, offset); 48 | #endif 49 | if (tid + offset < count && tid < offset) 50 | x = g(x, shuff); 51 | } 52 | return x; 53 | } 54 | }; 55 | 56 | template 57 | __global__ void reduce_rows(Iop f, Rop g, const T* input, T* output, 58 | int num_rows, int num_cols) { 59 | 60 | typedef CTAReduce R; 61 | __shared__ typename R::Storage storage; 62 | 63 | int tid = threadIdx.x; 64 | int idx = tid; 65 | int col = blockIdx.x; 66 | T curr; 67 | 68 | // Each block works on a column 69 | if (idx < num_rows) 70 | curr = f(input[idx + col*num_rows]); 71 | idx += NT; 72 | 73 | 74 | while (idx < num_rows) { 75 | curr = g(curr, f(input[idx + col*num_rows])); 76 | idx += NT; 77 | } 78 | 79 | // Sum thread-totals over the CTA. 80 | curr = R::reduce(tid, curr, storage, num_rows, g); 81 | 82 | // Store result in out 83 | if (tid == 0) 84 | output[col] = curr; 85 | } 86 | 87 | template 88 | __global__ void reduce_cols(Iop f, Rop g, const T* input, T* output, 89 | int num_rows, int num_cols) { 90 | 91 | __shared__ T s[NT]; 92 | 93 | int warps_per_block = NT / warp_size; 94 | int row = blockDim.x * blockIdx.x + threadIdx.x; 95 | int col = threadIdx.y; 96 | T curr; 97 | 98 | if (row < num_rows && col < num_cols) { 99 | curr = f(input[row + col*num_rows]); 100 | col += blockDim.y; 101 | while (col < num_cols) { 102 | curr = g(curr, f(input[row + col*num_rows])); 103 | col += blockDim.y; 104 | } 105 | } 106 | s[threadIdx.x * warps_per_block + threadIdx.y] = curr; 107 | __syncthreads(); 108 | 109 | // Reduce 110 | if (threadIdx.y == 0 && row < num_rows) { 111 | #pragma unroll 112 | for (int i = 1; i < warps_per_block && i < num_cols; ++i) 113 | curr = g(curr, s[i + threadIdx.x * warps_per_block]); 114 | output[row] = curr; 115 | } 116 | } 117 | 118 | struct ReduceHelper { 119 | 120 | template 121 | static void impl(Iof f, Rof g, const T* input, T* output, int num_rows, int num_cols, bool axis, gpuStream_t stream) { 122 | 123 | int grid_size; 124 | 125 | if (axis) { 126 | grid_size = num_cols; 127 | reduce_rows<128><<>> 128 | (f, g, input, output, num_rows, num_cols); 129 | 130 | } else { 131 | dim3 tpb(warp_size, 128 / warp_size); 132 | grid_size = (num_cols + warp_size - 1)/warp_size; 133 | reduce_cols<128><<>> 134 | (f, g, input, output, num_rows, num_cols); 135 | 136 | } 137 | } 138 | }; 139 | 140 | 141 | template 142 | ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream) { 143 | ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); 144 | 145 | #ifdef __HIPCC__ 146 | hipStreamSynchronize(stream); 147 | gpuError_t err = hipGetLastError(); 148 | #else 149 | cudaStreamSynchronize(stream); 150 | gpuError_t err = cudaGetLastError(); 151 | #endif 152 | 153 | if (err != gpuSuccess) 154 | return CTC_STATUS_EXECUTION_FAILED; 155 | 156 | return CTC_STATUS_SUCCESS; 157 | } 158 | template 159 | ctcStatus_t reduce_negate(const T *input, T *output, int rows, int cols, bool axis, gpuStream_t stream) { 160 | return reduce(ctc_helper::negate(), ctc_helper::add(), input, output, rows, cols, axis, stream); 161 | } 162 | template ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, gpuStream_t stream); 163 | 
template ctcStatus_t reduce_negate(const double *input, double *output, int rows, int cols, bool axis, gpuStream_t stream); 164 | 165 | template 166 | ctcStatus_t reduce_exp(const T *input, T *output, int rows, int cols, bool axis, gpuStream_t stream) { 167 | return reduce(ctc_helper::exponential(), ctc_helper::add(), input, output, rows, cols, axis, stream); 168 | } 169 | template ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, gpuStream_t stream); 170 | template ctcStatus_t reduce_exp(const double *input, double *output, int rows, int cols, bool axis, gpuStream_t stream); 171 | 172 | template 173 | ctcStatus_t reduce_max(const T *input, T *output, int rows, int cols, bool axis, gpuStream_t stream) { 174 | return reduce(ctc_helper::identity(), ctc_helper::maximum(),input, output, rows, cols, axis, stream); 175 | } 176 | template ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, gpuStream_t stream); 177 | template ctcStatus_t reduce_max(const double *input, double *output, int rows, int cols, bool axis, gpuStream_t stream); 178 | -------------------------------------------------------------------------------- /tensorflow_binding/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | dist 3 | *.egg-info 4 | *.so 5 | include/cuda 6 | *.pyc 7 | -------------------------------------------------------------------------------- /tensorflow_binding/README.md: -------------------------------------------------------------------------------- 1 | 2 | # TensorFlow binding for WarpCTC 3 | 4 | This package provides TensorFlow kernels that wrap the WarpCTC 5 | library. Kernels are provided for both the CTCLoss op already in 6 | TensorFlow, as well as a new WarpCTC op provided in this package. The 7 | WarpCTC op has an interface that more closely matches the native 8 | WarpCTC interface than TensorFlow's CTCLoss op. Note that the CTCLoss 9 | op expects the reserved blank label to be the largest value while the 10 | WarpCTC op takes the reserved blank label value as an attribute which 11 | defaults to `0`. 12 | 13 | ## Installation 14 | 15 | To build the kernels it is necessary to have the TensorFlow source 16 | code available, since TensorFlow doesn't currently install the 17 | necessary headers to handle the SparseTensor that the CTCLoss op uses 18 | to input the labels. You can retrieve the TensorFlow source from 19 | github.com: 20 | 21 | ```bash 22 | git clone https://github.com/tensorflow/tensorflow.git 23 | ``` 24 | 25 | Tell the build scripts where you have the TensorFlow source tree by 26 | setting the `TENSORFLOW_SRC_PATH` environment variable: 27 | 28 | ```bash 29 | export TENSORFLOW_SRC_PATH=/path/to/tensorflow 30 | ``` 31 | 32 | `WARP_CTC_PATH` should be set to the location of a built WarpCTC 33 | (i.e. `libwarpctc.so`). This defaults to `../build`, so from within a 34 | new warp-ctc clone you could build WarpCTC like this: 35 | 36 | ```bash 37 | mkdir build; cd build 38 | cmake .. 39 | make 40 | ``` 41 | 42 | Otherwise, set `WARP_CTC_PATH` to wherever you have `libwarpctc.so` 43 | installed. If you have a GPU, you should also make sure that 44 | `CUDA_HOME` is set to the home cuda directory (i.e. where 45 | `include/cuda.h` and `lib/libcudart.so` live). 
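For example (the paths below are placeholders; adjust them to your setup):

```bash
export WARP_CTC_PATH=/path/to/warp-ctc/build
export CUDA_HOME=/usr/local/cuda   # only needed when building with GPU support
```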
46 | 47 | You should now be able to use `setup.py` to install the package into 48 | your current Python environment: 49 | 50 | ```bash 51 | python setup.py install 52 | ``` 53 | 54 | You can run a few unit tests with `setup.py` as well if you want: 55 | 56 | ```bash 57 | python setup.py test 58 | ``` 59 | 60 | ## Using the kernels 61 | 62 | First import the module: 63 | 64 | ```python 65 | import warpctc_tensorflow 66 | ``` 67 | 68 | The GPU kernel for the existing `CTCLoss` op is registered and ready 69 | to use. If you want to use WarpCTC as the CPU kernel for the 70 | `CTCLoss` op you can use the ("experimental") `_kernel_label_map` 71 | function to tell TensorFlow to use WarpCTC kernels instead of the 72 | default CPU kernel: 73 | 74 | ```python 75 | with tf.get_default_graph()._kernel_label_map({"CTCLoss": "WarpCTC"}): 76 | ... 77 | loss = tf.nn.ctc_loss(inputs, labels, seq_lens) 78 | ``` 79 | 80 | Note that `preprocess_collapse_repeated` must be `False` and 81 | `ctc_merge_repeated` must be `True` (their default values) as these 82 | options are not currently supported. 83 | 84 | The WarpCTC op is available via the `warpctc_tensorflow.ctc` function: 85 | 86 | ```python 87 | costs = warpctc_tensorflow.ctc(activations, flat_labels, label_lengths, input_lengths) 88 | ``` 89 | 90 | The `activations` input is a 3 dimensional Tensor and all the others 91 | are single dimension Tensors. See the main WarpCTC documentation for 92 | more information. 93 | 94 | 95 | -------------------------------------------------------------------------------- /tensorflow_binding/setup.py: -------------------------------------------------------------------------------- 1 | """setup.py script for warp-ctc TensorFlow wrapper""" 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import platform 7 | import re 8 | import setuptools 9 | import sys 10 | import unittest 11 | import warnings 12 | from setuptools.command.build_ext import build_ext as orig_build_ext 13 | 14 | # We need to import tensorflow to find where its include directory is. 15 | try: 16 | import tensorflow as tf 17 | except ImportError: 18 | raise RuntimeError("Tensorflow must be installed to build the tensorflow wrapper.") 19 | 20 | if "CUDA_HOME" not in os.environ: 21 | print("CUDA_HOME not found in the environment so building " 22 | "without GPU support. To build with GPU support " 23 | "please define the CUDA_HOME environment variable. 
" 24 | "This should be a path which contains include/cuda.h", 25 | file=sys.stderr) 26 | enable_gpu = False 27 | else: 28 | enable_gpu = True 29 | 30 | 31 | if "TENSORFLOW_SRC_PATH" not in os.environ: 32 | print("Please define the TENSORFLOW_SRC_PATH environment variable.\n" 33 | "This should be a path to the Tensorflow source directory.", 34 | file=sys.stderr) 35 | sys.exit(1) 36 | 37 | if platform.system() == 'Darwin': 38 | lib_ext = ".dylib" 39 | else: 40 | lib_ext = ".so" 41 | 42 | warp_ctc_path = "../build" 43 | if "WARP_CTC_PATH" in os.environ: 44 | warp_ctc_path = os.environ["WARP_CTC_PATH"] 45 | if not os.path.exists(os.path.join(warp_ctc_path, "libwarpctc"+lib_ext)): 46 | print(("Could not find libwarpctc.so in {}.\n" 47 | "Build warp-ctc and set WARP_CTC_PATH to the location of" 48 | " libwarpctc.so (default is '../build')").format(warp_ctc_path), 49 | file=sys.stderr) 50 | sys.exit(1) 51 | 52 | root_path = os.path.realpath(os.path.dirname(__file__)) 53 | 54 | tf_include = tf.sysconfig.get_include() 55 | tf_src_dir = os.environ["TENSORFLOW_SRC_PATH"] 56 | tf_includes = [tf_include, tf_src_dir] 57 | warp_ctc_includes = [os.path.join(root_path, '../include')] 58 | include_dirs = tf_includes + warp_ctc_includes 59 | 60 | if tf.__version__ >= '1.4': 61 | include_dirs += [tf_include + '/../../external/nsync/public'] 62 | 63 | if os.getenv("TF_CXX11_ABI") is not None: 64 | TF_CXX11_ABI = os.getenv("TF_CXX11_ABI") 65 | else: 66 | warnings.warn("Assuming tensorflow was compiled without C++11 ABI. " 67 | "It is generally true if you are using binary pip package. " 68 | "If you compiled tensorflow from source with gcc >= 5 and didn't set " 69 | "-D_GLIBCXX_USE_CXX11_ABI=0 during compilation, you need to set " 70 | "environment variable TF_CXX11_ABI=1 when compiling this bindings. " 71 | "Also be sure to touch some files in src to trigger recompilation. " 72 | "Also, you need to set (or unsed) this environment variable if getting " 73 | "undefined symbol: _ZN10tensorflow... errors") 74 | TF_CXX11_ABI = "0" 75 | 76 | extra_compile_args = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=' + TF_CXX11_ABI] 77 | # current tensorflow code triggers return type errors, silence those for now 78 | extra_compile_args += ['-Wno-return-type'] 79 | 80 | extra_link_args = [] 81 | if tf.__version__ >= '1.4': 82 | if os.path.exists(os.path.join(tf_src_dir, 'libtensorflow_framework.so')): 83 | extra_link_args = ['-L' + tf.sysconfig.get_lib(), '-ltensorflow_framework'] 84 | 85 | if (enable_gpu): 86 | extra_compile_args += ['-DWARPCTC_ENABLE_GPU'] 87 | include_dirs += [os.path.join(os.environ["CUDA_HOME"], 'include')] 88 | 89 | # mimic tensorflow cuda include setup so that their include command work 90 | if not os.path.exists(os.path.join(root_path, "include")): 91 | os.mkdir(os.path.join(root_path, "include")) 92 | 93 | cuda_inc_path = os.path.join(root_path, "include/cuda") 94 | if not os.path.exists(cuda_inc_path) or os.readlink(cuda_inc_path) != os.environ["CUDA_HOME"]: 95 | if os.path.exists(cuda_inc_path): 96 | os.remove(cuda_inc_path) 97 | os.symlink(os.environ["CUDA_HOME"], cuda_inc_path) 98 | include_dirs += [os.path.join(root_path, 'include')] 99 | 100 | # Ensure that all expected files and directories exist. 
101 | for loc in include_dirs: 102 | if not os.path.exists(loc): 103 | print(("Could not find file or directory {}.\n" 104 | "Check your environment variables and paths?").format(loc), 105 | file=sys.stderr) 106 | sys.exit(1) 107 | 108 | lib_srcs = ['src/ctc_op_kernel.cc', 'src/warpctc_op.cc'] 109 | 110 | ext = setuptools.Extension('warpctc_tensorflow.kernels', 111 | sources = lib_srcs, 112 | language = 'c++', 113 | include_dirs = include_dirs, 114 | library_dirs = [warp_ctc_path], 115 | runtime_library_dirs = [os.path.realpath(warp_ctc_path)], 116 | libraries = ['warpctc', 'tensorflow_framework'], 117 | extra_compile_args = extra_compile_args, 118 | extra_link_args = extra_link_args) 119 | 120 | class build_tf_ext(orig_build_ext): 121 | def build_extensions(self): 122 | self.compiler.compiler_so.remove('-Wstrict-prototypes') 123 | orig_build_ext.build_extensions(self) 124 | 125 | def discover_test_suite(): 126 | test_loader = unittest.TestLoader() 127 | test_suite = test_loader.discover('tests', pattern='test_*.py') 128 | return test_suite 129 | 130 | # Read the README.md file for the long description. This lets us avoid 131 | # duplicating the package description in multiple places in the source. 132 | README_PATH = os.path.join(os.path.dirname(__file__), "README.md") 133 | with open(README_PATH, "r") as handle: 134 | # Extract everything between the first set of ## headlines 135 | LONG_DESCRIPTION = re.search("#.*([^#]*)##", handle.read()).group(1).strip() 136 | 137 | setuptools.setup( 138 | name = "warpctc_tensorflow", 139 | version = "0.1", 140 | description = "TensorFlow wrapper for warp-ctc", 141 | long_description = LONG_DESCRIPTION, 142 | url = "https://github.com/baidu-research/warp-ctc", 143 | author = "Jared Casper", 144 | author_email = "jared.casper@baidu.com", 145 | license = "Apache", 146 | packages = ["warpctc_tensorflow"], 147 | ext_modules = [ext], 148 | cmdclass = {'build_ext': build_tf_ext}, 149 | test_suite = 'setup.discover_test_suite', 150 | ) 151 | -------------------------------------------------------------------------------- /tensorflow_binding/src/ctc_op_kernel.cc: -------------------------------------------------------------------------------- 1 | #ifdef WARPCTC_ENABLE_GPU 2 | #define EIGEN_USE_GPU 3 | #include 4 | #endif 5 | 6 | #include "tensorflow/core/framework/op_kernel.h" 7 | #include "tensorflow/core/kernels/bounds_check.h" 8 | #include "tensorflow/core/util/sparse/sparse_tensor.h" 9 | 10 | #include "ctc.h" 11 | 12 | namespace tf = tensorflow; 13 | 14 | namespace warp_ctc { 15 | 16 | class CTCLossOpBase : public tf::OpKernel { 17 | public: 18 | explicit CTCLossOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) { 19 | bool preprocess_collapse_repeated; 20 | OP_REQUIRES_OK(ctx, ctx->GetAttr("preprocess_collapse_repeated", 21 | &preprocess_collapse_repeated)); 22 | OP_REQUIRES(ctx, preprocess_collapse_repeated == false, 23 | tf::errors::InvalidArgument("preprocess collapse repeated is not currently " 24 | "supported in the WarpCTC kernel.")); 25 | 26 | bool ctc_merge_repeated; 27 | OP_REQUIRES_OK(ctx, ctx->GetAttr("ctc_merge_repeated", &ctc_merge_repeated)); 28 | OP_REQUIRES(ctx, ctc_merge_repeated == true, 29 | tf::errors::InvalidArgument("ctc_merge_repeated == false is not currently " 30 | "supported. 
WarpCTC always merges repeated symbols.")); 31 | } 32 | 33 | void Compute(tf::OpKernelContext* ctx) override { 34 | const tf::Tensor* inputs; 35 | const tf::Tensor* labels_indices; 36 | const tf::Tensor* labels_values; 37 | const tf::Tensor* seq_len; 38 | OP_REQUIRES_OK(ctx, ctx->input("inputs", &inputs)); 39 | OP_REQUIRES_OK(ctx, ctx->input("labels_indices", &labels_indices)); 40 | OP_REQUIRES_OK(ctx, ctx->input("labels_values", &labels_values)); 41 | OP_REQUIRES_OK(ctx, ctx->input("sequence_length", &seq_len)); 42 | 43 | OP_REQUIRES(ctx, inputs->shape().dims() == 3, 44 | tf::errors::InvalidArgument("inputs is not a 3-Tensor")); 45 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(seq_len->shape()), 46 | tf::errors::InvalidArgument("sequence_length is not a vector")); 47 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsMatrix(labels_indices->shape()), 48 | tf::errors::InvalidArgument("labels_indices is not a matrix")); 49 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(labels_values->shape()), 50 | tf::errors::InvalidArgument("labels_values is not a vector")); 51 | 52 | const auto& inputs_shape = inputs->shape(); 53 | const auto max_time = inputs_shape.dim_size(0); 54 | const auto batch_size = inputs_shape.dim_size(1); 55 | const auto num_classes_raw = inputs_shape.dim_size(2); 56 | OP_REQUIRES( 57 | ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits::max()), 58 | tf::errors::InvalidArgument("num_classes cannot exceed max int")); 59 | const auto num_classes = static_cast(num_classes_raw); 60 | 61 | OP_REQUIRES( 62 | ctx, batch_size == seq_len->dim_size(0), 63 | tf::errors::InvalidArgument("len(sequence_length) != batch_size. ", 64 | "len(sequence_length): ", seq_len->dim_size(0), 65 | " batch_size: ", batch_size)); 66 | auto seq_len_t = seq_len->vec(); 67 | 68 | OP_REQUIRES(ctx, labels_indices->dim_size(0) == labels_values->dim_size(0), 69 | tf::errors::InvalidArgument( 70 | "labels_indices and labels_values must contain the " 71 | "same number of rows, but saw shapes: ", 72 | labels_indices->shape().DebugString(), " vs. ", 73 | labels_values->shape().DebugString())); 74 | 75 | auto labels_shape = tf::TensorShape({batch_size, max_time}); 76 | auto order = std::vector{0, 1}; 77 | auto labels_sp = tf::sparse::SparseTensor(*labels_indices, *labels_values, 78 | labels_shape, order); 79 | 80 | auto labels_sp_valid = labels_sp.IndicesValid(); 81 | OP_REQUIRES(ctx, labels_sp_valid.ok(), 82 | tf::errors::InvalidArgument("label SparseTensor is not valid: ", 83 | labels_sp_valid.error_message())); 84 | 85 | auto label_lengths = std::vector{}; 86 | for (const auto& g : labels_sp.group({0})) { // iterate by batch 87 | const auto batch_indices = g.group()[0]; 88 | OP_REQUIRES(ctx, tf::FastBoundsCheck(batch_indices, batch_size), 89 | tf::errors::InvalidArgument("labels batch index must be between ", 90 | 0, " and ", batch_size, " but saw: ", 91 | batch_indices)); 92 | 93 | auto values = g.values(); 94 | label_lengths.push_back(values.size()); 95 | } 96 | auto label_values_t = labels_values->vec(); 97 | 98 | 99 | OP_REQUIRES(ctx, static_cast(batch_size) == label_lengths.size(), 100 | tf::errors::InvalidArgument("len(labels) != batch_size. 
", 101 | "len(labels): ", label_lengths.size(), 102 | " batch_size: ", batch_size)); 103 | 104 | for (int b = 0; b < batch_size; ++b) { 105 | OP_REQUIRES( 106 | ctx, seq_len_t(b) <= max_time, 107 | tf::errors::InvalidArgument("sequence_length(", b, ") <= ", max_time)); 108 | } 109 | 110 | tf::Tensor* loss = nullptr; 111 | OP_REQUIRES_OK(ctx, ctx->allocate_output("loss", seq_len->shape(), &loss)); 112 | auto loss_t = loss->vec(); 113 | 114 | tf::Tensor* gradient; 115 | OP_REQUIRES_OK(ctx, 116 | ctx->allocate_output("gradient", inputs_shape, &gradient)); 117 | set_zero(gradient); 118 | auto gradient_t = gradient->tensor(); 119 | 120 | auto inputs_t = inputs->tensor(); 121 | 122 | auto options = create_options(ctx); 123 | options.blank_label = num_classes - 1; 124 | 125 | size_t workspace_size_bytes; 126 | auto warp_status = get_workspace_size(label_lengths.data(), seq_len_t.data(), 127 | num_classes, batch_size, 128 | options, &workspace_size_bytes); 129 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 130 | tf::errors::Internal("warp_ctc error in get_workspace_size: ", 131 | ctcGetStatusString(warp_status))); 132 | 133 | auto workspace_shape = tf::TensorShape{static_cast(workspace_size_bytes)}; 134 | tf::Tensor workspace; 135 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace)); 136 | auto workspace_t = workspace.flat(); 137 | 138 | warp_status = compute_ctc_loss(inputs_t.data(), 139 | gradient_t.data(), 140 | label_values_t.data(), 141 | label_lengths.data(), 142 | seq_len_t.data(), 143 | num_classes, batch_size, 144 | loss_t.data(), workspace_t.data(), options); 145 | 146 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 147 | tf::errors::Internal("warp_ctc error in compute_ctc_loss: ", 148 | ctcGetStatusString(warp_status))); 149 | 150 | } 151 | 152 | private: 153 | virtual void set_zero(tf::Tensor* t) = 0; 154 | virtual ctcOptions create_options(tf::OpKernelContext* ctx) = 0; 155 | }; 156 | 157 | class CTCLossOpCPU : public CTCLossOpBase { 158 | public: 159 | explicit CTCLossOpCPU(tf::OpKernelConstruction* ctx) : CTCLossOpBase(ctx) { 160 | } 161 | 162 | private: 163 | void set_zero(tf::Tensor* t) override { 164 | t->flat().setZero(); 165 | } 166 | 167 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 168 | auto options = ctcOptions{}; 169 | options.loc = CTC_CPU; 170 | options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads; 171 | return options; 172 | } 173 | }; 174 | 175 | REGISTER_KERNEL_BUILDER(Name("CTCLoss") 176 | .Device(::tensorflow::DEVICE_CPU) 177 | .Label("WarpCTC"), 178 | CTCLossOpCPU); 179 | 180 | #ifdef WARPCTC_ENABLE_GPU 181 | 182 | class CTCLossOpGPU : public CTCLossOpBase { 183 | public: 184 | explicit CTCLossOpGPU(tf::OpKernelConstruction* ctx) : CTCLossOpBase(ctx) { 185 | } 186 | 187 | private: 188 | void set_zero(tf::Tensor* t) override { 189 | cudaMemset(t->flat().data(), 0, t->NumElements()*sizeof(float)); 190 | } 191 | 192 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 193 | auto cuda_stream = ctx->eigen_device().stream(); 194 | auto options = ctcOptions{}; 195 | options.loc = CTC_GPU; 196 | options.stream = cuda_stream; 197 | return options; 198 | } 199 | }; 200 | 201 | // Register GPU kernel both with and without the label 202 | REGISTER_KERNEL_BUILDER(Name("CTCLoss") 203 | .Device(::tensorflow::DEVICE_GPU) 204 | .Label("WarpCTC") 205 | .HostMemory("labels_indices") 206 | .HostMemory("labels_values") 207 | .HostMemory("sequence_length") 208 | .HostMemory("loss"), 
209 | CTCLossOpGPU); 210 | REGISTER_KERNEL_BUILDER(Name("CTCLoss") 211 | .Device(::tensorflow::DEVICE_GPU) 212 | .HostMemory("labels_indices") 213 | .HostMemory("labels_values") 214 | .HostMemory("sequence_length") 215 | .HostMemory("loss"), 216 | CTCLossOpGPU); 217 | 218 | #undef EIGEN_USE_GPU 219 | #endif 220 | 221 | } 222 | -------------------------------------------------------------------------------- /tensorflow_binding/src/warpctc_op.cc: -------------------------------------------------------------------------------- 1 | #ifdef WARPCTC_ENABLE_GPU 2 | #define EIGEN_USE_GPU 3 | #include 4 | #endif 5 | 6 | #include "tensorflow/core/framework/op.h" 7 | #include "tensorflow/core/framework/op_kernel.h" 8 | #include "tensorflow/core/kernels/bounds_check.h" 9 | #include "tensorflow/core/framework/allocator.h" 10 | #include "ctc.h" 11 | 12 | 13 | REGISTER_OP("WarpCTC") 14 | .Input("activations: float32") 15 | .Input("flat_labels: int32") 16 | .Input("label_lengths: int32") 17 | .Input("input_lengths: int32") 18 | .Attr("blank_label: int = 0") 19 | .Output("costs: float32") 20 | .Output("gradients: float32"); 21 | 22 | namespace tf = tensorflow; 23 | 24 | namespace warp_ctc { 25 | 26 | class WarpCTCOpBase : public tf::OpKernel { 27 | public: 28 | explicit WarpCTCOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) { 29 | OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_label", &blank_label_)); 30 | } 31 | 32 | void Compute(tf::OpKernelContext* ctx) override { 33 | // Grab the input tensors 34 | const tf::Tensor* activations; 35 | const tf::Tensor* flat_labels; 36 | const tf::Tensor* label_lengths; 37 | const tf::Tensor* input_lengths; 38 | OP_REQUIRES_OK(ctx, ctx->input("activations", &activations)); 39 | OP_REQUIRES_OK(ctx, ctx->input("flat_labels", &flat_labels)); 40 | OP_REQUIRES_OK(ctx, ctx->input("label_lengths", &label_lengths)); 41 | OP_REQUIRES_OK(ctx, ctx->input("input_lengths", &input_lengths)); 42 | 43 | OP_REQUIRES(ctx, activations->shape().dims() == 3, 44 | tf::errors::InvalidArgument("activations is not a 3-Tensor")); 45 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(flat_labels->shape()), 46 | tf::errors::InvalidArgument("flat_labels is not a vector")); 47 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(label_lengths->shape()), 48 | tf::errors::InvalidArgument("label_lengths is not a vector")); 49 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(input_lengths->shape()), 50 | tf::errors::InvalidArgument("input_lengths is not a vector")); 51 | 52 | const auto& acts_shape = activations->shape(); 53 | const auto max_time = acts_shape.dim_size(0); 54 | const auto batch_size = acts_shape.dim_size(1); 55 | const auto num_classes_raw = acts_shape.dim_size(2); 56 | 57 | auto activations_t = activations->tensor(); 58 | auto flat_labels_t = flat_labels->vec(); 59 | 60 | OP_REQUIRES( 61 | ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits::max()), 62 | tf::errors::InvalidArgument("num_classes cannot exceed max int")); 63 | const auto alphabet_size = static_cast(num_classes_raw); 64 | 65 | OP_REQUIRES( 66 | ctx, batch_size == input_lengths->dim_size(0), 67 | tf::errors::InvalidArgument("len(input_lengths) != batch_size. ", 68 | "len(input_length): ", input_lengths->dim_size(0), 69 | " batch_size: ", batch_size)); 70 | auto input_lengths_t = input_lengths->vec(); 71 | 72 | OP_REQUIRES( 73 | ctx, batch_size == label_lengths->dim_size(0), 74 | tf::errors::InvalidArgument("len(label_lengths) != batch_size. 
", 75 | "len(label_length): ", label_lengths->dim_size(0), 76 | " batch_size: ", batch_size)); 77 | auto label_lengths_t = label_lengths->vec(); 78 | 79 | // check that labels are in the alphabet? 80 | 81 | for (int b = 0; b < batch_size; b++) { 82 | OP_REQUIRES(ctx, input_lengths_t(b) <= max_time, 83 | tf::errors::InvalidArgument("input_lengths(", b, ") <= ", max_time)); 84 | } 85 | 86 | tf::Tensor* costs = nullptr; 87 | OP_REQUIRES_OK(ctx, ctx->allocate_output("costs", input_lengths->shape(), &costs)); 88 | auto costs_t = costs->vec(); 89 | 90 | tf::Tensor* grads = nullptr; 91 | OP_REQUIRES_OK(ctx, ctx->allocate_output("gradients", activations->shape(), 92 | &grads)); 93 | set_zero(grads); 94 | auto grads_t = grads->tensor(); 95 | 96 | auto options = create_options(ctx); 97 | options.blank_label = blank_label_; 98 | 99 | size_t workspace_size_bytes; 100 | auto warp_status = get_workspace_size(label_lengths_t.data(), 101 | input_lengths_t.data(), 102 | alphabet_size, batch_size, 103 | options, &workspace_size_bytes); 104 | 105 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 106 | tf::errors::Internal("warp_ctc error in get_workspace_size: ", 107 | ctcGetStatusString(warp_status))); 108 | 109 | auto workspace_shape = tf::TensorShape{static_cast(workspace_size_bytes)}; 110 | tf::Tensor workspace; 111 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace)); 112 | auto workspace_t = workspace.flat(); 113 | 114 | // compute CTC 115 | warp_status = compute_ctc_loss(activations_t.data(), 116 | grads_t.data(), 117 | flat_labels_t.data(), 118 | label_lengths_t.data(), 119 | input_lengths_t.data(), 120 | alphabet_size, batch_size, 121 | costs_t.data(), workspace_t.data(), options); 122 | 123 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 124 | tf::errors::Internal("warp_ctc error in compute_ctc_loss: ", 125 | ctcGetStatusString(warp_status))); 126 | 127 | } 128 | private: 129 | int blank_label_; 130 | virtual void set_zero(tf::Tensor* t) = 0; 131 | virtual ctcOptions create_options(tf::OpKernelContext* ctx) = 0; 132 | }; 133 | 134 | class WarpCTCOpCPU : public WarpCTCOpBase { 135 | public: 136 | explicit WarpCTCOpCPU(tf::OpKernelConstruction* ctx) : WarpCTCOpBase(ctx) { 137 | } 138 | 139 | private: 140 | void set_zero(tf::Tensor* t) override { 141 | t->flat().setZero(); 142 | } 143 | 144 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 145 | auto options = ctcOptions{}; 146 | options.loc = CTC_CPU; 147 | options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads; 148 | return options; 149 | } 150 | }; 151 | 152 | REGISTER_KERNEL_BUILDER(Name("WarpCTC").Device(::tensorflow::DEVICE_CPU), WarpCTCOpCPU); 153 | 154 | #ifdef WARPCTC_ENABLE_GPU 155 | 156 | class WarpCTCOpGPU : public WarpCTCOpBase { 157 | public: 158 | explicit WarpCTCOpGPU(tf::OpKernelConstruction* ctx) : WarpCTCOpBase(ctx) { 159 | } 160 | 161 | private: 162 | void set_zero(tf::Tensor* t) override { 163 | cudaMemset(t->flat().data(), 0, t->NumElements()*sizeof(float)); 164 | } 165 | 166 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 167 | auto cuda_stream = ctx->eigen_device().stream(); 168 | auto options = ctcOptions{}; 169 | options.loc = CTC_GPU; 170 | options.stream = cuda_stream; 171 | return options; 172 | } 173 | }; 174 | 175 | REGISTER_KERNEL_BUILDER(Name("WarpCTC").Device(::tensorflow::DEVICE_GPU) 176 | .HostMemory("flat_labels") 177 | .HostMemory("label_lengths") 178 | .HostMemory("input_lengths") 179 | .HostMemory("costs"), 180 | 
WarpCTCOpGPU); 181 | #undef EIGEN_USE_GPU 182 | #endif 183 | 184 | } 185 | 186 | -------------------------------------------------------------------------------- /tensorflow_binding/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu-research/warp-ctc/94b2fa178347cf02757bdc7329dc2f1b46f5d094/tensorflow_binding/tests/__init__.py -------------------------------------------------------------------------------- /tensorflow_binding/tests/test_ctc_loss_op.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for tensorflow.ctc_ops.ctc_decoder_ops.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import warpctc_tensorflow 24 | from tensorflow.python.client import device_lib 25 | 26 | 27 | def SimpleSparseTensorFrom(x): 28 | """Create a very simple SparseTensor with dimensions (batch, time). 29 | 30 | Args: 31 | x: a list of lists of type int 32 | 33 | Returns: 34 | x_ix and x_val, the indices and values of the SparseTensor<2>. 
35 | """ 36 | x_ix = [] 37 | x_val = [] 38 | for batch_i, batch in enumerate(x): 39 | for time, val in enumerate(batch): 40 | x_ix.append([batch_i, time]) 41 | x_val.append(val) 42 | x_shape = [len(x), np.asarray(x_ix).max(0)[1]+1] 43 | x_ix = tf.constant(x_ix, tf.int64) 44 | x_val = tf.constant(x_val, tf.int32) 45 | x_shape = tf.constant(x_shape, tf.int64) 46 | 47 | return tf.SparseTensor(x_ix, x_val, x_shape) 48 | 49 | def is_gpu_available(): 50 | """Returns whether TensorFlow can access a GPU.""" 51 | return any(x.device_type == 'GPU' for x in device_lib.list_local_devices()) 52 | 53 | class CTCLossTest(tf.test.TestCase): 54 | 55 | def _testCTCLoss(self, inputs, seq_lens, labels, 56 | loss_truth, grad_truth, 57 | use_gpu=False, expected_err_re=None): 58 | self.assertEqual(len(inputs), len(grad_truth)) 59 | 60 | inputs_t = tf.constant(inputs) 61 | 62 | log_dev_placement = False 63 | if not use_gpu: 64 | # Note: using use_gpu=False seems to not work 65 | # it runs the GPU version instead 66 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 67 | device_count={'GPU': 0}) 68 | else: 69 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 70 | allow_soft_placement=False) 71 | 72 | with tf.get_default_graph()._kernel_label_map({"CTCLoss": "WarpCTC"}): 73 | with self.test_session(use_gpu=use_gpu, force_gpu=use_gpu, config=config) as sess: 74 | loss = tf.nn.ctc_loss(inputs=inputs_t, 75 | labels=labels, 76 | sequence_length=seq_lens) 77 | grad = tf.gradients(loss, [inputs_t])[0] 78 | 79 | self.assertShapeEqual(loss_truth, loss) 80 | self.assertShapeEqual(grad_truth, grad) 81 | 82 | if expected_err_re is None: 83 | (tf_loss, tf_grad) = sess.run([loss, grad]) 84 | self.assertAllClose(tf_loss, loss_truth, rtol=1e-4, atol=1e-4) 85 | self.assertAllClose(tf_grad, grad_truth, rtol=1e-4, atol=1e-4) 86 | else: 87 | with self.assertRaisesOpError(expected_err_re): 88 | sess.run([loss, grad]) 89 | 90 | def _testBasic(self, use_gpu): 91 | """Test two batch entries.""" 92 | # Input and ground truth from Alex Graves' implementation. 
93 | # 94 | #### Batch entry 0 ##### 95 | # targets: 0 1 2 1 0 96 | # outputs: 97 | # 0 0.633766 0.221185 0.0917319 0.0129757 0.0142857 0.0260553 98 | # 1 0.111121 0.588392 0.278779 0.0055756 0.00569609 0.010436 99 | # 2 0.0357786 0.633813 0.321418 0.00249248 0.00272882 0.0037688 100 | # 3 0.0663296 0.643849 0.280111 0.00283995 0.0035545 0.00331533 101 | # 4 0.458235 0.396634 0.123377 0.00648837 0.00903441 0.00623107 102 | # alpha: 103 | # 0 -3.64753 -0.456075 -inf -inf -inf -inf -inf -inf -inf -inf -inf 104 | # 1 -inf -inf -inf -0.986437 -inf -inf -inf -inf -inf -inf -inf 105 | # 2 -inf -inf -inf -inf -inf -2.12145 -inf -inf -inf -inf -inf 106 | # 3 -inf -inf -inf -inf -inf -inf -inf -2.56174 -inf -inf -inf 107 | # 4 -inf -inf -inf -inf -inf -inf -inf -inf -inf -3.34211 -inf 108 | # beta: 109 | # 0 -inf -2.88604 -inf -inf -inf -inf -inf -inf -inf -inf -inf 110 | # 1 -inf -inf -inf -2.35568 -inf -inf -inf -inf -inf -inf -inf 111 | # 2 -inf -inf -inf -inf -inf -1.22066 -inf -inf -inf -inf -inf 112 | # 3 -inf -inf -inf -inf -inf -inf -inf -0.780373 -inf -inf -inf 113 | # 4 -inf -inf -inf -inf -inf -inf -inf -inf -inf 0 0 114 | # prob: -3.34211 115 | # outputDerivs: 116 | # 0 -0.366234 0.221185 0.0917319 0.0129757 0.0142857 0.0260553 117 | # 1 0.111121 -0.411608 0.278779 0.0055756 0.00569609 0.010436 118 | # 2 0.0357786 0.633813 -0.678582 0.00249248 0.00272882 0.0037688 119 | # 3 0.0663296 -0.356151 0.280111 0.00283995 0.0035545 0.00331533 120 | # 4 -0.541765 0.396634 0.123377 0.00648837 0.00903441 0.00623107 121 | # 122 | #### Batch entry 1 ##### 123 | # 124 | # targets: 0 1 1 0 125 | # outputs: 126 | # 0 0.30176 0.28562 0.0831517 0.0862751 0.0816851 0.161508 127 | # 1 0.24082 0.397533 0.0557226 0.0546814 0.0557528 0.19549 128 | # 2 0.230246 0.450868 0.0389607 0.038309 0.0391602 0.202456 129 | # 3 0.280884 0.429522 0.0326593 0.0339046 0.0326856 0.190345 130 | # 4 0.423286 0.315517 0.0338439 0.0393744 0.0339315 0.154046 131 | # alpha: 132 | # 0 -1.8232 -1.19812 -inf -inf -inf -inf -inf -inf -inf 133 | # 1 -inf -2.19315 -2.83037 -2.1206 -inf -inf -inf -inf -inf 134 | # 2 -inf -inf -inf -2.03268 -3.71783 -inf -inf -inf -inf 135 | # 3 -inf -inf -inf -inf -inf -4.56292 -inf -inf -inf 136 | # 4 -inf -inf -inf -inf -inf -inf -inf -5.42262 -inf 137 | # beta: 138 | # 0 -inf -4.2245 -inf -inf -inf -inf -inf -inf -inf 139 | # 1 -inf -inf -inf -3.30202 -inf -inf -inf -inf -inf 140 | # 2 -inf -inf -inf -inf -1.70479 -0.856738 -inf -inf -inf 141 | # 3 -inf -inf -inf -inf -inf -0.859706 -0.859706 -0.549337 -inf 142 | # 4 -inf -inf -inf -inf -inf -inf -inf 0 0 143 | # prob: -5.42262 144 | # outputDerivs: 145 | # 0 -0.69824 0.28562 0.0831517 0.0862751 0.0816851 0.161508 146 | # 1 0.24082 -0.602467 0.0557226 0.0546814 0.0557528 0.19549 147 | # 2 0.230246 0.450868 0.0389607 0.038309 0.0391602 -0.797544 148 | # 3 0.280884 -0.570478 0.0326593 0.0339046 0.0326856 0.190345 149 | # 4 -0.576714 0.315517 0.0338439 0.0393744 0.0339315 0.154046 150 | 151 | # max_time_steps == 7 152 | depth = 6 153 | 154 | # seq_len_0 == 5 155 | targets_0 = [0, 1, 2, 1, 0] 156 | loss_log_prob_0 = -3.34211 157 | # dimensions are time x depth 158 | input_prob_matrix_0 = np.asarray( 159 | [[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], 160 | [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], 161 | [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], 162 | [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], 163 | [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 
0.00623107]], 164 | dtype=np.float32) 165 | input_log_prob_matrix_0 = np.log(input_prob_matrix_0) 166 | gradient_log_prob_0 = np.asarray( 167 | [[-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], 168 | [0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436], 169 | [0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688], 170 | [0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533], 171 | [-0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], 172 | dtype=np.float32) 173 | 174 | # seq_len_1 == 5 175 | targets_1 = [0, 1, 1, 0] 176 | loss_log_prob_1 = -5.42262 177 | # dimensions are time x depth 178 | 179 | input_prob_matrix_1 = np.asarray( 180 | [[0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], 181 | [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549], 182 | [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456], 183 | [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345], 184 | [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], 185 | dtype=np.float32) 186 | input_log_prob_matrix_1 = np.log(input_prob_matrix_1) 187 | gradient_log_prob_1 = np.asarray( 188 | [[-0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], 189 | [0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549], 190 | [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544], 191 | [0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345], 192 | [-0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], 193 | dtype=np.float32) 194 | 195 | # len max_time_steps array of 2 x depth matrices 196 | inputs = [np.vstack([input_log_prob_matrix_0[t, :], 197 | input_log_prob_matrix_1[t, :]]) 198 | for t in range(5)] + 2 * [np.nan*np.ones((2, depth), np.float32)] 199 | 200 | # convert inputs into [max_time x batch_size x depth tensor] Tensor 201 | inputs = np.asarray(inputs, dtype=np.float32) 202 | 203 | # len batch_size array of label vectors 204 | labels = SimpleSparseTensorFrom([targets_0, targets_1]) 205 | 206 | # batch_size length vector of sequence_lengths 207 | seq_lens = np.array([5, 5], dtype=np.int32) 208 | 209 | # output: batch_size length vector of negative log probabilities 210 | loss_truth = np.array([-loss_log_prob_0, -loss_log_prob_1], np.float32) 211 | 212 | # output: len max_time_steps array of 2 x depth matrices 213 | grad_truth = [np.vstack([gradient_log_prob_0[t, :], 214 | gradient_log_prob_1[t, :]]) 215 | for t in range(5)] + 2 * [np.zeros((2, depth), np.float32)] 216 | 217 | # convert grad_truth into [max_time x batch_size x depth] Tensor 218 | grad_truth = np.asarray(grad_truth, dtype=np.float32) 219 | 220 | self._testCTCLoss(inputs, seq_lens, labels, loss_truth, grad_truth, use_gpu=use_gpu) 221 | 222 | def testBasicCPU(self): 223 | self._testBasic(use_gpu=False) 224 | 225 | def testBasicGPU(self): 226 | if (is_gpu_available()): 227 | self._testBasic(use_gpu=True) 228 | else: 229 | print("Skipping GPU test, no gpus available") 230 | 231 | if __name__ == "__main__": 232 | tf.test.main() 233 | -------------------------------------------------------------------------------- /tensorflow_binding/tests/test_warpctc_op.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from warpctc_tensorflow import ctc 4 | from tensorflow.python.client import device_lib 5 | 6 | def is_gpu_available(): 7 | """Returns whether TensorFlow can access a GPU.""" 8 | return any(x.device_type == 'GPU' for x 
in device_lib.list_local_devices()) 9 | 10 | class WarpCTCTest(tf.test.TestCase): 11 | 12 | def _run_ctc(self, activations, input_lengths, 13 | flat_labels, label_lengths, 14 | expected_costs, expected_gradients, 15 | use_gpu=False, expected_error=None): 16 | self.assertEquals(activations.shape, expected_gradients.shape) 17 | activations_t = tf.constant(activations) 18 | input_lengths_t = tf.constant(input_lengths) 19 | flat_labels_t = tf.constant(flat_labels) 20 | label_lengths_t = tf.constant(label_lengths) 21 | costs = ctc(activations=activations_t, 22 | flat_labels=flat_labels_t, 23 | label_lengths=label_lengths_t, 24 | input_lengths=input_lengths_t) 25 | 26 | grad = tf.gradients(costs, [activations_t])[0] 27 | 28 | self.assertShapeEqual(expected_costs, costs) 29 | 30 | self.assertShapeEqual(expected_gradients, grad) 31 | 32 | log_dev_placement = False 33 | if not use_gpu: 34 | # Note: using use_gpu=False seems to not work 35 | # it runs the GPU version instead 36 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 37 | device_count={'GPU': 0}) 38 | else: 39 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 40 | allow_soft_placement=False) 41 | 42 | with self.test_session(use_gpu=use_gpu, force_gpu=use_gpu, config=config) as sess: 43 | if expected_error is None: 44 | (tf_costs, tf_grad) = sess.run([costs, grad]) 45 | self.assertAllClose(tf_costs, expected_costs, atol=1e-6) 46 | self.assertAllClose(tf_grad, expected_gradients, atol=1e-6) 47 | else: 48 | with self.assertRaisesOpError(expected_error): 49 | sess.run([costs, grad]) 50 | 51 | sess.run([costs, grad]) 52 | 53 | def _test_basic(self, use_gpu): 54 | # Softmax activations for the following inputs: 55 | activations = np.array([ 56 | [0.1, 0.6, 0.1, 0.1, 0.1], 57 | [0.1, 0.1, 0.6, 0.1, 0.1] 58 | ], dtype=np.float32) 59 | 60 | alphabet_size = 5 61 | # dimensions should be t, n, p: (t timesteps, n minibatches, 62 | # p prob of each alphabet). This is one instance, so expand 63 | # dimensions in the middle 64 | activations = np.expand_dims(activations, 1) 65 | labels = np.asarray([1, 2], dtype=np.int32) 66 | expected_costs = np.asarray([2.46286], dtype=np.float32) 67 | gradients = np.asarray([ 68 | [0.177031, -0.708125, 0.177031, 0.177031, 0.177031], 69 | [0.177031, 0.177031, -0.708125, 0.177031, 0.177031] 70 | ], dtype=np.float32) 71 | expected_gradients = np.expand_dims(gradients, 1) 72 | label_lengths = np.asarray([2], dtype=np.int32) 73 | input_lengths = np.asarray([2], dtype=np.int32) 74 | 75 | self._run_ctc(activations=activations, 76 | input_lengths=input_lengths, 77 | flat_labels=labels, label_lengths=label_lengths, 78 | expected_costs=expected_costs, 79 | expected_gradients=expected_gradients, 80 | use_gpu=use_gpu) 81 | 82 | def test_basic_cpu(self): 83 | self._test_basic(use_gpu=False) 84 | 85 | def test_basic_gpu(self): 86 | if (is_gpu_available()): 87 | self._test_basic(use_gpu=True) 88 | else: 89 | print("Skipping GPU test, no gpus available") 90 | 91 | def _test_multiple_batches(self, use_gpu): 92 | activations = np.array([ 93 | [0.1, 0.6, 0.1, 0.1, 0.1], 94 | [0.1, 0.1, 0.6, 0.1, 0.1] 95 | ], dtype=np.float32) 96 | 97 | alphabet_size = 5 98 | # dimensions should be t, n, p: (t timesteps, n minibatches, 99 | # p prob of each alphabet). 
This is one instance, so expand 100 | # dimensions in the middle 101 | _activations = np.expand_dims(activations, 1) 102 | activations = np.concatenate([_activations, _activations[...]], axis=1) 103 | labels = np.asarray([1, 2, 1, 2], dtype=np.int32) 104 | expected_costs = np.asarray([2.46286, 2.46286], dtype=np.float32) 105 | gradients = np.asarray([ 106 | [0.177031, -0.708125, 0.177031, 0.177031, 0.177031], 107 | [0.177031, 0.177031, -0.708125, 0.177031, 0.177031] 108 | ], dtype=np.float32) 109 | _expected_gradients = np.expand_dims(gradients, 1) 110 | expected_gradients = np.concatenate( 111 | [_expected_gradients, _expected_gradients[...]], axis=1) 112 | 113 | label_lengths = np.asarray([2, 2], dtype=np.int32) 114 | input_lengths = np.asarray([2, 2], dtype=np.int32) 115 | 116 | self._run_ctc(activations=activations, 117 | input_lengths=input_lengths, 118 | flat_labels=labels, label_lengths=label_lengths, 119 | expected_costs=expected_costs, 120 | expected_gradients=expected_gradients, 121 | use_gpu=use_gpu) 122 | 123 | def test_multiple_batches_cpu(self): 124 | self._test_multiple_batches(use_gpu=False) 125 | 126 | def test_multiple_batches_gpu(self): 127 | if (is_gpu_available()): 128 | self._test_multiple_batches(use_gpu=True) 129 | else: 130 | print("Skipping GPU test, no gpus available") 131 | 132 | if __name__ == "__main__": 133 | tf.test.main() 134 | -------------------------------------------------------------------------------- /tensorflow_binding/warpctc_tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import tensorflow as tf 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops.nn_grad import _BroadcastMul 5 | 6 | lib_file = imp.find_module('kernels', __path__)[1] 7 | _warpctc = tf.load_op_library(lib_file) 8 | 9 | def ctc(activations, flat_labels, label_lengths, input_lengths, 10 | blank_label=0): 11 | '''Computes the CTC loss between a sequence of activations and a 12 | ground truth labeling. 13 | 14 | Args: 15 | 16 | activations: A 3-D Tensor of floats. The dimensions 17 | should be (t, n, a), where t is the time index, n 18 | is the minibatch index, and a indexes over 19 | activations for each symbol in the alphabet. 20 | 21 | flat_labels: A 1-D Tensor of ints, a concatenation of all the 22 | labels for the minibatch. 23 | 24 | label_lengths: A 1-D Tensor of ints, the length of each label 25 | for each example in the minibatch. 26 | 27 | input_lengths: A 1-D Tensor of ints, the number of time steps 28 | for each sequence in the minibatch. 29 | 30 | blank_label: int, the label value/index that the CTC 31 | calculation should use as the blank label 32 | 33 | Returns: 34 | 1-D float Tensor, the cost of each example in the minibatch 35 | (as negative log probabilities). 36 | 37 | * This class performs the softmax operation internally. 38 | 39 | * The label reserved for the blank symbol should be label 0. 
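    Example (a minimal usage sketch; the values and the expected cost of
    roughly 2.46286 follow tests/test_warpctc_op.py, and the session-style
    call assumes the TF1 API used elsewhere in this binding):

        import numpy as np
        import tensorflow as tf
        from warpctc_tensorflow import ctc

        # Activations are (t, n, a): 2 time steps, 1 example, alphabet of 5.
        acts = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                         [0.1, 0.1, 0.6, 0.1, 0.1]], dtype=np.float32)
        acts = np.expand_dims(acts, 1)

        costs = ctc(activations=tf.constant(acts),
                    flat_labels=tf.constant(np.array([1, 2], dtype=np.int32)),
                    label_lengths=tf.constant(np.array([2], dtype=np.int32)),
                    input_lengths=tf.constant(np.array([2], dtype=np.int32)))

        with tf.Session() as sess:
            print(sess.run(costs))  # -> [2.46286...], the per-example CTC loss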
40 | 41 | ''' 42 | loss, _ = _warpctc.warp_ctc(activations, flat_labels, label_lengths, 43 | input_lengths, blank_label) 44 | return loss 45 | 46 | 47 | @ops.RegisterGradient("WarpCTC") 48 | def _CTCLossGrad(op, grad_loss, _): 49 | grad = op.outputs[1] 50 | return [_BroadcastMul(grad_loss, grad), None, None, None] 51 | 52 | 53 | @ops.RegisterShape("WarpCTC") 54 | def _CTCLossShape(op): 55 | inputs_shape = op.inputs[0].get_shape().with_rank(3) 56 | batch_size = inputs_shape[1] 57 | return [batch_size, inputs_shape] 58 | 59 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | inline void throw_on_error(ctcStatus_t status, const char* message) { 12 | if (status != CTC_STATUS_SUCCESS) { 13 | throw std::runtime_error(message + (", stat = " + 14 | std::string(ctcGetStatusString(status)))); 15 | } 16 | } 17 | 18 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 19 | #ifdef __HIPCC__ 20 | #include 21 | #include 22 | 23 | inline void throw_on_error(hipError_t error, const char* message) { 24 | if (error) { 25 | throw thrust::system_error(error, thrust::hip_category(), message); 26 | } 27 | } 28 | #else 29 | #include 30 | #include 31 | 32 | inline void throw_on_error(cudaError_t error, const char* message) { 33 | if (error) { 34 | throw thrust::system_error(error, thrust::cuda_category(), message); 35 | } 36 | } 37 | #endif 38 | #endif 39 | 40 | std::vector 41 | genActs(int size) { 42 | std::vector arr(size); 43 | std::mt19937 gen(0); 44 | std::uniform_real_distribution<> dis(0, 1); 45 | for(int i = 0; i < size; ++i) 46 | arr[i] = dis(gen); 47 | return arr; 48 | } 49 | 50 | std::vector 51 | genLabels(int alphabet_size, int L) { 52 | std::vector label(L); 53 | 54 | std::mt19937 gen(1); 55 | std::uniform_int_distribution<> dis(1, alphabet_size - 1); 56 | 57 | for(int i = 0; i < L; ++i) { 58 | label[i] = dis(gen); 59 | } 60 | // guarantee repeats for testing 61 | if (L >= 3) { 62 | label[L / 2] = label[L / 2 + 1]; 63 | label[L / 2 - 1] = label[L / 2]; 64 | } 65 | return label; 66 | } 67 | 68 | float rel_diff(const std::vector& grad, 69 | const std::vector& num_grad) { 70 | float diff = 0.; 71 | float tot = 0.; 72 | for(size_t idx = 0; idx < grad.size(); ++idx) { 73 | diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]); 74 | tot += grad[idx] * grad[idx]; 75 | } 76 | 77 | return diff / tot; 78 | } 79 | 80 | // Numerically stable softmax for a minibatch of 1 81 | void softmax(const float* const acts, 82 | int alphabet_size, int T, 83 | float *probs) { 84 | 85 | for (int t = 0; t < T; ++t) { 86 | 87 | float max_activation = 88 | -std::numeric_limits::infinity(); 89 | 90 | for (int a = 0; a < alphabet_size; ++a) 91 | max_activation = 92 | std::max(max_activation, acts[t*alphabet_size + a]); 93 | 94 | float denom = 0; 95 | for (int a = 0; a < alphabet_size; ++a) 96 | denom += std::exp(acts[t*alphabet_size + a] - max_activation); 97 | 98 | for (int a = 0; a < alphabet_size; ++a) 99 | probs[t*alphabet_size + a] = 100 | std::exp(acts[t*alphabet_size + a] - max_activation) / denom; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /torch_binding/TUTORIAL.md: -------------------------------------------------------------------------------- 1 | ## Torch Tutorial 2 | 3 | [In Chinese 中文版](TUTORIAL.zh_cn.md) 4 | 5 | 
Make sure you have `warp-ctc` installed by running ```luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec``` at the top level directory. 6 | 7 | Using the torch bindings, it is easy to experiment with CTC interactively. 8 | 9 | If you have compiled without GPU support, replace `torch.Tensor(...):cuda()` with 10 | `torch.Tensor(...):float()` and calls to `gpu_ctc` with `cpu_ctc`. 11 | 12 | The CTC algorithm gives the loss between input sequences and target output sequences. Since CTC 13 | is commonly used with neural networks, we will call the input sequences activation sequences. 14 | The target output sequences are drawn from a fixed alphabet. For the discussion here we choose 15 | the four characters `a,b,c,d`. The algorithm requires a `` symbol distinct from the alphabet. 16 | This means that the activation sequences will be sequences of vectors of dimension five (the size of our alphabet 17 | together with ``). The vectors will be converted to a probability distribution over the alphabet 18 | and the `` with a SoftMax function. So for example a problem with an activation sequence of length seven 19 | would be (the components of the vectors here are arbitrary) 20 | 21 | ```{<2,0,0,0,0>, <1,3,0,0,0>, <1,4,1,0,0>, <1,1,5,6,0>, <1,1,1,1,1>, <1,1,7,1,1>, <9,1,1,1,1>}``` 22 | 23 | and a valid target output sequence would be `dacba`. 24 | 25 | 26 | 27 | To start we are going to use a very simple example. In the example we will have an activation sequence of length 28 | one and also a target output sequence of length one. To specify the activation sequence then, 29 | we have to write down the components of a single five dimensional vector. 30 | We are going to use `<0,0,0,0,0>` as the single vector in the activation sequence 31 | and so the resulting probabilities will be `0.2,0.2,0.2,0.2,0.2`. 32 | 33 | For the targets, we are going to have a single label `a`. 34 | 35 | Firstly, how do we present the data to the algorithm? As usual in Torch, the activations are 36 | put into rows in a 2 dimensional Tensor. The target labels are put into a lua table of tables 37 | with one table for each sequence of target labels. We only have one sequence (of one label) 38 | and so the table is `{{1}}` as the label `a` has index 1 (the index 0 is reserved for the blank symbol). 39 | Since we are allowing the possibility of inputting different length activation sequences, we have to specify 40 | the length of our input activation sequence, which in this case is 1 with a lua table `{1}`. 41 | 42 | To calculate the value of the CTC loss for the above problem just observe that with a one element input 43 | sequence and a single output label, there is only one possible alignment and so the symbol 44 | must be emitted at the first time step. The probability of emitting the symbol is `0.2`. The algorithm 45 | returns the negative log likelihood which is `-ln(0.2)=1.6094`. 46 | 47 | Now we want to use the code to do the calculation. Start with a Torch session and require the libraries. 48 | 49 | If you have GPU support 50 | 51 | ``` 52 | th>require 'cutorch' 53 | ``` 54 | 55 | for CPU only 56 | 57 | ``` 58 | th>require 'warp_ctc' 59 | ``` 60 | 61 | We need to put the activations in rows - so note the double braces. 62 | 63 | ``` 64 | th>acts = torch.Tensor({{0,0,0,0,0}}):cuda() 65 | ``` 66 | 67 | If an empty grad Tensor is passed, the gradient calculation will not be done. 
68 | 69 | ``` 70 | th>grads = torch.Tensor():cuda() 71 | ``` 72 | 73 | For the target labels and sizes of the input sequence, 74 | 75 | ``` 76 | th>labels = {{1}} 77 | th>sizes ={1} 78 | ``` 79 | 80 | If you have CUDA support, use `gpu_ctc` otherwise use `cpu_ctc` 81 | 82 | ``` 83 | th> gpu_ctc(acts, grads, labels, sizes) 84 | 85 | { 86 | 1 : 1.6094379425049 87 | } 88 | ``` 89 | 90 | The function returns a lua table of the CTC loss for each set of sequences. 91 | 92 | Now for a slightly more interesting example. Suppose we have an input sequence of 93 | length three, with activations 94 | 95 | `<1,2,3,4,5>`,`<6,7,8,9,10>` and `<11,12,13,14,15>`. 96 | 97 | The corresponding probabilities for the frames are then 98 | 99 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364` 100 | 101 | (the probabilties are the same for each frame in this special case). 102 | 103 | For target symbols we will use the sequence `c,c`. 104 | 105 | ``` 106 | th>acts = torch.Tensor({{1,2,3,4,5},{6,7,8,9,10},{11,12,13,14,15}}):cuda() 107 | th>labels = {{3,3}} 108 | th>sizes = {3} 109 | ``` 110 | CTC calculates the probability of all the possible alignments. Note that the targets 111 | contain the repeated symbol `c`. CTC cannot emit a repeated symbol on consecutive timesteps 112 | (for more details consult http://www.cs.toronto.edu/~graves/icml_2006.pdf) it must separate 113 | the repeated symbol with a blank and so the only possible aligned sequence is 114 | 115 | `c c`. 116 | 117 | CTC assumes the label probabilities are conditionally independent given the data and so 118 | we expect the answer to be `Pr(c at frame 1)*Pr( at frame 2)*Pr(c at frame 3) = 0.2341*0.0117*0.2341` 119 | and `-ln(0.2341*0.0117*0.2341) = 7.3522`. 120 | 121 | ``` 122 | th> gpu_ctc(acts, grads, labels, sizes) 123 | 124 | { 125 | 1 : 7.355742931366 126 | } 127 | ``` 128 | 129 | The small numerical difference is from doing one of the calculations by hand. 130 | 131 | Suppose the target sequence is `b,c` and the activations are 132 | 133 | `<-5,-4,-3,-2,-1>`,`<-10,-9,-8,-7,-6>` and `<-15,-14,-13,-12,-11>`. 134 | 135 | The corresponding probabilities for the frames are then again 136 | 137 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364`. 138 | 139 | Now there are five possible alignments as repeated symbols 140 | are collapsed and blanks are removed: 141 | ` b c`, `b c`, `b c `, `b b c` and `b c c`. 142 | 143 | The result should be 144 | `-ln(3*0.0117*0.0861*0.2341 + 0.0861*0.0861*0.2341 + 0.0861*0.2341*0.2341) = 4.9390` 145 | 146 | ``` 147 | th>acts = torch.Tensor({{-5,-4,-3,-2,-1},{-10,-9,-8,-7,-6},{-15,-14,-13,-12,-11}}):cuda() 148 | th>labels = {{2,3}} 149 | th>sizes = {3} 150 | th>gpu_ctc(acts, grads, labels, sizes) 151 | 152 | { 153 | 1 : 4.938850402832 154 | } 155 | ``` 156 | 157 | So we have three examples. The final example shows how to do all three at once is the case 158 | where we want to put minibatches through the algorithm. The labels are now `{{1}, {3,3}, {2,3}}` 159 | and the lengths of the input sequences are `{1,3,3}`. We have to put all of the input sequences in 160 | a single two dimensional matrix. This is done by interleaving the input sequence elements so that the 161 | input matrix will look like this. 
For clarity we start with the first two input sequences 162 | 163 | 164 | | entries | col1 | col2 | col3 | col4 | col5 | 165 | |---------|------|------|------|------|------| 166 | |seq1 item 1|0|0|0|0|0| 167 | |seq2 item 1|1|2|3|4|5| 168 | |seq1 item 2|P|P|P|P|P| 169 | |seq2 item 2|6|7|8|9|10| 170 | |seq1 item 3|P|P|P|P|P| 171 | |seq2 item 3|11|12|13|14|15| 172 | 173 | Since the first sequence has no second or third elements, we pad the matrix with zeros (which appear as 174 | `P` in the above table). Now we put the third sequence in 175 | 176 | | entries | col1 | col2 | col3 | col4 | col5 | 177 | |---------|------|------|------|------|------| 178 | |seq1 item 1|0|0|0|0|0| 179 | |seq2 item 1|1|2|3|4|5| 180 | |seq3 item 1|-5|-4|-3|-2|-1| 181 | |seq1 item 2|P|P|P|P|P| 182 | |seq2 item 2|6|7|8|9|10| 183 | |seq3 item 2|-10|-9|-8|7|-6| 184 | |seq1 item 3|P|P|P|P|P| 185 | |seq2 item 3|11|12|13|14|15| 186 | |seq3 item 3|-15|-14|-13|-12|-11| 187 | 188 | 189 | The complete example in Torch is 190 | 191 | ``` 192 | th>acts = torch.Tensor({{0,0,0,0,0},{1,2,3,4,5},{-5,-4,-3,-2,-1}, 193 | {0,0,0,0,0},{6,7,8,9,10},{-10,-9,-8,-7,-6}, 194 | {0,0,0,0,0},{11,12,13,14,15},{-15,-14,-13,-12,-11}}):cuda() 195 | th>labels = {{1}, {3,3}, {2,3}} 196 | th>sizes = {1,3,3} 197 | th>gpu_ctc(acts, grads, labels, sizes) 198 | 199 | { 200 | 1 : 1.6094379425049 201 | 2 : 7.355742931366 202 | 3 : 4.938850402832 203 | } 204 | ``` 205 | 206 | In order to obtain gradients wrt the incoming activations simply pass a 207 | tensor of the same size as the activations tensor. Also see 208 | `torch_binding/tests/test.lua` for more examples. 209 | -------------------------------------------------------------------------------- /torch_binding/TUTORIAL.zh_cn.md: -------------------------------------------------------------------------------- 1 | ## Torch教程 2 | 3 | 为了确保您成功将‘Warp-CTC’和Torch绑定,请在warp-ctc根目录中运行“luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec”。 4 | 5 | 现在,您可以非常容易的通过torch_binding来测试CTC。 6 | 7 | 假如你的编译没有GPU支持,请用`torch.Tensor(...):float()`替代`torch.Tensor(...):cuda()`,用`cpu_ctc`取代`gpu_ctc` 8 | 9 | 10 | CTC是一种计算输入序列与目标输出序列之间相似程度的目标函数。由于CTC普遍运用于神经网络,我们称输入序列为激活序列。目标输出序列是从一个固定的字母表中得出的。 11 | 为了在此讨论, 我们选择了四个字母```a,b,c,d```. 算法需要一个`<空白>`符号区别于字母。这就意味着激活序列会是一个五维向量序列(字母的数量加<空白>)。这个向量(通过softmax函数)将会被转化成字母以及<空白>上的概率分布。 12 | 13 | 比如CTC可以用来衡量一个长度为7的激活序列和标签 ```dacba``` 之间的误差。 14 | 15 | 一个长度为7的激活序列就会是(向量的组成部分是任意的) 16 | 17 | ```{<2,0,0,0,0>, <1,3,0,0,0>, <1,4,1,0,0>, <1,1,5,6,0>, <1,1,1,1,1>, <1,1,7,1,1>, <9,1,1,1,1>}``` 18 | 19 | 得到的有效输出序列即`daceba`. 20 | 21 | 一开始我们会举一个非常简单的例子。在这个例子中我们会用一个长度为1的激活序列,以及一个长度为1的目标输出序列。 22 | 为了指定这个激活序列,我们必须写下每一个五维向量的组成部分。我们使用`<0,0,0,0,0>`作为激活序列的单一向量,得到的概率分布及`0.2,0.2,0.2,0.2,0.2`. 23 | 对于目标输出,我们会用一个单一标签`a`. 24 | 25 | 首先,我们如何将数据展现给算法? 像平时使用Torch一样,激活表示要在一个2维张量中放入行。目标标签需要放入lua table, 每个目标标签序列都有一个对应的表。 26 | 我们每一个标签仅有一个序列,因此当标签`a`有指数1时,表即`{{1}}` (指数0预留给空白符号)。因为我们允许输入不同长度的激活序列的可能性,我们需要指定 27 | 输入激活序列的长度,在这个例子即包涵一个lua table`{1}`的1. 28 | 29 | 为了计算以上问题(单一元素输入序列,单一输出标签)的CTC损失函数的价值, 只有一种可能的对齐方式,所以符号必须在第一个时间步(time step)发出。 30 | 发出符号的概率为`0.2`。 算法返回的负对数似然值为`-ln(0.2)=1.6094`. 
31 | 32 | 现在让我们通过代码来做计算。先从Torch部分开始,需要代码库。 33 | 34 | 35 | 假如你有GPU的支持 36 | 37 | ``` 38 | th>require 'cutorch' 39 | ``` 40 | 41 | 如果仅有CPU 42 | 43 | ``` 44 | th>require 'warp_ctc' 45 | ``` 46 | 47 | 请将激活输入行-- 注意用两个大括号 48 | 49 | ``` 50 | th>acts = torch.Tensor({{0,0,0,0,0}}):cuda() 51 | ``` 52 | 53 | 假如输入为空,梯度计算则不能完成。 54 | 55 | ``` 56 | th>grads = torch.Tensor():cuda() 57 | ``` 58 | 59 | 对于目标标签以及输入序列的大小 60 | ``` 61 | th>labels = {{1}} 62 | th>sizes ={1} 63 | ``` 64 | 65 | 如果你有CUDA支持,请使用`gpu_ctc` ,否则请使用`cpu_ctc` 66 | 67 | ``` 68 | th> gpu_ctc(acts, grads, labels, sizes) 69 | 70 | { 71 | 1 : 1.6094379425049 72 | } 73 | ``` 74 | 75 | 对每一组序列,函数会返回CTC损失的一个lua table. 76 | 77 | 78 | 现在,我们来看一个更有意思的例子。假如我们有一个长度为3的输入序列,激活后: 79 | 80 | `<1,2,3,4,5>`,`<6,7,8,9,10>` and `<11,12,13,14,15>`. 81 | 82 | 对应这些帧的概率则为 83 | 84 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364` 85 | 86 | (在这个特殊例子中,每一帧的概率都一样) 87 | 对于目标符号,我们将使用序列`c,c`. 88 | 89 | ``` 90 | th>acts = torch.Tensor({{1,2,3,4,5},{6,7,8,9,10},{11,12,13,14,15}}):cuda() 91 | th>labels = {{3,3}} 92 | th>sizes = {3} 93 | ``` 94 | CTC计算了所有可能对齐的概率。请注意目标包涵了重复的符号`c`.CTC不能在连续的时间步上发出重复的符号(更多细节,[请见](http://www.cs.toronto.edu/~graves/icml_2006.pdf))。对于重复的符号必须用一个空白分开,所以唯一可能的对齐序列为`c <空白> c`. 95 | 96 | CTC假设,在给定数据的情况下,标签概率是有条件独立的,所以我们期待的答案即`Pr(c at frame 1)*Pr(<空白> at frame 2)*Pr(c at frame 3) = 0.2341*0.0117*0.2341` 97 | and `-ln(0.2341*0.0117*0.2341) = 7.3522`. 98 | ``` 99 | th> gpu_ctc(acts, grads, labels, sizes) 100 | 101 | { 102 | 1 : 7.355742931366 103 | } 104 | ``` 105 | 106 | 小的数值差由于其中一个计算人工完成。 107 | 108 | 假设目标序列为`b,c`,激活序列则为 109 | 110 | `<-5,-4,-3,-2,-1>`,`<-10,-9,-8,-7,-6>` and `<-15,-14,-13,-12,-11>`. 111 | 112 | 对应这些帧的概率则为 113 | 114 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364`. 115 | 116 | 117 | 由于重复的符号被清空,空白被取消,现在有五种可能的对齐 118 | `<空白> b c`, `b <空白> c`, `b c <空白>`, `b b c` and `b c c`. 119 | 120 | 结果应当是 121 | `-ln(3*0.0117*0.0861*0.2341 + 0.0861*0.0861*0.2341 + 0.0861*0.2341*0.2341) = 4.9390` 122 | ``` 123 | th>acts = torch.Tensor({{-5,-4,-3,-2,-1},{-10,-9,-8,-7,-6},{-15,-14,-13,-12,-11}}):cuda() 124 | th>labels = {{2,3}} 125 | th>sizes = {3} 126 | th>gpu_ctc(acts, grads, labels, sizes) 127 | 128 | { 129 | 1 : 4.938850402832 130 | } 131 | ``` 132 | 133 | 因此,我们有三个例子。最后一个例子显示如果通过算法将3个例子做迷你批处理 (minibatch). 标签现在是`{{1}, {3,3}, {2,3}}`,输入序列的长度是`{1,3,3}`. 
134 | 我们必须将输入序列放入一个单独的两维矩阵。通过交织输入序列的元素,我们的输入矩阵如下: 135 | 为了清楚起见,我们从前两个输入序列开始 136 | 137 | | entries | col1 | col2 | col3 | col4 | col5 | 138 | |---------|------|------|------|------|------| 139 | |seq1 item 1|0|0|0|0|0| 140 | |seq2 item 1|1|2|3|4|5| 141 | |seq1 item 2|P|P|P|P|P| 142 | |seq2 item 2|6|7|8|9|10| 143 | |seq1 item 3|P|P|P|P|P| 144 | |seq2 item 3|11|12|13|14|15| 145 | 146 | 由于第一个序列没有第二个或第三个元素,我们用0填入矩阵(在上面一个表格中显示为`P`)。 现在我们将第三个序列放入表格中 147 | 148 | | entries | col1 | col2 | col3 | col4 | col5 | 149 | |---------|------|------|------|------|------| 150 | |seq1 item 1|0|0|0|0|0| 151 | |seq2 item 1|1|2|3|4|5| 152 | |seq3 item 1|-5|-4|-3|-2|-1| 153 | |seq1 item 2|P|P|P|P|P| 154 | |seq2 item 2|6|7|8|9|10| 155 | |seq3 item 2|-10|-9|-8|7|-6| 156 | |seq1 item 3|P|P|P|P|P| 157 | |seq2 item 3|11|12|13|14|15| 158 | |seq3 item 3|-15|-14|-13|-12|-11| 159 | 160 | 161 | 在Torch中完整的例子如下 162 | ``` 163 | th>acts = torch.Tensor({{0,0,0,0,0},{1,2,3,4,5},{-5,-4,-3,-2,-1}, 164 | {0,0,0,0,0},{6,7,8,9,10},{-10,-9,-8,-7,-6}, 165 | {0,0,0,0,0},{11,12,13,14,15},{-15,-14,-13,-12,-11}}):cuda() 166 | th>labels = {{1}, {3,3}, {2,3}} 167 | th>sizes = {1,3,3} 168 | th>gpu_ctc(acts, grads, labels, sizes) 169 | 170 | { 171 | 1 : 1.6094379425049 172 | 2 : 7.355742931366 173 | 3 : 4.938850402832 174 | } 175 | ``` 176 | 177 | 为了获取接下来激活的梯度,传递和激活张量同样大小的张量即可。 178 | 如果想看更多例子,请见`torch_binding/tests/test.lua`。 179 | -------------------------------------------------------------------------------- /torch_binding/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "utils.h" 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "ctc.h" 12 | 13 | #ifdef TORCH_NOGPU 14 | #include "TH.h" 15 | #else 16 | #include "THC.h" 17 | #include "THCTensor.h" 18 | #include "detail/reduce.h" 19 | #endif 20 | 21 | int processTargets(lua_State* L, int** sizes, int** labels, int** label_sizes) { 22 | // sizes table is 4 item on stack 23 | // labels is 3 item on stack 24 | 25 | if (!lua_istable(L, 4)) { 26 | lua_pushfstring(L, "invalid argument 4 for sequence lengths (expected table, got %s)", 27 | luaL_typename(L, -1)); 28 | lua_error(L); 29 | } 30 | 31 | int minibatch_size = lua_objlen(L, 4); 32 | 33 | *sizes = new int[minibatch_size]; 34 | 35 | for(int i = 0; i < minibatch_size; i++) { 36 | lua_pushinteger(L, i+1); 37 | lua_gettable(L, 4); 38 | if(lua_isnumber(L, -1)) { 39 | (*sizes)[i] = (int) lua_tonumber(L, -1); 40 | } else { 41 | lua_pushfstring(L, "invalid entry #%d in array sizes (expected number, got %s)", 42 | i, luaL_typename(L, -1)); 43 | lua_error(L); 44 | } 45 | lua_pop(L, 1); 46 | } 47 | 48 | if (!lua_istable(L, 3)) { 49 | lua_pushfstring(L, "invalid argument 3 for sequence labels (expected table, got %s)", 50 | luaL_typename(L, -1)); 51 | lua_error(L); 52 | } 53 | 54 | int number_of_target_seq = lua_objlen(L, 3); 55 | 56 | if (number_of_target_seq != minibatch_size) { 57 | lua_pushfstring(L, "The minibatch size %d and the number of target sequences %d must be the same", 58 | minibatch_size, number_of_target_seq); 59 | lua_error(L); 60 | } 61 | 62 | std::vector labels_vec; 63 | *label_sizes = new int[minibatch_size]; 64 | 65 | for(int i = 0; i < minibatch_size; i++) { 66 | lua_pushinteger(L, i+1); 67 | lua_gettable(L, 3); 68 | 69 | if(lua_istable(L, -1)) { 70 | 71 | int current_label_length = (int) lua_objlen(L, -1); 72 | (*label_sizes)[i] = current_label_length; 73 | 74 | for (int ix = 0; ix < current_label_length; ix++) { 75 | 
lua_pushinteger(L, ix + 1); 76 | lua_gettable(L, -2); 77 | if(lua_isnumber(L, -1)) { 78 | labels_vec.push_back((int) lua_tonumber(L, -1)); 79 | } else { 80 | lua_pushfstring(L, "invalid entry #%d in array labels (expected number, got %s)", 81 | ix + 1, luaL_typename(L, -1)); 82 | lua_error(L); 83 | } 84 | 85 | lua_pop(L, 1); 86 | } 87 | 88 | } else { 89 | lua_pushfstring(L, "invalid entry #%d in table labels (expected table, got %s)", 90 | i + 1, luaL_typename(L, -1)); 91 | lua_error(L); 92 | } 93 | 94 | lua_pop(L, 1); 95 | 96 | } 97 | 98 | *labels = new int[labels_vec.size()]; 99 | 100 | std::copy(labels_vec.begin(), labels_vec.end(), *labels); 101 | 102 | return minibatch_size; 103 | } 104 | 105 | extern "C" int gpu_ctc(lua_State* L) { 106 | #ifdef TORCH_NOGPU 107 | std::cout << "Compiled without CUDA support." << std::endl; 108 | lua_newtable(L); 109 | 110 | 111 | lua_pushnumber(L, -999999.0); 112 | lua_rawseti(L, -2, 1); 113 | 114 | #else 115 | THCudaTensor *probs = 116 | static_cast(luaT_checkudata(L, 1, "torch.CudaTensor")); 117 | THCudaTensor *grads = 118 | static_cast(luaT_checkudata(L, 2, "torch.CudaTensor")); 119 | 120 | int* sizes; 121 | int* labels_ptr; 122 | int* label_sizes_ptr; 123 | 124 | int minibatch_size = processTargets(L, &sizes, &labels_ptr, &label_sizes_ptr); 125 | 126 | float *probs_ptr; 127 | 128 | if (probs->storage) { 129 | probs_ptr = probs->storage->data + probs->storageOffset; 130 | } else { 131 | lua_pushfstring(L, "probs cannot be an empty tensor"); 132 | lua_error(L); 133 | } 134 | 135 | float *grads_ptr; 136 | 137 | if (grads->storage) { 138 | grads_ptr = grads->storage->data + grads->storageOffset;; 139 | } else { 140 | grads_ptr = NULL; // this will trigger the score forward code path 141 | } 142 | 143 | float* costs = new float[minibatch_size]; 144 | 145 | ctcOptions options; 146 | memset(&options, 0, sizeof(options)); 147 | options.loc = CTC_GPU; 148 | options.stream = THCState_getCurrentStream(cutorch_getstate(L)); 149 | 150 | size_t gpu_size_bytes; 151 | get_workspace_size(label_sizes_ptr, sizes, 152 | (int) probs->size[1], minibatch_size, 153 | options, &gpu_size_bytes); 154 | 155 | float* gpu_workspace; 156 | THCudaMalloc(cutorch_getstate(L), (void **) &gpu_workspace, gpu_size_bytes); 157 | 158 | compute_ctc_loss(probs_ptr, grads_ptr, 159 | labels_ptr, label_sizes_ptr, 160 | sizes, (int) probs->size[1], 161 | minibatch_size, costs, 162 | gpu_workspace, options); 163 | 164 | lua_newtable(L); 165 | 166 | for (int ix = 0; ix < minibatch_size; ix++) { 167 | lua_pushnumber(L, costs[ix]); 168 | lua_rawseti(L, -2, ix+1); 169 | } 170 | 171 | THCudaFree(cutorch_getstate(L), (void *) gpu_workspace); 172 | 173 | delete sizes; 174 | delete labels_ptr; 175 | delete label_sizes_ptr; 176 | delete costs; 177 | #endif 178 | return 1; 179 | } 180 | 181 | extern "C" int cpu_ctc(lua_State* L) { 182 | 183 | THFloatTensor *probs = 184 | static_cast(luaT_checkudata(L, 1, "torch.FloatTensor")); 185 | THFloatTensor *grads = 186 | static_cast(luaT_checkudata(L, 2, "torch.FloatTensor")); 187 | 188 | int* sizes; 189 | int* labels_ptr; 190 | int* label_sizes_ptr; 191 | 192 | int minibatch_size = processTargets(L, &sizes, &labels_ptr, &label_sizes_ptr); 193 | float *probs_ptr; 194 | 195 | if (probs->storage) { 196 | probs_ptr = probs->storage->data + probs->storageOffset; 197 | } else { 198 | lua_pushfstring(L, "probs cannot be an empty tensor"); 199 | lua_error(L); 200 | } 201 | 202 | float *grads_ptr; 203 | 204 | if (grads->storage) { 205 | grads_ptr = grads->storage->data + 
grads->storageOffset;; 206 | } else { 207 | grads_ptr = NULL; // this will trigger the score forward code path 208 | } 209 | 210 | float* costs = new float[minibatch_size]; 211 | 212 | ctcOptions options; 213 | memset(&options, 0, sizeof(options)); 214 | options.loc = CTC_CPU; 215 | options.num_threads = 0; // will use default number of threads 216 | 217 | #if defined(CTC_DISABLE_OMP) || defined(APPLE) 218 | // have to use at least one 219 | options.num_threads = std::max(options.num_threads, (unsigned int) 1); 220 | #endif 221 | 222 | size_t cpu_size_bytes; 223 | get_workspace_size(label_sizes_ptr, sizes, 224 | (int) probs->size[1], minibatch_size, 225 | options, &cpu_size_bytes); 226 | 227 | float* cpu_workspace = (float*) new unsigned char[cpu_size_bytes]; 228 | 229 | compute_ctc_loss(probs_ptr, grads_ptr, 230 | labels_ptr, label_sizes_ptr, 231 | sizes, probs->size[1], 232 | minibatch_size, costs, 233 | cpu_workspace, options); 234 | 235 | lua_newtable(L); 236 | 237 | for (int ix = 0; ix < minibatch_size; ix++) { 238 | lua_pushnumber(L, costs[ix]); 239 | lua_rawseti(L, -2, ix+1); 240 | } 241 | 242 | delete cpu_workspace; 243 | delete sizes; 244 | delete labels_ptr; 245 | delete label_sizes_ptr; 246 | delete costs; 247 | 248 | return 1; 249 | } 250 | 251 | extern "C" int luaopen_libwarp_ctc(lua_State *L) { 252 | lua_register(L, "gpu_ctc", gpu_ctc); 253 | lua_register(L, "cpu_ctc", cpu_ctc); 254 | 255 | return 0; 256 | } 257 | -------------------------------------------------------------------------------- /torch_binding/init.lua: -------------------------------------------------------------------------------- 1 | require 'libwarp_ctc' -------------------------------------------------------------------------------- /torch_binding/rocks/warp-ctc-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "warp-ctc" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/baidu-research/warp-ctc.git", 6 | } 7 | 8 | description = { 9 | summary = "Baidu CTC Implementation", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/baidu-research/warp-ctc", 13 | license = "Apache" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | } 19 | 20 | build = { 21 | type = "command", 22 | build_command = [[ 23 | cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) && make install 24 | ]], 25 | platforms = {}, 26 | install_command = "cd build" 27 | } 28 | -------------------------------------------------------------------------------- /torch_binding/tests/data/chars.txt: -------------------------------------------------------------------------------- 1 | ' 1 2 | 2 3 | a 3 4 | b 4 5 | c 5 6 | d 6 7 | e 7 8 | f 8 9 | g 9 10 | h 10 11 | i 11 12 | j 12 13 | k 13 14 | l 14 15 | m 15 16 | n 16 17 | o 17 18 | p 18 19 | q 19 20 | r 20 21 | s 21 22 | t 22 23 | u 23 24 | v 24 25 | w 25 26 | x 26 27 | y 27 28 | z 28 29 | -------------------------------------------------------------------------------- /torch_binding/tests/data/sizes.txt: -------------------------------------------------------------------------------- 1 | 433 2 | 434 3 | 434 4 | 434 5 | 434 6 | 434 7 | 434 8 | 434 9 | 434 10 | 434 11 | 434 12 | 434 13 | 434 14 | 434 15 | 434 16 | 434 17 | 435 18 | 435 19 | 435 20 | 435 21 | 435 22 | 435 23 | 435 24 | 435 25 | 436 26 | 437 27 | 438 28 | 438 29 | 438 30 | 439 31 | 439 32 | 440 33 | 433 34 | 434 35 | 434 36 | 434 37 | 434 38 | 434 39 | 434 40 | 434 41 | 434 42 | 434 43 | 434 44 | 434 45 | 434 46 | 434 47 | 434 48 | 434 49 | 435 50 | 435 51 | 435 52 | 435 53 | 435 54 | 435 55 | 435 56 | 435 57 | 436 58 | 437 59 | 438 60 | 438 61 | 438 62 | 439 63 | 439 64 | 440 65 | 433 66 | 434 67 | 434 68 | 434 69 | 434 70 | 434 71 | 434 72 | 434 73 | 434 74 | 434 75 | 434 76 | 434 77 | 434 78 | 434 79 | 434 80 | 434 81 | 435 82 | 435 83 | 435 84 | 435 85 | 435 86 | 435 87 | 435 88 | 435 89 | 436 90 | 437 91 | 438 92 | 438 93 | 438 94 | 439 95 | 439 96 | 440 97 | 433 98 | 434 99 | 434 100 | 434 101 | 434 102 | 434 103 | 434 104 | 434 105 | 434 106 | 434 107 | 434 108 | 434 109 | 434 110 | 434 111 | 434 112 | 434 113 | 435 114 | 435 115 | 435 116 | 435 117 | 435 118 | 435 119 | 435 120 | 435 121 | 436 122 | 437 123 | 438 124 | 438 125 | 438 126 | 439 127 | 439 128 | 440 129 | 433 130 | 434 131 | 434 132 | 434 133 | 434 134 | 434 135 | 434 136 | 434 137 | 434 138 | 434 139 | 434 140 | 434 141 | 434 142 | 434 143 | 434 144 | 434 145 | 435 146 | 435 147 | 435 148 | 435 149 | 435 150 | 435 151 | 435 152 | 435 153 | 436 154 | 437 155 | 438 156 | 438 157 | 438 158 | 439 159 | 439 160 | 440 161 | 433 162 | 434 163 | 434 164 | 434 165 | 434 166 | 434 167 | 434 168 | 434 169 | 434 170 | 434 171 | 434 172 | 434 173 | 434 174 | 434 175 | 434 176 | 434 177 | 435 178 | 435 179 | 435 180 | 435 181 | 435 182 | 435 183 | 435 184 | 435 185 | 436 186 | 437 187 | 438 188 | 438 189 | 438 190 | 439 191 | 439 192 | 440 193 | -------------------------------------------------------------------------------- /torch_binding/tests/test.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'cutorch' 3 | require 'warp_ctc' 4 | 5 | function os.capture(cmd, raw) 6 | local f = assert(io.popen(cmd, 'r')) 7 | local s = assert(f:read('*a')) 8 | f:close() 9 | if raw then return s end 10 | s = string.gsub(s, '^%s+', '') 11 | s = string.gsub(s, '%s+$', '') 12 | s = string.gsub(s, '[\n\r]+', ' ') 13 | return s 14 | end 15 | 16 | function reduce(list) 17 | local acc 18 | for k, v in ipairs(list) do 19 | if 1 == k then 20 | acc = v 21 | else 22 | acc = acc + v 23 | end 24 | end 25 | return acc 26 | end 27 | 28 | function simpleTest() 29 | local cpu_acts = torch.Tensor({{0.1, 0.6, 0.1, 0.1,0.1},{0.1, 0.1, 0.6, 0.1, 0.1}}):float() 30 | local cpu_probs = 
nn.SoftMax():updateOutput(cpu_acts:double()):float() 31 | local cpu_grads = cpu_probs:clone():zero() 32 | 33 | local labels = {{1,2}} 34 | --local label_lengths = torch.Tensor({2}):int() 35 | 36 | local sizes = {2} 37 | --print(cpu_probs, cpu_grads, labels, sizes) 38 | 39 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 40 | print(cpu_grads) 41 | 42 | local cpu_grads = torch.Tensor():float() 43 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 44 | 45 | local acts = cpu_acts:cuda() 46 | local grads = acts:clone():zero() 47 | 48 | --print(probs, grads, labels, label_lengths, sizes) 49 | 50 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 51 | print("GPU_cost:", cost) 52 | print(grads) 53 | 54 | local grads = torch.Tensor():cuda() 55 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 56 | 57 | end 58 | 59 | function mediumTest(multiplier) 60 | local cpu_acts = torch.Tensor({{0.1, 0.6, 0.1, 0.1,0.1},{0.1, 0.1, 0.6, 0.1, 0.1}, 61 | {0.6, 0.1, 0.1, 0.1,0.1},{0.1, 0.1, 0.5, 0.2, 0.1}}):float()*multiplier 62 | local cpu_grads = cpu_acts:clone():zero() 63 | 64 | local labels = {{1,2},{1,2}} 65 | local sizes = {2, 2 } 66 | 67 | --print(cpu_probs, cpu_grads, labels, sizes) 68 | 69 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 70 | print(cpu_grads) 71 | 72 | local cpu_grads = torch.Tensor():float() 73 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 74 | 75 | local acts = cpu_acts:cuda() 76 | local grads = acts:clone():zero() 77 | 78 | --print(probs, grads, labels, sizes) 79 | 80 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 81 | print("GPU_cost:", cost) 82 | print(grads) 83 | 84 | local grads = torch.Tensor():cuda() 85 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 86 | 87 | end 88 | 89 | function emptyLabelTest() 90 | local cpu_acts = torch.Tensor({{0.1, 0.6, 0.1, 0.1,0.1},{0.1, 0.1, 0.6, 0.1, 0.1}, 91 | {0.6, 0.1, 0.1, 0.1,0.1},{0.1, 0.1, 0.5, 0.2, 0.1}}):float() 92 | local cpu_grads = cpu_acts:clone():zero() 93 | 94 | local labels = {{1,2},{}} 95 | local sizes = {2, 2 } 96 | 97 | --print(cpu_probs, cpu_grads, labels, sizes) 98 | 99 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 100 | print(cpu_grads) 101 | 102 | local cpu_grads = torch.Tensor():float() 103 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 104 | 105 | local acts = cpu_acts:cuda() 106 | local grads = acts:clone():zero() 107 | 108 | --print(probs, grads, labels, sizes) 109 | 110 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 111 | print("GPU_cost:", cost) 112 | print(grads) 113 | 114 | local grads = torch.Tensor():cuda() 115 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 116 | 117 | end 118 | 119 | function getTargets() 120 | local outdim = 29 --TODO count chars.txt 121 | local file = io.open("data/sizes.txt", "r"); 122 | 123 | if not file then 124 | print("File not found data/sizes.txt are you runnng the test from the tests dir?") 125 | end 126 | 127 | local sizes = {} 128 | for line in file:lines() do 129 | table.insert (sizes, tonumber(line)); 130 | end 131 | 132 | 133 | local label_file = io.open("data/labels.txt", "r"); 134 | local labels = {} 135 | 136 | for line in label_file:lines() do 137 | local current_labels = {} 138 | for w in line:gmatch("%S+") do 139 | table.insert (current_labels, tonumber(w)); 140 
| end 141 | table.insert (labels, current_labels); 142 | end 143 | 144 | return outdim, sizes, labels 145 | end 146 | 147 | 148 | function bigTest(minibatch_size) 149 | 150 | detected_OS = os.capture('uname', false) 151 | 152 | local outdim, raw_sizes, raw_labels = getTargets() 153 | 154 | -- truncate tables to given minibatch_size 155 | local sizes = {} 156 | local labels = {} 157 | 158 | local max_length = 0 159 | 160 | for idx = 1,minibatch_size do 161 | if raw_sizes[idx] > max_length then 162 | max_length = raw_sizes[idx] 163 | end 164 | 165 | table.insert(sizes, raw_sizes[idx]) 166 | table.insert(labels, raw_labels[idx]) 167 | end 168 | 169 | local minibatch_size = table.getn(sizes) 170 | 171 | print("Using minibatch size: ", #sizes) 172 | print("Using outdim size: ", outdim) 173 | print("Max size: ", max_length) 174 | 175 | torch.manualSeed(123) 176 | 177 | local cpu_acts = torch.rand(minibatch_size*max_length, outdim):float() 178 | local cpu_grads = cpu_acts:clone():fill(0) 179 | 180 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 181 | 182 | if detected_OS == "Darwin" then 183 | if cpu_grads:ne(cpu_grads):sum() > 0 then 184 | print(sys.COLORS.red .. ' cpu_grads after update has NaN/s') 185 | else 186 | print('cpu_grads do not have nans') 187 | end 188 | end 189 | 190 | local cpu_null_grads = torch.Tensor():float() 191 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_null_grads, labels, sizes))) 192 | 193 | local acts = cpu_acts:cuda() 194 | local grads = acts:clone():zero() 195 | 196 | --print(probs, grads, labels, sizes) 197 | 198 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 199 | print("GPU_cost:", cost) 200 | 201 | if detected_OS == "Darwin" then 202 | if grads:ne(grads):sum() > 0 then 203 | print(sys.COLORS.red .. 
' gpu_grads after update has NaN/s') 204 | else 205 | print('gpu_grads do not have nans') 206 | end 207 | 208 | print("L2 norm grad diff: ", torch.norm(cpu_grads - grads:float())) 209 | 210 | end 211 | 212 | local grads = torch.Tensor():cuda() 213 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 214 | 215 | end 216 | 217 | simpleTest() 218 | mediumTest(1.0) 219 | print("Stability test") 220 | mediumTest(200.0) -- test SM stability if compiled with USE_NSM this will not have nans 221 | print("Empty label test") 222 | emptyLabelTest() 223 | bigTest(32) 224 | bigTest(64) 225 | bigTest(96) 226 | bigTest(111) -------------------------------------------------------------------------------- /torch_binding/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | THLongStorage* cutorch_checklongargs(lua_State *L, int index) { 4 | THLongStorage *storage; 5 | int i; 6 | int narg = lua_gettop(L)-index+1; 7 | 8 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) { 9 | THLongStorage *storagesrc = luaT_toudata(L, index, "torch.LongStorage"); 10 | storage = THLongStorage_newWithSize(storagesrc->size); 11 | THLongStorage_copy(storage, storagesrc); 12 | } else { 13 | storage = THLongStorage_newWithSize(narg); 14 | for(i = index; i < index+narg; i++) { 15 | if(!lua_isnumber(L, i)) { 16 | THLongStorage_free(storage); 17 | luaL_argerror(L, i, "number expected"); 18 | } 19 | THLongStorage_set(storage, i-index, lua_tonumber(L, i)); 20 | } 21 | } 22 | return storage; 23 | } 24 | 25 | int cutorch_islongargs(lua_State *L, int index) { 26 | int narg = lua_gettop(L)-index+1; 27 | 28 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) { 29 | return 1; 30 | } else { 31 | int i; 32 | 33 | for(i = index; i < index+narg; i++) { 34 | if(!lua_isnumber(L, i)) 35 | return 0; 36 | } 37 | return 1; 38 | } 39 | return 0; 40 | } 41 | 42 | struct THCState* cutorch_getstate(lua_State* L) { 43 | lua_getglobal(L, "cutorch"); 44 | lua_getfield(L, -1, "_state"); 45 | struct THCState *state = lua_touserdata(L, -1); 46 | lua_pop(L, 2); 47 | return state; 48 | } 49 | -------------------------------------------------------------------------------- /torch_binding/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef CUTORCH_UTILS_INC 2 | #define CUTORCH_UTILS_INC 3 | 4 | #include "luaT.h" 5 | #include "TH.h" 6 | 7 | #ifdef __cplusplus 8 | # define TORCH_EXTERNC extern "C" 9 | #else 10 | # define TORCH_EXTERNC extern 11 | #endif 12 | 13 | #ifdef __GNUC__ 14 | # define TORCH_UNUSED __attribute__((unused)) 15 | #else 16 | # define TORCH_UNUSED 17 | #endif 18 | 19 | #ifdef _WIN32 20 | # ifdef torch_EXPORTS 21 | # define TORCH_API TORCH_EXTERNC __declspec(dllexport) 22 | # else 23 | # define TORCH_API TORCH_EXTERNC __declspec(dllimport) 24 | # endif 25 | #else 26 | # define TORCH_API TORCH_EXTERNC 27 | #endif 28 | 29 | #if LUA_VERSION_NUM == 501 30 | /* 31 | ** Adapted from Lua 5.2.0 32 | */ 33 | TORCH_UNUSED static void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { 34 | luaL_checkstack(L, nup+1, "too many upvalues"); 35 | for (; l->name != NULL; l++) { /* fill the table with given functions */ 36 | int i; 37 | lua_pushstring(L, l->name); 38 | for (i = 0; i < nup; i++) /* copy upvalues to the top */ 39 | lua_pushvalue(L, -(nup+1)); 40 | lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ 41 | lua_settable(L, -(nup + 3)); 42 | } 43 | lua_pop(L, nup); /* 
remove upvalues */ 44 | } 45 | #endif 46 | 47 | 48 | TORCH_API THLongStorage* cutorch_checklongargs(lua_State *L, int index); 49 | TORCH_API int cutorch_islongargs(lua_State *L, int index); 50 | 51 | struct THCState; 52 | TORCH_API struct THCState* cutorch_getstate(lua_State* L); 53 | 54 | #endif --------------------------------------------------------------------------------