├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── README.zh_cn.md ├── cmake └── hip.cmake ├── doc ├── baidu-research-logo-small.png └── deep-speech-ctc-small.png ├── include ├── contrib │ └── moderngpu │ │ ├── LICENSE │ │ └── include │ │ ├── device │ │ ├── ctaloadbalance.cuh │ │ ├── ctamerge.cuh │ │ ├── ctascan.cuh │ │ ├── ctasearch.cuh │ │ ├── ctasegreduce.cuh │ │ ├── ctasegscan.cuh │ │ ├── ctasegsort.cuh │ │ ├── ctasortedsearch.cuh │ │ ├── devicetypes.cuh │ │ ├── deviceutil.cuh │ │ ├── intrinsics.cuh │ │ ├── loadstore.cuh │ │ ├── serialsets.cuh │ │ └── sortnetwork.cuh │ │ ├── mgpudevice.cuh │ │ ├── mgpuenums.h │ │ └── util │ │ └── static.h ├── ctc.h └── detail │ ├── cpu_ctc.h │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ ├── hostdevice.h │ ├── reduce.h │ └── type_defs.h ├── src ├── ctc_entrypoint.cpp ├── ctc_entrypoint.cu └── reduce.cu ├── tensorflow_binding ├── .gitignore ├── README.md ├── setup.py ├── src │ ├── ctc_op_kernel.cc │ └── warpctc_op.cc ├── tests │ ├── __init__.py │ ├── test_ctc_loss_op.py │ └── test_warpctc_op.py └── warpctc_tensorflow │ └── __init__.py ├── tests ├── test.h ├── test_cpu.cpp └── test_gpu.cu └── torch_binding ├── TUTORIAL.md ├── TUTORIAL.zh_cn.md ├── binding.cpp ├── init.lua ├── rocks └── warp-ctc-scm-1.rockspec ├── tests ├── data │ ├── chars.txt │ ├── labels.txt │ └── sizes.txt └── test.lua ├── utils.c └── utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | Makefile 3 | build -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | IF (APPLE) 2 | cmake_minimum_required(VERSION 3.4) 3 | ELSE() 4 | cmake_minimum_required(VERSION 3.10) 5 | ENDIF() 6 | 7 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 8 | 9 | project(ctc_release LANGUAGES CXX CUDA) 10 | set(CMAKE_CXX_STANDARD 11) 11 | set(CMAKE_CUDA_STANDARD 11) 12 | 13 | include_directories(include) 14 | 15 | FIND_PACKAGE(CUDA 6.5) 16 | FIND_PACKAGE(Torch) 17 | 18 | MESSAGE(STATUS "cuda found ${CUDA_FOUND}") 19 | MESSAGE(STATUS "Torch found ${Torch_DIR}") 20 | 21 | option(WITH_GPU "compile warp-ctc with CUDA." ${CUDA_FOUND}) 22 | option(WITH_TORCH "compile warp-ctc with Torch." ${Torch_FOUND}) 23 | option(WITH_OMP "compile warp-ctc with OpenMP." ON) 24 | option(BUILD_TESTS "build warp-ctc unit tests." ON) 25 | option(BUILD_SHARED "build warp-ctc shared library." 
ON) 26 | option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) 27 | 28 | if(WITH_ROCM) 29 | add_definitions(-DWARPCTC_WITH_HIP) 30 | include(hip) 31 | endif(WITH_ROCM) 32 | 33 | if(BUILD_SHARED) 34 | set(WARPCTC_SHARED "SHARED") 35 | else(BUILD_SHARED) 36 | set(WARPCTC_SHARED "STATIC") 37 | endif(BUILD_SHARED) 38 | 39 | if(WIN32) 40 | set(CMAKE_STATIC_LIBRARY_PREFIX lib) 41 | set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") 42 | set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") 43 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") 44 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") 45 | foreach(flag_var 46 | CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE) 47 | if(${flag_var} MATCHES "/MD") 48 | string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") 49 | endif(${flag_var} MATCHES "/MD") 50 | endforeach(flag_var) 51 | else(WIN32) 52 | # Set c++ flags 53 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 54 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O2") 55 | endif(WIN32) 56 | 57 | if(APPLE) 58 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") 59 | add_definitions(-DAPPLE) 60 | endif() 61 | 62 | if(WITH_OMP AND NOT APPLE) 63 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") 64 | else() 65 | add_definitions(-DCTC_DISABLE_OMP) 66 | endif() 67 | 68 | # need to be at least 30 or __shfl_down in reduce wont compile 69 | IF (CUDA_VERSION VERSION_LESS "11.0") 70 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30") 71 | ENDIF() 72 | 73 | # sm35 is deprecated after cuda 12.0 74 | IF (CUDA_VERSION VERSION_LESS "12.0") 75 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") 76 | ENDIF() 77 | 78 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") 79 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") 80 | 81 | IF (CUDA_VERSION VERSION_GREATER "7.6") 82 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") 83 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") 84 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") 85 | ENDIF() 86 | 87 | IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0")) 88 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") 89 | ENDIF() 90 | 91 | IF ((CUDA_VERSION VERSION_GREATER "10.0") OR (CUDA_VERSION VERSION_EQUAL "10.0")) 92 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75") 93 | ENDIF() 94 | 95 | IF ((CUDA_VERSION VERSION_GREATER "11.0") OR (CUDA_VERSION VERSION_EQUAL "11.0")) 96 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80") 97 | ENDIF() 98 | 99 | IF ((CUDA_VERSION VERSION_GREATER "11.2") OR (CUDA_VERSION VERSION_EQUAL "11.2")) 100 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86") 101 | ENDIF() 102 | 103 | IF ((CUDA_VERSION VERSION_GREATER "11.8") OR (CUDA_VERSION VERSION_EQUAL "11.8")) 104 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_90,code=sm_90") 105 | ENDIF() 106 | 107 | IF(NOT APPLE AND NOT WIN32) 108 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") 109 | if(WITH_OMP) 110 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") 111 | endif() 112 | ENDIF() 113 | 114 | IF (APPLE) 115 | EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 116 | STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 117 | 
MESSAGE(STATUS "DARWIN_VERSION=${DARWIN_VERSION}") 118 | 119 | # for El Capitan we have to use rpath 120 | 121 | IF (DARWIN_VERSION LESS 15) 122 | set(CMAKE_SKIP_RPATH TRUE) 123 | ENDIF () 124 | 125 | ELSE() 126 | # always skip for linux 127 | set(CMAKE_SKIP_RPATH TRUE) 128 | ENDIF() 129 | 130 | # Windows treats a symbolic file as a real file, which is different from Unix. 131 | # We create a hidden file and compile it instead of the original source file. 132 | function(windows_symbolic TARGET) 133 | set(oneValueArgs "") 134 | set(multiValueArgs SRCS PATH DEPS) 135 | cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 136 | set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) 137 | foreach(src ${windows_symbolic_SRCS}) 138 | get_filename_component(src ${src} NAME_WE) 139 | if (NOT EXISTS ${final_path}/${src}.cpp OR NOT EXISTS ${final_path}/${src}.cu) 140 | message(FATAL_ERROR "${final_path}/${src}.cpp and ${final_path}/${src}.cu must exist, and ${final_path}/${src}.cu must be a symbolic file.") 141 | endif() 142 | 143 | # only copy the xx.cu to .xx.cu when the content is modified 144 | add_custom_command(OUTPUT ${final_path}/.${src}.cu 145 | COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cpp" "${final_path}/.${src}.cu" 146 | COMMENT "create hidden file of ${src}.cu") 147 | add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) 148 | endforeach() 149 | endfunction() 150 | 151 | IF (WITH_GPU OR WITH_ROCM) 152 | 153 | MESSAGE(STATUS "Building shared library with GPU support") 154 | 155 | IF (WITH_GPU) 156 | MESSAGE(STATUS "NVCC_ARCH_FLAGS: ${CUDA_NVCC_FLAGS}") 157 | ENDIF() 158 | 159 | if (WIN32) 160 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler \"/wd 4068 /wd 4244 /wd 4267 /wd 4305 /wd 4819\"") 161 | windows_symbolic(ctc_entrypoint SRCS ctc_entrypoint.cu PATH src) 162 | CUDA_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/.ctc_entrypoint.cu src/reduce.cu) 163 | else() 164 | IF (WITH_GPU) 165 | ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) 166 | ELSE() 167 | HIP_ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cu src/reduce.cu) 168 | TARGET_LINK_LIBRARIES(warpctc PUBLIC ${ROCM_HIPRTC_LIB}) 169 | ENDIF() 170 | endif(WIN32) 171 | 172 | IF (NOT WITH_TORCH) 173 | MESSAGE(STATUS "Link rand library") 174 | 175 | IF (WITH_GPU) 176 | MESSAGE(STATUS "Link cuda rand library: ${CUDA_curand_LIBRARY}") 177 | TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) 178 | ELSE() 179 | MESSAGE(STATUS "Link hip rand library: ${hiprand_LIBRARY_DIRS}") 180 | TARGET_LINK_LIBRARIES(warpctc ${hiprand_LIBRARY_DIRS}/libhiprand.so) 181 | ENDIF() 182 | ENDIF() 183 | 184 | if(BUILD_TESTS) 185 | MESSAGE(STATUS "Build tests") 186 | 187 | IF (WITH_GPU) 188 | add_executable(test_cpu tests/test_cpu.cpp) 189 | ELSE() 190 | add_executable(test_cpu tests/test_cpu.cpp) 191 | ENDIF() 192 | 193 | 194 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 195 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS}") 196 | 197 | IF (WITH_GPU) 198 | cuda_add_executable(test_gpu tests/test_gpu.cu) 199 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) 200 | ELSE() 201 | hip_add_executable(test_gpu tests/test_gpu.cu) 202 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${hiprand_LIBRARY_DIRS}/libhiprand.so) 203 | ENDIF() 204 | endif(BUILD_TESTS) 205 | 206 | INSTALL(TARGETS warpctc 207 | RUNTIME DESTINATION "bin" 208 | LIBRARY DESTINATION "lib" 209 | ARCHIVE DESTINATION "lib") 210 | 211 | INSTALL(FILES include/ctc.h
DESTINATION "include") 212 | 213 | IF (WITH_TORCH AND WITH_GPU) 214 | MESSAGE(STATUS "Building Torch Bindings with GPU support") 215 | INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc") 216 | INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH ${Torch_INSTALL_INCLUDE}/THC) 217 | 218 | TARGET_LINK_LIBRARIES(warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) 219 | INSTALL(TARGETS warpctc 220 | RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" 221 | LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" 222 | ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") 223 | 224 | SET(src torch_binding/binding.cpp torch_binding/utils.c) 225 | SET(luasrc torch_binding/init.lua) 226 | 227 | ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") 228 | IF (APPLE) 229 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY}) 230 | ELSE() 231 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT THC TH ${CUDA_curand_LIBRARY} gomp) 232 | ENDIF() 233 | ENDIF() 234 | 235 | ELSE() 236 | MESSAGE(STATUS "Building shared library with no GPU support") 237 | 238 | if (NOT APPLE AND NOT WIN32) 239 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2") 240 | ENDIF() 241 | 242 | ADD_LIBRARY(warpctc ${WARPCTC_SHARED} src/ctc_entrypoint.cpp) 243 | 244 | if(BUILD_TESTS) 245 | add_executable(test_cpu tests/test_cpu.cpp ) 246 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 247 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS}") 248 | endif(BUILD_TESTS) 249 | 250 | INSTALL(TARGETS warpctc 251 | RUNTIME DESTINATION "bin" 252 | LIBRARY DESTINATION "lib" 253 | ARCHIVE DESTINATION "lib") 254 | 255 | INSTALL(FILES include/ctc.h DESTINATION "include") 256 | 257 | IF (WITH_TORCH) 258 | MESSAGE(STATUS "Building Torch Bindings with no GPU support") 259 | add_definitions(-DTORCH_NOGPU) 260 | INCLUDE_DIRECTORIES(${Torch_INSTALL_INCLUDE} ${Torch_INSTALL_INCLUDE}/TH) 261 | 262 | TARGET_LINK_LIBRARIES(warpctc luajit luaT TH) 263 | 264 | INSTALL(TARGETS warpctc 265 | RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" 266 | LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" 267 | ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") 268 | 269 | SET(src torch_binding/binding.cpp torch_binding/utils.c) 270 | SET(luasrc torch_binding/init.lua) 271 | 272 | ADD_TORCH_PACKAGE(warp_ctc "${src}" "${luasrc}") 273 | IF (APPLE) 274 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH) 275 | ELSE() 276 | TARGET_LINK_LIBRARIES(warp_ctc warpctc luajit luaT TH gomp) 277 | ENDIF() 278 | ENDIF() 279 | 280 | ENDIF() 281 | 282 | 283 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Baidu Logo](/doc/baidu-research-logo-small.png) 2 | 3 | [In Chinese 中文版](README.zh_cn.md) 4 | 5 | # warp-ctc 6 | 7 | A fast parallel implementation of CTC, on both CPU and GPU. 8 | 9 | ## Introduction 10 | 11 | [Connectionist Temporal Classification](http://www.cs.toronto.edu/~graves/icml_2006.pdf) 12 | is a loss function useful for performing supervised learning on sequence data, 13 | without needing an alignment between input data and labels. For example, CTC 14 | can be used to train 15 | [end-to-end](http://www.jmlr.org/proceedings/papers/v32/graves14.pdf) 16 | [systems](http://arxiv.org/pdf/1408.2873v2.pdf) for 17 | [speech recognition](http://arxiv.org/abs/1512.02595), 18 | which is how we have been using it at Baidu's Silicon Valley AI Lab. 
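For reference, the quantity CTC computes can be written down compactly. In the notation of the Graves et al. paper linked above (this is the standard formulation, not anything specific to this repository), the network emits per-timestep label probabilities, and the probability of a labelling is a sum over every alignment (path) that collapses to that labelling once repeated labels and blanks are removed:

```latex
% Standard CTC objective (Graves et al., 2006). B is the collapsing map that
% removes repeated labels and blanks from a path \pi; the training loss is the
% negative log-likelihood of the target labelling \ell given the input x.
P(\ell \mid x) = \sum_{\pi \in \mathcal{B}^{-1}(\ell)} \prod_{t=1}^{T} y^{t}_{\pi_t},
\qquad
\mathcal{L}_{\mathrm{CTC}}(x, \ell) = -\log P(\ell \mid x)
```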
19 | 20 | ![DSCTC](/doc/deep-speech-ctc-small.png) 21 | 22 | The illustration above shows CTC computing the probability of an output 23 | sequence "THE CAT ", as a sum over all possible alignments of input sequences 24 | that could map to "THE CAT ", taking into account that labels may be duplicated 25 | because they may stretch over several time steps of the input data (represented by 26 | the spectrogram at the bottom of the image). 27 | Computing the sum of all such probabilities explicitly would be prohibitively costly due to the 28 | combinatorics involved, but CTC uses dynamic programming to dramatically 29 | reduce the complexity of the computation. Because CTC is a differentiable function, 30 | it can be used during standard SGD training of deep neural networks. 31 | 32 | In our lab, we focus on scaling up recurrent neural networks, and CTC loss is an 33 | important component. To make our system efficient, we parallelized the CTC 34 | algorithm, as described in [this paper](http://arxiv.org/abs/1512.02595). 35 | This project contains our high performance CPU and CUDA versions of the CTC loss, 36 | along with bindings for [Torch](http://torch.ch/). 37 | The library provides a simple C interface, so that it is easy to 38 | integrate into deep learning frameworks. 39 | 40 | This implementation has improved training scalability beyond the 41 | performance improvement from a faster parallel CTC implementation. For 42 | GPU-focused training pipelines, the ability to keep all data local to 43 | GPU memory allows us to spend interconnect bandwidth on increased data 44 | parallelism. 45 | 46 | ## Performance 47 | 48 | Our CTC implementation is efficient compared with many of the other publicly available implementations. It is 49 | also written to be as numerically stable as possible. The algorithm is numerically sensitive, and we have observed 50 | catastrophic underflow even in double precision with the standard calculation: the division of 51 | two numbers on the order of 1e-324, whose quotient should have been approximately one, instead became infinity 52 | when the denominator underflowed to 0. By performing the calculation in log space instead, it is numerically 53 | stable even in single precision floating point, at the cost of significantly more expensive operations. Instead of 54 | one machine instruction, addition requires the evaluation of multiple transcendental functions. Because of this, 55 | the speed of CTC implementations can only be fairly compared if they are performing the calculation the same 56 | way. 57 | 58 | We compare our performance with [Eesen](https://github.com/srvk/eesen/commit/68f2bc2d46a5513cce3c232a645292632a1b08f9), 59 | a CTC implementation built on 60 | [Theano](https://github.com/mohammadpz/CTC-Connectionist-Temporal-Classification/commit/904e8c72e15334887609d399254cf05a591d570f), 61 | and a Cython CPU-only implementation, [Stanford-CTC](https://github.com/amaas/stanford-ctc/commit/c8859897336a349b6c561d2bf2d179fae90b4d67). 62 | We benchmark the Theano implementation operating on 32-bit floating-point numbers and doing the calculation in log-space, 63 | in order to match the other implementations we compare against. Stanford-CTC was modified to perform the calculation 64 | in log-space, as it did not support it natively. It also does not support minibatches larger than 1, so it would require 65 | an awkward memory layout to use in a real training pipeline; we assume a linear increase in cost with minibatch size.
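To make the cost of the log-space arithmetic described above concrete, here is a minimal sketch of the log-space addition ("log-sum-exp") that replaces a single hardware add; it is illustrative only, not the exact helper this codebase uses:

```cpp
#include <algorithm>
#include <cmath>
#include <limits>

// Adds two probabilities stored in log space: returns log(exp(a) + exp(b)).
// Factoring out the larger argument keeps the exponential in a safe range,
// which is why the computation stays stable even in single precision, at the
// price of an exp and a log1p for every "addition".
float log_add(float a, float b) {
    const float neg_inf = -std::numeric_limits<float>::infinity();
    if (a == neg_inf) return b;  // adding a zero probability
    if (b == neg_inf) return a;
    float hi = std::max(a, b);
    float lo = std::min(a, b);
    return hi + std::log1p(std::exp(lo - hi));
}
```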
66 | 67 | We show results on two problem sizes relevant to our English and Mandarin end-to-end models, respectively, where *T* represents the number of timesteps in the input to CTC, *L* represents the length of the labels for each example, and *A* represents the alphabet size. 68 | 69 | On the GPU, our performance at a minibatch of 64 examples ranges from 7x faster to 155x faster than Eesen, and 46x to 68x faster than the Theano implementation. 70 | 71 | ### GPU Performance 72 | Benchmarked on a single NVIDIA Titan X GPU. 73 | 74 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Eesen | Theano | 75 | |-----------------------------------|-------|---------|---------| 76 | | *N*=1 | 3.1 ms| .5 ms | 67 ms | 77 | | *N*=16 | 3.2 ms| 6 ms | 94 ms | 78 | | *N*=32 | 3.2 ms| 12 ms | 119 ms | 79 | | *N*=64 | 3.3 ms| 24 ms | 153 ms | 80 | | *N*=128 | 3.5 ms| 49 ms | 231 ms | 81 | 82 | 83 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Eesen | Theano | 84 | |-----------------------------------|-------|---------|---------| 85 | | *N*=1 | 7 ms | 40 ms | 120 ms | 86 | | *N*=16 | 9 ms | 619 ms | 385 ms | 87 | | *N*=32 | 11 ms | 1238 ms | 665 ms | 88 | | *N*=64 | 16 ms | 2475 ms | 1100 ms | 89 | | *N*=128 | 23 ms | 4950 ms | 2100 ms | 90 | 91 | ### CPU Performance 92 | 93 | Benchmarked on a dual-socket machine with two Intel E5-2660 v3 94 | processors - warp-ctc used 40 threads to maximally take advantage of the CPU resources. 95 | Eesen doesn't provide a CPU implementation. We noticed that the Theano implementation was not 96 | parallelizing computation across multiple threads. Stanford-CTC provides no mechanism 97 | for parallelization across threads. 98 | 99 | 100 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Stanford-CTC | Theano | 101 | |-----------------------------------|-------|---------|---------| 102 | | *N*=1 | 2.6 ms| 13 ms | 15 ms | 103 | | *N*=16 | 3.4 ms| 208 ms | 180 ms | 104 | | *N*=32 | 3.9 ms| 416 ms | 375 ms | 105 | | *N*=64 | 6.6 ms| 832 ms | 700 ms | 106 | | *N*=128 |12.2 ms| 1684 ms | 1340 ms | 107 | 108 | 109 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Stanford-CTC | Theano | 110 | |-----------------------------------|-------|---------|---------| 111 | | *N*=1 | 21 ms | 31 ms | 850 ms | 112 | | *N*=16 | 37 ms | 496 ms | 10800 ms| 113 | | *N*=32 | 54 ms | 992 ms | 22000 ms| 114 | | *N*=64 | 101 ms| 1984 ms | 42000 ms| 115 | | *N*=128 | 184 ms| 3968 ms | 86000 ms| 116 | 117 | 118 | 119 | 120 | 121 | ## Interface 122 | 123 | The interface is in [`include/ctc.h`](include/ctc.h). 124 | It supports CPU or GPU execution, and you can specify OpenMP parallelism 125 | if running on the CPU, or the CUDA stream if running on the GPU. We 126 | took care to ensure that the library does not perform memory 127 | allocation internally, in order to avoid synchronizations and 128 | overheads caused by memory allocation. 129 | 130 | ## Compilation 131 | 132 | warp-ctc has been tested on Ubuntu 14.04 and OSX 10.10. Windows is not supported 133 | at this time. 
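Before walking through the build, here is a rough sketch of how the C interface described in the Interface section above is typically driven: query the workspace size, allocate everything yourself, then compute costs and gradients in a single call. The names below (`ctcOptions`, `CTC_CPU`, `get_workspace_size`, `compute_ctc_loss`) reflect our reading of `include/ctc.h`; treat the exact signatures as assumptions and defer to the header.

```cpp
#include <vector>
#include "ctc.h"  // public warp-ctc header installed by this project

// Sketch of the usual call sequence on the CPU path. Every buffer is owned by
// the caller, matching the no-internal-allocation design described above.
void ctc_loss_example(const float* activations, float* gradients,
                      const int* flat_labels, const int* label_lengths,
                      const int* input_lengths, int alphabet_size,
                      int minibatch, float* costs) {
    ctcOptions options{};
    options.loc = CTC_CPU;    // or CTC_GPU, with options.stream set instead
    options.num_threads = 4;  // OpenMP threads used by the CPU implementation

    size_t workspace_bytes = 0;
    get_workspace_size(label_lengths, input_lengths, alphabet_size,
                       minibatch, options, &workspace_bytes);

    std::vector<char> workspace(workspace_bytes);  // caller-owned scratch space
    compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
                     input_lengths, alphabet_size, minibatch, costs,
                     workspace.data(), options);
}
```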
134 | 135 | First get the code: 136 | 137 | ``` 138 | git clone https://github.com/baidu-research/warp-ctc.git 139 | cd warp-ctc 140 | ``` 141 | 142 | create a build directory: 143 | 144 | ``` 145 | mkdir build 146 | cd build 147 | ``` 148 | 149 | if you have a non standard CUDA install `export CUDA_BIN_PATH=/path_to_cuda` so that CMake detects CUDA and 150 | to ensure Torch is detected, make sure `th` is in `$PATH` 151 | 152 | run cmake and build: 153 | 154 | ``` 155 | cmake ../ 156 | make 157 | ``` 158 | 159 | The C library and torch shared libraries should now be built along with test 160 | executables. If CUDA was detected, then `test_gpu` will be built; `test_cpu` 161 | will always be built. 162 | 163 | ## Tests 164 | 165 | To run the tests, make sure the CUDA libraries are in `LD_LIBRARY_PATH` (`DYLD_LIBRARY_PATH` for OSX). 166 | 167 | The Torch tests must be run from the `torch_binding/tests/` directory. 168 | 169 | ## Torch Installation 170 | 171 | ```luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec``` 172 | 173 | You can also install without cloning the repository using 174 | 175 | ```luarocks install http://raw.githubusercontent.com/baidu-research/warp-ctc/master/torch_binding/rocks/warp-ctc-scm-1.rockspec``` 176 | 177 | There is a Torch CTC [tutorial](torch_binding/TUTORIAL.md). 178 | 179 | ## Contributing 180 | 181 | We welcome improvements from the community, please feel free to submit pull 182 | requests. 183 | 184 | ## Known Issues / Limitations 185 | 186 | The CUDA implementation requires a device of at least compute capability 3.0. 187 | 188 | The CUDA implementation supports a maximum label length of 639 (timesteps are 189 | unlimited). 190 | -------------------------------------------------------------------------------- /README.zh_cn.md: -------------------------------------------------------------------------------- 1 | ![Baidu Logo](/doc/baidu-research-logo-small.png) 2 | 3 | [In English](README.md) 4 | 5 | # warp-ctc 6 | 7 | Warp-CTC是一个可以应用在CPU和GPU上高效并行的CTC代码库 (library) 8 | 介绍 9 | CTC[Connectionist Temporal Classification](http://www.cs.toronto.edu/~graves/icml_2006.pdf)作为一个损失函数,用于在序列数据上进行监督式学习,不需要对齐输入数据及标签。比如,CTC可以被用来训练端对端的语音识别系统,这正是我们在百度硅谷试验室所使用的方法。 10 | [端到端](http://www.jmlr.org/proceedings/papers/v32/graves14.pdf) 11 | [系统](http://arxiv.org/pdf/1408.2873v2.pdf) 12 | [语音识别](http://arxiv.org/abs/1512.02595) 13 | 14 | ![DSCTC](/doc/deep-speech-ctc-small.png) 15 | 16 | 上图展示了CTC计算输出序列(“THE CAT”)概率的过程,是对可能映射成“THE CAT”的所有可能输入序列对齐的和。这一过程考虑了标签会被复制的可能性,因为标签有可能在输入数据的几个时间步(time steps)时被拉伸 (请见上图底部的声谱图)。由于涉及到了组合学,计算所有可能概率的和的成本会很高,但是CTC运用了动态规划以大幅降低计算的复杂性。作为一个可微函数,CTC可以被用于深度神经网络的标准SGD训练。 17 | 我们实验室专注于递归神经网络(RNN)的可扩展性 (scalibility), 而CTC损失函数是其中很重要的一部分。为了让我们的系统更有效率,我们并行处理了CTC算法,正如这篇文章中所描述的 。这个项目包含了我们的高性能CPU以及CUDA版本的CTC损失函数, 以及绑定的Torch. 
该代码库提供了简单的C接口,易于与深度学习框架整合。 18 | 19 | 这种执行方式提高了训练的的可扩展性,超过了并行CTC的实现方式。对于以GPU为核心的训练, 我们可用所有的的网络带宽来增加数据的可并行性。 20 | 性能 21 | 相比其他的开源工具,Warp-CTC的实现方式相对高效,且代码的数值稳定性也较好。因为CTC本身对数值较为敏感,因此即使使用双精度标准计算,也会出现下溢 (underflow)的情况。 具体来说,两个数值趋近于无穷小且相近的数字相除的结果应该大约为1,却因为分母接近为0而变成无穷。 然而,如果直接取对数执行运算,CTC会在数值上较为稳定,虽然会在单精度浮点中以高成本运算为代价。 22 | 我们将Warp-CTC和[Eesen](https://github.com/srvk/eesen/commit/68f2bc2d46a5513cce3c232a645292632a1b08f9) (建立在[Theano](https://github.com/mohammadpz/CTC-Connectionist-Temporal-Classification/commit/904e8c72e15334887609d399254cf05a591d570f)上的CTC)以及仅运行[Stanford-CTC](https://github.com/amaas/stanford-ctc/commit/c8859897336a349b6c561d2bf2d179fae90b4d67)的Cython CPU进行了比较。为了进行比较,我们对在32位浮点数上运行的Theano进行了基准测试,并且取对数计算。 而Stanford-CTC由于本身不支持对数运算,因此需要被修改。而且它也不支持大于1的迷你批处理 (minibatches), 所以需要在真正的训练流水线上布局非常规内存(我们假设成本与迷你批处理的规模是成正线性关系)。 23 | 我们在Deep Speech 2中分别展示了英文及中文端对端模型的结果, 其中T代表输入CTC的时间步数量,L代表每个例子的标签长度,A代表字母数量。 24 | 在GPU上,Warp-CTC对64个例子迷你批处理的表现比Eesen快7倍到155倍,比Theano快46倍到68倍 25 | ### GPU性能 26 | 单核NVIDIA Titan X GPU基准测试 27 | 28 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Eesen | Theano | 29 | |-----------------------------------|-------|---------|---------| 30 | | *N*=1 | 3.1 ms| .5 ms | 67 ms | 31 | | *N*=16 | 3.2 ms| 6 ms | 94 ms | 32 | | *N*=32 | 3.2 ms| 12 ms | 119 ms | 33 | | *N*=64 | 3.3 ms| 24 ms | 153 ms | 34 | | *N*=128 | 3.5 ms| 49 ms | 231 ms | 35 | 36 | 37 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Eesen | Theano | 38 | |-----------------------------------|-------|---------|---------| 39 | | *N*=1 | 7 ms | 40 ms | 120 ms | 40 | | *N*=16 | 9 ms | 619 ms | 385 ms | 41 | | *N*=32 | 11 ms | 1238 ms | 665 ms | 42 | | *N*=64 | 16 ms | 2475 ms | 1100 ms | 43 | | *N*=128 | 23 ms | 4950 ms | 2100 ms | 44 | 45 | ### CPU性能 46 | 在一台有两个Intel E5-2660 v3处理器的双槽机上进行基准测试。Warp-CTC用了40个线程从而最大化了对CPU资源的利用。Eesen没有提供CPU实现方式。我们注意到Theano没有在多线程上进行并行计算。同样,Stanford-CTC没有提供多线程并行计算的机制。 47 | 48 | | *T*=150, *L*=40, *A*=28 | warp-ctc | Stanford-CTC | Theano | 49 | |-----------------------------------|-------|---------|---------| 50 | | *N*=1 | 2.6 ms| 13 ms | 15 ms | 51 | | *N*=16 | 3.4 ms| 208 ms | 180 ms | 52 | | *N*=32 | 3.9 ms| 416 ms | 375 ms | 53 | | *N*=64 | 6.6 ms| 832 ms | 700 ms | 54 | | *N*=128 |12.2 ms| 1684 ms | 1340 ms | 55 | 56 | 57 | | *T*=150, *L*=20, *A*=5000 | warp-ctc | Stanford-CTC | Theano | 58 | |-----------------------------------|-------|---------|---------| 59 | | *N*=1 | 21 ms | 31 ms | 850 ms | 60 | | *N*=16 | 37 ms | 496 ms | 10800 ms| 61 | | *N*=32 | 54 ms | 992 ms | 22000 ms| 62 | | *N*=64 | 101 ms| 1984 ms | 42000 ms| 63 | | *N*=128 | 184 ms| 3968 ms | 86000 ms| 64 | 65 | ## 接口 66 | 接口在[`include/ctc.h`](include/ctc.h)中,它支持在CPU或者GPU上执行。 如果是在CPU上运行,可以指定OpenMP并行计算; 如果是在GPU上运行,请用CUDA stream。 为避免内存分配而导致的竞争及间接成本,我们会确保代码库不会在内部进行内存分配。 67 | ## 编译器 68 | Warp-CTC已经在Ubuntu 14.04以及OSX 10.10进行了测试,现不支持Windows. 
69 | 首先，请获取代码 70 | 71 | ``` 72 | git clone https://github.com/baidu-research/warp-ctc.git 73 | cd warp-ctc 74 | ``` 75 | 76 | 创建构建目录 77 | 78 | ``` 79 | mkdir build 80 | cd build 81 | ``` 82 | 83 | 假如CUDA安装在非标准路径，请设置 `export CUDA_BIN_PATH=/path_to_cuda` 以便CMake检测到CUDA；为确保Torch被检测到，请保证 `th` 在 `$PATH` 中。 84 | 运行cmake并编译 85 | 86 | ``` 87 | cmake ../ 88 | make 89 | ``` 90 | 91 | 现在，C库和Torch共享库应当已与测试可执行文件一同构建完成。假如检测到CUDA，则会构建 `test_gpu`；`test_cpu` 总是会被构建。 92 | ## 测试 93 | 为了运行测试，请确保CUDA库位于 `LD_LIBRARY_PATH`（OSX下为 `DYLD_LIBRARY_PATH`）中。 94 | Torch测试必须在 `torch_binding/tests/` 目录中运行。 95 | ## Torch安装 96 | 97 | ```luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec``` 98 | 99 | 即使不克隆代码仓库 (repository)，你也可以安装 100 | 101 | ```luarocks install http://raw.githubusercontent.com/baidu-research/warp-ctc/master/torch_binding/rocks/warp-ctc-scm-1.rockspec``` 102 | 103 | [请见Torch CTC教程](torch_binding/TUTORIAL.zh_cn.md)。 104 | 105 | ## 限制 106 | CUDA实现需要至少3.0的计算能力，所支持的标签长度最大值为639（时间步数不受限制）。 107 | 108 | 最后我们欢迎大家提出宝贵的意见及建议以改进我们的开源服务。 109 | 110 | 在此鸣谢新智元编译 [http://chuansong.me/account/AI_era](http://chuansong.me/account/AI_era) 允许我们参考部分译文：[http://chuansong.me/n/2168385](http://chuansong.me/n/2168385) 111 | -------------------------------------------------------------------------------- /cmake/hip.cmake: -------------------------------------------------------------------------------- 1 | if(NOT WITH_ROCM) 2 | return() 3 | endif() 4 | 5 | if(NOT DEFINED ENV{ROCM_PATH}) 6 | set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") 7 | set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") 8 | set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") 9 | else() 10 | set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") 11 | set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") 12 | set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE PATH "Path to which clang has been installed") 13 | endif() 14 | set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) 15 | 16 | find_package(HIP REQUIRED) 17 | include_directories(${ROCM_PATH}/include) 18 | message(STATUS "HIP version: ${HIP_VERSION}") 19 | message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") 20 | MESSAGE(STATUS "HIP_ROOT_DIR: ${HIP_ROOT_DIR}") 21 | 22 | macro(find_package_and_include PACKAGE_NAME) 23 | find_package("${PACKAGE_NAME}" REQUIRED) 24 | include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") 25 | message(STATUS "${PACKAGE_NAME} version: ${${PACKAGE_NAME}_VERSION}") 26 | endmacro() 27 | 28 | find_package_and_include(hiprand) 29 | find_package_and_include(rocrand) 30 | find_package_and_include(rocthrust) 31 | 32 | # set CXX flags for HIP 33 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") 34 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") 35 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") 36 | set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) 37 | 38 | # define HIP_CXX_FLAGS 39 | list(APPEND HIP_CXX_FLAGS -fPIC) 40 | list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1) 41 | # Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer 42 | list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) 43 | list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined) 44 | list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override) 45 | list(APPEND HIP_CXX_FLAGS -Wno-exceptions) 46 | list(APPEND HIP_CXX_FLAGS
-Wno-shift-count-negative) 47 | list(APPEND HIP_CXX_FLAGS -Wno-shift-count-overflow) 48 | list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument) 49 | list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) 50 | list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion) 51 | list(APPEND HIP_CXX_FLAGS -Wno-pass-failed) 52 | list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) 53 | list(APPEND HIP_CXX_FLAGS -std=c++14) 54 | 55 | if(CMAKE_BUILD_TYPE MATCHES Debug) 56 | list(APPEND HIP_CXX_FLAGS -g2) 57 | list(APPEND HIP_CXX_FLAGS -O0) 58 | list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) 59 | endif(CMAKE_BUILD_TYPE MATCHES Debug) 60 | 61 | set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) 62 | set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) 63 | # Ask hcc to generate device code during compilation so we can use 64 | # host linker to link. 65 | list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) 66 | list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906) 67 | list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) 68 | list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) 69 | 70 | 71 | if(HIP_COMPILER STREQUAL clang) 72 | set(hip_library_name amdhip64) 73 | else() 74 | set(hip_library_name hip_hcc) 75 | endif() 76 | message(STATUS "HIP library name: ${hip_library_name}") 77 | 78 | # set HIP link libs 79 | find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib) 80 | message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}") 81 | -------------------------------------------------------------------------------- /doc/baidu-research-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu-research/warp-ctc/94b2fa178347cf02757bdc7329dc2f1b46f5d094/doc/baidu-research-logo-small.png -------------------------------------------------------------------------------- /doc/deep-speech-ctc-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu-research/warp-ctc/94b2fa178347cf02757bdc7329dc2f1b46f5d094/doc/deep-speech-ctc-small.png -------------------------------------------------------------------------------- /include/contrib/moderngpu/LICENSE: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctaloadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasearch.cuh" 38 | #include "loadstore.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // DeviceLoadBalancingSearch 44 | // Upper Bound search from A (needles) into B (haystack). The A values are 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at 46 | // bBegin in shared memory. 
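// Editor's note (illustration added for this write-up, not part of the
// upstream moderngpu source): a concrete example of what the load-balancing
// search produces. Suppose three source segments generate 2, 0, and 3
// work-items, so the exclusive scan held in b_shared is {0, 2, 2}. Walking the
// natural numbers 0..4 (the needles) against that scan assigns each work-item
// the index of the segment that produced it:
//
//   work-item: 0  1  2  3  4
//   a_shared : 0  0  2  2  2
//
// Items 0-1 belong to segment 0, segment 1 contributed nothing, and items 2-4
// belong to segment 2 -- i.e. upper_bound(work-item, scan) - 1, which the
// serial routine below evaluates VT items at a time per thread.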
47 | 48 | template 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin, 50 | int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) { 51 | 52 | int bKey = b_shared[bBegin]; 53 | 54 | #pragma unroll 55 | for(int i = 0; i < VT; ++i) { 56 | bool p; 57 | if(RangeCheck) 58 | p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey)); 59 | else 60 | p = aBegin < bKey; 61 | 62 | if(p) 63 | // Advance A (the needle). 64 | a_shared[aBegin++] = bFirst + bBegin; 65 | else 66 | // Advance B (the haystack). 67 | bKey = b_shared[++bBegin]; 68 | } 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | // CTALoadBalance 73 | // Computes upper_bound(counting_iterator(first), b_global) - 1. 74 | 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory. 76 | // This returns the loaded B elements at the beginning or end of shared memory 77 | // depending on the aFirst argument. 78 | 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory. 80 | template 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global, 82 | int sourceCount, int block, int tid, const int* mp_global, 83 | int* indices_shared, bool loadPrecedingB) { 84 | 85 | int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT, 86 | mp_global); 87 | 88 | int a0 = range.x; 89 | int a1 = range.y; 90 | int b0 = range.z; 91 | int b1 = range.w; 92 | if(!b0) loadPrecedingB = false; 93 | 94 | // Load one trailing term from B. If we're already at the end, fill the 95 | // end of the buffer with destCount. 96 | int aCount = a1 - a0; 97 | int bCount = b1 - b0; 98 | int extended = b1 < sourceCount; 99 | int loadCount = bCount + extended; 100 | int fillCount = NT * VT + 1 - loadCount - aCount; 101 | 102 | int* a_shared = indices_shared; 103 | int* b_shared = indices_shared + aCount + (int)loadPrecedingB; 104 | 105 | // Load the B values. 106 | // DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB, 107 | // b_global + b0 - (int)loadPrecedingB, tid, 108 | // b_shared - (int)loadPrecedingB); 109 | 110 | for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT) 111 | b_shared[i] = b_global[b0 + i]; 112 | 113 | // Fill the end of the array with destCount. 114 | for(int i = tid + extended; i < fillCount; i += NT) 115 | b_shared[bCount + i] = destCount; 116 | __syncthreads(); 117 | 118 | // Run a merge path to find the start of the serial merge for each thread. 119 | int diag = VT * tid; 120 | int mp = MergePath(mgpu::counting_iterator(a0), 121 | aCount, b_shared, bCount, diag, mgpu::less()); 122 | 123 | int a0tid = a0 + mp; 124 | int b0tid = diag - mp; 125 | 126 | // Subtract 1 from b0 because we want to return upper_bound - 1. 127 | DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1, 128 | b0tid, bCount, a_shared - a0); 129 | __syncthreads(); 130 | 131 | b0 -= (int)loadPrecedingB; 132 | return make_int4(a0, a1, b0, b1); 133 | } 134 | 135 | 136 | } // namespace mgpu 137 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctascan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuenums.h" 38 | #include "deviceutil.cuh" 39 | #include "intrinsics.cuh" 40 | 41 | namespace mgpu { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // CTAReduce 45 | 46 | template > 47 | struct CTAReduce { 48 | typedef typename Op::first_argument_type T; 49 | enum { Size = NT, Capacity = NT }; 50 | struct Storage { T shared[Capacity]; }; 51 | 52 | MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) { 53 | storage.shared[tid] = x; 54 | __syncthreads(); 55 | 56 | // Fold the data in half with each pass. 57 | #pragma unroll 58 | for(int destCount = NT / 2; destCount >= 1; destCount /= 2) { 59 | if(tid < destCount) { 60 | // Read from the right half and store to the left half. 61 | x = op(x, storage.shared[destCount + tid]); 62 | storage.shared[tid] = x; 63 | } 64 | __syncthreads(); 65 | } 66 | T total = storage.shared[0]; 67 | __syncthreads(); 68 | return total; 69 | } 70 | }; 71 | 72 | #if __CUDA_ARCH__ >= 300 73 | 74 | template 75 | struct CTAReduce > { 76 | typedef mgpu::plus Op; 77 | typedef int T; 78 | enum { Size = NT, Capacity = WARP_SIZE }; 79 | struct Storage { int shared[Capacity]; }; 80 | 81 | MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage, 82 | Op op = Op()) { 83 | 84 | const int NumSections = WARP_SIZE; 85 | const int SecSize = NT / NumSections; 86 | int lane = (SecSize - 1) & tid; 87 | int sec = tid / SecSize; 88 | 89 | // In the first phase, threads cooperatively find the reduction within 90 | // their segment. 
The segments are SecSize threads (NT / WARP_SIZE) 91 | // wide. 92 | #pragma unroll 93 | for(int offset = 1; offset < SecSize; offset *= 2) 94 | x = shfl_add(x, offset, SecSize); 95 | 96 | // The last thread in each segment stores the local reduction to shared 97 | // memory. 98 | if(SecSize - 1 == lane) storage.shared[sec] = x; 99 | __syncthreads(); 100 | 101 | // Reduce the totals of each input segment. The spine is WARP_SIZE 102 | // threads wide. 103 | if(tid < NumSections) { 104 | x = storage.shared[tid]; 105 | #pragma unroll 106 | for(int offset = 1; offset < NumSections; offset *= 2) 107 | x = shfl_add(x, offset, NumSections); 108 | storage.shared[tid] = x; 109 | } 110 | __syncthreads(); 111 | 112 | int reduction = storage.shared[NumSections - 1]; 113 | __syncthreads(); 114 | 115 | return reduction; 116 | } 117 | }; 118 | 119 | template 120 | struct CTAReduce > { 121 | typedef mgpu::maximum Op; 122 | enum { Size = NT, Capacity = WARP_SIZE }; 123 | struct Storage { int shared[Capacity]; }; 124 | 125 | MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage, 126 | Op op = Op()) { 127 | 128 | const int NumSections = WARP_SIZE; 129 | const int SecSize = NT / NumSections; 130 | int lane = (SecSize - 1) & tid; 131 | int sec = tid / SecSize; 132 | 133 | #pragma unroll 134 | for(int offset = 1; offset < SecSize; offset *= 2) 135 | x = shfl_max(x, offset, SecSize); 136 | 137 | if(SecSize - 1 == lane) storage.shared[sec] = x; 138 | __syncthreads(); 139 | 140 | if(tid < NumSections) { 141 | x = storage.shared[tid]; 142 | #pragma unroll 143 | for(int offset = 1; offset < NumSections; offset *= 2) 144 | x = shfl_max(x, offset, NumSections); 145 | storage.shared[tid] = x; 146 | } 147 | __syncthreads(); 148 | 149 | int reduction = storage.shared[NumSections - 1]; 150 | __syncthreads(); 151 | 152 | return reduction; 153 | } 154 | }; 155 | 156 | #endif // __CUDA_ARCH__ >= 300 157 | 158 | //////////////////////////////////////////////////////////////////////////////// 159 | // CTAScan 160 | 161 | template > 162 | struct CTAScan { 163 | typedef typename Op::result_type T; 164 | enum { Size = NT, Capacity = 2 * NT + 1 }; 165 | struct Storage { T shared[Capacity]; }; 166 | 167 | MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total, 168 | MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) { 169 | 170 | storage.shared[tid] = x; 171 | int first = 0; 172 | __syncthreads(); 173 | 174 | #pragma unroll 175 | for(int offset = 1; offset < NT; offset += offset) { 176 | if(tid >= offset) 177 | x = op(storage.shared[first + tid - offset], x); 178 | first = NT - first; 179 | storage.shared[first + tid] = x; 180 | __syncthreads(); 181 | } 182 | *total = storage.shared[first + NT - 1]; 183 | 184 | if(MgpuScanTypeExc == type) 185 | x = tid ? storage.shared[first + tid - 1] : identity; 186 | 187 | __syncthreads(); 188 | return x; 189 | } 190 | MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) { 191 | T total; 192 | return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op()); 193 | } 194 | }; 195 | 196 | //////////////////////////////////////////////////////////////////////////////// 197 | // Special partial specialization for CTAScan on Kepler. 198 | // This uses the shfl intrinsic to reduce scan latency. 
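// Editor's note (illustration added for this write-up, not part of the
// upstream moderngpu source): the shfl-based path below swaps the
// shared-memory ping-pong scan above for register-to-register exchanges.
// For a 4-lane inclusive plus-scan of {3, 1, 4, 1}:
//
//   offset 1: each lane adds the value one lane below  -> {3, 4, 5, 5}
//   offset 2: each lane adds the value two lanes below -> {3, 4, 8, 9}
//
// which reaches the inclusive scan {3, 4, 8, 9} in log2(width) steps. Lanes
// whose source would fall off the left edge keep their value unchanged; that
// predication is what the shfl_add helper wraps around the shuffle-up
// instruction.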
199 | 200 | #if __CUDA_ARCH__ >= 300 201 | 202 | template 203 | struct CTAScan > { 204 | typedef mgpu::plus Op; 205 | enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments }; 206 | enum { Capacity = NumSegments + 1 }; 207 | struct Storage { int shared[Capacity + 1]; }; 208 | 209 | MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total, 210 | MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) { 211 | 212 | // Define WARP_SIZE segments that are NT / WARP_SIZE large. 213 | // Each warp makes log(SegSize) shfl_add calls. 214 | // The spine makes log(WARP_SIZE) shfl_add calls. 215 | int lane = (SegSize - 1) & tid; 216 | int segment = tid / SegSize; 217 | 218 | // Scan each segment using shfl_add. 219 | int scan = x; 220 | #pragma unroll 221 | for(int offset = 1; offset < SegSize; offset *= 2) 222 | scan = shfl_add(scan, offset, SegSize); 223 | 224 | // Store the reduction (last element) of each segment into storage. 225 | if(SegSize - 1 == lane) storage.shared[segment] = scan; 226 | __syncthreads(); 227 | 228 | // Warp 0 does a full shfl warp scan on the partials. The total is 229 | // stored to shared[NumSegments]. (NumSegments = WARP_SIZE) 230 | if(tid < NumSegments) { 231 | int y = storage.shared[tid]; 232 | int scan = y; 233 | #pragma unroll 234 | for(int offset = 1; offset < NumSegments; offset *= 2) 235 | scan = shfl_add(scan, offset, NumSegments); 236 | storage.shared[tid] = scan - y; 237 | if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan; 238 | } 239 | __syncthreads(); 240 | 241 | // Add the scanned partials back in and convert to exclusive scan. 242 | scan += storage.shared[segment]; 243 | if(MgpuScanTypeExc == type) { 244 | scan -= x; 245 | if(identity && !tid) scan = identity; 246 | } 247 | *total = storage.shared[NumSegments]; 248 | __syncthreads(); 249 | 250 | return scan; 251 | } 252 | MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) { 253 | int total; 254 | return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0); 255 | } 256 | }; 257 | 258 | #endif // __CUDA_ARCH__ >= 300 259 | 260 | //////////////////////////////////////////////////////////////////////////////// 261 | // CTABinaryScan 262 | 263 | template 264 | MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) { 265 | const int NumWarps = NT / WARP_SIZE; 266 | int warp = tid / WARP_SIZE; 267 | int lane = (WARP_SIZE - 1); 268 | 269 | // Store the bit totals for each warp. 270 | uint bits = __ballot(x); 271 | shared[warp] = popc(bits); 272 | __syncthreads(); 273 | 274 | #if __CUDA_ARCH__ >= 300 275 | if(tid < NumWarps) { 276 | int x = shared[tid]; 277 | int scan = x; 278 | #pragma unroll 279 | for(int offset = 1; offset < NumWarps; offset *= 2) 280 | scan = shfl_add(scan, offset, NumWarps); 281 | shared[tid] = scan - x; 282 | } 283 | __syncthreads(); 284 | 285 | #else 286 | // Thread 0 scans warp totals. 287 | if(!tid) { 288 | int scan = 0; 289 | #pragma unroll 290 | for(int i = 0; i < NumWarps; ++i) { 291 | int y = shared[i]; 292 | shared[i] = scan; 293 | scan += y; 294 | } 295 | shared[NumWarps] = scan; 296 | } 297 | __syncthreads(); 298 | 299 | #endif // __CUDA_ARCH__ >= 300 300 | 301 | // Add the warp scan back into the partials. 
302 | int scan = shared[warp] + __popc(bfe(bits, 0, lane)); 303 | *total = shared[NumWarps]; 304 | __syncthreads(); 305 | return scan; 306 | } 307 | 308 | } // namespace mgpu 309 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | #include "../mgpudevice.cuh" 39 | 40 | namespace mgpu { 41 | 42 | template 44 | MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key, 45 | int shift, Comp comp) { 46 | 47 | IntT scale = (1<< shift) - 1; 48 | int mid = (int)((begin + scale * end)>> shift); 49 | 50 | T key2 = data[mid]; 51 | bool pred = (MgpuBoundsUpper == Bounds) ? 
52 | !comp(key, key2) : 53 | comp(key2, key); 54 | if(pred) begin = mid + 1; 55 | else end = mid; 56 | } 57 | 58 | template 60 | MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels, 61 | Comp comp) { 62 | 63 | int begin = 0; 64 | int end = count; 65 | 66 | if(levels >= 4 && begin < end) 67 | BinarySearchIt(data, begin, end, key, 9, comp); 68 | if(levels >= 3 && begin < end) 69 | BinarySearchIt(data, begin, end, key, 7, comp); 70 | if(levels >= 2 && begin < end) 71 | BinarySearchIt(data, begin, end, key, 5, comp); 72 | if(levels >= 1 && begin < end) 73 | BinarySearchIt(data, begin, end, key, 4, comp); 74 | 75 | while(begin < end) 76 | BinarySearchIt(data, begin, end, key, 1, comp); 77 | return begin; 78 | } 79 | 80 | template 81 | MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) { 82 | int begin = 0; 83 | int end = count; 84 | while(begin < end) 85 | BinarySearchIt(data, begin, end, key, 1, comp); 86 | return begin; 87 | } 88 | 89 | //////////////////////////////////////////////////////////////////////////////// 90 | // MergePath search 91 | 92 | template 93 | MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag, 94 | Comp comp) { 95 | 96 | typedef typename std::iterator_traits::value_type T; 97 | int begin = max(0, diag - bCount); 98 | int end = min(diag, aCount); 99 | 100 | while(begin < end) { 101 | int mid = (begin + end)>> 1; 102 | T aKey = a[mid]; 103 | T bKey = b[diag - 1 - mid]; 104 | bool pred = (MgpuBoundsUpper == Bounds) ? 105 | comp(aKey, bKey) : 106 | !comp(bKey, aKey); 107 | if(pred) begin = mid + 1; 108 | else end = mid; 109 | } 110 | return begin; 111 | } 112 | 113 | 114 | //////////////////////////////////////////////////////////////////////////////// 115 | // SegmentedMergePath search 116 | 117 | template 118 | MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount, 119 | int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) { 120 | 121 | // leftEnd and rightStart are defined from the origin, and diag is defined 122 | // from aOffset. 123 | // We only need to run a Merge Path search if the diagonal intersects the 124 | // segment that strides the left and right halves (i.e. is between leftEnd 125 | // and rightStart). 126 | if(aOffset + diag <= leftEnd) return diag; 127 | if(aOffset + diag >= rightStart) return aCount; 128 | 129 | bCount = min(bCount, rightStart - bOffset); 130 | int begin = max(max(leftEnd - aOffset, 0), diag - bCount); 131 | int end = min(diag, aCount); 132 | 133 | while(begin < end) { 134 | int mid = (begin + end)>> 1; 135 | int ai = aOffset + mid; 136 | int bi = bOffset + diag - 1 - mid; 137 | 138 | bool pred = !comp(keys[bi], keys[ai]); 139 | if(pred) begin = mid + 1; 140 | else end = mid; 141 | } 142 | return begin; 143 | } 144 | 145 | //////////////////////////////////////////////////////////////////////////////// 146 | // BalancedPath search 147 | 148 | template 150 | MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b, 151 | int bCount, int diag, int levels, Comp comp) { 152 | 153 | typedef typename std::iterator_traits::value_type T; 154 | 155 | int p = MergePath(a, aCount, b, bCount, diag, comp); 156 | int aIndex = p; 157 | int bIndex = diag - p; 158 | 159 | bool star = false; 160 | if(bIndex < bCount) { 161 | if(Duplicates) { 162 | T x = b[bIndex]; 163 | 164 | // Search for the beginning of the duplicate run in both A and B. 
165 | // Because 166 | int aStart = BiasedBinarySearch(a, aIndex, x, 167 | levels, comp); 168 | int bStart = BiasedBinarySearch(b, bIndex, x, 169 | levels, comp); 170 | 171 | // The distance between the merge path and the lower_bound is the 172 | // 'run'. We add up the a- and b- runs and evenly distribute them to 173 | // get a stairstep path. 174 | int aRun = aIndex - aStart; 175 | int bRun = bIndex - bStart; 176 | int xCount = aRun + bRun; 177 | 178 | // Attempt to advance b and regress a. 179 | int bAdvance = max(xCount>> 1, bRun); 180 | int bEnd = min(bCount, bStart + bAdvance + 1); 181 | int bRunEnd = BinarySearch(b + bIndex, 182 | bEnd - bIndex, x, comp) + bIndex; 183 | bRun = bRunEnd - bStart; 184 | 185 | bAdvance = min(bAdvance, bRun); 186 | int aAdvance = xCount - bAdvance; 187 | 188 | bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun); 189 | aIndex = aStart + aAdvance; 190 | 191 | if(roundUp) star = true; 192 | } else { 193 | if(aIndex && aCount) { 194 | T aKey = a[aIndex - 1]; 195 | T bKey = b[bIndex]; 196 | 197 | // If the last consumed element in A (aIndex - 1) is the same as 198 | // the next element in B (bIndex), we're sitting at a starred 199 | // partition. 200 | if(!comp(aKey, bKey)) star = true; 201 | } 202 | } 203 | } 204 | return make_int2(aIndex, star); 205 | } 206 | 207 | } // namespace mgpu 208 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasegreduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasegscan.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Segmented reduce utility functions. 44 | 45 | // Extract the upper-bound indices from the coded ranges. Decrement to include 46 | // the first addressed row/segment. 47 | 48 | struct SegReduceRange { 49 | int begin; 50 | int end; 51 | int total; 52 | bool flushLast; 53 | }; 54 | 55 | MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) { 56 | SegReduceRange range; 57 | range.begin = 0x7fffffff & limit0; 58 | range.end = 0x7fffffff & limit1; 59 | range.total = range.end - range.begin; 60 | range.flushLast = 0 == (0x80000000 & limit1); 61 | range.end += !range.flushLast; 62 | return range; 63 | } 64 | 65 | // Reconstitute row/segment indices from a starting row index and packed end 66 | // flags. Used for pre-processed versions of interval reduce and interval Spmv. 67 | template 68 | MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags, 69 | int rows[VT + 1]) { 70 | 71 | rows[0] = first; 72 | #pragma unroll 73 | for(int i = 0; i < VT; ++i) { 74 | if((1<< i) & endFlags) ++first; 75 | rows[i + 1] = first; 76 | } 77 | } 78 | 79 | //////////////////////////////////////////////////////////////////////////////// 80 | // After loading CSR terms into shared memory, each thread binary searches 81 | // (upper-bound) to find its starting point. Each thread then walks forward, 82 | // emitting the csr0-relative row indices to register. 83 | 84 | template 85 | MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared, 86 | int numRows, int end, int rows[VT + 1], int rowStarts[VT]) { 87 | 88 | // Each thread binary searches for its starting row. 89 | int row = BinarySearch(csr_shared, numRows, tidOffset, 90 | mgpu::less()) - 1; 91 | 92 | // Each thread starts at row and scans forward, emitting row IDs into 93 | // register. Store the CTA-local row index (starts at 0) to rows and the 94 | // start of the row (globally) to rowStarts. 95 | int curOffset = csr_shared[row]; 96 | int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 97 | 98 | rows[0] = row; 99 | rowStarts[0] = curOffset; 100 | int endFlags = 0; 101 | 102 | #pragma unroll 103 | for(int i = 1; i <= VT; ++i) { 104 | // Advance the row cursor when the iterator hits the next row offset. 105 | if(tidOffset + i == nextOffset) { 106 | // Set an end flag when the cursor advances to the next row. 107 | endFlags |= 1<< (i - 1); 108 | 109 | // Advance the cursor and load the next row offset. 110 | ++row; 111 | curOffset = nextOffset; 112 | nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 113 | } 114 | rows[i] = row; 115 | if(i < VT) rowStarts[i] = curOffset; 116 | } 117 | __syncthreads(); 118 | 119 | return endFlags; 120 | } 121 | 122 | //////////////////////////////////////////////////////////////////////////////// 123 | // DeviceSegReducePrepare 124 | // Expand non-empty interval of CSR elements into row indices. Compute end-flags 125 | // by comparing adjacent row IDs. 
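// Illustrative example (assuming VT == 4): with csr_shared = {0, 2, 3}
// (numRows == 3), end = 6 and a thread whose tidOffset is 0,
// DeviceExpandCsrRows above produces
//   rows      = {0, 0, 1, 2, 2}
//   rowStarts = {0, 0, 2, 3}
//   endFlags  = 0b110   (elements 1 and 2 are the last of their rows).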
126 | 127 | // DeviceSegReducePrepare may be called either by a pre-processing kernel or by 128 | // the kernel that actually evaluates the segmented reduction if no preprocesing 129 | // is desired. 130 | struct SegReduceTerms { 131 | int endFlags; 132 | int tidDelta; 133 | }; 134 | 135 | template 136 | MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows, 137 | int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) { 138 | 139 | // Pass a sentinel (end) to point to the next segment start. If we flush, 140 | // this is the end of this tile. Otherwise it is INT_MAX 141 | int endFlags = DeviceExpandCsrRows(gid + VT * tid, csr_shared, 142 | numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts); 143 | 144 | // Find the distance to to scan to compute carry-in for each thread. Use the 145 | // existance of an end flag anywhere in the thread to determine if carry-out 146 | // values from the left should propagate through to the right. 147 | int tidDelta = DeviceFindSegScanDelta(tid, rows[0] != rows[VT], 148 | csr_shared); 149 | 150 | SegReduceTerms terms = { endFlags, tidDelta }; 151 | return terms; 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | // CTASegReduce 156 | // Core segmented reduction code. Supports fast-path and slow-path for intra-CTA 157 | // segmented reduction. Stores partials to global memory. 158 | // Callers feed CTASegReduce::ReduceToGlobal values in thread order. 159 | template 160 | struct CTASegReduce { 161 | typedef CTASegScan SegScan; 162 | 163 | enum { 164 | NV = NT * VT, 165 | Capacity = HalfCapacity ? (NV / 2) : NV 166 | }; 167 | 168 | union Storage { 169 | typename SegScan::Storage segScanStorage; 170 | T values[Capacity]; 171 | }; 172 | 173 | template 174 | MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total, 175 | int tidDelta, int startRow, int block, int tid, T data[VT], 176 | DestIt dest_global, T* carryOut_global, T identity, Op op, 177 | Storage& storage) { 178 | 179 | // Run a segmented scan within the thread. 180 | T x, localScan[VT]; 181 | #pragma unroll 182 | for(int i = 0; i < VT; ++i) { 183 | x = i ? op(x, data[i]) : data[i]; 184 | localScan[i] = x; 185 | if(rows[i] != rows[i + 1]) x = identity; 186 | } 187 | 188 | // Run a parallel segmented scan over the carry-out values to compute 189 | // carry-in. 190 | T carryOut; 191 | T carryIn = SegScan::SegScanDelta(tid, tidDelta, x, 192 | storage.segScanStorage, &carryOut, identity, op); 193 | 194 | // Store the carry-out for the entire CTA to global memory. 195 | if(!tid) carryOut_global[block] = carryOut; 196 | 197 | dest_global += startRow; 198 | if(HalfCapacity && total > Capacity) { 199 | // Add carry-in to each thread-local scan value. Store directly 200 | // to global. 201 | #pragma unroll 202 | for(int i = 0; i < VT; ++i) { 203 | // Add the carry-in to the local scan. 204 | T x2 = op(carryIn, localScan[i]); 205 | 206 | // Store on the end flag and clear the carry-in. 207 | if(rows[i] != rows[i + 1]) { 208 | carryIn = identity; 209 | dest_global[rows[i]] = x2; 210 | } 211 | } 212 | } else { 213 | // All partials fit in shared memory. Add carry-in to each thread- 214 | // local scan value. 215 | #pragma unroll 216 | for(int i = 0; i < VT; ++i) { 217 | // Add the carry-in to the local scan. 218 | T x2 = op(carryIn, localScan[i]); 219 | 220 | // Store reduction when the segment changes and clear the 221 | // carry-in. 
222 | if(rows[i] != rows[i + 1]) { 223 | storage.values[rows[i]] = x2; 224 | carryIn = identity; 225 | } 226 | } 227 | __syncthreads(); 228 | 229 | // Cooperatively store reductions to global memory. 230 | for(int index = tid; index < total; index += NT) 231 | dest_global[index] = storage.values[index]; 232 | __syncthreads(); 233 | } 234 | } 235 | }; 236 | 237 | } // namespace mgpu 238 | 239 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasegscan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctascan.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // DeviceFindSegScanDelta 43 | // Runs an inclusive max-index scan over binary inputs. 
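// For each thread, the return value is the distance back to the closest
// thread whose flag is set, counting the thread's own flag (a delta of 0
// means this thread's flag is set). For example, flags {1, 0, 0, 1, 0} on
// threads 0..4 yield deltas {0, 1, 2, 0, 1}. CTASegScan below uses this
// delta to bound how far carry-in values may propagate across threads.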
44 | 45 | template 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) { 47 | const int NumWarps = NT / 32; 48 | 49 | int warp = tid / 32; 50 | int lane = 31 & tid; 51 | uint warpMask = 0xffffffff>> (31 - lane); // inclusive search 52 | uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search 53 | 54 | uint warpBits = __ballot(flag); 55 | delta_shared[warp] = warpBits; 56 | __syncthreads(); 57 | 58 | if(tid < NumWarps) { 59 | uint ctaBits = __ballot(0 != delta_shared[tid]); 60 | int warpSegment = 31 - clz(ctaMask & ctaBits); 61 | int start = (-1 != warpSegment) ? 62 | (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0; 63 | delta_shared[NumWarps + tid] = start; 64 | } 65 | __syncthreads(); 66 | 67 | // Find the closest flag to the left of this thread within the warp. 68 | // Include the flag for this thread. 69 | int start = 31 - clz(warpMask & warpBits); 70 | if(-1 != start) start += ~31 & tid; 71 | else start = delta_shared[NumWarps + warp]; 72 | __syncthreads(); 73 | 74 | return tid - start; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // CTASegScan 79 | 80 | template > 81 | struct CTASegScan { 82 | typedef _Op Op; 83 | typedef typename Op::result_type T; 84 | enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT }; 85 | union Storage { 86 | int delta[NumWarps]; 87 | T values[Capacity]; 88 | }; 89 | 90 | // Each thread passes the reduction of the LAST SEGMENT that it covers. 91 | // flag is set to true if there's at least one segment flag in the thread. 92 | // SegScan returns the reduction of values for the first segment in this 93 | // thread over the preceding threads. 94 | // Return the value init for the first thread. 95 | 96 | // When scanning single elements per thread, interpret the flag as a BEGIN 97 | // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not 98 | // thread tid. 99 | 100 | // The function returns the reduction of the last segment in the CTA. 101 | 102 | MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x, 103 | Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) { 104 | 105 | // Run an inclusive scan 106 | int first = 0; 107 | storage.values[first + tid] = x; 108 | __syncthreads(); 109 | 110 | #pragma unroll 111 | for(int offset = 1; offset < NT; offset += offset) { 112 | if(tidDelta >= offset) 113 | x = op(storage.values[first + tid - offset], x); 114 | first = NT - first; 115 | storage.values[first + tid] = x; 116 | __syncthreads(); 117 | } 118 | 119 | // Get the exclusive scan. 120 | x = tid ? storage.values[first + tid - 1] : identity; 121 | *carryOut = storage.values[first + NT - 1]; 122 | __syncthreads(); 123 | return x; 124 | } 125 | 126 | MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage, 127 | T* carryOut, T identity = (T)0, Op op = Op()) { 128 | 129 | // Find the left-most thread that covers the first segment of this 130 | // thread. 131 | int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta); 132 | 133 | return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op); 134 | } 135 | }; 136 | 137 | } // namespace mgpu 138 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/ctasortedsearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpudevice.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // DeviceSerialSearch 45 | 46 | template 48 | MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin, 49 | int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices, 50 | Comp comp) { 51 | 52 | const int FlagA = IndexA ? 0x80000000 : 1; 53 | const int FlagB = IndexB ? 0x80000000 : 1; 54 | 55 | T aKey = keys_shared[aBegin]; 56 | T bKey = keys_shared[bBegin]; 57 | T aPrev, bPrev; 58 | if(aBegin > 0) aPrev = keys_shared[aBegin - 1]; 59 | if(bBegin > 0) bPrev = keys_shared[bBegin - 1]; 60 | int decisions = 0; 61 | int matchCountA = 0; 62 | int matchCountB = 0; 63 | 64 | #pragma unroll 65 | for(int i = 0; i < VT; ++i) { 66 | bool p; 67 | if(RangeCheck && aBegin >= aEnd) p = false; 68 | else if(RangeCheck && bBegin >= bEnd) p = true; 69 | else p = (MgpuBoundsUpper == Bounds) ? 70 | comp(aKey, bKey) : 71 | !comp(bKey, aKey); 72 | 73 | if(p) { 74 | // aKey is smaller than bKey, so it is inserted before bKey. 75 | // Save bKey's index (bBegin + first) as the result of the search 76 | // and advance to the next needle in A. 77 | bool match = false; 78 | if(MatchA) { 79 | // Test if there is an element in B that matches aKey. 80 | if(MgpuBoundsUpper == Bounds) { 81 | // Upper Bound: We're inserting aKey after bKey. If there 82 | // is a match for aKey it must be bPrev. 
Check that bPrev 83 | // is in range and equal to aKey. 84 | // The predicate test result !comp(aKey, bPrev) was 85 | // established on the previous A-advancing iteration (it 86 | // failed the comp(aKey, bKey) test to get us to this 87 | // point). Check the other half of the equality condition 88 | // with a second comparison. 89 | bool inRange = !RangeCheck || (bBegin > aEnd); 90 | match = inRange && !comp(bPrev, aKey); 91 | } else { 92 | // Lower Bound: We're inserting aKey before bKey. If there 93 | // is a match for aKey, it must be bKey. Check that bKey 94 | // is in range and equal to aKey. 95 | // The predicate test !comp(bKey, aKey) has established one 96 | // half of the equality condition. We establish the other 97 | // half with a second comparison. 98 | bool inRange = !RangeCheck || (bBegin < bEnd); 99 | match = inRange && !comp(aKey, bKey); 100 | } 101 | } 102 | 103 | int index = 0; 104 | if(IndexA) index = bOffset + bBegin; 105 | if(match) index |= FlagA; 106 | if(IndexA || MatchA) indices[i] = index; 107 | matchCountA += match; 108 | 109 | // Mark the decision bit to indicate that this iteration has 110 | // progressed A (the needles). 111 | decisions |= 1<< i; 112 | aPrev = aKey; 113 | aKey = keys_shared[++aBegin]; 114 | } else { 115 | // aKey is larger than bKey, so it is inserted after bKey (but we 116 | // don't know where yet). Advance the B index to the next element in 117 | // the haystack to continue the search for the current needle. 118 | bool match = false; 119 | if(MatchB) { 120 | if(MgpuBoundsUpper == Bounds) { 121 | // Upper Bound: aKey is not smaller than bKey. We advance to 122 | // the next haystack element in B. If there is a match in A 123 | // for bKey it must be aKey. By entering this branch we've 124 | // verified that !comp(aKey, bKey). Making the reciprocal 125 | // comparison !comp(bKey, aKey) establishes aKey == bKey. 126 | bool inRange = !RangeCheck || 127 | ((bBegin < bEnd) && (aBegin < aEnd)); 128 | match = inRange && !comp(bKey, aKey); 129 | } else { 130 | // Lower Bound: bKey is smaller than aKey. We advance to the 131 | // next element in B. If there is a match for bKey, it must 132 | // be aPrev. The previous A-advancing iteration proved that 133 | // !comp(bKey, aPrev). We test !comp(aPrev, bKey) for the 134 | // other half of the equality condition. 135 | bool inRange = !RangeCheck || 136 | ((bBegin < bEnd) && (aBegin > 0)); 137 | match = inRange && !comp(aPrev, bKey); 138 | } 139 | } 140 | 141 | int index = 0; 142 | if(IndexB) index = aOffset + aBegin; 143 | if(match) index |= FlagB; 144 | if(IndexB || MatchB) indices[i] = index; 145 | matchCountB += match; 146 | 147 | // Keep the decision bit cleared to indicate that this iteration 148 | // has progressed B (the haystack). 149 | bPrev = bKey; 150 | bKey = keys_shared[++bBegin]; 151 | } 152 | } 153 | return make_int3(decisions, matchCountA, matchCountB); 154 | } 155 | 156 | //////////////////////////////////////////////////////////////////////////////// 157 | // CTASortedSearch 158 | // Take keys in shared memory and return indices and b-match flags in shared 159 | // memory. 160 | // NOTE: This function doesn't do any strided-to-thread order transposes so 161 | // using an even number of values per thread will incur no additional bank 162 | // conflicts. 
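// The decision bits returned by DeviceSerialSearch record which iterations
// consumed an A key (needle) and which consumed a B key (haystack); they are
// used below to compact the results so that A results occupy the first
// aCount slots of indices_shared and B results the following bCount slots.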
163 | 164 | template 166 | MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount, 167 | int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended, 168 | int tid, int* indices_shared, Comp comp) { 169 | 170 | // Run a merge path to find the start of the serial search for each thread. 171 | int diag = VT * tid; 172 | int mp = MergePath(keys_shared + aStart, aCount, 173 | keys_shared + bStart, bCount, diag, comp); 174 | int a0tid = mp; 175 | int b0tid = diag - mp; 176 | 177 | // Serial search into register. 178 | int3 results; 179 | int indices[VT]; 180 | if(extended) 181 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 183 | a0 - aStart, b0 - bStart, indices, comp); 184 | else 185 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 187 | a0 - aStart, b0 - bStart, indices, comp); 188 | __syncthreads(); 189 | 190 | // Compact the indices into shared memory. Use the decision bits (set is A, 191 | // cleared is B) to select the destination. 192 | int decisions = results.x; 193 | b0tid += aCount; 194 | #pragma unroll 195 | for(int i = 0; i < VT; ++i) { 196 | if((1<< i) & decisions) { 197 | if(IndexA || MatchA) indices_shared[a0tid++] = indices[i]; 198 | } else { 199 | if(IndexB || MatchB) indices_shared[b0tid++] = indices[i]; 200 | } 201 | } 202 | __syncthreads(); 203 | 204 | // Return the match counts for A and B keys. 205 | return make_int2(results.y, results.z); 206 | } 207 | 208 | } // namespace mgpu 209 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/deviceutil.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "intrinsics.cuh" 38 | 39 | namespace mgpu { 40 | 41 | // Get the difference between two pointers in bytes. 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) { 43 | return (const byte*)b - (const byte*)a; 44 | } 45 | 46 | // Offset a pointer by i bytes. 47 | template 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) { 49 | return (const T*)((const byte*)p + i); 50 | } 51 | template 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) { 53 | return (T*)((byte*)p + i); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | // Task range support 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs. 59 | 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) { 61 | div_t d = div(numItems, numWorkers); 62 | return make_int2(d.quot, d.rem); 63 | } 64 | 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) { 66 | int2 range; 67 | range.x = task.x * block; 68 | range.x += min(block, task.y); 69 | range.y = range.x + task.x + (block < task.y); 70 | return range; 71 | } 72 | 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize, 74 | int count) { 75 | int2 range = ComputeTaskRange(block, task); 76 | range.x *= blockSize; 77 | range.y = min(count, range.y * blockSize); 78 | return range; 79 | } 80 | 81 | //////////////////////////////////////////////////////////////////////////////// 82 | // DeviceExtractHeadFlags 83 | // Input array flags is a bit array with 32 head flags per word. 84 | // ExtractThreadHeadFlags returns numBits flags starting at bit index. 85 | 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index, 87 | int numBits) { 88 | 89 | int index2 = index>> 5; 90 | int shift = 31 & index; 91 | uint headFlags = flags[index2]>> shift; 92 | int shifted = 32 - shift; 93 | 94 | if(shifted < numBits) 95 | // We also need to shift in the next set of bits. 96 | headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift); 97 | headFlags &= (1<< numBits) - 1; 98 | return headFlags; 99 | } 100 | 101 | //////////////////////////////////////////////////////////////////////////////// 102 | // DevicePackHeadFlags 103 | // Pack VT bits per thread at 32 bits/thread. Will consume an integer number of 104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads 105 | // return packed words. 106 | 107 | template 108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid, 109 | uint* flags_shared) { 110 | 111 | const int WordCount = NT * VT / 32; 112 | 113 | // Each thread stores its thread bits to flags_shared[tid]. 
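// The first NT * VT / 32 threads will then each reassemble one 32-bit word
// of flags (bit positions 32 * tid .. 32 * tid + 31) from this
// VT-bits-per-thread layout.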
114 | flags_shared[tid] = threadBits; 115 | __syncthreads(); 116 | 117 | uint packed = 0; 118 | if(tid < WordCount) { 119 | const int Items = MGPU_DIV_UP(32, VT); 120 | int index = 32 * tid; 121 | int first = index / VT; 122 | int bit = 0; 123 | 124 | int rem = index - VT * first; 125 | packed = flags_shared[first]>> rem; 126 | bit = VT - rem; 127 | ++first; 128 | 129 | #pragma unroll 130 | for(int i = 0; i < Items; ++i) { 131 | if(i < Items - 1 || bit < 32) { 132 | uint x = flags_shared[first + i]; 133 | if(bit < 32) packed |= x<< bit; 134 | bit += VT; 135 | } 136 | } 137 | } 138 | __syncthreads(); 139 | 140 | return packed; 141 | } 142 | 143 | } // namespace mgpu 144 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/serialsets.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // SerialSetIntersection 43 | // Emit A if A and B are in range and equal. 
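// The return value is a commit bitmask: bit i is set when iteration i
// produced an output the caller should keep. For set intersection that is
// exactly when neither comp(aKey, bKey) nor comp(bKey, aKey) holds, i.e.
// the two keys are equivalent; both cursors then advance past the match.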
44 | 45 | template 46 | MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd, 47 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 48 | 49 | const int MinIterations = VT / 2; 50 | int commit = 0; 51 | 52 | #pragma unroll 53 | for(int i = 0; i < VT; ++i) { 54 | bool test = RangeCheck ? 55 | ((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) : 56 | (i < MinIterations || (aBegin + bBegin < end)); 57 | 58 | if(test) { 59 | T aKey = data[aBegin]; 60 | T bKey = data[bBegin]; 61 | 62 | bool pA = comp(aKey, bKey); 63 | bool pB = comp(bKey, aKey); 64 | 65 | // The outputs must come from A by definition of set interection. 66 | results[i] = aKey; 67 | indices[i] = aBegin; 68 | 69 | if(!pB) ++aBegin; 70 | if(!pA) ++bBegin; 71 | if(pA == pB) commit |= 1<< i; 72 | } 73 | } 74 | return commit; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // SerialSetUnion 79 | // Emit A if A <= B. Emit B if B < A. 80 | 81 | template 82 | MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd, 83 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 84 | 85 | const int MinIterations = VT / 2; 86 | int commit = 0; 87 | 88 | #pragma unroll 89 | for(int i = 0; i < VT; ++i) { 90 | bool test = RangeCheck ? 91 | (aBegin + bBegin < end) : 92 | (i < MinIterations || (aBegin + bBegin < end)); 93 | 94 | if(test) { 95 | T aKey = data[aBegin]; 96 | T bKey = data[bBegin]; 97 | 98 | bool pA = false, pB = false; 99 | if(RangeCheck && aBegin >= aEnd) 100 | pB = true; 101 | else if(RangeCheck && bBegin >= bEnd) 102 | pA = true; 103 | else { 104 | // Both are in range. 105 | pA = comp(aKey, bKey); 106 | pB = comp(bKey, aKey); 107 | } 108 | 109 | // Output A in case of a tie, so check if b < a. 110 | results[i] = pB ? bKey : aKey; 111 | indices[i] = pB ? bBegin : aBegin; 112 | if(!pB) ++aBegin; 113 | if(!pA) ++bBegin; 114 | commit |= 1<< i; 115 | } 116 | } 117 | return commit; 118 | } 119 | 120 | //////////////////////////////////////////////////////////////////////////////// 121 | // SerialSetDifference 122 | // Emit A if A < B. 123 | 124 | template 125 | MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd, 126 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 127 | 128 | const int MinIterations = VT / 2; 129 | int commit = 0; 130 | 131 | #pragma unroll 132 | for(int i = 0; i < VT; ++i) { 133 | bool test = RangeCheck ? 134 | (aBegin + bBegin < end) : 135 | (i < MinIterations || (aBegin + bBegin < end)); 136 | if(test) { 137 | T aKey = data[aBegin]; 138 | T bKey = data[bBegin]; 139 | 140 | bool pA = false, pB = false; 141 | if(RangeCheck && aBegin >= aEnd) 142 | pB = true; 143 | else if(RangeCheck && bBegin >= bEnd) 144 | pA = true; 145 | else { 146 | pA = comp(aKey, bKey); 147 | pB = comp(bKey, aKey); 148 | } 149 | 150 | // The outputs must come from A by definition of set difference. 151 | results[i] = aKey; 152 | indices[i] = aBegin; 153 | if(!pB) ++aBegin; 154 | if(!pA) ++bBegin; 155 | if(pA) commit |= 1<< i; 156 | } 157 | } 158 | return commit; 159 | } 160 | 161 | //////////////////////////////////////////////////////////////////////////////// 162 | // SerialSetSymDiff 163 | // Emit A if A < B and emit B if B < A. 
164 | 165 | template 166 | MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd, 167 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 168 | 169 | const int MinIterations = VT / 2; 170 | int commit = 0; 171 | 172 | #pragma unroll 173 | for(int i = 0; i < VT; ++i) { 174 | bool test = RangeCheck ? 175 | (aBegin + bBegin < end) : 176 | (i < MinIterations || (aBegin + bBegin < end)); 177 | if(test) { 178 | T aKey = data[aBegin]; 179 | T bKey = data[bBegin]; 180 | 181 | bool pA = false, pB = false; 182 | if(RangeCheck && (bBegin >= bEnd)) 183 | pA = true; 184 | else if(RangeCheck && (aBegin >= aEnd)) 185 | pB = true; 186 | else { 187 | pA = comp(aKey, bKey); 188 | pB = comp(bKey, aKey); 189 | } 190 | 191 | results[i] = pA ? aKey : bKey; 192 | indices[i] = pA ? aBegin : bBegin; 193 | if(!pA) ++bBegin; 194 | if(!pB) ++aBegin; 195 | if(pA != pB) commit |= 1<< i; 196 | } 197 | } 198 | return commit; 199 | } 200 | 201 | //////////////////////////////////////////////////////////////////////////////// 202 | // SerialSetOp 203 | // Uses the MgpuSetOp enum to statically select one of the four serial ops 204 | // above. 205 | 206 | template 207 | MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd, 208 | int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) { 209 | 210 | int end = aBegin + bBegin + VT - star; 211 | if(RangeCheck) end = min(end, aEnd + bEnd); 212 | int commit; 213 | switch(Op) { 214 | case MgpuSetOpIntersection: 215 | commit = SerialSetIntersection(data, aBegin, 216 | aEnd, bBegin, bEnd, end, results, indices, comp); 217 | break; 218 | case MgpuSetOpUnion: 219 | commit = SerialSetUnion(data, aBegin, aEnd, 220 | bBegin, bEnd, end, results, indices, comp); 221 | break; 222 | case MgpuSetOpDiff: 223 | commit = SerialSetDifference(data, aBegin, aEnd, 224 | bBegin, bEnd, end, results, indices, comp); 225 | break; 226 | case MgpuSetOpSymDiff: 227 | commit = SerialSetSymDiff(data, aBegin, aEnd, 228 | bBegin, bEnd, end, results, indices, comp); 229 | break; 230 | } 231 | __syncthreads(); 232 | return commit; 233 | } 234 | 235 | } // namespace mgpu 236 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/device/sortnetwork.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Odd-even transposition sorting network. Sorts keys and values in-place in 43 | // register. 44 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 45 | 46 | // CUDA Compiler does not currently unroll these loops correctly. Write using 47 | // template loop unrolling. 48 | /* 49 | template 50 | MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) { 51 | #pragma unroll 52 | for(int level = 0; level < VT; ++level) { 53 | 54 | #pragma unroll 55 | for(int i = 1 & level; i < VT - 1; i += 2) { 56 | if(comp(keys[i + 1], keys[i])) { 57 | mgpu::swap(keys[i], keys[i + 1]); 58 | mgpu::swap(values[i], values[i + 1]); 59 | } 60 | } 61 | } 62 | }*/ 63 | 64 | template 65 | struct OddEvenTransposeSortT { 66 | // Sort segments marked by head flags. If the head flag between i and i + 1 67 | // is set (so that (2<< i) & flags is true), the values belong to different 68 | // segments and are not swapped. 69 | template 70 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { 71 | #pragma unroll 72 | for(int i = 1 & I; i < VT - 1; i += 2) 73 | if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) { 74 | mgpu::swap(keys[i], keys[i + 1]); 75 | mgpu::swap(values[i], values[i + 1]); 76 | } 77 | OddEvenTransposeSortT::Sort(keys, values, flags, comp); 78 | } 79 | }; 80 | template struct OddEvenTransposeSortT { 81 | template 82 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { } 83 | }; 84 | 85 | template 86 | MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) { 87 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp); 88 | } 89 | template 90 | MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags, 91 | Comp comp) { 92 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp); 93 | } 94 | 95 | //////////////////////////////////////////////////////////////////////////////// 96 | // Batcher Odd-Even Mergesort network 97 | // Unstable but executes much faster than the transposition sort. 98 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 99 | 100 | template 101 | struct OddEvenMergesortT { 102 | template 103 | MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags, 104 | int a, int b, Comp comp) { 105 | if(b < Count) { 106 | // Mask the bits between a and b. 
Any head flags in this interval 107 | // means the keys are in different segments and must not be swapped. 108 | const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1); 109 | if(!(Mask & flags) && comp(keys[b], keys[a])) { 110 | mgpu::swap(keys[b], keys[a]); 111 | mgpu::swap(values[b], values[a]); 112 | } 113 | } 114 | } 115 | 116 | template 117 | struct OddEvenMerge { 118 | template 119 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 120 | Comp comp) { 121 | // Compare and swap 122 | const int M = 2 * R; 123 | OddEvenMerge::Merge(keys, values, flags, comp); 124 | OddEvenMerge::Merge(keys, values, flags, comp); 125 | 126 | #pragma unroll 127 | for(int i = Low2 + R; i + R < Low2 + Width; i += M) 128 | CompareAndSwap(keys, values, flags, i, i + R, comp); 129 | } 130 | }; 131 | template 132 | struct OddEvenMerge { 133 | template 134 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 135 | Comp comp) { 136 | CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp); 137 | } 138 | }; 139 | 140 | template 141 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 142 | Comp comp) { 143 | 144 | const int M = Width / 2; 145 | OddEvenMergesortT::Sort(keys, values, flags, comp); 146 | OddEvenMergesortT::Sort(keys, values, flags, comp); 147 | OddEvenMerge<1, Low>::Merge(keys, values, flags, comp); 148 | } 149 | }; 150 | template struct OddEvenMergesortT<1, Low, Count> { 151 | template 152 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 153 | Comp comp) { } 154 | }; 155 | 156 | template 157 | MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) { 158 | const int Width = 1<< sLogPow2::value; 159 | OddEvenMergesortT::Sort(keys, values, 0, comp); 160 | } 161 | template 162 | MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags, 163 | Comp comp) { 164 | const int Width = 1<< sLogPow2::value; 165 | OddEvenMergesortT::Sort(keys, values, flags, comp); 166 | } 167 | 168 | } // namespace mgpu 169 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/mgpuenums.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | namespace mgpu { 38 | 39 | enum MgpuBounds { 40 | MgpuBoundsLower, 41 | MgpuBoundsUpper 42 | }; 43 | 44 | enum MgpuScanType { 45 | MgpuScanTypeExc, 46 | MgpuScanTypeInc 47 | }; 48 | 49 | enum MgpuSearchType { 50 | MgpuSearchTypeNone, 51 | MgpuSearchTypeIndex, 52 | MgpuSearchTypeMatch, 53 | MgpuSearchTypeIndexMatch 54 | }; 55 | 56 | enum MgpuJoinKind { 57 | MgpuJoinKindInner, 58 | MgpuJoinKindLeft, 59 | MgpuJoinKindRight, 60 | MgpuJoinKindOuter 61 | }; 62 | 63 | enum MgpuSetOp { 64 | MgpuSetOpIntersection, 65 | MgpuSetOpUnion, 66 | MgpuSetOpDiff, 67 | MgpuSetOpSymDiff 68 | }; 69 | 70 | } // namespace mgpu 71 | -------------------------------------------------------------------------------- /include/contrib/moderngpu/include/util/static.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifndef MGPU_MIN 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y)) 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y)) 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0) 55 | #define MGPU_ABS(x) (((x) >= 0) ? (x) : (-x)) 56 | 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y)) 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y)) 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y) 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1)) 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1)) 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1))) 64 | 65 | #endif // MGPU_MIN 66 | 67 | namespace mgpu { 68 | 69 | 70 | typedef unsigned char byte; 71 | 72 | typedef unsigned int uint; 73 | typedef signed short int16; 74 | 75 | typedef unsigned short ushort; 76 | typedef unsigned short uint16; 77 | 78 | typedef long long int64; 79 | typedef unsigned long long uint64; 80 | 81 | // IsPow2::value is true if X is a power of 2. 82 | template struct sIsPow2 { 83 | enum { value = 0 == (X & (X - 1)) }; 84 | }; 85 | 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2. 87 | template struct sLogPow2 { 88 | enum { extra = sIsPow2::value ? 0 : (roundUp ? 1 : 0) }; 89 | enum { inner = sLogPow2::inner + 1 }; 90 | enum { value = inner + extra }; 91 | }; 92 | template struct sLogPow2<0, roundUp> { 93 | enum { inner = 0 }; 94 | enum { value = 0 }; 95 | }; 96 | template struct sLogPow2<1, roundUp> { 97 | enum { inner = 0 }; 98 | enum { value = 0 }; 99 | }; 100 | 101 | template 102 | struct sDivUp { 103 | enum { value = (X + Y - 1) / Y }; 104 | }; 105 | 106 | template struct sDiv2RoundUp { 107 | enum { value = sDiv2RoundUp::value, levels - 1>::value }; 108 | }; 109 | template struct sDiv2RoundUp { 110 | enum { value = count }; 111 | }; 112 | 113 | template 114 | struct sDivSafe { 115 | enum { value = X / Y }; 116 | }; 117 | template 118 | struct sDivSafe { 119 | enum { value = 0 }; 120 | }; 121 | 122 | template 123 | struct sRoundUp { 124 | enum { rem = X % Y }; 125 | enum { value = X + (rem ? (Y - rem) : 0) }; 126 | }; 127 | 128 | template 129 | struct sRoundDown { 130 | enum { rem = X % Y }; 131 | enum { value = X - rem }; 132 | }; 133 | 134 | // IntegerDiv is a template for avoiding divisions by zero in template 135 | // evaluation. Templates always evaluate both b and c in an expression like 136 | // a ? b : c, and will error if either rhs contains an illegal expression, 137 | // even if the ternary is explictly designed to guard against that. 138 | template 139 | struct sIntegerDiv { 140 | enum { value = X / (Y ? Y : (X + 1)) }; 141 | }; 142 | 143 | template 144 | struct sMax { 145 | enum { value = (X >= Y) ? X : Y }; 146 | }; 147 | template 148 | struct sMin { 149 | enum { value = (X <= Y) ? 
X : Y }; 150 | }; 151 | 152 | template 153 | struct sAbs { 154 | enum { value = (X >= 0) ? X : -X }; 155 | }; 156 | 157 | 158 | // Finds the number of powers of 2 in the prime factorization of X. 159 | template struct sNumFactorsOf2 { 160 | enum { shifted = X >> 1 }; 161 | enum { value = 1 + sNumFactorsOf2::value }; 162 | }; 163 | template struct sNumFactorsOf2 { 164 | enum { value = 0 }; 165 | }; 166 | 167 | // Returns the divisor for a conflict-free transpose. 168 | template struct sBankConflictDivisor { 169 | enum { value = 170 | (1 & X) ? 0 : 171 | (sIsPow2::value ? NumBanks : 172 | (1<< sNumFactorsOf2::value)) }; 173 | enum { log_value = sLogPow2::value }; 174 | }; 175 | 176 | template struct sConflictFreeStorage { 177 | enum { count = NT * X }; 178 | enum { divisor = sBankConflictDivisor::value }; 179 | enum { padding = sDivSafe::value }; 180 | enum { value = count + padding }; 181 | }; 182 | 183 | } // namespace mgpu 184 | -------------------------------------------------------------------------------- /include/ctc.h: -------------------------------------------------------------------------------- 1 | /** \file ctc.h 2 | * Contains a simple C interface to call fast CPU and GPU based computation 3 | * of the CTC loss. 4 | */ 5 | 6 | #pragma once 7 | 8 | #ifdef _WIN32 9 | #ifdef warpctc_EXPORTS 10 | #define API_REFERENCE extern "C" __declspec(dllexport) 11 | #else 12 | #define API_REFERENCE extern "C" __declspec(dllimport) 13 | #endif 14 | #else 15 | #define API_REFERENCE 16 | #endif 17 | 18 | #include 19 | 20 | #ifdef __cplusplus 21 | #include 22 | extern "C" { 23 | #endif 24 | 25 | #ifdef WARPCTC_WITH_HIP 26 | //forward declare of HIP typedef to avoid needing to pull in HIP headers 27 | typedef struct ihipStream_t* GPUstream; 28 | #else 29 | //forward declare of CUDA typedef to avoid needing to pull in CUDA headers 30 | typedef struct CUstream_st* GPUstream; 31 | #endif 32 | 33 | typedef enum { 34 | CTC_STATUS_SUCCESS = 0, 35 | CTC_STATUS_MEMOPS_FAILED = 1, 36 | CTC_STATUS_INVALID_VALUE = 2, 37 | CTC_STATUS_EXECUTION_FAILED = 3, 38 | CTC_STATUS_UNKNOWN_ERROR = 4 39 | } ctcStatus_t; 40 | 41 | /** Returns a single integer which specifies the API version of the warpctc library */ 42 | API_REFERENCE int get_warpctc_version(); 43 | 44 | /** Returns a string containing a description of status that was passed in 45 | * \param[in] status identifies which string should be returned 46 | * \return C style string containing the text description 47 | * */ 48 | API_REFERENCE const char* ctcGetStatusString(ctcStatus_t status); 49 | 50 | typedef enum { 51 | CTC_CPU = 0, 52 | CTC_GPU = 1 53 | } ctcComputeLocation; 54 | 55 | /** Structure used for options to the CTC compution. Applications 56 | * should zero out the array using memset and sizeof(struct 57 | * ctcOptions) in C or default initialization (e.g. 'ctcOptions 58 | * options{};' or 'auto options = ctcOptions{}') in C++ to ensure 59 | * forward compatibility with added options. 
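 *
 * A minimal illustrative sketch of the intended call sequence (CPU path;
 * the buffer variables are placeholders for application-provided data):
 *
 *   ctcOptions options{};          // zero-initialised, as described above
 *   options.loc = CTC_CPU;
 *   options.blank_label = 0;
 *
 *   size_t workspace_bytes;
 *   get_workspace_size(label_lengths, input_lengths,
 *                      alphabet_size, minibatch, options, &workspace_bytes);
 *   void* workspace = malloc(workspace_bytes);
 *
 *   compute_ctc_loss(activations, gradients, flat_labels, label_lengths,
 *                    input_lengths, alphabet_size, minibatch, costs,
 *                    workspace, options);
 *
 * Both calls return a ctcStatus_t; anything other than CTC_STATUS_SUCCESS
 * indicates an error.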
*/ 60 | struct ctcOptions { 61 | /// indicates where the ctc calculation should take place {CTC_CPU | CTC_GPU} 62 | ctcComputeLocation loc; 63 | union { 64 | /// used when loc == CTC_CPU, the maximum number of threads that can be used 65 | unsigned int num_threads; 66 | 67 | /// used when loc == CTC_GPU, which stream the kernels should be launched in 68 | GPUstream stream; 69 | }; 70 | 71 | /// the label value/index that the CTC calculation should use as the blank label 72 | int blank_label; 73 | }; 74 | 75 | /** Compute the connectionist temporal classification loss between 76 | * a probability sequence with dtype float and a ground truth labeling. 77 | * Optionally compute the gradient with respect to the inputs. 78 | * \param [in] activations pointer to the activations in either CPU or GPU 79 | * addressable memory, depending on info. We assume a fixed 80 | * memory layout for this 3 dimensional tensor, which has dimension 81 | * (t, n, p), where t is the time index, n is the minibatch index, 82 | * and p indexes over probabilities of each symbol in the alphabet. 83 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 84 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 85 | * changing index, aka column-major). We also assume strides are equal to 86 | * dimensions - there is no padding between dimensions. 87 | * More precisely, element (t, n, p), for a problem with mini_batch examples 88 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 89 | * activations[(t * mini_batch + n) * alphabet_size + p] 90 | * \param [out] gradients if not NULL, then gradients are computed. Should be 91 | * allocated in the same memory space as probs and memory 92 | * ordering is identical. 93 | * \param [in] flat_labels Always in CPU memory. A concatenation 94 | * of all the labels for the minibatch. 95 | * \param [in] label_lengths Always in CPU memory. The length of each label 96 | * for each example in the minibatch. 97 | * \param [in] input_lengths Always in CPU memory. The number of time steps 98 | * for each sequence in the minibatch. 99 | * \param [in] alphabet_size The number of possible output symbols. There 100 | * should be this many probabilities for each time step. 101 | * \param [in] mini_batch How many examples in a minibatch. 102 | * \param [out] costs Always in CPU memory. The cost of each example in the 103 | * minibatch. 104 | * \param [in,out] workspace In same memory space as probs. Should be of 105 | * size requested by get_workspace_size. 106 | * \param [in] options see struct ctcOptions 107 | * 108 | * \return Status information 109 | * 110 | * */ 111 | API_REFERENCE ctcStatus_t compute_ctc_loss(const float* const activations, 112 | float* gradients, 113 | const int* const flat_labels, 114 | const int* const label_lengths, 115 | const int* const input_lengths, 116 | int alphabet_size, 117 | int minibatch, 118 | float *costs, 119 | void *workspace, 120 | ctcOptions options); 121 | 122 | /** Compute the connectionist temporal classification loss between 123 | * a probability sequence of dtype double and a ground truth labeling. 124 | * Optionally compute the gradient with respect to the inputs. 125 | * \param [in] activations pointer to the activations in either CPU or GPU 126 | * addressable memory, depending on info. 
We assume a fixed 127 | * memory layout for this 3 dimensional tensor, which has dimension 128 | * (t, n, p), where t is the time index, n is the minibatch index, 129 | * and p indexes over probabilities of each symbol in the alphabet. 130 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 131 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 132 | * changing index, aka column-major). We also assume strides are equal to 133 | * dimensions - there is no padding between dimensions. 134 | * More precisely, element (t, n, p), for a problem with mini_batch examples 135 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 136 | * activations[(t * mini_batch + n) * alphabet_size + p] 137 | * \param [out] gradients if not NULL, then gradients are computed. Should be 138 | * allocated in the same memory space as probs and memory 139 | * ordering is identical. 140 | * \param [in] flat_labels Always in CPU memory. A concatenation 141 | * of all the labels for the minibatch. 142 | * \param [in] label_lengths Always in CPU memory. The length of each label 143 | * for each example in the minibatch. 144 | * \param [in] input_lengths Always in CPU memory. The number of time steps 145 | * for each sequence in the minibatch. 146 | * \param [in] alphabet_size The number of possible output symbols. There 147 | * should be this many probabilities for each time step. 148 | * \param [in] mini_batch How many examples in a minibatch. 149 | * \param [out] costs Always in CPU memory. The cost of each example in the 150 | * minibatch. 151 | * \param [in,out] workspace In same memory space as probs. Should be of 152 | * size requested by get_workspace_size. 153 | * \param [in] options see struct ctcOptions 154 | * 155 | * \return Status information 156 | * 157 | * */ 158 | API_REFERENCE ctcStatus_t compute_ctc_loss_double(const double* const activations, 159 | double* gradients, 160 | const int* const flat_labels, 161 | const int* const label_lengths, 162 | const int* const input_lengths, 163 | int alphabet_size, 164 | int minibatch, 165 | double *costs, 166 | void *workspace, 167 | ctcOptions options); 168 | 169 | 170 | /** For a given set of labels and minibatch size return the required workspace 171 | * size when the dtype of your probabilities is float. This will need to be allocated 172 | * in the same memory space as your probabilities. 173 | * \param [in] label_lengths Always in CPU memory. The length of each label 174 | * for each example in the minibatch. 175 | * \param [in] input_lengths Always in CPU memory. The number of time steps 176 | * for each sequence in the minibatch. 177 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 178 | * the number of probabilities at each time step 179 | * \param [in] mini_batch How many examples in a minibatch. 180 | * \param [in] info see struct ctcOptions 181 | * \param [out] size_bytes is pointer to a scalar where the memory 182 | * requirement in bytes will be placed. 
This memory should be allocated 183 | * at the same place, CPU or GPU, that the probs are in 184 | * 185 | * \return Status information 186 | **/ 187 | API_REFERENCE ctcStatus_t get_workspace_size(const int* const label_lengths, 188 | const int* const input_lengths, 189 | int alphabet_size, int minibatch, 190 | ctcOptions info, 191 | size_t* size_bytes); 192 | 193 | /** For a given set of labels and minibatch size return the required workspace 194 | * size when the dtype of your probabilities is double. This will need to be allocated 195 | * in the same memory space as your probabilities. 196 | * \param [in] label_lengths Always in CPU memory. The length of each label 197 | * for each example in the minibatch. 198 | * \param [in] input_lengths Always in CPU memory. The number of time steps 199 | * for each sequence in the minibatch. 200 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 201 | * the number of probabilities at each time step 202 | * \param [in] mini_batch How many examples in a minibatch. 203 | * \param [in] info see struct ctcOptions 204 | * \param [out] size_bytes is pointer to a scalar where the memory 205 | * requirement in bytes will be placed. This memory should be allocated 206 | * at the same place, CPU or GPU, that the probs are in 207 | * 208 | * \return Status information 209 | **/ 210 | API_REFERENCE ctcStatus_t get_workspace_size_double(const int* const label_lengths, 211 | const int* const input_lengths, 212 | int alphabet_size, int minibatch, 213 | ctcOptions info, 214 | size_t* size_bytes); 215 | 216 | #ifdef __cplusplus 217 | } 218 | #endif 219 | -------------------------------------------------------------------------------- /include/detail/ctc_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "hostdevice.h" 8 | #include "type_defs.h" 9 | 10 | namespace ctc_helper { 11 | 12 | static const float threshold = 1e-1; 13 | 14 | template 15 | HOSTDEVICE 16 | T neg_inf() { return -T(INFINITY); } 17 | 18 | inline int div_up(int x, int y) { 19 | return (x + y - 1) / y; 20 | } 21 | 22 | template struct maximum { 23 | HOSTDEVICE 24 | Res operator()(const Arg& x, const Arg& y) const { 25 | return x < y ? 
y : x; 26 | } 27 | }; 28 | 29 | template struct add { 30 | HOSTDEVICE 31 | Res operator()(const Arg& x, const Arg& y) const { 32 | return x + y; 33 | } 34 | }; 35 | 36 | template struct identity { 37 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);} 38 | }; 39 | 40 | template struct negate { 41 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);} 42 | }; 43 | 44 | template struct exponential { 45 | HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);} 46 | }; 47 | 48 | template 49 | struct log_plus { 50 | typedef Res result_type; 51 | HOSTDEVICE 52 | Res operator()(const Arg1& p1, const Arg2& p2) { 53 | if (p1 == neg_inf()) 54 | return p2; 55 | if (p2 == neg_inf()) 56 | return p1; 57 | Res result = log1p(exp(-fabs(p1 - p2))) + maximum()(p1, p2); 58 | return result; 59 | } 60 | }; 61 | 62 | } 63 | -------------------------------------------------------------------------------- /include/detail/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | 9 | // NOTE(dzhwinter) 10 | // the warp primitive is different in cuda9(Volta) GPU. 11 | // add a wrapper to compatible with cuda7 to cuda9 12 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 13 | #define DEFAULT_MASK 0u 14 | template 15 | __forceinline__ __device__ T __shfl_down(T input, int delta) { 16 | return __shfl_down_sync(DEFAULT_MASK, input, delta); 17 | } 18 | 19 | template 20 | __forceinline__ __device__ T __shfl_up(T input, int delta) { 21 | return __shfl_up_sync(DEFAULT_MASK, input, delta); 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /include/detail/reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | ctcStatus_t reduce_negate(const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream); 5 | template 6 | ctcStatus_t reduce_exp(const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream); 7 | template 8 | ctcStatus_t reduce_max(const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream); 9 | -------------------------------------------------------------------------------- /include/detail/type_defs.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 4 | 5 | #ifdef __HIPCC__ 6 | #include 7 | #else 8 | #include 9 | #endif 10 | 11 | #ifdef __HIPCC__ 12 | #define gpuSuccess hipSuccess 13 | using gpuStream_t = hipStream_t; 14 | using gpuError_t = hipError_t; 15 | using gpuEvent_t = hipEvent_t; 16 | #else 17 | #define gpuSuccess cudaSuccess 18 | using gpuStream_t = cudaStream_t; 19 | using gpuError_t = cudaError_t; 20 | using gpuEvent_t = cudaEvent_t; 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/ctc_entrypoint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "detail/cpu_ctc.h" 9 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 10 | #include "detail/gpu_ctc.h" 11 | #endif 12 | 13 | 14 | extern "C" { 15 | 16 | int get_warpctc_version() { 17 | return 2; 18 | } 19 | 20 | const char* ctcGetStatusString(ctcStatus_t 
status) { 21 | switch (status) { 22 | case CTC_STATUS_SUCCESS: 23 | return "no error"; 24 | case CTC_STATUS_MEMOPS_FAILED: 25 | return "cuda memcpy or memset failed"; 26 | case CTC_STATUS_INVALID_VALUE: 27 | return "invalid value"; 28 | case CTC_STATUS_EXECUTION_FAILED: 29 | return "execution failed"; 30 | 31 | case CTC_STATUS_UNKNOWN_ERROR: 32 | default: 33 | return "unknown error"; 34 | 35 | } 36 | 37 | } 38 | 39 | 40 | ctcStatus_t compute_ctc_loss(const float* const activations, 41 | float* gradients, 42 | const int* const flat_labels, 43 | const int* const label_lengths, 44 | const int* const input_lengths, 45 | int alphabet_size, 46 | int minibatch, 47 | float *costs, 48 | void *workspace, 49 | ctcOptions options) { 50 | if (activations == nullptr || 51 | flat_labels == nullptr || 52 | label_lengths == nullptr || 53 | input_lengths == nullptr || 54 | costs == nullptr || 55 | workspace == nullptr || 56 | alphabet_size <= 0 || 57 | minibatch <= 0) 58 | return CTC_STATUS_INVALID_VALUE; 59 | 60 | if (options.loc == CTC_CPU) { 61 | CpuCTC ctc(alphabet_size, minibatch, workspace, options.num_threads, 62 | options.blank_label); 63 | 64 | if (gradients != NULL) 65 | return ctc.cost_and_grad(activations, gradients, 66 | costs, 67 | flat_labels, label_lengths, 68 | input_lengths); 69 | else 70 | return ctc.score_forward(activations, costs, flat_labels, 71 | label_lengths, input_lengths); 72 | } else if (options.loc == CTC_GPU) { 73 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 74 | GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, 75 | options.blank_label); 76 | 77 | if (gradients != NULL) 78 | return ctc.cost_and_grad(activations, gradients, costs, 79 | flat_labels, label_lengths, 80 | input_lengths); 81 | else 82 | return ctc.score_forward(activations, costs, flat_labels, 83 | label_lengths, input_lengths); 84 | #else 85 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 86 | return CTC_STATUS_EXECUTION_FAILED; 87 | #endif 88 | } else { 89 | return CTC_STATUS_INVALID_VALUE; 90 | } 91 | } 92 | 93 | ctcStatus_t compute_ctc_loss_double(const double* const activations, 94 | double* gradients, 95 | const int* const flat_labels, 96 | const int* const label_lengths, 97 | const int* const input_lengths, 98 | int alphabet_size, 99 | int minibatch, 100 | double *costs, 101 | void *workspace, 102 | ctcOptions options) { 103 | if (activations == nullptr || 104 | flat_labels == nullptr || 105 | label_lengths == nullptr || 106 | input_lengths == nullptr || 107 | costs == nullptr || 108 | workspace == nullptr || 109 | alphabet_size <= 0 || 110 | minibatch <= 0) 111 | return CTC_STATUS_INVALID_VALUE; 112 | 113 | if (options.loc == CTC_CPU) { 114 | CpuCTC ctc(alphabet_size, minibatch, workspace, options.num_threads, 115 | options.blank_label); 116 | 117 | if (gradients != NULL) 118 | return ctc.cost_and_grad(activations, gradients, 119 | costs, 120 | flat_labels, label_lengths, 121 | input_lengths); 122 | else 123 | return ctc.score_forward(activations, costs, flat_labels, 124 | label_lengths, input_lengths); 125 | } else if (options.loc == CTC_GPU) { 126 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 127 | GpuCTC ctc(alphabet_size, minibatch, workspace, options.stream, 128 | options.blank_label); 129 | 130 | if (gradients != NULL) 131 | return ctc.cost_and_grad(activations, gradients, costs, 132 | flat_labels, label_lengths, 133 | input_lengths); 134 | else 135 | return ctc.score_forward(activations, costs, flat_labels, 136 | label_lengths, 
input_lengths); 137 | #else 138 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 139 | return CTC_STATUS_EXECUTION_FAILED; 140 | #endif 141 | } else { 142 | return CTC_STATUS_INVALID_VALUE; 143 | } 144 | } 145 | 146 | 147 | ctcStatus_t get_workspace_size(const int* const label_lengths, 148 | const int* const input_lengths, 149 | int alphabet_size, int minibatch, 150 | ctcOptions options, 151 | size_t* size_bytes) 152 | { 153 | if (label_lengths == nullptr || 154 | input_lengths == nullptr || 155 | size_bytes == nullptr || 156 | alphabet_size <= 0 || 157 | minibatch <= 0) 158 | return CTC_STATUS_INVALID_VALUE; 159 | 160 | // This is the max of all S and T for all examples in the minibatch. 161 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 162 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 163 | 164 | const int S = 2 * maxL + 1; 165 | 166 | *size_bytes = 0; 167 | 168 | if (options.loc == CTC_GPU) { 169 | // GPU storage 170 | //nll_forward, nll_backward 171 | *size_bytes += 2 * sizeof(float) * minibatch; 172 | 173 | //repeats 174 | *size_bytes += sizeof(int) * minibatch; 175 | 176 | //label offsets 177 | *size_bytes += sizeof(int) * minibatch; 178 | 179 | //utt_length 180 | *size_bytes += sizeof(int) * minibatch; 181 | 182 | //label lengths 183 | *size_bytes += sizeof(int) * minibatch; 184 | 185 | //labels without blanks - overallocate for now 186 | *size_bytes += sizeof(int) * maxL * minibatch; 187 | 188 | //labels with blanks 189 | *size_bytes += sizeof(int) * S * minibatch; 190 | 191 | //alphas 192 | *size_bytes += sizeof(float) * S * maxT * minibatch; 193 | 194 | //denoms 195 | *size_bytes += sizeof(float) * maxT * minibatch; 196 | 197 | //probs (since we will pass in activations) 198 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 199 | 200 | } else { 201 | //cpu can eventually replace all minibatch with 202 | //max number of concurrent threads if memory is 203 | //really tight 204 | 205 | //per minibatch memory 206 | size_t per_minibatch_bytes = 0; 207 | 208 | //output 209 | per_minibatch_bytes += sizeof(float) * alphabet_size ; 210 | 211 | //alphas 212 | per_minibatch_bytes += sizeof(float) * S * maxT; 213 | 214 | //betas 215 | per_minibatch_bytes += sizeof(float) * S; 216 | 217 | //labels w/blanks, e_inc, s_inc 218 | per_minibatch_bytes += 3 * sizeof(int) * S; 219 | 220 | *size_bytes = per_minibatch_bytes * minibatch; 221 | 222 | //probs 223 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 224 | } 225 | 226 | return CTC_STATUS_SUCCESS; 227 | } 228 | 229 | ctcStatus_t get_workspace_size_double(const int* const label_lengths, 230 | const int* const input_lengths, 231 | int alphabet_size, int minibatch, 232 | ctcOptions options, 233 | size_t* size_bytes) 234 | { 235 | if (label_lengths == nullptr || 236 | input_lengths == nullptr || 237 | size_bytes == nullptr || 238 | alphabet_size <= 0 || 239 | minibatch <= 0) 240 | return CTC_STATUS_INVALID_VALUE; 241 | 242 | // This is the max of all S and T for all examples in the minibatch. 
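    // maxL is the longest label sequence and maxT the longest input sequence;
    // the padded label length S = 2 * maxL + 1 (a blank between every pair of
    // labels plus one at each end) sizes the alpha/beta buffers reserved below.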
243 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 244 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 245 | 246 | const int S = 2 * maxL + 1; 247 | 248 | *size_bytes = 0; 249 | 250 | if (options.loc == CTC_GPU) { 251 | // GPU storage 252 | //nll_forward, nll_backward 253 | *size_bytes += 2 * sizeof(double) * minibatch; 254 | 255 | //repeats 256 | *size_bytes += sizeof(int) * minibatch; 257 | 258 | //label offsets 259 | *size_bytes += sizeof(int) * minibatch; 260 | 261 | //utt_length 262 | *size_bytes += sizeof(int) * minibatch; 263 | 264 | //label lengths 265 | *size_bytes += sizeof(int) * minibatch; 266 | 267 | //labels without blanks - overallocate for now 268 | *size_bytes += sizeof(int) * maxL * minibatch; 269 | 270 | //labels with blanks 271 | *size_bytes += sizeof(int) * S * minibatch; 272 | 273 | //alphas 274 | *size_bytes += sizeof(double) * S * maxT * minibatch; 275 | 276 | //denoms 277 | *size_bytes += sizeof(double) * maxT * minibatch; 278 | 279 | //probs (since we will pass in activations) 280 | *size_bytes += sizeof(double) * alphabet_size * maxT * minibatch; 281 | 282 | } else { 283 | //cpu can eventually replace all minibatch with 284 | //max number of concurrent threads if memory is 285 | //really tight 286 | 287 | //per minibatch memory 288 | size_t per_minibatch_bytes = 0; 289 | 290 | //output 291 | per_minibatch_bytes += sizeof(double) * alphabet_size ; 292 | 293 | //alphas 294 | per_minibatch_bytes += sizeof(double) * S * maxT; 295 | 296 | //betas 297 | per_minibatch_bytes += sizeof(double) * S; 298 | 299 | //labels w/blanks, e_inc, s_inc 300 | per_minibatch_bytes += 3 * sizeof(int) * S; 301 | 302 | *size_bytes = per_minibatch_bytes * minibatch; 303 | 304 | //probs 305 | *size_bytes += sizeof(double) * alphabet_size * maxT * minibatch; 306 | } 307 | 308 | return CTC_STATUS_SUCCESS; 309 | } 310 | 311 | } 312 | -------------------------------------------------------------------------------- /src/ctc_entrypoint.cu: -------------------------------------------------------------------------------- 1 | ctc_entrypoint.cpp -------------------------------------------------------------------------------- /src/reduce.cu: -------------------------------------------------------------------------------- 1 | // Includes, system 2 | // #include 3 | // #include 4 | 5 | // Includes, cuda 6 | // #include 7 | // #include 8 | 9 | // Includes, cuda helper functions 10 | // #include 11 | 12 | // For the functors 13 | #include "detail/ctc_helper.h" 14 | #include "ctc.h" 15 | 16 | const int warp_size = 32; 17 | 18 | template 19 | struct CTAReduce; 20 | 21 | template 22 | struct CTAReduce { 23 | enum { Size = NT, Capacity = NT }; 24 | struct Storage { T shared[Capacity]; }; 25 | 26 | __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { 27 | T* s = storage.shared; 28 | s[tid] = x; 29 | __syncthreads(); 30 | 31 | // Fold the data in half with each pass. 32 | #pragma unroll 33 | for(int offset = NT / 2; offset >= warp_size; offset /= 2) { 34 | if(tid + offset < count && tid < offset) { 35 | // Read from the right half and store to the left half. 
36 | x = g(x, s[offset + tid]); 37 | s[tid] = x; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | T shuff; 43 | for (int offset = warp_size / 2; offset > 0; offset /= 2) { 44 | #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) 45 | shuff = __shfl_down_sync(0xFFFFFFFF, x, offset); 46 | #else 47 | shuff = __shfl_down(x, offset); 48 | #endif 49 | if (tid + offset < count && tid < offset) 50 | x = g(x, shuff); 51 | } 52 | return x; 53 | } 54 | }; 55 | 56 | template 57 | __global__ void reduce_rows(Iop f, Rop g, const T* input, T* output, 58 | int num_rows, int num_cols) { 59 | 60 | typedef CTAReduce R; 61 | __shared__ typename R::Storage storage; 62 | 63 | int tid = threadIdx.x; 64 | int idx = tid; 65 | int col = blockIdx.x; 66 | T curr; 67 | 68 | // Each block works on a column 69 | if (idx < num_rows) 70 | curr = f(input[idx + col*num_rows]); 71 | idx += NT; 72 | 73 | 74 | while (idx < num_rows) { 75 | curr = g(curr, f(input[idx + col*num_rows])); 76 | idx += NT; 77 | } 78 | 79 | // Sum thread-totals over the CTA. 80 | curr = R::reduce(tid, curr, storage, num_rows, g); 81 | 82 | // Store result in out 83 | if (tid == 0) 84 | output[col] = curr; 85 | } 86 | 87 | template 88 | __global__ void reduce_cols(Iop f, Rop g, const T* input, T* output, 89 | int num_rows, int num_cols) { 90 | 91 | __shared__ T s[NT]; 92 | 93 | int warps_per_block = NT / warp_size; 94 | int row = blockDim.x * blockIdx.x + threadIdx.x; 95 | int col = threadIdx.y; 96 | T curr; 97 | 98 | if (row < num_rows && col < num_cols) { 99 | curr = f(input[row + col*num_rows]); 100 | col += blockDim.y; 101 | while (col < num_cols) { 102 | curr = g(curr, f(input[row + col*num_rows])); 103 | col += blockDim.y; 104 | } 105 | } 106 | s[threadIdx.x * warps_per_block + threadIdx.y] = curr; 107 | __syncthreads(); 108 | 109 | // Reduce 110 | if (threadIdx.y == 0 && row < num_rows) { 111 | #pragma unroll 112 | for (int i = 1; i < warps_per_block && i < num_cols; ++i) 113 | curr = g(curr, s[i + threadIdx.x * warps_per_block]); 114 | output[row] = curr; 115 | } 116 | } 117 | 118 | struct ReduceHelper { 119 | 120 | template 121 | static void impl(Iof f, Rof g, const T* input, T* output, int num_rows, int num_cols, bool axis, gpuStream_t stream) { 122 | 123 | int grid_size; 124 | 125 | if (axis) { 126 | grid_size = num_cols; 127 | reduce_rows<128><<>> 128 | (f, g, input, output, num_rows, num_cols); 129 | 130 | } else { 131 | dim3 tpb(warp_size, 128 / warp_size); 132 | grid_size = (num_cols + warp_size - 1)/warp_size; 133 | reduce_cols<128><<>> 134 | (f, g, input, output, num_rows, num_cols); 135 | 136 | } 137 | } 138 | }; 139 | 140 | 141 | template 142 | ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, bool axis, gpuStream_t stream) { 143 | ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); 144 | 145 | #ifdef __HIPCC__ 146 | hipStreamSynchronize(stream); 147 | gpuError_t err = hipGetLastError(); 148 | #else 149 | cudaStreamSynchronize(stream); 150 | gpuError_t err = cudaGetLastError(); 151 | #endif 152 | 153 | if (err != gpuSuccess) 154 | return CTC_STATUS_EXECUTION_FAILED; 155 | 156 | return CTC_STATUS_SUCCESS; 157 | } 158 | template 159 | ctcStatus_t reduce_negate(const T *input, T *output, int rows, int cols, bool axis, gpuStream_t stream) { 160 | return reduce(ctc_helper::negate(), ctc_helper::add(), input, output, rows, cols, axis, stream); 161 | } 162 | template ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, gpuStream_t stream); 163 | 
template ctcStatus_t reduce_negate(const double *input, double *output, int rows, int cols, bool axis, gpuStream_t stream); 164 | 165 | template 166 | ctcStatus_t reduce_exp(const T *input, T *output, int rows, int cols, bool axis, gpuStream_t stream) { 167 | return reduce(ctc_helper::exponential(), ctc_helper::add(), input, output, rows, cols, axis, stream); 168 | } 169 | template ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, gpuStream_t stream); 170 | template ctcStatus_t reduce_exp(const double *input, double *output, int rows, int cols, bool axis, gpuStream_t stream); 171 | 172 | template 173 | ctcStatus_t reduce_max(const T *input, T *output, int rows, int cols, bool axis, gpuStream_t stream) { 174 | return reduce(ctc_helper::identity(), ctc_helper::maximum(),input, output, rows, cols, axis, stream); 175 | } 176 | template ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, gpuStream_t stream); 177 | template ctcStatus_t reduce_max(const double *input, double *output, int rows, int cols, bool axis, gpuStream_t stream); 178 | -------------------------------------------------------------------------------- /tensorflow_binding/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | dist 3 | *.egg-info 4 | *.so 5 | include/cuda 6 | *.pyc 7 | -------------------------------------------------------------------------------- /tensorflow_binding/README.md: -------------------------------------------------------------------------------- 1 | 2 | # TensorFlow binding for WarpCTC 3 | 4 | This package provides TensorFlow kernels that wrap the WarpCTC 5 | library. Kernels are provided for both the CTCLoss op already in 6 | TensorFlow, as well as a new WarpCTC op provided in this package. The 7 | WarpCTC op has an interface that more closely matches the native 8 | WarpCTC interface than TensorFlow's CTCLoss op. Note that the CTCLoss 9 | op expects the reserved blank label to be the largest value while the 10 | WarpCTC op takes the reserved blank label value as an attribute which 11 | defaults to `0`. 12 | 13 | ## Installation 14 | 15 | To build the kernels it is necessary to have the TensorFlow source 16 | code available, since TensorFlow doesn't currently install the 17 | necessary headers to handle the SparseTensor that the CTCLoss op uses 18 | to input the labels. You can retrieve the TensorFlow source from 19 | github.com: 20 | 21 | ```bash 22 | git clone https://github.com/tensorflow/tensorflow.git 23 | ``` 24 | 25 | Tell the build scripts where you have the TensorFlow source tree by 26 | setting the `TENSORFLOW_SRC_PATH` environment variable: 27 | 28 | ```bash 29 | export TENSORFLOW_SRC_PATH=/path/to/tensorflow 30 | ``` 31 | 32 | `WARP_CTC_PATH` should be set to the location of a built WarpCTC 33 | (i.e. `libwarpctc.so`). This defaults to `../build`, so from within a 34 | new warp-ctc clone you could build WarpCTC like this: 35 | 36 | ```bash 37 | mkdir build; cd build 38 | cmake .. 39 | make 40 | ``` 41 | 42 | Otherwise, set `WARP_CTC_PATH` to wherever you have `libwarpctc.so` 43 | installed. If you have a GPU, you should also make sure that 44 | `CUDA_HOME` is set to the home cuda directory (i.e. where 45 | `include/cuda.h` and `lib/libcudart.so` live). 
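For example (the paths below are placeholders; adjust them to your setup):

```bash
export WARP_CTC_PATH=/path/to/warp-ctc/build
export CUDA_HOME=/usr/local/cuda   # only needed when building with GPU support
```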
46 | 47 | You should now be able to use `setup.py` to install the package into 48 | your current Python environment: 49 | 50 | ```bash 51 | python setup.py install 52 | ``` 53 | 54 | You can run a few unit tests with `setup.py` as well if you want: 55 | 56 | ```bash 57 | python setup.py test 58 | ``` 59 | 60 | ## Using the kernels 61 | 62 | First import the module: 63 | 64 | ```python 65 | import warpctc_tensorflow 66 | ``` 67 | 68 | The GPU kernel for the existing `CTCLoss` op is registered and ready 69 | to use. If you want to use WarpCTC as the CPU kernel for the 70 | `CTCLoss` op you can use the ("experimental") `_kernel_label_map` 71 | function to tell TensorFlow to use WarpCTC kernels instead of the 72 | default CPU kernel: 73 | 74 | ```python 75 | with tf.get_default_graph()._kernel_label_map({"CTCLoss": "WarpCTC"}): 76 | ... 77 | loss = tf.nn.ctc_loss(inputs, labels, seq_lens) 78 | ``` 79 | 80 | Note that `preprocess_collapse_repeated` must be `False` and 81 | `ctc_merge_repeated` must be `True` (their default values) as these 82 | options are not currently supported. 83 | 84 | The WarpCTC op is available via the `warpctc_tensorflow.ctc` function: 85 | 86 | ```python 87 | costs = warpctc_tensorflow.ctc(activations, flat_labels, label_lengths, input_lengths) 88 | ``` 89 | 90 | The `activations` input is a 3 dimensional Tensor and all the others 91 | are single dimension Tensors. See the main WarpCTC documentation for 92 | more information. 93 | 94 | 95 | -------------------------------------------------------------------------------- /tensorflow_binding/setup.py: -------------------------------------------------------------------------------- 1 | """setup.py script for warp-ctc TensorFlow wrapper""" 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import platform 7 | import re 8 | import setuptools 9 | import sys 10 | import unittest 11 | import warnings 12 | from setuptools.command.build_ext import build_ext as orig_build_ext 13 | 14 | # We need to import tensorflow to find where its include directory is. 15 | try: 16 | import tensorflow as tf 17 | except ImportError: 18 | raise RuntimeError("Tensorflow must be installed to build the tensorflow wrapper.") 19 | 20 | if "CUDA_HOME" not in os.environ: 21 | print("CUDA_HOME not found in the environment so building " 22 | "without GPU support. To build with GPU support " 23 | "please define the CUDA_HOME environment variable. 
" 24 | "This should be a path which contains include/cuda.h", 25 | file=sys.stderr) 26 | enable_gpu = False 27 | else: 28 | enable_gpu = True 29 | 30 | 31 | if "TENSORFLOW_SRC_PATH" not in os.environ: 32 | print("Please define the TENSORFLOW_SRC_PATH environment variable.\n" 33 | "This should be a path to the Tensorflow source directory.", 34 | file=sys.stderr) 35 | sys.exit(1) 36 | 37 | if platform.system() == 'Darwin': 38 | lib_ext = ".dylib" 39 | else: 40 | lib_ext = ".so" 41 | 42 | warp_ctc_path = "../build" 43 | if "WARP_CTC_PATH" in os.environ: 44 | warp_ctc_path = os.environ["WARP_CTC_PATH"] 45 | if not os.path.exists(os.path.join(warp_ctc_path, "libwarpctc"+lib_ext)): 46 | print(("Could not find libwarpctc.so in {}.\n" 47 | "Build warp-ctc and set WARP_CTC_PATH to the location of" 48 | " libwarpctc.so (default is '../build')").format(warp_ctc_path), 49 | file=sys.stderr) 50 | sys.exit(1) 51 | 52 | root_path = os.path.realpath(os.path.dirname(__file__)) 53 | 54 | tf_include = tf.sysconfig.get_include() 55 | tf_src_dir = os.environ["TENSORFLOW_SRC_PATH"] 56 | tf_includes = [tf_include, tf_src_dir] 57 | warp_ctc_includes = [os.path.join(root_path, '../include')] 58 | include_dirs = tf_includes + warp_ctc_includes 59 | 60 | if tf.__version__ >= '1.4': 61 | include_dirs += [tf_include + '/../../external/nsync/public'] 62 | 63 | if os.getenv("TF_CXX11_ABI") is not None: 64 | TF_CXX11_ABI = os.getenv("TF_CXX11_ABI") 65 | else: 66 | warnings.warn("Assuming tensorflow was compiled without C++11 ABI. " 67 | "It is generally true if you are using binary pip package. " 68 | "If you compiled tensorflow from source with gcc >= 5 and didn't set " 69 | "-D_GLIBCXX_USE_CXX11_ABI=0 during compilation, you need to set " 70 | "environment variable TF_CXX11_ABI=1 when compiling this bindings. " 71 | "Also be sure to touch some files in src to trigger recompilation. " 72 | "Also, you need to set (or unsed) this environment variable if getting " 73 | "undefined symbol: _ZN10tensorflow... errors") 74 | TF_CXX11_ABI = "0" 75 | 76 | extra_compile_args = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=' + TF_CXX11_ABI] 77 | # current tensorflow code triggers return type errors, silence those for now 78 | extra_compile_args += ['-Wno-return-type'] 79 | 80 | extra_link_args = [] 81 | if tf.__version__ >= '1.4': 82 | if os.path.exists(os.path.join(tf_src_dir, 'libtensorflow_framework.so')): 83 | extra_link_args = ['-L' + tf.sysconfig.get_lib(), '-ltensorflow_framework'] 84 | 85 | if (enable_gpu): 86 | extra_compile_args += ['-DWARPCTC_ENABLE_GPU'] 87 | include_dirs += [os.path.join(os.environ["CUDA_HOME"], 'include')] 88 | 89 | # mimic tensorflow cuda include setup so that their include command work 90 | if not os.path.exists(os.path.join(root_path, "include")): 91 | os.mkdir(os.path.join(root_path, "include")) 92 | 93 | cuda_inc_path = os.path.join(root_path, "include/cuda") 94 | if not os.path.exists(cuda_inc_path) or os.readlink(cuda_inc_path) != os.environ["CUDA_HOME"]: 95 | if os.path.exists(cuda_inc_path): 96 | os.remove(cuda_inc_path) 97 | os.symlink(os.environ["CUDA_HOME"], cuda_inc_path) 98 | include_dirs += [os.path.join(root_path, 'include')] 99 | 100 | # Ensure that all expected files and directories exist. 
101 | for loc in include_dirs: 102 | if not os.path.exists(loc): 103 | print(("Could not find file or directory {}.\n" 104 | "Check your environment variables and paths?").format(loc), 105 | file=sys.stderr) 106 | sys.exit(1) 107 | 108 | lib_srcs = ['src/ctc_op_kernel.cc', 'src/warpctc_op.cc'] 109 | 110 | ext = setuptools.Extension('warpctc_tensorflow.kernels', 111 | sources = lib_srcs, 112 | language = 'c++', 113 | include_dirs = include_dirs, 114 | library_dirs = [warp_ctc_path], 115 | runtime_library_dirs = [os.path.realpath(warp_ctc_path)], 116 | libraries = ['warpctc', 'tensorflow_framework'], 117 | extra_compile_args = extra_compile_args, 118 | extra_link_args = extra_link_args) 119 | 120 | class build_tf_ext(orig_build_ext): 121 | def build_extensions(self): 122 | self.compiler.compiler_so.remove('-Wstrict-prototypes') 123 | orig_build_ext.build_extensions(self) 124 | 125 | def discover_test_suite(): 126 | test_loader = unittest.TestLoader() 127 | test_suite = test_loader.discover('tests', pattern='test_*.py') 128 | return test_suite 129 | 130 | # Read the README.md file for the long description. This lets us avoid 131 | # duplicating the package description in multiple places in the source. 132 | README_PATH = os.path.join(os.path.dirname(__file__), "README.md") 133 | with open(README_PATH, "r") as handle: 134 | # Extract everything between the first set of ## headlines 135 | LONG_DESCRIPTION = re.search("#.*([^#]*)##", handle.read()).group(1).strip() 136 | 137 | setuptools.setup( 138 | name = "warpctc_tensorflow", 139 | version = "0.1", 140 | description = "TensorFlow wrapper for warp-ctc", 141 | long_description = LONG_DESCRIPTION, 142 | url = "https://github.com/baidu-research/warp-ctc", 143 | author = "Jared Casper", 144 | author_email = "jared.casper@baidu.com", 145 | license = "Apache", 146 | packages = ["warpctc_tensorflow"], 147 | ext_modules = [ext], 148 | cmdclass = {'build_ext': build_tf_ext}, 149 | test_suite = 'setup.discover_test_suite', 150 | ) 151 | -------------------------------------------------------------------------------- /tensorflow_binding/src/ctc_op_kernel.cc: -------------------------------------------------------------------------------- 1 | #ifdef WARPCTC_ENABLE_GPU 2 | #define EIGEN_USE_GPU 3 | #include 4 | #endif 5 | 6 | #include "tensorflow/core/framework/op_kernel.h" 7 | #include "tensorflow/core/kernels/bounds_check.h" 8 | #include "tensorflow/core/util/sparse/sparse_tensor.h" 9 | 10 | #include "ctc.h" 11 | 12 | namespace tf = tensorflow; 13 | 14 | namespace warp_ctc { 15 | 16 | class CTCLossOpBase : public tf::OpKernel { 17 | public: 18 | explicit CTCLossOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) { 19 | bool preprocess_collapse_repeated; 20 | OP_REQUIRES_OK(ctx, ctx->GetAttr("preprocess_collapse_repeated", 21 | &preprocess_collapse_repeated)); 22 | OP_REQUIRES(ctx, preprocess_collapse_repeated == false, 23 | tf::errors::InvalidArgument("preprocess collapse repeated is not currently " 24 | "supported in the WarpCTC kernel.")); 25 | 26 | bool ctc_merge_repeated; 27 | OP_REQUIRES_OK(ctx, ctx->GetAttr("ctc_merge_repeated", &ctc_merge_repeated)); 28 | OP_REQUIRES(ctx, ctc_merge_repeated == true, 29 | tf::errors::InvalidArgument("ctc_merge_repeated == false is not currently " 30 | "supported. 
WarpCTC always merges repeated symbols.")); 31 | } 32 | 33 | void Compute(tf::OpKernelContext* ctx) override { 34 | const tf::Tensor* inputs; 35 | const tf::Tensor* labels_indices; 36 | const tf::Tensor* labels_values; 37 | const tf::Tensor* seq_len; 38 | OP_REQUIRES_OK(ctx, ctx->input("inputs", &inputs)); 39 | OP_REQUIRES_OK(ctx, ctx->input("labels_indices", &labels_indices)); 40 | OP_REQUIRES_OK(ctx, ctx->input("labels_values", &labels_values)); 41 | OP_REQUIRES_OK(ctx, ctx->input("sequence_length", &seq_len)); 42 | 43 | OP_REQUIRES(ctx, inputs->shape().dims() == 3, 44 | tf::errors::InvalidArgument("inputs is not a 3-Tensor")); 45 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(seq_len->shape()), 46 | tf::errors::InvalidArgument("sequence_length is not a vector")); 47 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsMatrix(labels_indices->shape()), 48 | tf::errors::InvalidArgument("labels_indices is not a matrix")); 49 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(labels_values->shape()), 50 | tf::errors::InvalidArgument("labels_values is not a vector")); 51 | 52 | const auto& inputs_shape = inputs->shape(); 53 | const auto max_time = inputs_shape.dim_size(0); 54 | const auto batch_size = inputs_shape.dim_size(1); 55 | const auto num_classes_raw = inputs_shape.dim_size(2); 56 | OP_REQUIRES( 57 | ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits::max()), 58 | tf::errors::InvalidArgument("num_classes cannot exceed max int")); 59 | const auto num_classes = static_cast(num_classes_raw); 60 | 61 | OP_REQUIRES( 62 | ctx, batch_size == seq_len->dim_size(0), 63 | tf::errors::InvalidArgument("len(sequence_length) != batch_size. ", 64 | "len(sequence_length): ", seq_len->dim_size(0), 65 | " batch_size: ", batch_size)); 66 | auto seq_len_t = seq_len->vec(); 67 | 68 | OP_REQUIRES(ctx, labels_indices->dim_size(0) == labels_values->dim_size(0), 69 | tf::errors::InvalidArgument( 70 | "labels_indices and labels_values must contain the " 71 | "same number of rows, but saw shapes: ", 72 | labels_indices->shape().DebugString(), " vs. ", 73 | labels_values->shape().DebugString())); 74 | 75 | auto labels_shape = tf::TensorShape({batch_size, max_time}); 76 | auto order = std::vector{0, 1}; 77 | auto labels_sp = tf::sparse::SparseTensor(*labels_indices, *labels_values, 78 | labels_shape, order); 79 | 80 | auto labels_sp_valid = labels_sp.IndicesValid(); 81 | OP_REQUIRES(ctx, labels_sp_valid.ok(), 82 | tf::errors::InvalidArgument("label SparseTensor is not valid: ", 83 | labels_sp_valid.error_message())); 84 | 85 | auto label_lengths = std::vector{}; 86 | for (const auto& g : labels_sp.group({0})) { // iterate by batch 87 | const auto batch_indices = g.group()[0]; 88 | OP_REQUIRES(ctx, tf::FastBoundsCheck(batch_indices, batch_size), 89 | tf::errors::InvalidArgument("labels batch index must be between ", 90 | 0, " and ", batch_size, " but saw: ", 91 | batch_indices)); 92 | 93 | auto values = g.values(); 94 | label_lengths.push_back(values.size()); 95 | } 96 | auto label_values_t = labels_values->vec(); 97 | 98 | 99 | OP_REQUIRES(ctx, static_cast(batch_size) == label_lengths.size(), 100 | tf::errors::InvalidArgument("len(labels) != batch_size. 
", 101 | "len(labels): ", label_lengths.size(), 102 | " batch_size: ", batch_size)); 103 | 104 | for (int b = 0; b < batch_size; ++b) { 105 | OP_REQUIRES( 106 | ctx, seq_len_t(b) <= max_time, 107 | tf::errors::InvalidArgument("sequence_length(", b, ") <= ", max_time)); 108 | } 109 | 110 | tf::Tensor* loss = nullptr; 111 | OP_REQUIRES_OK(ctx, ctx->allocate_output("loss", seq_len->shape(), &loss)); 112 | auto loss_t = loss->vec(); 113 | 114 | tf::Tensor* gradient; 115 | OP_REQUIRES_OK(ctx, 116 | ctx->allocate_output("gradient", inputs_shape, &gradient)); 117 | set_zero(gradient); 118 | auto gradient_t = gradient->tensor(); 119 | 120 | auto inputs_t = inputs->tensor(); 121 | 122 | auto options = create_options(ctx); 123 | options.blank_label = num_classes - 1; 124 | 125 | size_t workspace_size_bytes; 126 | auto warp_status = get_workspace_size(label_lengths.data(), seq_len_t.data(), 127 | num_classes, batch_size, 128 | options, &workspace_size_bytes); 129 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 130 | tf::errors::Internal("warp_ctc error in get_workspace_size: ", 131 | ctcGetStatusString(warp_status))); 132 | 133 | auto workspace_shape = tf::TensorShape{static_cast(workspace_size_bytes)}; 134 | tf::Tensor workspace; 135 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace)); 136 | auto workspace_t = workspace.flat(); 137 | 138 | warp_status = compute_ctc_loss(inputs_t.data(), 139 | gradient_t.data(), 140 | label_values_t.data(), 141 | label_lengths.data(), 142 | seq_len_t.data(), 143 | num_classes, batch_size, 144 | loss_t.data(), workspace_t.data(), options); 145 | 146 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 147 | tf::errors::Internal("warp_ctc error in compute_ctc_loss: ", 148 | ctcGetStatusString(warp_status))); 149 | 150 | } 151 | 152 | private: 153 | virtual void set_zero(tf::Tensor* t) = 0; 154 | virtual ctcOptions create_options(tf::OpKernelContext* ctx) = 0; 155 | }; 156 | 157 | class CTCLossOpCPU : public CTCLossOpBase { 158 | public: 159 | explicit CTCLossOpCPU(tf::OpKernelConstruction* ctx) : CTCLossOpBase(ctx) { 160 | } 161 | 162 | private: 163 | void set_zero(tf::Tensor* t) override { 164 | t->flat().setZero(); 165 | } 166 | 167 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 168 | auto options = ctcOptions{}; 169 | options.loc = CTC_CPU; 170 | options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads; 171 | return options; 172 | } 173 | }; 174 | 175 | REGISTER_KERNEL_BUILDER(Name("CTCLoss") 176 | .Device(::tensorflow::DEVICE_CPU) 177 | .Label("WarpCTC"), 178 | CTCLossOpCPU); 179 | 180 | #ifdef WARPCTC_ENABLE_GPU 181 | 182 | class CTCLossOpGPU : public CTCLossOpBase { 183 | public: 184 | explicit CTCLossOpGPU(tf::OpKernelConstruction* ctx) : CTCLossOpBase(ctx) { 185 | } 186 | 187 | private: 188 | void set_zero(tf::Tensor* t) override { 189 | cudaMemset(t->flat().data(), 0, t->NumElements()*sizeof(float)); 190 | } 191 | 192 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 193 | auto cuda_stream = ctx->eigen_device().stream(); 194 | auto options = ctcOptions{}; 195 | options.loc = CTC_GPU; 196 | options.stream = cuda_stream; 197 | return options; 198 | } 199 | }; 200 | 201 | // Register GPU kernel both with and without the label 202 | REGISTER_KERNEL_BUILDER(Name("CTCLoss") 203 | .Device(::tensorflow::DEVICE_GPU) 204 | .Label("WarpCTC") 205 | .HostMemory("labels_indices") 206 | .HostMemory("labels_values") 207 | .HostMemory("sequence_length") 208 | .HostMemory("loss"), 
209 | CTCLossOpGPU); 210 | REGISTER_KERNEL_BUILDER(Name("CTCLoss") 211 | .Device(::tensorflow::DEVICE_GPU) 212 | .HostMemory("labels_indices") 213 | .HostMemory("labels_values") 214 | .HostMemory("sequence_length") 215 | .HostMemory("loss"), 216 | CTCLossOpGPU); 217 | 218 | #undef EIGEN_USE_GPU 219 | #endif 220 | 221 | } 222 | -------------------------------------------------------------------------------- /tensorflow_binding/src/warpctc_op.cc: -------------------------------------------------------------------------------- 1 | #ifdef WARPCTC_ENABLE_GPU 2 | #define EIGEN_USE_GPU 3 | #include 4 | #endif 5 | 6 | #include "tensorflow/core/framework/op.h" 7 | #include "tensorflow/core/framework/op_kernel.h" 8 | #include "tensorflow/core/kernels/bounds_check.h" 9 | #include "tensorflow/core/framework/allocator.h" 10 | #include "ctc.h" 11 | 12 | 13 | REGISTER_OP("WarpCTC") 14 | .Input("activations: float32") 15 | .Input("flat_labels: int32") 16 | .Input("label_lengths: int32") 17 | .Input("input_lengths: int32") 18 | .Attr("blank_label: int = 0") 19 | .Output("costs: float32") 20 | .Output("gradients: float32"); 21 | 22 | namespace tf = tensorflow; 23 | 24 | namespace warp_ctc { 25 | 26 | class WarpCTCOpBase : public tf::OpKernel { 27 | public: 28 | explicit WarpCTCOpBase(tf::OpKernelConstruction* ctx) : tf::OpKernel(ctx) { 29 | OP_REQUIRES_OK(ctx, ctx->GetAttr("blank_label", &blank_label_)); 30 | } 31 | 32 | void Compute(tf::OpKernelContext* ctx) override { 33 | // Grab the input tensors 34 | const tf::Tensor* activations; 35 | const tf::Tensor* flat_labels; 36 | const tf::Tensor* label_lengths; 37 | const tf::Tensor* input_lengths; 38 | OP_REQUIRES_OK(ctx, ctx->input("activations", &activations)); 39 | OP_REQUIRES_OK(ctx, ctx->input("flat_labels", &flat_labels)); 40 | OP_REQUIRES_OK(ctx, ctx->input("label_lengths", &label_lengths)); 41 | OP_REQUIRES_OK(ctx, ctx->input("input_lengths", &input_lengths)); 42 | 43 | OP_REQUIRES(ctx, activations->shape().dims() == 3, 44 | tf::errors::InvalidArgument("activations is not a 3-Tensor")); 45 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(flat_labels->shape()), 46 | tf::errors::InvalidArgument("flat_labels is not a vector")); 47 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(label_lengths->shape()), 48 | tf::errors::InvalidArgument("label_lengths is not a vector")); 49 | OP_REQUIRES(ctx, tf::TensorShapeUtils::IsVector(input_lengths->shape()), 50 | tf::errors::InvalidArgument("input_lengths is not a vector")); 51 | 52 | const auto& acts_shape = activations->shape(); 53 | const auto max_time = acts_shape.dim_size(0); 54 | const auto batch_size = acts_shape.dim_size(1); 55 | const auto num_classes_raw = acts_shape.dim_size(2); 56 | 57 | auto activations_t = activations->tensor(); 58 | auto flat_labels_t = flat_labels->vec(); 59 | 60 | OP_REQUIRES( 61 | ctx, tf::FastBoundsCheck(num_classes_raw, std::numeric_limits::max()), 62 | tf::errors::InvalidArgument("num_classes cannot exceed max int")); 63 | const auto alphabet_size = static_cast(num_classes_raw); 64 | 65 | OP_REQUIRES( 66 | ctx, batch_size == input_lengths->dim_size(0), 67 | tf::errors::InvalidArgument("len(input_lengths) != batch_size. ", 68 | "len(input_length): ", input_lengths->dim_size(0), 69 | " batch_size: ", batch_size)); 70 | auto input_lengths_t = input_lengths->vec(); 71 | 72 | OP_REQUIRES( 73 | ctx, batch_size == label_lengths->dim_size(0), 74 | tf::errors::InvalidArgument("len(label_lengths) != batch_size. 
", 75 | "len(label_length): ", label_lengths->dim_size(0), 76 | " batch_size: ", batch_size)); 77 | auto label_lengths_t = label_lengths->vec(); 78 | 79 | // check that labels are in the alphabet? 80 | 81 | for (int b = 0; b < batch_size; b++) { 82 | OP_REQUIRES(ctx, input_lengths_t(b) <= max_time, 83 | tf::errors::InvalidArgument("input_lengths(", b, ") <= ", max_time)); 84 | } 85 | 86 | tf::Tensor* costs = nullptr; 87 | OP_REQUIRES_OK(ctx, ctx->allocate_output("costs", input_lengths->shape(), &costs)); 88 | auto costs_t = costs->vec(); 89 | 90 | tf::Tensor* grads = nullptr; 91 | OP_REQUIRES_OK(ctx, ctx->allocate_output("gradients", activations->shape(), 92 | &grads)); 93 | set_zero(grads); 94 | auto grads_t = grads->tensor(); 95 | 96 | auto options = create_options(ctx); 97 | options.blank_label = blank_label_; 98 | 99 | size_t workspace_size_bytes; 100 | auto warp_status = get_workspace_size(label_lengths_t.data(), 101 | input_lengths_t.data(), 102 | alphabet_size, batch_size, 103 | options, &workspace_size_bytes); 104 | 105 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 106 | tf::errors::Internal("warp_ctc error in get_workspace_size: ", 107 | ctcGetStatusString(warp_status))); 108 | 109 | auto workspace_shape = tf::TensorShape{static_cast(workspace_size_bytes)}; 110 | tf::Tensor workspace; 111 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(tf::DT_UINT8, workspace_shape, &workspace)); 112 | auto workspace_t = workspace.flat(); 113 | 114 | // compute CTC 115 | warp_status = compute_ctc_loss(activations_t.data(), 116 | grads_t.data(), 117 | flat_labels_t.data(), 118 | label_lengths_t.data(), 119 | input_lengths_t.data(), 120 | alphabet_size, batch_size, 121 | costs_t.data(), workspace_t.data(), options); 122 | 123 | OP_REQUIRES(ctx, warp_status == CTC_STATUS_SUCCESS, 124 | tf::errors::Internal("warp_ctc error in compute_ctc_loss: ", 125 | ctcGetStatusString(warp_status))); 126 | 127 | } 128 | private: 129 | int blank_label_; 130 | virtual void set_zero(tf::Tensor* t) = 0; 131 | virtual ctcOptions create_options(tf::OpKernelContext* ctx) = 0; 132 | }; 133 | 134 | class WarpCTCOpCPU : public WarpCTCOpBase { 135 | public: 136 | explicit WarpCTCOpCPU(tf::OpKernelConstruction* ctx) : WarpCTCOpBase(ctx) { 137 | } 138 | 139 | private: 140 | void set_zero(tf::Tensor* t) override { 141 | t->flat().setZero(); 142 | } 143 | 144 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 145 | auto options = ctcOptions{}; 146 | options.loc = CTC_CPU; 147 | options.num_threads = ctx->device()->tensorflow_cpu_worker_threads()->num_threads; 148 | return options; 149 | } 150 | }; 151 | 152 | REGISTER_KERNEL_BUILDER(Name("WarpCTC").Device(::tensorflow::DEVICE_CPU), WarpCTCOpCPU); 153 | 154 | #ifdef WARPCTC_ENABLE_GPU 155 | 156 | class WarpCTCOpGPU : public WarpCTCOpBase { 157 | public: 158 | explicit WarpCTCOpGPU(tf::OpKernelConstruction* ctx) : WarpCTCOpBase(ctx) { 159 | } 160 | 161 | private: 162 | void set_zero(tf::Tensor* t) override { 163 | cudaMemset(t->flat().data(), 0, t->NumElements()*sizeof(float)); 164 | } 165 | 166 | ctcOptions create_options(tf::OpKernelContext* ctx) override { 167 | auto cuda_stream = ctx->eigen_device().stream(); 168 | auto options = ctcOptions{}; 169 | options.loc = CTC_GPU; 170 | options.stream = cuda_stream; 171 | return options; 172 | } 173 | }; 174 | 175 | REGISTER_KERNEL_BUILDER(Name("WarpCTC").Device(::tensorflow::DEVICE_GPU) 176 | .HostMemory("flat_labels") 177 | .HostMemory("label_lengths") 178 | .HostMemory("input_lengths") 179 | .HostMemory("costs"), 180 | 
WarpCTCOpGPU); 181 | #undef EIGEN_USE_GPU 182 | #endif 183 | 184 | } 185 | 186 | -------------------------------------------------------------------------------- /tensorflow_binding/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/baidu-research/warp-ctc/94b2fa178347cf02757bdc7329dc2f1b46f5d094/tensorflow_binding/tests/__init__.py -------------------------------------------------------------------------------- /tensorflow_binding/tests/test_ctc_loss_op.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for tensorflow.ctc_ops.ctc_decoder_ops.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | import warpctc_tensorflow 24 | from tensorflow.python.client import device_lib 25 | 26 | 27 | def SimpleSparseTensorFrom(x): 28 | """Create a very simple SparseTensor with dimensions (batch, time). 29 | 30 | Args: 31 | x: a list of lists of type int 32 | 33 | Returns: 34 | x_ix and x_val, the indices and values of the SparseTensor<2>. 
35 | """ 36 | x_ix = [] 37 | x_val = [] 38 | for batch_i, batch in enumerate(x): 39 | for time, val in enumerate(batch): 40 | x_ix.append([batch_i, time]) 41 | x_val.append(val) 42 | x_shape = [len(x), np.asarray(x_ix).max(0)[1]+1] 43 | x_ix = tf.constant(x_ix, tf.int64) 44 | x_val = tf.constant(x_val, tf.int32) 45 | x_shape = tf.constant(x_shape, tf.int64) 46 | 47 | return tf.SparseTensor(x_ix, x_val, x_shape) 48 | 49 | def is_gpu_available(): 50 | """Returns whether TensorFlow can access a GPU.""" 51 | return any(x.device_type == 'GPU' for x in device_lib.list_local_devices()) 52 | 53 | class CTCLossTest(tf.test.TestCase): 54 | 55 | def _testCTCLoss(self, inputs, seq_lens, labels, 56 | loss_truth, grad_truth, 57 | use_gpu=False, expected_err_re=None): 58 | self.assertEqual(len(inputs), len(grad_truth)) 59 | 60 | inputs_t = tf.constant(inputs) 61 | 62 | log_dev_placement = False 63 | if not use_gpu: 64 | # Note: using use_gpu=False seems to not work 65 | # it runs the GPU version instead 66 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 67 | device_count={'GPU': 0}) 68 | else: 69 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 70 | allow_soft_placement=False) 71 | 72 | with tf.get_default_graph()._kernel_label_map({"CTCLoss": "WarpCTC"}): 73 | with self.test_session(use_gpu=use_gpu, force_gpu=use_gpu, config=config) as sess: 74 | loss = tf.nn.ctc_loss(inputs=inputs_t, 75 | labels=labels, 76 | sequence_length=seq_lens) 77 | grad = tf.gradients(loss, [inputs_t])[0] 78 | 79 | self.assertShapeEqual(loss_truth, loss) 80 | self.assertShapeEqual(grad_truth, grad) 81 | 82 | if expected_err_re is None: 83 | (tf_loss, tf_grad) = sess.run([loss, grad]) 84 | self.assertAllClose(tf_loss, loss_truth, rtol=1e-4, atol=1e-4) 85 | self.assertAllClose(tf_grad, grad_truth, rtol=1e-4, atol=1e-4) 86 | else: 87 | with self.assertRaisesOpError(expected_err_re): 88 | sess.run([loss, grad]) 89 | 90 | def _testBasic(self, use_gpu): 91 | """Test two batch entries.""" 92 | # Input and ground truth from Alex Graves' implementation. 
93 | # 94 | #### Batch entry 0 ##### 95 | # targets: 0 1 2 1 0 96 | # outputs: 97 | # 0 0.633766 0.221185 0.0917319 0.0129757 0.0142857 0.0260553 98 | # 1 0.111121 0.588392 0.278779 0.0055756 0.00569609 0.010436 99 | # 2 0.0357786 0.633813 0.321418 0.00249248 0.00272882 0.0037688 100 | # 3 0.0663296 0.643849 0.280111 0.00283995 0.0035545 0.00331533 101 | # 4 0.458235 0.396634 0.123377 0.00648837 0.00903441 0.00623107 102 | # alpha: 103 | # 0 -3.64753 -0.456075 -inf -inf -inf -inf -inf -inf -inf -inf -inf 104 | # 1 -inf -inf -inf -0.986437 -inf -inf -inf -inf -inf -inf -inf 105 | # 2 -inf -inf -inf -inf -inf -2.12145 -inf -inf -inf -inf -inf 106 | # 3 -inf -inf -inf -inf -inf -inf -inf -2.56174 -inf -inf -inf 107 | # 4 -inf -inf -inf -inf -inf -inf -inf -inf -inf -3.34211 -inf 108 | # beta: 109 | # 0 -inf -2.88604 -inf -inf -inf -inf -inf -inf -inf -inf -inf 110 | # 1 -inf -inf -inf -2.35568 -inf -inf -inf -inf -inf -inf -inf 111 | # 2 -inf -inf -inf -inf -inf -1.22066 -inf -inf -inf -inf -inf 112 | # 3 -inf -inf -inf -inf -inf -inf -inf -0.780373 -inf -inf -inf 113 | # 4 -inf -inf -inf -inf -inf -inf -inf -inf -inf 0 0 114 | # prob: -3.34211 115 | # outputDerivs: 116 | # 0 -0.366234 0.221185 0.0917319 0.0129757 0.0142857 0.0260553 117 | # 1 0.111121 -0.411608 0.278779 0.0055756 0.00569609 0.010436 118 | # 2 0.0357786 0.633813 -0.678582 0.00249248 0.00272882 0.0037688 119 | # 3 0.0663296 -0.356151 0.280111 0.00283995 0.0035545 0.00331533 120 | # 4 -0.541765 0.396634 0.123377 0.00648837 0.00903441 0.00623107 121 | # 122 | #### Batch entry 1 ##### 123 | # 124 | # targets: 0 1 1 0 125 | # outputs: 126 | # 0 0.30176 0.28562 0.0831517 0.0862751 0.0816851 0.161508 127 | # 1 0.24082 0.397533 0.0557226 0.0546814 0.0557528 0.19549 128 | # 2 0.230246 0.450868 0.0389607 0.038309 0.0391602 0.202456 129 | # 3 0.280884 0.429522 0.0326593 0.0339046 0.0326856 0.190345 130 | # 4 0.423286 0.315517 0.0338439 0.0393744 0.0339315 0.154046 131 | # alpha: 132 | # 0 -1.8232 -1.19812 -inf -inf -inf -inf -inf -inf -inf 133 | # 1 -inf -2.19315 -2.83037 -2.1206 -inf -inf -inf -inf -inf 134 | # 2 -inf -inf -inf -2.03268 -3.71783 -inf -inf -inf -inf 135 | # 3 -inf -inf -inf -inf -inf -4.56292 -inf -inf -inf 136 | # 4 -inf -inf -inf -inf -inf -inf -inf -5.42262 -inf 137 | # beta: 138 | # 0 -inf -4.2245 -inf -inf -inf -inf -inf -inf -inf 139 | # 1 -inf -inf -inf -3.30202 -inf -inf -inf -inf -inf 140 | # 2 -inf -inf -inf -inf -1.70479 -0.856738 -inf -inf -inf 141 | # 3 -inf -inf -inf -inf -inf -0.859706 -0.859706 -0.549337 -inf 142 | # 4 -inf -inf -inf -inf -inf -inf -inf 0 0 143 | # prob: -5.42262 144 | # outputDerivs: 145 | # 0 -0.69824 0.28562 0.0831517 0.0862751 0.0816851 0.161508 146 | # 1 0.24082 -0.602467 0.0557226 0.0546814 0.0557528 0.19549 147 | # 2 0.230246 0.450868 0.0389607 0.038309 0.0391602 -0.797544 148 | # 3 0.280884 -0.570478 0.0326593 0.0339046 0.0326856 0.190345 149 | # 4 -0.576714 0.315517 0.0338439 0.0393744 0.0339315 0.154046 150 | 151 | # max_time_steps == 7 152 | depth = 6 153 | 154 | # seq_len_0 == 5 155 | targets_0 = [0, 1, 2, 1, 0] 156 | loss_log_prob_0 = -3.34211 157 | # dimensions are time x depth 158 | input_prob_matrix_0 = np.asarray( 159 | [[0.633766, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], 160 | [0.111121, 0.588392, 0.278779, 0.0055756, 0.00569609, 0.010436], 161 | [0.0357786, 0.633813, 0.321418, 0.00249248, 0.00272882, 0.0037688], 162 | [0.0663296, 0.643849, 0.280111, 0.00283995, 0.0035545, 0.00331533], 163 | [0.458235, 0.396634, 0.123377, 0.00648837, 0.00903441, 
0.00623107]], 164 | dtype=np.float32) 165 | input_log_prob_matrix_0 = np.log(input_prob_matrix_0) 166 | gradient_log_prob_0 = np.asarray( 167 | [[-0.366234, 0.221185, 0.0917319, 0.0129757, 0.0142857, 0.0260553], 168 | [0.111121, -0.411608, 0.278779, 0.0055756, 0.00569609, 0.010436], 169 | [0.0357786, 0.633813, -0.678582, 0.00249248, 0.00272882, 0.0037688], 170 | [0.0663296, -0.356151, 0.280111, 0.00283995, 0.0035545, 0.00331533], 171 | [-0.541765, 0.396634, 0.123377, 0.00648837, 0.00903441, 0.00623107]], 172 | dtype=np.float32) 173 | 174 | # seq_len_1 == 5 175 | targets_1 = [0, 1, 1, 0] 176 | loss_log_prob_1 = -5.42262 177 | # dimensions are time x depth 178 | 179 | input_prob_matrix_1 = np.asarray( 180 | [[0.30176, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], 181 | [0.24082, 0.397533, 0.0557226, 0.0546814, 0.0557528, 0.19549], 182 | [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, 0.202456], 183 | [0.280884, 0.429522, 0.0326593, 0.0339046, 0.0326856, 0.190345], 184 | [0.423286, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], 185 | dtype=np.float32) 186 | input_log_prob_matrix_1 = np.log(input_prob_matrix_1) 187 | gradient_log_prob_1 = np.asarray( 188 | [[-0.69824, 0.28562, 0.0831517, 0.0862751, 0.0816851, 0.161508], 189 | [0.24082, -0.602467, 0.0557226, 0.0546814, 0.0557528, 0.19549], 190 | [0.230246, 0.450868, 0.0389607, 0.038309, 0.0391602, -0.797544], 191 | [0.280884, -0.570478, 0.0326593, 0.0339046, 0.0326856, 0.190345], 192 | [-0.576714, 0.315517, 0.0338439, 0.0393744, 0.0339315, 0.154046]], 193 | dtype=np.float32) 194 | 195 | # len max_time_steps array of 2 x depth matrices 196 | inputs = [np.vstack([input_log_prob_matrix_0[t, :], 197 | input_log_prob_matrix_1[t, :]]) 198 | for t in range(5)] + 2 * [np.nan*np.ones((2, depth), np.float32)] 199 | 200 | # convert inputs into [max_time x batch_size x depth tensor] Tensor 201 | inputs = np.asarray(inputs, dtype=np.float32) 202 | 203 | # len batch_size array of label vectors 204 | labels = SimpleSparseTensorFrom([targets_0, targets_1]) 205 | 206 | # batch_size length vector of sequence_lengths 207 | seq_lens = np.array([5, 5], dtype=np.int32) 208 | 209 | # output: batch_size length vector of negative log probabilities 210 | loss_truth = np.array([-loss_log_prob_0, -loss_log_prob_1], np.float32) 211 | 212 | # output: len max_time_steps array of 2 x depth matrices 213 | grad_truth = [np.vstack([gradient_log_prob_0[t, :], 214 | gradient_log_prob_1[t, :]]) 215 | for t in range(5)] + 2 * [np.zeros((2, depth), np.float32)] 216 | 217 | # convert grad_truth into [max_time x batch_size x depth] Tensor 218 | grad_truth = np.asarray(grad_truth, dtype=np.float32) 219 | 220 | self._testCTCLoss(inputs, seq_lens, labels, loss_truth, grad_truth, use_gpu=use_gpu) 221 | 222 | def testBasicCPU(self): 223 | self._testBasic(use_gpu=False) 224 | 225 | def testBasicGPU(self): 226 | if (is_gpu_available()): 227 | self._testBasic(use_gpu=True) 228 | else: 229 | print("Skipping GPU test, no gpus available") 230 | 231 | if __name__ == "__main__": 232 | tf.test.main() 233 | -------------------------------------------------------------------------------- /tensorflow_binding/tests/test_warpctc_op.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from warpctc_tensorflow import ctc 4 | from tensorflow.python.client import device_lib 5 | 6 | def is_gpu_available(): 7 | """Returns whether TensorFlow can access a GPU.""" 8 | return any(x.device_type == 'GPU' for x 
in device_lib.list_local_devices()) 9 | 10 | class WarpCTCTest(tf.test.TestCase): 11 | 12 | def _run_ctc(self, activations, input_lengths, 13 | flat_labels, label_lengths, 14 | expected_costs, expected_gradients, 15 | use_gpu=False, expected_error=None): 16 | self.assertEquals(activations.shape, expected_gradients.shape) 17 | activations_t = tf.constant(activations) 18 | input_lengths_t = tf.constant(input_lengths) 19 | flat_labels_t = tf.constant(flat_labels) 20 | label_lengths_t = tf.constant(label_lengths) 21 | costs = ctc(activations=activations_t, 22 | flat_labels=flat_labels_t, 23 | label_lengths=label_lengths_t, 24 | input_lengths=input_lengths_t) 25 | 26 | grad = tf.gradients(costs, [activations_t])[0] 27 | 28 | self.assertShapeEqual(expected_costs, costs) 29 | 30 | self.assertShapeEqual(expected_gradients, grad) 31 | 32 | log_dev_placement = False 33 | if not use_gpu: 34 | # Note: using use_gpu=False seems to not work 35 | # it runs the GPU version instead 36 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 37 | device_count={'GPU': 0}) 38 | else: 39 | config = tf.ConfigProto(log_device_placement=log_dev_placement, 40 | allow_soft_placement=False) 41 | 42 | with self.test_session(use_gpu=use_gpu, force_gpu=use_gpu, config=config) as sess: 43 | if expected_error is None: 44 | (tf_costs, tf_grad) = sess.run([costs, grad]) 45 | self.assertAllClose(tf_costs, expected_costs, atol=1e-6) 46 | self.assertAllClose(tf_grad, expected_gradients, atol=1e-6) 47 | else: 48 | with self.assertRaisesOpError(expected_error): 49 | sess.run([costs, grad]) 50 | 51 | sess.run([costs, grad]) 52 | 53 | def _test_basic(self, use_gpu): 54 | # Softmax activations for the following inputs: 55 | activations = np.array([ 56 | [0.1, 0.6, 0.1, 0.1, 0.1], 57 | [0.1, 0.1, 0.6, 0.1, 0.1] 58 | ], dtype=np.float32) 59 | 60 | alphabet_size = 5 61 | # dimensions should be t, n, p: (t timesteps, n minibatches, 62 | # p prob of each alphabet). This is one instance, so expand 63 | # dimensions in the middle 64 | activations = np.expand_dims(activations, 1) 65 | labels = np.asarray([1, 2], dtype=np.int32) 66 | expected_costs = np.asarray([2.46286], dtype=np.float32) 67 | gradients = np.asarray([ 68 | [0.177031, -0.708125, 0.177031, 0.177031, 0.177031], 69 | [0.177031, 0.177031, -0.708125, 0.177031, 0.177031] 70 | ], dtype=np.float32) 71 | expected_gradients = np.expand_dims(gradients, 1) 72 | label_lengths = np.asarray([2], dtype=np.int32) 73 | input_lengths = np.asarray([2], dtype=np.int32) 74 | 75 | self._run_ctc(activations=activations, 76 | input_lengths=input_lengths, 77 | flat_labels=labels, label_lengths=label_lengths, 78 | expected_costs=expected_costs, 79 | expected_gradients=expected_gradients, 80 | use_gpu=use_gpu) 81 | 82 | def test_basic_cpu(self): 83 | self._test_basic(use_gpu=False) 84 | 85 | def test_basic_gpu(self): 86 | if (is_gpu_available()): 87 | self._test_basic(use_gpu=True) 88 | else: 89 | print("Skipping GPU test, no gpus available") 90 | 91 | def _test_multiple_batches(self, use_gpu): 92 | activations = np.array([ 93 | [0.1, 0.6, 0.1, 0.1, 0.1], 94 | [0.1, 0.1, 0.6, 0.1, 0.1] 95 | ], dtype=np.float32) 96 | 97 | alphabet_size = 5 98 | # dimensions should be t, n, p: (t timesteps, n minibatches, 99 | # p prob of each alphabet). 
This is one instance, so expand 100 | # dimensions in the middle 101 | _activations = np.expand_dims(activations, 1) 102 | activations = np.concatenate([_activations, _activations[...]], axis=1) 103 | labels = np.asarray([1, 2, 1, 2], dtype=np.int32) 104 | expected_costs = np.asarray([2.46286, 2.46286], dtype=np.float32) 105 | gradients = np.asarray([ 106 | [0.177031, -0.708125, 0.177031, 0.177031, 0.177031], 107 | [0.177031, 0.177031, -0.708125, 0.177031, 0.177031] 108 | ], dtype=np.float32) 109 | _expected_gradients = np.expand_dims(gradients, 1) 110 | expected_gradients = np.concatenate( 111 | [_expected_gradients, _expected_gradients[...]], axis=1) 112 | 113 | label_lengths = np.asarray([2, 2], dtype=np.int32) 114 | input_lengths = np.asarray([2, 2], dtype=np.int32) 115 | 116 | self._run_ctc(activations=activations, 117 | input_lengths=input_lengths, 118 | flat_labels=labels, label_lengths=label_lengths, 119 | expected_costs=expected_costs, 120 | expected_gradients=expected_gradients, 121 | use_gpu=use_gpu) 122 | 123 | def test_multiple_batches_cpu(self): 124 | self._test_multiple_batches(use_gpu=False) 125 | 126 | def test_multiple_batches_gpu(self): 127 | if (is_gpu_available()): 128 | self._test_multiple_batches(use_gpu=True) 129 | else: 130 | print("Skipping GPU test, no gpus available") 131 | 132 | if __name__ == "__main__": 133 | tf.test.main() 134 | -------------------------------------------------------------------------------- /tensorflow_binding/warpctc_tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import tensorflow as tf 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops.nn_grad import _BroadcastMul 5 | 6 | lib_file = imp.find_module('kernels', __path__)[1] 7 | _warpctc = tf.load_op_library(lib_file) 8 | 9 | def ctc(activations, flat_labels, label_lengths, input_lengths, 10 | blank_label=0): 11 | '''Computes the CTC loss between a sequence of activations and a 12 | ground truth labeling. 13 | 14 | Args: 15 | 16 | activations: A 3-D Tensor of floats. The dimensions 17 | should be (t, n, a), where t is the time index, n 18 | is the minibatch index, and a indexes over 19 | activations for each symbol in the alphabet. 20 | 21 | flat_labels: A 1-D Tensor of ints, a concatenation of all the 22 | labels for the minibatch. 23 | 24 | label_lengths: A 1-D Tensor of ints, the length of each label 25 | for each example in the minibatch. 26 | 27 | input_lengths: A 1-D Tensor of ints, the number of time steps 28 | for each sequence in the minibatch. 29 | 30 | blank_label: int, the label value/index that the CTC 31 | calculation should use as the blank label 32 | 33 | Returns: 34 | 1-D float Tensor, the cost of each example in the minibatch 35 | (as negative log probabilities). 36 | 37 | * This class performs the softmax operation internally. 38 | 39 | * The label reserved for the blank symbol should be label 0. 
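    Example (a minimal usage sketch; the values and the expected cost of
    roughly 2.46286 follow tests/test_warpctc_op.py, and the session-style
    call assumes the TF1 API used elsewhere in this binding):

        import numpy as np
        import tensorflow as tf
        from warpctc_tensorflow import ctc

        # Activations are (t, n, a): 2 time steps, 1 example, alphabet of 5.
        acts = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                         [0.1, 0.1, 0.6, 0.1, 0.1]], dtype=np.float32)
        acts = np.expand_dims(acts, 1)

        costs = ctc(activations=tf.constant(acts),
                    flat_labels=tf.constant(np.array([1, 2], dtype=np.int32)),
                    label_lengths=tf.constant(np.array([2], dtype=np.int32)),
                    input_lengths=tf.constant(np.array([2], dtype=np.int32)))

        with tf.Session() as sess:
            print(sess.run(costs))  # -> [2.46286...], the per-example CTC loss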
40 | 41 | ''' 42 | loss, _ = _warpctc.warp_ctc(activations, flat_labels, label_lengths, 43 | input_lengths, blank_label) 44 | return loss 45 | 46 | 47 | @ops.RegisterGradient("WarpCTC") 48 | def _CTCLossGrad(op, grad_loss, _): 49 | grad = op.outputs[1] 50 | return [_BroadcastMul(grad_loss, grad), None, None, None] 51 | 52 | 53 | @ops.RegisterShape("WarpCTC") 54 | def _CTCLossShape(op): 55 | inputs_shape = op.inputs[0].get_shape().with_rank(3) 56 | batch_size = inputs_shape[1] 57 | return [batch_size, inputs_shape] 58 | 59 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | inline void throw_on_error(ctcStatus_t status, const char* message) { 12 | if (status != CTC_STATUS_SUCCESS) { 13 | throw std::runtime_error(message + (", stat = " + 14 | std::string(ctcGetStatusString(status)))); 15 | } 16 | } 17 | 18 | #if (defined(__HIPCC__) || defined(__CUDACC__)) 19 | #ifdef __HIPCC__ 20 | #include 21 | #include 22 | 23 | inline void throw_on_error(hipError_t error, const char* message) { 24 | if (error) { 25 | throw thrust::system_error(error, thrust::hip_category(), message); 26 | } 27 | } 28 | #else 29 | #include 30 | #include 31 | 32 | inline void throw_on_error(cudaError_t error, const char* message) { 33 | if (error) { 34 | throw thrust::system_error(error, thrust::cuda_category(), message); 35 | } 36 | } 37 | #endif 38 | #endif 39 | 40 | std::vector 41 | genActs(int size) { 42 | std::vector arr(size); 43 | std::mt19937 gen(0); 44 | std::uniform_real_distribution<> dis(0, 1); 45 | for(int i = 0; i < size; ++i) 46 | arr[i] = dis(gen); 47 | return arr; 48 | } 49 | 50 | std::vector 51 | genLabels(int alphabet_size, int L) { 52 | std::vector label(L); 53 | 54 | std::mt19937 gen(1); 55 | std::uniform_int_distribution<> dis(1, alphabet_size - 1); 56 | 57 | for(int i = 0; i < L; ++i) { 58 | label[i] = dis(gen); 59 | } 60 | // guarantee repeats for testing 61 | if (L >= 3) { 62 | label[L / 2] = label[L / 2 + 1]; 63 | label[L / 2 - 1] = label[L / 2]; 64 | } 65 | return label; 66 | } 67 | 68 | float rel_diff(const std::vector& grad, 69 | const std::vector& num_grad) { 70 | float diff = 0.; 71 | float tot = 0.; 72 | for(size_t idx = 0; idx < grad.size(); ++idx) { 73 | diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]); 74 | tot += grad[idx] * grad[idx]; 75 | } 76 | 77 | return diff / tot; 78 | } 79 | 80 | // Numerically stable softmax for a minibatch of 1 81 | void softmax(const float* const acts, 82 | int alphabet_size, int T, 83 | float *probs) { 84 | 85 | for (int t = 0; t < T; ++t) { 86 | 87 | float max_activation = 88 | -std::numeric_limits::infinity(); 89 | 90 | for (int a = 0; a < alphabet_size; ++a) 91 | max_activation = 92 | std::max(max_activation, acts[t*alphabet_size + a]); 93 | 94 | float denom = 0; 95 | for (int a = 0; a < alphabet_size; ++a) 96 | denom += std::exp(acts[t*alphabet_size + a] - max_activation); 97 | 98 | for (int a = 0; a < alphabet_size; ++a) 99 | probs[t*alphabet_size + a] = 100 | std::exp(acts[t*alphabet_size + a] - max_activation) / denom; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /torch_binding/TUTORIAL.md: -------------------------------------------------------------------------------- 1 | ## Torch Tutorial 2 | 3 | [In Chinese 中文版](TUTORIAL.zh_cn.md) 4 | 5 | 
Make sure you have `warp-ctc` installed by running ```luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec``` at the top level directory. 6 | 7 | Using the torch bindings, it is easy to experiment with CTC interactively. 8 | 9 | If you have compiled without GPU support, replace `torch.Tensor(...):cuda()` with 10 | `torch.Tensor(...):float()` and calls to `gpu_ctc` with `cpu_ctc`. 11 | 12 | The CTC algorithm gives the loss between input sequences and target output sequences. Since CTC 13 | is commonly used with neural networks, we will call the input sequences activation sequences. 14 | The target output sequences are drawn from a fixed alphabet. For the discussion here we choose 15 | the four characters `a,b,c,d`. The algorithm requires a `` symbol distinct from the alphabet. 16 | This means that the activation sequences will be sequences of vectors of dimension five (the size of our alphabet 17 | together with ``). The vectors will be converted to a probability distribution over the alphabet 18 | and the `` with a SoftMax function. So for example a problem with an activation sequence of length seven 19 | would be (the components of the vectors here are arbitrary) 20 | 21 | ```{<2,0,0,0,0>, <1,3,0,0,0>, <1,4,1,0,0>, <1,1,5,6,0>, <1,1,1,1,1>, <1,1,7,1,1>, <9,1,1,1,1>}``` 22 | 23 | and a valid target output sequence would be `dacba`. 24 | 25 | 26 | 27 | To start we are going to use a very simple example. In the example we will have an activation sequence of length 28 | one and also a target output sequence of length one. To specify the activation sequence then, 29 | we have to write down the components of a single five dimensional vector. 30 | We are going to use `<0,0,0,0,0>` as the single vector in the activation sequence 31 | and so the resulting probabilities will be `0.2,0.2,0.2,0.2,0.2`. 32 | 33 | For the targets, we are going to have a single label `a`. 34 | 35 | Firstly, how do we present the data to the algorithm? As usual in Torch, the activations are 36 | put into rows in a 2 dimensional Tensor. The target labels are put into a lua table of tables 37 | with one table for each sequence of target labels. We only have one sequence (of one label) 38 | and so the table is `{{1}}` as the label `a` has index 1 (the index 0 is reserved for the blank symbol). 39 | Since we are allowing the possibility of inputting different length activation sequences, we have to specify 40 | the length of our input activation sequence, which in this case is 1 with a lua table `{1}`. 41 | 42 | To calculate the value of the CTC loss for the above problem just observe that with a one element input 43 | sequence and a single output label, there is only one possible alignment and so the symbol 44 | must be emitted at the first time step. The probability of emitting the symbol is `0.2`. The algorithm 45 | returns the negative log likelihood which is `-ln(0.2)=1.6094`. 46 | 47 | Now we want to use the code to do the calculation. Start with a Torch session and require the libraries. 48 | 49 | If you have GPU support 50 | 51 | ``` 52 | th>require 'cutorch' 53 | ``` 54 | 55 | for CPU only 56 | 57 | ``` 58 | th>require 'warp_ctc' 59 | ``` 60 | 61 | We need to put the activations in rows - so note the double braces. 62 | 63 | ``` 64 | th>acts = torch.Tensor({{0,0,0,0,0}}):cuda() 65 | ``` 66 | 67 | If an empty grad Tensor is passed, the gradient calculation will not be done. 
68 | 69 | ``` 70 | th>grads = torch.Tensor():cuda() 71 | ``` 72 | 73 | For the target labels and sizes of the input sequence, 74 | 75 | ``` 76 | th>labels = {{1}} 77 | th>sizes ={1} 78 | ``` 79 | 80 | If you have CUDA support, use `gpu_ctc` otherwise use `cpu_ctc` 81 | 82 | ``` 83 | th> gpu_ctc(acts, grads, labels, sizes) 84 | 85 | { 86 | 1 : 1.6094379425049 87 | } 88 | ``` 89 | 90 | The function returns a lua table of the CTC loss for each set of sequences. 91 | 92 | Now for a slightly more interesting example. Suppose we have an input sequence of 93 | length three, with activations 94 | 95 | `<1,2,3,4,5>`,`<6,7,8,9,10>` and `<11,12,13,14,15>`. 96 | 97 | The corresponding probabilities for the frames are then 98 | 99 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364` 100 | 101 | (the probabilties are the same for each frame in this special case). 102 | 103 | For target symbols we will use the sequence `c,c`. 104 | 105 | ``` 106 | th>acts = torch.Tensor({{1,2,3,4,5},{6,7,8,9,10},{11,12,13,14,15}}):cuda() 107 | th>labels = {{3,3}} 108 | th>sizes = {3} 109 | ``` 110 | CTC calculates the probability of all the possible alignments. Note that the targets 111 | contain the repeated symbol `c`. CTC cannot emit a repeated symbol on consecutive timesteps 112 | (for more details consult http://www.cs.toronto.edu/~graves/icml_2006.pdf) it must separate 113 | the repeated symbol with a blank and so the only possible aligned sequence is 114 | 115 | `c c`. 116 | 117 | CTC assumes the label probabilities are conditionally independent given the data and so 118 | we expect the answer to be `Pr(c at frame 1)*Pr( at frame 2)*Pr(c at frame 3) = 0.2341*0.0117*0.2341` 119 | and `-ln(0.2341*0.0117*0.2341) = 7.3522`. 120 | 121 | ``` 122 | th> gpu_ctc(acts, grads, labels, sizes) 123 | 124 | { 125 | 1 : 7.355742931366 126 | } 127 | ``` 128 | 129 | The small numerical difference is from doing one of the calculations by hand. 130 | 131 | Suppose the target sequence is `b,c` and the activations are 132 | 133 | `<-5,-4,-3,-2,-1>`,`<-10,-9,-8,-7,-6>` and `<-15,-14,-13,-12,-11>`. 134 | 135 | The corresponding probabilities for the frames are then again 136 | 137 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364`. 138 | 139 | Now there are five possible alignments as repeated symbols 140 | are collapsed and blanks are removed: 141 | ` b c`, `b c`, `b c `, `b b c` and `b c c`. 142 | 143 | The result should be 144 | `-ln(3*0.0117*0.0861*0.2341 + 0.0861*0.0861*0.2341 + 0.0861*0.2341*0.2341) = 4.9390` 145 | 146 | ``` 147 | th>acts = torch.Tensor({{-5,-4,-3,-2,-1},{-10,-9,-8,-7,-6},{-15,-14,-13,-12,-11}}):cuda() 148 | th>labels = {{2,3}} 149 | th>sizes = {3} 150 | th>gpu_ctc(acts, grads, labels, sizes) 151 | 152 | { 153 | 1 : 4.938850402832 154 | } 155 | ``` 156 | 157 | So we have three examples. The final example shows how to do all three at once is the case 158 | where we want to put minibatches through the algorithm. The labels are now `{{1}, {3,3}, {2,3}}` 159 | and the lengths of the input sequences are `{1,3,3}`. We have to put all of the input sequences in 160 | a single two dimensional matrix. This is done by interleaving the input sequence elements so that the 161 | input matrix will look like this. 
For clarity we start with the first two input sequences 162 | 163 | 164 | | entries | col1 | col2 | col3 | col4 | col5 | 165 | |---------|------|------|------|------|------| 166 | |seq1 item 1|0|0|0|0|0| 167 | |seq2 item 1|1|2|3|4|5| 168 | |seq1 item 2|P|P|P|P|P| 169 | |seq2 item 2|6|7|8|9|10| 170 | |seq1 item 3|P|P|P|P|P| 171 | |seq2 item 3|11|12|13|14|15| 172 | 173 | Since the first sequence has no second or third elements, we pad the matrix with zeros (which appear as 174 | `P` in the above table). Now we put the third sequence in 175 | 176 | | entries | col1 | col2 | col3 | col4 | col5 | 177 | |---------|------|------|------|------|------| 178 | |seq1 item 1|0|0|0|0|0| 179 | |seq2 item 1|1|2|3|4|5| 180 | |seq3 item 1|-5|-4|-3|-2|-1| 181 | |seq1 item 2|P|P|P|P|P| 182 | |seq2 item 2|6|7|8|9|10| 183 | |seq3 item 2|-10|-9|-8|7|-6| 184 | |seq1 item 3|P|P|P|P|P| 185 | |seq2 item 3|11|12|13|14|15| 186 | |seq3 item 3|-15|-14|-13|-12|-11| 187 | 188 | 189 | The complete example in Torch is 190 | 191 | ``` 192 | th>acts = torch.Tensor({{0,0,0,0,0},{1,2,3,4,5},{-5,-4,-3,-2,-1}, 193 | {0,0,0,0,0},{6,7,8,9,10},{-10,-9,-8,-7,-6}, 194 | {0,0,0,0,0},{11,12,13,14,15},{-15,-14,-13,-12,-11}}):cuda() 195 | th>labels = {{1}, {3,3}, {2,3}} 196 | th>sizes = {1,3,3} 197 | th>gpu_ctc(acts, grads, labels, sizes) 198 | 199 | { 200 | 1 : 1.6094379425049 201 | 2 : 7.355742931366 202 | 3 : 4.938850402832 203 | } 204 | ``` 205 | 206 | In order to obtain gradients wrt the incoming activations simply pass a 207 | tensor of the same size as the activations tensor. Also see 208 | `torch_binding/tests/test.lua` for more examples. 209 | -------------------------------------------------------------------------------- /torch_binding/TUTORIAL.zh_cn.md: -------------------------------------------------------------------------------- 1 | ## Torch教程 2 | 3 | 为了确保您成功将‘Warp-CTC’和Torch绑定,请在warp-ctc根目录中运行“luarocks make torch_binding/rocks/warp-ctc-scm-1.rockspec”。 4 | 5 | 现在,您可以非常容易的通过torch_binding来测试CTC。 6 | 7 | 假如你的编译没有GPU支持,请用`torch.Tensor(...):float()`替代`torch.Tensor(...):cuda()`,用`cpu_ctc`取代`gpu_ctc` 8 | 9 | 10 | CTC是一种计算输入序列与目标输出序列之间相似程度的目标函数。由于CTC普遍运用于神经网络,我们称输入序列为激活序列。目标输出序列是从一个固定的字母表中得出的。 11 | 为了在此讨论, 我们选择了四个字母```a,b,c,d```. 算法需要一个`<空白>`符号区别于字母。这就意味着激活序列会是一个五维向量序列(字母的数量加<空白>)。这个向量(通过softmax函数)将会被转化成字母以及<空白>上的概率分布。 12 | 13 | 比如CTC可以用来衡量一个长度为7的激活序列和标签 ```dacba``` 之间的误差。 14 | 15 | 一个长度为7的激活序列就会是(向量的组成部分是任意的) 16 | 17 | ```{<2,0,0,0,0>, <1,3,0,0,0>, <1,4,1,0,0>, <1,1,5,6,0>, <1,1,1,1,1>, <1,1,7,1,1>, <9,1,1,1,1>}``` 18 | 19 | 得到的有效输出序列即`daceba`. 20 | 21 | 一开始我们会举一个非常简单的例子。在这个例子中我们会用一个长度为1的激活序列,以及一个长度为1的目标输出序列。 22 | 为了指定这个激活序列,我们必须写下每一个五维向量的组成部分。我们使用`<0,0,0,0,0>`作为激活序列的单一向量,得到的概率分布及`0.2,0.2,0.2,0.2,0.2`. 23 | 对于目标输出,我们会用一个单一标签`a`. 24 | 25 | 首先,我们如何将数据展现给算法? 像平时使用Torch一样,激活表示要在一个2维张量中放入行。目标标签需要放入lua table, 每个目标标签序列都有一个对应的表。 26 | 我们每一个标签仅有一个序列,因此当标签`a`有指数1时,表即`{{1}}` (指数0预留给空白符号)。因为我们允许输入不同长度的激活序列的可能性,我们需要指定 27 | 输入激活序列的长度,在这个例子即包涵一个lua table`{1}`的1. 28 | 29 | 为了计算以上问题(单一元素输入序列,单一输出标签)的CTC损失函数的价值, 只有一种可能的对齐方式,所以符号必须在第一个时间步(time step)发出。 30 | 发出符号的概率为`0.2`。 算法返回的负对数似然值为`-ln(0.2)=1.6094`. 
31 | 32 | 现在让我们通过代码来做计算。先从Torch部分开始,需要代码库。 33 | 34 | 35 | 假如你有GPU的支持 36 | 37 | ``` 38 | th>require 'cutorch' 39 | ``` 40 | 41 | 如果仅有CPU 42 | 43 | ``` 44 | th>require 'warp_ctc' 45 | ``` 46 | 47 | 请将激活输入行-- 注意用两个大括号 48 | 49 | ``` 50 | th>acts = torch.Tensor({{0,0,0,0,0}}):cuda() 51 | ``` 52 | 53 | 假如输入为空,梯度计算则不能完成。 54 | 55 | ``` 56 | th>grads = torch.Tensor():cuda() 57 | ``` 58 | 59 | 对于目标标签以及输入序列的大小 60 | ``` 61 | th>labels = {{1}} 62 | th>sizes ={1} 63 | ``` 64 | 65 | 如果你有CUDA支持,请使用`gpu_ctc` ,否则请使用`cpu_ctc` 66 | 67 | ``` 68 | th> gpu_ctc(acts, grads, labels, sizes) 69 | 70 | { 71 | 1 : 1.6094379425049 72 | } 73 | ``` 74 | 75 | 对每一组序列,函数会返回CTC损失的一个lua table. 76 | 77 | 78 | 现在,我们来看一个更有意思的例子。假如我们有一个长度为3的输入序列,激活后: 79 | 80 | `<1,2,3,4,5>`,`<6,7,8,9,10>` and `<11,12,13,14,15>`. 81 | 82 | 对应这些帧的概率则为 83 | 84 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364` 85 | 86 | (在这个特殊例子中,每一帧的概率都一样) 87 | 对于目标符号,我们将使用序列`c,c`. 88 | 89 | ``` 90 | th>acts = torch.Tensor({{1,2,3,4,5},{6,7,8,9,10},{11,12,13,14,15}}):cuda() 91 | th>labels = {{3,3}} 92 | th>sizes = {3} 93 | ``` 94 | CTC计算了所有可能对齐的概率。请注意目标包涵了重复的符号`c`.CTC不能在连续的时间步上发出重复的符号(更多细节,[请见](http://www.cs.toronto.edu/~graves/icml_2006.pdf))。对于重复的符号必须用一个空白分开,所以唯一可能的对齐序列为`c <空白> c`. 95 | 96 | CTC假设,在给定数据的情况下,标签概率是有条件独立的,所以我们期待的答案即`Pr(c at frame 1)*Pr(<空白> at frame 2)*Pr(c at frame 3) = 0.2341*0.0117*0.2341` 97 | and `-ln(0.2341*0.0117*0.2341) = 7.3522`. 98 | ``` 99 | th> gpu_ctc(acts, grads, labels, sizes) 100 | 101 | { 102 | 1 : 7.355742931366 103 | } 104 | ``` 105 | 106 | 小的数值差由于其中一个计算人工完成。 107 | 108 | 假设目标序列为`b,c`,激活序列则为 109 | 110 | `<-5,-4,-3,-2,-1>`,`<-10,-9,-8,-7,-6>` and `<-15,-14,-13,-12,-11>`. 111 | 112 | 对应这些帧的概率则为 113 | 114 | `0.0117, 0.0317, 0.0861, 0.2341, 0.6364`. 115 | 116 | 117 | 由于重复的符号被清空,空白被取消,现在有五种可能的对齐 118 | `<空白> b c`, `b <空白> c`, `b c <空白>`, `b b c` and `b c c`. 119 | 120 | 结果应当是 121 | `-ln(3*0.0117*0.0861*0.2341 + 0.0861*0.0861*0.2341 + 0.0861*0.2341*0.2341) = 4.9390` 122 | ``` 123 | th>acts = torch.Tensor({{-5,-4,-3,-2,-1},{-10,-9,-8,-7,-6},{-15,-14,-13,-12,-11}}):cuda() 124 | th>labels = {{2,3}} 125 | th>sizes = {3} 126 | th>gpu_ctc(acts, grads, labels, sizes) 127 | 128 | { 129 | 1 : 4.938850402832 130 | } 131 | ``` 132 | 133 | 因此,我们有三个例子。最后一个例子显示如果通过算法将3个例子做迷你批处理 (minibatch). 标签现在是`{{1}, {3,3}, {2,3}}`,输入序列的长度是`{1,3,3}`. 
134 | 我们必须将输入序列放入一个单独的两维矩阵。通过交织输入序列的元素,我们的输入矩阵如下: 135 | 为了清楚起见,我们从前两个输入序列开始 136 | 137 | | entries | col1 | col2 | col3 | col4 | col5 | 138 | |---------|------|------|------|------|------| 139 | |seq1 item 1|0|0|0|0|0| 140 | |seq2 item 1|1|2|3|4|5| 141 | |seq1 item 2|P|P|P|P|P| 142 | |seq2 item 2|6|7|8|9|10| 143 | |seq1 item 3|P|P|P|P|P| 144 | |seq2 item 3|11|12|13|14|15| 145 | 146 | 由于第一个序列没有第二个或第三个元素,我们用0填入矩阵(在上面一个表格中显示为`P`)。 现在我们将第三个序列放入表格中 147 | 148 | | entries | col1 | col2 | col3 | col4 | col5 | 149 | |---------|------|------|------|------|------| 150 | |seq1 item 1|0|0|0|0|0| 151 | |seq2 item 1|1|2|3|4|5| 152 | |seq3 item 1|-5|-4|-3|-2|-1| 153 | |seq1 item 2|P|P|P|P|P| 154 | |seq2 item 2|6|7|8|9|10| 155 | |seq3 item 2|-10|-9|-8|7|-6| 156 | |seq1 item 3|P|P|P|P|P| 157 | |seq2 item 3|11|12|13|14|15| 158 | |seq3 item 3|-15|-14|-13|-12|-11| 159 | 160 | 161 | 在Torch中完整的例子如下 162 | ``` 163 | th>acts = torch.Tensor({{0,0,0,0,0},{1,2,3,4,5},{-5,-4,-3,-2,-1}, 164 | {0,0,0,0,0},{6,7,8,9,10},{-10,-9,-8,-7,-6}, 165 | {0,0,0,0,0},{11,12,13,14,15},{-15,-14,-13,-12,-11}}):cuda() 166 | th>labels = {{1}, {3,3}, {2,3}} 167 | th>sizes = {1,3,3} 168 | th>gpu_ctc(acts, grads, labels, sizes) 169 | 170 | { 171 | 1 : 1.6094379425049 172 | 2 : 7.355742931366 173 | 3 : 4.938850402832 174 | } 175 | ``` 176 | 177 | 为了获取接下来激活的梯度,传递和激活张量同样大小的张量即可。 178 | 如果想看更多例子,请见`torch_binding/tests/test.lua`。 179 | -------------------------------------------------------------------------------- /torch_binding/binding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "utils.h" 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "ctc.h" 12 | 13 | #ifdef TORCH_NOGPU 14 | #include "TH.h" 15 | #else 16 | #include "THC.h" 17 | #include "THCTensor.h" 18 | #include "detail/reduce.h" 19 | #endif 20 | 21 | int processTargets(lua_State* L, int** sizes, int** labels, int** label_sizes) { 22 | // sizes table is 4 item on stack 23 | // labels is 3 item on stack 24 | 25 | if (!lua_istable(L, 4)) { 26 | lua_pushfstring(L, "invalid argument 4 for sequence lengths (expected table, got %s)", 27 | luaL_typename(L, -1)); 28 | lua_error(L); 29 | } 30 | 31 | int minibatch_size = lua_objlen(L, 4); 32 | 33 | *sizes = new int[minibatch_size]; 34 | 35 | for(int i = 0; i < minibatch_size; i++) { 36 | lua_pushinteger(L, i+1); 37 | lua_gettable(L, 4); 38 | if(lua_isnumber(L, -1)) { 39 | (*sizes)[i] = (int) lua_tonumber(L, -1); 40 | } else { 41 | lua_pushfstring(L, "invalid entry #%d in array sizes (expected number, got %s)", 42 | i, luaL_typename(L, -1)); 43 | lua_error(L); 44 | } 45 | lua_pop(L, 1); 46 | } 47 | 48 | if (!lua_istable(L, 3)) { 49 | lua_pushfstring(L, "invalid argument 3 for sequence labels (expected table, got %s)", 50 | luaL_typename(L, -1)); 51 | lua_error(L); 52 | } 53 | 54 | int number_of_target_seq = lua_objlen(L, 3); 55 | 56 | if (number_of_target_seq != minibatch_size) { 57 | lua_pushfstring(L, "The minibatch size %d and the number of target sequences %d must be the same", 58 | minibatch_size, number_of_target_seq); 59 | lua_error(L); 60 | } 61 | 62 | std::vector labels_vec; 63 | *label_sizes = new int[minibatch_size]; 64 | 65 | for(int i = 0; i < minibatch_size; i++) { 66 | lua_pushinteger(L, i+1); 67 | lua_gettable(L, 3); 68 | 69 | if(lua_istable(L, -1)) { 70 | 71 | int current_label_length = (int) lua_objlen(L, -1); 72 | (*label_sizes)[i] = current_label_length; 73 | 74 | for (int ix = 0; ix < current_label_length; ix++) { 75 | 
lua_pushinteger(L, ix + 1); 76 | lua_gettable(L, -2); 77 | if(lua_isnumber(L, -1)) { 78 | labels_vec.push_back((int) lua_tonumber(L, -1)); 79 | } else { 80 | lua_pushfstring(L, "invalid entry #%d in array labels (expected number, got %s)", 81 | ix + 1, luaL_typename(L, -1)); 82 | lua_error(L); 83 | } 84 | 85 | lua_pop(L, 1); 86 | } 87 | 88 | } else { 89 | lua_pushfstring(L, "invalid entry #%d in table labels (expected table, got %s)", 90 | i + 1, luaL_typename(L, -1)); 91 | lua_error(L); 92 | } 93 | 94 | lua_pop(L, 1); 95 | 96 | } 97 | 98 | *labels = new int[labels_vec.size()]; 99 | 100 | std::copy(labels_vec.begin(), labels_vec.end(), *labels); 101 | 102 | return minibatch_size; 103 | } 104 | 105 | extern "C" int gpu_ctc(lua_State* L) { 106 | #ifdef TORCH_NOGPU 107 | std::cout << "Compiled without CUDA support." << std::endl; 108 | lua_newtable(L); 109 | 110 | 111 | lua_pushnumber(L, -999999.0); 112 | lua_rawseti(L, -2, 1); 113 | 114 | #else 115 | THCudaTensor *probs = 116 | static_cast(luaT_checkudata(L, 1, "torch.CudaTensor")); 117 | THCudaTensor *grads = 118 | static_cast(luaT_checkudata(L, 2, "torch.CudaTensor")); 119 | 120 | int* sizes; 121 | int* labels_ptr; 122 | int* label_sizes_ptr; 123 | 124 | int minibatch_size = processTargets(L, &sizes, &labels_ptr, &label_sizes_ptr); 125 | 126 | float *probs_ptr; 127 | 128 | if (probs->storage) { 129 | probs_ptr = probs->storage->data + probs->storageOffset; 130 | } else { 131 | lua_pushfstring(L, "probs cannot be an empty tensor"); 132 | lua_error(L); 133 | } 134 | 135 | float *grads_ptr; 136 | 137 | if (grads->storage) { 138 | grads_ptr = grads->storage->data + grads->storageOffset;; 139 | } else { 140 | grads_ptr = NULL; // this will trigger the score forward code path 141 | } 142 | 143 | float* costs = new float[minibatch_size]; 144 | 145 | ctcOptions options; 146 | memset(&options, 0, sizeof(options)); 147 | options.loc = CTC_GPU; 148 | options.stream = THCState_getCurrentStream(cutorch_getstate(L)); 149 | 150 | size_t gpu_size_bytes; 151 | get_workspace_size(label_sizes_ptr, sizes, 152 | (int) probs->size[1], minibatch_size, 153 | options, &gpu_size_bytes); 154 | 155 | float* gpu_workspace; 156 | THCudaMalloc(cutorch_getstate(L), (void **) &gpu_workspace, gpu_size_bytes); 157 | 158 | compute_ctc_loss(probs_ptr, grads_ptr, 159 | labels_ptr, label_sizes_ptr, 160 | sizes, (int) probs->size[1], 161 | minibatch_size, costs, 162 | gpu_workspace, options); 163 | 164 | lua_newtable(L); 165 | 166 | for (int ix = 0; ix < minibatch_size; ix++) { 167 | lua_pushnumber(L, costs[ix]); 168 | lua_rawseti(L, -2, ix+1); 169 | } 170 | 171 | THCudaFree(cutorch_getstate(L), (void *) gpu_workspace); 172 | 173 | delete sizes; 174 | delete labels_ptr; 175 | delete label_sizes_ptr; 176 | delete costs; 177 | #endif 178 | return 1; 179 | } 180 | 181 | extern "C" int cpu_ctc(lua_State* L) { 182 | 183 | THFloatTensor *probs = 184 | static_cast(luaT_checkudata(L, 1, "torch.FloatTensor")); 185 | THFloatTensor *grads = 186 | static_cast(luaT_checkudata(L, 2, "torch.FloatTensor")); 187 | 188 | int* sizes; 189 | int* labels_ptr; 190 | int* label_sizes_ptr; 191 | 192 | int minibatch_size = processTargets(L, &sizes, &labels_ptr, &label_sizes_ptr); 193 | float *probs_ptr; 194 | 195 | if (probs->storage) { 196 | probs_ptr = probs->storage->data + probs->storageOffset; 197 | } else { 198 | lua_pushfstring(L, "probs cannot be an empty tensor"); 199 | lua_error(L); 200 | } 201 | 202 | float *grads_ptr; 203 | 204 | if (grads->storage) { 205 | grads_ptr = grads->storage->data + 
grads->storageOffset;; 206 | } else { 207 | grads_ptr = NULL; // this will trigger the score forward code path 208 | } 209 | 210 | float* costs = new float[minibatch_size]; 211 | 212 | ctcOptions options; 213 | memset(&options, 0, sizeof(options)); 214 | options.loc = CTC_CPU; 215 | options.num_threads = 0; // will use default number of threads 216 | 217 | #if defined(CTC_DISABLE_OMP) || defined(APPLE) 218 | // have to use at least one 219 | options.num_threads = std::max(options.num_threads, (unsigned int) 1); 220 | #endif 221 | 222 | size_t cpu_size_bytes; 223 | get_workspace_size(label_sizes_ptr, sizes, 224 | (int) probs->size[1], minibatch_size, 225 | options, &cpu_size_bytes); 226 | 227 | float* cpu_workspace = (float*) new unsigned char[cpu_size_bytes]; 228 | 229 | compute_ctc_loss(probs_ptr, grads_ptr, 230 | labels_ptr, label_sizes_ptr, 231 | sizes, probs->size[1], 232 | minibatch_size, costs, 233 | cpu_workspace, options); 234 | 235 | lua_newtable(L); 236 | 237 | for (int ix = 0; ix < minibatch_size; ix++) { 238 | lua_pushnumber(L, costs[ix]); 239 | lua_rawseti(L, -2, ix+1); 240 | } 241 | 242 | delete cpu_workspace; 243 | delete sizes; 244 | delete labels_ptr; 245 | delete label_sizes_ptr; 246 | delete costs; 247 | 248 | return 1; 249 | } 250 | 251 | extern "C" int luaopen_libwarp_ctc(lua_State *L) { 252 | lua_register(L, "gpu_ctc", gpu_ctc); 253 | lua_register(L, "cpu_ctc", cpu_ctc); 254 | 255 | return 0; 256 | } 257 | -------------------------------------------------------------------------------- /torch_binding/init.lua: -------------------------------------------------------------------------------- 1 | require 'libwarp_ctc' -------------------------------------------------------------------------------- /torch_binding/rocks/warp-ctc-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "warp-ctc" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/baidu-research/warp-ctc.git", 6 | } 7 | 8 | description = { 9 | summary = "Baidu CTC Implementation", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/baidu-research/warp-ctc", 13 | license = "Apache" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | } 19 | 20 | build = { 21 | type = "command", 22 | build_command = [[ 23 | cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) && make install 24 | ]], 25 | platforms = {}, 26 | install_command = "cd build" 27 | } 28 | -------------------------------------------------------------------------------- /torch_binding/tests/data/chars.txt: -------------------------------------------------------------------------------- 1 | ' 1 2 | 2 3 | a 3 4 | b 4 5 | c 5 6 | d 6 7 | e 7 8 | f 8 9 | g 9 10 | h 10 11 | i 11 12 | j 12 13 | k 13 14 | l 14 15 | m 15 16 | n 16 17 | o 17 18 | p 18 19 | q 19 20 | r 20 21 | s 21 22 | t 22 23 | u 23 24 | v 24 25 | w 25 26 | x 26 27 | y 27 28 | z 28 29 | -------------------------------------------------------------------------------- /torch_binding/tests/data/sizes.txt: -------------------------------------------------------------------------------- 1 | 433 2 | 434 3 | 434 4 | 434 5 | 434 6 | 434 7 | 434 8 | 434 9 | 434 10 | 434 11 | 434 12 | 434 13 | 434 14 | 434 15 | 434 16 | 434 17 | 435 18 | 435 19 | 435 20 | 435 21 | 435 22 | 435 23 | 435 24 | 435 25 | 436 26 | 437 27 | 438 28 | 438 29 | 438 30 | 439 31 | 439 32 | 440 33 | 433 34 | 434 35 | 434 36 | 434 37 | 434 38 | 434 39 | 434 40 | 434 41 | 434 42 | 434 43 | 434 44 | 434 45 | 434 46 | 434 47 | 434 48 | 434 49 | 435 50 | 435 51 | 435 52 | 435 53 | 435 54 | 435 55 | 435 56 | 435 57 | 436 58 | 437 59 | 438 60 | 438 61 | 438 62 | 439 63 | 439 64 | 440 65 | 433 66 | 434 67 | 434 68 | 434 69 | 434 70 | 434 71 | 434 72 | 434 73 | 434 74 | 434 75 | 434 76 | 434 77 | 434 78 | 434 79 | 434 80 | 434 81 | 435 82 | 435 83 | 435 84 | 435 85 | 435 86 | 435 87 | 435 88 | 435 89 | 436 90 | 437 91 | 438 92 | 438 93 | 438 94 | 439 95 | 439 96 | 440 97 | 433 98 | 434 99 | 434 100 | 434 101 | 434 102 | 434 103 | 434 104 | 434 105 | 434 106 | 434 107 | 434 108 | 434 109 | 434 110 | 434 111 | 434 112 | 434 113 | 435 114 | 435 115 | 435 116 | 435 117 | 435 118 | 435 119 | 435 120 | 435 121 | 436 122 | 437 123 | 438 124 | 438 125 | 438 126 | 439 127 | 439 128 | 440 129 | 433 130 | 434 131 | 434 132 | 434 133 | 434 134 | 434 135 | 434 136 | 434 137 | 434 138 | 434 139 | 434 140 | 434 141 | 434 142 | 434 143 | 434 144 | 434 145 | 435 146 | 435 147 | 435 148 | 435 149 | 435 150 | 435 151 | 435 152 | 435 153 | 436 154 | 437 155 | 438 156 | 438 157 | 438 158 | 439 159 | 439 160 | 440 161 | 433 162 | 434 163 | 434 164 | 434 165 | 434 166 | 434 167 | 434 168 | 434 169 | 434 170 | 434 171 | 434 172 | 434 173 | 434 174 | 434 175 | 434 176 | 434 177 | 435 178 | 435 179 | 435 180 | 435 181 | 435 182 | 435 183 | 435 184 | 435 185 | 436 186 | 437 187 | 438 188 | 438 189 | 438 190 | 439 191 | 439 192 | 440 193 | -------------------------------------------------------------------------------- /torch_binding/tests/test.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'cutorch' 3 | require 'warp_ctc' 4 | 5 | function os.capture(cmd, raw) 6 | local f = assert(io.popen(cmd, 'r')) 7 | local s = assert(f:read('*a')) 8 | f:close() 9 | if raw then return s end 10 | s = string.gsub(s, '^%s+', '') 11 | s = string.gsub(s, '%s+$', '') 12 | s = string.gsub(s, '[\n\r]+', ' ') 13 | return s 14 | end 15 | 16 | function reduce(list) 17 | local acc 18 | for k, v in ipairs(list) do 19 | if 1 == k then 20 | acc = v 21 | else 22 | acc = acc + v 23 | end 24 | end 25 | return acc 26 | end 27 | 28 | function simpleTest() 29 | local cpu_acts = torch.Tensor({{0.1, 0.6, 0.1, 0.1,0.1},{0.1, 0.1, 0.6, 0.1, 0.1}}):float() 30 | local cpu_probs = 
nn.SoftMax():updateOutput(cpu_acts:double()):float() 31 | local cpu_grads = cpu_probs:clone():zero() 32 | 33 | local labels = {{1,2}} 34 | --local label_lengths = torch.Tensor({2}):int() 35 | 36 | local sizes = {2} 37 | --print(cpu_probs, cpu_grads, labels, sizes) 38 | 39 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 40 | print(cpu_grads) 41 | 42 | local cpu_grads = torch.Tensor():float() 43 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 44 | 45 | local acts = cpu_acts:cuda() 46 | local grads = acts:clone():zero() 47 | 48 | --print(probs, grads, labels, label_lengths, sizes) 49 | 50 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 51 | print("GPU_cost:", cost) 52 | print(grads) 53 | 54 | local grads = torch.Tensor():cuda() 55 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 56 | 57 | end 58 | 59 | function mediumTest(multiplier) 60 | local cpu_acts = torch.Tensor({{0.1, 0.6, 0.1, 0.1,0.1},{0.1, 0.1, 0.6, 0.1, 0.1}, 61 | {0.6, 0.1, 0.1, 0.1,0.1},{0.1, 0.1, 0.5, 0.2, 0.1}}):float()*multiplier 62 | local cpu_grads = cpu_acts:clone():zero() 63 | 64 | local labels = {{1,2},{1,2}} 65 | local sizes = {2, 2 } 66 | 67 | --print(cpu_probs, cpu_grads, labels, sizes) 68 | 69 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 70 | print(cpu_grads) 71 | 72 | local cpu_grads = torch.Tensor():float() 73 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 74 | 75 | local acts = cpu_acts:cuda() 76 | local grads = acts:clone():zero() 77 | 78 | --print(probs, grads, labels, sizes) 79 | 80 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 81 | print("GPU_cost:", cost) 82 | print(grads) 83 | 84 | local grads = torch.Tensor():cuda() 85 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 86 | 87 | end 88 | 89 | function emptyLabelTest() 90 | local cpu_acts = torch.Tensor({{0.1, 0.6, 0.1, 0.1,0.1},{0.1, 0.1, 0.6, 0.1, 0.1}, 91 | {0.6, 0.1, 0.1, 0.1,0.1},{0.1, 0.1, 0.5, 0.2, 0.1}}):float() 92 | local cpu_grads = cpu_acts:clone():zero() 93 | 94 | local labels = {{1,2},{}} 95 | local sizes = {2, 2 } 96 | 97 | --print(cpu_probs, cpu_grads, labels, sizes) 98 | 99 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 100 | print(cpu_grads) 101 | 102 | local cpu_grads = torch.Tensor():float() 103 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 104 | 105 | local acts = cpu_acts:cuda() 106 | local grads = acts:clone():zero() 107 | 108 | --print(probs, grads, labels, sizes) 109 | 110 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 111 | print("GPU_cost:", cost) 112 | print(grads) 113 | 114 | local grads = torch.Tensor():cuda() 115 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 116 | 117 | end 118 | 119 | function getTargets() 120 | local outdim = 29 --TODO count chars.txt 121 | local file = io.open("data/sizes.txt", "r"); 122 | 123 | if not file then 124 | print("File not found data/sizes.txt are you runnng the test from the tests dir?") 125 | end 126 | 127 | local sizes = {} 128 | for line in file:lines() do 129 | table.insert (sizes, tonumber(line)); 130 | end 131 | 132 | 133 | local label_file = io.open("data/labels.txt", "r"); 134 | local labels = {} 135 | 136 | for line in label_file:lines() do 137 | local current_labels = {} 138 | for w in line:gmatch("%S+") do 139 | table.insert (current_labels, tonumber(w)); 140 
| end 141 | table.insert (labels, current_labels); 142 | end 143 | 144 | return outdim, sizes, labels 145 | end 146 | 147 | 148 | function bigTest(minibatch_size) 149 | 150 | detected_OS = os.capture('uname', false) 151 | 152 | local outdim, raw_sizes, raw_labels = getTargets() 153 | 154 | -- truncate tables to given minibatch_size 155 | local sizes = {} 156 | local labels = {} 157 | 158 | local max_length = 0 159 | 160 | for idx = 1,minibatch_size do 161 | if raw_sizes[idx] > max_length then 162 | max_length = raw_sizes[idx] 163 | end 164 | 165 | table.insert(sizes, raw_sizes[idx]) 166 | table.insert(labels, raw_labels[idx]) 167 | end 168 | 169 | local minibatch_size = table.getn(sizes) 170 | 171 | print("Using minibatch size: ", #sizes) 172 | print("Using outdim size: ", outdim) 173 | print("Max size: ", max_length) 174 | 175 | torch.manualSeed(123) 176 | 177 | local cpu_acts = torch.rand(minibatch_size*max_length, outdim):float() 178 | local cpu_grads = cpu_acts:clone():fill(0) 179 | 180 | print("CPU_cost:", reduce(cpu_ctc(cpu_acts, cpu_grads, labels, sizes))) 181 | 182 | if detected_OS == "Darwin" then 183 | if cpu_grads:ne(cpu_grads):sum() > 0 then 184 | print(sys.COLORS.red .. ' cpu_grads after update has NaN/s') 185 | else 186 | print('cpu_grads do not have nans') 187 | end 188 | end 189 | 190 | local cpu_null_grads = torch.Tensor():float() 191 | print("CPU_cost: score forward", reduce(cpu_ctc(cpu_acts, cpu_null_grads, labels, sizes))) 192 | 193 | local acts = cpu_acts:cuda() 194 | local grads = acts:clone():zero() 195 | 196 | --print(probs, grads, labels, sizes) 197 | 198 | local cost = reduce(gpu_ctc(acts, grads, labels, sizes)) 199 | print("GPU_cost:", cost) 200 | 201 | if detected_OS == "Darwin" then 202 | if grads:ne(grads):sum() > 0 then 203 | print(sys.COLORS.red .. 
' gpu_grads after update has NaN/s') 204 | else 205 | print('gpu_grads do not have nans') 206 | end 207 | 208 | print("L2 norm grad diff: ", torch.norm(cpu_grads - grads:float())) 209 | 210 | end 211 | 212 | local grads = torch.Tensor():cuda() 213 | print("GPU_cost: score forward", reduce(gpu_ctc(acts, grads, labels, sizes))) 214 | 215 | end 216 | 217 | simpleTest() 218 | mediumTest(1.0) 219 | print("Stability test") 220 | mediumTest(200.0) -- test SM stability if compiled with USE_NSM this will not have nans 221 | print("Empty label test") 222 | emptyLabelTest() 223 | bigTest(32) 224 | bigTest(64) 225 | bigTest(96) 226 | bigTest(111) -------------------------------------------------------------------------------- /torch_binding/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | THLongStorage* cutorch_checklongargs(lua_State *L, int index) { 4 | THLongStorage *storage; 5 | int i; 6 | int narg = lua_gettop(L)-index+1; 7 | 8 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) { 9 | THLongStorage *storagesrc = luaT_toudata(L, index, "torch.LongStorage"); 10 | storage = THLongStorage_newWithSize(storagesrc->size); 11 | THLongStorage_copy(storage, storagesrc); 12 | } else { 13 | storage = THLongStorage_newWithSize(narg); 14 | for(i = index; i < index+narg; i++) { 15 | if(!lua_isnumber(L, i)) { 16 | THLongStorage_free(storage); 17 | luaL_argerror(L, i, "number expected"); 18 | } 19 | THLongStorage_set(storage, i-index, lua_tonumber(L, i)); 20 | } 21 | } 22 | return storage; 23 | } 24 | 25 | int cutorch_islongargs(lua_State *L, int index) { 26 | int narg = lua_gettop(L)-index+1; 27 | 28 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) { 29 | return 1; 30 | } else { 31 | int i; 32 | 33 | for(i = index; i < index+narg; i++) { 34 | if(!lua_isnumber(L, i)) 35 | return 0; 36 | } 37 | return 1; 38 | } 39 | return 0; 40 | } 41 | 42 | struct THCState* cutorch_getstate(lua_State* L) { 43 | lua_getglobal(L, "cutorch"); 44 | lua_getfield(L, -1, "_state"); 45 | struct THCState *state = lua_touserdata(L, -1); 46 | lua_pop(L, 2); 47 | return state; 48 | } 49 | -------------------------------------------------------------------------------- /torch_binding/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef CUTORCH_UTILS_INC 2 | #define CUTORCH_UTILS_INC 3 | 4 | #include "luaT.h" 5 | #include "TH.h" 6 | 7 | #ifdef __cplusplus 8 | # define TORCH_EXTERNC extern "C" 9 | #else 10 | # define TORCH_EXTERNC extern 11 | #endif 12 | 13 | #ifdef __GNUC__ 14 | # define TORCH_UNUSED __attribute__((unused)) 15 | #else 16 | # define TORCH_UNUSED 17 | #endif 18 | 19 | #ifdef _WIN32 20 | # ifdef torch_EXPORTS 21 | # define TORCH_API TORCH_EXTERNC __declspec(dllexport) 22 | # else 23 | # define TORCH_API TORCH_EXTERNC __declspec(dllimport) 24 | # endif 25 | #else 26 | # define TORCH_API TORCH_EXTERNC 27 | #endif 28 | 29 | #if LUA_VERSION_NUM == 501 30 | /* 31 | ** Adapted from Lua 5.2.0 32 | */ 33 | TORCH_UNUSED static void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { 34 | luaL_checkstack(L, nup+1, "too many upvalues"); 35 | for (; l->name != NULL; l++) { /* fill the table with given functions */ 36 | int i; 37 | lua_pushstring(L, l->name); 38 | for (i = 0; i < nup; i++) /* copy upvalues to the top */ 39 | lua_pushvalue(L, -(nup+1)); 40 | lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ 41 | lua_settable(L, -(nup + 3)); 42 | } 43 | lua_pop(L, nup); /* 
remove upvalues */ 44 | } 45 | #endif 46 | 47 | 48 | TORCH_API THLongStorage* cutorch_checklongargs(lua_State *L, int index); 49 | TORCH_API int cutorch_islongargs(lua_State *L, int index); 50 | 51 | struct THCState; 52 | TORCH_API struct THCState* cutorch_getstate(lua_State* L); 53 | 54 | #endif --------------------------------------------------------------------------------