├── .circleci
    └── config.yml
├── .clang-format
├── .gitignore
├── CITATION
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── bindings
    └── python
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── compute_version.py
    │   ├── example
    │       └── criterion_example.py
    │   ├── flashlight
    │       └── lib
    │       │   └── sequence
    │       │       ├── __init__.py
    │       │       ├── _criterion.cpp
    │       │       ├── criterion.py
    │       │       └── criterion_torch.py
    │   ├── test
    │       └── test_import.py
    │   └── version.txt
├── cmake
    ├── BuildGoogleTest.cmake
    ├── Buildcub.cmake
    ├── Buildpybind11.cmake
    ├── FindFilesystem.cmake
    ├── FindGMock.cmake
    ├── InternalUtils.cmake
    ├── TestUtils.cmake
    └── flashlight-sequence-config.cmake.in
├── codecov.yml
├── flashlight
    └── lib
    │   └── sequence
    │       ├── CMakeLists.txt
    │       ├── Defines.h
    │       └── criterion
    │           ├── CMakeLists.txt
    │           ├── Defines.h
    │           ├── Workspace.h
    │           ├── cpu
    │               ├── ConnectionistTemporalClassificationCriterion.cpp
    │               ├── ConnectionistTemporalClassificationCriterion.h
    │               ├── CriterionUtils.cpp
    │               ├── CriterionUtils.h
    │               ├── ForceAlignmentCriterion.cpp
    │               ├── ForceAlignmentCriterion.h
    │               ├── FullConnectionCriterion.cpp
    │               ├── FullConnectionCriterion.h
    │               ├── ViterbiPath.cpp
    │               └── ViterbiPath.h
    │           └── cuda
    │               ├── CriterionUtils.cu
    │               ├── CriterionUtils.cuh
    │               ├── ForceAlignmentCriterion.cu
    │               ├── ForceAlignmentCriterion.cuh
    │               ├── FullConnectionCriterion.cu
    │               ├── FullConnectionCriterion.cuh
    │               ├── ViterbiPath.cu
    │               └── ViterbiPath.cuh
├── pyproject.toml
└── setup.py


/.circleci/config.yml:
--------------------------------------------------------------------------------
  1 | version: 2.1
  2 | 
  3 | macos_env: &macos_env
  4 |   macos:
  5 |     xcode: 13.4.1
  6 |   resource_class: large
  7 |   environment:
  8 |     HOMEBREW_NO_AUTO_UPDATE: "1"
  9 | 
 10 | gpu: &gpu
 11 |   machine:
 12 |     image: linux-cuda-11:2023.02.1
 13 |   resource_class: gpu.nvidia.medium
 14 | 
 15 | executors:
 16 |   windows_gpu:
 17 |     machine:
 18 |       image: windows-server-2019-nvidia:stable
 19 |     # TODO: install a newer CUDA version if relying on newer
 20 |     # C++ features not in nvcc 10.1
 21 |     resource_class: windows.gpu.nvidia.medium
 22 | 
 23 | orbs:
 24 |   win: circleci/windows@5.0.0
 25 | 
 26 | commands:
 27 |   install_ubuntu_build_dependencies:
 28 |     parameters:
 29 |       use_cuda:
 30 |         type: string
 31 |         default: "OFF"
 32 |     steps:
 33 |       - run:
 34 |           name: "Install Build Dependencies"
 35 |           command: |
 36 |             sudo apt -y update
 37 |             sudo apt -y install build-essential python3-dev python3-pip python3-venv cmake
 38 |   install_macos_build_dependencies:
 39 |     steps:
 40 |       - run:
 41 |           name: "Install Build Dependencies"
 42 |           command: |
 43 |             brew install cmake libomp googletest
 44 |   install_msvc_build_dependencies:
 45 |     steps:
 46 |       - run:
 47 |           name: "Install Build Dependencies"
 48 |           command: |
 49 |             choco install cmake python3 -y
 50 |       # windows needs a path modification
 51 |       - run:
 52 |           name: "Set PATH to find CMake"
 53 |           command: echo 'export PATH="$PATH:/c/Program Files/CMake/bin"' >> $BASH_ENV
 54 |   build_flashlight_sequence:
 55 |     parameters:
 56 |       use_openmp:
 57 |         type: string
 58 |         default: "ON"
 59 |       use_cuda:
 60 |         type: string
 61 |         default: "ON"
 62 |       build_shared_libs:
 63 |         type: string
 64 |         default: "OFF"
 65 |       build_code_coverage:
 66 |         type: string
 67 |         default: "OFF"
 68 |       platform:
 69 |         type: string
 70 |         default: "linux"
 71 |     steps:
 72 |       - run:
 73 |           name: "Build and Install Flashlight Sequence"
 74 |           command: |
 75 |             mkdir build && \
 76 |             cmake -S . -B build \
 77 |               -DBUILD_SHARED_LIBS=<< parameters.build_shared_libs >> \
 78 |               -DFL_SEQUENCE_USE_OPENMP=<< parameters.use_openmp >> \
 79 |               -DFL_SEQUENCE_USE_CUDA=<< parameters.use_cuda >> \
 80 |               -DFL_SEQUENCE_CODE_COVERAGE=<< parameters.build_code_coverage >>
 81 |             cmake --build build --parallel
 82 |       # only test install with non-Windows platforms (TODO: fix this)
 83 |       - when:
 84 |           condition:
 85 |             not:
 86 |               equal: ["windows", << parameters.platform >>]
 87 |           steps:
 88 |             - run:
 89 |                 name: "Run Install Step"
 90 |                 command: sudo cmake --install build
 91 |       # linux needs ldconfig
 92 |       - when:
 93 |           condition:
 94 |             equal: ["linux", << parameters.platform >>]
 95 |           steps:
 96 |             - run:
 97 |                 name: "Configure shared lib paths"
 98 |                 command: sudo ldconfig
 99 |   install_python_bindings:
100 |     parameters:
101 |       use_cuda:
102 |         type: string
103 |         default: "ON"
104 |     steps:
105 |       - run:
106 |           name: "Setup virtualenv"
107 |           command: |
108 |             python3 -m venv venv
109 |             source venv/bin/activate
110 |             pip install --upgrade pip
111 |             echo "source venv/bin/activate" >> $BASH_ENV
112 |       - run:
113 |           name: "Install Python Bindings"
114 |           command: |
115 |             pip install numpy
116 |             USE_CUDA=<< parameters.use_cuda >> pip install -v .
117 |   run_python_tests:
118 |     parameters:
119 |       use_cuda:
120 |         type: string
121 |         default: "ON"
122 |     steps:
123 |       - run:
124 |           name: "Run Python Binding Tests"
125 |           command: |
126 |             cd bindings/python/test
127 |             USE_CUDA=<< parameters.use_cuda >> python -m unittest discover -v .
128 |   test_with_external_project:
129 |     parameters:
130 |       build_shared_libs:
131 |         type: string
132 |         default: "OFF"
133 |     steps:
134 |       - run:
135 |           name: Set up dependent external project
136 |           command: |
137 |             mkdir -p test_project && cd test_project && \
138 |             echo -e "\
139 |               #include <flashlight/lib/sequence/criterion/cpu/ViterbiPath.h> \n
140 |               int main() {                                                   \n
141 |                 using ViterbiPath = fl::lib::cpu::ViterbiPath<float>;        \n
142 |                 return 0;                                                    \n
143 |               }                                                              \n
144 |             " > main.cpp && \
145 |             echo -e "\
146 |               cmake_minimum_required(VERSION 3.10)                                \n
147 |               project(test_project)                                               \n
148 |               set(CMAKE_CXX_STANDARD 17)                                          \n
149 |               set(CMAKE_CXX_STANDARD_REQUIRED ON)                                 \n
150 |               add_executable(main main.cpp)                                       \n
151 |               find_package(flashlight-sequence CONFIG REQUIRED)                   \n
152 |               target_link_libraries(main PRIVATE flashlight::flashlight-sequence) \n
153 |             " > CMakeLists.txt
154 |       - run:
155 |           name: Build dependent external project
156 |           command: |
157 |             cd test_project && mkdir -p build
158 |             cmake -S . -B build -DBUILD_SHARED_LIBS=<< parameters.build_shared_libs >> && \
159 |             cmake --build build --parallel && ./build/main
160 |   run_codecov:
161 |     steps:
162 |       - run:
163 |           name: "Get code coverage"
164 |           command: |
165 |             sudo apt-get install -y --no-install-recommends lcov curl && \
166 |             lcov --capture --directory . --output-file coverage.info && \
167 |             lcov --remove coverage.info '/usr/*' --output-file coverage.info &&
168 |             lcov --remove coverage.info '*/include/*' --output-file coverage.info && \
169 |             lcov --remove coverage.info '*/gtest/*' --output-file coverage.info && \
170 |             lcov --list coverage.info && \
171 |             bash <(curl -s https://codecov.io/bash) -f coverage.info \
172 |             -t $CODECOV_TOKEN \
173 |             || echo 'Codecov did not collect coverage reports'
174 |   run_ubuntu_20_gcc_9:
175 |     parameters:
176 |       use_cuda:
177 |         type: string
178 |         default: "ON"
179 |       build_shared_libs:
180 |         type: string
181 |         default: "OFF"
182 |       run_codecov:
183 |         type: string
184 |         default: ""
185 |     steps:
186 |       - checkout
187 |       - install_ubuntu_build_dependencies:
188 |           use_cuda: << parameters.use_cuda >>
189 |       - build_flashlight_sequence:
190 |           build_shared_libs: << parameters.build_shared_libs >>
191 |           use_cuda: << parameters.use_cuda >>
192 |           build_code_coverage: << parameters.run_codecov >>
193 |       - run:
194 |           name: "Run C++ Tests"
195 |           command: |
196 |             cd build && ctest
197 |       - test_with_external_project:
198 |           build_shared_libs: << parameters.build_shared_libs >>
199 |       - when:
200 |           condition: << parameters.run_codecov >>
201 |           steps:
202 |             - run_codecov
203 |   run_ubuntu_20_gcc_9_python:
204 |     parameters:
205 |       use_cuda:
206 |         type: string
207 |         default: "ON"
208 |     steps:
209 |       - checkout
210 |       - install_ubuntu_build_dependencies:
211 |           use_cuda: << parameters.use_cuda >>
212 |       - install_python_bindings:
213 |           use_cuda: << parameters.use_cuda >>
214 |       - run_python_tests:
215 |           use_cuda: << parameters.use_cuda >>
216 | 
217 | jobs:
218 |   ubuntu_20_gcc_9:
219 |     parameters:
220 |       build_shared_libs:
221 |         type: string
222 |         default: "OFF"
223 |       run_codecov:
224 |         type: string
225 |         default: ""
226 |     docker:
227 |       - image: cimg/base:2021.04
228 |     steps:
229 |       - run_ubuntu_20_gcc_9:
230 |           use_cuda: "OFF"
231 |           build_shared_libs: << parameters.build_shared_libs >>
232 |           run_codecov: << parameters.run_codecov >>
233 | 
234 |   ubuntu_20_gcc_9_cuda:
235 |     parameters:
236 |       build_shared_libs:
237 |         type: string
238 |         default: "OFF"
239 |       run_codecov:
240 |         type: string
241 |         default: ""
242 |     <<: *gpu
243 |     steps:
244 |       - run_ubuntu_20_gcc_9:
245 |           use_cuda: "ON"
246 |           build_shared_libs: << parameters.build_shared_libs >>
247 |           run_codecov: << parameters.run_codecov >>
248 | 
249 |   ubuntu_20_gcc_9_python:
250 |     docker:
251 |       - image: cimg/base:2021.04
252 |     steps:
253 |       - run_ubuntu_20_gcc_9_python:
254 |           use_cuda: "OFF"
255 | 
256 |   ubuntu_20_gcc_9_python_cuda:
257 |     <<: *gpu
258 |     steps:
259 |       - run_ubuntu_20_gcc_9_python:
260 |           use_cuda: "ON"
261 | 
262 |   macos_clang_13:
263 |     parameters:
264 |       build_shared_libs:
265 |         type: string
266 |         default: "OFF"
267 |     <<: *macos_env
268 |     shell: /bin/bash -eux -o pipefail
269 |     steps:
270 |       - checkout
271 |       - install_macos_build_dependencies
272 |       - build_flashlight_sequence:
273 |           platform: "macos"
274 |           use_cuda: "OFF"
275 |           build_shared_libs: << parameters.build_shared_libs >>
276 | 
277 |   macos_clang_13_python:
278 |     <<: *macos_env
279 |     shell: /bin/bash -eux -o pipefail
280 |     steps:
281 |       - checkout
282 |       - install_macos_build_dependencies
283 |       - install_python_bindings:
284 |           use_cuda: "OFF"
285 |       - run_python_tests:
286 |           use_cuda: "OFF"
287 | 
288 |   windows_msvc:
289 |     parameters:
290 |       build_shared_libs:
291 |         type: string
292 |         default: "OFF"
293 |     executor:
294 |       name: win/default
295 |       shell: bash.exe
296 |     steps:
297 |       - checkout
298 |       - install_msvc_build_dependencies
299 |       - build_flashlight_sequence:
300 |           platform: "windows"
301 |           use_cuda: "OFF"
302 | 
303 |   windows_msvc_cuda:
304 |     parameters:
305 |       build_shared_libs:
306 |         type: string
307 |         default: "OFF"
308 |     executor: windows_gpu
309 |     steps:
310 |       - checkout
311 |       - install_msvc_build_dependencies
312 |       - build_flashlight_sequence:
313 |           platform: "windows"
314 |           use_cuda: "ON"
315 | 
316 |   windows_msvc_python:
317 |     parameters:
318 |       use_cuda:
319 |         type: string
320 |         default: "OFF"
321 |     executor:
322 |       name: win/default
323 |       shell: bash.exe
324 |     steps:
325 |       - checkout
326 |       - install_msvc_build_dependencies
327 |       - install_python_bindings:
328 |           use_cuda: "OFF"
329 | 
330 | workflows:
331 |   build-test:
332 |     jobs:
333 |       - ubuntu_20_gcc_9:
334 |           name: "Ubuntu 20.04 gcc-9 static"
335 |       - ubuntu_20_gcc_9:
336 |           name: "Ubuntu 20.04 gcc-9 shared"
337 |           build_shared_libs: "ON"
338 |       - ubuntu_20_gcc_9_cuda:
339 |           name: "Ubuntu 20.04 gcc-9 static + CUDA"
340 |       - ubuntu_20_gcc_9_cuda:
341 |           name: "Ubuntu 20.04 gcc-9 shared + CUDA"
342 |           build_shared_libs: "ON"
343 |           run_codecov: "ON"
344 |       - ubuntu_20_gcc_9_python:
345 |           name: "Ubuntu 20.04 gcc-9 Python"
346 |       - ubuntu_20_gcc_9_python_cuda:
347 |           name: "Ubuntu 20.04 gcc-9 Python + CUDA"
348 |       - macos_clang_13:
349 |           name: "macOS Clang 13 - shared"
350 |           build_shared_libs: "ON"
351 |       - macos_clang_13_python:
352 |           name: "macOS Clang 13 Python"
353 |       - windows_msvc:
354 |           name: "Windows VS 17 2022 | MSVC 19.33"
355 |       - windows_msvc_cuda:
356 |           name: "Windows VS 17 2022 | MSVC 19.33 + CUDA"
357 |       - windows_msvc_python:
358 |           name: "Windows VS 17 2022 | MSVC 19.33 Python"
359 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | AccessModifierOffset: -1
 3 | AlignAfterOpenBracket: AlwaysBreak
 4 | AlignConsecutiveAssignments: false
 5 | AlignConsecutiveDeclarations: false
 6 | AlignEscapedNewlinesLeft: true
 7 | AlignOperands:   false
 8 | AlignTrailingComments: false
 9 | AllowAllParametersOfDeclarationOnNextLine: false
10 | AllowShortBlocksOnASingleLine: false
11 | AllowShortCaseLabelsOnASingleLine: false
12 | AllowShortFunctionsOnASingleLine: Empty
13 | AllowShortIfStatementsOnASingleLine: false
14 | AllowShortLoopsOnASingleLine: false
15 | AlwaysBreakAfterReturnType: None
16 | AlwaysBreakBeforeMultilineStrings: true
17 | AlwaysBreakTemplateDeclarations: true
18 | BinPackArguments: false
19 | BinPackParameters: false
20 | BraceWrapping:
21 |   AfterClass:      false
22 |   AfterControlStatement: false
23 |   AfterEnum:       false
24 |   AfterFunction:   false
25 |   AfterNamespace:  false
26 |   AfterObjCDeclaration: false
27 |   AfterStruct:     false
28 |   AfterUnion:      false
29 |   BeforeCatch:     false
30 |   BeforeElse:      false
31 |   IndentBraces:    false
32 | BreakBeforeBinaryOperators: None
33 | BreakBeforeBraces: Attach
34 | BreakBeforeTernaryOperators: true
35 | BreakConstructorInitializersBeforeComma: false
36 | BreakAfterJavaFieldAnnotations: false
37 | BreakStringLiterals: false
38 | ColumnLimit:     80
39 | CommentPragmas:  '^ IWYU pragma:'
40 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
41 | ConstructorInitializerIndentWidth: 4
42 | ContinuationIndentWidth: 4
43 | Cpp11BracedListStyle: true
44 | DerivePointerAlignment: false
45 | DisableFormat:   false
46 | ForEachMacros:   [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ]
47 | IncludeCategories:
48 |   - Regex:           '^<.*\.h(pp)?>'
49 |     Priority:        1
50 |   - Regex:           '^<.*'
51 |     Priority:        2
52 |   - Regex:           '.*'
53 |     Priority:        3
54 | IndentCaseLabels: true
55 | IndentPPDirectives: None
56 | IndentWidth:     2
57 | IndentWrappedFunctionNames: false
58 | KeepEmptyLinesAtTheStartOfBlocks: false
59 | MacroBlockBegin: ''
60 | MacroBlockEnd:   ''
61 | MaxEmptyLinesToKeep: 1
62 | NamespaceIndentation: None
63 | ObjCBlockIndentWidth: 2
64 | ObjCSpaceAfterProperty: false
65 | ObjCSpaceBeforeProtocolList: false
66 | PenaltyBreakBeforeFirstCallParameter: 1
67 | PenaltyBreakComment: 300
68 | PenaltyBreakFirstLessLess: 120
69 | PenaltyBreakString: 1000
70 | PenaltyExcessCharacter: 1000000
71 | PenaltyReturnTypeOnItsOwnLine: 200
72 | PointerAlignment: Left
73 | ReflowComments:  true
74 | SortIncludes:    true
75 | SpaceAfterCStyleCast: false
76 | SpaceBeforeAssignmentOperators: true
77 | SpaceBeforeParens: ControlStatements
78 | SpaceInEmptyParentheses: false
79 | SpacesBeforeTrailingComments: 1
80 | SpacesInAngles:  false
81 | SpacesInContainerLiterals: true
82 | SpacesInCStyleCastParentheses: false
83 | SpacesInParentheses: false
84 | SpacesInSquareBrackets: false
85 | Standard:        Cpp11
86 | TabWidth:        8
87 | UseTab:          Never
88 | ...
89 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Build
 2 | build
 3 | release
 4 | debug
 5 | 
 6 | # FB
 7 | fb
 8 | TARGETS
 9 | 
10 | # Conan
11 | conanbuildinfo*
12 | conan.lock
13 | arrayfire
14 | conaninfo*
15 | graph_info.json
16 | 
17 | # Python bindings
18 | __pycache__/
19 | *$py.class
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | wheels/
30 | **/version.py
31 | **/BUILD_VERSION.txt
32 | 
33 | # Coverage
34 | *.info
35 | 
36 | # Precompiled Headers
37 | *.gch
38 | *.pch
39 | 
40 | # Compiled Object files
41 | *.slo
42 | *.lo
43 | *.o
44 | *.obj
45 | 
46 | # Compiled Dynamic libraries
47 | *.so
48 | *.so.*
49 | *.dylib
50 | *.dll
51 | 
52 | # Compiled Static libraries
53 | *.lai
54 | *.la
55 | *.a
56 | *.lib
57 | 
58 | # Dev environment
59 | .vscode
60 | 


--------------------------------------------------------------------------------
/CITATION:
--------------------------------------------------------------------------------
1 | @misc{kahn2022flashlight,
2 |       title={Flashlight: Enabling Innovation in Tools for Machine Learning},
3 |       author={Jacob Kahn and Vineel Pratap and Tatiana Likhomanenko and Qiantong Xu and Awni Hannun and Jeff Cai and Paden Tomasello and Ann Lee and Edouard Grave and Gilad Avidov and Benoit Steiner and Vitaliy Liptchinsky and Gabriel Synnaeve and Ronan Collobert},
4 |       year={2022},
5 |       eprint={2201.12465},
6 |       archivePrefix={arXiv},
7 |       primaryClass={cs.LG}
8 | }
9 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.16)
  2 | 
  3 | project(flashlight-sequence LANGUAGES CXX C VERSION 0.1)
  4 | 
  5 | include(CTest)
  6 | 
  7 | # ----------------------------- Setup -----------------------------
  8 | find_program(CCACHE_PROGRAM ccache)
  9 | if(CCACHE_PROGRAM)
 10 |   set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
 11 | endif()
 12 | 
 13 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 14 | 
 15 | option(FL_SEQUENCE_CODE_COVERAGE "Enable coverage reporting" OFF)
 16 | 
 17 | # Default directories for installation
 18 | set(FL_INSTALL_INC_DIR "include" CACHE PATH "Install path for headers")
 19 | set(FL_INSTALL_LIB_DIR "lib" CACHE PATH "Install path for libraries")
 20 | set(FL_INSTALL_BIN_DIR "bin" CACHE PATH "Install path for binaries")
 21 | # Other assets
 22 | set(FL_INSTALL_ASSETS_BASE_DIR "share/${PROJECT_NAME}")
 23 | set(FL_INSTALL_CMAKE_DIR "${FL_INSTALL_ASSETS_BASE_DIR}/cmake" CACHE PATH "Install path for CMake files")
 24 | set(FL_INSTALL_EXAMPLES_DIR "${FL_INSTALL_ASSETS_BASE_DIR}/examples" CACHE PATH "Install path for example files")
 25 | set(FL_INSTALL_DOC_DIR "${FL_INSTALL_ASSETS_BASE_DIR}/doc" CACHE PATH "Install path for documentation")
 26 | 
 27 | include(${PROJECT_SOURCE_DIR}/cmake/InternalUtils.cmake)
 28 | 
 29 | # ----------------------------- Configuration -----------------------------
 30 | 
 31 | option(FL_SEQUENCE_USE_OPENMP "Build with OpenMP support" OFF)
 32 | option(FL_SEQUENCE_USE_CUDA "Build with CUDA support" OFF)
 33 | option(FL_SEQUENCE_BUILD_TESTS "Build tests" ON)
 34 | option(FL_SEQUENCE_BUILD_PYTHON "Build Python bindings" OFF)
 35 | option(FL_SEQUENCE_BUILD_STANDALONE "Build standalone installation" ON)
 36 | option(FL_SEQUENCE_BUILD_PYTHON_PACKAGE "Build packaging-ready Python artifacts" OFF)
 37 | 
 38 | # ------------------------ Build ------------------------
 39 | 
 40 | add_library(flashlight-sequence)
 41 | 
 42 | set_target_properties(flashlight-sequence PROPERTIES
 43 |   CXX_STANDARD 17
 44 |   CXX_STANDARD_REQUIRED ON
 45 |   )
 46 | 
 47 | target_include_directories(
 48 |   flashlight-sequence
 49 |   PUBLIC
 50 |   $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
 51 |   )
 52 | 
 53 | target_compile_definitions(flashlight-sequence PUBLIC FL_SEQ_DLL)
 54 | 
 55 | if (FL_SEQUENCE_USE_CUDA)
 56 |   enable_language(CUDA)
 57 | 
 58 |   # To support nvcc with CUDA < 11
 59 |   set_target_properties(
 60 |     flashlight-sequence
 61 |     PROPERTIES
 62 |     CUDA_STANDARD 14
 63 |     CUDA_STANDARD_REQUIRED ON
 64 |     )
 65 | 
 66 |   target_compile_definitions(
 67 |     flashlight-sequence
 68 |     PUBLIC
 69 |     FL_SEQUENCE_USE_OPENMP
 70 |     FL_SEQUENCE_USE_CUDA
 71 |   )
 72 | endif()
 73 | 
 74 | include(${PROJECT_SOURCE_DIR}/flashlight/lib/sequence/CMakeLists.txt)
 75 | 
 76 | if (FL_SEQUENCE_BUILD_PYTHON)
 77 |   include(${PROJECT_SOURCE_DIR}/bindings/python/CMakeLists.txt)
 78 | endif()
 79 | 
 80 | add_library(flashlight::flashlight-sequence ALIAS flashlight-sequence)
 81 | 
 82 | # ------------------------ Tests + Code Coverage------------------------
 83 | 
 84 | if (FL_SEQUENCE_BUILD_TESTS)
 85 |   enable_testing()
 86 |   include(TestUtils)
 87 |   # TODO: add back after moving tests from Flashlight core
 88 |   # add_subdirectory(${PROJECT_SOURCE_DIR}/flashlight/lib/sequence/test)
 89 | endif()
 90 | 
 91 | if (FL_SEQUENCE_CODE_COVERAGE)
 92 |   fl_sequence_add_coverage_to_target(TARGET flashlight-sequence)
 93 | endif()
 94 | 
 95 | # ------------------------ Installation ------------------------
 96 | 
 97 | # Install headers
 98 | install(
 99 |   DIRECTORY ${PROJECT_SOURCE_DIR}/flashlight/lib
100 |   COMPONENT headers
101 |   DESTINATION ${FL_INSTALL_INC_DIR}/flashlight
102 |   FILES_MATCHING # preserve directory structure
103 |   PATTERN "*.h"
104 |   PATTERN "*.hpp"
105 |   PATTERN "*.cuh"
106 |   PATTERN "test*" EXCLUDE
107 |   PATTERN ".git" EXCLUDE
108 | )
109 | 
110 | # Install libraries and create CMake config/targets files
111 | fl_sequence_setup_install_targets(INSTALL_TARGETS flashlight-sequence)
112 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at <opensource-conduct@fb.com>. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to flashlight
 2 | flashlight is still under development; we appreciate any contributions.
 3 | 
 4 | ## Pull Requests
 5 | We actively welcome your pull requests.
 6 | 
 7 | 1. Fork the repo and create your branch from `master`.
 8 | 2. If you've added code that should be tested, add tests.
 9 | 3. If you've changed APIs, update [and build](docs/README.md) the documentation (to check correctness - don't submit built documentation).
10 | 4. Ensure the test suite passes.
11 | 5. Make sure your code lints and run `clang-format` given the provided configuration.
12 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
13 | 
14 | ## Contributor License Agreement ("CLA")
15 | All contributors must sign the CLA for their pull requests to be eligible for merge. One only needs to do this once to contribute to any of Facebook's open source projects.
16 | 
17 | You can find the CLA [here](https://code.facebook.com/cla).
18 | 
19 | ## Issues
20 | We use [GitHub issues](https://github.com/flashlight/flashlight/issues) to track public bugs. When filing, a bug, please make sure your description is clear and include sufficient instructions to reproduce the issue (for instance, your OS, compiler version, and selected backend).
21 | 
22 | ## License
23 | By contributing to flashlight, you agree that your contributions will be licensed
24 | under the LICENSE file in the root directory of this source tree.
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include CMakeLists.txt
2 | recursive-include cmake *.cmake *.cmake.in
3 | recursive-include flashlight/lib *.h *.cpp CMakeLists.txt
4 | recursive-include flashlight/lib *.cuh *.cu
5 | recursive-include bindings/python *.cpp CMakeLists.txt
6 | global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Flashlight Sequence: Algorithms for Sequence Data
  2 | 
  3 | [**Quickstart**](#quickstart)
  4 | | [**Installation**](#building-and-installing)
  5 | | [**Python Documentation**](bindings/python)
  6 | | [**Citing**](#citing)
  7 | 
  8 | [![CircleCI](https://circleci.com/gh/flashlight/sequence.svg?style=shield)](https://app.circleci.com/pipelines/github/flashlight/sequence)
  9 | [![Join the chat at https://gitter.im/flashlight-ml/community](https://img.shields.io/gitter/room/flashlight-ml/community)](https://gitter.im/flashlight-ml/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Codecov](https://img.shields.io/codecov/c/github/flashlight/sequence)](https://codecov.io/gh/flashlight/sequence) [![Vcpkg](https://img.shields.io/vcpkg/v/flashlight-sequence)](https://vcpkg.link/ports/flashlight-sequence)
 10 | 
 11 | *Flashlight Sequence* is a library with fast implementations of sequence-based operations. It includes:
 12 | - A fast, parallel CPU implementation of the Viterbi algorithm for greedy "`argmax`-style" decoding
 13 | - Fast implementations (CPU and CUDA) of the [Wav2letter ASG loss](https://arxiv.org/pdf/1609.03193.pdf) function including the fully-connected and forced-alignment algorithms.
 14 | 
 15 | ## Quickstart
 16 | 
 17 | Flashlight Sequence has Python bindings. To install the bindings from source, [optionally install CUDA] then clone the repo and build:
 18 | ```shell
 19 | git clone https://github.com/flashlight/sequence && cd sequence
 20 | pip install .
 21 | ```
 22 | To install with CUDA support, set the environment variable `USE_CUDA=1` when running the install command. By default, bindings are installed with OpenMP support; to build and install without OpenMP, set the environment to have `USE_OPENMP=0` when buildling.
 23 | 
 24 | See the [full Python binding documentation](bindings/python) for examples and more.
 25 | 
 26 | ## Building and Installing
 27 | [**From Source (C++)**](#building-from-source) | [**With `vcpkg` (C++)**](#with-vcpkg) | [**From Source (Python)**](bindings/python#build-instructions) | [**Adding to Your Own Project (C++)**](#adding-flashlight-sequence-to-a-c-project)
 28 | 
 29 | ### Requirements
 30 | At minimum, C++ compilation requires:
 31 | - A C++ compiler with good C++17 support (e.g. gcc/g++ >= 7)
 32 | - [CMake](https://cmake.org/) — version 3.16 or later, and ``make``
 33 | - A Linux-based operating system.
 34 | 
 35 | **CUDA Support:** If building with CUDA support, CUDA >= 9 is recommended. To toggle CUDA support use the `FL_SEQUENCE_USE_CUDA` CMake option or the `USE_CUDA` environment variable when building the Python bindings. To toggle OpenMP support, use the `FL_SEQUENCE_USE_OPENMP` CMake option or use the `USE_OPENMP` environment variable when building the Python bindings.
 36 | 
 37 | **Tests:** If building tests, [Google Test](https://github.com/google/googletest) >= 1.12 is required, but is installed automatically on build if not found. The `FL_SEQUENCE_BUILD_TESTS` CMake option toggles building tests.
 38 | 
 39 | Instructions for building/installing the Python bindings from source [can be found here](bindings/python/README.md).
 40 | 
 41 | ### Building from Source
 42 | 
 43 | Building the C++ project from source is simple:
 44 | ```bash
 45 | git clone https://github.com/flashlight/sequence && cd sequence
 46 | cmake -S . -B build
 47 | cmake --build build --parallel
 48 | cd build && ctest && cd .. # run tests
 49 | cmake --install build # install at the CMAKE_INSTALL_PREFIX
 50 | ```
 51 | To enable CUDA while building, pass `-DFL_SEQUENCE_USE_CUDA=ON` to CMake. To enable building with OpenMP, pass `-DFL_SEQUENCE_USE_OPENMP=ON` to CMake. To disable building tests, pass `-DFL_SEQUENCE_BUILD_TESTS=OFF`.
 52 | 
 53 | If building with CUDA < 11, [NVIDIA cub](https://github.com/NVIDIA/cub) is required. It will be downloaded automatically if not found; the `FL_SEQUENCE_BUILD_STANDALONE` build option controls this behavior.
 54 | 
 55 | #### With [`vcpkg`](https://vcpkg.io/)
 56 | 
 57 | Flashlight Sequence can also be installed and used downstream with the [`vcpkg`](https://vcpkg.io/) package manager. The [port](https://github.com/microsoft/vcpkg/blob/master/ports/flashlight-sequence/) contains optional features for building with OpenMP and/or CUDA:
 58 | ```bash
 59 | vcpkg install flashlight-sequence # no dependencies, or:
 60 | vcpkg install "flashlight-sequence[cuda]" # with CUDA
 61 | vcpkg install "flashlight-sequence[openmp]" # with OpenMP
 62 | vcpkg install "flashlight-sequence[cuda,openmp]" # with both!
 63 | ```
 64 | 
 65 | ### Adding Flashlight Sequence to a C++ Project
 66 | 
 67 | Given a simple `project.cpp` file that includes and links to Flashlight Sequence:
 68 | ```c++
 69 | #include <iostream>
 70 | 
 71 | #include <flashlight/lib/sequence/criterion/cpu/ViterbiPath.h>
 72 | 
 73 | int main() {
 74 |   auto res = fl::lib::cpu::ViterbiPath<float>::compute(...);
 75 |   std::cout << "ViterbiPath result[0] " << res[0] << std::endl;
 76 |  return 0;
 77 | }
 78 | ```
 79 | 
 80 | The following CMake configuration links Flashlight and sets include directories:
 81 | 
 82 | ```cmake
 83 | cmake_minimum_required(VERSION 3.16)
 84 | set(CMAKE_CXX_STANDARD 17)
 85 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 86 | 
 87 | add_executable(myProject project.cpp)
 88 | 
 89 | find_package(flashlight-sequence CONFIG REQUIRED)
 90 | target_link_libraries(myProject PRIVATE flashlight::flashlight-sequence)
 91 | ```
 92 | 
 93 | ### Contributing and Contact
 94 | Contact: jacobkahn@fb.com
 95 | 
 96 | Flashlight Sequence is actively developed. See
 97 | [CONTRIBUTING](CONTRIBUTING.md) for more on how to help out.
 98 | 
 99 | ## Citing
100 | You can cite [Flashlight](https://arxiv.org/abs/2201.12465) using:
101 | ```
102 | @misc{kahn2022flashlight,
103 |       title={Flashlight: Enabling Innovation in Tools for Machine Learning},
104 |       author={Jacob Kahn and Vineel Pratap and Tatiana Likhomanenko and Qiantong Xu and Awni Hannun and Jeff Cai and Paden Tomasello and Ann Lee and Edouard Grave and Gilad Avidov and Benoit Steiner and Vitaliy Liptchinsky and Gabriel Synnaeve and Ronan Collobert},
105 |       year={2022},
106 |       eprint={2201.12465},
107 |       archivePrefix={arXiv},
108 |       primaryClass={cs.LG}
109 | }
110 | ```
111 | 
112 | For the AutoSegmentation Criterion (ASG), cite:
113 | ```
114 | @article{collobert2016wav2letter,
115 |   title={Wav2letter: an end-to-end convnet-based speech recognition system},
116 |   author={Collobert, Ronan and Puhrsch, Christian and Synnaeve, Gabriel},
117 |   journal={arXiv preprint arXiv:1609.03193},
118 |   year={2016}
119 | }
120 | ```
121 | 
122 | ## License
123 | Flashlight Sequence is under an MIT license. See [LICENSE](LICENSE) for more information.
124 | 


--------------------------------------------------------------------------------
/bindings/python/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | if (NOT BUILD_SHARED_LIBS)
 4 |   message(FATAL_ERROR "Cannot build Python bindings with a static lib build: "
 5 |     "set BUILD_SHARED_LIBS to ON.")
 6 | endif()
 7 | 
 8 | # Since the Python libs and standalone Flashlight Text libs are built/installed
 9 | # to the same directory, set rpaths on the Python targets to be the current dir
10 | if(APPLE)
11 |   # macOS
12 |   set(CMAKE_MACOSX_RPATH ON)
13 |   set(_portable_rpath_origin "@loader_path")
14 | else()
15 |   # Linux
16 |   set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
17 |   set(_portable_rpath_origin $ORIGIN)
18 | endif(APPLE)
19 | 
20 | find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module)
21 | find_package(pybind11 CONFIG)
22 | if (NOT pybind11_FOUND)
23 |   message(STATUS "Could not find pybind11 -- downloading from source.")
24 |   include(${PROJECT_SOURCE_DIR}/cmake/Buildpybind11.cmake)
25 | endif()
26 | 
27 | function (add_pybind11_extension ext_name)
28 |   string(REPLACE "_" ";" modlist ${ext_name})
29 |   list(GET modlist -1 modname)
30 |   list(REMOVE_AT modlist -1)
31 |   if(modlist)
32 |     string(REPLACE ";" "/" relpath "${modlist}")
33 |   else()
34 |     set(relpath "")
35 |   endif()
36 | 
37 |   pybind11_add_module(
38 |     ${ext_name}
39 |     ${CMAKE_CURRENT_LIST_DIR}/${relpath}/_${modname}.cpp
40 |     )
41 | 
42 |   target_link_libraries(
43 |     ${ext_name}
44 |     PUBLIC
45 |     flashlight-sequence
46 |     )
47 | 
48 |   target_include_directories(
49 |     ${ext_name}
50 |     PRIVATE
51 |     ${PROJECT_SOURCE_DIR}
52 |     )
53 | 
54 |   if (FL_SEQUENCE_BUILD_PYTHON_PACKAGE)
55 |     set_target_properties(${ext_name} PROPERTIES
56 |       OUTPUT_NAME ${ext_name}
57 |       BUILD_RPATH ${_portable_rpath_origin})
58 |   else()
59 |     if (CMAKE_LIBRARY_OUTPUT_DIRECTORY)
60 |       set_target_properties(${ext_name} PROPERTIES
61 |         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${relpath})
62 |     endif()
63 |   endif()
64 | endfunction ()
65 | 
66 | add_pybind11_extension(flashlight_lib_sequence_criterion)
67 | 


--------------------------------------------------------------------------------
/bindings/python/README.md:
--------------------------------------------------------------------------------
 1 | # Flashlight Sequence Python Bindings
 2 | **Contents**
 3 | - [Installation](#installation)
 4 |   * [Dependencies](#dependencies)
 5 |   * [Build Instructions](#build-instructions)
 6 |   * [Advanced Options](#advanced-options)
 7 | - [Python API Documentation](#python-api-documentation)
 8 | 
 9 | ## Installation
10 | CUDA is required if building CUDA kernel implementations with the Python package.
11 | 
12 | ### Build Instructions
13 | From the project root, run:
14 | ```
15 | pip install .
16 | ```
17 | 
18 | or locally in editable mode (`-e` is required as libs are built outside of the bindings directory)
19 | ```
20 | pip install -e .
21 | ```
22 | 
23 | (`pypi` installation coming soon)
24 | 
25 | ### Advanced Options
26 | - `USE_CUDA=1` builds CUDA kernels for many included algorithms. CUDA >= 9 is required.
27 | 
28 | ## Python API Documentation
29 | 
30 | Coming soon.
31 | 


--------------------------------------------------------------------------------
/bindings/python/compute_version.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
 2 | #
 3 | # This source code is licensed under the BSD license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | #
 6 | # Inspired by http://bit.ly/40pQb1Q
 7 | import subprocess
 8 | from pathlib import Path
 9 | from typing import Optional
10 | 
11 | from packaging import version
12 | 
13 | THIS_PATH = Path(__file__).resolve()
14 | version_from_file = (THIS_PATH.parent / "version.txt").read_text().strip()
15 | 
16 | 
17 | def get_tagged_version() -> Optional[str]:
18 |     """
19 |     Return a version corresponding to a git tag if it matches version.txt
20 |     """
21 |     try:
22 |         tag = subprocess.check_output(
23 |             ["git", "describe", "--tags", "--exact-match", "HEAD"],
24 |             text=True,
25 |             stderr=subprocess.DEVNULL,
26 |         ).strip()
27 |     except subprocess.CalledProcessError:  # no tag
28 |         return None
29 | 
30 |     if not tag.startswith("v"):
31 |         return None
32 | 
33 |     # Must match the version in `version.txt` -- ignores `rc` or other suffixes
34 |     assert (
35 |         version.parse(version_from_file).release == version.parse(tag[1:]).release
36 |     ), f"The version in version.txt ({version_from_file}) does not match the given tag ({tag})"
37 |     return tag[1:]
38 | 
39 | 
40 | def get_dev_version() -> str:
41 |     # Total number of commits appended to ensure chronological ordering
42 |     # (see PEP440)
43 |     num_commits = subprocess.check_output(
44 |         ["git", "rev-list", "--count", "HEAD"], text=True
45 |     ).strip()
46 |     return f"{version_from_file}.dev{num_commits}"
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     tagged_version = get_tagged_version()
51 |     if tagged_version is not None:
52 |         print(tagged_version, end="")
53 |     else:
54 |         print(get_dev_version(), end="")
55 | 


--------------------------------------------------------------------------------
/bindings/python/example/criterion_example.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | This source code is licensed under the MIT-style license found in the
 6 | LICENSE file in the root directory of this source tree.
 7 | """
 8 | 
 9 | import argparse
10 | 
11 | import torch
12 | from flashlight.lib.sequence.criterion import ASGLoss, CriterionScaleMode
13 | 
14 | 
15 | def main() -> None:
16 |     parser = argparse.ArgumentParser()
17 |     parser.add_argument(
18 |         "--cpu", action="store_true", help="Use cpu backend, otherwise use CUDA backend"
19 |     )
20 |     parser.add_argument(
21 |         "--double",
22 |         action="store_true",
23 |         help="store tensors in double, otherwise in float",
24 |     )
25 |     args = parser.parse_args()
26 | 
27 |     device = torch.device("cpu" if args.cpu else "cuda")
28 |     float_type = torch.double if args.double else torch.float
29 | 
30 |     # create ASG loss with scaling the loss to the sqrt of target size
31 |     # and 6 tokens (6 tokens scores predicted by some network for each frame)
32 |     asg = ASGLoss(6, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT).to(device)
33 |     # define the input to the loss (scores for tokens came from
34 |     # some network for each frame) size is [batch, time, ntokens]
35 |     input = torch.tensor(
36 |         [
37 |             [
38 |                 [-0.4340, -0.0254, 0.3667, 0.4180, -0.3805, -0.1707],
39 |                 [0.1060, 0.3631, -0.1122, -0.3825, -0.0031, -0.3801],
40 |                 [0.0443, -0.3795, 0.3194, -0.3130, 0.0094, 0.1560],
41 |                 [0.1252, 0.2877, 0.1997, -0.4554, 0.2774, -0.2526],
42 |                 [-0.4001, -0.2402, 0.1295, 0.0172, 0.1805, -0.3299],
43 |             ],
44 |             [
45 |                 [0.3298, -0.2259, -0.0959, 0.4909, 0.2996, -0.2543],
46 |                 [-0.2863, 0.3239, -0.3988, 0.0732, -0.2107, -0.4739],
47 |                 [-0.0906, 0.0480, -0.1301, 0.3975, -0.3317, -0.1967],
48 |                 [0.4372, -0.2006, 0.0094, 0.3281, 0.1873, -0.2945],
49 |                 [0.2399, 0.0320, -0.3768, -0.2849, -0.2248, 0.3186],
50 |             ],
51 |             [
52 |                 [0.0225, -0.3867, -0.1929, -0.2904, -0.4958, -0.2533],
53 |                 [0.4001, -0.1517, -0.2799, -0.2915, 0.4198, 0.4506],
54 |                 [0.1446, -0.4753, -0.0711, 0.2876, -0.1851, -0.1066],
55 |                 [0.2081, -0.1190, -0.3902, -0.1668, 0.1911, -0.2848],
56 |                 [-0.3846, 0.1175, 0.1052, 0.2172, -0.0362, 0.3055],
57 |             ],
58 |         ],
59 |         dtype=float_type,
60 |         device=device,
61 |         requires_grad=True,
62 |     )
63 |     # define the padded target transcriptions (encoded with tokens indices),
64 |     # padded index is -1
65 |     target = torch.tensor(
66 |         [[2, 1, 5, 1, 3], [4, 3, 5, -1, -1], [3, 2, 2, 1, -1]],
67 |         dtype=torch.int,
68 |         device=device,
69 |     )
70 |     # define target transcriptions sizes
71 |     target_size = torch.tensor([5, 3, 4], dtype=torch.int, device=device)
72 |     # define gradient of the network
73 |     grad = torch.ones(3, dtype=float_type, device=device)
74 | 
75 |     print("List of ASG parameters", list(asg.parameters()))
76 |     # run forward pass to compute the ASG loss
77 |     loss = asg.forward(input, target, target_size)
78 |     print("ASG loss =", loss)
79 |     # run backward pass
80 |     loss.backward(grad)
81 |     print("Gradients with respect to the ASG loss input", input.grad)
82 |     print("Gradients with respect to the transition matrix", asg.transitions.grad)
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     main()  # pragma: no cover
87 | 


--------------------------------------------------------------------------------
/bindings/python/flashlight/lib/sequence/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | This source code is licensed under the MIT-style license found in the
 6 | LICENSE file in the root directory of this source tree.
 7 | """
 8 | 
 9 | name = "sequence"
10 | 


--------------------------------------------------------------------------------
/bindings/python/flashlight/lib/sequence/_criterion.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include <pybind11/pybind11.h>
  9 | 
 10 | #include "flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.h"
 11 | #include "flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.h"
 12 | #include "flashlight/lib/sequence/criterion/cpu/ViterbiPath.h"
 13 | 
 14 | #ifdef FL_SEQUENCE_USE_CUDA
 15 | #include "flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cuh"
 16 | #include "flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cuh"
 17 | #include "flashlight/lib/sequence/criterion/cuda/ViterbiPath.cuh"
 18 | #endif // FL_SEQUENCE_USE_CUDA
 19 | 
 20 | namespace py = pybind11;
 21 | using namespace fl::lib::seq;
 22 | 
 23 | template <class T>
 24 | static T castBytes(const py::bytes& b) {
 25 |   static_assert(
 26 |       std::is_standard_layout<T>::value,
 27 |       "types represented as bytes must be standard layout");
 28 |   std::string s = b;
 29 |   if (s.size() != sizeof(T)) {
 30 |     throw std::runtime_error("wrong py::bytes size to represent object");
 31 |   }
 32 |   return *reinterpret_cast<const T*>(s.data());
 33 | }
 34 | 
 35 | using CpuFAC = fl::lib::cpu::ForceAlignmentCriterion<float>;
 36 | using CpuFCC = fl::lib::cpu::FullConnectionCriterion<float>;
 37 | using CpuViterbi = fl::lib::cpu::ViterbiPath<float>;
 38 | 
 39 | static void CpuFAC_forward(
 40 |     int B,
 41 |     int T,
 42 |     int N,
 43 |     int L,
 44 |     CriterionScaleMode scaleMode,
 45 |     py::bytes input,
 46 |     py::bytes target,
 47 |     py::bytes targetSize,
 48 |     py::bytes trans,
 49 |     py::bytes loss,
 50 |     py::bytes workspace) {
 51 |   CpuFAC::forward(
 52 |       B,
 53 |       T,
 54 |       N,
 55 |       L,
 56 |       scaleMode,
 57 |       castBytes<const float*>(input),
 58 |       castBytes<const int*>(target),
 59 |       castBytes<const int*>(targetSize),
 60 |       castBytes<const float*>(trans),
 61 |       castBytes<float*>(loss),
 62 |       castBytes<void*>(workspace));
 63 | }
 64 | 
 65 | static void CpuFAC_backward(
 66 |     int B,
 67 |     int T,
 68 |     int N,
 69 |     int L,
 70 |     py::bytes target,
 71 |     py::bytes targetSize,
 72 |     py::bytes grad,
 73 |     py::bytes inputGrad,
 74 |     py::bytes transGrad,
 75 |     py::bytes workspace) {
 76 |   CpuFAC::backward(
 77 |       B,
 78 |       T,
 79 |       N,
 80 |       L,
 81 |       castBytes<const int*>(target),
 82 |       castBytes<const int*>(targetSize),
 83 |       castBytes<const float*>(grad),
 84 |       castBytes<float*>(inputGrad),
 85 |       castBytes<float*>(transGrad),
 86 |       castBytes<void*>(workspace));
 87 | }
 88 | 
 89 | static void CpuFCC_forward(
 90 |     int B,
 91 |     int T,
 92 |     int N,
 93 |     CriterionScaleMode scaleMode,
 94 |     py::bytes input,
 95 |     py::bytes targetSize,
 96 |     py::bytes trans,
 97 |     py::bytes loss,
 98 |     py::bytes workspace) {
 99 |   CpuFCC::forward(
100 |       B,
101 |       T,
102 |       N,
103 |       scaleMode,
104 |       castBytes<const float*>(input),
105 |       castBytes<const int*>(targetSize),
106 |       castBytes<const float*>(trans),
107 |       castBytes<float*>(loss),
108 |       castBytes<void*>(workspace));
109 | }
110 | 
111 | static void CpuFCC_backward(
112 |     int B,
113 |     int T,
114 |     int N,
115 |     py::bytes trans,
116 |     py::bytes grad,
117 |     py::bytes inputGrad,
118 |     py::bytes transGrad,
119 |     py::bytes workspace) {
120 |   CpuFCC::backward(
121 |       B,
122 |       T,
123 |       N,
124 |       castBytes<const float*>(trans),
125 |       castBytes<const float*>(grad),
126 |       castBytes<float*>(inputGrad),
127 |       castBytes<float*>(transGrad),
128 |       castBytes<void*>(workspace));
129 | }
130 | 
131 | static void CpuViterbi_compute(
132 |     int B,
133 |     int T,
134 |     int N,
135 |     py::bytes input,
136 |     py::bytes trans,
137 |     py::bytes path,
138 |     py::bytes workspace) {
139 |   CpuViterbi::compute(
140 |       B,
141 |       T,
142 |       N,
143 |       castBytes<const float*>(input),
144 |       castBytes<const float*>(trans),
145 |       castBytes<int*>(path),
146 |       castBytes<void*>(workspace));
147 | }
148 | 
149 | #ifdef FL_SEQUENCE_USE_CUDA
150 | 
151 | using CudaFAC = fl::lib::cuda::ForceAlignmentCriterion<float>;
152 | using CudaFCC = fl::lib::cuda::FullConnectionCriterion<float>;
153 | using CudaViterbi = fl::lib::cuda::ViterbiPath<float>;
154 | 
155 | static void CudaFAC_forward(
156 |     int B,
157 |     int T,
158 |     int N,
159 |     int L,
160 |     CriterionScaleMode scaleMode,
161 |     py::bytes input,
162 |     py::bytes target,
163 |     py::bytes targetSize,
164 |     py::bytes trans,
165 |     py::bytes loss,
166 |     py::bytes workspace,
167 |     py::bytes stream) {
168 |   CudaFAC::forward(
169 |       B,
170 |       T,
171 |       N,
172 |       L,
173 |       scaleMode,
174 |       castBytes<const float*>(input),
175 |       castBytes<const int*>(target),
176 |       castBytes<const int*>(targetSize),
177 |       castBytes<const float*>(trans),
178 |       castBytes<float*>(loss),
179 |       castBytes<void*>(workspace),
180 |       castBytes<cudaStream_t>(stream));
181 | }
182 | 
183 | static void CudaFAC_backward(
184 |     int B,
185 |     int T,
186 |     int N,
187 |     int L,
188 |     py::bytes target,
189 |     py::bytes targetSize,
190 |     py::bytes grad,
191 |     py::bytes inputGrad,
192 |     py::bytes transGrad,
193 |     py::bytes workspace,
194 |     py::bytes stream) {
195 |   CudaFAC::backward(
196 |       B,
197 |       T,
198 |       N,
199 |       L,
200 |       castBytes<const int*>(target),
201 |       castBytes<const int*>(targetSize),
202 |       castBytes<const float*>(grad),
203 |       castBytes<float*>(inputGrad),
204 |       castBytes<float*>(transGrad),
205 |       castBytes<void*>(workspace),
206 |       castBytes<cudaStream_t>(stream));
207 | }
208 | 
209 | static void CudaFCC_forward(
210 |     int B,
211 |     int T,
212 |     int N,
213 |     CriterionScaleMode scaleMode,
214 |     py::bytes input,
215 |     py::bytes targetSize,
216 |     py::bytes trans,
217 |     py::bytes loss,
218 |     py::bytes workspace,
219 |     py::bytes stream) {
220 |   CudaFCC::forward(
221 |       B,
222 |       T,
223 |       N,
224 |       scaleMode,
225 |       castBytes<const float*>(input),
226 |       castBytes<const int*>(targetSize),
227 |       castBytes<const float*>(trans),
228 |       castBytes<float*>(loss),
229 |       castBytes<void*>(workspace),
230 |       castBytes<cudaStream_t>(stream));
231 | }
232 | 
233 | static void CudaFCC_backward(
234 |     int B,
235 |     int T,
236 |     int N,
237 |     py::bytes trans,
238 |     py::bytes grad,
239 |     py::bytes inputGrad,
240 |     py::bytes transGrad,
241 |     py::bytes workspace,
242 |     py::bytes stream) {
243 |   CudaFCC::backward(
244 |       B,
245 |       T,
246 |       N,
247 |       castBytes<const float*>(trans),
248 |       castBytes<const float*>(grad),
249 |       castBytes<float*>(inputGrad),
250 |       castBytes<float*>(transGrad),
251 |       castBytes<void*>(workspace),
252 |       castBytes<cudaStream_t>(stream));
253 | }
254 | 
255 | static void CudaViterbi_compute(
256 |     int B,
257 |     int T,
258 |     int N,
259 |     py::bytes input,
260 |     py::bytes trans,
261 |     py::bytes path,
262 |     py::bytes workspace,
263 |     py::bytes stream) {
264 |   CudaViterbi::compute(
265 |       B,
266 |       T,
267 |       N,
268 |       castBytes<const float*>(input),
269 |       castBytes<const float*>(trans),
270 |       castBytes<int*>(path),
271 |       castBytes<void*>(workspace),
272 |       castBytes<cudaStream_t>(stream));
273 | }
274 | 
275 | #endif // FL_SEQUENCE_USE_CUDA
276 | 
277 | PYBIND11_MODULE(flashlight_lib_sequence_criterion, m) {
278 |   py::enum_<CriterionScaleMode>(m, "CriterionScaleMode")
279 |       .value("NONE", CriterionScaleMode::NONE)
280 |       .value("INPUT_SZ", CriterionScaleMode::INPUT_SZ)
281 |       .value("INPUT_SZ_SQRT", CriterionScaleMode::INPUT_SZ_SQRT)
282 |       .value("TARGET_SZ", CriterionScaleMode::TARGET_SZ)
283 |       .value("TARGET_SZ_SQRT", CriterionScaleMode::TARGET_SZ_SQRT);
284 | 
285 |   py::class_<CpuFAC>(m, "CpuForceAlignmentCriterion")
286 |       .def("get_workspace_size", &CpuFAC::getWorkspaceSize)
287 |       .def("forward", &CpuFAC_forward)
288 |       .def("backward", &CpuFAC_backward);
289 | 
290 |   py::class_<CpuFCC>(m, "CpuFullConnectionCriterion")
291 |       .def("get_workspace_size", &CpuFCC::getWorkspaceSize)
292 |       .def("forward", &CpuFCC_forward)
293 |       .def("backward", &CpuFCC_backward);
294 | 
295 |   py::class_<CpuViterbi>(m, "CpuViterbiPath")
296 |       .def("get_workspace_size", &CpuViterbi::getWorkspaceSize)
297 |       .def("compute", &CpuViterbi_compute);
298 | 
299 | #ifdef FL_SEQUENCE_USE_CUDA
300 |   m.attr("sizeof_cuda_stream") = py::int_(sizeof(cudaStream_t));
301 | 
302 |   py::class_<CudaFAC>(m, "CudaForceAlignmentCriterion")
303 |       .def("get_workspace_size", &CudaFAC::getWorkspaceSize)
304 |       .def("forward", &CudaFAC_forward)
305 |       .def("backward", &CudaFAC_backward);
306 | 
307 |   py::class_<CudaFCC>(m, "CudaFullConnectionCriterion")
308 |       .def("get_workspace_size", &CudaFCC::getWorkspaceSize)
309 |       .def("forward", &CudaFCC_forward)
310 |       .def("backward", &CudaFCC_backward);
311 | 
312 |   py::class_<CudaViterbi>(m, "CudaViterbiPath")
313 |       .def("get_workspace_size", &CudaViterbi::getWorkspaceSize)
314 |       .def("compute", &CudaViterbi_compute);
315 | #endif // FL_SEQUENCE_USE_CUDA
316 | }
317 | 


--------------------------------------------------------------------------------
/bindings/python/flashlight/lib/sequence/criterion.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Copyright (c) Facebook, Inc. and its affiliates.
 4 | 
 5 | This source code is licensed under the MIT-style license found in the
 6 | LICENSE file in the root directory of this source tree.
 7 | """
 8 | 
 9 | from .flashlight_lib_sequence_criterion import (  # @manual
10 |     CpuForceAlignmentCriterion,
11 |     CpuFullConnectionCriterion,
12 |     CpuViterbiPath,
13 |     CriterionScaleMode,
14 | )
15 | 
16 | 
17 | have_torch = False
18 | try:
19 |     import torch
20 | 
21 |     have_torch = True
22 | except ImportError:
23 |     pass
24 | 
25 | if have_torch:
26 |     from flashlight.lib.sequence.criterion_torch import (
27 |         ASGLoss,
28 |         check_tensor,
29 |         create_workspace,
30 |         FACFunction,
31 |         FCCFunction,
32 |         get_cuda_stream_as_bytes,
33 |         get_data_ptr_as_bytes,
34 |         run_backward,
35 |         run_direction,
36 |         run_forward,
37 |         run_get_workspace_size,
38 |     )
39 | 


--------------------------------------------------------------------------------
/bindings/python/flashlight/lib/sequence/criterion_torch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Copyright (c) Facebook, Inc. and its affiliates.
  4 | 
  5 | This source code is licensed under the MIT-style license found in the
  6 | LICENSE file in the root directory of this source tree.
  7 | """
  8 | 
  9 | import struct
 10 | import sys
 11 | 
 12 | import flashlight.lib.sequence.flashlight_lib_sequence_criterion as _C  # @manual
 13 | import torch
 14 | import torch.nn as nn
 15 | 
 16 | 
 17 | def get_data_ptr_as_bytes(tensor):
 18 |     return struct.pack("P", tensor.data_ptr())
 19 | 
 20 | 
 21 | def get_cuda_stream_as_bytes():
 22 |     s = torch.cuda.current_stream().cuda_stream
 23 |     return s.to_bytes(_C.sizeof_cuda_stream, byteorder=sys.byteorder)
 24 | 
 25 | 
 26 | def check_tensor(tensor, size, dtype, device):
 27 |     shape = torch.Size(size)
 28 |     if tensor.shape != shape:
 29 |         raise ValueError(f"wrong tensor size: expected {shape}, got {tensor.shape}")
 30 |     return tensor.to(dtype=dtype, device=device).contiguous()
 31 | 
 32 | 
 33 | def run_direction(cls, device, direction, *args):
 34 |     """
 35 |     Select and run CPU/CUDA implementation of `forward()` or `backward()`.
 36 |     If CUDA, create the right device context and also pass the CUDA stream.
 37 |     """
 38 |     device = torch.device(device)
 39 |     if device.type == "cuda":
 40 |         with torch.cuda.device(device):
 41 |             fn = getattr(cls.cuda_impl(), direction)
 42 |             fn(*args, get_cuda_stream_as_bytes())
 43 |     elif device.type == "cpu":
 44 |         fn = getattr(cls.cpu_impl(), direction)
 45 |         fn(*args)
 46 |     else:
 47 |         raise ValueError("unknown/unsupported device type")
 48 | 
 49 | 
 50 | def run_forward(cls, device, *args):
 51 |     run_direction(cls, device, "forward", *args)
 52 | 
 53 | 
 54 | def run_backward(cls, device, *args):
 55 |     run_direction(cls, device, "backward", *args)
 56 | 
 57 | 
 58 | def run_get_workspace_size(cls, device, *args):
 59 |     device = torch.device(device)
 60 |     if device.type == "cuda":
 61 |         return cls.cuda_impl().get_workspace_size(*args)
 62 |     elif device.type == "cpu":
 63 |         return cls.cpu_impl().get_workspace_size(*args)
 64 |     else:
 65 |         raise ValueError("unknown/unsupported device type")
 66 | 
 67 | 
 68 | def create_workspace(cls, device, *args):
 69 |     """
 70 |     Select and run CPU/CUDA implementation of `get_workspace_size()`,
 71 |     then return a byte tensor of appropriate size.
 72 |     """
 73 |     workspace_size = run_get_workspace_size(cls, device, *args)
 74 |     return torch.empty(workspace_size, dtype=torch.uint8, device=device)
 75 | 
 76 | 
 77 | class FACFunction(torch.autograd.Function):
 78 |     """
 79 |     torch.autograd.Function for ForceAlignmentCriterion
 80 |     Supports CPU and CUDA backends, compute the probability of the correct paths
 81 |     in the ASG graph (the nominator of the ASG loss)
 82 |     """
 83 | 
 84 |     @staticmethod
 85 |     def cuda_impl():
 86 |         """
 87 |         Get CUDA implementation of forward/backward for the criterion
 88 |         """
 89 |         return _C.CudaForceAlignmentCriterion
 90 | 
 91 |     @staticmethod
 92 |     def cpu_impl():
 93 |         """
 94 |         Get CPU implementation of forward/backward for the criterion
 95 |         """
 96 |         return _C.CpuForceAlignmentCriterion
 97 | 
 98 |     @classmethod
 99 |     def forward(cls, ctx, input, target, target_size, transitions, scale_mode):
100 |         """
101 |         Forward pass of the criterion.
102 | 
103 |         Parameters:
104 |         -----------
105 |         input: float torch.tensor of the size [Batch, Time, Ntokens]
106 |                (output of the network with scores for all frames and all tokens)
107 |         target: int torch.tensor of the size [Batch, Length]
108 |                (padded target transcription encoded with indices of tokens)
109 |         target_size: int torch.tensor of the size [Batch]
110 |                (original length of each target transcription in the bacth)
111 |         transitions: float torch.tensor of size [Ntokens, Ntokens]
112 |                (transitions matrix for ASG loss function,
113 |                 scores of moving from state of token_i to token_j)
114 |         scale_mode: int, scaling factor of the output, possible values
115 |                   NONE = 0,
116 |                   INPUT_SZ = 1,
117 |                   INPUT_SZ_SQRT = 2,
118 |                   TARGET_SZ = 3,
119 |                   TARGET_SZ_SQRT = 4,
120 |         """
121 |         B = input.size(0)
122 |         T = input.size(1)
123 |         N = input.size(2)
124 |         L = target.size(1)
125 |         device = input.device
126 | 
127 |         input_float = check_tensor(input, [B, T, N], torch.float, device)
128 |         target = check_tensor(target, [B, L], torch.int, device)
129 |         target_size = check_tensor(target_size, [B], torch.int, device)
130 |         transitions_float = check_tensor(transitions, [N, N], torch.float, device)
131 | 
132 |         loss = torch.empty(B, dtype=torch.float, device=device)
133 |         workspace = create_workspace(cls, device, B, T, N, L)
134 |         run_forward(
135 |             cls,
136 |             device,
137 |             B,
138 |             T,
139 |             N,
140 |             L,
141 |             scale_mode,
142 |             get_data_ptr_as_bytes(input_float),
143 |             get_data_ptr_as_bytes(target),
144 |             get_data_ptr_as_bytes(target_size),
145 |             get_data_ptr_as_bytes(transitions_float),
146 |             get_data_ptr_as_bytes(loss),
147 |             get_data_ptr_as_bytes(workspace),
148 |         )
149 |         ctx.save_for_backward(input, target, target_size, transitions, workspace)
150 |         return loss.to(input)
151 | 
152 |     @classmethod
153 |     def backward(cls, ctx, grad):
154 |         input, target, target_size, transitions, workspace = ctx.saved_tensors
155 |         B = input.size(0)
156 |         T = input.size(1)
157 |         N = input.size(2)
158 |         L = target.size(1)
159 |         device = input.device
160 | 
161 |         grad = check_tensor(grad, [B], torch.float, device)
162 | 
163 |         input_grad = torch.empty(B, T, N, dtype=torch.float, device=device)
164 |         transitions_grad = torch.empty(N, N, dtype=torch.float, device=device)
165 |         run_backward(
166 |             cls,
167 |             device,
168 |             B,
169 |             T,
170 |             N,
171 |             L,
172 |             get_data_ptr_as_bytes(target),
173 |             get_data_ptr_as_bytes(target_size),
174 |             get_data_ptr_as_bytes(grad),
175 |             get_data_ptr_as_bytes(input_grad),
176 |             get_data_ptr_as_bytes(transitions_grad),
177 |             get_data_ptr_as_bytes(workspace),
178 |         )
179 | 
180 |         return input_grad.to(input), None, None, transitions_grad.to(transitions), None
181 | 
182 | 
183 | class FCCFunction(torch.autograd.Function):
184 |     """
185 |     torch.autograd.Function for FullConnectionCriterion
186 |     Supports CPU and CUDA backends, compute the probability of the full ASG graph
187 |     (the denominator of the ASG loss)
188 |     """
189 | 
190 |     @staticmethod
191 |     def cuda_impl():
192 |         """
193 |         Get CUDA implementation of forward/backward for the criterion
194 |         """
195 |         return _C.CudaFullConnectionCriterion
196 | 
197 |     @staticmethod
198 |     def cpu_impl():
199 |         """
200 |         Get CPU implementation of forward/backward for the criterion
201 |         """
202 |         return _C.CpuFullConnectionCriterion
203 | 
204 |     @classmethod
205 |     def forward(cls, ctx, input, target_size, transitions, scale_mode):
206 |         """
207 |         Forward pass of the criterion.
208 | 
209 |         Parameters:
210 |         -----------
211 |         input: float torch.tensor of the size [Batch, Time, Ntokens]
212 |                (output of the network with scores for all frames and all tokens)
213 |         target: int torch.tensor of the size [Batch, Length]
214 |                (padded target transcription encoded with indices of tokens)
215 |         target_size: int torch.tensor of the size [Batch]
216 |                (original length of each target transcription in the bacth)
217 |         transitions: float torch.tensor of size [Ntokens, Ntokens]
218 |                (transitions matrix for ASG loss function,
219 |                 scores of moving from state of token_i to token_j)
220 |         scale_mode: int, scaling factor of the output, possible values
221 |                   NONE = 0,
222 |                   INPUT_SZ = 1,
223 |                   INPUT_SZ_SQRT = 2,
224 |                   TARGET_SZ = 3,
225 |                   TARGET_SZ_SQRT = 4,
226 |         """
227 |         B = input.size(0)
228 |         T = input.size(1)
229 |         N = input.size(2)
230 |         device = input.device
231 | 
232 |         input_float = check_tensor(input, [B, T, N], torch.float, device)
233 |         if scale_mode != _C.CriterionScaleMode.NONE:
234 |             target_size = check_tensor(target_size, [B], torch.int, device)
235 |         transitions_float = check_tensor(transitions, [N, N], torch.float, device)
236 | 
237 |         loss = torch.empty(B, dtype=torch.float, device=device)
238 |         workspace = create_workspace(cls, device, B, T, N)
239 |         run_forward(
240 |             cls,
241 |             device,
242 |             B,
243 |             T,
244 |             N,
245 |             scale_mode,
246 |             get_data_ptr_as_bytes(input_float),
247 |             get_data_ptr_as_bytes(target_size),
248 |             get_data_ptr_as_bytes(transitions_float),
249 |             get_data_ptr_as_bytes(loss),
250 |             get_data_ptr_as_bytes(workspace),
251 |         )
252 |         ctx.save_for_backward(input, transitions, workspace)
253 |         return loss.to(input)
254 | 
255 |     @classmethod
256 |     def backward(cls, ctx, grad):
257 |         input, transitions, workspace = ctx.saved_tensors
258 |         B = input.size(0)
259 |         T = input.size(1)
260 |         N = input.size(2)
261 |         device = input.device
262 | 
263 |         grad = check_tensor(grad, [B], torch.float, device)
264 | 
265 |         input_grad = torch.empty(B, T, N, dtype=torch.float, device=device)
266 |         transitions_grad = torch.empty(N, N, dtype=torch.float, device=device)
267 |         run_backward(
268 |             cls,
269 |             device,
270 |             B,
271 |             T,
272 |             N,
273 |             get_data_ptr_as_bytes(transitions),
274 |             get_data_ptr_as_bytes(grad),
275 |             get_data_ptr_as_bytes(input_grad),
276 |             get_data_ptr_as_bytes(transitions_grad),
277 |             get_data_ptr_as_bytes(workspace),
278 |         )
279 |         return input_grad.to(input), None, transitions_grad.to(transitions), None
280 | 
281 | 
282 | class ASGLoss(nn.Module):
283 |     def __init__(self, N, scale_mode=_C.CriterionScaleMode.NONE):
284 |         """
285 |         ASG loss implementation. It is similar to CTC, but there is no blanks.
286 |         There are also repetitions like ann -> an1 and transition matrix of scores
287 |         from token_i to token_j.
288 | 
289 |         Parameters:
290 |         -----------
291 |         N: int, number of tokens to predict for each frame
292 |         scale_mode: int, scaling factor of the loss function, possible values
293 |                   NONE = 0,
294 |                   INPUT_SZ = 1,
295 |                   INPUT_SZ_SQRT = 2,
296 |                   TARGET_SZ = 3,
297 |                   TARGET_SZ_SQRT = 4,
298 |         """
299 |         super().__init__()
300 |         self.transitions = nn.Parameter(
301 |             torch.zeros(N, N, dtype=torch.float, requires_grad=True)
302 |         )
303 |         self.scale_mode = scale_mode
304 | 
305 |     def forward(self, input, target, target_size):
306 |         """
307 |         Forward pass of the ASG loss.
308 | 
309 |         Parameters:
310 |         -----------
311 |         input: float torch.tensor of the size [Batch, Time, Ntokens]
312 |                (output of the network with scores for all frames and all tokens)
313 |         target: int torch.tensor of the size [Batch, Length]
314 |                (padded target transcription encoded with indices of tokens)
315 |         target_size: int torch.tensor of the size [Batch]
316 |                (original length of each target transcription in the bacth)
317 | 
318 |         """
319 |         return FCCFunction.apply(
320 |             input, target_size, self.transitions, self.scale_mode
321 |         ) - FACFunction.apply(
322 |             input, target, target_size, self.transitions, self.scale_mode
323 |         )
324 | 


--------------------------------------------------------------------------------
/bindings/python/test/test_import.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Facebook, Inc. and its affiliates.
 3 | This source code is licensed under the MIT-style license found in the
 4 | LICENSE file in the root directory of this source tree.
 5 | """
 6 | 
 7 | import logging
 8 | import os
 9 | import unittest
10 | 
11 | 
12 | class ImportTestCase(unittest.TestCase):
13 |     def test_import_lib_sequence(self) -> None:
14 |         from flashlight.lib.sequence import criterion
15 |         from flashlight.lib.sequence.criterion import (
16 |             CpuForceAlignmentCriterion,
17 |             CpuFullConnectionCriterion,
18 |             CpuViterbiPath,
19 |             CriterionScaleMode,
20 |         )
21 | 
22 |         if os.getenv("USE_CUDA", "OFF").upper() not in [
23 |             "OFF",
24 |             "0",
25 |             "NO",
26 |             "FALSE",
27 |             "N",
28 |         ]:
29 |             from flashlight.lib.sequence.flashlight_lib_sequence_criterion import (
30 |                 CudaForceAlignmentCriterion,
31 |                 CudaFullConnectionCriterion,
32 |                 CudaViterbiPath,
33 |             )
34 |         else:
35 |             logging.info("Flashlight Sequence bindings built without CUDA")
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     unittest.main()
40 | 


--------------------------------------------------------------------------------
/bindings/python/version.txt:
--------------------------------------------------------------------------------
1 | 0.0.0
2 | 


--------------------------------------------------------------------------------
/cmake/BuildGoogleTest.cmake:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | include(FetchContent)
 4 | 
 5 | set(gtest_URL https://github.com/google/googletest.git)
 6 | set(gtest_TAG v1.13.0)
 7 | 
 8 | FetchContent_Declare(
 9 |   googletest
10 |   GIT_REPOSITORY ${gtest_URL}
11 |   GIT_TAG        ${gtest_TAG}
12 | )
13 | 
14 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) # for Windows
15 | FetchContent_MakeAvailable(googletest)
16 | 


--------------------------------------------------------------------------------
/cmake/Buildcub.cmake:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | include(FetchContent)
 4 | 
 5 | FetchContent_Declare(
 6 |   cub
 7 |   GIT_REPOSITORY https://github.com/NVIDIA/cub.git
 8 |   # guaranteed to work with CUDA < 11, where it isn't bundled
 9 |   GIT_TAG        1.8.0
10 |   )
11 | 
12 | FetchContent_MakeAvailable(cub)
13 | set(cub_INCLUDE_DIRS ${cub_SOURCE_DIR})
14 | 


--------------------------------------------------------------------------------
/cmake/Buildpybind11.cmake:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | include(FetchContent)
 4 | 
 5 | set(pybind11_URL https://github.com/pybind/pybind11.git)
 6 | set(pybind11_TAG v2.10.3)
 7 | 
 8 | FetchContent_Declare(
 9 |     pybind11
10 |     GIT_REPOSITORY ${pybind11_URL}
11 |     GIT_TAG ${pybind11_TAG}
12 | )
13 | 
14 | FetchContent_MakeAvailable(pybind11)
15 | 


--------------------------------------------------------------------------------
/cmake/FindFilesystem.cmake:
--------------------------------------------------------------------------------
  1 | # Taken from https://gitlab.kitware.com/cmake/cmake/-/issues/17834, with modifications
  2 | 
  3 | #[=======================================================================[.rst:
  4 | 
  5 | FindFilesystem
  6 | ##############
  7 | 
  8 | This module supports the C++17 standard library's filesystem utilities. Use the
  9 | :imp-target:`std::filesystem` imported target to
 10 | 
 11 | Options
 12 | *******
 13 | 
 14 | The ``COMPONENTS`` argument to this module supports the following values:
 15 | 
 16 | .. find-component:: Experimental
 17 |     :name: fs.Experimental
 18 | 
 19 |     Allows the module to find the "experimental" Filesystem TS version of the
 20 |     Filesystem library. This is the library that should be used with the
 21 |     ``std::experimental::filesystem`` namespace.
 22 | 
 23 | .. find-component:: Final
 24 |     :name: fs.Final
 25 | 
 26 |     Finds the final C++17 standard version of the filesystem library.
 27 | 
 28 | If no components are provided, behaves as if the
 29 | :find-component:`fs.Final` component was specified.
 30 | 
 31 | If both :find-component:`fs.Experimental` and :find-component:`fs.Final` are
 32 | provided, first looks for ``Final``, and falls back to ``Experimental`` in case
 33 | of failure. If ``Final`` is found, :imp-target:`std::filesystem` and all
 34 | :ref:`variables <fs.variables>` will refer to the ``Final`` version.
 35 | 
 36 | 
 37 | Imported Targets
 38 | ****************
 39 | 
 40 | .. imp-target:: std::filesystem
 41 | 
 42 |     The ``std::filesystem`` imported target is defined when any requested
 43 |     version of the C++ filesystem library has been found, whether it is
 44 |     *Experimental* or *Final*.
 45 | 
 46 |     If no version of the filesystem library is available, this target will not
 47 |     be defined.
 48 | 
 49 |     .. note::
 50 |         This target has ``cxx_std_17`` as an ``INTERFACE``
 51 |         :ref:`compile language standard feature <req-lang-standards>`. Linking
 52 |         to this target will automatically enable C++17 if no later standard
 53 |         version is already required on the linking target.
 54 | 
 55 | 
 56 | .. _fs.variables:
 57 | 
 58 | Variables
 59 | *********
 60 | 
 61 | .. variable:: CXX_FILESYSTEM_IS_EXPERIMENTAL
 62 | 
 63 |     Set to ``TRUE`` when the :find-component:`fs.Experimental` version of C++
 64 |     filesystem library was found, otherwise ``FALSE``.
 65 | 
 66 | .. variable:: CXX_FILESYSTEM_HAVE_FS
 67 | 
 68 |     Set to ``TRUE`` when a filesystem header was found.
 69 | 
 70 | .. variable:: CXX_FILESYSTEM_HEADER
 71 | 
 72 |     Set to either ``filesystem`` or ``experimental/filesystem`` depending on
 73 |     whether :find-component:`fs.Final` or :find-component:`fs.Experimental` was
 74 |     found.
 75 | 
 76 | .. variable:: CXX_FILESYSTEM_NAMESPACE
 77 | 
 78 |     Set to either ``std::filesystem`` or ``std::experimental::filesystem``
 79 |     depending on whether :find-component:`fs.Final` or
 80 |     :find-component:`fs.Experimental` was found.
 81 | 
 82 | 
 83 | Examples
 84 | ********
 85 | 
 86 | Using `find_package(Filesystem)` with no component arguments:
 87 | 
 88 | .. code-block:: cmake
 89 | 
 90 |     find_package(Filesystem REQUIRED)
 91 | 
 92 |     add_executable(my-program main.cpp)
 93 |     target_link_libraries(my-program PRIVATE std::filesystem)
 94 | 
 95 | 
 96 | #]=======================================================================]
 97 | 
 98 | 
 99 | if(TARGET std::filesystem)
100 |     # This module has already been processed. Don't do it again.
101 |     return()
102 | endif()
103 | 
104 | cmake_minimum_required(VERSION 3.10)
105 | 
106 | include(CMakePushCheckState)
107 | include(CheckIncludeFileCXX)
108 | 
109 | # If we're not cross-compiling, try to run test executables.
110 | # Otherwise, assume that compile + link is a sufficient check.
111 | if(CMAKE_CROSSCOMPILING)
112 |     include(CheckCXXSourceCompiles)
113 |     macro(_cmcm_check_cxx_source code var)
114 |         check_cxx_source_compiles("${code}" ${var})
115 |     endmacro()
116 | else()
117 |     include(CheckCXXSourceRuns)
118 |     macro(_cmcm_check_cxx_source code var)
119 |         check_cxx_source_runs("${code}" ${var})
120 |     endmacro()
121 | endif()
122 | 
123 | cmake_push_check_state()
124 | 
125 | set(CMAKE_REQUIRED_QUIET ${Filesystem_FIND_QUIETLY})
126 | 
127 | # All of our tests required C++17 or later
128 | set(CMAKE_CXX_STANDARD 17)
129 | 
130 | # Normalize and check the component list we were given
131 | set(want_components ${Filesystem_FIND_COMPONENTS})
132 | if(Filesystem_FIND_COMPONENTS STREQUAL "")
133 |     set(want_components Final)
134 | endif()
135 | 
136 | # Warn on any unrecognized components
137 | set(extra_components ${want_components})
138 | list(REMOVE_ITEM extra_components Final Experimental)
139 | foreach(component IN LISTS extra_components)
140 |     message(WARNING "Extraneous find_package component for Filesystem: ${component}")
141 | endforeach()
142 | 
143 | # Detect which of Experimental and Final we should look for
144 | set(find_experimental TRUE)
145 | set(find_final TRUE)
146 | if(NOT "Final" IN_LIST want_components)
147 |     set(find_final FALSE)
148 | endif()
149 | if(NOT "Experimental" IN_LIST want_components)
150 |     set(find_experimental FALSE)
151 | endif()
152 | 
153 | if(find_final)
154 |     check_include_file_cxx("filesystem" _CXX_FILESYSTEM_HAVE_HEADER)
155 |     mark_as_advanced(_CXX_FILESYSTEM_HAVE_HEADER)
156 |     if(_CXX_FILESYSTEM_HAVE_HEADER)
157 |         # We found the non-experimental header. Don't bother looking for the
158 |         # experimental one.
159 |         set(find_experimental FALSE)
160 |     endif()
161 | else()
162 |     set(_CXX_FILESYSTEM_HAVE_HEADER FALSE)
163 | endif()
164 | 
165 | if(find_experimental)
166 |     check_include_file_cxx("experimental/filesystem" _CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER)
167 |     mark_as_advanced(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER)
168 | else()
169 |     set(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER FALSE)
170 | endif()
171 | 
172 | if(_CXX_FILESYSTEM_HAVE_HEADER)
173 |     set(_have_fs TRUE)
174 |     set(_fs_header filesystem)
175 |     set(_fs_namespace std::filesystem)
176 |     set(_is_experimental FALSE)
177 | elseif(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER)
178 |     set(_have_fs TRUE)
179 |     set(_fs_header experimental/filesystem)
180 |     set(_fs_namespace std::experimental::filesystem)
181 |     set(_is_experimental TRUE)
182 | else()
183 |     set(_have_fs FALSE)
184 | endif()
185 | 
186 | set(CXX_FILESYSTEM_HAVE_FS ${_have_fs} CACHE BOOL "TRUE if we have the C++ filesystem headers")
187 | set(CXX_FILESYSTEM_HEADER ${_fs_header} CACHE STRING "The header that should be included to obtain the filesystem APIs")
188 | set(CXX_FILESYSTEM_NAMESPACE ${_fs_namespace} CACHE STRING "The C++ namespace that contains the filesystem APIs")
189 | set(CXX_FILESYSTEM_IS_EXPERIMENTAL ${_is_experimental} CACHE BOOL "TRUE if the C++ filesystem library is the experimental version")
190 | 
191 | set(_found FALSE)
192 | 
193 | if(CXX_FILESYSTEM_HAVE_FS)
194 |     # We have some filesystem library available. Do link checks
195 |     string(CONFIGURE [[
196 |         #include <cstdlib>
197 |         #include <@CXX_FILESYSTEM_HEADER@>
198 | 
199 |         int main() {
200 |             auto cwd = @CXX_FILESYSTEM_NAMESPACE@::current_path();
201 |             printf("%s", cwd.c_str());
202 |             return EXIT_SUCCESS;
203 |         }
204 |     ]] code @ONLY)
205 | 
206 |     # Check a simple filesystem program without any linker flags
207 |     _cmcm_check_cxx_source("${code}" CXX_FILESYSTEM_NO_LINK_NEEDED)
208 | 
209 |     set(can_link ${CXX_FILESYSTEM_NO_LINK_NEEDED})
210 | 
211 |     if(NOT CXX_FILESYSTEM_NO_LINK_NEEDED)
212 |         set(prev_libraries ${CMAKE_REQUIRED_LIBRARIES})
213 |         # Add the libstdc++ flag
214 |         set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lstdc++fs)
215 |         _cmcm_check_cxx_source("${code}" CXX_FILESYSTEM_STDCPPFS_NEEDED)
216 |         set(can_link ${CXX_FILESYSTEM_STDCPPFS_NEEDED})
217 |         if(NOT CXX_FILESYSTEM_STDCPPFS_NEEDED)
218 |             # Try the libc++ flag
219 |             set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lc++fs)
220 |             _cmcm_check_cxx_source("${code}" CXX_FILESYSTEM_CPPFS_NEEDED)
221 |             set(can_link ${CXX_FILESYSTEM_CPPFS_NEEDED})
222 |         endif()
223 |     endif()
224 | 
225 |     if(can_link)
226 |         add_library(std::filesystem INTERFACE IMPORTED)
227 |         # TODO: re-enable this once requiring CUDA >= 11/an nvcc version that works with C++ 17
228 |         # set_property(TARGET std::filesystem APPEND PROPERTY INTERFACE_COMPILE_FEATURES cxx_std_17)
229 |         set(_found TRUE)
230 | 
231 |         if(CXX_FILESYSTEM_NO_LINK_NEEDED)
232 |             # Nothing to add...
233 |         elseif(CXX_FILESYSTEM_STDCPPFS_NEEDED)
234 |             set_property(TARGET std::filesystem APPEND PROPERTY INTERFACE_LINK_LIBRARIES -lstdc++fs)
235 |         elseif(CXX_FILESYSTEM_CPPFS_NEEDED)
236 |             set_property(TARGET std::filesystem APPEND PROPERTY INTERFACE_LINK_LIBRARIES -lc++fs)
237 |         endif()
238 |     endif()
239 | endif()
240 | 
241 | cmake_pop_check_state()
242 | 
243 | set(Filesystem_FOUND ${_found} CACHE BOOL "TRUE if we can run a program using std::filesystem" FORCE)
244 | 
245 | if(Filesystem_FIND_REQUIRED AND NOT Filesystem_FOUND)
246 |     message(FATAL_ERROR "Cannot run simple program using std::filesystem")
247 | endif()
248 | 


--------------------------------------------------------------------------------
/cmake/FindGMock.cmake:
--------------------------------------------------------------------------------
 1 | # Find gmock
 2 | #
 3 | #  GMOCK_INCLUDE_DIRS - where to find gmock/gmock.h, etc.
 4 | #  GMOCK_LIBRARIES   - List of libraries when using gmock.
 5 | #  GMOCK_FOUND       - True if gmock found.
 6 | 
 7 | if (GMOCK_INCLUDE_DIRS)
 8 |   # Already in cache, be silent
 9 |   set(GMOCK_FIND_QUIETLY TRUE)
10 | endif()
11 | 
12 | find_package(GMock CONFIG)
13 | if (NOT TARGET GTest::gmock)
14 |   if (NOT GMOCK_ROOT)
15 |     set(GMOCK_ROOT ENV{GMOCK_ROOT})
16 |   endif()
17 | 
18 |   find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h PATHS ${GMOCK_ROOT})
19 |   find_library(GMOCK_MAIN_LIBRARY NAMES gmock_main PATHS ${GMOCK_ROOT})
20 |   find_library(GMOCK_LIBRARIES NAMES gmock PATHS ${GMOCK_ROOT})
21 | 
22 |   set(GMOCK_BOTH_LIBRARIES
23 |     ${GMOCK_MAIN_LIBRARY}
24 |     ${GMOCK_LIBRARIES}
25 |     )
26 | 
27 |   # handle the QUIETLY and REQUIRED arguments and set GMOCK_FOUND to TRUE if
28 |   # all listed variables are TRUE
29 |   include(FindPackageHandleStandardArgs)
30 |   find_package_handle_standard_args(
31 |     GMock
32 |     DEFAULT_MSG
33 |     GMOCK_MAIN_LIBRARY
34 |     GMOCK_LIBRARIES
35 |     GMOCK_LIBRARIES
36 |     GMOCK_INCLUDE_DIRS
37 |     )
38 | 
39 |   mark_as_advanced(
40 |     GMOCK_MAIN_LIBRARY
41 |     GMOCK_LIBRARIES
42 |     LIBGTEST_LIBRARY
43 |     GMOCK_LIBRARIES
44 |     GMOCK_INCLUDE_DIRS
45 |     )
46 | 
47 |   add_library(GTest::gmock UNKNOWN IMPORTED)
48 |   set_target_properties(GTest::gmock PROPERTIES
49 |     INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIRS}
50 |     IMPORTED_LOCATION ${GMOCK_LIBRARIES}
51 |     )
52 | 
53 |   add_library(GTest::gmock_main UNKNOWN IMPORTED)
54 |   set_target_properties(GTest::gmock_main PROPERTIES
55 |     INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIRS}
56 |     IMPORTED_LOCATION ${GMOCK_MAIN_LIBRARY}
57 |     )
58 | endif()
59 | 


--------------------------------------------------------------------------------
/cmake/InternalUtils.cmake:
--------------------------------------------------------------------------------
 1 | function(fl_sequence_add_coverage_to_target)
 2 |   set(oneValueArgs TARGET)
 3 |   cmake_parse_arguments(add_coverage_to_target "${options}" "${oneValueArgs}"
 4 |     "${multiValueArgs}" ${ARGN})
 5 | 
 6 |   if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
 7 |     # Add required flags (GCC & LLVM/Clang)
 8 |     target_compile_options(${add_coverage_to_target_TARGET} PUBLIC
 9 |       -O0 # TODO: reconcile this with CMake modes for something cleaner
10 |       -g
11 |       $<$<COMPILE_LANGUAGE:CXX>:--coverage>
12 |       )
13 |     if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.13)
14 |       target_link_options(${add_coverage_to_target_TARGET}
15 |         PUBLIC
16 |         $<$<COMPILE_LANGUAGE:CXX>:--coverage>)
17 |     else()
18 |       target_link_libraries(${add_coverage_to_target_TARGET}
19 |         PUBLIC
20 |         --coverage)
21 |     endif()
22 |   endif()
23 | endfunction(fl_sequence_add_coverage_to_target)
24 | 
25 | function(fl_sequence_setup_install_targets)
26 |   set(multiValueArgs INSTALL_TARGETS INSTALL_HEADERS)
27 |   cmake_parse_arguments(setup_install_targets "${options}" "${oneValueArgs}"
28 |     "${multiValueArgs}" ${ARGN})
29 | 
30 |   list(LENGTH setup_install_targets_INSTALL_TARGETS TARGETS_LENGTH)
31 |   if (${TARGETS_LENGTH} EQUAL 0)
32 |     message(FATAL_ERROR "setup_install_targets called with "
33 |       "empty targets list.")
34 |   endif()
35 | 
36 |   # Main target
37 |   install(
38 |     TARGETS ${setup_install_targets_INSTALL_TARGETS}
39 |     EXPORT flashlight-sequence-targets
40 |     COMPONENT flashlight-sequence
41 |     PUBLIC_HEADER DESTINATION fl
42 |     RUNTIME DESTINATION ${FL_INSTALL_BIN_DIR}
43 |     LIBRARY DESTINATION ${FL_INSTALL_LIB_DIR}
44 |     ARCHIVE DESTINATION ${FL_INSTALL_LIB_DIR}
45 |     FRAMEWORK DESTINATION framework
46 |     INCLUDES DESTINATION ${FL_INSTALL_INC_DIR}
47 |     )
48 | 
49 |   # Write and install targets file
50 |   install(
51 |     EXPORT flashlight-sequence-targets
52 |     NAMESPACE flashlight::
53 |     DESTINATION ${FL_INSTALL_CMAKE_DIR}
54 |     COMPONENT flashlight-sequence
55 |     )
56 | 
57 |   # Write config file (used by projects including fl, such as examples)
58 |   include(CMakePackageConfigHelpers)
59 |   set(INCLUDE_DIRS include)
60 |   set(CMAKE_DIR ${FL_INSTALL_CMAKE_DIR})
61 |   configure_package_config_file(
62 |     ${PROJECT_SOURCE_DIR}/cmake/flashlight-sequence-config.cmake.in
63 |     cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlight-sequence-config.cmake
64 |     INSTALL_DESTINATION
65 |     ${FL_INSTALL_CMAKE_DIR}
66 |     PATH_VARS INCLUDE_DIRS CMAKE_DIR
67 |     )
68 |   write_basic_package_version_file(
69 |     cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlight-sequence-config-version.cmake
70 |     COMPATIBILITY SameMajorVersion
71 |     )
72 |   install(FILES
73 |     ${PROJECT_BINARY_DIR}/cmake/install/flashlight-sequence-config.cmake
74 |     ${PROJECT_BINARY_DIR}/cmake/install/flashlight-sequence-config-version.cmake
75 |     DESTINATION ${FL_INSTALL_CMAKE_DIR}
76 |     COMPONENT flashlight-sequence
77 |     )
78 |   set_target_properties(${setup_install_targets_INSTALL_TARGETS} PROPERTIES
79 |     VERSION "${flashlight-sequence_VERSION}"
80 |     SOVERSION "${flashlight-sequence_VERSION_MAJOR}")
81 | endfunction(fl_sequence_setup_install_targets)
82 | 


--------------------------------------------------------------------------------
/cmake/TestUtils.cmake:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | set(GTEST_TARGETS "")
 4 | 
 5 | # Get or find Google Test and Google Mock
 6 | find_package(GTest 1.12.1)
 7 | if (NOT GTEST_FOUND)
 8 |   if (NOT TARGET gtest)
 9 |     message(STATUS "googletest not found - will download and build from source")
10 |     # Download, build, and find the resulting googletest
11 |     include(${PROJECT_SOURCE_DIR}/cmake/BuildGoogleTest.cmake)
12 |     list(APPEND GTEST_TARGETS GTest::gtest GTest::gtest_main GTest::gmock GTest::gmock_main)
13 |   endif()
14 | else()
15 |   message(STATUS "gtest found: (include: ${GTEST_INCLUDE_DIRS}, lib: ${GTEST_BOTH_LIBRARIES}")
16 |   if (TARGET GTest::GTest)
17 |     # We found the differently-named CMake targets from FindGTest
18 |     if (NOT TARGET GTest::Main)
19 |       message(FATAL_ERROR "Google Test must be built with main")
20 |     endif()
21 |     # TODO: these targets are deprecated in CMake 3.20
22 |     list(APPEND GTEST_TARGETS GTest::GTest GTest::Main)
23 |   endif()
24 |   if (NOT TARGET GTest::gmock)
25 |     find_package(GMock REQUIRED)
26 |     message(STATUS "gmock found: (include: ${GMOCK_INCLUDE_DIRS}, lib: ${GMOCK_BOTH_LIBRARIES})")
27 |   endif()
28 |   list(APPEND GTEST_TARGETS GTest::gmock GTest::gmock_main)
29 |   message(STATUS "Found gtest and gmock on system.")
30 | endif()
31 | 
32 | include(GoogleTest)
33 | find_package(Threads REQUIRED)
34 | 
35 | function(build_test)
36 |   set(options)
37 |   set(oneValueArgs SRC)
38 |   set(multiValueArgs LIBS PREPROC)
39 |   cmake_parse_arguments(build_test "${options}" "${oneValueArgs}"
40 |     "${multiValueArgs}" ${ARGN})
41 | 
42 |   get_filename_component(src_name ${build_test_SRC} NAME_WE)
43 |   set(target "${src_name}")
44 |   add_executable(${target} ${build_test_SRC})
45 |   if (TARGET gtest)
46 |     add_dependencies(${target} gtest) # make sure gtest is built first
47 |   endif()
48 |   target_link_libraries(
49 |     ${target}
50 |     PUBLIC
51 |     ${GTEST_TARGETS}
52 |     ${build_test_LIBS}
53 |      ${CMAKE_THREAD_LIBS_INIT}
54 |     )
55 |   target_include_directories(
56 |     ${target}
57 |     PUBLIC
58 |     ${PROJECT_SOURCE_DIR}
59 |     )
60 |   target_compile_definitions(
61 |     ${target}
62 |     PUBLIC
63 |     ${build_test_PREPROC}
64 |     )
65 |   gtest_add_tests(TARGET ${target})
66 | endfunction(build_test)
67 | 


--------------------------------------------------------------------------------
/cmake/flashlight-sequence-config.cmake.in:
--------------------------------------------------------------------------------
 1 | # flashlight-sequence
 2 | #
 3 | # IMPORTED targets
 4 | # ^^^^^^^^^^^^^^^^
 5 | #
 6 | # Configuration file for flashlight-sequence. Provides the following
 7 | # `IMPORTED` targets:
 8 | #
 9 | # ``flashlight::flashlight-sequence``
10 | #   The flashlight-sequence library.
11 | #
12 | # The above targets can be linked with your build using ``target_link_library``.
13 | # Example:
14 | #
15 | #   add_executable(myexecutable mySource.cpp)
16 | #   target_link_library(myexecutable PRIVATE flashlight::flashlight-sequence)
17 | #
18 | # The above properly links flashlight-sequence with myexecutable. No call to
19 | # ``target_include_directories`` is required.
20 | #
21 | 
22 | # Dependencies
23 | include(CMakeFindDependencyMacro)
24 | if (@FL_SEQUENCE_USE_OPENMP@)
25 |   find_dependency(OpenMP)
26 | endif()
27 | if (@FL_SEQUENCE_USE_CUDA@)
28 |   # TODO: use FindCUDAToolkit after requiring CMake >= 3.17
29 |   enable_language(CUDA)
30 | endif()
31 | # Config variables
32 | set(FL_SEQUENCE_USE_OPENMP @FL_SEQUENCE_USE_OPENMP@)
33 | set(FL_SEQUENCE_USE_CUDA @FL_SEQUENCE_USE_CUDA@)
34 | 
35 | ################################################################################
36 | 
37 | @PACKAGE_INIT@
38 | 
39 | # Add IMPORTED targets
40 | if(NOT TARGET flashlight::flashlight-sequence)
41 |   if(EXISTS @PACKAGE_CMAKE_DIR@/flashlight-sequence-targets.cmake)
42 |     include(@PACKAGE_CMAKE_DIR@/flashlight-sequence-targets.cmake)
43 |   endif()
44 | endif()
45 | 
46 | # For legacy configurations
47 | set(flashlight-sequence_LIBRARIES flashlight::flashlight-sequence)
48 | if (EXISTS @PACKAGE_INCLUDE_DIRS@)
49 |   set(flashlight-sequence_INCLUDE_DIRS @PACKAGE_INCLUDE_DIRS@)
50 | endif()
51 | set(flashlight-sequence_FOUND 1)
52 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 | coverage:
3 |   status:
4 |     project:
5 |       default:
6 |         threshold: 0.25%
7 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.16)
2 | 
3 | include(${CMAKE_CURRENT_LIST_DIR}/criterion/CMakeLists.txt)
4 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/Defines.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #if defined(_WIN32) || defined(_MSC_VER)
11 | 
12 | #ifdef FL_SEQ_DLL
13 | #define FL_SEQ_API __declspec(dllexport)
14 | #else // FL_SEQ_DLL
15 | #define FL_SEQ_API __declspec(dllimport)
16 | #endif // FL_SEQ_DLL
17 | 
18 | #define FL_SEQ_DEPRECATED(msg) __declspec(deprecated(msg)
19 | 
20 | #else // defined(_WIN32) || defined(_MSC_VER)
21 | 
22 | #define FL_SEQ_API __attribute__((visibility("default")))
23 | #define FL_SEQ_DEPRECATED(msg) __attribute__((deprecated(msg)))
24 | 
25 | #endif // defined(_WIN32) || defined(_MSC_VER)
26 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | if (FL_SEQUENCE_USE_OPENMP)
 4 |   find_package(OpenMP REQUIRED)
 5 |   target_link_libraries(flashlight-sequence PRIVATE OpenMP::OpenMP_CXX)
 6 | endif()
 7 | 
 8 | target_sources(
 9 |   flashlight-sequence
10 |   PRIVATE
11 |   ${CMAKE_CURRENT_LIST_DIR}/cpu/CriterionUtils.cpp
12 |   ${CMAKE_CURRENT_LIST_DIR}/cpu/ForceAlignmentCriterion.cpp
13 |   ${CMAKE_CURRENT_LIST_DIR}/cpu/ConnectionistTemporalClassificationCriterion.cpp
14 |   ${CMAKE_CURRENT_LIST_DIR}/cpu/FullConnectionCriterion.cpp
15 |   ${CMAKE_CURRENT_LIST_DIR}/cpu/ViterbiPath.cpp
16 |   )
17 | 
18 | if (FL_SEQUENCE_USE_CUDA)
19 |   # cub is required for BlockReduce and not bundled with CUDA < 11
20 |   find_path(cub_INCLUDE_DIRS
21 |     NAMES cub.cuh
22 |     PATH_SUFFIXES cub include
23 |     PATHS ${cub_BASE_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
24 |     ENV ${cub_BASE_DIR})
25 |   if (NOT cub_INCLUDE_DIRS)
26 |     if (NOT FL_SEQUENCE_BUILD_STANDALONE)
27 |       message(FATAL_ERROR
28 |         "Required dependency NVIDIA cub not found - try setting cub_BASE_DIR")
29 |     endif()
30 | 
31 |     message(STATUS "NVIDIA cub not found - downloading from source")
32 |     include(${PROJECT_SOURCE_DIR}/cmake/Buildcub.cmake)
33 |     target_include_directories(flashlight-sequence PRIVATE ${cub_INCLUDE_DIRS})
34 |   else()
35 |     message(STATUS "NVIDIA cub found: (include: ${cub_INCLUDE_DIRS})")
36 |   endif()
37 | 
38 |   target_sources(
39 |     flashlight-sequence
40 |     PRIVATE
41 |     ${CMAKE_CURRENT_LIST_DIR}/cuda/CriterionUtils.cu
42 |     ${CMAKE_CURRENT_LIST_DIR}/cuda/ForceAlignmentCriterion.cu
43 |     ${CMAKE_CURRENT_LIST_DIR}/cuda/FullConnectionCriterion.cu
44 |     ${CMAKE_CURRENT_LIST_DIR}/cuda/ViterbiPath.cu
45 |     )
46 | 
47 |   target_include_directories(
48 |     flashlight-sequence
49 |     PUBLIC
50 |     $<BUILD_INTERFACE:${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}>
51 |     )
52 | endif()
53 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/Defines.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | namespace fl {
11 | namespace lib {
12 | namespace seq {
13 | 
14 | enum class CriterionScaleMode {
15 |   NONE = 0,
16 |   INPUT_SZ = 1,
17 |   INPUT_SZ_SQRT = 2,
18 |   TARGET_SZ = 3,
19 |   TARGET_SZ_SQRT = 4,
20 | };
21 | 
22 | } // namespace seq
23 | } // namespace lib
24 | } // namespace fl
25 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/Workspace.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cstddef>
11 | #include <cstdint>
12 | 
13 | namespace fl {
14 | namespace lib {
15 | namespace seq {
16 | 
17 | /**
18 |  * Partitions a contiguous block of memory into aligned arrays.
19 |  * Can be used for either host or device memory.
20 |  *
21 |  * Usage: first create Workspace(nullptr), request() some arrays, then call
22 |  * requiredSize(). Next, allocate memory of that size. Finally, create
23 |  * Workspace(ptr) and request() the same sequence of arrays.
24 |  */
25 | template <size_t Alignment = 256>
26 | class Workspace {
27 |  public:
28 |   explicit Workspace(void* workspacePtr)
29 |       : workspacePtr_(reinterpret_cast<uintptr_t>(workspacePtr)), offset_(0) {
30 |     align();
31 |   }
32 | 
33 |   template <class T>
34 |   T* request(size_t s0, size_t s1 = 1, size_t s2 = 1, size_t s3 = 1) {
35 |     align();
36 |     auto p = reinterpret_cast<T*>(workspacePtr_ + offset_);
37 |     offset_ += sizeof(T) * s0 * s1 * s2 * s3;
38 |     return p;
39 |   }
40 | 
41 |   template <class T>
42 |   void request(T** p, size_t s0, size_t s1 = 1, size_t s2 = 1, size_t s3 = 1) {
43 |     *p = request<T>(s0, s1, s2, s3);
44 |   }
45 | 
46 |   size_t requiredSize() const {
47 |     // Add extra bytes in case the initial `workspacePtr` isn't aligned
48 |     return offset_ + Alignment - 1;
49 |   }
50 | 
51 |  private:
52 |   void align() {
53 |     // Pad until `workspacePtr_ + offset_` is a multiple of `Alignment`
54 |     offset_ +=
55 |         Alignment - 1 - (workspacePtr_ + offset_ + Alignment - 1) % Alignment;
56 |   }
57 | 
58 |   const uintptr_t workspacePtr_;
59 |   size_t offset_;
60 | };
61 | 
62 | } // namespace seq
63 | } // namespace lib
64 | } // namespace fl
65 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.h"
  9 | 
 10 | #include <cmath>
 11 | #include <limits>
 12 | 
 13 | #include "flashlight/lib/sequence/criterion/Workspace.h"
 14 | 
 15 | namespace {
 16 | 
 17 | template <class Float>
 18 | struct WorkspacePtrs {
 19 |   WorkspacePtrs(void* workspace, int B, int T, int /* N unused */, int L) {
 20 |     const int s = (2 * L) + 1;
 21 |     fl::lib::seq::Workspace<> ws(workspace);
 22 |     ws.request(&alpha, B, T, s);
 23 |     ws.request(&s_inc, B, s);
 24 |     ws.request(&e_inc, B, s);
 25 |     ws.request(&backptr, B, T, s);
 26 |     ws.request(&labels_w_blanks, B, s);
 27 |     requiredSize = ws.requiredSize();
 28 |   }
 29 | 
 30 |   Float* alpha;
 31 |   int* s_inc;
 32 |   int* e_inc;
 33 |   int* labels_w_blanks;
 34 |   int* backptr;
 35 |   size_t requiredSize;
 36 | };
 37 | 
 38 | /*
 39 |  * Derived from warpctc/include/detail/cpu_ctc.h
 40 |  * Creates labels_w_blanks which adds blank_labels between each character in
 41 |  * labels.
 42 |  * s_inc and e_inc are used by the `compute_alphas` kernel to determine the
 43 |  * furthest starting and end label location that each time step could possibly
 44 |  * be.
 45 |  */
 46 | int setup_labels(
 47 |     const int* labels,
 48 |     int* s_inc,
 49 |     int* e_inc,
 50 |     int* labels_w_blanks,
 51 |     int blank_label,
 52 |     int L,
 53 |     int S) {
 54 |   int e_counter = 0;
 55 |   int s_counter = 0;
 56 | 
 57 |   s_inc[s_counter++] = 1;
 58 | 
 59 |   int repeats = 0;
 60 | 
 61 |   for (int i = 1; i < L; ++i) {
 62 |     if (labels[i - 1] == labels[i]) {
 63 |       s_inc[s_counter++] = 1;
 64 |       s_inc[s_counter++] = 1;
 65 |       e_inc[e_counter++] = 1;
 66 |       e_inc[e_counter++] = 1;
 67 |       ++repeats;
 68 |     } else {
 69 |       s_inc[s_counter++] = 2;
 70 |       e_inc[e_counter++] = 2;
 71 |     }
 72 |   }
 73 |   e_inc[e_counter++] = 1;
 74 | 
 75 |   for (int i = 0; i < L; ++i) {
 76 |     labels_w_blanks[2 * i] = blank_label;
 77 |     labels_w_blanks[2 * i + 1] = labels[i];
 78 |   }
 79 |   labels_w_blanks[S - 1] = blank_label;
 80 | 
 81 |   return repeats;
 82 | }
 83 | 
 84 | /*
 85 |  * Derived from warpctc/include/detail/cpu_ctc.h
 86 |  * Float can be either float or double
 87 |  */
 88 | template <class Float>
 89 | void compute_alphas(
 90 |     const Float* input,
 91 |     int repeats,
 92 |     int S,
 93 |     int T,
 94 |     int N,
 95 |     const int* const e_inc,
 96 |     const int* const s_inc,
 97 |     const int* const labels,
 98 |     Float* alphas,
 99 |     int* backptr,
100 |     int* paths) {
101 |   const int blank_label_idx = N - 1;
102 |   int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1;
103 | 
104 |   for (int i = 0; i < S * T; i++) {
105 |     alphas[i] = -std::numeric_limits<Float>::infinity();
106 |   }
107 | 
108 |   for (int i = start; i < end; ++i) {
109 |     alphas[i] = input[labels[i]];
110 |   }
111 | 
112 |   // Iterate through each time frame
113 |   for (int t = 1; t < T; ++t) {
114 |     // Calculate the smallest and largest possible index of the target that this
115 |     // time could be
116 |     int remain = (S / 2) + repeats - (T - t);
117 |     if (remain >= 0) {
118 |       start += s_inc[remain];
119 |     }
120 |     if (t <= (S / 2) + repeats) {
121 |       end += e_inc[t - 1];
122 |     }
123 |     int startloop = start;
124 |     int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * N;
125 | 
126 |     if (start == 0) {
127 |       alphas[idx1] = alphas[idx2] + input[blank_label_idx + idx3];
128 |       backptr[idx1] = 0;
129 |       startloop += 1;
130 |     }
131 | 
132 |     for (int i = startloop; i < end; ++i) {
133 |       Float x0 = alphas[i + idx2];
134 |       Float x1 = alphas[(i - 1) + idx2];
135 |       Float x2 = -std::numeric_limits<Float>::infinity();
136 | 
137 |       // In CTC, the optimal path may optionally chose to skip a blank label.
138 |       // x2 represents skipping a letter, and can only happen if we're not
139 |       // currently on a blank_label, and we're not on a repeat letter
140 |       // (i != 1) just ensures we don't access labels[i - 2] if its i < 2
141 |       if (labels[i] != blank_label_idx && i != 1 &&
142 |           labels[i] != labels[i - 2]) {
143 |         x2 = alphas[(i - 2) + idx2];
144 |       }
145 |       Float result = 0.0;
146 |       if (x2 > x1 && x2 > x0) {
147 |         result = x2;
148 |         backptr[i + idx1] = 2;
149 |       } else if (x1 > x0 && x1 > x2) {
150 |         result = x1;
151 |         backptr[i + idx1] = 1;
152 |       } else {
153 |         result = x0;
154 |         backptr[i + idx1] = 0;
155 |       }
156 |       alphas[i + idx1] = result + input[labels[i] + idx3];
157 |     }
158 |   }
159 | 
160 |   int ltrIdx = alphas[T * S - 1] > alphas[T * S - 2] ? S - 1 : S - 2;
161 |   for (int t = T - 1; t >= 0; t--) {
162 |     paths[t] = labels[ltrIdx];
163 |     ltrIdx -= backptr[(t * S) + ltrIdx];
164 |   }
165 | }
166 | 
167 | } // namespace
168 | 
169 | namespace fl {
170 | namespace lib {
171 | namespace cpu {
172 | 
173 | template <class Float>
174 | size_t ConnectionistTemporalClassificationCriterion<Float>::getWorkspaceSize(
175 |     int B,
176 |     int T,
177 |     int N,
178 |     int L) {
179 |   WorkspacePtrs<Float> dummy(nullptr, B, T, N, L);
180 |   return dummy.requiredSize;
181 | }
182 | 
183 | template <class Float>
184 | void ConnectionistTemporalClassificationCriterion<Float>::viterbi(
185 |     int B,
186 |     int T,
187 |     int N,
188 |     int _L,
189 |     const Float* _input,
190 |     const int* _target,
191 |     const int* targetSize,
192 |     int* bestPaths,
193 |     void* workspace) {
194 |   const int _S = (2 * _L) + 1;
195 |   const int blank_label = N - 1;
196 |   WorkspacePtrs<Float> ws(workspace, B, T, N, _L);
197 |   for (auto b = 0; b < B; b++) {
198 |     auto L = targetSize[b];
199 |     auto S = (2 * L) + 1;
200 |     int repeats = setup_labels(
201 |         _target + (b * _L),
202 |         ws.s_inc + (b * _S),
203 |         ws.e_inc + (b * _S),
204 |         ws.labels_w_blanks + (b * _S),
205 |         blank_label,
206 |         L,
207 |         S);
208 |     compute_alphas(
209 |         _input + (b * N * T),
210 |         repeats,
211 |         S,
212 |         T,
213 |         N,
214 |         ws.e_inc + b * _S,
215 |         ws.s_inc + b * _S,
216 |         ws.labels_w_blanks + b * _S,
217 |         ws.alpha + (b * _S * T),
218 |         ws.backptr + (b * _S * T),
219 |         bestPaths + (b * T));
220 |   }
221 | }
222 | 
223 | template struct ConnectionistTemporalClassificationCriterion<float>;
224 | template struct ConnectionistTemporalClassificationCriterion<double>;
225 | 
226 | } // namespace cpu
227 | } // namespace lib
228 | } // namespace fl
229 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | #pragma once
 8 | 
 9 | #include <cstddef>
10 | 
11 | #include "flashlight/lib/sequence/Defines.h"
12 | 
13 | namespace fl {
14 | namespace lib {
15 | namespace cpu {
16 | 
17 | template <class Float>
18 | struct FL_SEQ_API ConnectionistTemporalClassificationCriterion {
19 |   static size_t getWorkspaceSize(int B, int T, int N, int L);
20 | 
21 |   static void viterbi(
22 |       int B,
23 |       int T,
24 |       int N,
25 |       int L,
26 |       const Float* input,
27 |       const int* target,
28 |       const int* targetSize,
29 |       int* bestPaths,
30 |       void* workspace);
31 | };
32 | } // namespace cpu
33 | } // namespace lib
34 | } // namespace fl
35 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/CriterionUtils.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #include "flashlight/lib/sequence/criterion/cpu/CriterionUtils.h"
 9 | 
10 | #include <algorithm>
11 | #include <cmath>
12 | #include <vector>
13 | 
14 | #include "flashlight/lib/sequence/criterion/Defines.h"
15 | 
16 | namespace fl {
17 | namespace lib {
18 | namespace cpu {
19 | 
20 | template <class Float>
21 | void CriterionUtils<Float>::batchTargetSize(
22 |     int B,
23 |     int L,
24 |     int maxSize,
25 |     const int* target,
26 |     int* targetSize) {
27 |   for (int b = 0; b < B; ++b) {
28 |     for (int i = L - 1; i >= 0; --i) {
29 |       if (target[b * L + i] >= 0) {
30 |         targetSize[b] = std::min(maxSize, i + 1);
31 |         break;
32 |       }
33 |     }
34 |   }
35 | }
36 | 
37 | template <class Float>
38 | void CriterionUtils<Float>::computeScale(
39 |     int B,
40 |     int T,
41 |     int /* N */,
42 |     CriterionScaleMode scaleMode,
43 |     const int* targetSize,
44 |     Float* scale) {
45 |   for (int b = 0; b < B; ++b) {
46 |     switch (scaleMode) {
47 |       case CriterionScaleMode::NONE:
48 |         scale[b] = 1.0;
49 |         break;
50 |       case CriterionScaleMode::INPUT_SZ:
51 |         scale[b] = T > 0 ? 1.0 / T : 1.0;
52 |         break;
53 |       case CriterionScaleMode::INPUT_SZ_SQRT:
54 |         scale[b] = T > 0 ? std::sqrt(1.0 / T) : 1.0;
55 |         break;
56 |       case CriterionScaleMode::TARGET_SZ:
57 |         scale[b] = targetSize[b] > 0 ? 1.0 / targetSize[b] : 1.0;
58 |         break;
59 |       case CriterionScaleMode::TARGET_SZ_SQRT:
60 |         scale[b] = targetSize[b] > 0 ? std::sqrt(1.0 / targetSize[b]) : 1.0;
61 |         break;
62 |       default:
63 |         break;
64 |     }
65 |   }
66 | }
67 | 
68 | template struct CriterionUtils<float>;
69 | template struct CriterionUtils<double>;
70 | 
71 | } // namespace cpu
72 | } // namespace lib
73 | } // namespace fl
74 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/CriterionUtils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cstring>
11 | 
12 | #include "flashlight/lib/sequence//criterion/Defines.h"
13 | #include "flashlight/lib/sequence/Defines.h"
14 | 
15 | using fl::lib::seq::CriterionScaleMode;
16 | 
17 | namespace fl {
18 | namespace lib {
19 | namespace cpu {
20 | 
21 | /// Check CUDA header for docs.
22 | template <class Float>
23 | struct FL_SEQ_API CriterionUtils {
24 |   static void batchTargetSize(
25 |       int B,
26 |       int L,
27 |       int maxSize,
28 |       const int* target,
29 |       int* targetSize);
30 | 
31 |   static void computeScale(
32 |       int B,
33 |       int T,
34 |       int N,
35 |       CriterionScaleMode scaleMode,
36 |       const int* targetSize,
37 |       Float* scale);
38 | };
39 | 
40 | /// Zeroes `count * sizeof(T)` device bytes
41 | template <typename T>
42 | void setZero(T* ptr, size_t count) {
43 |   std::memset(ptr, 0, count * sizeof(T));
44 | }
45 | 
46 | } // namespace cpu
47 | } // namespace lib
48 | } // namespace fl
49 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.h"
  9 | 
 10 | #include <algorithm>
 11 | #include <cmath>
 12 | 
 13 | #include "flashlight/lib/sequence/criterion/Workspace.h"
 14 | #include "flashlight/lib/sequence/criterion/cpu/CriterionUtils.h"
 15 | 
 16 | namespace {
 17 | 
 18 | template <class Float>
 19 | struct WorkspacePtrs {
 20 |   WorkspacePtrs(void* workspace, int B, int T, int N, int L) {
 21 |     fl::lib::seq::Workspace<> ws(workspace);
 22 |     ws.request(&scale, B);
 23 |     ws.request(&alpha, B, T, L);
 24 |     ws.request(&alphaGrad, B, T, L);
 25 |     ws.request(&transBatchGrad, B, N, N);
 26 |     ws.request(&transBuf1, B, L);
 27 |     ws.request(&transBuf2, B, L);
 28 |     ws.request(&transBufGrad1, B, L);
 29 |     ws.request(&transBufGrad2, B, L);
 30 |     requiredSize = ws.requiredSize();
 31 |   }
 32 | 
 33 |   Float* scale;
 34 |   double* alpha;
 35 |   double* alphaGrad;
 36 |   Float* transBatchGrad;
 37 |   Float* transBuf1;
 38 |   Float* transBuf2;
 39 |   Float* transBufGrad1;
 40 |   Float* transBufGrad2;
 41 |   size_t requiredSize;
 42 | };
 43 | 
 44 | } // namespace
 45 | 
 46 | namespace fl {
 47 | namespace lib {
 48 | namespace cpu {
 49 | 
 50 | template <class Float>
 51 | size_t
 52 | ForceAlignmentCriterion<Float>::getWorkspaceSize(int B, int T, int N, int L) {
 53 |   WorkspacePtrs<Float> dummy(nullptr, B, T, N, L);
 54 |   return dummy.requiredSize;
 55 | }
 56 | 
 57 | template <class Float>
 58 | void ForceAlignmentCriterion<Float>::forward(
 59 |     int B,
 60 |     int T,
 61 |     int N,
 62 |     int _L,
 63 |     CriterionScaleMode scaleMode,
 64 |     const Float* _input,
 65 |     const int* _target,
 66 |     const int* targetSize,
 67 |     const Float* trans,
 68 |     Float* loss,
 69 |     void* workspace) {
 70 |   WorkspacePtrs<Float> ws(workspace, B, T, N, _L);
 71 |   CriterionUtils<Float>::computeScale(B, T, N, scaleMode, targetSize, ws.scale);
 72 | 
 73 | #pragma omp parallel for num_threads(B)
 74 |   for (int b = 0; b < B; ++b) {
 75 |     auto* alpha = &ws.alpha[b * T * _L];
 76 |     auto* input = &_input[b * T * N];
 77 |     auto* target = &_target[b * _L];
 78 |     auto* transBuf1 = &ws.transBuf1[b * _L];
 79 |     auto* transBuf2 = &ws.transBuf2[b * _L];
 80 |     int L = targetSize[b];
 81 | 
 82 |     alpha[0] = input[target[0]];
 83 | 
 84 |     for (int i = 0; i < L; ++i) {
 85 |       transBuf1[i] = trans[target[i] * N + target[i]];
 86 |       transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0;
 87 |     }
 88 | 
 89 |     for (int t = 1; t < T; ++t) {
 90 |       auto* inputCur = &input[t * N];
 91 |       auto* alphaPrev = &alpha[(t - 1) * L];
 92 |       auto* alphaCur = &alpha[t * L];
 93 | 
 94 |       int high = t < L ? t : L;
 95 |       int low = T - t < L ? L - (T - t) : 1;
 96 | 
 97 |       if (T - t >= L) {
 98 |         alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]];
 99 |       }
100 | 
101 |       if (t < L) {
102 |         alphaCur[high] =
103 |             alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]];
104 |       }
105 | 
106 |       for (int i = low; i < high; ++i) {
107 |         double s1 = alphaPrev[i] + transBuf1[i];
108 |         double s2 = alphaPrev[i - 1] + transBuf2[i];
109 |         // lse = logSumExp(s1, s2)
110 |         double lse =
111 |             s1 < s2 ? s2 + log1p(exp(s1 - s2)) : s1 + log1p(exp(s2 - s1));
112 |         alphaCur[i] = lse + inputCur[target[i]];
113 |       }
114 |     }
115 | 
116 |     loss[b] = alpha[T * L - 1] * ws.scale[b];
117 |   }
118 | }
119 | 
120 | template <class Float>
121 | void ForceAlignmentCriterion<Float>::backward(
122 |     int B,
123 |     int T,
124 |     int N,
125 |     int _L,
126 |     const int* _target,
127 |     const int* targetSize,
128 |     const Float* grad,
129 |     Float* _inputGrad,
130 |     Float* transGrad,
131 |     void* workspace) {
132 |   WorkspacePtrs<Float> ws(workspace, B, T, N, _L);
133 |   setZero(_inputGrad, B * T * N);
134 |   setZero(transGrad, N * N);
135 |   setZero(ws.alphaGrad, B * T * _L);
136 |   setZero(ws.transBatchGrad, B * N * N);
137 |   setZero(ws.transBufGrad1, B * _L);
138 |   setZero(ws.transBufGrad2, B * _L);
139 | 
140 | #pragma omp parallel for num_threads(B)
141 |   for (int b = 0; b < B; ++b) {
142 |     auto* alpha = &ws.alpha[b * T * _L];
143 |     auto* alphaGrad = &ws.alphaGrad[b * T * _L];
144 |     auto* inputGrad = &_inputGrad[b * T * N];
145 |     auto* target = &_target[b * _L];
146 |     auto* transBatchGrad = &ws.transBatchGrad[b * N * N];
147 |     auto* transBuf1 = &ws.transBuf1[b * _L];
148 |     auto* transBuf2 = &ws.transBuf2[b * _L];
149 |     auto* transBufGrad1 = &ws.transBufGrad1[b * _L];
150 |     auto* transBufGrad2 = &ws.transBufGrad2[b * _L];
151 |     int L = targetSize[b];
152 | 
153 |     alphaGrad[T * L - 1] = 1;
154 | 
155 |     for (int t = T - 1; t > 0; --t) {
156 |       auto* inputCurGrad = &inputGrad[t * N];
157 |       auto* alphaPrev = &alpha[(t - 1) * L];
158 |       auto* alphaCurGrad = &alphaGrad[t * L];
159 |       auto* alphaPrevGrad = &alphaGrad[(t - 1) * L];
160 | 
161 |       int high = t < L ? t : L;
162 |       int low = T - t < L ? L - (T - t) : 1;
163 | 
164 |       int high1 = t < L ? t + 1 : L;
165 |       int low1 = T - t < L ? L - (T - t) : 0;
166 | 
167 |       for (int i = low1; i < high1; ++i) {
168 |         inputCurGrad[target[i]] += alphaCurGrad[i];
169 |       }
170 | 
171 |       if (T - t >= L) {
172 |         alphaPrevGrad[0] += alphaCurGrad[0];
173 |         transBufGrad1[0] += alphaCurGrad[0];
174 |       }
175 | 
176 |       if (t < L) {
177 |         alphaPrevGrad[high - 1] += alphaCurGrad[high];
178 |         transBufGrad2[high] += alphaCurGrad[high];
179 |       }
180 | 
181 |       for (int i = low; i < high; ++i) {
182 |         double s1 = alphaPrev[i] + transBuf1[i];
183 |         double s2 = alphaPrev[i - 1] + transBuf2[i];
184 |         // d1, d2 = dLogSumExp(s1, s2)
185 |         double d1, d2;
186 |         if (s1 < s2) {
187 |           d2 = 1 / (1 + exp(s1 - s2));
188 |           d1 = 1 - d2;
189 |         } else {
190 |           d1 = 1 / (1 + exp(s2 - s1));
191 |           d2 = 1 - d1;
192 |         }
193 |         alphaPrevGrad[i] += d1 * alphaCurGrad[i];
194 |         alphaPrevGrad[i - 1] += d2 * alphaCurGrad[i];
195 |         transBufGrad1[i] += d1 * alphaCurGrad[i];
196 |         transBufGrad2[i] += d2 * alphaCurGrad[i];
197 |       }
198 |     }
199 | 
200 |     inputGrad[target[0]] += alphaGrad[0];
201 |     auto gradScale = grad[b] * ws.scale[b];
202 |     for (int i = 0; i < T * N; ++i) {
203 |       inputGrad[i] *= gradScale;
204 |     }
205 | 
206 |     for (int i = 0; i < L; ++i) {
207 |       transBatchGrad[target[i] * N + target[i]] += transBufGrad1[i];
208 |       if (i > 0) {
209 |         transBatchGrad[target[i] * N + target[i - 1]] += transBufGrad2[i];
210 |       }
211 |     }
212 |   }
213 | 
214 |   for (int b = 0; b < B; ++b) {
215 |     auto transBatchGrad = ws.transBatchGrad + b * N * N;
216 |     auto gradScale = grad[b] * ws.scale[b];
217 |     for (int i = 0; i < N * N; ++i) {
218 |       transGrad[i] += gradScale * transBatchGrad[i];
219 |     }
220 |   }
221 | }
222 | 
223 | template <class Float>
224 | void ForceAlignmentCriterion<Float>::viterbi(
225 |     int B,
226 |     int T,
227 |     int N,
228 |     int _L,
229 |     const Float* _input,
230 |     const int* _target,
231 |     const int* targetSize,
232 |     const Float* trans,
233 |     int* bestPaths,
234 |     void* workspace) {
235 |   WorkspacePtrs<Float> ws(workspace, B, T, N, _L);
236 | 
237 | #pragma omp parallel for num_threads(B)
238 |   for (int b = 0; b < B; ++b) {
239 |     double* alpha = &ws.alpha[b * T * _L];
240 |     const Float* input = &_input[b * T * N];
241 |     const int* target = &_target[b * _L];
242 |     Float* transBuf1 = &ws.transBuf1[b * _L];
243 |     Float* transBuf2 = &ws.transBuf2[b * _L];
244 |     int L = targetSize[b];
245 |     for (int i = 0; i < L * T; i++) {
246 |       alpha[i] = -std::numeric_limits<Float>::infinity();
247 |     }
248 | 
249 |     alpha[0] = input[target[0]];
250 | 
251 |     for (int i = 0; i < L; ++i) {
252 |       transBuf1[i] = trans[target[i] * N + target[i]];
253 |       transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0;
254 |     }
255 | 
256 |     for (int t = 1; t < T; ++t) {
257 |       const Float* inputCur = &input[t * N];
258 |       double* alphaPrev = &alpha[(t - 1) * L];
259 |       double* alphaCur = &alpha[t * L];
260 | 
261 |       int high = t < L ? t : L;
262 |       int low = T - t < L ? L - (T - t) : 1;
263 | 
264 |       // Handle edge cases.
265 |       // If (T - t >= L), then we can conceivably still be at the initial blank
266 |       if (T - t >= L) {
267 |         alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]];
268 |       }
269 | 
270 |       // If (t < L), then the highest position can only be be computed
271 |       // by transitioning. (We couldn't have been at position `high`
272 |       // at the previous timestep).
273 |       if (t < L) {
274 |         alphaCur[high] =
275 |             alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]];
276 |       }
277 | 
278 |       for (int i = low; i < high; ++i) {
279 |         double s1 = alphaPrev[i] + transBuf1[i];
280 |         double s2 = alphaPrev[i - 1] + transBuf2[i];
281 |         alphaCur[i] = inputCur[target[i]] + fmax(s1, s2);
282 |       }
283 |     }
284 | 
285 |     auto ltrIdx = L - 1;
286 |     int* bestPath = bestPaths + b * T;
287 |     for (auto t = T - 1; t > 0; t--) {
288 |       bestPath[t] = target[ltrIdx];
289 |       auto* alphaPrev = &alpha[(t - 1) * L];
290 |       if (ltrIdx > 0) {
291 |         double s1 = alphaPrev[ltrIdx] + transBuf1[ltrIdx];
292 |         double s2 = alphaPrev[ltrIdx - 1] + transBuf2[ltrIdx];
293 |         if (s2 > s1) {
294 |           ltrIdx--;
295 |         }
296 |       }
297 |     }
298 |     bestPath[0] = target[ltrIdx];
299 |   }
300 | }
301 | 
302 | template struct ForceAlignmentCriterion<float>;
303 | template struct ForceAlignmentCriterion<double>;
304 | 
305 | } // namespace cpu
306 | } // namespace lib
307 | } // namespace fl
308 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cstddef>
11 | 
12 | #include "flashlight/lib/sequence/Defines.h"
13 | #include "flashlight/lib/sequence/criterion/Defines.h"
14 | 
15 | using fl::lib::seq::CriterionScaleMode;
16 | 
17 | namespace fl {
18 | namespace lib {
19 | namespace cpu {
20 | 
21 | /// Check CUDA header for docs.
22 | template <class Float>
23 | struct FL_SEQ_API ForceAlignmentCriterion {
24 |   static size_t getWorkspaceSize(int B, int T, int N, int L);
25 | 
26 |   static void forward(
27 |       int B,
28 |       int T,
29 |       int N,
30 |       int L,
31 |       CriterionScaleMode scaleMode,
32 |       const Float* input,
33 |       const int* target,
34 |       const int* targetSize,
35 |       const Float* trans,
36 |       Float* loss,
37 |       void* workspace);
38 | 
39 |   static void backward(
40 |       int B,
41 |       int T,
42 |       int N,
43 |       int L,
44 |       const int* target,
45 |       const int* targetSize,
46 |       const Float* grad,
47 |       Float* inputGrad,
48 |       Float* transGrad,
49 |       void* workspace);
50 | 
51 |   static void viterbi(
52 |       int B,
53 |       int T,
54 |       int N,
55 |       int L,
56 |       const Float* input,
57 |       const int* target,
58 |       const int* targetSize,
59 |       const Float* trans,
60 |       int* bestPaths,
61 |       void* workspace);
62 | };
63 | 
64 | } // namespace cpu
65 | } // namespace lib
66 | } // namespace fl
67 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.h"
  9 | 
 10 | #include <cmath>
 11 | 
 12 | #include "flashlight/lib/sequence/criterion/Workspace.h"
 13 | #include "flashlight/lib/sequence/criterion/cpu/CriterionUtils.h"
 14 | 
 15 | namespace {
 16 | 
 17 | template <class Float>
 18 | struct WorkspacePtrs {
 19 |   explicit WorkspacePtrs(void* workspace, int B, int T, int N) {
 20 |     fl::lib::seq::Workspace<> ws(workspace);
 21 |     ws.request(&scale, B);
 22 |     ws.request(&alpha, B, T, N);
 23 |     ws.request(&alphaGrad, B, T, N);
 24 |     ws.request(&transBatchGrad, B, N, N);
 25 |     ws.request(&transBuf, B, N, N);
 26 |     requiredSize = ws.requiredSize();
 27 |   }
 28 | 
 29 |   Float* scale;
 30 |   double* alpha;
 31 |   double* alphaGrad;
 32 |   double* transBatchGrad;
 33 |   double* transBuf;
 34 |   size_t requiredSize;
 35 | };
 36 | 
 37 | } // namespace
 38 | 
 39 | namespace fl {
 40 | namespace lib {
 41 | namespace cpu {
 42 | 
 43 | template <class Float>
 44 | size_t FullConnectionCriterion<Float>::getWorkspaceSize(int B, int T, int N) {
 45 |   return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize;
 46 | }
 47 | 
 48 | template <class Float>
 49 | void FullConnectionCriterion<Float>::forward(
 50 |     int B,
 51 |     int T,
 52 |     int N,
 53 |     CriterionScaleMode scaleMode,
 54 |     const Float* input,
 55 |     const int* targetSize,
 56 |     const Float* trans,
 57 |     Float* loss,
 58 |     void* workspace) {
 59 |   WorkspacePtrs<Float> ws(workspace, B, T, N);
 60 |   CriterionUtils<Float>::computeScale(B, T, N, scaleMode, targetSize, ws.scale);
 61 | 
 62 | #pragma omp parallel for num_threads(B)
 63 |   for (int b = 0; b < B; ++b) {
 64 |     for (int n = 0; n < N; ++n) {
 65 |       int k = b * T * N + n;
 66 |       ws.alpha[k] = input[k];
 67 |     }
 68 | 
 69 |     for (int t = 1; t <= T; ++t) {
 70 |       for (int m = 0; m < N; ++m) {
 71 |         const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N];
 72 |         const auto* inputCur = &input[b * T * N + t * N];
 73 |         auto* transBuf = &ws.transBuf[b * N * N + m * N];
 74 |         auto* alphaCur = &ws.alpha[b * T * N + t * N];
 75 | 
 76 |         double maxValue = -INFINITY;
 77 |         for (int n = 0; n < N; ++n) {
 78 |           double val = transBuf[n] =
 79 |               alphaPrev[n] + (t == T ? 0 : trans[m * N + n]);
 80 |           maxValue = val > maxValue ? val : maxValue;
 81 |         }
 82 | 
 83 |         double sumValue = 0;
 84 |         for (int n = 0; n < N; ++n) {
 85 |           sumValue += exp(transBuf[n] - maxValue);
 86 |         }
 87 | 
 88 |         if (t == T) {
 89 |           loss[b] = ws.scale[b] * (log(sumValue) + maxValue);
 90 |           break;
 91 |         }
 92 | 
 93 |         alphaCur[m] = log(sumValue) + maxValue + inputCur[m];
 94 |       }
 95 |     }
 96 |   }
 97 | }
 98 | 
 99 | template <class Float>
100 | void FullConnectionCriterion<Float>::backward(
101 |     int B,
102 |     int T,
103 |     int N,
104 |     const Float* trans,
105 |     const Float* grad,
106 |     Float* _inputGrad,
107 |     Float* transGrad,
108 |     void* workspace) {
109 |   WorkspacePtrs<Float> ws(workspace, B, T, N);
110 |   setZero(_inputGrad, B * T * N);
111 |   setZero(transGrad, N * N);
112 |   setZero(ws.alphaGrad, B * T * N);
113 |   setZero(ws.transBatchGrad, B * N * N);
114 | 
115 | #pragma omp parallel for num_threads(B)
116 |   for (int b = 0; b < B; ++b) {
117 |     for (int t = T; t > 0; --t) {
118 |       for (int m = 0; m < N; ++m) {
119 |         const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N];
120 |         const auto* alphaCurGrad = &ws.alphaGrad[b * T * N + t * N];
121 |         auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N];
122 |         auto* transBuf = &ws.transBuf[b * N * N + m * N];
123 |         auto* transBatchGrad = &ws.transBatchGrad[b * N * N + m * N];
124 | 
125 |         double maxValue = -INFINITY;
126 |         for (int n = 0; n < N; ++n) {
127 |           double val = transBuf[n] =
128 |               alphaPrev[n] + (t == T ? 0 : trans[m * N + n]);
129 |           maxValue = val > maxValue ? val : maxValue;
130 |         }
131 | 
132 |         double sumValue = 0;
133 |         for (int n = 0; n < N; ++n) {
134 |           transBuf[n] = exp(transBuf[n] - maxValue);
135 |           sumValue += transBuf[n];
136 |         }
137 | 
138 |         if (t == T) {
139 |           for (int n = 0; n < N; ++n) {
140 |             alphaPrevGrad[n] = transBuf[n] / sumValue;
141 |           }
142 |           break;
143 |         }
144 | 
145 |         for (int n = 0; n < N; ++n) {
146 |           transBuf[n] = transBuf[n] / sumValue * alphaCurGrad[m];
147 |           transBatchGrad[n] += transBuf[n];
148 |         }
149 |       }
150 | 
151 |       if (t == T) {
152 |         continue;
153 |       }
154 | 
155 |       for (int m = 0; m < N; ++m) {
156 |         auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N];
157 | 
158 |         for (int n = 0; n < N; ++n) {
159 |           alphaPrevGrad[m] += ws.transBuf[b * N * N + n * N + m];
160 |         }
161 |       }
162 |     }
163 | 
164 |     auto* alphaGrad = &ws.alphaGrad[b * T * N];
165 |     auto* inputGrad = &_inputGrad[b * T * N];
166 | 
167 |     for (int i = 0; i < T * N; ++i) {
168 |       inputGrad[i] = ws.scale[b] * grad[b] * alphaGrad[i];
169 |     }
170 |   }
171 | 
172 |   for (int b = 0; b < B; ++b) {
173 |     auto* transBatchGrad = &ws.transBatchGrad[b * N * N];
174 | 
175 |     for (int i = 0; i < N * N; ++i) {
176 |       transGrad[i] += ws.scale[b] * grad[b] * transBatchGrad[i];
177 |     }
178 |   }
179 | }
180 | 
181 | template struct FullConnectionCriterion<float>;
182 | template struct FullConnectionCriterion<double>;
183 | 
184 | } // namespace cpu
185 | } // namespace lib
186 | } // namespace fl
187 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cstddef>
11 | 
12 | #include "flashlight/lib/sequence/Defines.h"
13 | #include "flashlight/lib/sequence/criterion/Defines.h"
14 | 
15 | using fl::lib::seq::CriterionScaleMode;
16 | 
17 | namespace fl {
18 | namespace lib {
19 | namespace cpu {
20 | 
21 | /// Check CUDA header for docs.
22 | template <class Float>
23 | struct FL_SEQ_API FullConnectionCriterion {
24 |   static size_t getWorkspaceSize(int B, int T, int N);
25 | 
26 |   static void forward(
27 |       int B,
28 |       int T,
29 |       int N,
30 |       CriterionScaleMode scaleMode,
31 |       const Float* input,
32 |       const int* targetSize,
33 |       const Float* trans,
34 |       Float* loss,
35 |       void* workspace);
36 | 
37 |   static void backward(
38 |       int B,
39 |       int T,
40 |       int N,
41 |       const Float* trans,
42 |       const Float* grad,
43 |       Float* inputGrad,
44 |       Float* transGrad,
45 |       void* workspace);
46 | };
47 | 
48 | } // namespace cpu
49 | } // namespace lib
50 | } // namespace fl
51 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/ViterbiPath.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #include "flashlight/lib/sequence/criterion/cpu/ViterbiPath.h"
 9 | 
10 | #include <cmath>
11 | 
12 | #include "flashlight/lib/sequence/criterion/Workspace.h"
13 | 
14 | namespace {
15 | 
16 | template <class Float>
17 | struct WorkspacePtrs {
18 |   explicit WorkspacePtrs(void* workspace, int B, int T, int N) {
19 |     fl::lib::seq::Workspace<> ws(workspace);
20 |     ws.request(&alpha, B, 2, N);
21 |     ws.request(&beta, B, T, N);
22 |     requiredSize = ws.requiredSize();
23 |   }
24 | 
25 |   Float* alpha;
26 |   int* beta;
27 |   size_t requiredSize;
28 | };
29 | 
30 | } // namespace
31 | 
32 | namespace fl {
33 | namespace lib {
34 | namespace cpu {
35 | 
36 | template <class Float>
37 | size_t ViterbiPath<Float>::getWorkspaceSize(int B, int T, int N) {
38 |   return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize;
39 | }
40 | 
41 | template <class Float>
42 | void ViterbiPath<Float>::compute(
43 |     int B,
44 |     int T,
45 |     int N,
46 |     const Float* input,
47 |     const Float* trans,
48 |     int* _path,
49 |     void* workspace) {
50 |   WorkspacePtrs<Float> ws(workspace, B, T, N);
51 | 
52 | #pragma omp parallel for num_threads(B)
53 |   for (int b = 0; b < B; ++b) {
54 |     for (int n = 0; n < N; ++n) {
55 |       ws.alpha[b * 2 * N + n] = input[b * T * N + n];
56 |     }
57 | 
58 |     for (int t = 1; t <= T; ++t) {
59 |       const auto* alphaPrev = &ws.alpha[b * 2 * N + ((t - 1) % 2) * N];
60 |       const auto* inputCur = &input[b * T * N + t * N];
61 |       auto* alphaCur = &ws.alpha[b * 2 * N + (t % 2) * N];
62 |       auto* betaCur = &ws.beta[b * T * N + t * N];
63 | 
64 |       for (int m = 0; m < N; ++m) {
65 |         int maxIndex = -1;
66 |         Float maxValue = -INFINITY;
67 |         for (int n = 0; n < N; ++n) {
68 |           Float val = alphaPrev[n] + (t == T ? 0 : trans[m * N + n]);
69 |           if (val > maxValue) {
70 |             maxIndex = n;
71 |             maxValue = val;
72 |           }
73 |         }
74 | 
75 |         if (t == T) {
76 |           auto* path = &_path[b * T];
77 |           path[T - 1] = maxIndex;
78 |           for (int s = T - 1; s > 0; --s) {
79 |             path[s - 1] = ws.beta[b * T * N + s * N + path[s]];
80 |           }
81 |           break;
82 |         }
83 | 
84 |         alphaCur[m] = maxValue + inputCur[m];
85 |         betaCur[m] = maxIndex;
86 |       }
87 |     }
88 |   }
89 | }
90 | 
91 | template struct ViterbiPath<float>;
92 | template struct ViterbiPath<double>;
93 | 
94 | } // namespace cpu
95 | } // namespace lib
96 | } // namespace fl
97 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cpu/ViterbiPath.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cstddef>
11 | 
12 | #include "flashlight/lib/sequence/Defines.h"
13 | 
14 | namespace fl {
15 | namespace lib {
16 | namespace cpu {
17 | 
18 | /// Check CUDA header for docs.
19 | template <class Float>
20 | struct FL_SEQ_API ViterbiPath {
21 |   static size_t getWorkspaceSize(int B, int T, int N);
22 | 
23 |   static void compute(
24 |       int B,
25 |       int T,
26 |       int N,
27 |       const Float* input,
28 |       const Float* trans,
29 |       int* path,
30 |       void* workspace);
31 | };
32 | 
33 | } // namespace cpu
34 | } // namespace lib
35 | } // namespace fl
36 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/CriterionUtils.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh"
  9 | 
 10 | #include <algorithm>
 11 | 
 12 | namespace {
 13 | 
 14 | using fl::lib::seq::CriterionScaleMode;
 15 | using namespace fl::lib::seq;
 16 | 
 17 | /*
 18 |  * B thread blocks
 19 |  * 32 threads/block (ideally)
 20 |  */
 21 | __global__ void
 22 | batchTargetSizeKernel(int L, int maxSize, const int* _target, int* targetSize) {
 23 |   auto b = blockIdx.x;
 24 |   auto target = _target + b * L;
 25 | 
 26 |   __shared__ int idx;
 27 | 
 28 |   if (threadIdx.x == 0) {
 29 |     idx = 0;
 30 |   }
 31 | 
 32 |   __syncthreads();
 33 | 
 34 |   for (auto i = L - 1 - threadIdx.x; i >= 0; i -= blockDim.x) {
 35 |     if (target[i] >= 0) {
 36 |       atomicMax(&idx, i + 1);
 37 |       break;
 38 |     }
 39 |   }
 40 | 
 41 |   __syncthreads();
 42 | 
 43 |   if (threadIdx.x == 0) {
 44 |     targetSize[b] = idx < maxSize ? idx : maxSize;
 45 |   }
 46 | }
 47 | 
 48 | /*
 49 |  * 1 thread block
 50 |  * B threads/block (ideally)
 51 |  */
 52 | template <class Float>
 53 | __global__ void computeScaleKernel(
 54 |     int B,
 55 |     int T,
 56 |     int /* N */,
 57 |     CriterionScaleMode scaleMode,
 58 |     const int* targetSize,
 59 |     Float* scale) {
 60 |   for (auto b = threadIdx.x; b < B; b += blockDim.x) {
 61 |     switch (scaleMode) {
 62 |       case CriterionScaleMode::NONE:
 63 |         scale[b] = 1.0;
 64 |         break;
 65 |       case CriterionScaleMode::INPUT_SZ:
 66 |         scale[b] = T > 0 ? 1.0 / T : 1.0;
 67 |         break;
 68 |       case CriterionScaleMode::INPUT_SZ_SQRT:
 69 |         scale[b] = T > 0 ? std::sqrt(1.0 / T) : 1.0;
 70 |         break;
 71 |       case CriterionScaleMode::TARGET_SZ:
 72 |         scale[b] = targetSize[b] > 0 ? 1.0 / targetSize[b] : 1.0;
 73 |         break;
 74 |       case CriterionScaleMode::TARGET_SZ_SQRT:
 75 |         scale[b] = targetSize[b] > 0 ? std::sqrt(1.0 / targetSize[b]) : 1.0;
 76 |         break;
 77 |       default:
 78 |         break;
 79 |     }
 80 |   }
 81 | }
 82 | 
 83 | } // namespace
 84 | 
 85 | namespace fl {
 86 | namespace lib {
 87 | namespace cuda {
 88 | 
 89 | template <class Float>
 90 | void CriterionUtils<Float>::batchTargetSize(
 91 |     int B,
 92 |     int L,
 93 |     int maxSize,
 94 |     const int* target,
 95 |     int* targetSize,
 96 |     cudaStream_t stream) {
 97 |   batchTargetSizeKernel<<<B, 32, 0, stream>>>(L, maxSize, target, targetSize);
 98 | }
 99 | 
100 | template <class Float>
101 | void CriterionUtils<Float>::computeScale(
102 |     int B,
103 |     int T,
104 |     int N,
105 |     CriterionScaleMode scaleMode,
106 |     const int* targetSize,
107 |     Float* scale,
108 |     cudaStream_t stream) {
109 |   int blockSize = std::min(256, (B + 31) / 32 * 32);
110 |   computeScaleKernel<<<1, blockSize, 0, stream>>>(
111 |       B, T, N, scaleMode, targetSize, scale);
112 | }
113 | 
114 | template struct CriterionUtils<float>;
115 | template struct CriterionUtils<double>;
116 | 
117 | } // namespace cuda
118 | } // namespace lib
119 | } // namespace fl
120 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cuda.h>
11 | #include <cuda_runtime.h>
12 | #include <math_constants.h> // for CUDART_INF
13 | #include <cstddef>
14 | 
15 | #include "flashlight/lib/sequence/Defines.h"
16 | #include "flashlight/lib/sequence/criterion/Defines.h"
17 | 
18 | using fl::lib::seq::CriterionScaleMode;
19 | 
20 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
21 | 
22 | /// Double-precision `atomicAdd` backport for compute capability < 6.0
23 | /// From NVIDIA docs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/
24 | static __inline__ __device__ double atomicAdd(double* address, double val) {
25 |   unsigned long long int* address_as_ull = (unsigned long long int*)address;
26 |   unsigned long long int old = *address_as_ull, assumed;
27 | 
28 |   do {
29 |     assumed = old;
30 |     old = atomicCAS(
31 |         address_as_ull,
32 |         assumed,
33 |         __double_as_longlong(val + __longlong_as_double(assumed)));
34 | 
35 |     // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
36 |     // NaN)
37 |   } while (assumed != old);
38 | 
39 |   return __longlong_as_double(old);
40 | }
41 | 
42 | #endif
43 | 
44 | namespace fl {
45 | namespace lib {
46 | namespace cuda {
47 | 
48 | template <class Float>
49 | struct FL_SEQ_API CriterionUtils {
50 |   /**
51 |    * B: batch size
52 |    * L: target size
53 |    * maxSize: target size results are clamped down to this
54 |    * target: [B][L] target labels
55 |    * targetSize: [B] (out) target sizes
56 |    * stream: CUDA stream
57 |    */
58 |   static void batchTargetSize(
59 |       int B,
60 |       int L,
61 |       int maxSize,
62 |       const int* target,
63 |       int* targetSize,
64 |       cudaStream_t stream);
65 | 
66 |   /**
67 |    * B: batch size
68 |    * T: input length
69 |    * N: dictionary size
70 |    * scaleMode: type of size scaling
71 |    * targetSize: [B] target sizes
72 |    * scale: [B] (out) scale factor
73 |    * stream: CUDA stream
74 |    */
75 |   static void computeScale(
76 |       int B,
77 |       int T,
78 |       int N,
79 |       CriterionScaleMode scaleMode,
80 |       const int* targetSize,
81 |       Float* scale,
82 |       cudaStream_t stream);
83 | };
84 | 
85 | /// Zeroes `count * sizeof(T)` device bytes
86 | template <typename T>
87 | void setZero(T* devPtr, size_t count, cudaStream_t stream) {
88 |   cudaMemsetAsync(devPtr, 0, count * sizeof(T), stream);
89 | }
90 | 
91 | } // namespace cuda
92 | } // namespace lib
93 | } // namespace fl
94 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cuh"
  9 | 
 10 | #include <algorithm>
 11 | #include <cmath>
 12 | 
 13 | #include "flashlight/lib/sequence/criterion/Workspace.h"
 14 | #include "flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh"
 15 | using fl::lib::seq::CriterionScaleMode;
 16 | 
 17 | namespace {
 18 | 
 19 | template <class Float>
 20 | struct WorkspacePtrs {
 21 |   explicit WorkspacePtrs(void* workspace, int B, int T, int N, int L) {
 22 |     fl::lib::seq::Workspace<> ws(workspace);
 23 |     ws.request(&scale, B);
 24 |     ws.request(&alpha, B, T, L);
 25 |     ws.request(&alphaGrad, B, T, L);
 26 |     ws.request(&transBatchGrad, B, N, N);
 27 |     ws.request(&transBuf1, B, L);
 28 |     ws.request(&transBuf2, B, L);
 29 |     ws.request(&transBufGrad1, B, L);
 30 |     ws.request(&transBufGrad2, B, L);
 31 |     requiredSize = ws.requiredSize();
 32 |   }
 33 | 
 34 |   Float* scale;
 35 |   double* alpha;
 36 |   double* alphaGrad;
 37 |   Float* transBatchGrad;
 38 |   Float* transBuf1;
 39 |   Float* transBuf2;
 40 |   Float* transBufGrad1;
 41 |   Float* transBufGrad2;
 42 |   size_t requiredSize;
 43 | };
 44 | 
 45 | /*
 46 |  * B thread blocks
 47 |  * L threads/block (ideally)
 48 |  */
 49 | template <class Float>
 50 | __global__ void forwardKernel(
 51 |     int T,
 52 |     int N,
 53 |     int _L,
 54 |     const Float* _input,
 55 |     const int* _target,
 56 |     const int* targetSize,
 57 |     const Float* trans,
 58 |     Float* _loss,
 59 |     WorkspacePtrs<Float> ws) {
 60 |   auto b = blockIdx.x;
 61 |   auto* alpha = &ws.alpha[b * T * _L];
 62 |   auto* input = &_input[b * T * N];
 63 |   auto* target = &_target[b * _L];
 64 |   auto* transBuf1 = &ws.transBuf1[b * _L];
 65 |   auto* transBuf2 = &ws.transBuf2[b * _L];
 66 |   int L = targetSize[b];
 67 | 
 68 |   for (auto i = threadIdx.x; i < L; i += blockDim.x) {
 69 |     alpha[i] = i == 0 ? input[target[0]] : 0;
 70 |     transBuf1[i] = trans[target[i] * N + target[i]];
 71 |     transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0;
 72 |   }
 73 | 
 74 |   for (int t = 1; t < T; ++t) {
 75 |     auto* inputCur = &input[t * N];
 76 |     auto* alphaPrev = &alpha[(t - 1) * L];
 77 |     auto* alphaCur = &alpha[t * L];
 78 | 
 79 |     int high = t < L ? t : L;
 80 |     int low = T - t < L ? L - (T - t) : 1;
 81 | 
 82 |     __syncthreads();
 83 | 
 84 |     if (threadIdx.x == 0) {
 85 |       if (T - t >= L) {
 86 |         alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]];
 87 |       }
 88 |     } else if (threadIdx.x == 1) {
 89 |       if (t < L) {
 90 |         alphaCur[high] =
 91 |             alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]];
 92 |       }
 93 |     }
 94 | 
 95 |     for (auto i = low + threadIdx.x; i < high; i += blockDim.x) {
 96 |       double s1 = alphaPrev[i] + transBuf1[i];
 97 |       double s2 = alphaPrev[i - 1] + transBuf2[i];
 98 |       // lse = logSumExp(s1, s2)
 99 |       double lse =
100 |           s1 < s2 ? s2 + log(1 + exp(s1 - s2)) : s1 + log(1 + exp(s2 - s1));
101 |       alphaCur[i] = lse + inputCur[target[i]];
102 |     }
103 |   }
104 | 
105 |   __syncthreads();
106 | 
107 |   if (threadIdx.x == 0) {
108 |     _loss[b] = alpha[T * L - 1] * ws.scale[b];
109 |   }
110 | }
111 | 
112 | /*
113 |  * B thread blocks
114 |  * L threads/block (ideally)
115 |  */
116 | template <class Float>
117 | __global__ void backwardKernel(
118 |     int T,
119 |     int N,
120 |     int _L,
121 |     const int* _target,
122 |     const int* targetSize,
123 |     const Float* grad,
124 |     Float* _inputGrad,
125 |     Float* transGrad,
126 |     WorkspacePtrs<Float> ws) {
127 |   auto b = blockIdx.x;
128 |   auto* alpha = &ws.alpha[b * T * _L];
129 |   auto* alphaGrad = &ws.alphaGrad[b * T * _L];
130 |   auto* inputGrad = &_inputGrad[b * T * N];
131 |   auto* target = &_target[b * _L];
132 |   auto* transBatchGrad = &ws.transBatchGrad[b * N * N];
133 |   auto* transBuf1 = &ws.transBuf1[b * _L];
134 |   auto* transBuf2 = &ws.transBuf2[b * _L];
135 |   auto* transBufGrad1 = &ws.transBufGrad1[b * _L];
136 |   auto* transBufGrad2 = &ws.transBufGrad2[b * _L];
137 |   int L = targetSize[b];
138 | 
139 |   if (threadIdx.x == 0) {
140 |     alphaGrad[T * L - 1] = 1;
141 |   }
142 | 
143 |   for (int t = T - 1; t > 0; --t) {
144 |     auto* inputCurGrad = &inputGrad[t * N];
145 |     auto* alphaPrev = &alpha[(t - 1) * L];
146 |     auto* alphaCurGrad = &alphaGrad[t * L];
147 |     auto* alphaPrevGrad = &alphaGrad[(t - 1) * L];
148 | 
149 |     int high = t < L ? t : L;
150 |     int low = T - t < L ? L - (T - t) : 1;
151 | 
152 |     int high1 = t < L ? t + 1 : L;
153 |     int low1 = T - t < L ? L - (T - t) : 0;
154 | 
155 |     __syncthreads();
156 | 
157 |     for (auto i = low1 + threadIdx.x; i < high1; i += blockDim.x) {
158 |       atomicAdd(&inputCurGrad[target[i]], alphaCurGrad[i]);
159 |     }
160 | 
161 |     if (threadIdx.x == 0) {
162 |       if (T - t >= L) {
163 |         atomicAdd(&alphaPrevGrad[0], alphaCurGrad[0]);
164 |         transBufGrad1[0] += alphaCurGrad[0];
165 |       }
166 |     } else if (threadIdx.x == 1) {
167 |       if (t < L) {
168 |         atomicAdd(&alphaPrevGrad[high - 1], alphaCurGrad[high]);
169 |         transBufGrad2[high] += alphaCurGrad[high];
170 |       }
171 |     }
172 | 
173 |     for (auto i = low + threadIdx.x; i < high; i += blockDim.x) {
174 |       double s1 = alphaPrev[i] + transBuf1[i];
175 |       double s2 = alphaPrev[i - 1] + transBuf2[i];
176 |       // d1, d2 = dLogSumExp(s1, s2)
177 |       double d1, d2;
178 |       if (s1 < s2) {
179 |         d2 = 1 / (1 + exp(s1 - s2));
180 |         d1 = 1 - d2;
181 |       } else {
182 |         d1 = 1 / (1 + exp(s2 - s1));
183 |         d2 = 1 - d1;
184 |       }
185 |       atomicAdd(&alphaPrevGrad[i], d1 * alphaCurGrad[i]);
186 |       atomicAdd(&alphaPrevGrad[i - 1], d2 * alphaCurGrad[i]);
187 |       transBufGrad1[i] += d1 * alphaCurGrad[i];
188 |       transBufGrad2[i] += d2 * alphaCurGrad[i];
189 |     }
190 |   }
191 | 
192 |   __syncthreads();
193 | 
194 |   __shared__ Float gradScale;
195 | 
196 |   if (threadIdx.x == 0) {
197 |     inputGrad[target[0]] += alphaGrad[0];
198 |     gradScale = grad[b] * ws.scale[b];
199 |   }
200 | 
201 |   for (auto i = threadIdx.x; i < L; i += blockDim.x) {
202 |     atomicAdd(&transBatchGrad[target[i] * N + target[i]], transBufGrad1[i]);
203 |     if (i > 0) {
204 |       atomicAdd(
205 |           &transBatchGrad[target[i] * N + target[i - 1]], transBufGrad2[i]);
206 |     }
207 |   }
208 | 
209 |   __syncthreads();
210 | 
211 |   for (auto i = threadIdx.x; i < T * N; i += blockDim.x) {
212 |     inputGrad[i] *= gradScale;
213 |   }
214 | 
215 |   for (auto i = threadIdx.x; i < N * N; i += blockDim.x) {
216 |     atomicAdd(&transGrad[i], gradScale * transBatchGrad[i]);
217 |   }
218 | }
219 | 
220 | template <class Float>
221 | __global__ void viterbiPathKernel(
222 |     int T,
223 |     int N,
224 |     int _L,
225 |     const Float* _input,
226 |     const int* _target,
227 |     const int* targetSize,
228 |     const Float* trans,
229 |     int* bestPaths,
230 |     WorkspacePtrs<Float> ws) {
231 |   auto b = blockIdx.x;
232 |   auto* alpha = &ws.alpha[b * T * _L];
233 |   auto* input = &_input[b * T * N];
234 |   auto* target = &_target[b * _L];
235 |   auto* transBuf1 = &ws.transBuf1[b * _L];
236 |   auto* transBuf2 = &ws.transBuf2[b * _L];
237 |   int L = targetSize[b];
238 | 
239 |   for (auto i = threadIdx.x; i < L * T; i += blockDim.x) {
240 |     alpha[i] = i == 0 ? input[target[0]] : -CUDART_INF_F;
241 |   }
242 | 
243 |   for (auto i = threadIdx.x; i < L; i += blockDim.x) {
244 |     transBuf1[i] = trans[target[i] * N + target[i]];
245 |     transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0;
246 |   }
247 |   if (L > T || L == 0) {
248 |     return;
249 |   }
250 | 
251 |   for (int t = 1; t < T; ++t) {
252 |     auto* inputCur = &input[t * N];
253 |     auto* alphaPrev = &alpha[(t - 1) * L];
254 |     auto* alphaCur = &alpha[t * L];
255 | 
256 |     int high = t < L ? t : L;
257 |     int low = T - t < L ? L - (T - t) : 1;
258 | 
259 |     // Ensure that all previous alphas have been computed
260 |     __syncthreads();
261 | 
262 |     if (threadIdx.x == 0) {
263 |       if (T - t >= L) {
264 |         alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]];
265 |       }
266 |     } else if (threadIdx.x == 1) {
267 |       if (t < L) {
268 |         alphaCur[high] =
269 |             alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]];
270 |       }
271 |     }
272 | 
273 |     for (auto i = low + threadIdx.x; i < high; i += blockDim.x) {
274 |       double s1 = alphaPrev[i] + transBuf1[i];
275 |       double s2 = alphaPrev[i - 1] + transBuf2[i];
276 |       alphaCur[i] = inputCur[target[i]] + max(s1, s2);
277 |     }
278 |   }
279 |   // Ensure all threads are finished and alphas have been computed before
280 |   // computing backward path
281 |   __syncthreads();
282 |   if (threadIdx.x == 0) {
283 |     int ltrIdx = L - 1;
284 |     for (int t = T - 1; t > 0; t--) {
285 |       bestPaths[t + (b * T)] = target[ltrIdx];
286 |       auto* alphaPrev = &alpha[(t - 1) * L];
287 |       if (ltrIdx > 0) {
288 |         double s1 = alphaPrev[ltrIdx] + transBuf1[ltrIdx];
289 |         double s2 = alphaPrev[ltrIdx - 1] + transBuf2[ltrIdx];
290 |         if (s2 > s1) {
291 |           ltrIdx--;
292 |         }
293 |       }
294 |     }
295 |     bestPaths[b * T] = target[ltrIdx];
296 |   }
297 | }
298 | 
299 | } // namespace
300 | 
301 | namespace fl {
302 | namespace lib {
303 | namespace cuda {
304 | 
305 | template <class Float>
306 | size_t
307 | ForceAlignmentCriterion<Float>::getWorkspaceSize(int B, int T, int N, int L) {
308 |   return WorkspacePtrs<Float>(nullptr, B, T, N, L).requiredSize;
309 | }
310 | 
311 | template <class Float>
312 | void ForceAlignmentCriterion<Float>::forward(
313 |     int B,
314 |     int T,
315 |     int N,
316 |     int L,
317 |     CriterionScaleMode scaleMode,
318 |     const Float* input,
319 |     const int* target,
320 |     const int* targetSize,
321 |     const Float* trans,
322 |     Float* loss,
323 |     void* workspace,
324 |     cudaStream_t stream) {
325 |   int blockSize = std::min(256, (L + 31) / 32 * 32);
326 |   WorkspacePtrs<Float> ws(workspace, B, T, N, L);
327 |   CriterionUtils<Float>::computeScale(
328 |       B, T, N, scaleMode, targetSize, ws.scale, stream);
329 |   forwardKernel<<<B, blockSize, 0, stream>>>(
330 |       T, N, L, input, target, targetSize, trans, loss, ws);
331 | }
332 | 
333 | template <class Float>
334 | void ForceAlignmentCriterion<Float>::backward(
335 |     int B,
336 |     int T,
337 |     int N,
338 |     int L,
339 |     const int* target,
340 |     const int* targetSize,
341 |     const Float* grad,
342 |     Float* inputGrad,
343 |     Float* transGrad,
344 |     void* workspace,
345 |     cudaStream_t stream) {
346 |   int blockSize = std::min(256, (L + 31) / 32 * 32);
347 |   WorkspacePtrs<Float> ws(workspace, B, T, N, L);
348 |   setZero(inputGrad, B * T * N, stream);
349 |   setZero(transGrad, N * N, stream);
350 |   setZero(ws.alphaGrad, B * T * L, stream);
351 |   setZero(ws.transBatchGrad, B * N * N, stream);
352 |   setZero(ws.transBufGrad1, B * L, stream);
353 |   setZero(ws.transBufGrad2, B * L, stream);
354 |   backwardKernel<<<B, blockSize, 0, stream>>>(
355 |       T, N, L, target, targetSize, grad, inputGrad, transGrad, ws);
356 | }
357 | 
358 | template <class Float>
359 | void ForceAlignmentCriterion<Float>::viterbiPath(
360 |     int B,
361 |     int T,
362 |     int N,
363 |     int L,
364 |     const Float* input,
365 |     const int* target,
366 |     const int* targetSize,
367 |     const Float* trans,
368 |     int* bestPaths,
369 |     void* workspace,
370 |     cudaStream_t stream) {
371 |   int blockSize = std::min(256, (L + 31) / 32 * 32);
372 |   WorkspacePtrs<Float> ws(workspace, B, T, N, L);
373 |   setZero(ws.alpha, B * T * L, stream);
374 |   viterbiPathKernel<<<B, blockSize, 0, stream>>>(
375 |       T, N, L, input, target, targetSize, trans, bestPaths, ws);
376 | }
377 | 
378 | template struct ForceAlignmentCriterion<float>;
379 | template struct ForceAlignmentCriterion<double>;
380 | 
381 | } // namespace cuda
382 | } // namespace lib
383 | } // namespace fl
384 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cuh:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #pragma once
  9 | 
 10 | #include <cuda_runtime.h>
 11 | 
 12 | #include "flashlight/lib/sequence/Defines.h"
 13 | #include "flashlight/lib/sequence/criterion/Defines.h"
 14 | 
 15 | using fl::lib::seq::CriterionScaleMode;
 16 | 
 17 | namespace fl {
 18 | namespace lib {
 19 | namespace cuda {
 20 | 
 21 | /// The numerator of ASG loss. Reference: https://arxiv.org/abs/1609.03193
 22 | template <class Float>
 23 | struct FL_SEQ_API ForceAlignmentCriterion {
 24 |   /**
 25 |    * B: batch size
 26 |    * T: input length
 27 |    * N: dictionary size
 28 |    * L: target size
 29 |    */
 30 |   static size_t getWorkspaceSize(int B, int T, int N, int L);
 31 | 
 32 |   /**
 33 |    * B: batch size
 34 |    * T: input length
 35 |    * N: dictionary size
 36 |    * L: target size
 37 |    * scaleMode: type of size scaling
 38 |    * input: [B][T][N] input frames from network
 39 |    * target: [B][L] target labels
 40 |    * targetSize: [B] target sizes
 41 |    * trans: [N][N] transition matrix
 42 |    * loss: [B] (out) loss value
 43 |    * workspace: (in/out) internal workspace
 44 |    * stream: CUDA stream
 45 |    */
 46 |   static void forward(
 47 |       int B,
 48 |       int T,
 49 |       int N,
 50 |       int L,
 51 |       CriterionScaleMode scaleMode,
 52 |       const Float* input,
 53 |       const int* target,
 54 |       const int* targetSize,
 55 |       const Float* trans,
 56 |       Float* loss,
 57 |       void* workspace,
 58 |       cudaStream_t stream);
 59 | 
 60 |   /**
 61 |    * B: batch size
 62 |    * T: input length
 63 |    * N: dictionary size
 64 |    * L: target size
 65 |    * target: [B][L] target labels
 66 |    * targetSize: [B] target sizes
 67 |    * grad: [B] gradient w.r.t. loss
 68 |    * inputGrad: [B][T][N] (out) gradient w.r.t. input
 69 |    * transGrad: [N][N] (out) gradient w.r.t. transitions
 70 |    * workspace: (in/out) internal workspace from forward
 71 |    * stream: CUDA stream
 72 |    */
 73 |   static void backward(
 74 |       int B,
 75 |       int T,
 76 |       int N,
 77 |       int L,
 78 |       const int* target,
 79 |       const int* targetSize,
 80 |       const Float* grad,
 81 |       Float* inputGrad,
 82 |       Float* transGrad,
 83 |       void* workspace,
 84 |       cudaStream_t stream);
 85 | 
 86 |   static void viterbiPath(
 87 |       int B,
 88 |       int T,
 89 |       int N,
 90 |       int L,
 91 |       const Float* input,
 92 |       const int* target,
 93 |       const int* targetSize,
 94 |       const Float* trans,
 95 |       int* bestPaths,
 96 |       void* workspace,
 97 |       cudaStream_t stream);
 98 | };
 99 | 
100 | } // namespace cuda
101 | } // namespace lib
102 | } // namespace fl
103 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cuh"
  9 | 
 10 | #include <cmath>
 11 | 
 12 | #include <cub/cub.cuh>
 13 | 
 14 | #include "flashlight/lib/sequence/criterion/Workspace.h"
 15 | #include "flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh"
 16 | using fl::lib::seq::CriterionScaleMode;
 17 | 
 18 | namespace {
 19 | 
 20 | constexpr int kBlockSize = 32;
 21 | 
 22 | template <class Float>
 23 | struct WorkspacePtrs {
 24 |   explicit WorkspacePtrs(void* workspace, int B, int T, int N) {
 25 |     fl::lib::seq::Workspace<> ws(workspace);
 26 |     ws.request(&scale, B);
 27 |     ws.request(&alpha, B, T, N);
 28 |     ws.request(&alphaGrad, B, T, N);
 29 |     ws.request(&transBatchGrad, B, N, N);
 30 |     ws.request(&transBuf, B, N, N);
 31 |     requiredSize = ws.requiredSize();
 32 |   }
 33 | 
 34 |   Float* scale;
 35 |   double* alpha;
 36 |   double* alphaGrad;
 37 |   double* transBatchGrad;
 38 |   double* transBuf;
 39 |   size_t requiredSize;
 40 | };
 41 | 
 42 | /*
 43 |  * B thread blocks
 44 |  * kBlockSize threads/block
 45 |  */
 46 | template <class Float>
 47 | __global__ void
 48 | forwardInitial(int T, int N, const Float* input, WorkspacePtrs<Float> ws) {
 49 |   auto b = blockIdx.x;
 50 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
 51 |     int k = b * T * N + n;
 52 |     ws.alpha[k] = input[k];
 53 |   }
 54 | }
 55 | 
 56 | /*
 57 |  * B * N thread blocks (B if Final)
 58 |  * kBlockSize threads/block
 59 |  */
 60 | template <bool Final, class Float>
 61 | __global__ void forwardStep(
 62 |     int T,
 63 |     int N,
 64 |     int t,
 65 |     const Float* input,
 66 |     const Float* trans,
 67 |     Float* loss,
 68 |     WorkspacePtrs<Float> ws) {
 69 |   int b, m;
 70 |   if (Final) {
 71 |     b = blockIdx.x;
 72 |   } else {
 73 |     b = blockIdx.x / N;
 74 |     m = blockIdx.x % N;
 75 |   }
 76 | 
 77 |   const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N];
 78 |   const auto* inputCur = &input[b * T * N + t * N];
 79 |   auto* alphaCur = &ws.alpha[b * T * N + t * N];
 80 |   auto* transBuf = &ws.transBuf[blockIdx.x * N];
 81 | 
 82 |   using BlockReduce = cub::BlockReduce<double, kBlockSize>;
 83 |   __shared__ typename BlockReduce::TempStorage tempStorage;
 84 |   __shared__ double maxValue;
 85 | 
 86 |   double threadMax = -INFINITY;
 87 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
 88 |     double val = transBuf[n] = alphaPrev[n] + (Final ? 0 : trans[m * N + n]);
 89 |     threadMax = val > threadMax ? val : threadMax;
 90 |   }
 91 | 
 92 |   double maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max());
 93 |   if (threadIdx.x == 0) {
 94 |     maxValue = maxResult;
 95 |   }
 96 | 
 97 |   __syncthreads();
 98 | 
 99 |   double threadSum = 0;
100 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
101 |     threadSum += exp(transBuf[n] - maxValue);
102 |   }
103 | 
104 |   double sumResult = BlockReduce(tempStorage).Sum(threadSum);
105 |   if (threadIdx.x == 0) {
106 |     if (Final) {
107 |       loss[b] = ws.scale[b] * (log(sumResult) + maxValue);
108 |     } else {
109 |       alphaCur[m] = log(sumResult) + maxValue + inputCur[m];
110 |     }
111 |   }
112 | }
113 | 
114 | /*
115 |  * B * N thread blocks (B if Initial)
116 |  * kBlockSize threads/block
117 |  */
118 | template <bool Initial, class Float>
119 | __global__ void backwardStep1(
120 |     int T,
121 |     int N,
122 |     int t,
123 |     const Float* trans,
124 |     WorkspacePtrs<Float> ws) {
125 |   int b, m;
126 |   if (Initial) {
127 |     b = blockIdx.x;
128 |   } else {
129 |     b = blockIdx.x / N;
130 |     m = blockIdx.x % N;
131 |   }
132 | 
133 |   const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N];
134 |   const auto* alphaCurGrad = &ws.alphaGrad[b * T * N + t * N];
135 |   auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N];
136 |   auto* transBuf = &ws.transBuf[blockIdx.x * N];
137 |   auto* transBatchGrad = &ws.transBatchGrad[blockIdx.x * N];
138 | 
139 |   using BlockReduce = cub::BlockReduce<double, kBlockSize>;
140 |   __shared__ typename BlockReduce::TempStorage tempStorage;
141 |   __shared__ double maxValue;
142 |   __shared__ double sumValue;
143 | 
144 |   double threadMax = -INFINITY;
145 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
146 |     double val = transBuf[n] = alphaPrev[n] + (Initial ? 0 : trans[m * N + n]);
147 |     threadMax = val > threadMax ? val : threadMax;
148 |   }
149 | 
150 |   double maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max());
151 |   if (threadIdx.x == 0) {
152 |     maxValue = maxResult;
153 |   }
154 | 
155 |   double threadSum = 0;
156 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
157 |     transBuf[n] = exp(transBuf[n] - maxValue);
158 |     threadSum += transBuf[n];
159 |   }
160 | 
161 |   double sumResult = BlockReduce(tempStorage).Sum(threadSum);
162 |   if (threadIdx.x == 0) {
163 |     sumValue = sumResult;
164 |   }
165 | 
166 |   __syncthreads();
167 | 
168 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
169 |     if (Initial) {
170 |       alphaPrevGrad[n] = transBuf[n] / sumValue;
171 |     } else {
172 |       transBuf[n] = transBuf[n] / sumValue * alphaCurGrad[m];
173 |       transBatchGrad[n] += transBuf[n];
174 |     }
175 |   }
176 | }
177 | 
178 | /*
179 |  * B * N thread blocks
180 |  * kBlockSize threads/block
181 |  */
182 | template <class Float>
183 | __global__ void backwardStep2(int T, int N, int t, WorkspacePtrs<Float> ws) {
184 |   auto b = blockIdx.x / N;
185 |   auto m = blockIdx.x % N;
186 | 
187 |   auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N];
188 | 
189 |   using BlockReduce = cub::BlockReduce<double, kBlockSize>;
190 |   __shared__ typename BlockReduce::TempStorage tempStorage;
191 | 
192 |   double threadSum = 0;
193 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
194 |     threadSum += ws.transBuf[b * N * N + n * N + m];
195 |   }
196 | 
197 |   double sumResult = BlockReduce(tempStorage).Sum(threadSum);
198 |   if (threadIdx.x == 0) {
199 |     alphaPrevGrad[m] = sumResult;
200 |   }
201 | }
202 | 
203 | /*
204 |  * B thread blocks
205 |  * 128 threads/block
206 |  */
207 | template <class Float>
208 | __global__ void backwardFinal(
209 |     int T,
210 |     int N,
211 |     const Float* _grad,
212 |     Float* _inputGrad,
213 |     Float* transGrad,
214 |     WorkspacePtrs<Float> ws) {
215 |   auto b = blockIdx.x;
216 | 
217 |   auto* alphaGrad = &ws.alphaGrad[b * T * N];
218 |   auto* inputGrad = &_inputGrad[b * T * N];
219 |   auto* transBatchGrad = &ws.transBatchGrad[b * N * N];
220 | 
221 |   __shared__ Float gradScale;
222 | 
223 |   if (threadIdx.x == 0) {
224 |     gradScale = ws.scale[b] * _grad[b];
225 |   }
226 | 
227 |   __syncthreads();
228 | 
229 |   for (auto i = threadIdx.x; i < T * N; i += blockDim.x) {
230 |     inputGrad[i] = gradScale * alphaGrad[i];
231 |   }
232 | 
233 |   for (auto i = threadIdx.x; i < N * N; i += blockDim.x) {
234 |     atomicAdd(&transGrad[i], gradScale * transBatchGrad[i]);
235 |   }
236 | }
237 | 
238 | } // namespace
239 | 
240 | namespace fl {
241 | namespace lib {
242 | namespace cuda {
243 | 
244 | template <class Float>
245 | size_t FullConnectionCriterion<Float>::getWorkspaceSize(int B, int T, int N) {
246 |   return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize;
247 | }
248 | 
249 | template <class Float>
250 | void FullConnectionCriterion<Float>::forward(
251 |     int B,
252 |     int T,
253 |     int N,
254 |     CriterionScaleMode scaleMode,
255 |     const Float* input,
256 |     const int* targetSize,
257 |     const Float* trans,
258 |     Float* loss,
259 |     void* workspace,
260 |     cudaStream_t stream) {
261 |   WorkspacePtrs<Float> ws(workspace, B, T, N);
262 |   CriterionUtils<Float>::computeScale(
263 |       B, T, N, scaleMode, targetSize, ws.scale, stream);
264 |   forwardInitial<<<B, kBlockSize, 0, stream>>>(T, N, input, ws);
265 |   for (int t = 1; t < T; ++t) {
266 |     forwardStep<false>
267 |         <<<B * N, kBlockSize, 0, stream>>>(T, N, t, input, trans, loss, ws);
268 |   }
269 |   forwardStep<true>
270 |       <<<B, kBlockSize, 0, stream>>>(T, N, T, input, trans, loss, ws);
271 | }
272 | 
273 | template <class Float>
274 | void FullConnectionCriterion<Float>::backward(
275 |     int B,
276 |     int T,
277 |     int N,
278 |     const Float* trans,
279 |     const Float* grad,
280 |     Float* inputGrad,
281 |     Float* transGrad,
282 |     void* workspace,
283 |     cudaStream_t stream) {
284 |   WorkspacePtrs<Float> ws(workspace, B, T, N);
285 |   setZero(inputGrad, B * T * N, stream);
286 |   setZero(transGrad, N * N, stream);
287 |   setZero(ws.transBatchGrad, B * N * N, stream);
288 |   backwardStep1<true><<<B, kBlockSize, 0, stream>>>(T, N, T, trans, ws);
289 |   for (int t = T - 1; t > 0; --t) {
290 |     backwardStep1<false><<<B * N, kBlockSize, 0, stream>>>(T, N, t, trans, ws);
291 |     backwardStep2<<<B * N, kBlockSize, 0, stream>>>(T, N, t, ws);
292 |   }
293 |   backwardFinal<<<B, 128, 0, stream>>>(T, N, grad, inputGrad, transGrad, ws);
294 | }
295 | 
296 | template struct FullConnectionCriterion<float>;
297 | template struct FullConnectionCriterion<double>;
298 | 
299 | } // namespace cuda
300 | } // namespace lib
301 | } // namespace fl
302 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cuda_runtime.h>
11 | 
12 | #include "flashlight/lib/sequence/Defines.h"
13 | #include "flashlight/lib/sequence/criterion/Defines.h"
14 | 
15 | using fl::lib::seq::CriterionScaleMode;
16 | 
17 | namespace fl {
18 | namespace lib {
19 | namespace cuda {
20 | 
21 | /// The denominator of ASG loss. Reference: https://arxiv.org/abs/1609.03193
22 | template <class Float>
23 | struct FL_SEQ_API FullConnectionCriterion {
24 |   /**
25 |    * B: batch size
26 |    * T: input length
27 |    * N: dictionary size
28 |    */
29 |   static size_t getWorkspaceSize(int B, int T, int N);
30 | 
31 |   /**
32 |    * B: batch size
33 |    * T: input length
34 |    * N: dictionary size
35 |    * scaleMode: type of size scaling
36 |    * input: [B][T][N] input frames from network
37 |    * targetSize: [B] target sizes (may be null if not needed for scaleMode)
38 |    * trans: [N][N] transition matrix
39 |    * loss: [B] (out) loss value
40 |    * workspace: (in/out) internal workspace
41 |    * stream: CUDA stream
42 |    */
43 |   static void forward(
44 |       int B,
45 |       int T,
46 |       int N,
47 |       CriterionScaleMode scaleMode,
48 |       const Float* input,
49 |       const int* targetSize,
50 |       const Float* trans,
51 |       Float* loss,
52 |       void* workspace,
53 |       cudaStream_t stream);
54 | 
55 |   /**
56 |    * B: batch size
57 |    * T: input length
58 |    * N: dictionary size
59 |    * trans: [N][N] transition matrix
60 |    * grad: [B] gradient w.r.t. loss
61 |    * inputGrad: [B][T][N] (out) gradient w.r.t. input
62 |    * transGrad: [N][N] (out) gradient w.r.t transitions
63 |    * workspace: (in/out) internal workspace from forward
64 |    * stream: CUDA stream
65 |    */
66 |   static void backward(
67 |       int B,
68 |       int T,
69 |       int N,
70 |       const Float* trans,
71 |       const Float* grad,
72 |       Float* inputGrad,
73 |       Float* transGrad,
74 |       void* workspace,
75 |       cudaStream_t stream);
76 | };
77 | 
78 | } // namespace cuda
79 | } // namespace lib
80 | } // namespace fl
81 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/ViterbiPath.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) Facebook, Inc. and its affiliates.
  3 |  *
  4 |  * This source code is licensed under the MIT-style license found in the
  5 |  * LICENSE file in the root directory of this source tree.
  6 |  */
  7 | 
  8 | #include "flashlight/lib/sequence/criterion/cuda/ViterbiPath.cuh"
  9 | 
 10 | #include <cmath>
 11 | 
 12 | #include <cub/cub.cuh>
 13 | 
 14 | #include "flashlight/lib/sequence/criterion/Workspace.h"
 15 | 
 16 | namespace {
 17 | 
 18 | constexpr int kBlockSize = 32;
 19 | 
 20 | template <class Float>
 21 | struct WorkspacePtrs {
 22 |   explicit WorkspacePtrs(void* workspace, int B, int T, int N) {
 23 |     fl::lib::seq::Workspace<> ws(workspace);
 24 |     ws.request(&alpha, B, 2, N);
 25 |     ws.request(&beta, B, T, N);
 26 |     requiredSize = ws.requiredSize();
 27 |   }
 28 | 
 29 |   Float* alpha;
 30 |   int* beta;
 31 |   size_t requiredSize;
 32 | };
 33 | 
 34 | /*
 35 |  * B thread blocks
 36 |  * kBlockSize threads/block
 37 |  */
 38 | template <class Float>
 39 | __global__ void
 40 | computeInitial(int T, int N, const Float* input, WorkspacePtrs<Float> ws) {
 41 |   auto b = blockIdx.x;
 42 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
 43 |     ws.alpha[b * 2 * N + n] = input[b * T * N + n];
 44 |   }
 45 | }
 46 | 
 47 | /*
 48 |  * B * N thread blocks (B if Final)
 49 |  * kBlockSize threads/block
 50 |  */
 51 | template <bool Final, class Float>
 52 | __global__ void computeStep(
 53 |     int T,
 54 |     int N,
 55 |     int t,
 56 |     const Float* input,
 57 |     const Float* trans,
 58 |     int* _path,
 59 |     WorkspacePtrs<Float> ws) {
 60 |   int b, m;
 61 |   if (Final) {
 62 |     b = blockIdx.x;
 63 |   } else {
 64 |     b = blockIdx.x / N;
 65 |     m = blockIdx.x % N;
 66 |   }
 67 | 
 68 |   const auto* alphaPrev = &ws.alpha[b * 2 * N + ((t - 1) % 2) * N];
 69 |   const auto* inputCur = &input[b * T * N + t * N];
 70 |   auto* alphaCur = &ws.alpha[b * 2 * N + (t % 2) * N];
 71 |   auto* betaCur = &ws.beta[b * T * N + t * N];
 72 | 
 73 |   using BlockReduce =
 74 |       cub::BlockReduce<cub::KeyValuePair<int, Float>, kBlockSize>;
 75 |   __shared__ typename BlockReduce::TempStorage tempStorage;
 76 | 
 77 |   cub::KeyValuePair<int, Float> threadMax;
 78 |   threadMax.value = -INFINITY;
 79 |   for (auto n = threadIdx.x; n < N; n += blockDim.x) {
 80 |     Float val = alphaPrev[n] + (Final ? 0 : trans[m * N + n]);
 81 |     if (val > threadMax.value) {
 82 |       threadMax.key = n;
 83 |       threadMax.value = val;
 84 |     }
 85 |   }
 86 | 
 87 |   auto result = BlockReduce(tempStorage).Reduce(threadMax, cub::ArgMax());
 88 |   if (threadIdx.x == 0) {
 89 |     if (Final) {
 90 |       auto* path = &_path[b * T];
 91 |       path[T - 1] = result.key;
 92 |       for (int s = T - 1; s > 0; --s) {
 93 |         path[s - 1] = ws.beta[b * T * N + s * N + path[s]];
 94 |       }
 95 |     } else {
 96 |       alphaCur[m] = result.value + inputCur[m];
 97 |       betaCur[m] = result.key;
 98 |     }
 99 |   }
100 | }
101 | 
102 | } // namespace
103 | 
104 | namespace fl {
105 | namespace lib {
106 | namespace cuda {
107 | 
108 | template <class Float>
109 | size_t ViterbiPath<Float>::getWorkspaceSize(int B, int T, int N) {
110 |   return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize;
111 | }
112 | 
113 | template <class Float>
114 | void ViterbiPath<Float>::compute(
115 |     int B,
116 |     int T,
117 |     int N,
118 |     const Float* input,
119 |     const Float* trans,
120 |     int* path,
121 |     void* workspace,
122 |     cudaStream_t stream) {
123 |   WorkspacePtrs<Float> ws(workspace, B, T, N);
124 |   computeInitial<<<B, kBlockSize, 0, stream>>>(T, N, input, ws);
125 |   for (int t = 1; t < T; ++t) {
126 |     computeStep<false>
127 |         <<<B * N, kBlockSize, 0, stream>>>(T, N, t, input, trans, path, ws);
128 |   }
129 |   computeStep<true>
130 |       <<<B, kBlockSize, 0, stream>>>(T, N, T, input, trans, path, ws);
131 | }
132 | 
133 | template struct ViterbiPath<float>;
134 | template struct ViterbiPath<double>;
135 | 
136 | } // namespace cuda
137 | } // namespace lib
138 | } // namespace fl
139 | 


--------------------------------------------------------------------------------
/flashlight/lib/sequence/criterion/cuda/ViterbiPath.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) Facebook, Inc. and its affiliates.
 3 |  *
 4 |  * This source code is licensed under the MIT-style license found in the
 5 |  * LICENSE file in the root directory of this source tree.
 6 |  */
 7 | 
 8 | #pragma once
 9 | 
10 | #include <cuda_runtime.h>
11 | 
12 | #include "flashlight/lib/sequence/Defines.h"
13 | 
14 | namespace fl {
15 | namespace lib {
16 | namespace cuda {
17 | 
18 | /// Computes max likelihood path using Viterbi algorithm.
19 | template <class Float>
20 | struct FL_SEQ_API ViterbiPath {
21 |   /**
22 |    * B: batch size
23 |    * T: input length
24 |    * N: dictionary size
25 |    */
26 |   static size_t getWorkspaceSize(int B, int T, int N);
27 | 
28 |   /**
29 |    * B: batch size
30 |    * T: input length
31 |    * N: dictionary size
32 |    * input: [B][T][N] input frames from network
33 |    * trans: [N][N] transition matrix
34 |    * path: [B][T] (out) Viterbi path
35 |    * workspace: (in/out) internal workspace
36 |    * stream: CUDA stream
37 |    */
38 |   static void compute(
39 |       int B,
40 |       int T,
41 |       int N,
42 |       const Float* input,
43 |       const Float* trans,
44 |       int* path,
45 |       void* workspace,
46 |       cudaStream_t stream);
47 | };
48 | 
49 | } // namespace cuda
50 | } // namespace lib
51 | } // namespace fl
52 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "cmake", "packaging"]
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Copyright (c) Facebook, Inc. and its affiliates.
  4 | This source code is licensed under the MIT-style license found in the
  5 | LICENSE file in the root directory of this source tree.
  6 | """
  7 | 
  8 | import datetime
  9 | import os
 10 | import platform
 11 | import re
 12 | import subprocess
 13 | import sys
 14 | from pathlib import Path
 15 | 
 16 | from packaging import version
 17 | from setuptools import Extension, find_namespace_packages, setup
 18 | from setuptools.command.build_ext import build_ext
 19 | 
 20 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
 21 | 
 22 | # Path relative to project root that contains Python artifacts for packaging
 23 | PACKAGE_DIR = "bindings/python"
 24 | ARTIFACTS_DIR = os.path.join(PACKAGE_DIR, "flashlight/lib/sequence")
 25 | BUILD_VERSION_PATH = Path(os.path.join(THIS_DIR, "BUILD_VERSION.txt"))
 26 | 
 27 | 
 28 | # Environment variables:
 29 | # - `USE_CUDA=1` enables building with CUDA support
 30 | # By default builds with USE_CUDA=0
 31 | 
 32 | 
 33 | def check_env_flag(name, default=""):
 34 |     return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"]
 35 | 
 36 | 
 37 | def check_negative_env_flag(name, default="") -> bool:
 38 |     return os.getenv(name, default).upper() in ["OFF", "0", "NO", "FALSE", "N"]
 39 | 
 40 | 
 41 | def get_local_version_suffix() -> str:
 42 |     date_suffix = datetime.datetime.now().strftime("%Y%m%d")
 43 |     git_hash = subprocess.check_output(
 44 |         ["git", "rev-parse", "--short", "HEAD"], cwd=Path(__file__).parent
 45 |     ).decode("ascii")[:-1]
 46 |     return f"+{git_hash}.d{date_suffix}"
 47 | 
 48 | 
 49 | def write_version_file(version: str):
 50 |     version_path = os.path.join(THIS_DIR, ARTIFACTS_DIR, "version.py")
 51 |     with open(version_path, "w") as f:
 52 |         f.write("# noqa: C801\n")
 53 |         f.write(f'__version__ = "{version}"\n')
 54 |         tag = os.getenv("GIT_TAG")
 55 |         if tag is not None:
 56 |             f.write(f'git_tag = "{tag}"\n')
 57 | 
 58 | 
 59 | class CMakeExtension(Extension):
 60 |     def __init__(self, name):
 61 |         Extension.__init__(self, name, sources=[])
 62 | 
 63 | 
 64 | class CMakeBuild(build_ext):
 65 |     def run(self):
 66 |         try:
 67 |             out = subprocess.check_output(["cmake", "--version"])
 68 |         except OSError:
 69 |             raise RuntimeError(
 70 |                 "CMake must be installed to build the following extensions: "
 71 |                 + ", ".join(e.name for e in self.extensions)
 72 |             )
 73 | 
 74 |         cmake_version = re.search(r"version\s*([\d.]+)", out.decode().lower()).group(1)
 75 |         if version.parse(cmake_version) < version.parse("3.18"):
 76 |             raise RuntimeError(
 77 |                 "CMake >= 3.18 is required to build flashlight-sequence Python bindings"
 78 |             )
 79 | 
 80 |         # our CMakeLists builds all the extensions at once
 81 |         for ext in self.extensions:
 82 |             self.build_extensions(ext)
 83 | 
 84 |     def build_extensions(self, ext):
 85 |         if not os.path.exists(self.build_temp):
 86 |             os.makedirs(self.build_temp)
 87 | 
 88 |         ext_dir = str(Path(self.get_ext_fullpath(ext.name)).absolute().parent)
 89 |         source_dir = str(Path(__file__).absolute().parent)
 90 |         use_cuda = "ON" if check_env_flag("USE_CUDA") else "OFF"
 91 |         use_openmp = "OFF" if check_negative_env_flag("USE_OPENMP") else "ON"
 92 |         cmake_args = [
 93 |             "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + ext_dir,
 94 |             "-DPython3_EXECUTABLE=" + sys.executable,
 95 |             "-DBUILD_SHARED_LIBS=ON",
 96 |             "-DFL_SEQUENCE_BUILD_TESTS=OFF",
 97 |             "-DFL_SEQUENCE_BUILD_PYTHON=ON",
 98 |             "-DFL_SEQUENCE_BUILD_PYTHON_PACKAGE=ON",
 99 |             "-DFL_SEQUENCE_BUILD_STANDALONE=OFF",
100 |             "-DFL_SEQUENCE_USE_OPENMP=" + use_openmp,
101 |             "-DFL_SEQUENCE_USE_CUDA=" + use_cuda,
102 |         ]
103 |         cfg = "Debug" if self.debug else "Release"
104 |         build_args = ["--config", cfg]
105 | 
106 |         if platform.system() == "Windows":
107 |             cmake_args += [
108 |                 "-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), ext_dir),
109 |                 "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), ext_dir),
110 |                 "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), ext_dir),
111 |             ]
112 |             if sys.maxsize > 2**32:
113 |                 cmake_args += ["-A", "x64"]
114 |             build_args += ["--", "/m"]
115 |         else:
116 |             cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg]
117 |             build_args += ["--", "-j4"]
118 | 
119 |         env = os.environ.copy()
120 |         env["CXXFLAGS"] = '{} -fPIC -DVERSION_INFO=\\"{}\\"'.format(
121 |             env.get("CXXFLAGS", ""), self.distribution.get_version()
122 |         )
123 | 
124 |         if not os.path.exists(self.build_temp):
125 |             os.makedirs(self.build_temp)
126 |         subprocess.check_call(
127 |             ["cmake", source_dir] + cmake_args, cwd=self.build_temp, env=env
128 |         )
129 |         subprocess.check_call(
130 |             ["cmake", "--build", "."] + build_args, cwd=self.build_temp
131 |         )
132 | 
133 | 
134 | def main():
135 |     if os.getenv("BUILD_VERSION"):
136 |         version = os.getenv("BUILD_VERSION")
137 |     elif BUILD_VERSION_PATH.is_file():
138 |         version = BUILD_VERSION_PATH.read_text().strip()
139 |     else:
140 |         version_txt = os.path.join(THIS_DIR, PACKAGE_DIR, "version.txt")
141 |         with open(version_txt) as f:
142 |             version = f.readline().strip()
143 |         version += get_local_version_suffix()
144 | 
145 |     write_version_file(version)
146 | 
147 |     # Read Python bindings README
148 |     long_description = (Path(PACKAGE_DIR) / "README.md").read_text()
149 | 
150 |     setup(
151 |         name="flashlight-sequence",
152 |         version=version,
153 |         url="https://github.com/flashlight/sequence",
154 |         author="Jacob Kahn",
155 |         author_email="jacobkahn1@gmail.com",
156 |         description="Flashlight Sequence bindings for Python",
157 |         long_description=long_description,
158 |         long_description_content_type="text/markdown",
159 |         packages=find_namespace_packages(
160 |             where=PACKAGE_DIR,
161 |             include=["flashlight.lib.sequence", "flashlight.lib.sequence.criterion"],
162 |             exclude=["test"],
163 |         ),
164 |         package_dir={"": PACKAGE_DIR},
165 |         ext_modules=[CMakeExtension("flashlight.lib.sequence.criterion")],
166 |         cmdclass={"build_ext": CMakeBuild},
167 |         zip_safe=False,
168 |         license="BSD licensed, as found in the LICENSE file",
169 |         python_requires=">=3.6",
170 |         classifiers=[
171 |             "Programming Language :: Python :: 3.6",
172 |             "Programming Language :: Python :: 3.7",
173 |             "Programming Language :: Python :: 3.8",
174 |             "Programming Language :: Python :: 3.9",
175 |             "Programming Language :: Python :: 3.10",
176 |             "Programming Language :: Python :: 3.11",
177 |             "Topic :: Scientific/Engineering :: Artificial Intelligence",
178 |             "Operating System :: OS Independent",
179 |         ],
180 |     )
181 | 
182 | 
183 | if __name__ == "__main__":
184 |     main()
185 | 


--------------------------------------------------------------------------------