├── .circleci └── config.yml ├── .clang-format ├── .gitignore ├── CITATION ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── bindings └── python │ ├── CMakeLists.txt │ ├── README.md │ ├── compute_version.py │ ├── example │ └── criterion_example.py │ ├── flashlight │ └── lib │ │ └── sequence │ │ ├── __init__.py │ │ ├── _criterion.cpp │ │ ├── criterion.py │ │ └── criterion_torch.py │ ├── test │ └── test_import.py │ └── version.txt ├── cmake ├── BuildGoogleTest.cmake ├── Buildcub.cmake ├── Buildpybind11.cmake ├── FindFilesystem.cmake ├── FindGMock.cmake ├── InternalUtils.cmake ├── TestUtils.cmake └── flashlight-sequence-config.cmake.in ├── codecov.yml ├── flashlight └── lib │ └── sequence │ ├── CMakeLists.txt │ ├── Defines.h │ └── criterion │ ├── CMakeLists.txt │ ├── Defines.h │ ├── Workspace.h │ ├── cpu │ ├── ConnectionistTemporalClassificationCriterion.cpp │ ├── ConnectionistTemporalClassificationCriterion.h │ ├── CriterionUtils.cpp │ ├── CriterionUtils.h │ ├── ForceAlignmentCriterion.cpp │ ├── ForceAlignmentCriterion.h │ ├── FullConnectionCriterion.cpp │ ├── FullConnectionCriterion.h │ ├── ViterbiPath.cpp │ └── ViterbiPath.h │ └── cuda │ ├── CriterionUtils.cu │ ├── CriterionUtils.cuh │ ├── ForceAlignmentCriterion.cu │ ├── ForceAlignmentCriterion.cuh │ ├── FullConnectionCriterion.cu │ ├── FullConnectionCriterion.cuh │ ├── ViterbiPath.cu │ └── ViterbiPath.cuh ├── pyproject.toml └── setup.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | macos_env: &macos_env 4 | macos: 5 | xcode: 13.4.1 6 | resource_class: large 7 | environment: 8 | HOMEBREW_NO_AUTO_UPDATE: "1" 9 | 10 | gpu: &gpu 11 | machine: 12 | image: linux-cuda-11:2023.02.1 13 | resource_class: gpu.nvidia.medium 14 | 15 | executors: 16 | windows_gpu: 17 | machine: 18 | image: windows-server-2019-nvidia:stable 19 | # TODO: install a newer CUDA version if relying on newer 20 | # C++ features not in nvcc 10.1 21 | resource_class: windows.gpu.nvidia.medium 22 | 23 | orbs: 24 | win: circleci/windows@5.0.0 25 | 26 | commands: 27 | install_ubuntu_build_dependencies: 28 | parameters: 29 | use_cuda: 30 | type: string 31 | default: "OFF" 32 | steps: 33 | - run: 34 | name: "Install Build Dependencies" 35 | command: | 36 | sudo apt -y update 37 | sudo apt -y install build-essential python3-dev python3-pip python3-venv cmake 38 | install_macos_build_dependencies: 39 | steps: 40 | - run: 41 | name: "Install Build Dependencies" 42 | command: | 43 | brew install cmake libomp googletest 44 | install_msvc_build_dependencies: 45 | steps: 46 | - run: 47 | name: "Install Build Dependencies" 48 | command: | 49 | choco install cmake python3 -y 50 | # windows needs a path modification 51 | - run: 52 | name: "Set PATH to find CMake" 53 | command: echo 'export PATH="$PATH:/c/Program Files/CMake/bin"' >> $BASH_ENV 54 | build_flashlight_sequence: 55 | parameters: 56 | use_openmp: 57 | type: string 58 | default: "ON" 59 | use_cuda: 60 | type: string 61 | default: "ON" 62 | build_shared_libs: 63 | type: string 64 | default: "OFF" 65 | build_code_coverage: 66 | type: string 67 | default: "OFF" 68 | platform: 69 | type: string 70 | default: "linux" 71 | steps: 72 | - run: 73 | name: "Build and Install Flashlight Sequence" 74 | command: | 75 | mkdir build && \ 76 | cmake -S . 
-B build \ 77 | -DBUILD_SHARED_LIBS=<< parameters.build_shared_libs >> \ 78 | -DFL_SEQUENCE_USE_OPENMP=<< parameters.use_openmp >> \ 79 | -DFL_SEQUENCE_USE_CUDA=<< parameters.use_cuda >> \ 80 | -DFL_SEQUENCE_CODE_COVERAGE=<< parameters.build_code_coverage >> 81 | cmake --build build --parallel 82 | # only test install with non-Windows platforms (TODO: fix this) 83 | - when: 84 | condition: 85 | not: 86 | equal: ["windows", << parameters.platform >>] 87 | steps: 88 | - run: 89 | name: "Run Install Step" 90 | command: sudo cmake --install build 91 | # linux needs ldconfig 92 | - when: 93 | condition: 94 | equal: ["linux", << parameters.platform >>] 95 | steps: 96 | - run: 97 | name: "Configure shared lib paths" 98 | command: sudo ldconfig 99 | install_python_bindings: 100 | parameters: 101 | use_cuda: 102 | type: string 103 | default: "ON" 104 | steps: 105 | - run: 106 | name: "Setup virtualenv" 107 | command: | 108 | python3 -m venv venv 109 | source venv/bin/activate 110 | pip install --upgrade pip 111 | echo "source venv/bin/activate" >> $BASH_ENV 112 | - run: 113 | name: "Install Python Bindings" 114 | command: | 115 | pip install numpy 116 | USE_CUDA=<< parameters.use_cuda >> pip install -v . 117 | run_python_tests: 118 | parameters: 119 | use_cuda: 120 | type: string 121 | default: "ON" 122 | steps: 123 | - run: 124 | name: "Run Python Binding Tests" 125 | command: | 126 | cd bindings/python/test 127 | USE_CUDA=<< parameters.use_cuda >> python -m unittest discover -v . 128 | test_with_external_project: 129 | parameters: 130 | build_shared_libs: 131 | type: string 132 | default: "OFF" 133 | steps: 134 | - run: 135 | name: Set up dependent external project 136 | command: | 137 | mkdir -p test_project && cd test_project && \ 138 | echo -e "\ 139 | #include \n 140 | int main() { \n 141 | using ViterbiPath = fl::lib::cpu::ViterbiPath; \n 142 | return 0; \n 143 | } \n 144 | " > main.cpp && \ 145 | echo -e "\ 146 | cmake_minimum_required(VERSION 3.10) \n 147 | project(test_project) \n 148 | set(CMAKE_CXX_STANDARD 17) \n 149 | set(CMAKE_CXX_STANDARD_REQUIRED ON) \n 150 | add_executable(main main.cpp) \n 151 | find_package(flashlight-sequence CONFIG REQUIRED) \n 152 | target_link_libraries(main PRIVATE flashlight::flashlight-sequence) \n 153 | " > CMakeLists.txt 154 | - run: 155 | name: Build dependent external project 156 | command: | 157 | cd test_project && mkdir -p build 158 | cmake -S . -B build -DBUILD_SHARED_LIBS=<< parameters.build_shared_libs >> && \ 159 | cmake --build build --parallel && ./build/main 160 | run_codecov: 161 | steps: 162 | - run: 163 | name: "Get code coverage" 164 | command: | 165 | sudo apt-get install -y --no-install-recommends lcov curl && \ 166 | lcov --capture --directory . 
--output-file coverage.info && \ 167 | lcov --remove coverage.info '/usr/*' --output-file coverage.info && 168 | lcov --remove coverage.info '*/include/*' --output-file coverage.info && \ 169 | lcov --remove coverage.info '*/gtest/*' --output-file coverage.info && \ 170 | lcov --list coverage.info && \ 171 | bash <(curl -s https://codecov.io/bash) -f coverage.info \ 172 | -t $CODECOV_TOKEN \ 173 | || echo 'Codecov did not collect coverage reports' 174 | run_ubuntu_20_gcc_9: 175 | parameters: 176 | use_cuda: 177 | type: string 178 | default: "ON" 179 | build_shared_libs: 180 | type: string 181 | default: "OFF" 182 | run_codecov: 183 | type: string 184 | default: "" 185 | steps: 186 | - checkout 187 | - install_ubuntu_build_dependencies: 188 | use_cuda: << parameters.use_cuda >> 189 | - build_flashlight_sequence: 190 | build_shared_libs: << parameters.build_shared_libs >> 191 | use_cuda: << parameters.use_cuda >> 192 | build_code_coverage: << parameters.run_codecov >> 193 | - run: 194 | name: "Run C++ Tests" 195 | command: | 196 | cd build && ctest 197 | - test_with_external_project: 198 | build_shared_libs: << parameters.build_shared_libs >> 199 | - when: 200 | condition: << parameters.run_codecov >> 201 | steps: 202 | - run_codecov 203 | run_ubuntu_20_gcc_9_python: 204 | parameters: 205 | use_cuda: 206 | type: string 207 | default: "ON" 208 | steps: 209 | - checkout 210 | - install_ubuntu_build_dependencies: 211 | use_cuda: << parameters.use_cuda >> 212 | - install_python_bindings: 213 | use_cuda: << parameters.use_cuda >> 214 | - run_python_tests: 215 | use_cuda: << parameters.use_cuda >> 216 | 217 | jobs: 218 | ubuntu_20_gcc_9: 219 | parameters: 220 | build_shared_libs: 221 | type: string 222 | default: "OFF" 223 | run_codecov: 224 | type: string 225 | default: "" 226 | docker: 227 | - image: cimg/base:2021.04 228 | steps: 229 | - run_ubuntu_20_gcc_9: 230 | use_cuda: "OFF" 231 | build_shared_libs: << parameters.build_shared_libs >> 232 | run_codecov: << parameters.run_codecov >> 233 | 234 | ubuntu_20_gcc_9_cuda: 235 | parameters: 236 | build_shared_libs: 237 | type: string 238 | default: "OFF" 239 | run_codecov: 240 | type: string 241 | default: "" 242 | <<: *gpu 243 | steps: 244 | - run_ubuntu_20_gcc_9: 245 | use_cuda: "ON" 246 | build_shared_libs: << parameters.build_shared_libs >> 247 | run_codecov: << parameters.run_codecov >> 248 | 249 | ubuntu_20_gcc_9_python: 250 | docker: 251 | - image: cimg/base:2021.04 252 | steps: 253 | - run_ubuntu_20_gcc_9_python: 254 | use_cuda: "OFF" 255 | 256 | ubuntu_20_gcc_9_python_cuda: 257 | <<: *gpu 258 | steps: 259 | - run_ubuntu_20_gcc_9_python: 260 | use_cuda: "ON" 261 | 262 | macos_clang_13: 263 | parameters: 264 | build_shared_libs: 265 | type: string 266 | default: "OFF" 267 | <<: *macos_env 268 | shell: /bin/bash -eux -o pipefail 269 | steps: 270 | - checkout 271 | - install_macos_build_dependencies 272 | - build_flashlight_sequence: 273 | platform: "macos" 274 | use_cuda: "OFF" 275 | build_shared_libs: << parameters.build_shared_libs >> 276 | 277 | macos_clang_13_python: 278 | <<: *macos_env 279 | shell: /bin/bash -eux -o pipefail 280 | steps: 281 | - checkout 282 | - install_macos_build_dependencies 283 | - install_python_bindings: 284 | use_cuda: "OFF" 285 | - run_python_tests: 286 | use_cuda: "OFF" 287 | 288 | windows_msvc: 289 | parameters: 290 | build_shared_libs: 291 | type: string 292 | default: "OFF" 293 | executor: 294 | name: win/default 295 | shell: bash.exe 296 | steps: 297 | - checkout 298 | - install_msvc_build_dependencies 299 
| - build_flashlight_sequence: 300 | platform: "windows" 301 | use_cuda: "OFF" 302 | 303 | windows_msvc_cuda: 304 | parameters: 305 | build_shared_libs: 306 | type: string 307 | default: "OFF" 308 | executor: windows_gpu 309 | steps: 310 | - checkout 311 | - install_msvc_build_dependencies 312 | - build_flashlight_sequence: 313 | platform: "windows" 314 | use_cuda: "ON" 315 | 316 | windows_msvc_python: 317 | parameters: 318 | use_cuda: 319 | type: string 320 | default: "OFF" 321 | executor: 322 | name: win/default 323 | shell: bash.exe 324 | steps: 325 | - checkout 326 | - install_msvc_build_dependencies 327 | - install_python_bindings: 328 | use_cuda: "OFF" 329 | 330 | workflows: 331 | build-test: 332 | jobs: 333 | - ubuntu_20_gcc_9: 334 | name: "Ubuntu 20.04 gcc-9 static" 335 | - ubuntu_20_gcc_9: 336 | name: "Ubuntu 20.04 gcc-9 shared" 337 | build_shared_libs: "ON" 338 | - ubuntu_20_gcc_9_cuda: 339 | name: "Ubuntu 20.04 gcc-9 static + CUDA" 340 | - ubuntu_20_gcc_9_cuda: 341 | name: "Ubuntu 20.04 gcc-9 shared + CUDA" 342 | build_shared_libs: "ON" 343 | run_codecov: "ON" 344 | - ubuntu_20_gcc_9_python: 345 | name: "Ubuntu 20.04 gcc-9 Python" 346 | - ubuntu_20_gcc_9_python_cuda: 347 | name: "Ubuntu 20.04 gcc-9 Python + CUDA" 348 | - macos_clang_13: 349 | name: "macOS Clang 13 - shared" 350 | build_shared_libs: "ON" 351 | - macos_clang_13_python: 352 | name: "macOS Clang 13 Python" 353 | - windows_msvc: 354 | name: "Windows VS 17 2022 | MSVC 19.33" 355 | - windows_msvc_cuda: 356 | name: "Windows VS 17 2022 | MSVC 19.33 + CUDA" 357 | - windows_msvc_python: 358 | name: "Windows VS 17 2022 | MSVC 19.33 Python" 359 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AccessModifierOffset: -1 3 | AlignAfterOpenBracket: AlwaysBreak 4 | AlignConsecutiveAssignments: false 5 | AlignConsecutiveDeclarations: false 6 | AlignEscapedNewlinesLeft: true 7 | AlignOperands: false 8 | AlignTrailingComments: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: Empty 13 | AllowShortIfStatementsOnASingleLine: false 14 | AllowShortLoopsOnASingleLine: false 15 | AlwaysBreakAfterReturnType: None 16 | AlwaysBreakBeforeMultilineStrings: true 17 | AlwaysBreakTemplateDeclarations: true 18 | BinPackArguments: false 19 | BinPackParameters: false 20 | BraceWrapping: 21 | AfterClass: false 22 | AfterControlStatement: false 23 | AfterEnum: false 24 | AfterFunction: false 25 | AfterNamespace: false 26 | AfterObjCDeclaration: false 27 | AfterStruct: false 28 | AfterUnion: false 29 | BeforeCatch: false 30 | BeforeElse: false 31 | IndentBraces: false 32 | BreakBeforeBinaryOperators: None 33 | BreakBeforeBraces: Attach 34 | BreakBeforeTernaryOperators: true 35 | BreakConstructorInitializersBeforeComma: false 36 | BreakAfterJavaFieldAnnotations: false 37 | BreakStringLiterals: false 38 | ColumnLimit: 80 39 | CommentPragmas: '^ IWYU pragma:' 40 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 41 | ConstructorInitializerIndentWidth: 4 42 | ContinuationIndentWidth: 4 43 | Cpp11BracedListStyle: true 44 | DerivePointerAlignment: false 45 | DisableFormat: false 46 | ForEachMacros: [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ] 47 | IncludeCategories: 48 | - Regex: '^<.*\.h(pp)?>' 49 | Priority: 1 50 | - Regex: '^<.*' 51 | Priority: 2 52 | - Regex: '.*' 53 | Priority: 3 
54 | IndentCaseLabels: true 55 | IndentPPDirectives: None 56 | IndentWidth: 2 57 | IndentWrappedFunctionNames: false 58 | KeepEmptyLinesAtTheStartOfBlocks: false 59 | MacroBlockBegin: '' 60 | MacroBlockEnd: '' 61 | MaxEmptyLinesToKeep: 1 62 | NamespaceIndentation: None 63 | ObjCBlockIndentWidth: 2 64 | ObjCSpaceAfterProperty: false 65 | ObjCSpaceBeforeProtocolList: false 66 | PenaltyBreakBeforeFirstCallParameter: 1 67 | PenaltyBreakComment: 300 68 | PenaltyBreakFirstLessLess: 120 69 | PenaltyBreakString: 1000 70 | PenaltyExcessCharacter: 1000000 71 | PenaltyReturnTypeOnItsOwnLine: 200 72 | PointerAlignment: Left 73 | ReflowComments: true 74 | SortIncludes: true 75 | SpaceAfterCStyleCast: false 76 | SpaceBeforeAssignmentOperators: true 77 | SpaceBeforeParens: ControlStatements 78 | SpaceInEmptyParentheses: false 79 | SpacesBeforeTrailingComments: 1 80 | SpacesInAngles: false 81 | SpacesInContainerLiterals: true 82 | SpacesInCStyleCastParentheses: false 83 | SpacesInParentheses: false 84 | SpacesInSquareBrackets: false 85 | Standard: Cpp11 86 | TabWidth: 8 87 | UseTab: Never 88 | ... 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Build 2 | build 3 | release 4 | debug 5 | 6 | # FB 7 | fb 8 | TARGETS 9 | 10 | # Conan 11 | conanbuildinfo* 12 | conan.lock 13 | arrayfire 14 | conaninfo* 15 | graph_info.json 16 | 17 | # Python bindings 18 | __pycache__/ 19 | *$py.class 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | wheels/ 30 | **/version.py 31 | **/BUILD_VERSION.txt 32 | 33 | # Coverage 34 | *.info 35 | 36 | # Precompiled Headers 37 | *.gch 38 | *.pch 39 | 40 | # Compiled Object files 41 | *.slo 42 | *.lo 43 | *.o 44 | *.obj 45 | 46 | # Compiled Dynamic libraries 47 | *.so 48 | *.so.* 49 | *.dylib 50 | *.dll 51 | 52 | # Compiled Static libraries 53 | *.lai 54 | *.la 55 | *.a 56 | *.lib 57 | 58 | # Dev environment 59 | .vscode 60 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @misc{kahn2022flashlight, 2 | title={Flashlight: Enabling Innovation in Tools for Machine Learning}, 3 | author={Jacob Kahn and Vineel Pratap and Tatiana Likhomanenko and Qiantong Xu and Awni Hannun and Jeff Cai and Paden Tomasello and Ann Lee and Edouard Grave and Gilad Avidov and Benoit Steiner and Vitaliy Liptchinsky and Gabriel Synnaeve and Ronan Collobert}, 4 | year={2022}, 5 | eprint={2201.12465}, 6 | archivePrefix={arXiv}, 7 | primaryClass={cs.LG} 8 | } 9 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | project(flashlight-sequence LANGUAGES CXX C VERSION 0.1) 4 | 5 | include(CTest) 6 | 7 | # ----------------------------- Setup ----------------------------- 8 | find_program(CCACHE_PROGRAM ccache) 9 | if(CCACHE_PROGRAM) 10 | set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}") 11 | endif() 12 | 13 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 14 | 15 | option(FL_SEQUENCE_CODE_COVERAGE "Enable coverage reporting" OFF) 16 | 17 | # Default directories for installation 18 | set(FL_INSTALL_INC_DIR "include" CACHE PATH "Install path for headers") 19 | 
set(FL_INSTALL_LIB_DIR "lib" CACHE PATH "Install path for libraries") 20 | set(FL_INSTALL_BIN_DIR "bin" CACHE PATH "Install path for binaries") 21 | # Other assets 22 | set(FL_INSTALL_ASSETS_BASE_DIR "share/${PROJECT_NAME}") 23 | set(FL_INSTALL_CMAKE_DIR "${FL_INSTALL_ASSETS_BASE_DIR}/cmake" CACHE PATH "Install path for CMake files") 24 | set(FL_INSTALL_EXAMPLES_DIR "${FL_INSTALL_ASSETS_BASE_DIR}/examples" CACHE PATH "Install path for example files") 25 | set(FL_INSTALL_DOC_DIR "${FL_INSTALL_ASSETS_BASE_DIR}/doc" CACHE PATH "Install path for documentation") 26 | 27 | include(${PROJECT_SOURCE_DIR}/cmake/InternalUtils.cmake) 28 | 29 | # ----------------------------- Configuration ----------------------------- 30 | 31 | option(FL_SEQUENCE_USE_OPENMP "Build with OpenMP support" OFF) 32 | option(FL_SEQUENCE_USE_CUDA "Build with CUDA support" OFF) 33 | option(FL_SEQUENCE_BUILD_TESTS "Build tests" ON) 34 | option(FL_SEQUENCE_BUILD_PYTHON "Build Python bindings" OFF) 35 | option(FL_SEQUENCE_BUILD_STANDALONE "Build standalone installation" ON) 36 | option(FL_SEQUENCE_BUILD_PYTHON_PACKAGE "Build packaging-ready Python artifacts" OFF) 37 | 38 | # ------------------------ Build ------------------------ 39 | 40 | add_library(flashlight-sequence) 41 | 42 | set_target_properties(flashlight-sequence PROPERTIES 43 | CXX_STANDARD 17 44 | CXX_STANDARD_REQUIRED ON 45 | ) 46 | 47 | target_include_directories( 48 | flashlight-sequence 49 | PUBLIC 50 | $ 51 | ) 52 | 53 | target_compile_definitions(flashlight-sequence PUBLIC FL_SEQ_DLL) 54 | 55 | if (FL_SEQUENCE_USE_CUDA) 56 | enable_language(CUDA) 57 | 58 | # To support nvcc with CUDA < 11 59 | set_target_properties( 60 | flashlight-sequence 61 | PROPERTIES 62 | CUDA_STANDARD 14 63 | CUDA_STANDARD_REQUIRED ON 64 | ) 65 | 66 | target_compile_definitions( 67 | flashlight-sequence 68 | PUBLIC 69 | FL_SEQUENCE_USE_OPENMP 70 | FL_SEQUENCE_USE_CUDA 71 | ) 72 | endif() 73 | 74 | include(${PROJECT_SOURCE_DIR}/flashlight/lib/sequence/CMakeLists.txt) 75 | 76 | if (FL_SEQUENCE_BUILD_PYTHON) 77 | include(${PROJECT_SOURCE_DIR}/bindings/python/CMakeLists.txt) 78 | endif() 79 | 80 | add_library(flashlight::flashlight-sequence ALIAS flashlight-sequence) 81 | 82 | # ------------------------ Tests + Code Coverage------------------------ 83 | 84 | if (FL_SEQUENCE_BUILD_TESTS) 85 | enable_testing() 86 | include(TestUtils) 87 | # TODO: add back after moving tests from Flashlight core 88 | # add_subdirectory(${PROJECT_SOURCE_DIR}/flashlight/lib/sequence/test) 89 | endif() 90 | 91 | if (FL_SEQUENCE_CODE_COVERAGE) 92 | fl_sequence_add_coverage_to_target(TARGET flashlight-sequence) 93 | endif() 94 | 95 | # ------------------------ Installation ------------------------ 96 | 97 | # Install headers 98 | install( 99 | DIRECTORY ${PROJECT_SOURCE_DIR}/flashlight/lib 100 | COMPONENT headers 101 | DESTINATION ${FL_INSTALL_INC_DIR}/flashlight 102 | FILES_MATCHING # preserve directory structure 103 | PATTERN "*.h" 104 | PATTERN "*.hpp" 105 | PATTERN "*.cuh" 106 | PATTERN "test*" EXCLUDE 107 | PATTERN ".git" EXCLUDE 108 | ) 109 | 110 | # Install libraries and create CMake config/targets files 111 | fl_sequence_setup_install_targets(INSTALL_TARGETS flashlight-sequence) 112 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and 
maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to flashlight 2 | flashlight is still under development; we appreciate any contributions. 3 | 4 | ## Pull Requests 5 | We actively welcome your pull requests. 6 | 7 | 1. Fork the repo and create your branch from `master`. 8 | 2. If you've added code that should be tested, add tests. 9 | 3. If you've changed APIs, update [and build](docs/README.md) the documentation (to check correctness - don't submit built documentation). 10 | 4. Ensure the test suite passes. 11 | 5. Make sure your code lints and run `clang-format` given the provided configuration. 12 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 13 | 14 | ## Contributor License Agreement ("CLA") 15 | All contributors must sign the CLA for their pull requests to be eligible for merge. One only needs to do this once to contribute to any of Facebook's open source projects. 16 | 17 | You can find the CLA [here](https://code.facebook.com/cla). 18 | 19 | ## Issues 20 | We use [GitHub issues](https://github.com/flashlight/flashlight/issues) to track public bugs. When filing a bug, please make sure your description is clear and includes sufficient instructions to reproduce the issue (for instance, your OS, compiler version, and selected backend). 21 | 22 | ## License 23 | By contributing to flashlight, you agree that your contributions will be licensed 24 | under the LICENSE file in the root directory of this source tree. 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CMakeLists.txt 2 | recursive-include cmake *.cmake *.cmake.in 3 | recursive-include flashlight/lib *.h *.cpp CMakeLists.txt 4 | recursive-include flashlight/lib *.cuh *.cu 5 | recursive-include bindings/python *.cpp CMakeLists.txt 6 | global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Flashlight Sequence: Algorithms for Sequence Data 2 | 3 | [**Quickstart**](#quickstart) 4 | | [**Installation**](#building-and-installing) 5 | | [**Python Documentation**](bindings/python) 6 | | [**Citing**](#citing) 7 | 8 | [![CircleCI](https://circleci.com/gh/flashlight/sequence.svg?style=shield)](https://app.circleci.com/pipelines/github/flashlight/sequence) 9 | [![Join the chat at https://gitter.im/flashlight-ml/community](https://img.shields.io/gitter/room/flashlight-ml/community)](https://gitter.im/flashlight-ml/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Codecov](https://img.shields.io/codecov/c/github/flashlight/sequence)](https://codecov.io/gh/flashlight/sequence) [![Vcpkg](https://img.shields.io/vcpkg/v/flashlight-sequence)](https://vcpkg.link/ports/flashlight-sequence) 10 | 11 | *Flashlight Sequence* is a library with fast implementations of sequence-based operations. It includes: 12 | - A fast, parallel CPU implementation of the Viterbi algorithm for greedy "`argmax`-style" decoding 13 | - Fast implementations (CPU and CUDA) of the [Wav2letter ASG loss](https://arxiv.org/pdf/1609.03193.pdf) function including the fully-connected and forced-alignment algorithms. 14 | 15 | ## Quickstart 16 | 17 | Flashlight Sequence has Python bindings. To install the bindings from source, optionally install CUDA, then clone the repo and build: 18 | ```shell 19 | git clone https://github.com/flashlight/sequence && cd sequence 20 | pip install . 21 | ``` 22 | To install with CUDA support, set the environment variable `USE_CUDA=1` when running the install command. By default, bindings are installed with OpenMP support; to build and install without OpenMP, set the environment variable `USE_OPENMP=0` when building. 23 | 24 | See the [full Python binding documentation](bindings/python) for examples and more. 25 | 26 | ## Building and Installing 27 | [**From Source (C++)**](#building-from-source) | [**With `vcpkg` (C++)**](#with-vcpkg) | [**From Source (Python)**](bindings/python#build-instructions) | [**Adding to Your Own Project (C++)**](#adding-flashlight-sequence-to-a-c-project) 28 | 29 | ### Requirements 30 | At minimum, C++ compilation requires: 31 | - A C++ compiler with good C++17 support (e.g. gcc/g++ >= 7) 32 | - [CMake](https://cmake.org/) — version 3.16 or later, and ``make`` 33 | - A Linux-based operating system. 34 | 35 | **CUDA Support:** If building with CUDA support, CUDA >= 9 is recommended. To toggle CUDA support use the `FL_SEQUENCE_USE_CUDA` CMake option or the `USE_CUDA` environment variable when building the Python bindings. To toggle OpenMP support, use the `FL_SEQUENCE_USE_OPENMP` CMake option or use the `USE_OPENMP` environment variable when building the Python bindings. 
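For example, a minimal sketch of toggling these options in each build path (the exact flag combinations shown are illustrative):
```shell
# C++ build with CUDA and OpenMP enabled
cmake -S . -B build -DFL_SEQUENCE_USE_CUDA=ON -DFL_SEQUENCE_USE_OPENMP=ON
cmake --build build --parallel

# Python bindings with CUDA enabled and OpenMP disabled
USE_CUDA=1 USE_OPENMP=0 pip install .
```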
36 | 37 | **Tests:** If building tests, [Google Test](https://github.com/google/googletest) >= 1.12 is required, but is installed automatically on build if not found. The `FL_SEQUENCE_BUILD_TESTS` CMake option toggles building tests. 38 | 39 | Instructions for building/installing the Python bindings from source [can be found here](bindings/python/README.md). 40 | 41 | ### Building from Source 42 | 43 | Building the C++ project from source is simple: 44 | ```bash 45 | git clone https://github.com/flashlight/sequence && cd sequence 46 | cmake -S . -B build 47 | cmake --build build --parallel 48 | cd build && ctest && cd .. # run tests 49 | cmake --install build # install at the CMAKE_INSTALL_PREFIX 50 | ``` 51 | To enable CUDA while building, pass `-DFL_SEQUENCE_USE_CUDA=ON` to CMake. To enable building with OpenMP, pass `-DFL_SEQUENCE_USE_OPENMP=ON` to CMake. To disable building tests, pass `-DFL_SEQUENCE_BUILD_TESTS=OFF`. 52 | 53 | If building with CUDA < 11, [NVIDIA cub](https://github.com/NVIDIA/cub) is required. It will be downloaded automatically if not found; the `FL_SEQUENCE_BUILD_STANDALONE` build option controls this behavior. 54 | 55 | #### With [`vcpkg`](https://vcpkg.io/) 56 | 57 | Flashlight Sequence can also be installed and used downstream with the [`vcpkg`](https://vcpkg.io/) package manager. The [port](https://github.com/microsoft/vcpkg/blob/master/ports/flashlight-sequence/) contains optional features for building with OpenMP and/or CUDA: 58 | ```bash 59 | vcpkg install flashlight-sequence # no dependencies, or: 60 | vcpkg install "flashlight-sequence[cuda]" # with CUDA 61 | vcpkg install "flashlight-sequence[openmp]" # with OpenMP 62 | vcpkg install "flashlight-sequence[cuda,openmp]" # with both! 63 | ``` 64 | 65 | ### Adding Flashlight Sequence to a C++ Project 66 | 67 | Given a simple `project.cpp` file that includes and links to Flashlight Sequence: 68 | ```c++ 69 | #include <iostream> 70 | 71 | #include <flashlight/lib/sequence/criterion/cpu/ViterbiPath.h> 72 | 73 | int main() { 74 | auto res = fl::lib::cpu::ViterbiPath::compute(...); 75 | std::cout << "ViterbiPath result[0] " << res[0] << std::endl; 76 | return 0; 77 | } 78 | ``` 79 | 80 | The following CMake configuration links Flashlight Sequence and sets include directories: 81 | 82 | ```cmake 83 | cmake_minimum_required(VERSION 3.16) 84 | set(CMAKE_CXX_STANDARD 17) 85 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 86 | 87 | add_executable(myProject project.cpp) 88 | 89 | find_package(flashlight-sequence CONFIG REQUIRED) 90 | target_link_libraries(myProject PRIVATE flashlight::flashlight-sequence) 91 | ``` 92 | 93 | ### Contributing and Contact 94 | Contact: jacobkahn@fb.com 95 | 96 | Flashlight Sequence is actively developed. See 97 | [CONTRIBUTING](CONTRIBUTING.md) for more on how to help out. 
98 | 99 | ## Citing 100 | You can cite [Flashlight](https://arxiv.org/abs/2201.12465) using: 101 | ``` 102 | @misc{kahn2022flashlight, 103 | title={Flashlight: Enabling Innovation in Tools for Machine Learning}, 104 | author={Jacob Kahn and Vineel Pratap and Tatiana Likhomanenko and Qiantong Xu and Awni Hannun and Jeff Cai and Paden Tomasello and Ann Lee and Edouard Grave and Gilad Avidov and Benoit Steiner and Vitaliy Liptchinsky and Gabriel Synnaeve and Ronan Collobert}, 105 | year={2022}, 106 | eprint={2201.12465}, 107 | archivePrefix={arXiv}, 108 | primaryClass={cs.LG} 109 | } 110 | ``` 111 | 112 | For the AutoSegmentation Criterion (ASG), cite: 113 | ``` 114 | @article{collobert2016wav2letter, 115 | title={Wav2letter: an end-to-end convnet-based speech recognition system}, 116 | author={Collobert, Ronan and Puhrsch, Christian and Synnaeve, Gabriel}, 117 | journal={arXiv preprint arXiv:1609.03193}, 118 | year={2016} 119 | } 120 | ``` 121 | 122 | ## License 123 | Flashlight Sequence is under an MIT license. See [LICENSE](LICENSE) for more information. 124 | -------------------------------------------------------------------------------- /bindings/python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | if (NOT BUILD_SHARED_LIBS) 4 | message(FATAL_ERROR "Cannot build Python bindings with a static lib build: " 5 | "set BUILD_SHARED_LIBS to ON.") 6 | endif() 7 | 8 | # Since the Python libs and standalone Flashlight Text libs are built/installed 9 | # to the same directory, set rpaths on the Python targets to be the current dir 10 | if(APPLE) 11 | # macOS 12 | set(CMAKE_MACOSX_RPATH ON) 13 | set(_portable_rpath_origin "@loader_path") 14 | else() 15 | # Linux 16 | set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) 17 | set(_portable_rpath_origin $ORIGIN) 18 | endif(APPLE) 19 | 20 | find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) 21 | find_package(pybind11 CONFIG) 22 | if (NOT pybind11_FOUND) 23 | message(STATUS "Could not find pybind11 -- downloading from source.") 24 | include(${PROJECT_SOURCE_DIR}/cmake/Buildpybind11.cmake) 25 | endif() 26 | 27 | function (add_pybind11_extension ext_name) 28 | string(REPLACE "_" ";" modlist ${ext_name}) 29 | list(GET modlist -1 modname) 30 | list(REMOVE_AT modlist -1) 31 | if(modlist) 32 | string(REPLACE ";" "/" relpath "${modlist}") 33 | else() 34 | set(relpath "") 35 | endif() 36 | 37 | pybind11_add_module( 38 | ${ext_name} 39 | ${CMAKE_CURRENT_LIST_DIR}/${relpath}/_${modname}.cpp 40 | ) 41 | 42 | target_link_libraries( 43 | ${ext_name} 44 | PUBLIC 45 | flashlight-sequence 46 | ) 47 | 48 | target_include_directories( 49 | ${ext_name} 50 | PRIVATE 51 | ${PROJECT_SOURCE_DIR} 52 | ) 53 | 54 | if (FL_SEQUENCE_BUILD_PYTHON_PACKAGE) 55 | set_target_properties(${ext_name} PROPERTIES 56 | OUTPUT_NAME ${ext_name} 57 | BUILD_RPATH ${_portable_rpath_origin}) 58 | else() 59 | if (CMAKE_LIBRARY_OUTPUT_DIRECTORY) 60 | set_target_properties(${ext_name} PROPERTIES 61 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${relpath}) 62 | endif() 63 | endif() 64 | endfunction () 65 | 66 | add_pybind11_extension(flashlight_lib_sequence_criterion) 67 | -------------------------------------------------------------------------------- /bindings/python/README.md: -------------------------------------------------------------------------------- 1 | # Flashlight Sequence Python Bindings 2 | **Contents** 3 | - [Installation](#installation) 4 | * 
[Dependencies](#dependencies) 5 | * [Build Instructions](#build-instructions) 6 | * [Advanced Options](#advanced-options) 7 | - [Python API Documentation](#python-api-documentation) 8 | 9 | ## Installation 10 | CUDA is required if building CUDA kernel implementations with the Python package. 11 | 12 | ### Build Instructions 13 | From the project root, run: 14 | ``` 15 | pip install . 16 | ``` 17 | 18 | or locally in editable mode (`-e` is required as libs are built outside of the bindings directory) 19 | ``` 20 | pip install -e . 21 | ``` 22 | 23 | (`pypi` installation coming soon) 24 | 25 | ### Advanced Options 26 | - `USE_CUDA=1` builds CUDA kernels for many included algorithms. CUDA >= 9 is required. 27 | 28 | ## Python API Documentation 29 | 30 | Coming soon. 31 | -------------------------------------------------------------------------------- /bindings/python/compute_version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 2 | # 3 | # This source code is licensed under the BSD license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | # Inspired by http://bit.ly/40pQb1Q 7 | import subprocess 8 | from pathlib import Path 9 | from typing import Optional 10 | 11 | from packaging import version 12 | 13 | THIS_PATH = Path(__file__).resolve() 14 | version_from_file = (THIS_PATH.parent / "version.txt").read_text().strip() 15 | 16 | 17 | def get_tagged_version() -> Optional[str]: 18 | """ 19 | Return a version corresponding to a git tag if it matches version.txt 20 | """ 21 | try: 22 | tag = subprocess.check_output( 23 | ["git", "describe", "--tags", "--exact-match", "HEAD"], 24 | text=True, 25 | stderr=subprocess.DEVNULL, 26 | ).strip() 27 | except subprocess.CalledProcessError: # no tag 28 | return None 29 | 30 | if not tag.startswith("v"): 31 | return None 32 | 33 | # Must match the version in `version.txt` -- ignores `rc` or other suffixes 34 | assert ( 35 | version.parse(version_from_file).release == version.parse(tag[1:]).release 36 | ), f"The version in version.txt ({version_from_file}) does not match the given tag ({tag})" 37 | return tag[1:] 38 | 39 | 40 | def get_dev_version() -> str: 41 | # Total number of commits appended to ensure chronological ordering 42 | # (see PEP440) 43 | num_commits = subprocess.check_output( 44 | ["git", "rev-list", "--count", "HEAD"], text=True 45 | ).strip() 46 | return f"{version_from_file}.dev{num_commits}" 47 | 48 | 49 | if __name__ == "__main__": 50 | tagged_version = get_tagged_version() 51 | if tagged_version is not None: 52 | print(tagged_version, end="") 53 | else: 54 | print(get_dev_version(), end="") 55 | -------------------------------------------------------------------------------- /bindings/python/example/criterion_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | This source code is licensed under the MIT-style license found in the 6 | LICENSE file in the root directory of this source tree. 
7 | """ 8 | 9 | import argparse 10 | 11 | import torch 12 | from flashlight.lib.sequence.criterion import ASGLoss, CriterionScaleMode 13 | 14 | 15 | def main() -> None: 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--cpu", action="store_true", help="Use cpu backend, otherwise use CUDA backend" 19 | ) 20 | parser.add_argument( 21 | "--double", 22 | action="store_true", 23 | help="store tensors in double, otherwise in float", 24 | ) 25 | args = parser.parse_args() 26 | 27 | device = torch.device("cpu" if args.cpu else "cuda") 28 | float_type = torch.double if args.double else torch.float 29 | 30 | # create ASG loss with scaling the loss to the sqrt of target size 31 | # and 6 tokens (6 tokens scores predicted by some network for each frame) 32 | asg = ASGLoss(6, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT).to(device) 33 | # define the input to the loss (scores for tokens came from 34 | # some network for each frame) size is [batch, time, ntokens] 35 | input = torch.tensor( 36 | [ 37 | [ 38 | [-0.4340, -0.0254, 0.3667, 0.4180, -0.3805, -0.1707], 39 | [0.1060, 0.3631, -0.1122, -0.3825, -0.0031, -0.3801], 40 | [0.0443, -0.3795, 0.3194, -0.3130, 0.0094, 0.1560], 41 | [0.1252, 0.2877, 0.1997, -0.4554, 0.2774, -0.2526], 42 | [-0.4001, -0.2402, 0.1295, 0.0172, 0.1805, -0.3299], 43 | ], 44 | [ 45 | [0.3298, -0.2259, -0.0959, 0.4909, 0.2996, -0.2543], 46 | [-0.2863, 0.3239, -0.3988, 0.0732, -0.2107, -0.4739], 47 | [-0.0906, 0.0480, -0.1301, 0.3975, -0.3317, -0.1967], 48 | [0.4372, -0.2006, 0.0094, 0.3281, 0.1873, -0.2945], 49 | [0.2399, 0.0320, -0.3768, -0.2849, -0.2248, 0.3186], 50 | ], 51 | [ 52 | [0.0225, -0.3867, -0.1929, -0.2904, -0.4958, -0.2533], 53 | [0.4001, -0.1517, -0.2799, -0.2915, 0.4198, 0.4506], 54 | [0.1446, -0.4753, -0.0711, 0.2876, -0.1851, -0.1066], 55 | [0.2081, -0.1190, -0.3902, -0.1668, 0.1911, -0.2848], 56 | [-0.3846, 0.1175, 0.1052, 0.2172, -0.0362, 0.3055], 57 | ], 58 | ], 59 | dtype=float_type, 60 | device=device, 61 | requires_grad=True, 62 | ) 63 | # define the padded target transcriptions (encoded with tokens indices), 64 | # padded index is -1 65 | target = torch.tensor( 66 | [[2, 1, 5, 1, 3], [4, 3, 5, -1, -1], [3, 2, 2, 1, -1]], 67 | dtype=torch.int, 68 | device=device, 69 | ) 70 | # define target transcriptions sizes 71 | target_size = torch.tensor([5, 3, 4], dtype=torch.int, device=device) 72 | # define gradient of the network 73 | grad = torch.ones(3, dtype=float_type, device=device) 74 | 75 | print("List of ASG parameters", list(asg.parameters())) 76 | # run forward pass to compute the ASG loss 77 | loss = asg.forward(input, target, target_size) 78 | print("ASG loss =", loss) 79 | # run backward pass 80 | loss.backward(grad) 81 | print("Gradients with respect to the ASG loss input", input.grad) 82 | print("Gradients with respect to the transition matrix", asg.transitions.grad) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() # pragma: no cover 87 | -------------------------------------------------------------------------------- /bindings/python/flashlight/lib/sequence/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | This source code is licensed under the MIT-style license found in the 6 | LICENSE file in the root directory of this source tree. 
7 | """ 8 | 9 | name = "sequence" 10 | -------------------------------------------------------------------------------- /bindings/python/flashlight/lib/sequence/_criterion.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include 9 | 10 | #include "flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.h" 11 | #include "flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.h" 12 | #include "flashlight/lib/sequence/criterion/cpu/ViterbiPath.h" 13 | 14 | #ifdef FL_SEQUENCE_USE_CUDA 15 | #include "flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cuh" 16 | #include "flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cuh" 17 | #include "flashlight/lib/sequence/criterion/cuda/ViterbiPath.cuh" 18 | #endif // FL_SEQUENCE_USE_CUDA 19 | 20 | namespace py = pybind11; 21 | using namespace fl::lib::seq; 22 | 23 | template 24 | static T castBytes(const py::bytes& b) { 25 | static_assert( 26 | std::is_standard_layout::value, 27 | "types represented as bytes must be standard layout"); 28 | std::string s = b; 29 | if (s.size() != sizeof(T)) { 30 | throw std::runtime_error("wrong py::bytes size to represent object"); 31 | } 32 | return *reinterpret_cast(s.data()); 33 | } 34 | 35 | using CpuFAC = fl::lib::cpu::ForceAlignmentCriterion; 36 | using CpuFCC = fl::lib::cpu::FullConnectionCriterion; 37 | using CpuViterbi = fl::lib::cpu::ViterbiPath; 38 | 39 | static void CpuFAC_forward( 40 | int B, 41 | int T, 42 | int N, 43 | int L, 44 | CriterionScaleMode scaleMode, 45 | py::bytes input, 46 | py::bytes target, 47 | py::bytes targetSize, 48 | py::bytes trans, 49 | py::bytes loss, 50 | py::bytes workspace) { 51 | CpuFAC::forward( 52 | B, 53 | T, 54 | N, 55 | L, 56 | scaleMode, 57 | castBytes(input), 58 | castBytes(target), 59 | castBytes(targetSize), 60 | castBytes(trans), 61 | castBytes(loss), 62 | castBytes(workspace)); 63 | } 64 | 65 | static void CpuFAC_backward( 66 | int B, 67 | int T, 68 | int N, 69 | int L, 70 | py::bytes target, 71 | py::bytes targetSize, 72 | py::bytes grad, 73 | py::bytes inputGrad, 74 | py::bytes transGrad, 75 | py::bytes workspace) { 76 | CpuFAC::backward( 77 | B, 78 | T, 79 | N, 80 | L, 81 | castBytes(target), 82 | castBytes(targetSize), 83 | castBytes(grad), 84 | castBytes(inputGrad), 85 | castBytes(transGrad), 86 | castBytes(workspace)); 87 | } 88 | 89 | static void CpuFCC_forward( 90 | int B, 91 | int T, 92 | int N, 93 | CriterionScaleMode scaleMode, 94 | py::bytes input, 95 | py::bytes targetSize, 96 | py::bytes trans, 97 | py::bytes loss, 98 | py::bytes workspace) { 99 | CpuFCC::forward( 100 | B, 101 | T, 102 | N, 103 | scaleMode, 104 | castBytes(input), 105 | castBytes(targetSize), 106 | castBytes(trans), 107 | castBytes(loss), 108 | castBytes(workspace)); 109 | } 110 | 111 | static void CpuFCC_backward( 112 | int B, 113 | int T, 114 | int N, 115 | py::bytes trans, 116 | py::bytes grad, 117 | py::bytes inputGrad, 118 | py::bytes transGrad, 119 | py::bytes workspace) { 120 | CpuFCC::backward( 121 | B, 122 | T, 123 | N, 124 | castBytes(trans), 125 | castBytes(grad), 126 | castBytes(inputGrad), 127 | castBytes(transGrad), 128 | castBytes(workspace)); 129 | } 130 | 131 | static void CpuViterbi_compute( 132 | int B, 133 | int T, 134 | int N, 135 | py::bytes input, 136 | py::bytes trans, 137 
| py::bytes path, 138 | py::bytes workspace) { 139 | CpuViterbi::compute( 140 | B, 141 | T, 142 | N, 143 | castBytes(input), 144 | castBytes(trans), 145 | castBytes(path), 146 | castBytes(workspace)); 147 | } 148 | 149 | #ifdef FL_SEQUENCE_USE_CUDA 150 | 151 | using CudaFAC = fl::lib::cuda::ForceAlignmentCriterion; 152 | using CudaFCC = fl::lib::cuda::FullConnectionCriterion; 153 | using CudaViterbi = fl::lib::cuda::ViterbiPath; 154 | 155 | static void CudaFAC_forward( 156 | int B, 157 | int T, 158 | int N, 159 | int L, 160 | CriterionScaleMode scaleMode, 161 | py::bytes input, 162 | py::bytes target, 163 | py::bytes targetSize, 164 | py::bytes trans, 165 | py::bytes loss, 166 | py::bytes workspace, 167 | py::bytes stream) { 168 | CudaFAC::forward( 169 | B, 170 | T, 171 | N, 172 | L, 173 | scaleMode, 174 | castBytes(input), 175 | castBytes(target), 176 | castBytes(targetSize), 177 | castBytes(trans), 178 | castBytes(loss), 179 | castBytes(workspace), 180 | castBytes(stream)); 181 | } 182 | 183 | static void CudaFAC_backward( 184 | int B, 185 | int T, 186 | int N, 187 | int L, 188 | py::bytes target, 189 | py::bytes targetSize, 190 | py::bytes grad, 191 | py::bytes inputGrad, 192 | py::bytes transGrad, 193 | py::bytes workspace, 194 | py::bytes stream) { 195 | CudaFAC::backward( 196 | B, 197 | T, 198 | N, 199 | L, 200 | castBytes(target), 201 | castBytes(targetSize), 202 | castBytes(grad), 203 | castBytes(inputGrad), 204 | castBytes(transGrad), 205 | castBytes(workspace), 206 | castBytes(stream)); 207 | } 208 | 209 | static void CudaFCC_forward( 210 | int B, 211 | int T, 212 | int N, 213 | CriterionScaleMode scaleMode, 214 | py::bytes input, 215 | py::bytes targetSize, 216 | py::bytes trans, 217 | py::bytes loss, 218 | py::bytes workspace, 219 | py::bytes stream) { 220 | CudaFCC::forward( 221 | B, 222 | T, 223 | N, 224 | scaleMode, 225 | castBytes(input), 226 | castBytes(targetSize), 227 | castBytes(trans), 228 | castBytes(loss), 229 | castBytes(workspace), 230 | castBytes(stream)); 231 | } 232 | 233 | static void CudaFCC_backward( 234 | int B, 235 | int T, 236 | int N, 237 | py::bytes trans, 238 | py::bytes grad, 239 | py::bytes inputGrad, 240 | py::bytes transGrad, 241 | py::bytes workspace, 242 | py::bytes stream) { 243 | CudaFCC::backward( 244 | B, 245 | T, 246 | N, 247 | castBytes(trans), 248 | castBytes(grad), 249 | castBytes(inputGrad), 250 | castBytes(transGrad), 251 | castBytes(workspace), 252 | castBytes(stream)); 253 | } 254 | 255 | static void CudaViterbi_compute( 256 | int B, 257 | int T, 258 | int N, 259 | py::bytes input, 260 | py::bytes trans, 261 | py::bytes path, 262 | py::bytes workspace, 263 | py::bytes stream) { 264 | CudaViterbi::compute( 265 | B, 266 | T, 267 | N, 268 | castBytes(input), 269 | castBytes(trans), 270 | castBytes(path), 271 | castBytes(workspace), 272 | castBytes(stream)); 273 | } 274 | 275 | #endif // FL_SEQUENCE_USE_CUDA 276 | 277 | PYBIND11_MODULE(flashlight_lib_sequence_criterion, m) { 278 | py::enum_(m, "CriterionScaleMode") 279 | .value("NONE", CriterionScaleMode::NONE) 280 | .value("INPUT_SZ", CriterionScaleMode::INPUT_SZ) 281 | .value("INPUT_SZ_SQRT", CriterionScaleMode::INPUT_SZ_SQRT) 282 | .value("TARGET_SZ", CriterionScaleMode::TARGET_SZ) 283 | .value("TARGET_SZ_SQRT", CriterionScaleMode::TARGET_SZ_SQRT); 284 | 285 | py::class_(m, "CpuForceAlignmentCriterion") 286 | .def("get_workspace_size", &CpuFAC::getWorkspaceSize) 287 | .def("forward", &CpuFAC_forward) 288 | .def("backward", &CpuFAC_backward); 289 | 290 | py::class_(m, 
"CpuFullConnectionCriterion") 291 | .def("get_workspace_size", &CpuFCC::getWorkspaceSize) 292 | .def("forward", &CpuFCC_forward) 293 | .def("backward", &CpuFCC_backward); 294 | 295 | py::class_(m, "CpuViterbiPath") 296 | .def("get_workspace_size", &CpuViterbi::getWorkspaceSize) 297 | .def("compute", &CpuViterbi_compute); 298 | 299 | #ifdef FL_SEQUENCE_USE_CUDA 300 | m.attr("sizeof_cuda_stream") = py::int_(sizeof(cudaStream_t)); 301 | 302 | py::class_(m, "CudaForceAlignmentCriterion") 303 | .def("get_workspace_size", &CudaFAC::getWorkspaceSize) 304 | .def("forward", &CudaFAC_forward) 305 | .def("backward", &CudaFAC_backward); 306 | 307 | py::class_(m, "CudaFullConnectionCriterion") 308 | .def("get_workspace_size", &CudaFCC::getWorkspaceSize) 309 | .def("forward", &CudaFCC_forward) 310 | .def("backward", &CudaFCC_backward); 311 | 312 | py::class_(m, "CudaViterbiPath") 313 | .def("get_workspace_size", &CudaViterbi::getWorkspaceSize) 314 | .def("compute", &CudaViterbi_compute); 315 | #endif // FL_SEQUENCE_USE_CUDA 316 | } 317 | -------------------------------------------------------------------------------- /bindings/python/flashlight/lib/sequence/criterion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | This source code is licensed under the MIT-style license found in the 6 | LICENSE file in the root directory of this source tree. 7 | """ 8 | 9 | from .flashlight_lib_sequence_criterion import ( # @manual 10 | CpuForceAlignmentCriterion, 11 | CpuFullConnectionCriterion, 12 | CpuViterbiPath, 13 | CriterionScaleMode, 14 | ) 15 | 16 | 17 | have_torch = False 18 | try: 19 | import torch 20 | 21 | have_torch = True 22 | except ImportError: 23 | pass 24 | 25 | if have_torch: 26 | from flashlight.lib.sequence.criterion_torch import ( 27 | ASGLoss, 28 | check_tensor, 29 | create_workspace, 30 | FACFunction, 31 | FCCFunction, 32 | get_cuda_stream_as_bytes, 33 | get_data_ptr_as_bytes, 34 | run_backward, 35 | run_direction, 36 | run_forward, 37 | run_get_workspace_size, 38 | ) 39 | -------------------------------------------------------------------------------- /bindings/python/flashlight/lib/sequence/criterion_torch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | 5 | This source code is licensed under the MIT-style license found in the 6 | LICENSE file in the root directory of this source tree. 7 | """ 8 | 9 | import struct 10 | import sys 11 | 12 | import flashlight.lib.sequence.flashlight_lib_sequence_criterion as _C # @manual 13 | import torch 14 | import torch.nn as nn 15 | 16 | 17 | def get_data_ptr_as_bytes(tensor): 18 | return struct.pack("P", tensor.data_ptr()) 19 | 20 | 21 | def get_cuda_stream_as_bytes(): 22 | s = torch.cuda.current_stream().cuda_stream 23 | return s.to_bytes(_C.sizeof_cuda_stream, byteorder=sys.byteorder) 24 | 25 | 26 | def check_tensor(tensor, size, dtype, device): 27 | shape = torch.Size(size) 28 | if tensor.shape != shape: 29 | raise ValueError(f"wrong tensor size: expected {shape}, got {tensor.shape}") 30 | return tensor.to(dtype=dtype, device=device).contiguous() 31 | 32 | 33 | def run_direction(cls, device, direction, *args): 34 | """ 35 | Select and run CPU/CUDA implementation of `forward()` or `backward()`. 36 | If CUDA, create the right device context and also pass the CUDA stream. 
37 | """ 38 | device = torch.device(device) 39 | if device.type == "cuda": 40 | with torch.cuda.device(device): 41 | fn = getattr(cls.cuda_impl(), direction) 42 | fn(*args, get_cuda_stream_as_bytes()) 43 | elif device.type == "cpu": 44 | fn = getattr(cls.cpu_impl(), direction) 45 | fn(*args) 46 | else: 47 | raise ValueError("unknown/unsupported device type") 48 | 49 | 50 | def run_forward(cls, device, *args): 51 | run_direction(cls, device, "forward", *args) 52 | 53 | 54 | def run_backward(cls, device, *args): 55 | run_direction(cls, device, "backward", *args) 56 | 57 | 58 | def run_get_workspace_size(cls, device, *args): 59 | device = torch.device(device) 60 | if device.type == "cuda": 61 | return cls.cuda_impl().get_workspace_size(*args) 62 | elif device.type == "cpu": 63 | return cls.cpu_impl().get_workspace_size(*args) 64 | else: 65 | raise ValueError("unknown/unsupported device type") 66 | 67 | 68 | def create_workspace(cls, device, *args): 69 | """ 70 | Select and run CPU/CUDA implementation of `get_workspace_size()`, 71 | then return a byte tensor of appropriate size. 72 | """ 73 | workspace_size = run_get_workspace_size(cls, device, *args) 74 | return torch.empty(workspace_size, dtype=torch.uint8, device=device) 75 | 76 | 77 | class FACFunction(torch.autograd.Function): 78 | """ 79 | torch.autograd.Function for ForceAlignmentCriterion. 80 | Supports CPU and CUDA backends; computes the probability of the correct paths 81 | in the ASG graph (the numerator of the ASG loss). 82 | """ 83 | 84 | @staticmethod 85 | def cuda_impl(): 86 | """ 87 | Get CUDA implementation of forward/backward for the criterion 88 | """ 89 | return _C.CudaForceAlignmentCriterion 90 | 91 | @staticmethod 92 | def cpu_impl(): 93 | """ 94 | Get CPU implementation of forward/backward for the criterion 95 | """ 96 | return _C.CpuForceAlignmentCriterion 97 | 98 | @classmethod 99 | def forward(cls, ctx, input, target, target_size, transitions, scale_mode): 100 | """ 101 | Forward pass of the criterion. 
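Inputs are copied to contiguous tensors of the expected dtype and device, a byte-tensor workspace is allocated via create_workspace, and the CPU or CUDA implementation is invoked with raw data pointers (a summary of the code below).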
102 | 103 | Parameters: 104 | ----------- 105 | input: float torch.tensor of the size [Batch, Time, Ntokens] 106 | (output of the network with scores for all frames and all tokens) 107 | target: int torch.tensor of the size [Batch, Length] 108 | (padded target transcription encoded with indices of tokens) 109 | target_size: int torch.tensor of the size [Batch] 110 | (original length of each target transcription in the bacth) 111 | transitions: float torch.tensor of size [Ntokens, Ntokens] 112 | (transitions matrix for ASG loss function, 113 | scores of moving from state of token_i to token_j) 114 | scale_mode: int, scaling factor of the output, possible values 115 | NONE = 0, 116 | INPUT_SZ = 1, 117 | INPUT_SZ_SQRT = 2, 118 | TARGET_SZ = 3, 119 | TARGET_SZ_SQRT = 4, 120 | """ 121 | B = input.size(0) 122 | T = input.size(1) 123 | N = input.size(2) 124 | L = target.size(1) 125 | device = input.device 126 | 127 | input_float = check_tensor(input, [B, T, N], torch.float, device) 128 | target = check_tensor(target, [B, L], torch.int, device) 129 | target_size = check_tensor(target_size, [B], torch.int, device) 130 | transitions_float = check_tensor(transitions, [N, N], torch.float, device) 131 | 132 | loss = torch.empty(B, dtype=torch.float, device=device) 133 | workspace = create_workspace(cls, device, B, T, N, L) 134 | run_forward( 135 | cls, 136 | device, 137 | B, 138 | T, 139 | N, 140 | L, 141 | scale_mode, 142 | get_data_ptr_as_bytes(input_float), 143 | get_data_ptr_as_bytes(target), 144 | get_data_ptr_as_bytes(target_size), 145 | get_data_ptr_as_bytes(transitions_float), 146 | get_data_ptr_as_bytes(loss), 147 | get_data_ptr_as_bytes(workspace), 148 | ) 149 | ctx.save_for_backward(input, target, target_size, transitions, workspace) 150 | return loss.to(input) 151 | 152 | @classmethod 153 | def backward(cls, ctx, grad): 154 | input, target, target_size, transitions, workspace = ctx.saved_tensors 155 | B = input.size(0) 156 | T = input.size(1) 157 | N = input.size(2) 158 | L = target.size(1) 159 | device = input.device 160 | 161 | grad = check_tensor(grad, [B], torch.float, device) 162 | 163 | input_grad = torch.empty(B, T, N, dtype=torch.float, device=device) 164 | transitions_grad = torch.empty(N, N, dtype=torch.float, device=device) 165 | run_backward( 166 | cls, 167 | device, 168 | B, 169 | T, 170 | N, 171 | L, 172 | get_data_ptr_as_bytes(target), 173 | get_data_ptr_as_bytes(target_size), 174 | get_data_ptr_as_bytes(grad), 175 | get_data_ptr_as_bytes(input_grad), 176 | get_data_ptr_as_bytes(transitions_grad), 177 | get_data_ptr_as_bytes(workspace), 178 | ) 179 | 180 | return input_grad.to(input), None, None, transitions_grad.to(transitions), None 181 | 182 | 183 | class FCCFunction(torch.autograd.Function): 184 | """ 185 | torch.autograd.Function for FullConnectionCriterion 186 | Supports CPU and CUDA backends, compute the probability of the full ASG graph 187 | (the denominator of the ASG loss) 188 | """ 189 | 190 | @staticmethod 191 | def cuda_impl(): 192 | """ 193 | Get CUDA implementation of forward/backward for the criterion 194 | """ 195 | return _C.CudaFullConnectionCriterion 196 | 197 | @staticmethod 198 | def cpu_impl(): 199 | """ 200 | Get CPU implementation of forward/backward for the criterion 201 | """ 202 | return _C.CpuFullConnectionCriterion 203 | 204 | @classmethod 205 | def forward(cls, ctx, input, target_size, transitions, scale_mode): 206 | """ 207 | Forward pass of the criterion. 
208 | 209 | Parameters: 210 | ----------- 211 | input: float torch.tensor of the size [Batch, Time, Ntokens] 212 | (output of the network with scores for all frames and all tokens) 213 | target: int torch.tensor of the size [Batch, Length] 214 | (padded target transcription encoded with indices of tokens) 215 | target_size: int torch.tensor of the size [Batch] 216 | (original length of each target transcription in the bacth) 217 | transitions: float torch.tensor of size [Ntokens, Ntokens] 218 | (transitions matrix for ASG loss function, 219 | scores of moving from state of token_i to token_j) 220 | scale_mode: int, scaling factor of the output, possible values 221 | NONE = 0, 222 | INPUT_SZ = 1, 223 | INPUT_SZ_SQRT = 2, 224 | TARGET_SZ = 3, 225 | TARGET_SZ_SQRT = 4, 226 | """ 227 | B = input.size(0) 228 | T = input.size(1) 229 | N = input.size(2) 230 | device = input.device 231 | 232 | input_float = check_tensor(input, [B, T, N], torch.float, device) 233 | if scale_mode != _C.CriterionScaleMode.NONE: 234 | target_size = check_tensor(target_size, [B], torch.int, device) 235 | transitions_float = check_tensor(transitions, [N, N], torch.float, device) 236 | 237 | loss = torch.empty(B, dtype=torch.float, device=device) 238 | workspace = create_workspace(cls, device, B, T, N) 239 | run_forward( 240 | cls, 241 | device, 242 | B, 243 | T, 244 | N, 245 | scale_mode, 246 | get_data_ptr_as_bytes(input_float), 247 | get_data_ptr_as_bytes(target_size), 248 | get_data_ptr_as_bytes(transitions_float), 249 | get_data_ptr_as_bytes(loss), 250 | get_data_ptr_as_bytes(workspace), 251 | ) 252 | ctx.save_for_backward(input, transitions, workspace) 253 | return loss.to(input) 254 | 255 | @classmethod 256 | def backward(cls, ctx, grad): 257 | input, transitions, workspace = ctx.saved_tensors 258 | B = input.size(0) 259 | T = input.size(1) 260 | N = input.size(2) 261 | device = input.device 262 | 263 | grad = check_tensor(grad, [B], torch.float, device) 264 | 265 | input_grad = torch.empty(B, T, N, dtype=torch.float, device=device) 266 | transitions_grad = torch.empty(N, N, dtype=torch.float, device=device) 267 | run_backward( 268 | cls, 269 | device, 270 | B, 271 | T, 272 | N, 273 | get_data_ptr_as_bytes(transitions), 274 | get_data_ptr_as_bytes(grad), 275 | get_data_ptr_as_bytes(input_grad), 276 | get_data_ptr_as_bytes(transitions_grad), 277 | get_data_ptr_as_bytes(workspace), 278 | ) 279 | return input_grad.to(input), None, transitions_grad.to(transitions), None 280 | 281 | 282 | class ASGLoss(nn.Module): 283 | def __init__(self, N, scale_mode=_C.CriterionScaleMode.NONE): 284 | """ 285 | ASG loss implementation. It is similar to CTC, but there is no blanks. 286 | There are also repetitions like ann -> an1 and transition matrix of scores 287 | from token_i to token_j. 288 | 289 | Parameters: 290 | ----------- 291 | N: int, number of tokens to predict for each frame 292 | scale_mode: int, scaling factor of the loss function, possible values 293 | NONE = 0, 294 | INPUT_SZ = 1, 295 | INPUT_SZ_SQRT = 2, 296 | TARGET_SZ = 3, 297 | TARGET_SZ_SQRT = 4, 298 | """ 299 | super().__init__() 300 | self.transitions = nn.Parameter( 301 | torch.zeros(N, N, dtype=torch.float, requires_grad=True) 302 | ) 303 | self.scale_mode = scale_mode 304 | 305 | def forward(self, input, target, target_size): 306 | """ 307 | Forward pass of the ASG loss. 
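The ASGLoss module combines the two autograd functions: its forward returns the FCCFunction score of the full graph minus the FACFunction score of the paths matching the target, one value per batch element. A usage sketch, assuming the bindings have been built and installed (e.g. `pip install .` from bindings/python) and assuming the module is importable as `flashlight.lib.sequence.criterion_torch`:

    import torch
    from flashlight.lib.sequence.criterion import CriterionScaleMode
    from flashlight.lib.sequence.criterion_torch import ASGLoss  # import path assumed

    B, T, N, L = 2, 10, 5, 4
    asg = ASGLoss(N, scale_mode=CriterionScaleMode.NONE)

    input = torch.randn(B, T, N, requires_grad=True)       # network scores per frame/token
    target = torch.randint(0, N, (B, L), dtype=torch.int)  # padded target transcriptions
    target_size = torch.tensor([4, 3], dtype=torch.int)    # true target lengths

    loss = asg(input, target, target_size)  # shape [B]: full-graph score minus aligned score
    loss.sum().backward()                   # grads flow to input and asg.transitions
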
308 | 309 | Parameters: 310 | ----------- 311 | input: float torch.tensor of the size [Batch, Time, Ntokens] 312 | (output of the network with scores for all frames and all tokens) 313 | target: int torch.tensor of the size [Batch, Length] 314 | (padded target transcription encoded with indices of tokens) 315 | target_size: int torch.tensor of the size [Batch] 316 | (original length of each target transcription in the bacth) 317 | 318 | """ 319 | return FCCFunction.apply( 320 | input, target_size, self.transitions, self.scale_mode 321 | ) - FACFunction.apply( 322 | input, target, target_size, self.transitions, self.scale_mode 323 | ) 324 | -------------------------------------------------------------------------------- /bindings/python/test/test_import.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Facebook, Inc. and its affiliates. 3 | This source code is licensed under the MIT-style license found in the 4 | LICENSE file in the root directory of this source tree. 5 | """ 6 | 7 | import logging 8 | import os 9 | import unittest 10 | 11 | 12 | class ImportTestCase(unittest.TestCase): 13 | def test_import_lib_sequence(self) -> None: 14 | from flashlight.lib.sequence import criterion 15 | from flashlight.lib.sequence.criterion import ( 16 | CpuForceAlignmentCriterion, 17 | CpuFullConnectionCriterion, 18 | CpuViterbiPath, 19 | CriterionScaleMode, 20 | ) 21 | 22 | if os.getenv("USE_CUDA", "OFF").upper() not in [ 23 | "OFF", 24 | "0", 25 | "NO", 26 | "FALSE", 27 | "N", 28 | ]: 29 | from flashlight.lib.sequence.flashlight_lib_sequence_criterion import ( 30 | CudaForceAlignmentCriterion, 31 | CudaFullConnectionCriterion, 32 | CudaViterbiPath, 33 | ) 34 | else: 35 | logging.info("Flashlight Sequence bindings built without CUDA") 36 | 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /bindings/python/version.txt: -------------------------------------------------------------------------------- 1 | 0.0.0 2 | -------------------------------------------------------------------------------- /cmake/BuildGoogleTest.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | include(FetchContent) 4 | 5 | set(gtest_URL https://github.com/google/googletest.git) 6 | set(gtest_TAG v1.13.0) 7 | 8 | FetchContent_Declare( 9 | googletest 10 | GIT_REPOSITORY ${gtest_URL} 11 | GIT_TAG ${gtest_TAG} 12 | ) 13 | 14 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) # for Windows 15 | FetchContent_MakeAvailable(googletest) 16 | -------------------------------------------------------------------------------- /cmake/Buildcub.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | include(FetchContent) 4 | 5 | FetchContent_Declare( 6 | cub 7 | GIT_REPOSITORY https://github.com/NVIDIA/cub.git 8 | # guaranteed to work with CUDA < 11, where it isn't bundled 9 | GIT_TAG 1.8.0 10 | ) 11 | 12 | FetchContent_MakeAvailable(cub) 13 | set(cub_INCLUDE_DIRS ${cub_SOURCE_DIR}) 14 | -------------------------------------------------------------------------------- /cmake/Buildpybind11.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | include(FetchContent) 4 | 5 | set(pybind11_URL https://github.com/pybind/pybind11.git) 6 | set(pybind11_TAG 
v2.10.3) 7 | 8 | FetchContent_Declare( 9 | pybind11 10 | GIT_REPOSITORY ${pybind11_URL} 11 | GIT_TAG ${pybind11_TAG} 12 | ) 13 | 14 | FetchContent_MakeAvailable(pybind11) 15 | -------------------------------------------------------------------------------- /cmake/FindFilesystem.cmake: -------------------------------------------------------------------------------- 1 | # Taken from https://gitlab.kitware.com/cmake/cmake/-/issues/17834, with modifications 2 | 3 | #[=======================================================================[.rst: 4 | 5 | FindFilesystem 6 | ############## 7 | 8 | This module supports the C++17 standard library's filesystem utilities. Use the 9 | :imp-target:`std::filesystem` imported target to 10 | 11 | Options 12 | ******* 13 | 14 | The ``COMPONENTS`` argument to this module supports the following values: 15 | 16 | .. find-component:: Experimental 17 | :name: fs.Experimental 18 | 19 | Allows the module to find the "experimental" Filesystem TS version of the 20 | Filesystem library. This is the library that should be used with the 21 | ``std::experimental::filesystem`` namespace. 22 | 23 | .. find-component:: Final 24 | :name: fs.Final 25 | 26 | Finds the final C++17 standard version of the filesystem library. 27 | 28 | If no components are provided, behaves as if the 29 | :find-component:`fs.Final` component was specified. 30 | 31 | If both :find-component:`fs.Experimental` and :find-component:`fs.Final` are 32 | provided, first looks for ``Final``, and falls back to ``Experimental`` in case 33 | of failure. If ``Final`` is found, :imp-target:`std::filesystem` and all 34 | :ref:`variables ` will refer to the ``Final`` version. 35 | 36 | 37 | Imported Targets 38 | **************** 39 | 40 | .. imp-target:: std::filesystem 41 | 42 | The ``std::filesystem`` imported target is defined when any requested 43 | version of the C++ filesystem library has been found, whether it is 44 | *Experimental* or *Final*. 45 | 46 | If no version of the filesystem library is available, this target will not 47 | be defined. 48 | 49 | .. note:: 50 | This target has ``cxx_std_17`` as an ``INTERFACE`` 51 | :ref:`compile language standard feature `. Linking 52 | to this target will automatically enable C++17 if no later standard 53 | version is already required on the linking target. 54 | 55 | 56 | .. _fs.variables: 57 | 58 | Variables 59 | ********* 60 | 61 | .. variable:: CXX_FILESYSTEM_IS_EXPERIMENTAL 62 | 63 | Set to ``TRUE`` when the :find-component:`fs.Experimental` version of C++ 64 | filesystem library was found, otherwise ``FALSE``. 65 | 66 | .. variable:: CXX_FILESYSTEM_HAVE_FS 67 | 68 | Set to ``TRUE`` when a filesystem header was found. 69 | 70 | .. variable:: CXX_FILESYSTEM_HEADER 71 | 72 | Set to either ``filesystem`` or ``experimental/filesystem`` depending on 73 | whether :find-component:`fs.Final` or :find-component:`fs.Experimental` was 74 | found. 75 | 76 | .. variable:: CXX_FILESYSTEM_NAMESPACE 77 | 78 | Set to either ``std::filesystem`` or ``std::experimental::filesystem`` 79 | depending on whether :find-component:`fs.Final` or 80 | :find-component:`fs.Experimental` was found. 81 | 82 | 83 | Examples 84 | ******** 85 | 86 | Using `find_package(Filesystem)` with no component arguments: 87 | 88 | .. 
code-block:: cmake 89 | 90 | find_package(Filesystem REQUIRED) 91 | 92 | add_executable(my-program main.cpp) 93 | target_link_libraries(my-program PRIVATE std::filesystem) 94 | 95 | 96 | #]=======================================================================] 97 | 98 | 99 | if(TARGET std::filesystem) 100 | # This module has already been processed. Don't do it again. 101 | return() 102 | endif() 103 | 104 | cmake_minimum_required(VERSION 3.10) 105 | 106 | include(CMakePushCheckState) 107 | include(CheckIncludeFileCXX) 108 | 109 | # If we're not cross-compiling, try to run test executables. 110 | # Otherwise, assume that compile + link is a sufficient check. 111 | if(CMAKE_CROSSCOMPILING) 112 | include(CheckCXXSourceCompiles) 113 | macro(_cmcm_check_cxx_source code var) 114 | check_cxx_source_compiles("${code}" ${var}) 115 | endmacro() 116 | else() 117 | include(CheckCXXSourceRuns) 118 | macro(_cmcm_check_cxx_source code var) 119 | check_cxx_source_runs("${code}" ${var}) 120 | endmacro() 121 | endif() 122 | 123 | cmake_push_check_state() 124 | 125 | set(CMAKE_REQUIRED_QUIET ${Filesystem_FIND_QUIETLY}) 126 | 127 | # All of our tests required C++17 or later 128 | set(CMAKE_CXX_STANDARD 17) 129 | 130 | # Normalize and check the component list we were given 131 | set(want_components ${Filesystem_FIND_COMPONENTS}) 132 | if(Filesystem_FIND_COMPONENTS STREQUAL "") 133 | set(want_components Final) 134 | endif() 135 | 136 | # Warn on any unrecognized components 137 | set(extra_components ${want_components}) 138 | list(REMOVE_ITEM extra_components Final Experimental) 139 | foreach(component IN LISTS extra_components) 140 | message(WARNING "Extraneous find_package component for Filesystem: ${component}") 141 | endforeach() 142 | 143 | # Detect which of Experimental and Final we should look for 144 | set(find_experimental TRUE) 145 | set(find_final TRUE) 146 | if(NOT "Final" IN_LIST want_components) 147 | set(find_final FALSE) 148 | endif() 149 | if(NOT "Experimental" IN_LIST want_components) 150 | set(find_experimental FALSE) 151 | endif() 152 | 153 | if(find_final) 154 | check_include_file_cxx("filesystem" _CXX_FILESYSTEM_HAVE_HEADER) 155 | mark_as_advanced(_CXX_FILESYSTEM_HAVE_HEADER) 156 | if(_CXX_FILESYSTEM_HAVE_HEADER) 157 | # We found the non-experimental header. Don't bother looking for the 158 | # experimental one. 
159 | set(find_experimental FALSE) 160 | endif() 161 | else() 162 | set(_CXX_FILESYSTEM_HAVE_HEADER FALSE) 163 | endif() 164 | 165 | if(find_experimental) 166 | check_include_file_cxx("experimental/filesystem" _CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER) 167 | mark_as_advanced(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER) 168 | else() 169 | set(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER FALSE) 170 | endif() 171 | 172 | if(_CXX_FILESYSTEM_HAVE_HEADER) 173 | set(_have_fs TRUE) 174 | set(_fs_header filesystem) 175 | set(_fs_namespace std::filesystem) 176 | set(_is_experimental FALSE) 177 | elseif(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER) 178 | set(_have_fs TRUE) 179 | set(_fs_header experimental/filesystem) 180 | set(_fs_namespace std::experimental::filesystem) 181 | set(_is_experimental TRUE) 182 | else() 183 | set(_have_fs FALSE) 184 | endif() 185 | 186 | set(CXX_FILESYSTEM_HAVE_FS ${_have_fs} CACHE BOOL "TRUE if we have the C++ filesystem headers") 187 | set(CXX_FILESYSTEM_HEADER ${_fs_header} CACHE STRING "The header that should be included to obtain the filesystem APIs") 188 | set(CXX_FILESYSTEM_NAMESPACE ${_fs_namespace} CACHE STRING "The C++ namespace that contains the filesystem APIs") 189 | set(CXX_FILESYSTEM_IS_EXPERIMENTAL ${_is_experimental} CACHE BOOL "TRUE if the C++ filesystem library is the experimental version") 190 | 191 | set(_found FALSE) 192 | 193 | if(CXX_FILESYSTEM_HAVE_FS) 194 | # We have some filesystem library available. Do link checks 195 | string(CONFIGURE [[ 196 | #include 197 | #include <@CXX_FILESYSTEM_HEADER@> 198 | 199 | int main() { 200 | auto cwd = @CXX_FILESYSTEM_NAMESPACE@::current_path(); 201 | printf("%s", cwd.c_str()); 202 | return EXIT_SUCCESS; 203 | } 204 | ]] code @ONLY) 205 | 206 | # Check a simple filesystem program without any linker flags 207 | _cmcm_check_cxx_source("${code}" CXX_FILESYSTEM_NO_LINK_NEEDED) 208 | 209 | set(can_link ${CXX_FILESYSTEM_NO_LINK_NEEDED}) 210 | 211 | if(NOT CXX_FILESYSTEM_NO_LINK_NEEDED) 212 | set(prev_libraries ${CMAKE_REQUIRED_LIBRARIES}) 213 | # Add the libstdc++ flag 214 | set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lstdc++fs) 215 | _cmcm_check_cxx_source("${code}" CXX_FILESYSTEM_STDCPPFS_NEEDED) 216 | set(can_link ${CXX_FILESYSTEM_STDCPPFS_NEEDED}) 217 | if(NOT CXX_FILESYSTEM_STDCPPFS_NEEDED) 218 | # Try the libc++ flag 219 | set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lc++fs) 220 | _cmcm_check_cxx_source("${code}" CXX_FILESYSTEM_CPPFS_NEEDED) 221 | set(can_link ${CXX_FILESYSTEM_CPPFS_NEEDED}) 222 | endif() 223 | endif() 224 | 225 | if(can_link) 226 | add_library(std::filesystem INTERFACE IMPORTED) 227 | # TODO: re-enable this once requiring CUDA >= 11/an nvcc version that works with C++ 17 228 | # set_property(TARGET std::filesystem APPEND PROPERTY INTERFACE_COMPILE_FEATURES cxx_std_17) 229 | set(_found TRUE) 230 | 231 | if(CXX_FILESYSTEM_NO_LINK_NEEDED) 232 | # Nothing to add... 
233 | elseif(CXX_FILESYSTEM_STDCPPFS_NEEDED) 234 | set_property(TARGET std::filesystem APPEND PROPERTY INTERFACE_LINK_LIBRARIES -lstdc++fs) 235 | elseif(CXX_FILESYSTEM_CPPFS_NEEDED) 236 | set_property(TARGET std::filesystem APPEND PROPERTY INTERFACE_LINK_LIBRARIES -lc++fs) 237 | endif() 238 | endif() 239 | endif() 240 | 241 | cmake_pop_check_state() 242 | 243 | set(Filesystem_FOUND ${_found} CACHE BOOL "TRUE if we can run a program using std::filesystem" FORCE) 244 | 245 | if(Filesystem_FIND_REQUIRED AND NOT Filesystem_FOUND) 246 | message(FATAL_ERROR "Cannot run simple program using std::filesystem") 247 | endif() 248 | -------------------------------------------------------------------------------- /cmake/FindGMock.cmake: -------------------------------------------------------------------------------- 1 | # Find gmock 2 | # 3 | # GMOCK_INCLUDE_DIRS - where to find gmock/gmock.h, etc. 4 | # GMOCK_LIBRARIES - List of libraries when using gmock. 5 | # GMOCK_FOUND - True if gmock found. 6 | 7 | if (GMOCK_INCLUDE_DIRS) 8 | # Already in cache, be silent 9 | set(GMOCK_FIND_QUIETLY TRUE) 10 | endif() 11 | 12 | find_package(GMock CONFIG) 13 | if (NOT TARGET GTest::gmock) 14 | if (NOT GMOCK_ROOT) 15 | set(GMOCK_ROOT ENV{GMOCK_ROOT}) 16 | endif() 17 | 18 | find_path(GMOCK_INCLUDE_DIRS gmock/gmock.h PATHS ${GMOCK_ROOT}) 19 | find_library(GMOCK_MAIN_LIBRARY NAMES gmock_main PATHS ${GMOCK_ROOT}) 20 | find_library(GMOCK_LIBRARIES NAMES gmock PATHS ${GMOCK_ROOT}) 21 | 22 | set(GMOCK_BOTH_LIBRARIES 23 | ${GMOCK_MAIN_LIBRARY} 24 | ${GMOCK_LIBRARIES} 25 | ) 26 | 27 | # handle the QUIETLY and REQUIRED arguments and set GMOCK_FOUND to TRUE if 28 | # all listed variables are TRUE 29 | include(FindPackageHandleStandardArgs) 30 | find_package_handle_standard_args( 31 | GMock 32 | DEFAULT_MSG 33 | GMOCK_MAIN_LIBRARY 34 | GMOCK_LIBRARIES 35 | GMOCK_LIBRARIES 36 | GMOCK_INCLUDE_DIRS 37 | ) 38 | 39 | mark_as_advanced( 40 | GMOCK_MAIN_LIBRARY 41 | GMOCK_LIBRARIES 42 | LIBGTEST_LIBRARY 43 | GMOCK_LIBRARIES 44 | GMOCK_INCLUDE_DIRS 45 | ) 46 | 47 | add_library(GTest::gmock UNKNOWN IMPORTED) 48 | set_target_properties(GTest::gmock PROPERTIES 49 | INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIRS} 50 | IMPORTED_LOCATION ${GMOCK_LIBRARIES} 51 | ) 52 | 53 | add_library(GTest::gmock_main UNKNOWN IMPORTED) 54 | set_target_properties(GTest::gmock_main PROPERTIES 55 | INTERFACE_INCLUDE_DIRECTORIES ${GMOCK_INCLUDE_DIRS} 56 | IMPORTED_LOCATION ${GMOCK_MAIN_LIBRARY} 57 | ) 58 | endif() 59 | -------------------------------------------------------------------------------- /cmake/InternalUtils.cmake: -------------------------------------------------------------------------------- 1 | function(fl_sequence_add_coverage_to_target) 2 | set(oneValueArgs TARGET) 3 | cmake_parse_arguments(add_coverage_to_target "${options}" "${oneValueArgs}" 4 | "${multiValueArgs}" ${ARGN}) 5 | 6 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") 7 | # Add required flags (GCC & LLVM/Clang) 8 | target_compile_options(${add_coverage_to_target_TARGET} PUBLIC 9 | -O0 # TODO: reconcile this with CMake modes for something cleaner 10 | -g 11 | $<$:--coverage> 12 | ) 13 | if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.13) 14 | target_link_options(${add_coverage_to_target_TARGET} 15 | PUBLIC 16 | $<$:--coverage>) 17 | else() 18 | target_link_libraries(${add_coverage_to_target_TARGET} 19 | PUBLIC 20 | --coverage) 21 | endif() 22 | endif() 23 | endfunction(fl_sequence_add_coverage_to_target) 24 | 25 | function(fl_sequence_setup_install_targets) 26 | set(multiValueArgs 
INSTALL_TARGETS INSTALL_HEADERS) 27 | cmake_parse_arguments(setup_install_targets "${options}" "${oneValueArgs}" 28 | "${multiValueArgs}" ${ARGN}) 29 | 30 | list(LENGTH setup_install_targets_INSTALL_TARGETS TARGETS_LENGTH) 31 | if (${TARGETS_LENGTH} EQUAL 0) 32 | message(FATAL_ERROR "setup_install_targets called with " 33 | "empty targets list.") 34 | endif() 35 | 36 | # Main target 37 | install( 38 | TARGETS ${setup_install_targets_INSTALL_TARGETS} 39 | EXPORT flashlight-sequence-targets 40 | COMPONENT flashlight-sequence 41 | PUBLIC_HEADER DESTINATION fl 42 | RUNTIME DESTINATION ${FL_INSTALL_BIN_DIR} 43 | LIBRARY DESTINATION ${FL_INSTALL_LIB_DIR} 44 | ARCHIVE DESTINATION ${FL_INSTALL_LIB_DIR} 45 | FRAMEWORK DESTINATION framework 46 | INCLUDES DESTINATION ${FL_INSTALL_INC_DIR} 47 | ) 48 | 49 | # Write and install targets file 50 | install( 51 | EXPORT flashlight-sequence-targets 52 | NAMESPACE flashlight:: 53 | DESTINATION ${FL_INSTALL_CMAKE_DIR} 54 | COMPONENT flashlight-sequence 55 | ) 56 | 57 | # Write config file (used by projects including fl, such as examples) 58 | include(CMakePackageConfigHelpers) 59 | set(INCLUDE_DIRS include) 60 | set(CMAKE_DIR ${FL_INSTALL_CMAKE_DIR}) 61 | configure_package_config_file( 62 | ${PROJECT_SOURCE_DIR}/cmake/flashlight-sequence-config.cmake.in 63 | cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlight-sequence-config.cmake 64 | INSTALL_DESTINATION 65 | ${FL_INSTALL_CMAKE_DIR} 66 | PATH_VARS INCLUDE_DIRS CMAKE_DIR 67 | ) 68 | write_basic_package_version_file( 69 | cmake/install/${FL_CONFIG_CMAKE_BUILD_DIR}/flashlight-sequence-config-version.cmake 70 | COMPATIBILITY SameMajorVersion 71 | ) 72 | install(FILES 73 | ${PROJECT_BINARY_DIR}/cmake/install/flashlight-sequence-config.cmake 74 | ${PROJECT_BINARY_DIR}/cmake/install/flashlight-sequence-config-version.cmake 75 | DESTINATION ${FL_INSTALL_CMAKE_DIR} 76 | COMPONENT flashlight-sequence 77 | ) 78 | set_target_properties(${setup_install_targets_INSTALL_TARGETS} PROPERTIES 79 | VERSION "${flashlight-sequence_VERSION}" 80 | SOVERSION "${flashlight-sequence_VERSION_MAJOR}") 81 | endfunction(fl_sequence_setup_install_targets) 82 | -------------------------------------------------------------------------------- /cmake/TestUtils.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | set(GTEST_TARGETS "") 4 | 5 | # Get or find Google Test and Google Mock 6 | find_package(GTest 1.12.1) 7 | if (NOT GTEST_FOUND) 8 | if (NOT TARGET gtest) 9 | message(STATUS "googletest not found - will download and build from source") 10 | # Download, build, and find the resulting googletest 11 | include(${PROJECT_SOURCE_DIR}/cmake/BuildGoogleTest.cmake) 12 | list(APPEND GTEST_TARGETS GTest::gtest GTest::gtest_main GTest::gmock GTest::gmock_main) 13 | endif() 14 | else() 15 | message(STATUS "gtest found: (include: ${GTEST_INCLUDE_DIRS}, lib: ${GTEST_BOTH_LIBRARIES}") 16 | if (TARGET GTest::GTest) 17 | # We found the differently-named CMake targets from FindGTest 18 | if (NOT TARGET GTest::Main) 19 | message(FATAL_ERROR "Google Test must be built with main") 20 | endif() 21 | # TODO: these targets are deprecated in CMake 3.20 22 | list(APPEND GTEST_TARGETS GTest::GTest GTest::Main) 23 | endif() 24 | if (NOT TARGET GTest::gmock) 25 | find_package(GMock REQUIRED) 26 | message(STATUS "gmock found: (include: ${GMOCK_INCLUDE_DIRS}, lib: ${GMOCK_BOTH_LIBRARIES})") 27 | endif() 28 | list(APPEND GTEST_TARGETS GTest::gmock GTest::gmock_main) 29 | message(STATUS 
"Found gtest and gmock on system.") 30 | endif() 31 | 32 | include(GoogleTest) 33 | find_package(Threads REQUIRED) 34 | 35 | function(build_test) 36 | set(options) 37 | set(oneValueArgs SRC) 38 | set(multiValueArgs LIBS PREPROC) 39 | cmake_parse_arguments(build_test "${options}" "${oneValueArgs}" 40 | "${multiValueArgs}" ${ARGN}) 41 | 42 | get_filename_component(src_name ${build_test_SRC} NAME_WE) 43 | set(target "${src_name}") 44 | add_executable(${target} ${build_test_SRC}) 45 | if (TARGET gtest) 46 | add_dependencies(${target} gtest) # make sure gtest is built first 47 | endif() 48 | target_link_libraries( 49 | ${target} 50 | PUBLIC 51 | ${GTEST_TARGETS} 52 | ${build_test_LIBS} 53 | ${CMAKE_THREAD_LIBS_INIT} 54 | ) 55 | target_include_directories( 56 | ${target} 57 | PUBLIC 58 | ${PROJECT_SOURCE_DIR} 59 | ) 60 | target_compile_definitions( 61 | ${target} 62 | PUBLIC 63 | ${build_test_PREPROC} 64 | ) 65 | gtest_add_tests(TARGET ${target}) 66 | endfunction(build_test) 67 | -------------------------------------------------------------------------------- /cmake/flashlight-sequence-config.cmake.in: -------------------------------------------------------------------------------- 1 | # flashlight-sequence 2 | # 3 | # IMPORTED targets 4 | # ^^^^^^^^^^^^^^^^ 5 | # 6 | # Configuration file for flashlight-sequence. Provides the following 7 | # `IMPORTED` targets: 8 | # 9 | # ``flashlight::flashlight-sequence`` 10 | # The flashlight-sequence library. 11 | # 12 | # The above targets can be linked with your build using ``target_link_library``. 13 | # Example: 14 | # 15 | # add_executable(myexecutable mySource.cpp) 16 | # target_link_library(myexecutable PRIVATE flashlight::flashlight-sequence) 17 | # 18 | # The above properly links flashlight-sequence with myexecutable. No call to 19 | # ``target_include_directories`` is required. 
20 | # 21 | 22 | # Dependencies 23 | include(CMakeFindDependencyMacro) 24 | if (@FL_SEQUENCE_USE_OPENMP@) 25 | find_dependency(OpenMP) 26 | endif() 27 | if (@FL_SEQUENCE_USE_CUDA@) 28 | # TODO: use FindCUDAToolkit after requiring CMake >= 3.17 29 | enable_language(CUDA) 30 | endif() 31 | # Config variables 32 | set(FL_SEQUENCE_USE_OPENMP @FL_SEQUENCE_USE_OPENMP@) 33 | set(FL_SEQUENCE_USE_CUDA @FL_SEQUENCE_USE_CUDA@) 34 | 35 | ################################################################################ 36 | 37 | @PACKAGE_INIT@ 38 | 39 | # Add IMPORTED targets 40 | if(NOT TARGET flashlight::flashlight-sequence) 41 | if(EXISTS @PACKAGE_CMAKE_DIR@/flashlight-sequence-targets.cmake) 42 | include(@PACKAGE_CMAKE_DIR@/flashlight-sequence-targets.cmake) 43 | endif() 44 | endif() 45 | 46 | # For legacy configurations 47 | set(flashlight-sequence_LIBRARIES flashlight::flashlight-sequence) 48 | if (EXISTS @PACKAGE_INCLUDE_DIRS@) 49 | set(flashlight-sequence_INCLUDE_DIRS @PACKAGE_INCLUDE_DIRS@) 50 | endif() 51 | set(flashlight-sequence_FOUND 1) 52 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | coverage: 3 | status: 4 | project: 5 | default: 6 | threshold: 0.25% 7 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | include(${CMAKE_CURRENT_LIST_DIR}/criterion/CMakeLists.txt) 4 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/Defines.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 
6 | */ 7 | 8 | #pragma once 9 | 10 | #if defined(_WIN32) || defined(_MSC_VER) 11 | 12 | #ifdef FL_SEQ_DLL 13 | #define FL_SEQ_API __declspec(dllexport) 14 | #else // FL_SEQ_DLL 15 | #define FL_SEQ_API __declspec(dllimport) 16 | #endif // FL_SEQ_DLL 17 | 18 | #define FL_SEQ_DEPRECATED(msg) __declspec(deprecated(msg) 19 | 20 | #else // defined(_WIN32) || defined(_MSC_VER) 21 | 22 | #define FL_SEQ_API __attribute__((visibility("default"))) 23 | #define FL_SEQ_DEPRECATED(msg) __attribute__((deprecated(msg))) 24 | 25 | #endif // defined(_WIN32) || defined(_MSC_VER) 26 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | if (FL_SEQUENCE_USE_OPENMP) 4 | find_package(OpenMP REQUIRED) 5 | target_link_libraries(flashlight-sequence PRIVATE OpenMP::OpenMP_CXX) 6 | endif() 7 | 8 | target_sources( 9 | flashlight-sequence 10 | PRIVATE 11 | ${CMAKE_CURRENT_LIST_DIR}/cpu/CriterionUtils.cpp 12 | ${CMAKE_CURRENT_LIST_DIR}/cpu/ForceAlignmentCriterion.cpp 13 | ${CMAKE_CURRENT_LIST_DIR}/cpu/ConnectionistTemporalClassificationCriterion.cpp 14 | ${CMAKE_CURRENT_LIST_DIR}/cpu/FullConnectionCriterion.cpp 15 | ${CMAKE_CURRENT_LIST_DIR}/cpu/ViterbiPath.cpp 16 | ) 17 | 18 | if (FL_SEQUENCE_USE_CUDA) 19 | # cub is required for BlockReduce and not bundled with CUDA < 11 20 | find_path(cub_INCLUDE_DIRS 21 | NAMES cub.cuh 22 | PATH_SUFFIXES cub include 23 | PATHS ${cub_BASE_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 24 | ENV ${cub_BASE_DIR}) 25 | if (NOT cub_INCLUDE_DIRS) 26 | if (NOT FL_SEQUENCE_BUILD_STANDALONE) 27 | message(FATAL_ERROR 28 | "Required dependency NVIDIA cub not found - try setting cub_BASE_DIR") 29 | endif() 30 | 31 | message(STATUS "NVIDIA cub not found - downloading from source") 32 | include(${PROJECT_SOURCE_DIR}/cmake/Buildcub.cmake) 33 | target_include_directories(flashlight-sequence PRIVATE ${cub_INCLUDE_DIRS}) 34 | else() 35 | message(STATUS "NVIDIA cub found: (include: ${cub_INCLUDE_DIRS})") 36 | endif() 37 | 38 | target_sources( 39 | flashlight-sequence 40 | PRIVATE 41 | ${CMAKE_CURRENT_LIST_DIR}/cuda/CriterionUtils.cu 42 | ${CMAKE_CURRENT_LIST_DIR}/cuda/ForceAlignmentCriterion.cu 43 | ${CMAKE_CURRENT_LIST_DIR}/cuda/FullConnectionCriterion.cu 44 | ${CMAKE_CURRENT_LIST_DIR}/cuda/ViterbiPath.cu 45 | ) 46 | 47 | target_include_directories( 48 | flashlight-sequence 49 | PUBLIC 50 | $ 51 | ) 52 | endif() 53 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/Defines.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | namespace fl { 11 | namespace lib { 12 | namespace seq { 13 | 14 | enum class CriterionScaleMode { 15 | NONE = 0, 16 | INPUT_SZ = 1, 17 | INPUT_SZ_SQRT = 2, 18 | TARGET_SZ = 3, 19 | TARGET_SZ_SQRT = 4, 20 | }; 21 | 22 | } // namespace seq 23 | } // namespace lib 24 | } // namespace fl 25 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/Workspace.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. 
and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | namespace fl { 14 | namespace lib { 15 | namespace seq { 16 | 17 | /** 18 | * Partitions a contiguous block of memory into aligned arrays. 19 | * Can be used for either host or device memory. 20 | * 21 | * Usage: first create Workspace(nullptr), request() some arrays, then call 22 | * requiredSize(). Next, allocate memory of that size. Finally, create 23 | * Workspace(ptr) and request() the same sequence of arrays. 24 | */ 25 | template 26 | class Workspace { 27 | public: 28 | explicit Workspace(void* workspacePtr) 29 | : workspacePtr_(reinterpret_cast(workspacePtr)), offset_(0) { 30 | align(); 31 | } 32 | 33 | template 34 | T* request(size_t s0, size_t s1 = 1, size_t s2 = 1, size_t s3 = 1) { 35 | align(); 36 | auto p = reinterpret_cast(workspacePtr_ + offset_); 37 | offset_ += sizeof(T) * s0 * s1 * s2 * s3; 38 | return p; 39 | } 40 | 41 | template 42 | void request(T** p, size_t s0, size_t s1 = 1, size_t s2 = 1, size_t s3 = 1) { 43 | *p = request(s0, s1, s2, s3); 44 | } 45 | 46 | size_t requiredSize() const { 47 | // Add extra bytes in case the initial `workspacePtr` isn't aligned 48 | return offset_ + Alignment - 1; 49 | } 50 | 51 | private: 52 | void align() { 53 | // Pad until `workspacePtr_ + offset_` is a multiple of `Alignment` 54 | offset_ += 55 | Alignment - 1 - (workspacePtr_ + offset_ + Alignment - 1) % Alignment; 56 | } 57 | 58 | const uintptr_t workspacePtr_; 59 | size_t offset_; 60 | }; 61 | 62 | } // namespace seq 63 | } // namespace lib 64 | } // namespace fl 65 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.h" 9 | 10 | #include 11 | #include 12 | 13 | #include "flashlight/lib/sequence/criterion/Workspace.h" 14 | 15 | namespace { 16 | 17 | template 18 | struct WorkspacePtrs { 19 | WorkspacePtrs(void* workspace, int B, int T, int /* N unused */, int L) { 20 | const int s = (2 * L) + 1; 21 | fl::lib::seq::Workspace<> ws(workspace); 22 | ws.request(&alpha, B, T, s); 23 | ws.request(&s_inc, B, s); 24 | ws.request(&e_inc, B, s); 25 | ws.request(&backptr, B, T, s); 26 | ws.request(&labels_w_blanks, B, s); 27 | requiredSize = ws.requiredSize(); 28 | } 29 | 30 | Float* alpha; 31 | int* s_inc; 32 | int* e_inc; 33 | int* labels_w_blanks; 34 | int* backptr; 35 | size_t requiredSize; 36 | }; 37 | 38 | /* 39 | * Derived from warpctc/include/detail/cpu_ctc.h 40 | * Creates labels_w_blanks which adds blank_labels between each character in 41 | * labels. 42 | * s_inc and e_inc are used by the `compute_alphas` kernel to determine the 43 | * furthest starting and end label location that each time step could possibly 44 | * be. 
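The comment above describes the label preprocessing used by the CTC Viterbi code: blanks are interleaved between target tokens, and the s_inc/e_inc arrays bound which expanded positions are reachable at each frame. As a concrete illustration of the expansion itself (the helper name `with_blanks` is hypothetical; it mirrors what `setup_labels` writes into `labels_w_blanks`):

    # Interleave a blank label between target tokens; the expanded sequence has
    # length S = 2 * L + 1 and starts and ends with a blank.
    def with_blanks(labels, blank):
        out = [blank]
        for token in labels:
            out += [token, blank]
        return out

    print(with_blanks([1, 3, 3], blank=0))  # [0, 1, 0, 3, 0, 3, 0]
    # The repeated token (3, 3) forces any valid path through the blank between
    # them, which is why setup_labels also counts `repeats` for the recursion.
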
45 | */ 46 | int setup_labels( 47 | const int* labels, 48 | int* s_inc, 49 | int* e_inc, 50 | int* labels_w_blanks, 51 | int blank_label, 52 | int L, 53 | int S) { 54 | int e_counter = 0; 55 | int s_counter = 0; 56 | 57 | s_inc[s_counter++] = 1; 58 | 59 | int repeats = 0; 60 | 61 | for (int i = 1; i < L; ++i) { 62 | if (labels[i - 1] == labels[i]) { 63 | s_inc[s_counter++] = 1; 64 | s_inc[s_counter++] = 1; 65 | e_inc[e_counter++] = 1; 66 | e_inc[e_counter++] = 1; 67 | ++repeats; 68 | } else { 69 | s_inc[s_counter++] = 2; 70 | e_inc[e_counter++] = 2; 71 | } 72 | } 73 | e_inc[e_counter++] = 1; 74 | 75 | for (int i = 0; i < L; ++i) { 76 | labels_w_blanks[2 * i] = blank_label; 77 | labels_w_blanks[2 * i + 1] = labels[i]; 78 | } 79 | labels_w_blanks[S - 1] = blank_label; 80 | 81 | return repeats; 82 | } 83 | 84 | /* 85 | * Derived from warpctc/include/detail/cpu_ctc.h 86 | * Float can be either float or double 87 | */ 88 | template 89 | void compute_alphas( 90 | const Float* input, 91 | int repeats, 92 | int S, 93 | int T, 94 | int N, 95 | const int* const e_inc, 96 | const int* const s_inc, 97 | const int* const labels, 98 | Float* alphas, 99 | int* backptr, 100 | int* paths) { 101 | const int blank_label_idx = N - 1; 102 | int start = (((S / 2) + repeats - T) < 0) ? 0 : 1, end = S > 1 ? 2 : 1; 103 | 104 | for (int i = 0; i < S * T; i++) { 105 | alphas[i] = -std::numeric_limits::infinity(); 106 | } 107 | 108 | for (int i = start; i < end; ++i) { 109 | alphas[i] = input[labels[i]]; 110 | } 111 | 112 | // Iterate through each time frame 113 | for (int t = 1; t < T; ++t) { 114 | // Calculate the smallest and largest possible index of the target that this 115 | // time could be 116 | int remain = (S / 2) + repeats - (T - t); 117 | if (remain >= 0) { 118 | start += s_inc[remain]; 119 | } 120 | if (t <= (S / 2) + repeats) { 121 | end += e_inc[t - 1]; 122 | } 123 | int startloop = start; 124 | int idx1 = t * S, idx2 = (t - 1) * S, idx3 = t * N; 125 | 126 | if (start == 0) { 127 | alphas[idx1] = alphas[idx2] + input[blank_label_idx + idx3]; 128 | backptr[idx1] = 0; 129 | startloop += 1; 130 | } 131 | 132 | for (int i = startloop; i < end; ++i) { 133 | Float x0 = alphas[i + idx2]; 134 | Float x1 = alphas[(i - 1) + idx2]; 135 | Float x2 = -std::numeric_limits::infinity(); 136 | 137 | // In CTC, the optimal path may optionally chose to skip a blank label. 138 | // x2 represents skipping a letter, and can only happen if we're not 139 | // currently on a blank_label, and we're not on a repeat letter 140 | // (i != 1) just ensures we don't access labels[i - 2] if its i < 2 141 | if (labels[i] != blank_label_idx && i != 1 && 142 | labels[i] != labels[i - 2]) { 143 | x2 = alphas[(i - 2) + idx2]; 144 | } 145 | Float result = 0.0; 146 | if (x2 > x1 && x2 > x0) { 147 | result = x2; 148 | backptr[i + idx1] = 2; 149 | } else if (x1 > x0 && x1 > x2) { 150 | result = x1; 151 | backptr[i + idx1] = 1; 152 | } else { 153 | result = x0; 154 | backptr[i + idx1] = 0; 155 | } 156 | alphas[i + idx1] = result + input[labels[i] + idx3]; 157 | } 158 | } 159 | 160 | int ltrIdx = alphas[T * S - 1] > alphas[T * S - 2] ? 
S - 1 : S - 2; 161 | for (int t = T - 1; t >= 0; t--) { 162 | paths[t] = labels[ltrIdx]; 163 | ltrIdx -= backptr[(t * S) + ltrIdx]; 164 | } 165 | } 166 | 167 | } // namespace 168 | 169 | namespace fl { 170 | namespace lib { 171 | namespace cpu { 172 | 173 | template 174 | size_t ConnectionistTemporalClassificationCriterion::getWorkspaceSize( 175 | int B, 176 | int T, 177 | int N, 178 | int L) { 179 | WorkspacePtrs dummy(nullptr, B, T, N, L); 180 | return dummy.requiredSize; 181 | } 182 | 183 | template 184 | void ConnectionistTemporalClassificationCriterion::viterbi( 185 | int B, 186 | int T, 187 | int N, 188 | int _L, 189 | const Float* _input, 190 | const int* _target, 191 | const int* targetSize, 192 | int* bestPaths, 193 | void* workspace) { 194 | const int _S = (2 * _L) + 1; 195 | const int blank_label = N - 1; 196 | WorkspacePtrs ws(workspace, B, T, N, _L); 197 | for (auto b = 0; b < B; b++) { 198 | auto L = targetSize[b]; 199 | auto S = (2 * L) + 1; 200 | int repeats = setup_labels( 201 | _target + (b * _L), 202 | ws.s_inc + (b * _S), 203 | ws.e_inc + (b * _S), 204 | ws.labels_w_blanks + (b * _S), 205 | blank_label, 206 | L, 207 | S); 208 | compute_alphas( 209 | _input + (b * N * T), 210 | repeats, 211 | S, 212 | T, 213 | N, 214 | ws.e_inc + b * _S, 215 | ws.s_inc + b * _S, 216 | ws.labels_w_blanks + b * _S, 217 | ws.alpha + (b * _S * T), 218 | ws.backptr + (b * _S * T), 219 | bestPaths + (b * T)); 220 | } 221 | } 222 | 223 | template struct ConnectionistTemporalClassificationCriterion; 224 | template struct ConnectionistTemporalClassificationCriterion; 225 | 226 | } // namespace cpu 227 | } // namespace lib 228 | } // namespace fl 229 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | #pragma once 8 | 9 | #include 10 | 11 | #include "flashlight/lib/sequence/Defines.h" 12 | 13 | namespace fl { 14 | namespace lib { 15 | namespace cpu { 16 | 17 | template 18 | struct FL_SEQ_API ConnectionistTemporalClassificationCriterion { 19 | static size_t getWorkspaceSize(int B, int T, int N, int L); 20 | 21 | static void viterbi( 22 | int B, 23 | int T, 24 | int N, 25 | int L, 26 | const Float* input, 27 | const int* target, 28 | const int* targetSize, 29 | int* bestPaths, 30 | void* workspace); 31 | }; 32 | } // namespace cpu 33 | } // namespace lib 34 | } // namespace fl 35 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/CriterionUtils.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 
6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cpu/CriterionUtils.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "flashlight/lib/sequence/criterion/Defines.h" 15 | 16 | namespace fl { 17 | namespace lib { 18 | namespace cpu { 19 | 20 | template 21 | void CriterionUtils::batchTargetSize( 22 | int B, 23 | int L, 24 | int maxSize, 25 | const int* target, 26 | int* targetSize) { 27 | for (int b = 0; b < B; ++b) { 28 | for (int i = L - 1; i >= 0; --i) { 29 | if (target[b * L + i] >= 0) { 30 | targetSize[b] = std::min(maxSize, i + 1); 31 | break; 32 | } 33 | } 34 | } 35 | } 36 | 37 | template 38 | void CriterionUtils::computeScale( 39 | int B, 40 | int T, 41 | int /* N */, 42 | CriterionScaleMode scaleMode, 43 | const int* targetSize, 44 | Float* scale) { 45 | for (int b = 0; b < B; ++b) { 46 | switch (scaleMode) { 47 | case CriterionScaleMode::NONE: 48 | scale[b] = 1.0; 49 | break; 50 | case CriterionScaleMode::INPUT_SZ: 51 | scale[b] = T > 0 ? 1.0 / T : 1.0; 52 | break; 53 | case CriterionScaleMode::INPUT_SZ_SQRT: 54 | scale[b] = T > 0 ? std::sqrt(1.0 / T) : 1.0; 55 | break; 56 | case CriterionScaleMode::TARGET_SZ: 57 | scale[b] = targetSize[b] > 0 ? 1.0 / targetSize[b] : 1.0; 58 | break; 59 | case CriterionScaleMode::TARGET_SZ_SQRT: 60 | scale[b] = targetSize[b] > 0 ? std::sqrt(1.0 / targetSize[b]) : 1.0; 61 | break; 62 | default: 63 | break; 64 | } 65 | } 66 | } 67 | 68 | template struct CriterionUtils; 69 | template struct CriterionUtils; 70 | 71 | } // namespace cpu 72 | } // namespace lib 73 | } // namespace fl 74 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/CriterionUtils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | 12 | #include "flashlight/lib/sequence//criterion/Defines.h" 13 | #include "flashlight/lib/sequence/Defines.h" 14 | 15 | using fl::lib::seq::CriterionScaleMode; 16 | 17 | namespace fl { 18 | namespace lib { 19 | namespace cpu { 20 | 21 | /// Check CUDA header for docs. 22 | template 23 | struct FL_SEQ_API CriterionUtils { 24 | static void batchTargetSize( 25 | int B, 26 | int L, 27 | int maxSize, 28 | const int* target, 29 | int* targetSize); 30 | 31 | static void computeScale( 32 | int B, 33 | int T, 34 | int N, 35 | CriterionScaleMode scaleMode, 36 | const int* targetSize, 37 | Float* scale); 38 | }; 39 | 40 | /// Zeroes `count * sizeof(T)` device bytes 41 | template 42 | void setZero(T* ptr, size_t count) { 43 | std::memset(ptr, 0, count * sizeof(T)); 44 | } 45 | 46 | } // namespace cpu 47 | } // namespace lib 48 | } // namespace fl 49 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 
6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.h" 9 | 10 | #include 11 | #include 12 | 13 | #include "flashlight/lib/sequence/criterion/Workspace.h" 14 | #include "flashlight/lib/sequence/criterion/cpu/CriterionUtils.h" 15 | 16 | namespace { 17 | 18 | template 19 | struct WorkspacePtrs { 20 | WorkspacePtrs(void* workspace, int B, int T, int N, int L) { 21 | fl::lib::seq::Workspace<> ws(workspace); 22 | ws.request(&scale, B); 23 | ws.request(&alpha, B, T, L); 24 | ws.request(&alphaGrad, B, T, L); 25 | ws.request(&transBatchGrad, B, N, N); 26 | ws.request(&transBuf1, B, L); 27 | ws.request(&transBuf2, B, L); 28 | ws.request(&transBufGrad1, B, L); 29 | ws.request(&transBufGrad2, B, L); 30 | requiredSize = ws.requiredSize(); 31 | } 32 | 33 | Float* scale; 34 | double* alpha; 35 | double* alphaGrad; 36 | Float* transBatchGrad; 37 | Float* transBuf1; 38 | Float* transBuf2; 39 | Float* transBufGrad1; 40 | Float* transBufGrad2; 41 | size_t requiredSize; 42 | }; 43 | 44 | } // namespace 45 | 46 | namespace fl { 47 | namespace lib { 48 | namespace cpu { 49 | 50 | template 51 | size_t 52 | ForceAlignmentCriterion::getWorkspaceSize(int B, int T, int N, int L) { 53 | WorkspacePtrs dummy(nullptr, B, T, N, L); 54 | return dummy.requiredSize; 55 | } 56 | 57 | template 58 | void ForceAlignmentCriterion::forward( 59 | int B, 60 | int T, 61 | int N, 62 | int _L, 63 | CriterionScaleMode scaleMode, 64 | const Float* _input, 65 | const int* _target, 66 | const int* targetSize, 67 | const Float* trans, 68 | Float* loss, 69 | void* workspace) { 70 | WorkspacePtrs ws(workspace, B, T, N, _L); 71 | CriterionUtils::computeScale(B, T, N, scaleMode, targetSize, ws.scale); 72 | 73 | #pragma omp parallel for num_threads(B) 74 | for (int b = 0; b < B; ++b) { 75 | auto* alpha = &ws.alpha[b * T * _L]; 76 | auto* input = &_input[b * T * N]; 77 | auto* target = &_target[b * _L]; 78 | auto* transBuf1 = &ws.transBuf1[b * _L]; 79 | auto* transBuf2 = &ws.transBuf2[b * _L]; 80 | int L = targetSize[b]; 81 | 82 | alpha[0] = input[target[0]]; 83 | 84 | for (int i = 0; i < L; ++i) { 85 | transBuf1[i] = trans[target[i] * N + target[i]]; 86 | transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0; 87 | } 88 | 89 | for (int t = 1; t < T; ++t) { 90 | auto* inputCur = &input[t * N]; 91 | auto* alphaPrev = &alpha[(t - 1) * L]; 92 | auto* alphaCur = &alpha[t * L]; 93 | 94 | int high = t < L ? t : L; 95 | int low = T - t < L ? L - (T - t) : 1; 96 | 97 | if (T - t >= L) { 98 | alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]]; 99 | } 100 | 101 | if (t < L) { 102 | alphaCur[high] = 103 | alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]]; 104 | } 105 | 106 | for (int i = low; i < high; ++i) { 107 | double s1 = alphaPrev[i] + transBuf1[i]; 108 | double s2 = alphaPrev[i - 1] + transBuf2[i]; 109 | // lse = logSumExp(s1, s2) 110 | double lse = 111 | s1 < s2 ? 
s2 + log1p(exp(s1 - s2)) : s1 + log1p(exp(s2 - s1)); 112 | alphaCur[i] = lse + inputCur[target[i]]; 113 | } 114 | } 115 | 116 | loss[b] = alpha[T * L - 1] * ws.scale[b]; 117 | } 118 | } 119 | 120 | template 121 | void ForceAlignmentCriterion::backward( 122 | int B, 123 | int T, 124 | int N, 125 | int _L, 126 | const int* _target, 127 | const int* targetSize, 128 | const Float* grad, 129 | Float* _inputGrad, 130 | Float* transGrad, 131 | void* workspace) { 132 | WorkspacePtrs ws(workspace, B, T, N, _L); 133 | setZero(_inputGrad, B * T * N); 134 | setZero(transGrad, N * N); 135 | setZero(ws.alphaGrad, B * T * _L); 136 | setZero(ws.transBatchGrad, B * N * N); 137 | setZero(ws.transBufGrad1, B * _L); 138 | setZero(ws.transBufGrad2, B * _L); 139 | 140 | #pragma omp parallel for num_threads(B) 141 | for (int b = 0; b < B; ++b) { 142 | auto* alpha = &ws.alpha[b * T * _L]; 143 | auto* alphaGrad = &ws.alphaGrad[b * T * _L]; 144 | auto* inputGrad = &_inputGrad[b * T * N]; 145 | auto* target = &_target[b * _L]; 146 | auto* transBatchGrad = &ws.transBatchGrad[b * N * N]; 147 | auto* transBuf1 = &ws.transBuf1[b * _L]; 148 | auto* transBuf2 = &ws.transBuf2[b * _L]; 149 | auto* transBufGrad1 = &ws.transBufGrad1[b * _L]; 150 | auto* transBufGrad2 = &ws.transBufGrad2[b * _L]; 151 | int L = targetSize[b]; 152 | 153 | alphaGrad[T * L - 1] = 1; 154 | 155 | for (int t = T - 1; t > 0; --t) { 156 | auto* inputCurGrad = &inputGrad[t * N]; 157 | auto* alphaPrev = &alpha[(t - 1) * L]; 158 | auto* alphaCurGrad = &alphaGrad[t * L]; 159 | auto* alphaPrevGrad = &alphaGrad[(t - 1) * L]; 160 | 161 | int high = t < L ? t : L; 162 | int low = T - t < L ? L - (T - t) : 1; 163 | 164 | int high1 = t < L ? t + 1 : L; 165 | int low1 = T - t < L ? L - (T - t) : 0; 166 | 167 | for (int i = low1; i < high1; ++i) { 168 | inputCurGrad[target[i]] += alphaCurGrad[i]; 169 | } 170 | 171 | if (T - t >= L) { 172 | alphaPrevGrad[0] += alphaCurGrad[0]; 173 | transBufGrad1[0] += alphaCurGrad[0]; 174 | } 175 | 176 | if (t < L) { 177 | alphaPrevGrad[high - 1] += alphaCurGrad[high]; 178 | transBufGrad2[high] += alphaCurGrad[high]; 179 | } 180 | 181 | for (int i = low; i < high; ++i) { 182 | double s1 = alphaPrev[i] + transBuf1[i]; 183 | double s2 = alphaPrev[i - 1] + transBuf2[i]; 184 | // d1, d2 = dLogSumExp(s1, s2) 185 | double d1, d2; 186 | if (s1 < s2) { 187 | d2 = 1 / (1 + exp(s1 - s2)); 188 | d1 = 1 - d2; 189 | } else { 190 | d1 = 1 / (1 + exp(s2 - s1)); 191 | d2 = 1 - d1; 192 | } 193 | alphaPrevGrad[i] += d1 * alphaCurGrad[i]; 194 | alphaPrevGrad[i - 1] += d2 * alphaCurGrad[i]; 195 | transBufGrad1[i] += d1 * alphaCurGrad[i]; 196 | transBufGrad2[i] += d2 * alphaCurGrad[i]; 197 | } 198 | } 199 | 200 | inputGrad[target[0]] += alphaGrad[0]; 201 | auto gradScale = grad[b] * ws.scale[b]; 202 | for (int i = 0; i < T * N; ++i) { 203 | inputGrad[i] *= gradScale; 204 | } 205 | 206 | for (int i = 0; i < L; ++i) { 207 | transBatchGrad[target[i] * N + target[i]] += transBufGrad1[i]; 208 | if (i > 0) { 209 | transBatchGrad[target[i] * N + target[i - 1]] += transBufGrad2[i]; 210 | } 211 | } 212 | } 213 | 214 | for (int b = 0; b < B; ++b) { 215 | auto transBatchGrad = ws.transBatchGrad + b * N * N; 216 | auto gradScale = grad[b] * ws.scale[b]; 217 | for (int i = 0; i < N * N; ++i) { 218 | transGrad[i] += gradScale * transBatchGrad[i]; 219 | } 220 | } 221 | } 222 | 223 | template 224 | void ForceAlignmentCriterion::viterbi( 225 | int B, 226 | int T, 227 | int N, 228 | int _L, 229 | const Float* _input, 230 | const int* _target, 231 | const int* 
targetSize, 232 | const Float* trans, 233 | int* bestPaths, 234 | void* workspace) { 235 | WorkspacePtrs ws(workspace, B, T, N, _L); 236 | 237 | #pragma omp parallel for num_threads(B) 238 | for (int b = 0; b < B; ++b) { 239 | double* alpha = &ws.alpha[b * T * _L]; 240 | const Float* input = &_input[b * T * N]; 241 | const int* target = &_target[b * _L]; 242 | Float* transBuf1 = &ws.transBuf1[b * _L]; 243 | Float* transBuf2 = &ws.transBuf2[b * _L]; 244 | int L = targetSize[b]; 245 | for (int i = 0; i < L * T; i++) { 246 | alpha[i] = -std::numeric_limits::infinity(); 247 | } 248 | 249 | alpha[0] = input[target[0]]; 250 | 251 | for (int i = 0; i < L; ++i) { 252 | transBuf1[i] = trans[target[i] * N + target[i]]; 253 | transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0; 254 | } 255 | 256 | for (int t = 1; t < T; ++t) { 257 | const Float* inputCur = &input[t * N]; 258 | double* alphaPrev = &alpha[(t - 1) * L]; 259 | double* alphaCur = &alpha[t * L]; 260 | 261 | int high = t < L ? t : L; 262 | int low = T - t < L ? L - (T - t) : 1; 263 | 264 | // Handle edge cases. 265 | // If (T - t >= L), then we can conceivably still be at the initial blank 266 | if (T - t >= L) { 267 | alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]]; 268 | } 269 | 270 | // If (t < L), then the highest position can only be be computed 271 | // by transitioning. (We couldn't have been at position `high` 272 | // at the previous timestep). 273 | if (t < L) { 274 | alphaCur[high] = 275 | alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]]; 276 | } 277 | 278 | for (int i = low; i < high; ++i) { 279 | double s1 = alphaPrev[i] + transBuf1[i]; 280 | double s2 = alphaPrev[i - 1] + transBuf2[i]; 281 | alphaCur[i] = inputCur[target[i]] + fmax(s1, s2); 282 | } 283 | } 284 | 285 | auto ltrIdx = L - 1; 286 | int* bestPath = bestPaths + b * T; 287 | for (auto t = T - 1; t > 0; t--) { 288 | bestPath[t] = target[ltrIdx]; 289 | auto* alphaPrev = &alpha[(t - 1) * L]; 290 | if (ltrIdx > 0) { 291 | double s1 = alphaPrev[ltrIdx] + transBuf1[ltrIdx]; 292 | double s2 = alphaPrev[ltrIdx - 1] + transBuf2[ltrIdx]; 293 | if (s2 > s1) { 294 | ltrIdx--; 295 | } 296 | } 297 | } 298 | bestPath[0] = target[ltrIdx]; 299 | } 300 | } 301 | 302 | template struct ForceAlignmentCriterion; 303 | template struct ForceAlignmentCriterion; 304 | 305 | } // namespace cpu 306 | } // namespace lib 307 | } // namespace fl 308 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/ForceAlignmentCriterion.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | 12 | #include "flashlight/lib/sequence/Defines.h" 13 | #include "flashlight/lib/sequence/criterion/Defines.h" 14 | 15 | using fl::lib::seq::CriterionScaleMode; 16 | 17 | namespace fl { 18 | namespace lib { 19 | namespace cpu { 20 | 21 | /// Check CUDA header for docs. 
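The forward pass of ForceAlignmentCriterion above is a dynamic program over (frame t, target position i): each cell log-adds a "stay on token i" path and an "advance from token i-1" path, then adds the frame's emission score. Below is a compact pure-Python version of that recursion for a single batch element; `fac_score` is a hypothetical name, and the scaling plus the low/high pruning of unreachable cells are omitted (pruning only skips cells that cannot lie on a complete alignment, so the final score is unchanged).

    import math

    def fac_score(input, target, trans):
        # Log-sum score over all monotonic alignments of `target` (length L)
        # to `input` (T frames x N token scores), with `trans` transition scores.
        T, L = len(input), len(target)
        NEG_INF = float("-inf")
        alpha = [[NEG_INF] * L for _ in range(T)]
        alpha[0][0] = input[0][target[0]]
        for t in range(1, T):
            for i in range(L):
                stay = alpha[t - 1][i] + trans[target[i]][target[i]]
                move = alpha[t - 1][i - 1] + trans[target[i]][target[i - 1]] if i > 0 else NEG_INF
                m = max(stay, move)
                if m == NEG_INF:
                    continue  # cell not reachable yet
                lse = m + math.log(math.exp(stay - m) + math.exp(move - m))
                alpha[t][i] = lse + input[t][target[i]]
        return alpha[T - 1][L - 1]

    # 3 frames, a 2-token target over a 2-token alphabet, zero transitions.
    inp = [[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]
    print(fac_score(inp, [0, 1], trans=[[0.0, 0.0], [0.0, 0.0]]))
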
22 | template 23 | struct FL_SEQ_API ForceAlignmentCriterion { 24 | static size_t getWorkspaceSize(int B, int T, int N, int L); 25 | 26 | static void forward( 27 | int B, 28 | int T, 29 | int N, 30 | int L, 31 | CriterionScaleMode scaleMode, 32 | const Float* input, 33 | const int* target, 34 | const int* targetSize, 35 | const Float* trans, 36 | Float* loss, 37 | void* workspace); 38 | 39 | static void backward( 40 | int B, 41 | int T, 42 | int N, 43 | int L, 44 | const int* target, 45 | const int* targetSize, 46 | const Float* grad, 47 | Float* inputGrad, 48 | Float* transGrad, 49 | void* workspace); 50 | 51 | static void viterbi( 52 | int B, 53 | int T, 54 | int N, 55 | int L, 56 | const Float* input, 57 | const int* target, 58 | const int* targetSize, 59 | const Float* trans, 60 | int* bestPaths, 61 | void* workspace); 62 | }; 63 | 64 | } // namespace cpu 65 | } // namespace lib 66 | } // namespace fl 67 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.h" 9 | 10 | #include 11 | 12 | #include "flashlight/lib/sequence/criterion/Workspace.h" 13 | #include "flashlight/lib/sequence/criterion/cpu/CriterionUtils.h" 14 | 15 | namespace { 16 | 17 | template 18 | struct WorkspacePtrs { 19 | explicit WorkspacePtrs(void* workspace, int B, int T, int N) { 20 | fl::lib::seq::Workspace<> ws(workspace); 21 | ws.request(&scale, B); 22 | ws.request(&alpha, B, T, N); 23 | ws.request(&alphaGrad, B, T, N); 24 | ws.request(&transBatchGrad, B, N, N); 25 | ws.request(&transBuf, B, N, N); 26 | requiredSize = ws.requiredSize(); 27 | } 28 | 29 | Float* scale; 30 | double* alpha; 31 | double* alphaGrad; 32 | double* transBatchGrad; 33 | double* transBuf; 34 | size_t requiredSize; 35 | }; 36 | 37 | } // namespace 38 | 39 | namespace fl { 40 | namespace lib { 41 | namespace cpu { 42 | 43 | template 44 | size_t FullConnectionCriterion::getWorkspaceSize(int B, int T, int N) { 45 | return WorkspacePtrs(nullptr, B, T, N).requiredSize; 46 | } 47 | 48 | template 49 | void FullConnectionCriterion::forward( 50 | int B, 51 | int T, 52 | int N, 53 | CriterionScaleMode scaleMode, 54 | const Float* input, 55 | const int* targetSize, 56 | const Float* trans, 57 | Float* loss, 58 | void* workspace) { 59 | WorkspacePtrs ws(workspace, B, T, N); 60 | CriterionUtils::computeScale(B, T, N, scaleMode, targetSize, ws.scale); 61 | 62 | #pragma omp parallel for num_threads(B) 63 | for (int b = 0; b < B; ++b) { 64 | for (int n = 0; n < N; ++n) { 65 | int k = b * T * N + n; 66 | ws.alpha[k] = input[k]; 67 | } 68 | 69 | for (int t = 1; t <= T; ++t) { 70 | for (int m = 0; m < N; ++m) { 71 | const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N]; 72 | const auto* inputCur = &input[b * T * N + t * N]; 73 | auto* transBuf = &ws.transBuf[b * N * N + m * N]; 74 | auto* alphaCur = &ws.alpha[b * T * N + t * N]; 75 | 76 | double maxValue = -INFINITY; 77 | for (int n = 0; n < N; ++n) { 78 | double val = transBuf[n] = 79 | alphaPrev[n] + (t == T ? 0 : trans[m * N + n]); 80 | maxValue = val > maxValue ? 
val : maxValue; 81 | } 82 | 83 | double sumValue = 0; 84 | for (int n = 0; n < N; ++n) { 85 | sumValue += exp(transBuf[n] - maxValue); 86 | } 87 | 88 | if (t == T) { 89 | loss[b] = ws.scale[b] * (log(sumValue) + maxValue); 90 | break; 91 | } 92 | 93 | alphaCur[m] = log(sumValue) + maxValue + inputCur[m]; 94 | } 95 | } 96 | } 97 | } 98 | 99 | template 100 | void FullConnectionCriterion::backward( 101 | int B, 102 | int T, 103 | int N, 104 | const Float* trans, 105 | const Float* grad, 106 | Float* _inputGrad, 107 | Float* transGrad, 108 | void* workspace) { 109 | WorkspacePtrs ws(workspace, B, T, N); 110 | setZero(_inputGrad, B * T * N); 111 | setZero(transGrad, N * N); 112 | setZero(ws.alphaGrad, B * T * N); 113 | setZero(ws.transBatchGrad, B * N * N); 114 | 115 | #pragma omp parallel for num_threads(B) 116 | for (int b = 0; b < B; ++b) { 117 | for (int t = T; t > 0; --t) { 118 | for (int m = 0; m < N; ++m) { 119 | const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N]; 120 | const auto* alphaCurGrad = &ws.alphaGrad[b * T * N + t * N]; 121 | auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N]; 122 | auto* transBuf = &ws.transBuf[b * N * N + m * N]; 123 | auto* transBatchGrad = &ws.transBatchGrad[b * N * N + m * N]; 124 | 125 | double maxValue = -INFINITY; 126 | for (int n = 0; n < N; ++n) { 127 | double val = transBuf[n] = 128 | alphaPrev[n] + (t == T ? 0 : trans[m * N + n]); 129 | maxValue = val > maxValue ? val : maxValue; 130 | } 131 | 132 | double sumValue = 0; 133 | for (int n = 0; n < N; ++n) { 134 | transBuf[n] = exp(transBuf[n] - maxValue); 135 | sumValue += transBuf[n]; 136 | } 137 | 138 | if (t == T) { 139 | for (int n = 0; n < N; ++n) { 140 | alphaPrevGrad[n] = transBuf[n] / sumValue; 141 | } 142 | break; 143 | } 144 | 145 | for (int n = 0; n < N; ++n) { 146 | transBuf[n] = transBuf[n] / sumValue * alphaCurGrad[m]; 147 | transBatchGrad[n] += transBuf[n]; 148 | } 149 | } 150 | 151 | if (t == T) { 152 | continue; 153 | } 154 | 155 | for (int m = 0; m < N; ++m) { 156 | auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N]; 157 | 158 | for (int n = 0; n < N; ++n) { 159 | alphaPrevGrad[m] += ws.transBuf[b * N * N + n * N + m]; 160 | } 161 | } 162 | } 163 | 164 | auto* alphaGrad = &ws.alphaGrad[b * T * N]; 165 | auto* inputGrad = &_inputGrad[b * T * N]; 166 | 167 | for (int i = 0; i < T * N; ++i) { 168 | inputGrad[i] = ws.scale[b] * grad[b] * alphaGrad[i]; 169 | } 170 | } 171 | 172 | for (int b = 0; b < B; ++b) { 173 | auto* transBatchGrad = &ws.transBatchGrad[b * N * N]; 174 | 175 | for (int i = 0; i < N * N; ++i) { 176 | transGrad[i] += ws.scale[b] * grad[b] * transBatchGrad[i]; 177 | } 178 | } 179 | } 180 | 181 | template struct FullConnectionCriterion; 182 | template struct FullConnectionCriterion; 183 | 184 | } // namespace cpu 185 | } // namespace lib 186 | } // namespace fl 187 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/FullConnectionCriterion.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 
6 | */ 7 | 8 | #pragma once 9 | 10 | #include <cstddef> 11 | 12 | #include "flashlight/lib/sequence/Defines.h" 13 | #include "flashlight/lib/sequence/criterion/Defines.h" 14 | 15 | using fl::lib::seq::CriterionScaleMode; 16 | 17 | namespace fl { 18 | namespace lib { 19 | namespace cpu { 20 | 21 | /// Check CUDA header for docs. 22 | template <class Float> 23 | struct FL_SEQ_API FullConnectionCriterion { 24 | static size_t getWorkspaceSize(int B, int T, int N); 25 | 26 | static void forward( 27 | int B, 28 | int T, 29 | int N, 30 | CriterionScaleMode scaleMode, 31 | const Float* input, 32 | const int* targetSize, 33 | const Float* trans, 34 | Float* loss, 35 | void* workspace); 36 | 37 | static void backward( 38 | int B, 39 | int T, 40 | int N, 41 | const Float* trans, 42 | const Float* grad, 43 | Float* inputGrad, 44 | Float* transGrad, 45 | void* workspace); 46 | }; 47 | 48 | } // namespace cpu 49 | } // namespace lib 50 | } // namespace fl 51 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/ViterbiPath.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cpu/ViterbiPath.h" 9 | 10 | #include <cmath> 11 | 12 | #include "flashlight/lib/sequence/criterion/Workspace.h" 13 | 14 | namespace { 15 | 16 | template <class Float> 17 | struct WorkspacePtrs { 18 | explicit WorkspacePtrs(void* workspace, int B, int T, int N) { 19 | fl::lib::seq::Workspace<> ws(workspace); 20 | ws.request(&alpha, B, 2, N); 21 | ws.request(&beta, B, T, N); 22 | requiredSize = ws.requiredSize(); 23 | } 24 | 25 | Float* alpha; 26 | int* beta; 27 | size_t requiredSize; 28 | }; 29 | 30 | } // namespace 31 | 32 | namespace fl { 33 | namespace lib { 34 | namespace cpu { 35 | 36 | template <class Float> 37 | size_t ViterbiPath<Float>::getWorkspaceSize(int B, int T, int N) { 38 | return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize; 39 | } 40 | 41 | template <class Float> 42 | void ViterbiPath<Float>::compute( 43 | int B, 44 | int T, 45 | int N, 46 | const Float* input, 47 | const Float* trans, 48 | int* _path, 49 | void* workspace) { 50 | WorkspacePtrs<Float> ws(workspace, B, T, N); 51 | 52 | #pragma omp parallel for num_threads(B) 53 | for (int b = 0; b < B; ++b) { 54 | for (int n = 0; n < N; ++n) { 55 | ws.alpha[b * 2 * N + n] = input[b * T * N + n]; 56 | } 57 | 58 | for (int t = 1; t <= T; ++t) { 59 | const auto* alphaPrev = &ws.alpha[b * 2 * N + ((t - 1) % 2) * N]; 60 | const auto* inputCur = &input[b * T * N + t * N]; 61 | auto* alphaCur = &ws.alpha[b * 2 * N + (t % 2) * N]; 62 | auto* betaCur = &ws.beta[b * T * N + t * N]; 63 | 64 | for (int m = 0; m < N; ++m) { 65 | int maxIndex = -1; 66 | Float maxValue = -INFINITY; 67 | for (int n = 0; n < N; ++n) { 68 | Float val = alphaPrev[n] + (t == T ?
0 : trans[m * N + n]); 69 | if (val > maxValue) { 70 | maxIndex = n; 71 | maxValue = val; 72 | } 73 | } 74 | 75 | if (t == T) { 76 | auto* path = &_path[b * T]; 77 | path[T - 1] = maxIndex; 78 | for (int s = T - 1; s > 0; --s) { 79 | path[s - 1] = ws.beta[b * T * N + s * N + path[s]]; 80 | } 81 | break; 82 | } 83 | 84 | alphaCur[m] = maxValue + inputCur[m]; 85 | betaCur[m] = maxIndex; 86 | } 87 | } 88 | } 89 | } 90 | 91 | template struct ViterbiPath<float>; 92 | template struct ViterbiPath<double>; 93 | 94 | } // namespace cpu 95 | } // namespace lib 96 | } // namespace fl 97 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cpu/ViterbiPath.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include <cstddef> 11 | 12 | #include "flashlight/lib/sequence/Defines.h" 13 | 14 | namespace fl { 15 | namespace lib { 16 | namespace cpu { 17 | 18 | /// Check CUDA header for docs. 19 | template <class Float> 20 | struct FL_SEQ_API ViterbiPath { 21 | static size_t getWorkspaceSize(int B, int T, int N); 22 | 23 | static void compute( 24 | int B, 25 | int T, 26 | int N, 27 | const Float* input, 28 | const Float* trans, 29 | int* path, 30 | void* workspace); 31 | }; 32 | 33 | } // namespace cpu 34 | } // namespace lib 35 | } // namespace fl 36 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/CriterionUtils.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh" 9 | 10 | #include 11 | 12 | namespace { 13 | 14 | using fl::lib::seq::CriterionScaleMode; 15 | using namespace fl::lib::seq; 16 | 17 | /* 18 | * B thread blocks 19 | * 32 threads/block (ideally) 20 | */ 21 | __global__ void 22 | batchTargetSizeKernel(int L, int maxSize, const int* _target, int* targetSize) { 23 | auto b = blockIdx.x; 24 | auto target = _target + b * L; 25 | 26 | __shared__ int idx; 27 | 28 | if (threadIdx.x == 0) { 29 | idx = 0; 30 | } 31 | 32 | __syncthreads(); 33 | 34 | for (auto i = L - 1 - threadIdx.x; i >= 0; i -= blockDim.x) { 35 | if (target[i] >= 0) { 36 | atomicMax(&idx, i + 1); 37 | break; 38 | } 39 | } 40 | 41 | __syncthreads(); 42 | 43 | if (threadIdx.x == 0) { 44 | targetSize[b] = idx < maxSize ? idx : maxSize; 45 | } 46 | } 47 | 48 | /* 49 | * 1 thread block 50 | * B threads/block (ideally) 51 | */ 52 | template <class Float> 53 | __global__ void computeScaleKernel( 54 | int B, 55 | int T, 56 | int /* N */, 57 | CriterionScaleMode scaleMode, 58 | const int* targetSize, 59 | Float* scale) { 60 | for (auto b = threadIdx.x; b < B; b += blockDim.x) { 61 | switch (scaleMode) { 62 | case CriterionScaleMode::NONE: 63 | scale[b] = 1.0; 64 | break; 65 | case CriterionScaleMode::INPUT_SZ: 66 | scale[b] = T > 0 ? 1.0 / T : 1.0; 67 | break; 68 | case CriterionScaleMode::INPUT_SZ_SQRT: 69 | scale[b] = T > 0 ? std::sqrt(1.0 / T) : 1.0; 70 | break; 71 | case CriterionScaleMode::TARGET_SZ: 72 | scale[b] = targetSize[b] > 0 ?
1.0 / targetSize[b] : 1.0; 73 | break; 74 | case CriterionScaleMode::TARGET_SZ_SQRT: 75 | scale[b] = targetSize[b] > 0 ? std::sqrt(1.0 / targetSize[b]) : 1.0; 76 | break; 77 | default: 78 | break; 79 | } 80 | } 81 | } 82 | 83 | } // namespace 84 | 85 | namespace fl { 86 | namespace lib { 87 | namespace cuda { 88 | 89 | template <class Float> 90 | void CriterionUtils<Float>::batchTargetSize( 91 | int B, 92 | int L, 93 | int maxSize, 94 | const int* target, 95 | int* targetSize, 96 | cudaStream_t stream) { 97 | batchTargetSizeKernel<<<B, 32, 0, stream>>>(L, maxSize, target, targetSize); 98 | } 99 | 100 | template <class Float> 101 | void CriterionUtils<Float>::computeScale( 102 | int B, 103 | int T, 104 | int N, 105 | CriterionScaleMode scaleMode, 106 | const int* targetSize, 107 | Float* scale, 108 | cudaStream_t stream) { 109 | int blockSize = std::min(256, (B + 31) / 32 * 32); 110 | computeScaleKernel<<<1, blockSize, 0, stream>>>( 111 | B, T, N, scaleMode, targetSize, scale); 112 | } 113 | 114 | template struct CriterionUtils<float>; 115 | template struct CriterionUtils<double>; 116 | 117 | } // namespace cuda 118 | } // namespace lib 119 | } // namespace fl 120 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include <math_constants.h> // for CUDART_INF 13 | #include 14 | 15 | #include "flashlight/lib/sequence/Defines.h" 16 | #include "flashlight/lib/sequence/criterion/Defines.h" 17 | 18 | using fl::lib::seq::CriterionScaleMode; 19 | 20 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 21 | 22 | /// Double-precision `atomicAdd` backport for compute capability < 6.0 23 | /// From NVIDIA docs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ 24 | static __inline__ __device__ double atomicAdd(double* address, double val) { 25 | unsigned long long int* address_as_ull = (unsigned long long int*)address; 26 | unsigned long long int old = *address_as_ull, assumed; 27 | 28 | do { 29 | assumed = old; 30 | old = atomicCAS( 31 | address_as_ull, 32 | assumed, 33 | __double_as_longlong(val + __longlong_as_double(assumed))); 34 | 35 | // Note: uses integer comparison to avoid hang in case of NaN (since NaN != 36 | // NaN) 37 | } while (assumed != old); 38 | 39 | return __longlong_as_double(old); 40 | } 41 | 42 | #endif 43 | 44 | namespace fl { 45 | namespace lib { 46 | namespace cuda { 47 | 48 | template <class Float> 49 | struct FL_SEQ_API CriterionUtils { 50 | /** 51 | * B: batch size 52 | * L: target size 53 | * maxSize: target size results are clamped down to this 54 | * target: [B][L] target labels 55 | * targetSize: [B] (out) target sizes 56 | * stream: CUDA stream 57 | */ 58 | static void batchTargetSize( 59 | int B, 60 | int L, 61 | int maxSize, 62 | const int* target, 63 | int* targetSize, 64 | cudaStream_t stream); 65 | 66 | /** 67 | * B: batch size 68 | * T: input length 69 | * N: dictionary size 70 | * scaleMode: type of size scaling 71 | * targetSize: [B] target sizes 72 | * scale: [B] (out) scale factor 73 | * stream: CUDA stream 74 | */ 75 | static void computeScale( 76 | int B, 77 | int T, 78 | int N, 79 | CriterionScaleMode scaleMode, 80 | const int* targetSize, 81 | Float* scale, 82 | cudaStream_t stream); 83 | }; 84 | 85 | /// Zeroes `count *
sizeof(T)` device bytes 86 | template <class T> 87 | void setZero(T* devPtr, size_t count, cudaStream_t stream) { 88 | cudaMemsetAsync(devPtr, 0, count * sizeof(T), stream); 89 | } 90 | 91 | } // namespace cuda 92 | } // namespace lib 93 | } // namespace fl 94 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cuh" 9 | 10 | #include <algorithm> 11 | #include <cmath> 12 | 13 | #include "flashlight/lib/sequence/criterion/Workspace.h" 14 | #include "flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh" 15 | using fl::lib::seq::CriterionScaleMode; 16 | 17 | namespace { 18 | 19 | template <class Float> 20 | struct WorkspacePtrs { 21 | explicit WorkspacePtrs(void* workspace, int B, int T, int N, int L) { 22 | fl::lib::seq::Workspace<> ws(workspace); 23 | ws.request(&scale, B); 24 | ws.request(&alpha, B, T, L); 25 | ws.request(&alphaGrad, B, T, L); 26 | ws.request(&transBatchGrad, B, N, N); 27 | ws.request(&transBuf1, B, L); 28 | ws.request(&transBuf2, B, L); 29 | ws.request(&transBufGrad1, B, L); 30 | ws.request(&transBufGrad2, B, L); 31 | requiredSize = ws.requiredSize(); 32 | } 33 | 34 | Float* scale; 35 | double* alpha; 36 | double* alphaGrad; 37 | Float* transBatchGrad; 38 | Float* transBuf1; 39 | Float* transBuf2; 40 | Float* transBufGrad1; 41 | Float* transBufGrad2; 42 | size_t requiredSize; 43 | }; 44 | 45 | /* 46 | * B thread blocks 47 | * L threads/block (ideally) 48 | */ 49 | template <class Float> 50 | __global__ void forwardKernel( 51 | int T, 52 | int N, 53 | int _L, 54 | const Float* _input, 55 | const int* _target, 56 | const int* targetSize, 57 | const Float* trans, 58 | Float* _loss, 59 | WorkspacePtrs<Float> ws) { 60 | auto b = blockIdx.x; 61 | auto* alpha = &ws.alpha[b * T * _L]; 62 | auto* input = &_input[b * T * N]; 63 | auto* target = &_target[b * _L]; 64 | auto* transBuf1 = &ws.transBuf1[b * _L]; 65 | auto* transBuf2 = &ws.transBuf2[b * _L]; 66 | int L = targetSize[b]; 67 | 68 | for (auto i = threadIdx.x; i < L; i += blockDim.x) { 69 | alpha[i] = i == 0 ? input[target[0]] : 0; 70 | transBuf1[i] = trans[target[i] * N + target[i]]; 71 | transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0; 72 | } 73 | 74 | for (int t = 1; t < T; ++t) { 75 | auto* inputCur = &input[t * N]; 76 | auto* alphaPrev = &alpha[(t - 1) * L]; 77 | auto* alphaCur = &alpha[t * L]; 78 | 79 | int high = t < L ? t : L; 80 | int low = T - t < L ? L - (T - t) : 1; 81 | 82 | __syncthreads(); 83 | 84 | if (threadIdx.x == 0) { 85 | if (T - t >= L) { 86 | alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]]; 87 | } 88 | } else if (threadIdx.x == 1) { 89 | if (t < L) { 90 | alphaCur[high] = 91 | alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]]; 92 | } 93 | } 94 | 95 | for (auto i = low + threadIdx.x; i < high; i += blockDim.x) { 96 | double s1 = alphaPrev[i] + transBuf1[i]; 97 | double s2 = alphaPrev[i - 1] + transBuf2[i]; 98 | // lse = logSumExp(s1, s2) 99 | double lse = 100 | s1 < s2 ?
s2 + log(1 + exp(s1 - s2)) : s1 + log(1 + exp(s2 - s1)); 101 | alphaCur[i] = lse + inputCur[target[i]]; 102 | } 103 | } 104 | 105 | __syncthreads(); 106 | 107 | if (threadIdx.x == 0) { 108 | _loss[b] = alpha[T * L - 1] * ws.scale[b]; 109 | } 110 | } 111 | 112 | /* 113 | * B thread blocks 114 | * L threads/block (ideally) 115 | */ 116 | template 117 | __global__ void backwardKernel( 118 | int T, 119 | int N, 120 | int _L, 121 | const int* _target, 122 | const int* targetSize, 123 | const Float* grad, 124 | Float* _inputGrad, 125 | Float* transGrad, 126 | WorkspacePtrs ws) { 127 | auto b = blockIdx.x; 128 | auto* alpha = &ws.alpha[b * T * _L]; 129 | auto* alphaGrad = &ws.alphaGrad[b * T * _L]; 130 | auto* inputGrad = &_inputGrad[b * T * N]; 131 | auto* target = &_target[b * _L]; 132 | auto* transBatchGrad = &ws.transBatchGrad[b * N * N]; 133 | auto* transBuf1 = &ws.transBuf1[b * _L]; 134 | auto* transBuf2 = &ws.transBuf2[b * _L]; 135 | auto* transBufGrad1 = &ws.transBufGrad1[b * _L]; 136 | auto* transBufGrad2 = &ws.transBufGrad2[b * _L]; 137 | int L = targetSize[b]; 138 | 139 | if (threadIdx.x == 0) { 140 | alphaGrad[T * L - 1] = 1; 141 | } 142 | 143 | for (int t = T - 1; t > 0; --t) { 144 | auto* inputCurGrad = &inputGrad[t * N]; 145 | auto* alphaPrev = &alpha[(t - 1) * L]; 146 | auto* alphaCurGrad = &alphaGrad[t * L]; 147 | auto* alphaPrevGrad = &alphaGrad[(t - 1) * L]; 148 | 149 | int high = t < L ? t : L; 150 | int low = T - t < L ? L - (T - t) : 1; 151 | 152 | int high1 = t < L ? t + 1 : L; 153 | int low1 = T - t < L ? L - (T - t) : 0; 154 | 155 | __syncthreads(); 156 | 157 | for (auto i = low1 + threadIdx.x; i < high1; i += blockDim.x) { 158 | atomicAdd(&inputCurGrad[target[i]], alphaCurGrad[i]); 159 | } 160 | 161 | if (threadIdx.x == 0) { 162 | if (T - t >= L) { 163 | atomicAdd(&alphaPrevGrad[0], alphaCurGrad[0]); 164 | transBufGrad1[0] += alphaCurGrad[0]; 165 | } 166 | } else if (threadIdx.x == 1) { 167 | if (t < L) { 168 | atomicAdd(&alphaPrevGrad[high - 1], alphaCurGrad[high]); 169 | transBufGrad2[high] += alphaCurGrad[high]; 170 | } 171 | } 172 | 173 | for (auto i = low + threadIdx.x; i < high; i += blockDim.x) { 174 | double s1 = alphaPrev[i] + transBuf1[i]; 175 | double s2 = alphaPrev[i - 1] + transBuf2[i]; 176 | // d1, d2 = dLogSumExp(s1, s2) 177 | double d1, d2; 178 | if (s1 < s2) { 179 | d2 = 1 / (1 + exp(s1 - s2)); 180 | d1 = 1 - d2; 181 | } else { 182 | d1 = 1 / (1 + exp(s2 - s1)); 183 | d2 = 1 - d1; 184 | } 185 | atomicAdd(&alphaPrevGrad[i], d1 * alphaCurGrad[i]); 186 | atomicAdd(&alphaPrevGrad[i - 1], d2 * alphaCurGrad[i]); 187 | transBufGrad1[i] += d1 * alphaCurGrad[i]; 188 | transBufGrad2[i] += d2 * alphaCurGrad[i]; 189 | } 190 | } 191 | 192 | __syncthreads(); 193 | 194 | __shared__ Float gradScale; 195 | 196 | if (threadIdx.x == 0) { 197 | inputGrad[target[0]] += alphaGrad[0]; 198 | gradScale = grad[b] * ws.scale[b]; 199 | } 200 | 201 | for (auto i = threadIdx.x; i < L; i += blockDim.x) { 202 | atomicAdd(&transBatchGrad[target[i] * N + target[i]], transBufGrad1[i]); 203 | if (i > 0) { 204 | atomicAdd( 205 | &transBatchGrad[target[i] * N + target[i - 1]], transBufGrad2[i]); 206 | } 207 | } 208 | 209 | __syncthreads(); 210 | 211 | for (auto i = threadIdx.x; i < T * N; i += blockDim.x) { 212 | inputGrad[i] *= gradScale; 213 | } 214 | 215 | for (auto i = threadIdx.x; i < N * N; i += blockDim.x) { 216 | atomicAdd(&transGrad[i], gradScale * transBatchGrad[i]); 217 | } 218 | } 219 | 220 | template 221 | __global__ void viterbiPathKernel( 222 | int T, 223 | int N, 224 | int 
_L, 225 | const Float* _input, 226 | const int* _target, 227 | const int* targetSize, 228 | const Float* trans, 229 | int* bestPaths, 230 | WorkspacePtrs<Float> ws) { 231 | auto b = blockIdx.x; 232 | auto* alpha = &ws.alpha[b * T * _L]; 233 | auto* input = &_input[b * T * N]; 234 | auto* target = &_target[b * _L]; 235 | auto* transBuf1 = &ws.transBuf1[b * _L]; 236 | auto* transBuf2 = &ws.transBuf2[b * _L]; 237 | int L = targetSize[b]; 238 | 239 | for (auto i = threadIdx.x; i < L * T; i += blockDim.x) { 240 | alpha[i] = i == 0 ? input[target[0]] : -CUDART_INF_F; 241 | } 242 | 243 | for (auto i = threadIdx.x; i < L; i += blockDim.x) { 244 | transBuf1[i] = trans[target[i] * N + target[i]]; 245 | transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0; 246 | } 247 | if (L > T || L == 0) { 248 | return; 249 | } 250 | 251 | for (int t = 1; t < T; ++t) { 252 | auto* inputCur = &input[t * N]; 253 | auto* alphaPrev = &alpha[(t - 1) * L]; 254 | auto* alphaCur = &alpha[t * L]; 255 | 256 | int high = t < L ? t : L; 257 | int low = T - t < L ? L - (T - t) : 1; 258 | 259 | // Ensure that all previous alphas have been computed 260 | __syncthreads(); 261 | 262 | if (threadIdx.x == 0) { 263 | if (T - t >= L) { 264 | alphaCur[0] = alphaPrev[0] + transBuf1[0] + inputCur[target[0]]; 265 | } 266 | } else if (threadIdx.x == 1) { 267 | if (t < L) { 268 | alphaCur[high] = 269 | alphaPrev[high - 1] + transBuf2[high] + inputCur[target[high]]; 270 | } 271 | } 272 | 273 | for (auto i = low + threadIdx.x; i < high; i += blockDim.x) { 274 | double s1 = alphaPrev[i] + transBuf1[i]; 275 | double s2 = alphaPrev[i - 1] + transBuf2[i]; 276 | alphaCur[i] = inputCur[target[i]] + max(s1, s2); 277 | } 278 | } 279 | // Ensure all threads are finished and alphas have been computed before 280 | // computing backward path 281 | __syncthreads(); 282 | if (threadIdx.x == 0) { 283 | int ltrIdx = L - 1; 284 | for (int t = T - 1; t > 0; t--) { 285 | bestPaths[t + (b * T)] = target[ltrIdx]; 286 | auto* alphaPrev = &alpha[(t - 1) * L]; 287 | if (ltrIdx > 0) { 288 | double s1 = alphaPrev[ltrIdx] + transBuf1[ltrIdx]; 289 | double s2 = alphaPrev[ltrIdx - 1] + transBuf2[ltrIdx]; 290 | if (s2 > s1) { 291 | ltrIdx--; 292 | } 293 | } 294 | } 295 | bestPaths[b * T] = target[ltrIdx]; 296 | } 297 | } 298 | 299 | } // namespace 300 | 301 | namespace fl { 302 | namespace lib { 303 | namespace cuda { 304 | 305 | template <class Float> 306 | size_t 307 | ForceAlignmentCriterion<Float>::getWorkspaceSize(int B, int T, int N, int L) { 308 | return WorkspacePtrs<Float>(nullptr, B, T, N, L).requiredSize; 309 | } 310 | 311 | template <class Float> 312 | void ForceAlignmentCriterion<Float>::forward( 313 | int B, 314 | int T, 315 | int N, 316 | int L, 317 | CriterionScaleMode scaleMode, 318 | const Float* input, 319 | const int* target, 320 | const int* targetSize, 321 | const Float* trans, 322 | Float* loss, 323 | void* workspace, 324 | cudaStream_t stream) { 325 | int blockSize = std::min(256, (L + 31) / 32 * 32); 326 | WorkspacePtrs<Float> ws(workspace, B, T, N, L); 327 | CriterionUtils<Float>::computeScale( 328 | B, T, N, scaleMode, targetSize, ws.scale, stream); 329 | forwardKernel<<<B, blockSize, 0, stream>>>( 330 | T, N, L, input, target, targetSize, trans, loss, ws); 331 | } 332 | 333 | template <class Float> 334 | void ForceAlignmentCriterion<Float>::backward( 335 | int B, 336 | int T, 337 | int N, 338 | int L, 339 | const int* target, 340 | const int* targetSize, 341 | const Float* grad, 342 | Float* inputGrad, 343 | Float* transGrad, 344 | void* workspace, 345 | cudaStream_t stream) { 346 | int blockSize = std::min(256, (L + 31) / 32 * 32); 347 |
WorkspacePtrs<Float> ws(workspace, B, T, N, L); 348 | setZero(inputGrad, B * T * N, stream); 349 | setZero(transGrad, N * N, stream); 350 | setZero(ws.alphaGrad, B * T * L, stream); 351 | setZero(ws.transBatchGrad, B * N * N, stream); 352 | setZero(ws.transBufGrad1, B * L, stream); 353 | setZero(ws.transBufGrad2, B * L, stream); 354 | backwardKernel<<<B, blockSize, 0, stream>>>( 355 | T, N, L, target, targetSize, grad, inputGrad, transGrad, ws); 356 | } 357 | 358 | template <class Float> 359 | void ForceAlignmentCriterion<Float>::viterbiPath( 360 | int B, 361 | int T, 362 | int N, 363 | int L, 364 | const Float* input, 365 | const int* target, 366 | const int* targetSize, 367 | const Float* trans, 368 | int* bestPaths, 369 | void* workspace, 370 | cudaStream_t stream) { 371 | int blockSize = std::min(256, (L + 31) / 32 * 32); 372 | WorkspacePtrs<Float> ws(workspace, B, T, N, L); 373 | setZero(ws.alpha, B * T * L, stream); 374 | viterbiPathKernel<<<B, blockSize, 0, stream>>>( 375 | T, N, L, input, target, targetSize, trans, bestPaths, ws); 376 | } 377 | 378 | template struct ForceAlignmentCriterion<float>; 379 | template struct ForceAlignmentCriterion<double>; 380 | 381 | } // namespace cuda 382 | } // namespace lib 383 | } // namespace fl 384 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include <cuda_runtime.h> 11 | 12 | #include "flashlight/lib/sequence/Defines.h" 13 | #include "flashlight/lib/sequence/criterion/Defines.h" 14 | 15 | using fl::lib::seq::CriterionScaleMode; 16 | 17 | namespace fl { 18 | namespace lib { 19 | namespace cuda { 20 | 21 | /// The numerator of ASG loss. Reference: https://arxiv.org/abs/1609.03193 22 | template <class Float> 23 | struct FL_SEQ_API ForceAlignmentCriterion { 24 | /** 25 | * B: batch size 26 | * T: input length 27 | * N: dictionary size 28 | * L: target size 29 | */ 30 | static size_t getWorkspaceSize(int B, int T, int N, int L); 31 | 32 | /** 33 | * B: batch size 34 | * T: input length 35 | * N: dictionary size 36 | * L: target size 37 | * scaleMode: type of size scaling 38 | * input: [B][T][N] input frames from network 39 | * target: [B][L] target labels 40 | * targetSize: [B] target sizes 41 | * trans: [N][N] transition matrix 42 | * loss: [B] (out) loss value 43 | * workspace: (in/out) internal workspace 44 | * stream: CUDA stream 45 | */ 46 | static void forward( 47 | int B, 48 | int T, 49 | int N, 50 | int L, 51 | CriterionScaleMode scaleMode, 52 | const Float* input, 53 | const int* target, 54 | const int* targetSize, 55 | const Float* trans, 56 | Float* loss, 57 | void* workspace, 58 | cudaStream_t stream); 59 | 60 | /** 61 | * B: batch size 62 | * T: input length 63 | * N: dictionary size 64 | * L: target size 65 | * target: [B][L] target labels 66 | * targetSize: [B] target sizes 67 | * grad: [B] gradient w.r.t. loss 68 | * inputGrad: [B][T][N] (out) gradient w.r.t. input 69 | * transGrad: [N][N] (out) gradient w.r.t.
transitions 70 | * workspace: (in/out) internal workspace from forward 71 | * stream: CUDA stream 72 | */ 73 | static void backward( 74 | int B, 75 | int T, 76 | int N, 77 | int L, 78 | const int* target, 79 | const int* targetSize, 80 | const Float* grad, 81 | Float* inputGrad, 82 | Float* transGrad, 83 | void* workspace, 84 | cudaStream_t stream); 85 | 86 | static void viterbiPath( 87 | int B, 88 | int T, 89 | int N, 90 | int L, 91 | const Float* input, 92 | const int* target, 93 | const int* targetSize, 94 | const Float* trans, 95 | int* bestPaths, 96 | void* workspace, 97 | cudaStream_t stream); 98 | }; 99 | 100 | } // namespace cuda 101 | } // namespace lib 102 | } // namespace fl 103 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cuh" 9 | 10 | #include 11 | 12 | #include 13 | 14 | #include "flashlight/lib/sequence/criterion/Workspace.h" 15 | #include "flashlight/lib/sequence/criterion/cuda/CriterionUtils.cuh" 16 | using fl::lib::seq::CriterionScaleMode; 17 | 18 | namespace { 19 | 20 | constexpr int kBlockSize = 32; 21 | 22 | template 23 | struct WorkspacePtrs { 24 | explicit WorkspacePtrs(void* workspace, int B, int T, int N) { 25 | fl::lib::seq::Workspace<> ws(workspace); 26 | ws.request(&scale, B); 27 | ws.request(&alpha, B, T, N); 28 | ws.request(&alphaGrad, B, T, N); 29 | ws.request(&transBatchGrad, B, N, N); 30 | ws.request(&transBuf, B, N, N); 31 | requiredSize = ws.requiredSize(); 32 | } 33 | 34 | Float* scale; 35 | double* alpha; 36 | double* alphaGrad; 37 | double* transBatchGrad; 38 | double* transBuf; 39 | size_t requiredSize; 40 | }; 41 | 42 | /* 43 | * B thread blocks 44 | * kBlockSize threads/block 45 | */ 46 | template 47 | __global__ void 48 | forwardInitial(int T, int N, const Float* input, WorkspacePtrs ws) { 49 | auto b = blockIdx.x; 50 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 51 | int k = b * T * N + n; 52 | ws.alpha[k] = input[k]; 53 | } 54 | } 55 | 56 | /* 57 | * B * N thread blocks (B if Final) 58 | * kBlockSize threads/block 59 | */ 60 | template 61 | __global__ void forwardStep( 62 | int T, 63 | int N, 64 | int t, 65 | const Float* input, 66 | const Float* trans, 67 | Float* loss, 68 | WorkspacePtrs ws) { 69 | int b, m; 70 | if (Final) { 71 | b = blockIdx.x; 72 | } else { 73 | b = blockIdx.x / N; 74 | m = blockIdx.x % N; 75 | } 76 | 77 | const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N]; 78 | const auto* inputCur = &input[b * T * N + t * N]; 79 | auto* alphaCur = &ws.alpha[b * T * N + t * N]; 80 | auto* transBuf = &ws.transBuf[blockIdx.x * N]; 81 | 82 | using BlockReduce = cub::BlockReduce; 83 | __shared__ typename BlockReduce::TempStorage tempStorage; 84 | __shared__ double maxValue; 85 | 86 | double threadMax = -INFINITY; 87 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 88 | double val = transBuf[n] = alphaPrev[n] + (Final ? 0 : trans[m * N + n]); 89 | threadMax = val > threadMax ? 
val : threadMax; 90 | } 91 | 92 | double maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max()); 93 | if (threadIdx.x == 0) { 94 | maxValue = maxResult; 95 | } 96 | 97 | __syncthreads(); 98 | 99 | double threadSum = 0; 100 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 101 | threadSum += exp(transBuf[n] - maxValue); 102 | } 103 | 104 | double sumResult = BlockReduce(tempStorage).Sum(threadSum); 105 | if (threadIdx.x == 0) { 106 | if (Final) { 107 | loss[b] = ws.scale[b] * (log(sumResult) + maxValue); 108 | } else { 109 | alphaCur[m] = log(sumResult) + maxValue + inputCur[m]; 110 | } 111 | } 112 | } 113 | 114 | /* 115 | * B * N thread blocks (B if Initial) 116 | * kBlockSize threads/block 117 | */ 118 | template 119 | __global__ void backwardStep1( 120 | int T, 121 | int N, 122 | int t, 123 | const Float* trans, 124 | WorkspacePtrs ws) { 125 | int b, m; 126 | if (Initial) { 127 | b = blockIdx.x; 128 | } else { 129 | b = blockIdx.x / N; 130 | m = blockIdx.x % N; 131 | } 132 | 133 | const auto* alphaPrev = &ws.alpha[b * T * N + (t - 1) * N]; 134 | const auto* alphaCurGrad = &ws.alphaGrad[b * T * N + t * N]; 135 | auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N]; 136 | auto* transBuf = &ws.transBuf[blockIdx.x * N]; 137 | auto* transBatchGrad = &ws.transBatchGrad[blockIdx.x * N]; 138 | 139 | using BlockReduce = cub::BlockReduce; 140 | __shared__ typename BlockReduce::TempStorage tempStorage; 141 | __shared__ double maxValue; 142 | __shared__ double sumValue; 143 | 144 | double threadMax = -INFINITY; 145 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 146 | double val = transBuf[n] = alphaPrev[n] + (Initial ? 0 : trans[m * N + n]); 147 | threadMax = val > threadMax ? val : threadMax; 148 | } 149 | 150 | double maxResult = BlockReduce(tempStorage).Reduce(threadMax, cub::Max()); 151 | if (threadIdx.x == 0) { 152 | maxValue = maxResult; 153 | } 154 | 155 | double threadSum = 0; 156 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 157 | transBuf[n] = exp(transBuf[n] - maxValue); 158 | threadSum += transBuf[n]; 159 | } 160 | 161 | double sumResult = BlockReduce(tempStorage).Sum(threadSum); 162 | if (threadIdx.x == 0) { 163 | sumValue = sumResult; 164 | } 165 | 166 | __syncthreads(); 167 | 168 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 169 | if (Initial) { 170 | alphaPrevGrad[n] = transBuf[n] / sumValue; 171 | } else { 172 | transBuf[n] = transBuf[n] / sumValue * alphaCurGrad[m]; 173 | transBatchGrad[n] += transBuf[n]; 174 | } 175 | } 176 | } 177 | 178 | /* 179 | * B * N thread blocks 180 | * kBlockSize threads/block 181 | */ 182 | template 183 | __global__ void backwardStep2(int T, int N, int t, WorkspacePtrs ws) { 184 | auto b = blockIdx.x / N; 185 | auto m = blockIdx.x % N; 186 | 187 | auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N]; 188 | 189 | using BlockReduce = cub::BlockReduce; 190 | __shared__ typename BlockReduce::TempStorage tempStorage; 191 | 192 | double threadSum = 0; 193 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 194 | threadSum += ws.transBuf[b * N * N + n * N + m]; 195 | } 196 | 197 | double sumResult = BlockReduce(tempStorage).Sum(threadSum); 198 | if (threadIdx.x == 0) { 199 | alphaPrevGrad[m] = sumResult; 200 | } 201 | } 202 | 203 | /* 204 | * B thread blocks 205 | * 128 threads/block 206 | */ 207 | template 208 | __global__ void backwardFinal( 209 | int T, 210 | int N, 211 | const Float* _grad, 212 | Float* _inputGrad, 213 | Float* transGrad, 214 | WorkspacePtrs ws) { 215 | auto b = blockIdx.x; 216 | 
217 | auto* alphaGrad = &ws.alphaGrad[b * T * N]; 218 | auto* inputGrad = &_inputGrad[b * T * N]; 219 | auto* transBatchGrad = &ws.transBatchGrad[b * N * N]; 220 | 221 | __shared__ Float gradScale; 222 | 223 | if (threadIdx.x == 0) { 224 | gradScale = ws.scale[b] * _grad[b]; 225 | } 226 | 227 | __syncthreads(); 228 | 229 | for (auto i = threadIdx.x; i < T * N; i += blockDim.x) { 230 | inputGrad[i] = gradScale * alphaGrad[i]; 231 | } 232 | 233 | for (auto i = threadIdx.x; i < N * N; i += blockDim.x) { 234 | atomicAdd(&transGrad[i], gradScale * transBatchGrad[i]); 235 | } 236 | } 237 | 238 | } // namespace 239 | 240 | namespace fl { 241 | namespace lib { 242 | namespace cuda { 243 | 244 | template <class Float> 245 | size_t FullConnectionCriterion<Float>::getWorkspaceSize(int B, int T, int N) { 246 | return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize; 247 | } 248 | 249 | template <class Float> 250 | void FullConnectionCriterion<Float>::forward( 251 | int B, 252 | int T, 253 | int N, 254 | CriterionScaleMode scaleMode, 255 | const Float* input, 256 | const int* targetSize, 257 | const Float* trans, 258 | Float* loss, 259 | void* workspace, 260 | cudaStream_t stream) { 261 | WorkspacePtrs<Float> ws(workspace, B, T, N); 262 | CriterionUtils<Float>::computeScale( 263 | B, T, N, scaleMode, targetSize, ws.scale, stream); 264 | forwardInitial<<<B, kBlockSize, 0, stream>>>(T, N, input, ws); 265 | for (int t = 1; t < T; ++t) { 266 | forwardStep<Float, false> 267 | <<<B * N, kBlockSize, 0, stream>>>(T, N, t, input, trans, loss, ws); 268 | } 269 | forwardStep<Float, true> 270 | <<<B, kBlockSize, 0, stream>>>(T, N, T, input, trans, loss, ws); 271 | } 272 | 273 | template <class Float> 274 | void FullConnectionCriterion<Float>::backward( 275 | int B, 276 | int T, 277 | int N, 278 | const Float* trans, 279 | const Float* grad, 280 | Float* inputGrad, 281 | Float* transGrad, 282 | void* workspace, 283 | cudaStream_t stream) { 284 | WorkspacePtrs<Float> ws(workspace, B, T, N); 285 | setZero(inputGrad, B * T * N, stream); 286 | setZero(transGrad, N * N, stream); 287 | setZero(ws.transBatchGrad, B * N * N, stream); 288 | backwardStep1<Float, true><<<B, kBlockSize, 0, stream>>>(T, N, T, trans, ws); 289 | for (int t = T - 1; t > 0; --t) { 290 | backwardStep1<Float, false><<<B * N, kBlockSize, 0, stream>>>(T, N, t, trans, ws); 291 | backwardStep2<<<B * N, kBlockSize, 0, stream>>>(T, N, t, ws); 292 | } 293 | backwardFinal<<<B, 128, 0, stream>>>(T, N, grad, inputGrad, transGrad, ws); 294 | } 295 | 296 | template struct FullConnectionCriterion<float>; 297 | template struct FullConnectionCriterion<double>; 298 | 299 | } // namespace cuda 300 | } // namespace lib 301 | } // namespace fl 302 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include <cuda_runtime.h> 11 | 12 | #include "flashlight/lib/sequence/Defines.h" 13 | #include "flashlight/lib/sequence/criterion/Defines.h" 14 | 15 | using fl::lib::seq::CriterionScaleMode; 16 | 17 | namespace fl { 18 | namespace lib { 19 | namespace cuda { 20 | 21 | /// The denominator of ASG loss.
Reference: https://arxiv.org/abs/1609.03193 22 | template 23 | struct FL_SEQ_API FullConnectionCriterion { 24 | /** 25 | * B: batch size 26 | * T: input length 27 | * N: dictionary size 28 | */ 29 | static size_t getWorkspaceSize(int B, int T, int N); 30 | 31 | /** 32 | * B: batch size 33 | * T: input length 34 | * N: dictionary size 35 | * scaleMode: type of size scaling 36 | * input: [B][T][N] input frames from network 37 | * targetSize: [B] target sizes (may be null if not needed for scaleMode) 38 | * trans: [N][N] transition matrix 39 | * loss: [B] (out) loss value 40 | * workspace: (in/out) internal workspace 41 | * stream: CUDA stream 42 | */ 43 | static void forward( 44 | int B, 45 | int T, 46 | int N, 47 | CriterionScaleMode scaleMode, 48 | const Float* input, 49 | const int* targetSize, 50 | const Float* trans, 51 | Float* loss, 52 | void* workspace, 53 | cudaStream_t stream); 54 | 55 | /** 56 | * B: batch size 57 | * T: input length 58 | * N: dictionary size 59 | * trans: [N][N] transition matrix 60 | * grad: [B] gradient w.r.t. loss 61 | * inputGrad: [B][T][N] (out) gradient w.r.t. input 62 | * transGrad: [N][N] (out) gradient w.r.t transitions 63 | * workspace: (in/out) internal workspace from forward 64 | * stream: CUDA stream 65 | */ 66 | static void backward( 67 | int B, 68 | int T, 69 | int N, 70 | const Float* trans, 71 | const Float* grad, 72 | Float* inputGrad, 73 | Float* transGrad, 74 | void* workspace, 75 | cudaStream_t stream); 76 | }; 77 | 78 | } // namespace cuda 79 | } // namespace lib 80 | } // namespace fl 81 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/ViterbiPath.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 
6 | */ 7 | 8 | #include "flashlight/lib/sequence/criterion/cuda/ViterbiPath.cuh" 9 | 10 | #include <cmath> 11 | 12 | #include <cub/cub.cuh> 13 | 14 | #include "flashlight/lib/sequence/criterion/Workspace.h" 15 | 16 | namespace { 17 | 18 | constexpr int kBlockSize = 32; 19 | 20 | template <class Float> 21 | struct WorkspacePtrs { 22 | explicit WorkspacePtrs(void* workspace, int B, int T, int N) { 23 | fl::lib::seq::Workspace<> ws(workspace); 24 | ws.request(&alpha, B, 2, N); 25 | ws.request(&beta, B, T, N); 26 | requiredSize = ws.requiredSize(); 27 | } 28 | 29 | Float* alpha; 30 | int* beta; 31 | size_t requiredSize; 32 | }; 33 | 34 | /* 35 | * B thread blocks 36 | * kBlockSize threads/block 37 | */ 38 | template <class Float> 39 | __global__ void 40 | computeInitial(int T, int N, const Float* input, WorkspacePtrs<Float> ws) { 41 | auto b = blockIdx.x; 42 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 43 | ws.alpha[b * 2 * N + n] = input[b * T * N + n]; 44 | } 45 | } 46 | 47 | /* 48 | * B * N thread blocks (B if Final) 49 | * kBlockSize threads/block 50 | */ 51 | template <class Float, bool Final> 52 | __global__ void computeStep( 53 | int T, 54 | int N, 55 | int t, 56 | const Float* input, 57 | const Float* trans, 58 | int* _path, 59 | WorkspacePtrs<Float> ws) { 60 | int b, m; 61 | if (Final) { 62 | b = blockIdx.x; 63 | } else { 64 | b = blockIdx.x / N; 65 | m = blockIdx.x % N; 66 | } 67 | 68 | const auto* alphaPrev = &ws.alpha[b * 2 * N + ((t - 1) % 2) * N]; 69 | const auto* inputCur = &input[b * T * N + t * N]; 70 | auto* alphaCur = &ws.alpha[b * 2 * N + (t % 2) * N]; 71 | auto* betaCur = &ws.beta[b * T * N + t * N]; 72 | 73 | using BlockReduce = 74 | cub::BlockReduce<cub::KeyValuePair<int, Float>, kBlockSize>; 75 | __shared__ typename BlockReduce::TempStorage tempStorage; 76 | 77 | cub::KeyValuePair<int, Float> threadMax; 78 | threadMax.value = -INFINITY; 79 | for (auto n = threadIdx.x; n < N; n += blockDim.x) { 80 | Float val = alphaPrev[n] + (Final ? 0 : trans[m * N + n]); 81 | if (val > threadMax.value) { 82 | threadMax.key = n; 83 | threadMax.value = val; 84 | } 85 | } 86 | 87 | auto result = BlockReduce(tempStorage).Reduce(threadMax, cub::ArgMax()); 88 | if (threadIdx.x == 0) { 89 | if (Final) { 90 | auto* path = &_path[b * T]; 91 | path[T - 1] = result.key; 92 | for (int s = T - 1; s > 0; --s) { 93 | path[s - 1] = ws.beta[b * T * N + s * N + path[s]]; 94 | } 95 | } else { 96 | alphaCur[m] = result.value + inputCur[m]; 97 | betaCur[m] = result.key; 98 | } 99 | } 100 | } 101 | 102 | } // namespace 103 | 104 | namespace fl { 105 | namespace lib { 106 | namespace cuda { 107 | 108 | template <class Float> 109 | size_t ViterbiPath<Float>::getWorkspaceSize(int B, int T, int N) { 110 | return WorkspacePtrs<Float>(nullptr, B, T, N).requiredSize; 111 | } 112 | 113 | template <class Float> 114 | void ViterbiPath<Float>::compute( 115 | int B, 116 | int T, 117 | int N, 118 | const Float* input, 119 | const Float* trans, 120 | int* path, 121 | void* workspace, 122 | cudaStream_t stream) { 123 | WorkspacePtrs<Float> ws(workspace, B, T, N); 124 | computeInitial<<<B, kBlockSize, 0, stream>>>(T, N, input, ws); 125 | for (int t = 1; t < T; ++t) { 126 | computeStep<Float, false> 127 | <<<B * N, kBlockSize, 0, stream>>>(T, N, t, input, trans, path, ws); 128 | } 129 | computeStep<Float, true> 130 | <<<B, kBlockSize, 0, stream>>>(T, N, T, input, trans, path, ws); 131 | } 132 | 133 | template struct ViterbiPath<float>; 134 | template struct ViterbiPath<double>; 135 | 136 | } // namespace cuda 137 | } // namespace lib 138 | } // namespace fl 139 | -------------------------------------------------------------------------------- /flashlight/lib/sequence/criterion/cuda/ViterbiPath.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc.
and its affiliates. 3 | * 4 | * This source code is licensed under the MIT-style license found in the 5 | * LICENSE file in the root directory of this source tree. 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | 12 | #include "flashlight/lib/sequence/Defines.h" 13 | 14 | namespace fl { 15 | namespace lib { 16 | namespace cuda { 17 | 18 | /// Computes max likelihood path using Viterbi algorithm. 19 | template 20 | struct FL_SEQ_API ViterbiPath { 21 | /** 22 | * B: batch size 23 | * T: input length 24 | * N: dictionary size 25 | */ 26 | static size_t getWorkspaceSize(int B, int T, int N); 27 | 28 | /** 29 | * B: batch size 30 | * T: input length 31 | * N: dictionary size 32 | * input: [B][T][N] input frames from network 33 | * trans: [N][N] transition matrix 34 | * path: [B][T] (out) Viterbi path 35 | * workspace: (in/out) internal workspace 36 | * stream: CUDA stream 37 | */ 38 | static void compute( 39 | int B, 40 | int T, 41 | int N, 42 | const Float* input, 43 | const Float* trans, 44 | int* path, 45 | void* workspace, 46 | cudaStream_t stream); 47 | }; 48 | 49 | } // namespace cuda 50 | } // namespace lib 51 | } // namespace fl 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "cmake", "packaging"] 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Copyright (c) Facebook, Inc. and its affiliates. 4 | This source code is licensed under the MIT-style license found in the 5 | LICENSE file in the root directory of this source tree. 
6 | """ 7 | 8 | import datetime 9 | import os 10 | import platform 11 | import re 12 | import subprocess 13 | import sys 14 | from pathlib import Path 15 | 16 | from packaging import version 17 | from setuptools import Extension, find_namespace_packages, setup 18 | from setuptools.command.build_ext import build_ext 19 | 20 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 21 | 22 | # Path relative to project root that contains Python artifacts for packaging 23 | PACKAGE_DIR = "bindings/python" 24 | ARTIFACTS_DIR = os.path.join(PACKAGE_DIR, "flashlight/lib/sequence") 25 | BUILD_VERSION_PATH = Path(os.path.join(THIS_DIR, "BUILD_VERSION.txt")) 26 | 27 | 28 | # Environment variables: 29 | # - `USE_CUDA=1` enables building with CUDA support 30 | # By default builds with USE_CUDA=0 31 | 32 | 33 | def check_env_flag(name, default=""): 34 | return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"] 35 | 36 | 37 | def check_negative_env_flag(name, default="") -> bool: 38 | return os.getenv(name, default).upper() in ["OFF", "0", "NO", "FALSE", "N"] 39 | 40 | 41 | def get_local_version_suffix() -> str: 42 | date_suffix = datetime.datetime.now().strftime("%Y%m%d") 43 | git_hash = subprocess.check_output( 44 | ["git", "rev-parse", "--short", "HEAD"], cwd=Path(__file__).parent 45 | ).decode("ascii")[:-1] 46 | return f"+{git_hash}.d{date_suffix}" 47 | 48 | 49 | def write_version_file(version: str): 50 | version_path = os.path.join(THIS_DIR, ARTIFACTS_DIR, "version.py") 51 | with open(version_path, "w") as f: 52 | f.write("# noqa: C801\n") 53 | f.write(f'__version__ = "{version}"\n') 54 | tag = os.getenv("GIT_TAG") 55 | if tag is not None: 56 | f.write(f'git_tag = "{tag}"\n') 57 | 58 | 59 | class CMakeExtension(Extension): 60 | def __init__(self, name): 61 | Extension.__init__(self, name, sources=[]) 62 | 63 | 64 | class CMakeBuild(build_ext): 65 | def run(self): 66 | try: 67 | out = subprocess.check_output(["cmake", "--version"]) 68 | except OSError: 69 | raise RuntimeError( 70 | "CMake must be installed to build the following extensions: " 71 | + ", ".join(e.name for e in self.extensions) 72 | ) 73 | 74 | cmake_version = re.search(r"version\s*([\d.]+)", out.decode().lower()).group(1) 75 | if version.parse(cmake_version) < version.parse("3.18"): 76 | raise RuntimeError( 77 | "CMake >= 3.18 is required to build flashlight-sequence Python bindings" 78 | ) 79 | 80 | # our CMakeLists builds all the extensions at once 81 | for ext in self.extensions: 82 | self.build_extensions(ext) 83 | 84 | def build_extensions(self, ext): 85 | if not os.path.exists(self.build_temp): 86 | os.makedirs(self.build_temp) 87 | 88 | ext_dir = str(Path(self.get_ext_fullpath(ext.name)).absolute().parent) 89 | source_dir = str(Path(__file__).absolute().parent) 90 | use_cuda = "ON" if check_env_flag("USE_CUDA") else "OFF" 91 | use_openmp = "OFF" if check_negative_env_flag("USE_OPENMP") else "ON" 92 | cmake_args = [ 93 | "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + ext_dir, 94 | "-DPython3_EXECUTABLE=" + sys.executable, 95 | "-DBUILD_SHARED_LIBS=ON", 96 | "-DFL_SEQUENCE_BUILD_TESTS=OFF", 97 | "-DFL_SEQUENCE_BUILD_PYTHON=ON", 98 | "-DFL_SEQUENCE_BUILD_PYTHON_PACKAGE=ON", 99 | "-DFL_SEQUENCE_BUILD_STANDALONE=OFF", 100 | "-DFL_SEQUENCE_USE_OPENMP=" + use_openmp, 101 | "-DFL_SEQUENCE_USE_CUDA=" + use_cuda, 102 | ] 103 | cfg = "Debug" if self.debug else "Release" 104 | build_args = ["--config", cfg] 105 | 106 | if platform.system() == "Windows": 107 | cmake_args += [ 108 | 
"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), ext_dir), 109 | "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), ext_dir), 110 | "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), ext_dir), 111 | ] 112 | if sys.maxsize > 2**32: 113 | cmake_args += ["-A", "x64"] 114 | build_args += ["--", "/m"] 115 | else: 116 | cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg] 117 | build_args += ["--", "-j4"] 118 | 119 | env = os.environ.copy() 120 | env["CXXFLAGS"] = '{} -fPIC -DVERSION_INFO=\\"{}\\"'.format( 121 | env.get("CXXFLAGS", ""), self.distribution.get_version() 122 | ) 123 | 124 | if not os.path.exists(self.build_temp): 125 | os.makedirs(self.build_temp) 126 | subprocess.check_call( 127 | ["cmake", source_dir] + cmake_args, cwd=self.build_temp, env=env 128 | ) 129 | subprocess.check_call( 130 | ["cmake", "--build", "."] + build_args, cwd=self.build_temp 131 | ) 132 | 133 | 134 | def main(): 135 | if os.getenv("BUILD_VERSION"): 136 | version = os.getenv("BUILD_VERSION") 137 | elif BUILD_VERSION_PATH.is_file(): 138 | version = BUILD_VERSION_PATH.read_text().strip() 139 | else: 140 | version_txt = os.path.join(THIS_DIR, PACKAGE_DIR, "version.txt") 141 | with open(version_txt) as f: 142 | version = f.readline().strip() 143 | version += get_local_version_suffix() 144 | 145 | write_version_file(version) 146 | 147 | # Read Python bindings README 148 | long_description = (Path(PACKAGE_DIR) / "README.md").read_text() 149 | 150 | setup( 151 | name="flashlight-sequence", 152 | version=version, 153 | url="https://github.com/flashlight/sequence", 154 | author="Jacob Kahn", 155 | author_email="jacobkahn1@gmail.com", 156 | description="Flashlight Sequence bindings for Python", 157 | long_description=long_description, 158 | long_description_content_type="text/markdown", 159 | packages=find_namespace_packages( 160 | where=PACKAGE_DIR, 161 | include=["flashlight.lib.sequence", "flashlight.lib.sequence.criterion"], 162 | exclude=["test"], 163 | ), 164 | package_dir={"": PACKAGE_DIR}, 165 | ext_modules=[CMakeExtension("flashlight.lib.sequence.criterion")], 166 | cmdclass={"build_ext": CMakeBuild}, 167 | zip_safe=False, 168 | license="BSD licensed, as found in the LICENSE file", 169 | python_requires=">=3.6", 170 | classifiers=[ 171 | "Programming Language :: Python :: 3.6", 172 | "Programming Language :: Python :: 3.7", 173 | "Programming Language :: Python :: 3.8", 174 | "Programming Language :: Python :: 3.9", 175 | "Programming Language :: Python :: 3.10", 176 | "Programming Language :: Python :: 3.11", 177 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 178 | "Operating System :: OS Independent", 179 | ], 180 | ) 181 | 182 | 183 | if __name__ == "__main__": 184 | main() 185 | --------------------------------------------------------------------------------