├── .clang-format ├── .gitignore ├── .syntastic_cpp_config ├── .travis.yml ├── CMakeLists.txt ├── NOTES.md ├── README.md ├── compile.env ├── hflt.cmake.in ├── scripts ├── get_albert_pretrained.sh ├── get_data.sh ├── get_third_party.sh ├── sst2tojson.jq └── trace_model.py ├── src ├── CMakeLists.txt ├── config_utils.cpp ├── config_utils.h ├── dataset_classification.cpp ├── dataset_classification.h ├── dataset_qa.cpp ├── dataset_qa.h ├── processors.cpp ├── processors.h ├── run_model.cpp ├── run_model.h ├── squad_utils.cpp ├── squad_utils.h ├── tokenizer_albert.cpp ├── tokenizer_albert.h ├── tokenizer_base.cpp ├── tokenizer_base.h ├── transformer_example.h └── transformer_stack.h └── test ├── CMakeLists.txt ├── assets ├── sst-2-head.json └── sst-2-head.tsv ├── hflt_tests.cpp ├── scripts └── convert_squad.py ├── test_dataset_and_processors.h ├── test_squad_utils.h ├── test_transformer_config.h └── test_transformer_example.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveMacros: false 7 | AlignConsecutiveAssignments: false 8 | AlignConsecutiveDeclarations: false 9 | AlignEscapedNewlines: Right 10 | AlignOperands: true 11 | AlignTrailingComments: true 12 | AllowAllArgumentsOnNextLine: true 13 | AllowAllConstructorInitializersOnNextLine: true 14 | AllowAllParametersOfDeclarationOnNextLine: true 15 | AllowShortBlocksOnASingleLine: false 16 | AllowShortCaseLabelsOnASingleLine: false 17 | AllowShortFunctionsOnASingleLine: All 18 | AllowShortLambdasOnASingleLine: All 19 | AllowShortIfStatementsOnASingleLine: Never 20 | AllowShortLoopsOnASingleLine: false 21 | AlwaysBreakAfterDefinitionReturnType: None 22 | AlwaysBreakAfterReturnType: None 23 | AlwaysBreakBeforeMultilineStrings: false 24 | AlwaysBreakTemplateDeclarations: MultiLine 25 | BinPackArguments: true 26 | BinPackParameters: true 27 | BraceWrapping: 28 | AfterCaseLabel: false 29 | AfterClass: false 30 | AfterControlStatement: false 31 | AfterEnum: false 32 | AfterFunction: false 33 | AfterNamespace: false 34 | AfterObjCDeclaration: false 35 | AfterStruct: false 36 | AfterUnion: false 37 | AfterExternBlock: false 38 | BeforeCatch: false 39 | BeforeElse: false 40 | IndentBraces: false 41 | SplitEmptyFunction: true 42 | SplitEmptyRecord: true 43 | SplitEmptyNamespace: true 44 | BreakBeforeBinaryOperators: None 45 | BreakBeforeBraces: Attach 46 | BreakBeforeInheritanceComma: false 47 | BreakInheritanceList: BeforeColon 48 | BreakBeforeTernaryOperators: true 49 | BreakConstructorInitializersBeforeComma: false 50 | BreakConstructorInitializers: BeforeColon 51 | BreakAfterJavaFieldAnnotations: false 52 | BreakStringLiterals: true 53 | ColumnLimit: 80 54 | CommentPragmas: '^ IWYU pragma:' 55 | CompactNamespaces: false 56 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 57 | ConstructorInitializerIndentWidth: 4 58 | ContinuationIndentWidth: 4 59 | Cpp11BracedListStyle: true 60 | DerivePointerAlignment: false 61 | DisableFormat: false 62 | ExperimentalAutoDetectBinPacking: false 63 | FixNamespaceComments: true 64 | ForEachMacros: 65 | - foreach 66 | - Q_FOREACH 67 | - BOOST_FOREACH 68 | IncludeBlocks: Preserve 69 | IncludeCategories: 70 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 71 | Priority: 2 72 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 73 | Priority: 3 74 | - Regex: '.*' 75 | Priority: 1 76 | IncludeIsMainRegex: '(Test)?$' 77 | IndentCaseLabels: false 78 | 
IndentPPDirectives: None 79 | IndentWidth: 2 80 | IndentWrappedFunctionNames: false 81 | JavaScriptQuotes: Leave 82 | JavaScriptWrapImports: true 83 | KeepEmptyLinesAtTheStartOfBlocks: true 84 | MacroBlockBegin: '' 85 | MacroBlockEnd: '' 86 | MaxEmptyLinesToKeep: 1 87 | NamespaceIndentation: None 88 | ObjCBinPackProtocolList: Auto 89 | ObjCBlockIndentWidth: 2 90 | ObjCSpaceAfterProperty: false 91 | ObjCSpaceBeforeProtocolList: true 92 | PenaltyBreakAssignment: 2 93 | PenaltyBreakBeforeFirstCallParameter: 19 94 | PenaltyBreakComment: 300 95 | PenaltyBreakFirstLessLess: 120 96 | PenaltyBreakString: 1000 97 | PenaltyBreakTemplateDeclaration: 10 98 | PenaltyExcessCharacter: 1000000 99 | PenaltyReturnTypeOnItsOwnLine: 60 100 | PointerAlignment: Right 101 | ReflowComments: true 102 | SortIncludes: true 103 | SortUsingDeclarations: true 104 | SpaceAfterCStyleCast: false 105 | SpaceAfterLogicalNot: false 106 | SpaceAfterTemplateKeyword: true 107 | SpaceBeforeAssignmentOperators: true 108 | SpaceBeforeCpp11BracedList: false 109 | SpaceBeforeCtorInitializerColon: true 110 | SpaceBeforeInheritanceColon: true 111 | SpaceBeforeParens: ControlStatements 112 | SpaceBeforeRangeBasedForLoopColon: true 113 | SpaceInEmptyParentheses: false 114 | SpacesBeforeTrailingComments: 1 115 | SpacesInAngles: false 116 | SpacesInContainerLiterals: true 117 | SpacesInCStyleCastParentheses: false 118 | SpacesInParentheses: false 119 | SpacesInSquareBrackets: false 120 | Standard: Cpp11 121 | StatementMacros: 122 | - Q_UNUSED 123 | - QT_REQUIRE_VERSION 124 | TabWidth: 8 125 | UseTab: Never 126 | ... 127 | 128 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # models 35 | *.model 36 | *.bin 37 | *.pt 38 | 39 | # build folder, temporary testing, third party libraries and data 40 | build/ 41 | tmp/ 42 | data/ 43 | models/* 44 | third_party/* 45 | 46 | # dev environmental variables 47 | dev.env 48 | -------------------------------------------------------------------------------- /.syntastic_cpp_config: -------------------------------------------------------------------------------- 1 | -I/home/david/.local/libtorch/include/torch/csrc/api/include 2 | -I/home/david/.local/libtorch/include 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | dist: bionic 3 | 4 | compiler: gcc 5 | 6 | addons: 7 | apt: 8 | packages: cmake build-essential pkg-config libgoogle-perftools-dev libboost-all-dev wget unzip libgtest-dev 9 | 10 | before_script: 11 | - mkdir build_gtest && cd build_gtest 12 | - cmake -DBUILD_SHARED_LIBS=ON /usr/src/gtest && make && sudo make install 13 | - sudo ldconfig /usr/local/lib 14 | - cd .. 
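# clean up the temporary gtest build tree, then fetch the dataset, the third-party libraries, and the pretrained SST-2 model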
15 | - rm -Rf build_gtest 16 | - mkdir models 17 | - alias python='python3' 18 | - scripts/get_data.sh 19 | - scripts/get_third_party.sh 20 | - wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O models/sst2_trained.tar.gz && rm -rf /tmp/cookies.txt 21 | - curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh 22 | - chmod +x /tmp/miniconda.sh 23 | - /tmp/miniconda.sh -b -p ~/miniconda3 24 | - source "$HOME/miniconda3/etc/profile.d/conda.sh" 25 | - hash -r 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | - conda create -q -n test-environment python=3.7 29 | - conda activate test-environment 30 | - conda install -c pytorch pytorch cpuonly 31 | - pip install transformers typer 32 | - cd models && tar xzvf sst2_trained.tar.gz && cd .. 33 | - python scripts/trace_model.py 34 | - source compile.env 35 | - mkdir build 36 | - cd build 37 | - cmake -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug .. 38 | 39 | script: 40 | - make 41 | 42 | after_success: 43 | - src/hflt 44 | - head -n 100 ../data/SST-2/dev.tsv > ../data/SST-2/dev-small.tsv 45 | - src/hflt ../models/sst2_trained ../data/SST-2/dev-small.tsv 46 | - ctest -VV 47 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | cmake_policy(SET CMP0074 NEW) 4 | 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | 8 | option(DO_CLANG_TIDY "clang tidy output" OFF) 9 | if (DO_CLANG_TIDY) 10 | set(CMAKE_CXX_CLANG_TIDY clang-tidy -checks=-*,readability-*) 11 | endif() 12 | 13 | # set the project name: Huggingface Libtorch i.e. HfLt 14 | project(hflt VERSION 0.0.1 DESCRIPTION "huggingface transformers inference in c++") 15 | 16 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 17 | 18 | # add sentencepiece, consider making this a Find*.cmake file 19 | if (DEFINED ENV{SENTENCEPIECE_ROOT}) 20 | set(SENTENCEPIECE_ROOT $ENV{SENTENCEPIECE_ROOT}) 21 | else() 22 | set(SENTENCEPIECE_ROOT ${PROJECT_SOURCE_DIR}/third_party/local) 23 | endif() 24 | find_library(SENTENCEPIECE_LIBRARIES 25 | NAMES sentencepiece libsentencepiece 26 | PATHS "${PROJECT_SOURCE_DIR}/third_party/local" 27 | HINTS "${SENTENCEPIECE_ROOT}/lib") 28 | set(SENTENCEPIECE_INCLUDE_DIRS ${SENTENCEPIECE_ROOT}/include) 29 | include_directories(${SENTENCEPIECE_INCLUDE_DIRS}) 30 | 31 | # alternative method, but it apparently requires pkg-config, which isn't always available 32 | #set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:${SENTENCEPIECE_ROOT}/lib/pkgconfig") 33 | #find_package(PkgConfig REQUIRED) 34 | #pkg_check_modules(SENTENCEPIECE REQUIRED sentencepiece) 35 | #target_link_libraries(hflt "${SENTENCEPIECE_LINK_LIBRARIES}") 36 | 37 | # add boost for config_utils,
which is header-only, so no components need to be specified 38 | # https://stackoverflow.com/questions/6646405/how-do-you-add-boost-libraries-in-cmakelists-txt 39 | find_package(Boost 1.45.0) 40 | include_directories(${Boost_INCLUDE_DIRS}) 41 | 42 | # add torch 43 | find_package(Torch REQUIRED) 44 | 45 | # add nlohmann json 46 | find_package(nlohmann_json PATHS "${PROJECT_SOURCE_DIR}/third_party/local" REQUIRED) 47 | 48 | add_subdirectory(src) 49 | 50 | # tests 51 | option(BUILD_TEST "Build c++ tests" OFF) 52 | if (BUILD_TEST) 53 | enable_testing() 54 | include(GoogleTest) 55 | add_subdirectory(test) 56 | endif() 57 | 58 | #include(CMakePackageConfigHelpers) 59 | #configure_package_config_file( 60 | # "hflt.cmake.in" 61 | # "hfltConfig.cmake" 62 | # INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hflt 63 | # PATH_VARS 64 | # CMAKE_INSTALL_LIBDIR) 65 | #install(FILES "${CMAKE_CURRENT_BINARY_DIR}/hfltConfig.cmake" 66 | # DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/hflt") 67 | #install(EXPORT hfltTargets 68 | # DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/hflt" 69 | # FILE hfltTargets.cmake) 70 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes to Self 2 | 3 | ## Resources 4 | 5 | [boost tokenizer docs](https://www.boost.org/doc/libs/1_71_0/libs/tokenizer/doc/index.html) 6 | [csv processing](https://stackoverflow.com/questions/1120140/how-can-i-read-and-parse-csv-files-in-c) 7 | [pytorch-cpp examples](https://github.com/prabhuomkar/pytorch-cpp) 8 | [unique_ptr for sentencepiece](https://stackoverflow.com/questions/42595473/correct-usage-of-unique-ptr-in-class-member) 9 | [batch function](https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/data/datasets/base.h#L69) 10 | [pkg-config and cmake](https://stackoverflow.com/questions/44487053/set-pkg-config-path-in-cmake) - sentencepiece doesn't have a proper cmake file 11 | [libtorch no_grad()](https://discuss.pytorch.org/t/memory-leak-in-libtorch-extremely-simple-code/38149/5) 12 | 13 | ## C++ General Notes 14 | 15 | how to compile a program against sentencepiece manually 16 | ```sh 17 | g++ -o out in.cpp -L/path/to/sentencepiece/lib -lsentencepiece 18 | ``` 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Libtorch + Huggingface Transformers 2 | 3 | [![Build Status](https://travis-ci.org/dhpollack/huggingface_libtorch.svg?branch=master)](https://travis-ci.org/dhpollack/huggingface_libtorch) 4 | 5 | ## Requirements 6 | 7 | Currently, I have only tested this on Linux (Arch Linux and Ubuntu 18.04 LTS). 8 | 9 | To run this repo, you need the following: 10 | 11 | - [x] A modern c++ compiler (newer versions of gcc and clang seem to work, although gcc 9 has issues) 12 | - [x] [Libtorch](https://pytorch.org) 13 | - [x] [Sentencepiece](https://github.com/google/sentencepiece) 14 | - [x] [Boost](https://boost.org) 15 | - [x] [nlohmann json](https://github.com/nlohmann/json) 16 | - [ ] Other Tokenizers (not implemented yet) 17 | 18 | To run the sample, you'll additionally need: 19 | 20 | - [x] [huggingface's transformers](https://github.com/huggingface/transformers) 21 | - [x] [PyTorch - python version](https://pytorch.org) 22 | - [x] [Anaconda / Miniconda](https://docs.conda.io/en/latest/miniconda.html) 23 | 24 | Below is a set of scripts that will download and install the example dataset, a
pretrained model, and the various c++ libraries required for this repo. I am assuming anyone using this already has conda and boost installed. If you already have the c++ libraries installed, you can look at `compile.env` and set the appropriate environment variables to the locations of your local libraries. The libraries mentioned above have their own requirements, which you will need to install as well. You can look at my CI build to see what it would take to run this library from a clean Ubuntu 18.04 LTS system. 25 | 26 | Right now I don't plan on supporting OSX or Windows, although I suspect this could run on OSX if you install all the required libraries manually. Especially for Windows, I don't have the time nor a Windows system to test this on. Feel free to make a PR if you want OSX / Mac support. 27 | 28 | Currently, I have only tested this with an ALBERT model on a simple classification task (sentiment analysis). One should be able to use any of the sentencepiece-tokenized models with fairly few modifications, and hopefully I'll be adding the other tokenizers and tasks soon. 29 | 30 | [ALBERT model pretrained on SST-2](https://drive.google.com/open?id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0) - download and unzip it (e.g. into a folder called `models`). 31 | 32 | ### How to Run Sample / Tests 33 | 34 | ```sh 35 | # get the data 36 | scripts/get_data.sh 37 | # get the requirements if you need them 38 | scripts/get_third_party.sh 39 | # get model finetuned on the SST-2 dataset 40 | scripts/get_albert_pretrained.sh 41 | # the following sets up a minimal anaconda env to trace a huggingface transformers model 42 | conda create -n hflt python=3.7 43 | conda activate hflt 44 | conda install -c pytorch pytorch cpuonly 45 | pip install transformers typer 46 | # trace the model that we downloaded above 47 | python scripts/trace_model.py 48 | ``` 49 | 50 | ## Build from Source 51 | 52 | ```sh 53 | source compile.env 54 | mkdir build && cd build 55 | cmake .. 56 | make 57 | ``` 58 | 59 | ## run 60 | Currently, this gets 89.2325% accuracy on the dev set vs. the 89.2201% last reported by the trained model. 61 | 62 | ```sh 63 | hflt [model dir] [dataset file] 64 | ``` 65 | 66 | ## Build Tests and Run Sample 67 | 68 | ```sh 69 | source compile.env 70 | mkdir build_debug && cd build_debug 71 | cmake -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug .. 72 | make -j $(nproc) 73 | ctest -VV 74 | src/hflt ../models/sst2_trained ../data/SST-2/dev.tsv 75 | ``` 76 | 77 | ## too lazy to install... 78 | 79 | [colab notebook with GPU](https://colab.research.google.com/drive/1TFZbXhiGBtcWVH3ir9Hb1gLGcJyxzTNS) 80 | 81 | You should be able to run this repo from colab with the above link. I also downloaded the CUDA version of libtorch there. 82 | 83 | ## training model 84 | 85 | I used the transformers library to train the sentiment analysis example with mostly default parameters.
Here are the commands I ran: 86 | 87 | ```sh 88 | # clone transformers repo for training scripts 89 | git clone https://github.com/huggingface/transformers.git 90 | # single GPU training 91 | python transformers/examples/run_glue.py --task_name sst-2 --data_dir data/SST-2 --model_type albert --model_name_or_path albert-base-v1 --save_steps 5000 --output_dir output --do_train --do_eval --evaluate_during_training --per_gpu_train_batch_size 32 --overwrite_output_dir 92 | # multi-gpu training 93 | NUM_GPUS=$(nvidia-smi -L | wc -l); python -m torch.distributed.launch --nproc_per_node ${NUM_GPUS} transformers/examples/run_glue.py --task_name sst-2 --data_dir data/SST-2 --model_type albert --model_name_or_path albert-base-v1 --save_steps 5000 --output_dir output --do_train --do_eval --evaluate_during_training --per_gpu_train_batch_size 32 --overwrite_output_dir 94 | ``` 95 | -------------------------------------------------------------------------------- /compile.env: -------------------------------------------------------------------------------- 1 | export Torch_ROOT=$(pwd)/third_party/libtorch/share/cmake/Torch 2 | export SENTENCEPIECE_ROOT=$(pwd)/third_party/local 3 | #export Boost_ROOT=$(pwd)/third_party/boost_1_72_0 4 | -------------------------------------------------------------------------------- /hflt.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include( "${CMAKE_CURRENT_LIST_DIR}/hfltTargets.cmake" ) 4 | 5 | # add sentencepiece, consider making this a Find*.cmake file 6 | if (DEFINED ENV{SENTENCEPIECE_ROOT}) 7 | set(SENTENCEPIECE_ROOT $ENV{SENTENCEPIECE_ROOT}) 8 | else() 9 | set(SENTENCEPIECE_ROOT ${PROJECT_SOURCE_DIR}/third_party/local) 10 | endif() 11 | find_library(SENTENCEPIECE_LIBRARIES 12 | NAMES sentencepiece libsentencepiece 13 | PATHS "${PROJECT_SOURCE_DIR}/third_party/local" 14 | HINTS "${SENTENCEPIECE_ROOT}/lib") 15 | set(SENTENCEPIECE_INCLUDE_DIRS ${SENTENCEPIECE_ROOT}/include) 16 | include_directories(${SENTENCEPIECE_INCLUDE_DIRS}) 17 | 18 | # add boost tokenizer, which is header-only, so no components need to be specified 19 | # https://stackoverflow.com/questions/6646405/how-do-you-add-boost-libraries-in-cmakelists-txt 20 | find_package(Boost 1.45.0) 21 | include_directories(${Boost_INCLUDE_DIRS}) 22 | 23 | # add torch 24 | find_package(Torch REQUIRED) 25 | 26 | # add nlohmann json 27 | find_package(nlohmann_json PATHS "${PROJECT_SOURCE_DIR}/third_party/local" REQUIRED) 28 | 29 | -------------------------------------------------------------------------------- /scripts/get_albert_pretrained.sh: -------------------------------------------------------------------------------- 1 | cd $(dirname $0)/..
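# (the cd above keeps the rest of the script relative to the repository root, regardless of where it is invoked from)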
2 | 3 | mkdir -p models && cd models 4 | 5 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O sst2_trained.tar.gz && rm -rf /tmp/cookies.txt 6 | 7 | tar xzvf sst2_trained.tar.gz 8 | 9 | # remind the user of the next step 10 | echo 'run: `python ../scripts/trace_model.py`' 11 | -------------------------------------------------------------------------------- /scripts/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # stackoverflow seems to think there are better ways of doing the following 4 | cd $(dirname $0)/.. 5 | 6 | GLUEDLURI=https://raw.githubusercontent.com/nyu-mll/GLUE-baselines/master/download_glue_data.py 7 | SQUADV2BASEURI=https://rajpurkar.github.io/SQuAD-explorer/dataset/ 8 | 9 | # download GLUE SST-2 10 | mkdir -p data && cd data 11 | wget $GLUEDLURI 12 | python download_glue_data.py --data_dir . --tasks SST 13 | rm download_glue_data.py 14 | 15 | # download squad v2 16 | mkdir -p SQuAD && cd SQuAD 17 | #wget "${SQUADV2BASEURI}/train-v2.0.json" # ignore trainset for now 18 | wget "${SQUADV2BASEURI}/dev-v2.0.json" 19 | 20 | -------------------------------------------------------------------------------- /scripts/get_third_party.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd $(dirname $0)/.. 4 | 5 | mkdir -p third_party && cd third_party 6 | 7 | THIRDPARTYLOCAL="local" 8 | SENTENCEPIECEURI="https://github.com/google/sentencepiece.git" 9 | LIBTORCHURI="https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.5.0%2Bcpu.zip" 10 | BOOSTURI="https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.gz" 11 | NLOHMANNURI="https://github.com/nlohmann/json.git" 12 | 13 | echo "Installing sentencepiece, make sure you've installed the requirements, which on Ubuntu are:" 14 | echo "sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev" 15 | 16 | git clone $SENTENCEPIECEURI 17 | cd sentencepiece 18 | mkdir -p build && cd build 19 | cmake -DCMAKE_INSTALL_PREFIX=$(realpath ../../${THIRDPARTYLOCAL}) .. 20 | make -j $(nproc --ignore=1) 21 | make install 22 | # cleaning up 23 | cd ../.. 24 | rm -Rf sentencepiece 25 | 26 | 27 | echo "installing cpu-libtorch, if you want the CUDA version it's available at https://pytorch.org" 28 | wget $LIBTORCHURI -O libtorch.zip 29 | unzip -qq libtorch.zip 30 | rm libtorch.zip 31 | 32 | echo "installing nlohmann json..." 33 | git clone --depth 1 ${NLOHMANNURI} 34 | cd json 35 | mkdir build && cd build 36 | cmake -DJSON_BuildTests=OFF -DCMAKE_INSTALL_PREFIX=$(realpath ../../${THIRDPARTYLOCAL}) .. 37 | make install 38 | cd ..
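# boost is left commented out below because it is usually installed via the system package manager (e.g. libboost-all-dev on Ubuntu)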
39 | 40 | 41 | #echo "installing boost" 42 | #wget $BOOSTURI -O boost.tar.gz 43 | #tar xzf boost.tar.gz 44 | #rm boost.tar.gz 45 | 46 | -------------------------------------------------------------------------------- /scripts/sst2tojson.jq: -------------------------------------------------------------------------------- 1 | # jq -R -f scripts/sst2tojson.jq [input_tsv_file] > [output_json_file] 2 | [inputs] | 3 | {data: 4 | [ 5 | to_entries | 6 | .[] | 7 | (.key | tostring) + "\t" + .value | 8 | split("\t") | 9 | {guid: .[0], text_a: .[1], text_b: "", label: .[2]} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /scripts/trace_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from pydoc import locate 4 | 5 | import torch 6 | import transformers 7 | import typer 8 | 9 | app = typer.Typer() 10 | 11 | 12 | @app.command() 13 | def trace_classification_model( 14 | model_path: Path = "./models/sst2_trained", 15 | output_name: str = "traced_albert.pt", 16 | hf_model_class: str = "transformers.AlbertForSequenceClassification", 17 | hf_tokenizer_class: str = "transformers.AlbertTokenizer", 18 | ): 19 | output_path = model_path / output_name 20 | tokenizer_cls = locate(hf_tokenizer_class) 21 | tokenizer = tokenizer_cls.from_pretrained(str(model_path)) 22 | tokens = tokenizer.encode( 23 | "this is a test", add_special_tokens=True, return_tensors="pt" 24 | ).flatten() 25 | tokens_len = tokens.size(0) 26 | token_ids = torch.zeros(128, dtype=torch.long) 27 | token_ids[:tokens_len] = tokens 28 | token_ids.unsqueeze_(0) 29 | attention_mask = torch.ones(128, dtype=torch.long) 30 | attention_mask[:tokens_len] = 0 31 | attention_mask.unsqueeze_(0) 32 | token_type_ids = (attention_mask == 0).to(torch.long) 33 | position_ids = torch.arange(0, 128, dtype=torch.long) 34 | dummy_input = [token_ids, attention_mask, token_type_ids, position_ids] 35 | model_cls = locate(hf_model_class) 36 | model = model_cls.from_pretrained(str(model_path), torchscript=True) 37 | traced_model = torch.jit.trace(model, dummy_input) 38 | torch.jit.save(traced_model, str(output_path)) 39 | 40 | 41 | if __name__ == "__main__": 42 | app() 43 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | cmake_policy(SET CMP0074 NEW) 4 | 5 | set(HEADER_FILES 6 | squad_utils.h 7 | ) 8 | 9 | set(SOURCE_FILES 10 | run_model.cpp 11 | ) 12 | 13 | set(UTIL_FILES 14 | squad_utils.cpp 15 | tokenizer_base.cpp 16 | tokenizer_albert.cpp 17 | config_utils.cpp 18 | processors.cpp 19 | dataset_classification.cpp 20 | dataset_qa.cpp 21 | ) 22 | 23 | # add the shared library and executable 24 | add_library(hflt SHARED ${UTIL_FILES}) 25 | add_executable(hflt-bin ${SOURCE_FILES}) 26 | 27 | # link the executable to the shared library 28 | set(CMAKE_INSTALL_RPATH "${ORIGIN}") 29 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) 30 | 31 | set_target_properties(hflt-bin PROPERTIES OUTPUT_NAME "hflt") 32 | set_target_properties(hflt PROPERTIES 33 | VERSION ${PROJECT_VERSION} 34 | SOVERSION 0 35 | PUBLIC_HEADER squad_utils.h) 36 | 37 | # makes working with subdirectories easier, but right now not used 38 | target_include_directories(hflt PRIVATE .) 
39 | 40 | # install 41 | include(GNUInstallDirs) 42 | install(TARGETS hflt 43 | EXPORT hfltTargets 44 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 45 | PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 46 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) 47 | install(TARGETS hflt-bin 48 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 49 | 50 | # link target libraries 51 | target_link_libraries(hflt PUBLIC 52 | ${TORCH_LIBRARIES} 53 | ) 54 | target_link_libraries(hflt PRIVATE 55 | ${SENTENCEPIECE_LIBRARIES} 56 | nlohmann_json::nlohmann_json 57 | ) 58 | target_link_libraries(hflt-bin hflt) 59 | 60 | -------------------------------------------------------------------------------- /src/config_utils.cpp: -------------------------------------------------------------------------------- 1 | #include "config_utils.h" 2 | 3 | using namespace std; 4 | 5 | template T _contains_or_empty(json j, char const *k) { 6 | return j.contains(k) ? j[k].get() : T(); 7 | } 8 | 9 | TransformersTokenizerConfigs read_transformers_pretrained(const char *dirpath) { 10 | string basedir(dirpath); 11 | string tokenizer_config_path = basedir + "/tokenizer_config.json"; 12 | ifstream fd_tokenizer_config(tokenizer_config_path); 13 | TransformersTokenizerConfigs configs; 14 | TransformersTokenizerConfig tokenizer_config; 15 | TransformersSpecialTokensMap special_tokens_map; 16 | TransformersAddedTokens added_tokens; 17 | if (!fd_tokenizer_config.is_open()) { 18 | cerr << "something went wrong opening: " << tokenizer_config_path << endl; 19 | } else { 20 | tokenizer_config = read_transformers_tokenizer_config(fd_tokenizer_config); 21 | } 22 | string special_tokens_map_path = basedir + "/special_tokens_map.json"; 23 | ifstream fd_special_tokens_map(special_tokens_map_path); 24 | if (!fd_special_tokens_map.is_open()) { 25 | cerr << "something went wrong opening: " << special_tokens_map_path << endl; 26 | } else { 27 | special_tokens_map = 28 | read_transformers_special_tokens_map(fd_special_tokens_map); 29 | } 30 | string added_tokens_path = basedir + "/added_tokens.json"; 31 | ifstream fd_added_tokens(added_tokens_path); 32 | if (!fd_added_tokens.is_open()) { 33 | // keep for debugging, but the added tokens json file isn't always created 34 | // cerr << "something went wrong opening: " << added_tokens_path << endl; 35 | } else { 36 | added_tokens = read_transformers_added_tokens(fd_added_tokens); 37 | } 38 | // combine into TransformersTokenizerConfigs 39 | configs = {tokenizer_config, special_tokens_map, added_tokens}; 40 | return configs; 41 | } 42 | 43 | TransformersTokenizerConfig read_transformers_tokenizer_config(ifstream &fd) { 44 | json config; 45 | fd >> config; 46 | TransformersTokenizerConfig tc = { 47 | _contains_or_empty(config, "do_lower_case"), 48 | _contains_or_empty>(config, "init_inputs"), 49 | _contains_or_empty(config, "max_len")}; 50 | return tc; 51 | } 52 | 53 | TransformersSpecialTokensMap 54 | read_transformers_special_tokens_map(ifstream &fd) { 55 | json special_tokens; 56 | fd >> special_tokens; 57 | TransformersSpecialTokensMap stm = { 58 | _contains_or_empty(special_tokens, "cls_token"), 59 | _contains_or_empty(special_tokens, "mask_token"), 60 | _contains_or_empty(special_tokens, "pad_token"), 61 | _contains_or_empty(special_tokens, "sep_token"), 62 | _contains_or_empty(special_tokens, "unk_token"), 63 | _contains_or_empty(special_tokens, "bos_token"), 64 | _contains_or_empty(special_tokens, "eos_token")}; 65 | return stm; 66 | } 67 | 68 | TransformersAddedTokens 
read_transformers_added_tokens(ifstream &fd) { 69 | json added_tokens; 70 | fd >> added_tokens; 71 | TransformersAddedTokens at = {added_tokens.empty() 72 | ? vector() 73 | : added_tokens.get>()}; 74 | return at; 75 | } 76 | -------------------------------------------------------------------------------- /src/config_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using json = nlohmann::json; 10 | 11 | struct TransformersTokenizerConfig { 12 | bool do_lower_case; 13 | std::vector init_inputs; 14 | size_t max_len; 15 | }; 16 | 17 | struct TransformersSpecialTokensMap { 18 | std::string cls_token; 19 | std::string mask_token; 20 | std::string pad_token; 21 | std::string sep_token; 22 | std::string unk_token; 23 | std::string bos_token; // in sentencepiece models 24 | std::string eos_token; // in sentencepiece models 25 | }; 26 | 27 | struct TransformersAddedTokens { 28 | std::vector added_tokens; 29 | }; 30 | 31 | struct TransformersTokenizerConfigs { 32 | TransformersTokenizerConfig tokenizer_config; 33 | TransformersSpecialTokensMap special_tokens_map; 34 | TransformersAddedTokens added_tokens; 35 | }; 36 | 37 | TransformersTokenizerConfigs read_transformers_pretrained(const char *dirpath); 38 | TransformersTokenizerConfig 39 | read_transformers_tokenizer_config(std::ifstream &fd); 40 | TransformersSpecialTokensMap 41 | read_transformers_special_tokens_map(std::ifstream &fd); 42 | TransformersAddedTokens read_transformers_added_tokens(std::ifstream &fd); 43 | -------------------------------------------------------------------------------- /src/dataset_classification.cpp: -------------------------------------------------------------------------------- 1 | #include "dataset_classification.h" 2 | 3 | using namespace std; 4 | 5 | // Constructor 6 | template 7 | TransformerClassificationDS:: 8 | TransformerClassificationDS( 9 | const string &pretrained_dir, long maximum_sequence_len, 10 | const function(const string &arg)> read_examples_fn, 11 | const string &read_examples_arg) 12 | : tokenizer_(pretrained_dir.c_str()), 13 | examples_(read_examples_fn(read_examples_arg)), 14 | msl_(maximum_sequence_len) {} 15 | 16 | // get() 17 | template 18 | FeaturesType 19 | TransformerClassificationDS::get( 20 | size_t index) { 21 | auto opts_data = torch::TensorOptions().dtype(torch::kLong); 22 | ExampleType ex = examples_[index]; 23 | // tokenize and tensorize 24 | FeaturesType features = tokenizer_.encode(ex.text_a, ex.text_b, true, msl_, 0, 25 | "longest_first", true); 26 | features.label = _label_to_tensor(ex.label, opts_data); 27 | return features; 28 | } 29 | 30 | // size() 31 | template 32 | torch::optional 33 | TransformerClassificationDS::size() 34 | const { 35 | torch::optional sz(examples_.size()); 36 | return sz; 37 | } 38 | 39 | // examples() 40 | template 41 | const vector & 42 | TransformerClassificationDS::examples() const { 44 | return examples_; 45 | } 46 | 47 | // _label_to_tensor() 48 | template 49 | torch::Tensor 50 | TransformerClassificationDS:: 51 | _label_to_tensor(const string &label, torch::TensorOptions &topts) { 52 | vector lv; 53 | stringstream ss(label); 54 | std::transform(istream_iterator(ss), istream_iterator(), 55 | std::back_inserter(lv), [](long x) { return x; }); 56 | return torch::from_blob(lv.data(), {(long)lv.size()}, topts).clone(); 57 | } 58 | 59 | template class TransformerClassificationDS; 60 | template class 
TransformerClassificationDS<>; 61 | -------------------------------------------------------------------------------- /src/dataset_classification.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #include "tokenizer_albert.h" 14 | #include "tokenizer_base.h" 15 | #include "transformer_example.h" 16 | 17 | template > 20 | class TransformerClassificationDS 21 | : public torch::data::datasets::Dataset< 22 | TransformerClassificationDS, 24 | TransformerSingleFeatures> { 25 | public: 26 | // A base dataset for transformers, which loads the model and tokenizer from 27 | // `pretrained_dir` and populates the `examples_` member using the function 28 | // `read_examples` and `read_examples_arg`. 29 | // 30 | // TODO: make `read_examples` and `read_examples_arg` more generic with 31 | // variadic templates 32 | // 33 | // The supplied `filepath` path should be a tsv file with the sentence 34 | // followed by the label. 35 | explicit TransformerClassificationDS( 36 | const std::string &pretrained_dir, long maximum_sequence_len, 37 | const std::function< 38 | std::vector(const std::string &arg)> 39 | read_examples_fn, 40 | const std::string &read_examples_arg); 41 | 42 | // Returns the `TransformerSingleExample` at the given `index`. 43 | virtual TransformerSingleFeatures get(size_t index) override; 44 | 45 | // Returns the size of the dataset. 46 | torch::optional size() const override; 47 | 48 | // Returns all examples as a vector. 49 | const std::vector &examples() const; 50 | 51 | // read all examples 52 | // virtual std::vector read_examples(const 53 | // std::string &arg) override; 54 | 55 | private: 56 | torch::Tensor _label_to_tensor(const std::string &label, 57 | torch::TensorOptions &topts); 58 | TokenizerType tokenizer_; 59 | std::vector examples_; 60 | long msl_; // maximum sequence length 61 | }; 62 | -------------------------------------------------------------------------------- /src/dataset_qa.cpp: -------------------------------------------------------------------------------- 1 | #include "dataset_qa.h" 2 | 3 | using namespace std; 4 | 5 | // Constructor 6 | template 7 | TransformerQADS::TransformerQADS( 8 | const string &pretrained_dir, long maximum_sequence_len, 9 | const function(const string &arg)> read_examples_fn, 10 | const string &read_examples_arg) 11 | : tokenizer_(pretrained_dir.c_str()), 12 | examples_(read_examples_fn(read_examples_arg)), 13 | msl_(maximum_sequence_len) { 14 | // constructor post-initialization 15 | vector> doc_span_mapping = 16 | add_tokens_to_examples(examples_, tokenizer_, msl_); 17 | items_ = doc_span_mapping; 18 | } 19 | 20 | // get() 21 | template 22 | FeaturesType 23 | TransformerQADS::get(size_t index) { 24 | pair indices = items_[index]; 25 | ExampleType ex = examples_[indices.first]; 26 | pair p_span = ex.p_spans[indices.second]; 27 | FeaturesType features = example_to_features(ex, p_span); 28 | return features; 29 | } 30 | 31 | // size() 32 | template 33 | torch::optional 34 | TransformerQADS::size() const { 35 | torch::optional sz(items_.size()); 36 | return sz; 37 | } 38 | 39 | // examples() 40 | template 41 | const vector & 42 | TransformerQADS::examples() const { 43 | return examples_; 44 | } 45 | 46 | // _label_to_tensor() 47 | template 48 | torch::Tensor 49 | TransformerQADS::_label_to_tensor( 50 | const string &label, torch::TensorOptions &topts) { 51 | vector lv; 52 | 
stringstream ss(label); 53 | std::transform(istream_iterator(ss), istream_iterator(), 54 | std::back_inserter(lv), [](long x) { return x; }); 55 | return torch::from_blob(lv.data(), {(long)lv.size()}, topts).clone(); 56 | } 57 | 58 | // example_to_features() 59 | template 60 | FeaturesType 61 | TransformerQADS::example_to_features( 62 | ExampleType &example, pair &p_span) { 63 | auto opts_data = torch::TensorOptions().dtype(torch::kLong); 64 | size_t max_len = static_cast(msl_); 65 | vector q_tokens(example.q_tokens.begin(), example.q_tokens.end()); 66 | q_tokens.insert(q_tokens.begin(), tokenizer_.cls_token_id()); 67 | q_tokens.push_back(tokenizer_.sep_token_id()); 68 | vector p_tokens_span(example.p_tokens.begin() + p_span.first, 69 | example.p_tokens.begin() + p_span.second); 70 | p_tokens_span.push_back(tokenizer_.sep_token_id()); 71 | vector tokens; 72 | tokens.insert(tokens.end(), q_tokens.begin(), q_tokens.end()); 73 | tokens.insert(tokens.end(), p_tokens_span.begin(), p_tokens_span.end()); 74 | tokens.resize(max_len); 75 | vector attention_mask(q_tokens.size() + p_tokens_span.size(), 1); 76 | attention_mask.resize(max_len, 0); 77 | vector token_type_ids(q_tokens.size(), 0); 78 | token_type_ids.resize(max_len, 1); 79 | vector a_tokens(example.a_tokens.begin(), example.a_tokens.end()); 80 | a_tokens.resize(max_len, 0); 81 | FeaturesType features( 82 | {torch::from_blob(tokens.data(), {msl_}, opts_data).clone(), 83 | torch::from_blob(attention_mask.data(), {msl_}, opts_data).clone(), 84 | torch::from_blob(token_type_ids.data(), {msl_}, opts_data).clone(), 85 | torch::arange(0, msl_, opts_data), 86 | torch::from_blob(a_tokens.data(), {msl_}, opts_data).clone()}); 87 | return features; 88 | } 89 | 90 | template class TransformerQADS; 91 | template class TransformerQADS<>; 92 | -------------------------------------------------------------------------------- /src/dataset_qa.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "squad_utils.h" 15 | #include "tokenizer_albert.h" 16 | #include "tokenizer_base.h" 17 | #include "transformer_example.h" 18 | 19 | template > 22 | class TransformerQADS 23 | : public torch::data::datasets::Dataset< 24 | TransformerQADS, 26 | TransformerSingleFeatures> { 27 | public: 28 | // A base dataset for transformers, which loads the model and tokenizer from 29 | // `pretrained_dir` and populates the `examples_` member using the function 30 | // `read_examples` and `read_examples_arg`. 31 | // 32 | // TODO: make `read_examples` and `read_examples_arg` more generic with 33 | // variadic templates 34 | // 35 | // The supplied `filepath` path should be a tsv file with the sentence 36 | // followed by the label. 37 | explicit TransformerQADS( 38 | const std::string &pretrained_dir, long maximum_sequence_len, 39 | const std::function< 40 | std::vector(const std::string &arg)> 41 | read_examples_fn, 42 | const std::string &read_examples_arg); 43 | 44 | // Returns the `TransformerSingleExample` at the given `index`. 45 | virtual TransformerSingleFeatures get(size_t index) override; 46 | 47 | // Returns the size of the dataset. 48 | torch::optional size() const override; 49 | 50 | // Returns all examples as a vector. 
51 | const std::vector &examples() const; 52 | 53 | TransformerSingleFeatures 54 | example_to_features(TransformerSingleExample &example, std::pair &p_span); 55 | 56 | private: 57 | torch::Tensor _label_to_tensor(const std::string &label, 58 | torch::TensorOptions &topts); 59 | TokenizerType tokenizer_; 60 | std::vector examples_; 61 | std::vector> items_; 62 | long msl_; // maximum sequence length 63 | }; 64 | -------------------------------------------------------------------------------- /src/processors.cpp: -------------------------------------------------------------------------------- 1 | #include "processors.h" 2 | 3 | using namespace std; 4 | 5 | vector readGenericJsonFile(const string &filepath) { 6 | // assumes a json in the format {..., "data": [{data_obj},...]}, where 7 | // data_obj = {"guid": "guid_as_string", "text_a": "some text", "text_b": 8 | // "more text", "label": "label0"} 9 | vector examples; 10 | ifstream ifs(filepath); 11 | if (!ifs.is_open()) { 12 | cerr << "unable to open generic json datafile" << endl; 13 | } else { 14 | json j; 15 | ifs >> j; 16 | for (auto &item : j["data"]) { 17 | string guid = item["guid"].get(); 18 | string text_a = item["text_a"].get(); 19 | string text_b = item["text_b"].get(); 20 | string label = item["label"].get(); 21 | examples.emplace_back(guid, text_a, text_b, label); 22 | } 23 | } 24 | return examples; 25 | } 26 | vector readSST2CsvFile(const string &filepath) { 27 | // This function assumes the csv file is in the format `\t