├── .clang-format ├── .gitignore ├── .syntastic_cpp_config ├── .travis.yml ├── CMakeLists.txt ├── NOTES.md ├── README.md ├── compile.env ├── hflt.cmake.in ├── scripts ├── get_albert_pretrained.sh ├── get_data.sh ├── get_third_party.sh ├── sst2tojson.jq └── trace_model.py ├── src ├── CMakeLists.txt ├── config_utils.cpp ├── config_utils.h ├── dataset_classification.cpp ├── dataset_classification.h ├── dataset_qa.cpp ├── dataset_qa.h ├── processors.cpp ├── processors.h ├── run_model.cpp ├── run_model.h ├── squad_utils.cpp ├── squad_utils.h ├── tokenizer_albert.cpp ├── tokenizer_albert.h ├── tokenizer_base.cpp ├── tokenizer_base.h ├── transformer_example.h └── transformer_stack.h └── test ├── CMakeLists.txt ├── assets ├── sst-2-head.json └── sst-2-head.tsv ├── hflt_tests.cpp ├── scripts └── convert_squad.py ├── test_dataset_and_processors.h ├── test_squad_utils.h ├── test_transformer_config.h └── test_transformer_example.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveMacros: false 7 | AlignConsecutiveAssignments: false 8 | AlignConsecutiveDeclarations: false 9 | AlignEscapedNewlines: Right 10 | AlignOperands: true 11 | AlignTrailingComments: true 12 | AllowAllArgumentsOnNextLine: true 13 | AllowAllConstructorInitializersOnNextLine: true 14 | AllowAllParametersOfDeclarationOnNextLine: true 15 | AllowShortBlocksOnASingleLine: false 16 | AllowShortCaseLabelsOnASingleLine: false 17 | AllowShortFunctionsOnASingleLine: All 18 | AllowShortLambdasOnASingleLine: All 19 | AllowShortIfStatementsOnASingleLine: Never 20 | AllowShortLoopsOnASingleLine: false 21 | AlwaysBreakAfterDefinitionReturnType: None 22 | AlwaysBreakAfterReturnType: None 23 | AlwaysBreakBeforeMultilineStrings: false 24 | AlwaysBreakTemplateDeclarations: MultiLine 25 | BinPackArguments: true 26 | BinPackParameters: true 27 | BraceWrapping: 28 | AfterCaseLabel: false 29 | AfterClass: false 30 | AfterControlStatement: false 31 | AfterEnum: false 32 | AfterFunction: false 33 | AfterNamespace: false 34 | AfterObjCDeclaration: false 35 | AfterStruct: false 36 | AfterUnion: false 37 | AfterExternBlock: false 38 | BeforeCatch: false 39 | BeforeElse: false 40 | IndentBraces: false 41 | SplitEmptyFunction: true 42 | SplitEmptyRecord: true 43 | SplitEmptyNamespace: true 44 | BreakBeforeBinaryOperators: None 45 | BreakBeforeBraces: Attach 46 | BreakBeforeInheritanceComma: false 47 | BreakInheritanceList: BeforeColon 48 | BreakBeforeTernaryOperators: true 49 | BreakConstructorInitializersBeforeComma: false 50 | BreakConstructorInitializers: BeforeColon 51 | BreakAfterJavaFieldAnnotations: false 52 | BreakStringLiterals: true 53 | ColumnLimit: 80 54 | CommentPragmas: '^ IWYU pragma:' 55 | CompactNamespaces: false 56 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 57 | ConstructorInitializerIndentWidth: 4 58 | ContinuationIndentWidth: 4 59 | Cpp11BracedListStyle: true 60 | DerivePointerAlignment: false 61 | DisableFormat: false 62 | ExperimentalAutoDetectBinPacking: false 63 | FixNamespaceComments: true 64 | ForEachMacros: 65 | - foreach 66 | - Q_FOREACH 67 | - BOOST_FOREACH 68 | IncludeBlocks: Preserve 69 | IncludeCategories: 70 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 71 | Priority: 2 72 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 73 | Priority: 3 74 | - Regex: '.*' 75 | Priority: 1 76 | IncludeIsMainRegex: '(Test)?$' 77 | IndentCaseLabels: false 78 | 
IndentPPDirectives: None 79 | IndentWidth: 2 80 | IndentWrappedFunctionNames: false 81 | JavaScriptQuotes: Leave 82 | JavaScriptWrapImports: true 83 | KeepEmptyLinesAtTheStartOfBlocks: true 84 | MacroBlockBegin: '' 85 | MacroBlockEnd: '' 86 | MaxEmptyLinesToKeep: 1 87 | NamespaceIndentation: None 88 | ObjCBinPackProtocolList: Auto 89 | ObjCBlockIndentWidth: 2 90 | ObjCSpaceAfterProperty: false 91 | ObjCSpaceBeforeProtocolList: true 92 | PenaltyBreakAssignment: 2 93 | PenaltyBreakBeforeFirstCallParameter: 19 94 | PenaltyBreakComment: 300 95 | PenaltyBreakFirstLessLess: 120 96 | PenaltyBreakString: 1000 97 | PenaltyBreakTemplateDeclaration: 10 98 | PenaltyExcessCharacter: 1000000 99 | PenaltyReturnTypeOnItsOwnLine: 60 100 | PointerAlignment: Right 101 | ReflowComments: true 102 | SortIncludes: true 103 | SortUsingDeclarations: true 104 | SpaceAfterCStyleCast: false 105 | SpaceAfterLogicalNot: false 106 | SpaceAfterTemplateKeyword: true 107 | SpaceBeforeAssignmentOperators: true 108 | SpaceBeforeCpp11BracedList: false 109 | SpaceBeforeCtorInitializerColon: true 110 | SpaceBeforeInheritanceColon: true 111 | SpaceBeforeParens: ControlStatements 112 | SpaceBeforeRangeBasedForLoopColon: true 113 | SpaceInEmptyParentheses: false 114 | SpacesBeforeTrailingComments: 1 115 | SpacesInAngles: false 116 | SpacesInContainerLiterals: true 117 | SpacesInCStyleCastParentheses: false 118 | SpacesInParentheses: false 119 | SpacesInSquareBrackets: false 120 | Standard: Cpp11 121 | StatementMacros: 122 | - Q_UNUSED 123 | - QT_REQUIRE_VERSION 124 | TabWidth: 8 125 | UseTab: Never 126 | ... 127 | 128 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # models 35 | *.model 36 | *.bin 37 | *.pt 38 | 39 | # build folder, temporary testing, third party libraries and data 40 | build/ 41 | tmp/ 42 | data/ 43 | models/* 44 | third_party/* 45 | 46 | # dev environmental variables 47 | dev.env 48 | -------------------------------------------------------------------------------- /.syntastic_cpp_config: -------------------------------------------------------------------------------- 1 | -I/home/david/.local/libtorch/include/torch/csrc/api/include 2 | -I/home/david/.local/libtorch/include 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | dist: bionic 3 | 4 | compiler: gcc 5 | 6 | addons: 7 | apt: 8 | packages: cmake build-essential pkg-config libgoogle-perftools-dev libboost-all-dev wget unzip libgtest-dev 9 | 10 | before_script: 11 | - mkdir build_gtest && cd build_gtest 12 | - cmake -DBUILD_SHARED_LIBS=ON /usr/src/gtest && make && sudo make install 13 | - sudo ldconfig /usr/local/lib 14 | - cd .. 
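# clean up the temporary gtest build tree, then fetch the dataset, the third-party libraries, and the pretrained SST-2 model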
15 | - rm -Rf build_gtest 16 | - mkdir models 17 | - alias python='python3' 18 | - scripts/get_data.sh 19 | - scripts/get_third_party.sh 20 | - wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O models/sst2_trained.tar.gz && rm -rf /tmp/cookies.txt 21 | - curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh 22 | - chmod +x /tmp/miniconda.sh 23 | - /tmp/miniconda.sh -b -p ~/miniconda3 24 | - source "$HOME/miniconda3/etc/profile.d/conda.sh" 25 | - hash -r 26 | - conda config --set always_yes yes --set changeps1 no 27 | - conda update -q conda 28 | - conda create -q -n test-environment python=3.7 29 | - conda activate test-environment 30 | - conda install -c pytorch pytorch cpuonly 31 | - pip install transformers typer 32 | - cd models && tar xzvf sst2_trained.tar.gz && cd .. 33 | - python scripts/trace_model.py 34 | - source compile.env 35 | - mkdir build 36 | - cd build 37 | - cmake -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug .. 38 | 39 | script: 40 | - make 41 | 42 | after_success: 43 | - src/hflt 44 | - head -n 100 ../data/SST-2/dev.tsv > ../data/SST-2/dev-small.tsv 45 | - src/hflt ../models/sst2_trained ../data/SST-2/dev-small.tsv 46 | - ctest -VV 47 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | cmake_policy(SET CMP0074 NEW) 4 | 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | 8 | option(DO_CLANG_TIDY "clang tidy output" OFF) 9 | if (DO_CLANG_TIDY) 10 | set(CMAKE_CXX_CLANG_TIDY clang-tidy -checks=-*,readability-*) 11 | endif() 12 | 13 | # set the project name: Huggingface Libtorch i.e. HfLt 14 | project(hflt VERSION 0.0.1 DESCRIPTION "huggingface transformers inference in c++") 15 | 16 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 17 | 18 | # add sentencepiece, consider making this a Find*.cmake file 19 | if (DEFINED ENV{SENTENCEPIECE_ROOT}) 20 | set(SENTENCEPIECE_ROOT $ENV{SENTENCEPIECE_ROOT}) 21 | else() 22 | set(SENTENCEPIECE_ROOT ${PROJECT_SOURCE_DIR}/third_party/local) 23 | endif() 24 | find_library(SENTENCEPIECE_LIBRARIES 25 | NAMES sentencepiece libsentencepiece 26 | PATHS "${PROJECT_SOURCE_DIR}/third_party/local" 27 | HINTS "${SENTENCEPIECE_ROOT}/lib") 28 | set(SENTENCEPIECE_INCLUDE_DIRS ${SENTENCEPIECE_ROOT}/include) 29 | include_directories(${SENTENCEPIECE_INCLUDE_DIRS}) 30 | 31 | # alternative method, but it apparently requires pkg-config, which isn't always available 32 | #set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:${SENTENCEPIECE_ROOT}/lib/pkgconfig") 33 | #find_package(PkgConfig REQUIRED) 34 | #pkg_check_modules(SENTENCEPIECE REQUIRED sentencepiece) 35 | #target_link_libraries(hflt "${SENTENCEPIECE_LINK_LIBRARIES}") 36 | 37 | # add boost for config_utils,
which is header-only, so no components need to be specified 38 | # https://stackoverflow.com/questions/6646405/how-do-you-add-boost-libraries-in-cmakelists-txt 39 | find_package(Boost 1.45.0) 40 | include_directories(${Boost_INCLUDE_DIRS}) 41 | 42 | # add torch 43 | find_package(Torch REQUIRED) 44 | 45 | # add nlohmann json 46 | find_package(nlohmann_json PATHS "${PROJECT_SOURCE_DIR}/third_party/local" REQUIRED) 47 | 48 | add_subdirectory(src) 49 | 50 | # tests 51 | option(BUILD_TEST "Build c++ tests" OFF) 52 | if (BUILD_TEST) 53 | enable_testing() 54 | include(GoogleTest) 55 | add_subdirectory(test) 56 | endif() 57 | 58 | #include(CMakePackageConfigHelpers) 59 | #configure_package_config_file( 60 | # "hflt.cmake.in" 61 | # "hfltConfig.cmake" 62 | # INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hflt 63 | # PATH_VARS 64 | # CMAKE_INSTALL_LIBDIR) 65 | #install(FILES "${CMAKE_CURRENT_BINARY_DIR}/hfltConfig.cmake" 66 | # DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/hflt") 67 | #install(EXPORT hfltTargets 68 | # DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/hflt" 69 | # FILE hfltTargets.cmake) 70 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes to Self 2 | 3 | ## Resources 4 | 5 | [boost tokenizer docs](https://www.boost.org/doc/libs/1_71_0/libs/tokenizer/doc/index.html) 6 | [csv processing](https://stackoverflow.com/questions/1120140/how-can-i-read-and-parse-csv-files-in-c) 7 | [pytorch-cpp examples](https://github.com/prabhuomkar/pytorch-cpp) 8 | [unique_ptr for sentencepiece](https://stackoverflow.com/questions/42595473/correct-usage-of-unique-ptr-in-class-member) 9 | [batch function](https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/data/datasets/base.h#L69) 10 | [pkg-config and cmake](https://stackoverflow.com/questions/44487053/set-pkg-config-path-in-cmake) - sentencepiece doesn't have a proper cmake file 11 | [libtorch no_grad()](https://discuss.pytorch.org/t/memory-leak-in-libtorch-extremely-simple-code/38149/5) 12 | 13 | ## C++ General Notes 14 | 15 | how to compile a program against sentencepiece manually 16 | ```sh 17 | g++ -o out in.cpp -L/path/to/sentencepiece/lib -lsentencepiece 18 | ``` 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Libtorch + Huggingface Transformers 2 | 3 | [![Build Status](https://travis-ci.org/dhpollack/huggingface_libtorch.svg?branch=master)](https://travis-ci.org/dhpollack/huggingface_libtorch) 4 | 5 | ## Requirements 6 | 7 | Currently, I have only tested this on Linux (Arch Linux and Ubuntu 18.04 LTS). 8 | 9 | To run this repo, you need the following: 10 | 11 | - [x] A modern c++ compiler (newer versions of gcc and clang seem to work, although gcc 9 has issues) 12 | - [x] [Libtorch](https://pytorch.org) 13 | - [x] [Sentencepiece](https://github.com/google/sentencepiece) 14 | - [x] [Boost](https://boost.org) 15 | - [x] [nlohmann json](https://github.com/nlohmann/json) 16 | - [ ] Other Tokenizers (not implemented yet) 17 | 18 | To run the sample, you'll additionally need: 19 | 20 | - [x] [huggingface's transformers](https://github.com/huggingface/transformers) 21 | - [x] [PyTorch - python version](https://pytorch.org) 22 | - [x] [Anaconda / Miniconda](https://docs.conda.io/en/latest/miniconda.html) 23 | 24 | Below is a set of scripts that will download and install the example dataset, a
pretrained model, and the various c++ libraries required for this repo. I am assuming anyone using this already has conda and boost installed. If you already have the c++ libraries installed, you can look at `compile.env` and set the appropriate environment variables to the locations of your local libraries. The libraries mentioned above have their own requirements, which you will need to install as well. You can look at my CI build to see what it would take to run this library from a clean Ubuntu 18.04 LTS system. 25 | 26 | Right now I don't plan on supporting OSX or Windows, although I suspect this could run on OSX if you install all the required libraries manually. Especially for Windows, I don't have the time nor a Windows system to test this on. Feel free to make a PR if you want OSX / Mac support. 27 | 28 | Currently, I have only tested this with an ALBERT model on a simple classification task (sentiment analysis). One should be able to use any of the sentencepiece-tokenized models with fairly few modifications, and hopefully I'll be adding the other tokenizers and tasks soon. 29 | 30 | [ALBERT model pretrained on SST-2](https://drive.google.com/open?id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0) - download and unzip it (e.g. into a folder called `models`). 31 | 32 | ### How to Run Sample / Tests 33 | 34 | ```sh 35 | # get the data 36 | scripts/get_data.sh 37 | # get the requirements if you need them 38 | scripts/get_third_party.sh 39 | # get model finetuned on the SST-2 dataset 40 | scripts/get_albert_pretrained.sh 41 | # the following sets up a minimal anaconda env to trace a huggingface transformers model 42 | conda create -n hflt python=3.7 43 | conda activate hflt 44 | conda install -c pytorch pytorch cpuonly 45 | pip install transformers typer 46 | # trace the model that we downloaded above 47 | python scripts/trace_model.py 48 | ``` 49 | 50 | ## Build from Source 51 | 52 | ```sh 53 | source compile.env 54 | mkdir build && cd build 55 | cmake .. 56 | make 57 | ``` 58 | 59 | ## run 60 | Currently, this gets 89.2325% accuracy on the dev set vs. the 89.2201% last reported by the trained model. 61 | 62 | ```sh 63 | hflt [model dir] [dataset file] 64 | ``` 65 | 66 | ## Build Tests and Run Sample 67 | 68 | ```sh 69 | source compile.env 70 | mkdir build_debug && cd build_debug 71 | cmake -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug .. 72 | make -j $(nproc) 73 | ctest -VV 74 | src/hflt ../models/sst2_trained ../data/SST-2/dev.tsv 75 | ``` 76 | 77 | ## too lazy to install... 78 | 79 | [colab notebook with GPU](https://colab.research.google.com/drive/1TFZbXhiGBtcWVH3ir9Hb1gLGcJyxzTNS) 80 | 81 | You should be able to run this repo from colab with the above link. I also downloaded the CUDA version of libtorch there. 82 | 83 | ## training model 84 | 85 | I used the transformers library to train the sentiment analysis example with mostly default parameters.
Here are the commands I ran: 86 | 87 | ```sh 88 | # clone transformers repo for training scripts 89 | git clone https://github.com/huggingface/transformers.git 90 | # single GPU training 91 | python transformers/examples/run_glue.py --task_name sst-2 --data_dir data/SST-2 --model_type albert --model_name_or_path albert-base-v1 --save_steps 5000 --output_dir output --do_train --do_eval --evaluate_during_training --per_gpu_train_batch_size 32 --overwrite_output_dir 92 | # multi-gpu training 93 | NUM_GPUS=$(nvidia-smi -L | wc -l); python -m torch.distributed.launch --nproc_per_node ${NUM_GPUS} transformers/examples/run_glue.py --task_name sst-2 --data_dir data/SST-2 --model_type albert --model_name_or_path albert-base-v1 --save_steps 5000 --output_dir output --do_train --do_eval --evaluate_during_training --per_gpu_train_batch_size 32 --overwrite_output_dir 94 | ``` 95 | -------------------------------------------------------------------------------- /compile.env: -------------------------------------------------------------------------------- 1 | export Torch_ROOT=$(pwd)/third_party/libtorch/share/cmake/Torch 2 | export SENTENCEPIECE_ROOT=$(pwd)/third_party/local 3 | #export Boost_ROOT=$(pwd)/third_party/boost_1_72_0 4 | -------------------------------------------------------------------------------- /hflt.cmake.in: -------------------------------------------------------------------------------- 1 | @PACKAGE_INIT@ 2 | 3 | include( "${CMAKE_CURRENT_LIST_DIR}/hfltTargets.cmake" ) 4 | 5 | # add sentencepiece, consider making this a Find*.cmake file 6 | if (DEFINED ENV{SENTENCEPIECE_ROOT}) 7 | set(SENTENCEPIECE_ROOT $ENV{SENTENCEPIECE_ROOT}) 8 | else() 9 | set(SENTENCEPIECE_ROOT ${PROJECT_SOURCE_DIR}/third_party/local) 10 | endif() 11 | find_library(SENTENCEPIECE_LIBRARIES 12 | NAMES sentencepiece libsentencepiece 13 | PATHS "${PROJECT_SOURCE_DIR}/third_party/local" 14 | HINTS "${SENTENCEPIECE_ROOT}/lib") 15 | set(SENTENCEPIECE_INCLUDE_DIRS ${SENTENCEPIECE_ROOT}/include) 16 | include_directories(${SENTENCEPIECE_INCLUDE_DIRS}) 17 | 18 | # add boost tokenizer, which is header-only, so no components need to be specified 19 | # https://stackoverflow.com/questions/6646405/how-do-you-add-boost-libraries-in-cmakelists-txt 20 | find_package(Boost 1.45.0) 21 | include_directories(${Boost_INCLUDE_DIRS}) 22 | 23 | # add torch 24 | find_package(Torch REQUIRED) 25 | 26 | # add nlohmann json 27 | find_package(nlohmann_json PATHS "${PROJECT_SOURCE_DIR}/third_party/local" REQUIRED) 28 | 29 | -------------------------------------------------------------------------------- /scripts/get_albert_pretrained.sh: -------------------------------------------------------------------------------- 1 | cd $(dirname $0)/..
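# (the cd above keeps the rest of the script relative to the repository root, regardless of where it is invoked from)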
2 | 3 | mkdir -p models && cd models 4 | 5 | wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1i0rr-ogZ2MDYPpUMBsg-2PV7zVddivJ0" -O sst2_trained.tar.gz && rm -rf /tmp/cookies.txt 6 | 7 | tar xzvf sst2_trained.tar.gz 8 | 9 | # remind the user of the next step 10 | echo 'run: `python ../scripts/trace_model.py`' 11 | -------------------------------------------------------------------------------- /scripts/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # stackoverflow seems to think there are better ways of doing the following 4 | cd $(dirname $0)/.. 5 | 6 | GLUEDLURI=https://raw.githubusercontent.com/nyu-mll/GLUE-baselines/master/download_glue_data.py 7 | SQUADV2BASEURI=https://rajpurkar.github.io/SQuAD-explorer/dataset/ 8 | 9 | # download GLUE SST-2 10 | mkdir -p data && cd data 11 | wget $GLUEDLURI 12 | python download_glue_data.py --data_dir . --tasks SST 13 | rm download_glue_data.py 14 | 15 | # download squad v2 16 | mkdir -p SQuAD && cd SQuAD 17 | #wget "${SQUADV2BASEURI}/train-v2.0.json" # ignore trainset for now 18 | wget "${SQUADV2BASEURI}/dev-v2.0.json" 19 | 20 | -------------------------------------------------------------------------------- /scripts/get_third_party.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd $(dirname $0)/.. 4 | 5 | mkdir -p third_party && cd third_party 6 | 7 | THIRDPARTYLOCAL="local" 8 | SENTENCEPIECEURI="https://github.com/google/sentencepiece.git" 9 | LIBTORCHURI="https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.5.0%2Bcpu.zip" 10 | BOOSTURI="https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.gz" 11 | NLOHMANNURI="https://github.com/nlohmann/json.git" 12 | 13 | echo "Installing sentencepiece, make sure you've installed the requirements, which on Ubuntu are:" 14 | echo "sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev" 15 | 16 | git clone $SENTENCEPIECEURI 17 | cd sentencepiece 18 | mkdir -p build && cd build 19 | cmake -DCMAKE_INSTALL_PREFIX=$(realpath ../../${THIRDPARTYLOCAL}) .. 20 | make -j $(nproc --ignore=1) 21 | make install 22 | # cleaning up 23 | cd ../.. 24 | rm -Rf sentencepiece 25 | 26 | 27 | echo "installing cpu-libtorch, if you want the CUDA version it's available at https://pytorch.org" 28 | wget $LIBTORCHURI -O libtorch.zip 29 | unzip -qq libtorch.zip 30 | rm libtorch.zip 31 | 32 | echo "installing nlohmann json..." 33 | git clone --depth 1 ${NLOHMANNURI} 34 | cd json 35 | mkdir build && cd build 36 | cmake -DJSON_BuildTests=OFF -DCMAKE_INSTALL_PREFIX=$(realpath ../../${THIRDPARTYLOCAL}) .. 37 | make install 38 | cd ..
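# boost is left commented out below because it is usually installed via the system package manager (e.g. libboost-all-dev on Ubuntu)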
39 | 40 | 41 | #echo "installing boost" 42 | #wget $BOOSTURI -O boost.tar.gz 43 | #tar xzf boost.tar.gz 44 | #rm boost.tar.gz 45 | 46 | -------------------------------------------------------------------------------- /scripts/sst2tojson.jq: -------------------------------------------------------------------------------- 1 | # jq -R -f scripts/sst2tojson.jq [input_tsv_file] > [output_json_file] 2 | [inputs] | 3 | {data: 4 | [ 5 | to_entries | 6 | .[] | 7 | (.key | tostring) + "\t" + .value | 8 | split("\t") | 9 | {guid: .[0], text_a: .[1], text_b: "", label: .[2]} 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /scripts/trace_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from pydoc import locate 4 | 5 | import torch 6 | import transformers 7 | import typer 8 | 9 | app = typer.Typer() 10 | 11 | 12 | @app.command() 13 | def trace_classification_model( 14 | model_path: Path = "./models/sst2_trained", 15 | output_name: str = "traced_albert.pt", 16 | hf_model_class: str = "transformers.AlbertForSequenceClassification", 17 | hf_tokenizer_class: str = "transformers.AlbertTokenizer", 18 | ): 19 | output_path = model_path / output_name 20 | tokenizer_cls = locate(hf_tokenizer_class) 21 | tokenizer = tokenizer_cls.from_pretrained(str(model_path)) 22 | tokens = tokenizer.encode( 23 | "this is a test", add_special_tokens=True, return_tensors="pt" 24 | ).flatten() 25 | tokens_len = tokens.size(0) 26 | token_ids = torch.zeros(128, dtype=torch.long) 27 | token_ids[:tokens_len] = tokens 28 | token_ids.unsqueeze_(0) 29 | attention_mask = torch.ones(128, dtype=torch.long) 30 | attention_mask[:tokens_len] = 0 31 | attention_mask.unsqueeze_(0) 32 | token_type_ids = (attention_mask == 0).to(torch.long) 33 | position_ids = torch.arange(0, 128, dtype=torch.long) 34 | dummy_input = [token_ids, attention_mask, token_type_ids, position_ids] 35 | model_cls = locate(hf_model_class) 36 | model = model_cls.from_pretrained(str(model_path), torchscript=True) 37 | traced_model = torch.jit.trace(model, dummy_input) 38 | torch.jit.save(traced_model, str(output_path)) 39 | 40 | 41 | if __name__ == "__main__": 42 | app() 43 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | cmake_policy(SET CMP0074 NEW) 4 | 5 | set(HEADER_FILES 6 | squad_utils.h 7 | ) 8 | 9 | set(SOURCE_FILES 10 | run_model.cpp 11 | ) 12 | 13 | set(UTIL_FILES 14 | squad_utils.cpp 15 | tokenizer_base.cpp 16 | tokenizer_albert.cpp 17 | config_utils.cpp 18 | processors.cpp 19 | dataset_classification.cpp 20 | dataset_qa.cpp 21 | ) 22 | 23 | # add the shared library and executable 24 | add_library(hflt SHARED ${UTIL_FILES}) 25 | add_executable(hflt-bin ${SOURCE_FILES}) 26 | 27 | # link the executable to the shared library 28 | set(CMAKE_INSTALL_RPATH "${ORIGIN}") 29 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) 30 | 31 | set_target_properties(hflt-bin PROPERTIES OUTPUT_NAME "hflt") 32 | set_target_properties(hflt PROPERTIES 33 | VERSION ${PROJECT_VERSION} 34 | SOVERSION 0 35 | PUBLIC_HEADER squad_utils.h) 36 | 37 | # makes working with subdirectories easier, but right now not used 38 | target_include_directories(hflt PRIVATE .) 
39 | 40 | # install 41 | include(GNUInstallDirs) 42 | install(TARGETS hflt 43 | EXPORT hfltTargets 44 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 45 | PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 46 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) 47 | install(TARGETS hflt-bin 48 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 49 | 50 | # link target libraries 51 | target_link_libraries(hflt PUBLIC 52 | ${TORCH_LIBRARIES} 53 | ) 54 | target_link_libraries(hflt PRIVATE 55 | ${SENTENCEPIECE_LIBRARIES} 56 | nlohmann_json::nlohmann_json 57 | ) 58 | target_link_libraries(hflt-bin hflt) 59 | 60 | -------------------------------------------------------------------------------- /src/config_utils.cpp: -------------------------------------------------------------------------------- 1 | #include "config_utils.h" 2 | 3 | using namespace std; 4 | 5 | template T _contains_or_empty(json j, char const *k) { 6 | return j.contains(k) ? j[k].get() : T(); 7 | } 8 | 9 | TransformersTokenizerConfigs read_transformers_pretrained(const char *dirpath) { 10 | string basedir(dirpath); 11 | string tokenizer_config_path = basedir + "/tokenizer_config.json"; 12 | ifstream fd_tokenizer_config(tokenizer_config_path); 13 | TransformersTokenizerConfigs configs; 14 | TransformersTokenizerConfig tokenizer_config; 15 | TransformersSpecialTokensMap special_tokens_map; 16 | TransformersAddedTokens added_tokens; 17 | if (!fd_tokenizer_config.is_open()) { 18 | cerr << "something went wrong opening: " << tokenizer_config_path << endl; 19 | } else { 20 | tokenizer_config = read_transformers_tokenizer_config(fd_tokenizer_config); 21 | } 22 | string special_tokens_map_path = basedir + "/special_tokens_map.json"; 23 | ifstream fd_special_tokens_map(special_tokens_map_path); 24 | if (!fd_special_tokens_map.is_open()) { 25 | cerr << "something went wrong opening: " << special_tokens_map_path << endl; 26 | } else { 27 | special_tokens_map = 28 | read_transformers_special_tokens_map(fd_special_tokens_map); 29 | } 30 | string added_tokens_path = basedir + "/added_tokens.json"; 31 | ifstream fd_added_tokens(added_tokens_path); 32 | if (!fd_added_tokens.is_open()) { 33 | // keep for debugging, but the added tokens json file isn't always created 34 | // cerr << "something went wrong opening: " << added_tokens_path << endl; 35 | } else { 36 | added_tokens = read_transformers_added_tokens(fd_added_tokens); 37 | } 38 | // combine into TransformersTokenizerConfigs 39 | configs = {tokenizer_config, special_tokens_map, added_tokens}; 40 | return configs; 41 | } 42 | 43 | TransformersTokenizerConfig read_transformers_tokenizer_config(ifstream &fd) { 44 | json config; 45 | fd >> config; 46 | TransformersTokenizerConfig tc = { 47 | _contains_or_empty(config, "do_lower_case"), 48 | _contains_or_empty>(config, "init_inputs"), 49 | _contains_or_empty(config, "max_len")}; 50 | return tc; 51 | } 52 | 53 | TransformersSpecialTokensMap 54 | read_transformers_special_tokens_map(ifstream &fd) { 55 | json special_tokens; 56 | fd >> special_tokens; 57 | TransformersSpecialTokensMap stm = { 58 | _contains_or_empty(special_tokens, "cls_token"), 59 | _contains_or_empty(special_tokens, "mask_token"), 60 | _contains_or_empty(special_tokens, "pad_token"), 61 | _contains_or_empty(special_tokens, "sep_token"), 62 | _contains_or_empty(special_tokens, "unk_token"), 63 | _contains_or_empty(special_tokens, "bos_token"), 64 | _contains_or_empty(special_tokens, "eos_token")}; 65 | return stm; 66 | } 67 | 68 | TransformersAddedTokens 
read_transformers_added_tokens(ifstream &fd) { 69 | json added_tokens; 70 | fd >> added_tokens; 71 | TransformersAddedTokens at = {added_tokens.empty() 72 | ? vector() 73 | : added_tokens.get>()}; 74 | return at; 75 | } 76 | -------------------------------------------------------------------------------- /src/config_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using json = nlohmann::json; 10 | 11 | struct TransformersTokenizerConfig { 12 | bool do_lower_case; 13 | std::vector init_inputs; 14 | size_t max_len; 15 | }; 16 | 17 | struct TransformersSpecialTokensMap { 18 | std::string cls_token; 19 | std::string mask_token; 20 | std::string pad_token; 21 | std::string sep_token; 22 | std::string unk_token; 23 | std::string bos_token; // in sentencepiece models 24 | std::string eos_token; // in sentencepiece models 25 | }; 26 | 27 | struct TransformersAddedTokens { 28 | std::vector added_tokens; 29 | }; 30 | 31 | struct TransformersTokenizerConfigs { 32 | TransformersTokenizerConfig tokenizer_config; 33 | TransformersSpecialTokensMap special_tokens_map; 34 | TransformersAddedTokens added_tokens; 35 | }; 36 | 37 | TransformersTokenizerConfigs read_transformers_pretrained(const char *dirpath); 38 | TransformersTokenizerConfig 39 | read_transformers_tokenizer_config(std::ifstream &fd); 40 | TransformersSpecialTokensMap 41 | read_transformers_special_tokens_map(std::ifstream &fd); 42 | TransformersAddedTokens read_transformers_added_tokens(std::ifstream &fd); 43 | -------------------------------------------------------------------------------- /src/dataset_classification.cpp: -------------------------------------------------------------------------------- 1 | #include "dataset_classification.h" 2 | 3 | using namespace std; 4 | 5 | // Constructor 6 | template 7 | TransformerClassificationDS:: 8 | TransformerClassificationDS( 9 | const string &pretrained_dir, long maximum_sequence_len, 10 | const function(const string &arg)> read_examples_fn, 11 | const string &read_examples_arg) 12 | : tokenizer_(pretrained_dir.c_str()), 13 | examples_(read_examples_fn(read_examples_arg)), 14 | msl_(maximum_sequence_len) {} 15 | 16 | // get() 17 | template 18 | FeaturesType 19 | TransformerClassificationDS::get( 20 | size_t index) { 21 | auto opts_data = torch::TensorOptions().dtype(torch::kLong); 22 | ExampleType ex = examples_[index]; 23 | // tokenize and tensorize 24 | FeaturesType features = tokenizer_.encode(ex.text_a, ex.text_b, true, msl_, 0, 25 | "longest_first", true); 26 | features.label = _label_to_tensor(ex.label, opts_data); 27 | return features; 28 | } 29 | 30 | // size() 31 | template 32 | torch::optional 33 | TransformerClassificationDS::size() 34 | const { 35 | torch::optional sz(examples_.size()); 36 | return sz; 37 | } 38 | 39 | // examples() 40 | template 41 | const vector & 42 | TransformerClassificationDS::examples() const { 44 | return examples_; 45 | } 46 | 47 | // _label_to_tensor() 48 | template 49 | torch::Tensor 50 | TransformerClassificationDS:: 51 | _label_to_tensor(const string &label, torch::TensorOptions &topts) { 52 | vector lv; 53 | stringstream ss(label); 54 | std::transform(istream_iterator(ss), istream_iterator(), 55 | std::back_inserter(lv), [](long x) { return x; }); 56 | return torch::from_blob(lv.data(), {(long)lv.size()}, topts).clone(); 57 | } 58 | 59 | template class TransformerClassificationDS; 60 | template class 
TransformerClassificationDS<>; 61 | -------------------------------------------------------------------------------- /src/dataset_classification.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #include "tokenizer_albert.h" 14 | #include "tokenizer_base.h" 15 | #include "transformer_example.h" 16 | 17 | template > 20 | class TransformerClassificationDS 21 | : public torch::data::datasets::Dataset< 22 | TransformerClassificationDS, 24 | TransformerSingleFeatures> { 25 | public: 26 | // A base dataset for transformers, which loads the model and tokenizer from 27 | // `pretrained_dir` and populates the `examples_` member using the function 28 | // `read_examples` and `read_examples_arg`. 29 | // 30 | // TODO: make `read_examples` and `read_examples_arg` more generic with 31 | // variadic templates 32 | // 33 | // The supplied `filepath` path should be a tsv file with the sentence 34 | // followed by the label. 35 | explicit TransformerClassificationDS( 36 | const std::string &pretrained_dir, long maximum_sequence_len, 37 | const std::function< 38 | std::vector(const std::string &arg)> 39 | read_examples_fn, 40 | const std::string &read_examples_arg); 41 | 42 | // Returns the `TransformerSingleExample` at the given `index`. 43 | virtual TransformerSingleFeatures get(size_t index) override; 44 | 45 | // Returns the size of the dataset. 46 | torch::optional size() const override; 47 | 48 | // Returns all examples as a vector. 49 | const std::vector &examples() const; 50 | 51 | // read all examples 52 | // virtual std::vector read_examples(const 53 | // std::string &arg) override; 54 | 55 | private: 56 | torch::Tensor _label_to_tensor(const std::string &label, 57 | torch::TensorOptions &topts); 58 | TokenizerType tokenizer_; 59 | std::vector examples_; 60 | long msl_; // maximum sequence length 61 | }; 62 | -------------------------------------------------------------------------------- /src/dataset_qa.cpp: -------------------------------------------------------------------------------- 1 | #include "dataset_qa.h" 2 | 3 | using namespace std; 4 | 5 | // Constructor 6 | template 7 | TransformerQADS::TransformerQADS( 8 | const string &pretrained_dir, long maximum_sequence_len, 9 | const function(const string &arg)> read_examples_fn, 10 | const string &read_examples_arg) 11 | : tokenizer_(pretrained_dir.c_str()), 12 | examples_(read_examples_fn(read_examples_arg)), 13 | msl_(maximum_sequence_len) { 14 | // constructor post-initialization 15 | vector> doc_span_mapping = 16 | add_tokens_to_examples(examples_, tokenizer_, msl_); 17 | items_ = doc_span_mapping; 18 | } 19 | 20 | // get() 21 | template 22 | FeaturesType 23 | TransformerQADS::get(size_t index) { 24 | pair indices = items_[index]; 25 | ExampleType ex = examples_[indices.first]; 26 | pair p_span = ex.p_spans[indices.second]; 27 | FeaturesType features = example_to_features(ex, p_span); 28 | return features; 29 | } 30 | 31 | // size() 32 | template 33 | torch::optional 34 | TransformerQADS::size() const { 35 | torch::optional sz(items_.size()); 36 | return sz; 37 | } 38 | 39 | // examples() 40 | template 41 | const vector & 42 | TransformerQADS::examples() const { 43 | return examples_; 44 | } 45 | 46 | // _label_to_tensor() 47 | template 48 | torch::Tensor 49 | TransformerQADS::_label_to_tensor( 50 | const string &label, torch::TensorOptions &topts) { 51 | vector lv; 52 | 
stringstream ss(label); 53 | std::transform(istream_iterator(ss), istream_iterator(), 54 | std::back_inserter(lv), [](long x) { return x; }); 55 | return torch::from_blob(lv.data(), {(long)lv.size()}, topts).clone(); 56 | } 57 | 58 | // example_to_features() 59 | template 60 | FeaturesType 61 | TransformerQADS::example_to_features( 62 | ExampleType &example, pair &p_span) { 63 | auto opts_data = torch::TensorOptions().dtype(torch::kLong); 64 | size_t max_len = static_cast(msl_); 65 | vector q_tokens(example.q_tokens.begin(), example.q_tokens.end()); 66 | q_tokens.insert(q_tokens.begin(), tokenizer_.cls_token_id()); 67 | q_tokens.push_back(tokenizer_.sep_token_id()); 68 | vector p_tokens_span(example.p_tokens.begin() + p_span.first, 69 | example.p_tokens.begin() + p_span.second); 70 | p_tokens_span.push_back(tokenizer_.sep_token_id()); 71 | vector tokens; 72 | tokens.insert(tokens.end(), q_tokens.begin(), q_tokens.end()); 73 | tokens.insert(tokens.end(), p_tokens_span.begin(), p_tokens_span.end()); 74 | tokens.resize(max_len); 75 | vector attention_mask(q_tokens.size() + p_tokens_span.size(), 1); 76 | attention_mask.resize(max_len, 0); 77 | vector token_type_ids(q_tokens.size(), 0); 78 | token_type_ids.resize(max_len, 1); 79 | vector a_tokens(example.a_tokens.begin(), example.a_tokens.end()); 80 | a_tokens.resize(max_len, 0); 81 | FeaturesType features( 82 | {torch::from_blob(tokens.data(), {msl_}, opts_data).clone(), 83 | torch::from_blob(attention_mask.data(), {msl_}, opts_data).clone(), 84 | torch::from_blob(token_type_ids.data(), {msl_}, opts_data).clone(), 85 | torch::arange(0, msl_, opts_data), 86 | torch::from_blob(a_tokens.data(), {msl_}, opts_data).clone()}); 87 | return features; 88 | } 89 | 90 | template class TransformerQADS; 91 | template class TransformerQADS<>; 92 | -------------------------------------------------------------------------------- /src/dataset_qa.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "squad_utils.h" 15 | #include "tokenizer_albert.h" 16 | #include "tokenizer_base.h" 17 | #include "transformer_example.h" 18 | 19 | template > 22 | class TransformerQADS 23 | : public torch::data::datasets::Dataset< 24 | TransformerQADS, 26 | TransformerSingleFeatures> { 27 | public: 28 | // A base dataset for transformers, which loads the model and tokenizer from 29 | // `pretrained_dir` and populates the `examples_` member using the function 30 | // `read_examples` and `read_examples_arg`. 31 | // 32 | // TODO: make `read_examples` and `read_examples_arg` more generic with 33 | // variadic templates 34 | // 35 | // The supplied `filepath` path should be a tsv file with the sentence 36 | // followed by the label. 37 | explicit TransformerQADS( 38 | const std::string &pretrained_dir, long maximum_sequence_len, 39 | const std::function< 40 | std::vector(const std::string &arg)> 41 | read_examples_fn, 42 | const std::string &read_examples_arg); 43 | 44 | // Returns the `TransformerSingleExample` at the given `index`. 45 | virtual TransformerSingleFeatures get(size_t index) override; 46 | 47 | // Returns the size of the dataset. 48 | torch::optional size() const override; 49 | 50 | // Returns all examples as a vector. 
51 | const std::vector &examples() const; 52 | 53 | TransformerSingleFeatures 54 | example_to_features(TransformerSingleExample &example, std::pair &p_span); 55 | 56 | private: 57 | torch::Tensor _label_to_tensor(const std::string &label, 58 | torch::TensorOptions &topts); 59 | TokenizerType tokenizer_; 60 | std::vector examples_; 61 | std::vector> items_; 62 | long msl_; // maximum sequence length 63 | }; 64 | -------------------------------------------------------------------------------- /src/processors.cpp: -------------------------------------------------------------------------------- 1 | #include "processors.h" 2 | 3 | using namespace std; 4 | 5 | vector readGenericJsonFile(const string &filepath) { 6 | // assumes a json in the format {..., "data": [{data_obj},...]}, where 7 | // data_obj = {"guid": "guid_as_string", "text_a": "some text", "text_b": 8 | // "more text", "label": "label0"} 9 | vector examples; 10 | ifstream ifs(filepath); 11 | if (!ifs.is_open()) { 12 | cerr << "unable to open generic json datafile" << endl; 13 | } else { 14 | json j; 15 | ifs >> j; 16 | for (auto &item : j["data"]) { 17 | string guid = item["guid"].get(); 18 | string text_a = item["text_a"].get(); 19 | string text_b = item["text_b"].get(); 20 | string label = item["label"].get(); 21 | examples.emplace_back(guid, text_a, text_b, label); 22 | } 23 | } 24 | return examples; 25 | } 26 | vector readSST2CsvFile(const string &filepath) { 27 | // This function assumes the csv file is in the format `\t