├── .deepsource.toml ├── .flake8 ├── .github ├── conda │ ├── build.sh │ └── meta.yaml ├── dependabot.yml └── workflows │ ├── black.yml │ ├── python-package.yml │ ├── python-publish-conda.yml │ ├── python-publish-pypi.yml │ └── website.yml ├── .gitignore ├── .pre-commit-config.yaml ├── MANIFEST.in ├── README.md ├── docs ├── gen_ref_pages.py └── index.md ├── mkdocs.yml ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── transformers_embedder ├── __init__.py ├── embedder.py ├── modules ├── __init__.py ├── encoder.py └── scalar_mix.py ├── tokenizer.py └── utils.py /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "python" 5 | enabled = true 6 | 7 | [analyzers.meta] 8 | runtime_version = "3.x.x" 9 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401, E402 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install # Python command to install the script. -------------------------------------------------------------------------------- /.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "transformers-embedder" %} 2 | {% set data = load_setup_py_data() %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ TRANSFORMERS_EMBEDDER_VERSION }}" 7 | 8 | about: 9 | home: {{ data['url'] }} 10 | license: {{ data['license'] }} 11 | summary: {{ data['description'] }} 12 | 13 | requirements: 14 | build: 15 | - python 16 | - transformers>=4.3,<4.12 17 | - spacy>=3.0,<3.2 18 | run: 19 | - python 20 | - transformers>=4.3,<4.12 21 | - spacy>=3.0,<3.2 22 | 23 | source: 24 | path: ../../ 25 | 26 | build: 27 | noarch: python 28 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "04:00" 8 | open-pull-requests-limit: 10 9 | -------------------------------------------------------------------------------- /.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Check Code Quality 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: psf/black@stable 11 | with: 12 | options: "-l 110" 13 | - uses: actions/checkout@v2 14 | - uses: actions/setup-python@v2 15 | with: 16 | python-version: "3.9" 17 | - name: Run flake8 18 | uses: julianwachholz/flake8-action@v2 19 | with: 20 | checkName: "Python Lint" 21 | path: ./transformers_embedder 22 | plugins: "pep8-naming==0.12.1 flake8-comprehensions==3.6.1" 23 | config: .flake8 24 | env: 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow 
will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build Python package 5 | 6 | on: 7 | push: 8 | branches: [$default-branch] 9 | pull_request: 10 | branches: [$default-branch] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: [3.6, 3.7, 3.8, 3.9] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | # - name: Test with pytest 38 | # run: | 39 | # pytest 40 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-conda.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to Conda 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | env: 8 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_TOKEN }} 9 | 10 | jobs: 11 | publish: 12 | runs-on: ubuntu-latest 13 | defaults: 14 | run: 15 | shell: bash -l {0} 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | 21 | - name: Install miniconda 22 | uses: conda-incubator/setup-miniconda@v2 23 | with: 24 | auto-update-conda: true 25 | auto-activate-base: false 26 | python-version: 3.9 27 | activate-environment: "build-transformers-embedder" 28 | channels: riccorl 29 | 30 | - name: Setup conda env 31 | run: | 32 | conda install -c defaults anaconda-client conda-build 33 | - name: Extract version 34 | run: echo "TRANSFORMERS_EMBEDDER_VERSION=`python setup.py --version`" >> $GITHUB_ENV 35 | 36 | - name: Build conda packages 37 | run: | 38 | conda info 39 | conda list 40 | conda-build -c riccorl -c conda-forge -c huggingface .github/conda 41 | 42 | - name: Upload to Anaconda 43 | run: anaconda upload `conda-build .github/conda --output` --force 44 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: "3.x" 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install build 22 | 23 | - name: Extract version 24 | run: echo "TRANSFORMERS_EMBEDDER_VERSION=`python setup.py --version`" >> $GITHUB_ENV 25 | 26 | - name: Build package 27 | run: python -m build 28 | 29 | - name: Publish package 30 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 31 | with: 
32 | user: ${{ secrets.PYPI_USERNAME }} 33 | password: ${{ secrets.PYPI_PASSWORD }} 34 | -------------------------------------------------------------------------------- /.github/workflows/website.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.x 15 | - run: pip install mkdocs-material mkdocs-literate-nav mkdocstrings[python] mkdocs-section-index mkdocs-gen-files 16 | - run: mkdocs gh-deploy --force 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | stuff 3 | /test.ipynb 4 | /test.py 5 | 6 | # Fleet 7 | .fleet 8 | 9 | # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+all,vscode,macos,linux,windows 10 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+all,vscode,macos,linux,windows 11 | 12 | ### Linux ### 13 | *~ 14 | 15 | # temporary files which can be created if a process still has a handle open of a deleted file 16 | .fuse_hidden* 17 | 18 | # KDE directory preferences 19 | .directory 20 | 21 | # Linux trash folder which might appear on any partition or disk 22 | .Trash-* 23 | 24 | # .nfs files are created when an open file is removed but is still being accessed 25 | .nfs* 26 | 27 | ### macOS ### 28 | # General 29 | .DS_Store 30 | .AppleDouble 31 | .LSOverride 32 | 33 | # Icon must end with two \r 34 | Icon 35 | 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear in the root of a volume 41 | .DocumentRevisions-V100 42 | .fseventsd 43 | .Spotlight-V100 44 | .TemporaryItems 45 | .Trashes 46 | .VolumeIcon.icns 47 | .com.apple.timemachine.donotpresent 48 | 49 | # Directories potentially created on remote AFP share 50 | .AppleDB 51 | .AppleDesktop 52 | Network Trash Folder 53 | Temporary Items 54 | .apdisk 55 | 56 | ### PyCharm+all ### 57 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 58 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 59 | 60 | # User-specific stuff 61 | .idea/**/workspace.xml 62 | .idea/**/tasks.xml 63 | .idea/**/usage.statistics.xml 64 | .idea/**/dictionaries 65 | .idea/**/shelf 66 | 67 | # Generated files 68 | .idea/**/contentModel.xml 69 | 70 | # Sensitive or high-churn files 71 | .idea/**/dataSources/ 72 | .idea/**/dataSources.ids 73 | .idea/**/dataSources.local.xml 74 | .idea/**/sqlDataSources.xml 75 | .idea/**/dynamic.xml 76 | .idea/**/uiDesigner.xml 77 | .idea/**/dbnavigator.xml 78 | 79 | # Gradle 80 | .idea/**/gradle.xml 81 | .idea/**/libraries 82 | 83 | # Gradle and Maven with auto-import 84 | # When using Gradle or Maven with auto-import, you should exclude module files, 85 | # since they will be recreated, and may cause churn. Uncomment if using 86 | # auto-import. 
87 | # .idea/artifacts 88 | # .idea/compiler.xml 89 | # .idea/jarRepositories.xml 90 | # .idea/modules.xml 91 | # .idea/*.iml 92 | # .idea/modules 93 | # *.iml 94 | # *.ipr 95 | 96 | # CMake 97 | cmake-build-*/ 98 | 99 | # Mongo Explorer plugin 100 | .idea/**/mongoSettings.xml 101 | 102 | # File-based project format 103 | *.iws 104 | 105 | # IntelliJ 106 | out/ 107 | 108 | # mpeltonen/sbt-idea plugin 109 | .idea_modules/ 110 | 111 | # JIRA plugin 112 | atlassian-ide-plugin.xml 113 | 114 | # Cursive Clojure plugin 115 | .idea/replstate.xml 116 | 117 | # Crashlytics plugin (for Android Studio and IntelliJ) 118 | com_crashlytics_export_strings.xml 119 | crashlytics.properties 120 | crashlytics-build.properties 121 | fabric.properties 122 | 123 | # Editor-based Rest Client 124 | .idea/httpRequests 125 | 126 | # Android studio 3.1+ serialized cache file 127 | .idea/caches/build_file_checksums.ser 128 | 129 | ### PyCharm+all Patch ### 130 | # Ignores the whole .idea folder and all .iml files 131 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 132 | 133 | .idea/ 134 | 135 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 136 | 137 | *.iml 138 | modules.xml 139 | .idea/misc.xml 140 | *.ipr 141 | 142 | # Sonarlint plugin 143 | .idea/sonarlint 144 | 145 | ### Python ### 146 | # Byte-compiled / optimized / DLL files 147 | __pycache__/ 148 | *.py[cod] 149 | *$py.class 150 | 151 | # C extensions 152 | *.so 153 | 154 | # Distribution / packaging 155 | .Python 156 | build/ 157 | develop-eggs/ 158 | dist/ 159 | downloads/ 160 | eggs/ 161 | .eggs/ 162 | lib/ 163 | lib64/ 164 | parts/ 165 | sdist/ 166 | var/ 167 | wheels/ 168 | pip-wheel-metadata/ 169 | share/python-wheels/ 170 | *.egg-info/ 171 | .installed.cfg 172 | *.egg 173 | MANIFEST 174 | 175 | # PyInstaller 176 | # Usually these files are written by a python script from a template 177 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 178 | *.manifest 179 | *.spec 180 | 181 | # Installer logs 182 | pip-log.txt 183 | pip-delete-this-directory.txt 184 | 185 | # Unit test / coverage reports 186 | htmlcov/ 187 | .tox/ 188 | .nox/ 189 | .coverage 190 | .coverage.* 191 | .cache 192 | nosetests.xml 193 | coverage.xml 194 | *.cover 195 | *.py,cover 196 | .hypothesis/ 197 | .pytest_cache/ 198 | pytestdebug.log 199 | 200 | # Translations 201 | *.mo 202 | *.pot 203 | 204 | # Django stuff: 205 | *.log 206 | local_settings.py 207 | db.sqlite3 208 | db.sqlite3-journal 209 | 210 | # Flask stuff: 211 | instance/ 212 | .webassets-cache 213 | 214 | # Scrapy stuff: 215 | .scrapy 216 | 217 | # Sphinx documentation 218 | docs/_build/ 219 | doc/_build/ 220 | 221 | # PyBuilder 222 | target/ 223 | 224 | # Jupyter Notebook 225 | .ipynb_checkpoints 226 | 227 | # IPython 228 | profile_default/ 229 | ipython_config.py 230 | 231 | # pyenv 232 | .python-version 233 | 234 | # pipenv 235 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 236 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 237 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 238 | # install all needed dependencies. 239 | #Pipfile.lock 240 | 241 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 242 | __pypackages__/ 243 | 244 | # Celery stuff 245 | celerybeat-schedule 246 | celerybeat.pid 247 | 248 | # SageMath parsed files 249 | *.sage.py 250 | 251 | # Environments 252 | .env 253 | .venv 254 | env/ 255 | venv/ 256 | ENV/ 257 | env.bak/ 258 | venv.bak/ 259 | pythonenv* 260 | 261 | # Spyder project settings 262 | .spyderproject 263 | .spyproject 264 | 265 | # Rope project settings 266 | .ropeproject 267 | 268 | # mkdocs documentation 269 | /site 270 | 271 | # mypy 272 | .mypy_cache/ 273 | .dmypy.json 274 | dmypy.json 275 | 276 | # Pyre type checker 277 | .pyre/ 278 | 279 | # pytype static type analyzer 280 | .pytype/ 281 | 282 | # profiling data 283 | .prof 284 | 285 | ### vscode ### 286 | .vscode 287 | .vscode/* 288 | !.vscode/settings.json 289 | !.vscode/tasks.json 290 | !.vscode/launch.json 291 | !.vscode/extensions.json 292 | *.code-workspace 293 | 294 | ### Windows ### 295 | # Windows thumbnail cache files 296 | Thumbs.db 297 | Thumbs.db:encryptable 298 | ehthumbs.db 299 | ehthumbs_vista.db 300 | 301 | # Dump file 302 | *.stackdump 303 | 304 | # Folder config file 305 | [Dd]esktop.ini 306 | 307 | # Recycle Bin used on file shares 308 | $RECYCLE.BIN/ 309 | 310 | # Windows Installer files 311 | *.cab 312 | *.msi 313 | *.msix 314 | *.msm 315 | *.msp 316 | 317 | # Windows shortcuts 318 | *.lnk 319 | 320 | # End of https://www.toptal.com/developers/gitignore/api/python,pycharm+all,vscode,macos,linux,windows 321 | 322 | /stuff/ 323 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 21.9b0 4 | hooks: 5 | - id: black 6 | - repo: https://gitlab.com/pycqa/flake8 7 | rev: 3.9.2 8 | hooks: 9 | - id: flake8 10 | 11 | default_language_version: 12 | python: python3 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Transformers Embedder 4 | 5 | [![Open in Visual Studio Code](https://img.shields.io/badge/preview%20in-vscode.dev-blue)](https://github.dev/Riccorl/transformers-embedder) 6 | [![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/) 7 | [![Transformers](https://img.shields.io/badge/4.34-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black) 9 | 10 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml) 11 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml) 12 | [![PyPi Version](https://img.shields.io/github/v/release/Riccorl/transformers-embedder)](https://github.com/Riccorl/transformers-embedder/releases) 13 | [![Anaconda-Server Badge](https://anaconda.org/riccorl/transformers-embedder/badges/version.svg)](https://anaconda.org/riccorl/transformers-embedder) 14 | [![DeepSource](https://deepsource.io/gh/Riccorl/transformers-embedder.svg/?label=active+issues)](https://deepsource.io/gh/Riccorl/transformers-embedder/?ref=repository-badge) 15 | 16 |
17 | 18 | A Word Level Transformer layer based on PyTorch and 🤗 Transformers. 19 | 20 | ## How to use 21 | 22 | Install the library from [PyPI](https://pypi.org/project/transformers-embedder): 23 | 24 | ```bash 25 | pip install transformers-embedder 26 | ``` 27 | 28 | or from [Conda](https://anaconda.org/riccorl/transformers-embedder): 29 | 30 | ```bash 31 | conda install -c riccorl transformers-embedder 32 | ``` 33 | 34 | It offers a PyTorch layer and a tokenizer that support almost every pretrained model from Huggingface 35 | [🤗Transformers](https://huggingface.co/transformers/) library. Here is a quick example: 36 | 37 | ```python 38 | import transformers_embedder as tre 39 | 40 | tokenizer = tre.Tokenizer("bert-base-cased") 41 | 42 | model = tre.TransformersEmbedder( 43 | "bert-base-cased", subword_pooling_strategy="sparse", layer_pooling_strategy="mean" 44 | ) 45 | 46 | example = "This is a sample sentence" 47 | inputs = tokenizer(example, return_tensors=True) 48 | ``` 49 | 50 | ```text 51 | { 52 | 'input_ids': tensor([[ 101, 1188, 1110, 170, 6876, 5650, 102]]), 53 | 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 54 | 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]) 55 | 'scatter_offsets': tensor([[0, 1, 2, 3, 4, 5, 6]]), 56 | 'sparse_offsets': { 57 | 'sparse_indices': tensor( 58 | [ 59 | [0, 0, 0, 0, 0, 0, 0], 60 | [0, 1, 2, 3, 4, 5, 6], 61 | [0, 1, 2, 3, 4, 5, 6] 62 | ] 63 | ), 64 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 65 | 'sparse_size': torch.Size([1, 7, 7]) 66 | }, 67 | 'sentence_length': 7 # with special tokens included 68 | } 69 | ``` 70 | 71 | ```python 72 | outputs = model(**inputs) 73 | ``` 74 | 75 | ```text 76 | # outputs.word_embeddings.shape[1:-1] # remove [CLS] and [SEP] 77 | torch.Size([1, 5, 768]) 78 | # len(example) 79 | 5 80 | ``` 81 | 82 | ## Info 83 | 84 | One of the annoyance of using transformer-based models is that it is not trivial to compute word embeddings 85 | from the sub-token embeddings they output. With this API it's as easy as using 🤗Transformers to get 86 | word-level embeddings from theoretically every transformer model it supports. 87 | 88 | ### Model 89 | 90 | #### Subword Pooling Strategy 91 | 92 | The `TransformersEmbedder` class offers 3 ways to get the embeddings: 93 | 94 | - `subword_pooling_strategy="sparse"`: computes the mean of the embeddings of the sub-tokens of each word 95 | (i.e. the embeddings of the sub-tokens are pooled together) using a sparse matrix multiplication. This 96 | strategy is the default one. 97 | - `subword_pooling_strategy="scatter"`: computes the mean of the embeddings of the sub-tokens of each word 98 | using a scatter-gather operation. It is not deterministic, but it works with ONNX export. 99 | - `subword_pooling_strategy="none"`: returns the raw output of the transformer model without sub-token pooling. 
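As a quick illustration (a minimal sketch that reuses the `bert-base-cased` checkpoint from the example above), the strategy is simply chosen at construction time:

```python
import transformers_embedder as tre

# deterministic sub-word pooling via a sparse matrix multiplication (not ONNX-exportable)
sparse_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="sparse")

# scatter-based pooling: ONNX-friendly, but scatter_add_ is not deterministic
scatter_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="scatter")

# no pooling: raw sub-token embeddings, one vector per sub-token
raw_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="none")
```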
100 | 101 | Here a little feature table: 102 | 103 | | | Pooling | Deterministic | ONNX | 104 | |-------------|:------------------:|:------------------:|:------------------:| 105 | | **Sparse** | :white_check_mark: | :white_check_mark: | :x: | 106 | | **Scatter** | :white_check_mark: | :x: | :white_check_mark: | 107 | | **None** | :x: | :white_check_mark: | :white_check_mark: | 108 | 109 | #### Layer Pooling Strategy 110 | 111 | There are also multiple type of outputs you can get using `layer_pooling_strategy` parameter: 112 | 113 | - `layer_pooling_strategy="last"`: returns the last hidden state of the transformer model 114 | - `layer_pooling_strategy="concat"`: returns the concatenation of the selected `output_layers` of the 115 | transformer model 116 | - `layer_pooling_strategy="sum"`: returns the sum of the selected `output_layers` of the transformer model 117 | - `layer_pooling_strategy="mean"`: returns the average of the selected `output_layers` of the transformer model 118 | - `layer_pooling_strategy="scalar_mix"`: returns the output of a parameterised scalar mixture layer of the 119 | selected `output_layers` of the transformer model 120 | 121 | If you also want all the outputs from the HuggingFace model, you can set `return_all=True` to get them. 122 | 123 | ```python 124 | class TransformersEmbedder(torch.nn.Module): 125 | def __init__( 126 | self, 127 | model: Union[str, tr.PreTrainedModel], 128 | subword_pooling_strategy: str = "sparse", 129 | layer_pooling_strategy: str = "last", 130 | output_layers: Tuple[int] = (-4, -3, -2, -1), 131 | fine_tune: bool = True, 132 | return_all: bool = True, 133 | ) 134 | ``` 135 | 136 | ### Tokenizer 137 | 138 | The `Tokenizer` class provides the `tokenize` method to preprocess the input for the `TransformersEmbedder` 139 | layer. You can pass raw sentences, pre-tokenized sentences and sentences in batch. It will preprocess them 140 | returning a dictionary with the inputs for the model. By passing `return_tensors=True` it will return the 141 | inputs as `torch.Tensor`. 142 | 143 | By default, if you pass text (or batch) as strings, it uses the HuggingFace tokenizer to tokenize them. 
144 | 145 | ```python 146 | text = "This is a sample sentence" 147 | tokenizer(text) 148 | 149 | text = ["This is a sample sentence", "This is another sample sentence"] 150 | tokenizer(text) 151 | ``` 152 | 153 | You can pass a pre-tokenized sentence (or batch of sentences) by setting `is_split_into_words=True` 154 | 155 | ```python 156 | text = ["This", "is", "a", "sample", "sentence"] 157 | tokenizer(text, is_split_into_words=True) 158 | 159 | text = [ 160 | ["This", "is", "a", "sample", "sentence", "1"], 161 | ["This", "is", "sample", "sentence", "2"], 162 | ] 163 | tokenizer(text, is_split_into_words=True) 164 | ``` 165 | 166 | #### Examples 167 | 168 | First, initialize the tokenizer 169 | 170 | ```python 171 | import transformers_embedder as tre 172 | 173 | tokenizer = tre.Tokenizer("bert-base-cased") 174 | ``` 175 | 176 | - You can pass a single sentence as a string: 177 | 178 | ```python 179 | text = "This is a sample sentence" 180 | tokenizer(text) 181 | ``` 182 | 183 | ```text 184 | { 185 | { 186 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 102]], 187 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 188 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 189 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6]], 190 | 'sparse_offsets': { 191 | 'sparse_indices': tensor( 192 | [ 193 | [0, 0, 0, 0, 0, 0, 0], 194 | [0, 1, 2, 3, 4, 5, 6], 195 | [0, 1, 2, 3, 4, 5, 6] 196 | ] 197 | ), 198 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 199 | 'sparse_size': torch.Size([1, 7, 7]) 200 | }, 201 | 'sentence_lengths': [7], 202 | } 203 | ``` 204 | 205 | - A sentence pair 206 | 207 | ```python 208 | text = "This is a sample sentence A" 209 | text_pair = "This is a sample sentence B" 210 | tokenizer(text, text_pair) 211 | ``` 212 | 213 | ```text 214 | { 215 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 138, 102, 1188, 1110, 170, 6876, 5650, 139, 102]], 216 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]], 217 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 218 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]], 219 | 'sparse_offsets': { 220 | 'sparse_indices': tensor( 221 | [ 222 | [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 223 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 224 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 225 | ] 226 | ), 227 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 228 | 'sparse_size': torch.Size([1, 15, 15]) 229 | }, 230 | 'sentence_lengths': [15], 231 | } 232 | ``` 233 | 234 | - A batch of sentences or sentence pairs. Using `padding=True` and `return_tensors=True`, the tokenizer 235 | returns the text ready for the model 236 | 237 | ```python 238 | batch = [ 239 | ["This", "is", "a", "sample", "sentence", "1"], 240 | ["This", "is", "sample", "sentence", "2"], 241 | ["This", "is", "a", "sample", "sentence", "3"], 242 | # ... 243 | ["This", "is", "a", "sample", "sentence", "n", "for", "batch"], 244 | ] 245 | tokenizer(batch, padding=True, return_tensors=True) 246 | 247 | batch_pair = [ 248 | ["This", "is", "a", "sample", "sentence", "pair", "1"], 249 | ["This", "is", "sample", "sentence", "pair", "2"], 250 | ["This", "is", "a", "sample", "sentence", "pair", "3"], 251 | # ... 
252 | ["This", "is", "a", "sample", "sentence", "pair", "n", "for", "batch"], 253 | ] 254 | tokenizer(batch, batch_pair, padding=True, return_tensors=True) 255 | ``` 256 | 257 | #### Custom fields 258 | 259 | It is possible to add custom fields to the model input and tell the `tokenizer` how to pad them using 260 | `add_padding_ops`. Start by initializing the tokenizer with the model name: 261 | 262 | ```python 263 | import transformers_embedder as tre 264 | 265 | tokenizer = tre.Tokenizer("bert-base-cased") 266 | ``` 267 | 268 | Then add the custom fields to it: 269 | 270 | ```python 271 | custom_fields = { 272 | "custom_filed_1": [ 273 | [0, 0, 0, 0, 1, 0, 0], 274 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0] 275 | ] 276 | } 277 | ``` 278 | 279 | Now we can add the padding logic for our custom field `custom_filed_1`. `add_padding_ops` method takes in 280 | input 281 | 282 | - `key`: name of the field in the tokenizer input 283 | - `value`: value to use for padding 284 | - `length`: length to pad. It can be an `int`, or two string value, `subword` in which the element is padded 285 | to match the length of the subwords, and `word` where the element is padded relative to the length of the 286 | batch after the merge of the subwords. 287 | 288 | ```python 289 | tokenizer.add_padding_ops("custom_filed_1", 0, "word") 290 | ``` 291 | 292 | Finally, we can tokenize the input with the custom field: 293 | 294 | ```python 295 | text = [ 296 | "This is a sample sentence", 297 | "This is another example sentence just make it longer, with a comma too!" 298 | ] 299 | 300 | tokenizer(text, padding=True, return_tensors=True, additional_inputs=custom_fields) 301 | ``` 302 | 303 | The inputs are ready for the model, including the custom filed. 304 | 305 | ```text 306 | >>> inputs 307 | 308 | { 309 | 'input_ids': tensor( 310 | [ 311 | [ 101, 1188, 1110, 170, 6876, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 312 | [ 101, 1188, 1110, 1330, 1859, 5650, 1198, 1294, 1122, 2039, 117, 1114, 170, 3254, 1918, 1315, 106, 102] 313 | ] 314 | ), 315 | 'token_type_ids': tensor( 316 | [ 317 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 318 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 319 | ] 320 | ), 321 | 'attention_mask': tensor( 322 | [ 323 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 324 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 325 | ] 326 | ), 327 | 'scatter_offsets': tensor( 328 | [ 329 | [ 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 330 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16] 331 | ] 332 | ), 333 | 'sparse_offsets': { 334 | 'sparse_indices': tensor( 335 | [ 336 | [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 337 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16], 338 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] 339 | ] 340 | ), 341 | 'sparse_values': tensor( 342 | [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 343 | 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 344 | 1.0000, 1.0000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000] 345 | ), 346 | 'sparse_size': torch.Size([2, 17, 18]) 347 | } 348 | 'sentence_lengths': [7, 17], 349 | } 350 | ``` 351 | 352 | ## Acknowledgements 353 | 354 | Some code in the `TransformersEmbedder` class is taken from the [PyTorch Scatter](https://github.com/rusty1s/pytorch_scatter/) 355 | library. 
The pretrained models and the core of the tokenizer is from [🤗 Transformers](https://huggingface.co/transformers/). 356 | -------------------------------------------------------------------------------- /docs/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages and navigation.""" 2 | 3 | from pathlib import Path 4 | 5 | import os 6 | 7 | import mkdocs_gen_files 8 | 9 | nav = mkdocs_gen_files.Nav() 10 | 11 | ROOT_DIR = Path(__file__).parent.parent 12 | SRC_DIR = ROOT_DIR / "transformers_embedder" 13 | DOC_DIR = ROOT_DIR / "references" 14 | 15 | for path in sorted(Path("transformers_embedder").glob("**/*.py")): 16 | module_path = path.with_suffix("") 17 | doc_path = path.with_suffix(".md").name 18 | full_doc_path = DOC_DIR / doc_path 19 | parts = tuple(module_path.parts) 20 | 21 | if parts[-1] == "__init__": 22 | parts = parts[:-1] 23 | # doc_path = doc_path.with_name("index.md") 24 | # full_doc_path = full_doc_path.with_name("index.md") 25 | elif parts[-1] == "__main__": 26 | continue 27 | 28 | nav[parts] = doc_path 29 | 30 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 31 | ident = ".".join(parts) 32 | fd.write(f"::: {ident}") 33 | 34 | mkdocs_gen_files.set_edit_path(full_doc_path, path) 35 | 36 | with mkdocs_gen_files.open(DOC_DIR / "main.md", "w") as nav_file: 37 | nav_file.writelines(nav.build_literate_nav()) 38 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Transformers Embedder 4 | 5 | [![Open in Visual Studio Code](https://img.shields.io/badge/preview%20in-vscode.dev-blue)](https://github.dev/Riccorl/transformers-embedder) 6 | [![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/) 7 | [![Transformers](https://img.shields.io/badge/4.34-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black) 9 | 10 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml) 11 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml) 12 | [![PyPi Version](https://img.shields.io/github/v/release/Riccorl/transformers-embedder)](https://github.com/Riccorl/transformers-embedder/releases) 13 | [![Anaconda-Server Badge](https://anaconda.org/riccorl/transformers-embedder/badges/version.svg)](https://anaconda.org/riccorl/transformers-embedder) 14 | [![DeepSource](https://deepsource.io/gh/Riccorl/transformers-embedder.svg/?label=active+issues)](https://deepsource.io/gh/Riccorl/transformers-embedder/?ref=repository-badge) 15 | 16 |
17 | 18 | A Word Level Transformer layer based on PyTorch and 🤗 Transformers. 19 | 20 | ## How to use 21 | 22 | Install the library from [PyPI](https://pypi.org/project/transformers-embedder): 23 | 24 | ```bash 25 | pip install transformers-embedder 26 | ``` 27 | 28 | or from [Conda](https://anaconda.org/riccorl/transformers-embedder): 29 | 30 | ```bash 31 | conda install -c riccorl transformers-embedder 32 | ``` 33 | 34 | It offers a PyTorch layer and a tokenizer that support almost every pretrained model from Huggingface 35 | [🤗Transformers](https://huggingface.co/transformers/) library. Here is a quick example: 36 | 37 | ```python 38 | import transformers_embedder as tre 39 | 40 | tokenizer = tre.Tokenizer("bert-base-cased") 41 | 42 | model = tre.TransformersEmbedder( 43 | "bert-base-cased", subword_pooling_strategy="sparse", layer_pooling_strategy="mean" 44 | ) 45 | 46 | example = "This is a sample sentence" 47 | inputs = tokenizer(example, return_tensors=True) 48 | ``` 49 | 50 | ```text 51 | { 52 | 'input_ids': tensor([[ 101, 1188, 1110, 170, 6876, 5650, 102]]), 53 | 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 54 | 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]) 55 | 'scatter_offsets': tensor([[0, 1, 2, 3, 4, 5, 6]]), 56 | 'sparse_offsets': { 57 | 'sparse_indices': tensor( 58 | [ 59 | [0, 0, 0, 0, 0, 0, 0], 60 | [0, 1, 2, 3, 4, 5, 6], 61 | [0, 1, 2, 3, 4, 5, 6] 62 | ] 63 | ), 64 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 65 | 'sparse_size': torch.Size([1, 7, 7]) 66 | }, 67 | 'sentence_length': 7 # with special tokens included 68 | } 69 | ``` 70 | 71 | ```python 72 | outputs = model(**inputs) 73 | ``` 74 | 75 | ```text 76 | # outputs.word_embeddings.shape[1:-1] # remove [CLS] and [SEP] 77 | torch.Size([1, 5, 768]) 78 | # len(example) 79 | 5 80 | ``` 81 | 82 | ## Info 83 | 84 | One of the annoyance of using transformer-based models is that it is not trivial to compute word embeddings 85 | from the sub-token embeddings they output. With this API it's as easy as using 🤗Transformers to get 86 | word-level embeddings from theoretically every transformer model it supports. 87 | 88 | ### Model 89 | 90 | #### Subword Pooling Strategy 91 | 92 | The `TransformersEmbedder` class offers 3 ways to get the embeddings: 93 | 94 | - `subword_pooling_strategy="sparse"`: computes the mean of the embeddings of the sub-tokens of each word 95 | (i.e. the embeddings of the sub-tokens are pooled together) using a sparse matrix multiplication. This 96 | strategy is the default one. 97 | - `subword_pooling_strategy="scatter"`: computes the mean of the embeddings of the sub-tokens of each word 98 | using a scatter-gather operation. It is not deterministic, but it works with ONNX export. 99 | - `subword_pooling_strategy="none"`: returns the raw output of the transformer model without sub-token pooling. 
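As a quick illustration (a minimal sketch that reuses the `bert-base-cased` checkpoint from the example above), the strategy is simply chosen at construction time:

```python
import transformers_embedder as tre

# deterministic sub-word pooling via a sparse matrix multiplication (not ONNX-exportable)
sparse_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="sparse")

# scatter-based pooling: ONNX-friendly, but scatter_add_ is not deterministic
scatter_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="scatter")

# no pooling: raw sub-token embeddings, one vector per sub-token
raw_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="none")
```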
100 | 101 | Here a little feature table: 102 | 103 | | | Pooling | Deterministic | ONNX | 104 | |-------------|:------------------:|:------------------:|:------------------:| 105 | | **Sparse** | :white_check_mark: | :white_check_mark: | :x: | 106 | | **Scatter** | :white_check_mark: | :x: | :white_check_mark: | 107 | | **None** | :x: | :white_check_mark: | :white_check_mark: | 108 | 109 | #### Layer Pooling Strategy 110 | 111 | There are also multiple type of outputs you can get using `layer_pooling_strategy` parameter: 112 | 113 | - `layer_pooling_strategy="last"`: returns the last hidden state of the transformer model 114 | - `layer_pooling_strategy="concat"`: returns the concatenation of the selected `output_layers` of the 115 | transformer model 116 | - `layer_pooling_strategy="sum"`: returns the sum of the selected `output_layers` of the transformer model 117 | - `layer_pooling_strategy="mean"`: returns the average of the selected `output_layers` of the transformer model 118 | - `layer_pooling_strategy="scalar_mix"`: returns the output of a parameterised scalar mixture layer of the 119 | selected `output_layers` of the transformer model 120 | 121 | If you also want all the outputs from the HuggingFace model, you can set `return_all=True` to get them. 122 | 123 | ```python 124 | class TransformersEmbedder(torch.nn.Module): 125 | def __init__( 126 | self, 127 | model: Union[str, tr.PreTrainedModel], 128 | subword_pooling_strategy: str = "sparse", 129 | layer_pooling_strategy: str = "last", 130 | output_layers: Tuple[int] = (-4, -3, -2, -1), 131 | fine_tune: bool = True, 132 | return_all: bool = True, 133 | ) 134 | ``` 135 | 136 | ### Tokenizer 137 | 138 | The `Tokenizer` class provides the `tokenize` method to preprocess the input for the `TransformersEmbedder` 139 | layer. You can pass raw sentences, pre-tokenized sentences and sentences in batch. It will preprocess them 140 | returning a dictionary with the inputs for the model. By passing `return_tensors=True` it will return the 141 | inputs as `torch.Tensor`. 142 | 143 | By default, if you pass text (or batch) as strings, it uses the HuggingFace tokenizer to tokenize them. 
144 | 145 | ```python 146 | text = "This is a sample sentence" 147 | tokenizer(text) 148 | 149 | text = ["This is a sample sentence", "This is another sample sentence"] 150 | tokenizer(text) 151 | ``` 152 | 153 | You can pass a pre-tokenized sentence (or batch of sentences) by setting `is_split_into_words=True` 154 | 155 | ```python 156 | text = ["This", "is", "a", "sample", "sentence"] 157 | tokenizer(text, is_split_into_words=True) 158 | 159 | text = [ 160 | ["This", "is", "a", "sample", "sentence", "1"], 161 | ["This", "is", "sample", "sentence", "2"], 162 | ] 163 | tokenizer(text, is_split_into_words=True) 164 | ``` 165 | 166 | #### Examples 167 | 168 | First, initialize the tokenizer 169 | 170 | ```python 171 | import transformers_embedder as tre 172 | 173 | tokenizer = tre.Tokenizer("bert-base-cased") 174 | ``` 175 | 176 | - You can pass a single sentence as a string: 177 | 178 | ```python 179 | text = "This is a sample sentence" 180 | tokenizer(text) 181 | ``` 182 | 183 | ```text 184 | { 185 | { 186 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 102]], 187 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 188 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 189 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6]], 190 | 'sparse_offsets': { 191 | 'sparse_indices': tensor( 192 | [ 193 | [0, 0, 0, 0, 0, 0, 0], 194 | [0, 1, 2, 3, 4, 5, 6], 195 | [0, 1, 2, 3, 4, 5, 6] 196 | ] 197 | ), 198 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 199 | 'sparse_size': torch.Size([1, 7, 7]) 200 | }, 201 | 'sentence_lengths': [7], 202 | } 203 | ``` 204 | 205 | - A sentence pair 206 | 207 | ```python 208 | text = "This is a sample sentence A" 209 | text_pair = "This is a sample sentence B" 210 | tokenizer(text, text_pair) 211 | ``` 212 | 213 | ```text 214 | { 215 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 138, 102, 1188, 1110, 170, 6876, 5650, 139, 102]], 216 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]], 217 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 218 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]], 219 | 'sparse_offsets': { 220 | 'sparse_indices': tensor( 221 | [ 222 | [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 223 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 224 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 225 | ] 226 | ), 227 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 228 | 'sparse_size': torch.Size([1, 15, 15]) 229 | }, 230 | 'sentence_lengths': [15], 231 | } 232 | ``` 233 | 234 | - A batch of sentences or sentence pairs. Using `padding=True` and `return_tensors=True`, the tokenizer 235 | returns the text ready for the model 236 | 237 | ```python 238 | batch = [ 239 | ["This", "is", "a", "sample", "sentence", "1"], 240 | ["This", "is", "sample", "sentence", "2"], 241 | ["This", "is", "a", "sample", "sentence", "3"], 242 | # ... 243 | ["This", "is", "a", "sample", "sentence", "n", "for", "batch"], 244 | ] 245 | tokenizer(batch, padding=True, return_tensors=True) 246 | 247 | batch_pair = [ 248 | ["This", "is", "a", "sample", "sentence", "pair", "1"], 249 | ["This", "is", "sample", "sentence", "pair", "2"], 250 | ["This", "is", "a", "sample", "sentence", "pair", "3"], 251 | # ... 
252 | ["This", "is", "a", "sample", "sentence", "pair", "n", "for", "batch"], 253 | ] 254 | tokenizer(batch, batch_pair, padding=True, return_tensors=True) 255 | ``` 256 | 257 | #### Custom fields 258 | 259 | It is possible to add custom fields to the model input and tell the `tokenizer` how to pad them using 260 | `add_padding_ops`. Start by initializing the tokenizer with the model name: 261 | 262 | ```python 263 | import transformers_embedder as tre 264 | 265 | tokenizer = tre.Tokenizer("bert-base-cased") 266 | ``` 267 | 268 | Then add the custom fields to it: 269 | 270 | ```python 271 | custom_fields = { 272 | "custom_filed_1": [ 273 | [0, 0, 0, 0, 1, 0, 0], 274 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0] 275 | ] 276 | } 277 | ``` 278 | 279 | Now we can add the padding logic for our custom field `custom_filed_1`. `add_padding_ops` method takes in 280 | input 281 | 282 | - `key`: name of the field in the tokenizer input 283 | - `value`: value to use for padding 284 | - `length`: length to pad. It can be an `int`, or two string value, `subword` in which the element is padded 285 | to match the length of the subwords, and `word` where the element is padded relative to the length of the 286 | batch after the merge of the subwords. 287 | 288 | ```python 289 | tokenizer.add_padding_ops("custom_filed_1", 0, "word") 290 | ``` 291 | 292 | Finally, we can tokenize the input with the custom field: 293 | 294 | ```python 295 | text = [ 296 | "This is a sample sentence", 297 | "This is another example sentence just make it longer, with a comma too!" 298 | ] 299 | 300 | tokenizer(text, padding=True, return_tensors=True, additional_inputs=custom_fields) 301 | ``` 302 | 303 | The inputs are ready for the model, including the custom filed. 304 | 305 | ```text 306 | >>> inputs 307 | 308 | { 309 | 'input_ids': tensor( 310 | [ 311 | [ 101, 1188, 1110, 170, 6876, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 312 | [ 101, 1188, 1110, 1330, 1859, 5650, 1198, 1294, 1122, 2039, 117, 1114, 170, 3254, 1918, 1315, 106, 102] 313 | ] 314 | ), 315 | 'token_type_ids': tensor( 316 | [ 317 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 318 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 319 | ] 320 | ), 321 | 'attention_mask': tensor( 322 | [ 323 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 324 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 325 | ] 326 | ), 327 | 'scatter_offsets': tensor( 328 | [ 329 | [ 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 330 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16] 331 | ] 332 | ), 333 | 'sparse_offsets': { 334 | 'sparse_indices': tensor( 335 | [ 336 | [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 337 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16], 338 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] 339 | ] 340 | ), 341 | 'sparse_values': tensor( 342 | [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 343 | 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 344 | 1.0000, 1.0000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000] 345 | ), 346 | 'sparse_size': torch.Size([2, 17, 18]) 347 | } 348 | 'sentence_lengths': [7, 17], 349 | } 350 | ``` 351 | 352 | ## Acknowledgements 353 | 354 | Some code in the `TransformersEmbedder` class is taken from the [PyTorch Scatter](https://github.com/rusty1s/pytorch_scatter/) 355 | library. 
The pretrained models and the core of the tokenizer is from [🤗 Transformers](https://huggingface.co/transformers/). 356 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Transformers Embedder 2 | repo_url: https://github.com/riccorl/transformers-embedder 3 | 4 | plugins: 5 | - search 6 | - gen-files: 7 | scripts: 8 | - docs/gen_ref_pages.py 9 | - literate-nav: 10 | nav_file: main.md 11 | - section-index 12 | - mkdocstrings: 13 | custom_templates: templates 14 | default_handler: python 15 | handlers: 16 | python: 17 | options: 18 | docstring_style: google 19 | watch: 20 | - transformers_embedder 21 | 22 | theme: 23 | name: material 24 | features: 25 | - search.suggest 26 | - search.highlight 27 | icon: 28 | repo: fontawesome/brands/github 29 | palette: 30 | # Palette toggle for light mode 31 | - media: "(prefers-color-scheme: light)" 32 | primary: deep purple 33 | accent: yellow 34 | scheme: default 35 | font: 36 | text: Work Sans 37 | code: Fira Mono 38 | toggle: 39 | icon: material/brightness-7 40 | name: Switch to dark mode 41 | # Palette toggle for dark mode 42 | - media: "(prefers-color-scheme: dark)" 43 | primary: deep purple 44 | accent: yellow 45 | scheme: slate 46 | font: 47 | text: Work Sans 48 | code: Fira Mono 49 | toggle: 50 | icon: material/brightness-4 51 | name: Switch to light mode 52 | 53 | nav: 54 | - API References: references/ 55 | 56 | extra: 57 | # version: 58 | # provider: mike 59 | 60 | social: 61 | - icon: fontawesome/brands/twitter 62 | link: https://twitter.com/RiccrdoRicOrl 63 | - icon: fontawesome/brands/github 64 | link: https://github.com/riccorl 65 | 66 | markdown_extensions: 67 | - admonition 68 | - codehilite 69 | - pymdownx.superfences -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | include = '\.pyi?$' 3 | exclude = ''' 4 | /( 5 | \.git 6 | | \.hg 7 | | \.mypy_cache 8 | | \.tox 9 | | \.venv 10 | | _build 11 | | buck-out 12 | | build 13 | | dist 14 | )/ 15 | ''' -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | datasets 3 | mkdocs-material 4 | mkdocstrings[python] 5 | mkdocs-literate-nav 6 | mkdocs-section-index 7 | mkdocs-gen-files 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.7 2 | transformers>=4.14,<4.35 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | extras = {} 7 | extras["torch"] = ["torch>=1.5,<2.2"] 8 | extras["all"] = extras["torch"] 9 | extras["docs"] = ["mkdocs-material"] 10 | 11 | install_requires = ["transformers>=4.14,<4.35"] 12 | 13 | setuptools.setup( 14 | name="transformers_embedder", 15 | 
version="3.0.11", 16 | author="Riccardo Orlando", 17 | author_email="orlandoricc@gmail.com", 18 | description="Word level transformer based embeddings", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/Riccorl/transformers-embedder", 22 | keywords="NLP deep learning transformer pytorch BERT google subtoken wordpieces embeddings", 23 | packages=setuptools.find_packages(), 24 | include_package_data=True, 25 | license="Apache", 26 | classifiers=[ 27 | "Programming Language :: Python :: 3", 28 | "License :: OSI Approved :: Apache Software License", 29 | "Operating System :: OS Independent", 30 | ], 31 | extras_require=extras, 32 | install_requires=install_requires, 33 | python_requires=">=3.6", 34 | ) 35 | -------------------------------------------------------------------------------- /transformers_embedder/__init__.py: -------------------------------------------------------------------------------- 1 | from transformers_embedder import utils 2 | 3 | if utils.is_torch_available(): 4 | from transformers_embedder.embedder import TransformersEmbedder, TransformersEncoder 5 | 6 | from transformers import ( 7 | BertTokenizer, 8 | BertTokenizerFast, 9 | BertweetTokenizer, 10 | CamembertTokenizer, 11 | CamembertTokenizerFast, 12 | DebertaTokenizer, 13 | DebertaTokenizerFast, 14 | DebertaV2Tokenizer, 15 | DebertaV2TokenizerFast, 16 | DistilBertTokenizer, 17 | DistilBertTokenizerFast, 18 | MobileBertTokenizer, 19 | MobileBertTokenizerFast, 20 | RobertaTokenizer, 21 | RobertaTokenizerFast, 22 | XLMRobertaTokenizer, 23 | XLMRobertaTokenizerFast, 24 | XLMTokenizer, 25 | ) 26 | 27 | 28 | MODELS_WITH_STARTING_TOKEN = ( 29 | BertTokenizer, 30 | BertTokenizerFast, 31 | DistilBertTokenizer, 32 | DistilBertTokenizerFast, 33 | MobileBertTokenizer, 34 | MobileBertTokenizerFast, 35 | BertweetTokenizer, 36 | CamembertTokenizer, 37 | CamembertTokenizerFast, 38 | DebertaTokenizer, 39 | DebertaTokenizerFast, 40 | DebertaV2Tokenizer, 41 | DebertaV2TokenizerFast, 42 | RobertaTokenizer, 43 | RobertaTokenizerFast, 44 | XLMRobertaTokenizer, 45 | XLMRobertaTokenizerFast, 46 | XLMTokenizer, 47 | ) 48 | 49 | MODELS_WITH_DOUBLE_SEP = ( 50 | CamembertTokenizer, 51 | CamembertTokenizerFast, 52 | BertweetTokenizer, 53 | RobertaTokenizer, 54 | RobertaTokenizerFast, 55 | XLMRobertaTokenizer, 56 | XLMRobertaTokenizerFast, 57 | ) 58 | 59 | from transformers_embedder.tokenizer import Tokenizer 60 | -------------------------------------------------------------------------------- /transformers_embedder/embedder.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | from typing import Optional, Union, Tuple, Sequence, Any, Mapping 4 | 5 | import transformers as tr 6 | 7 | from transformers_embedder import utils 8 | from transformers_embedder.modules.scalar_mix import ScalarMix 9 | from transformers_embedder.modules.encoder import Encoder 10 | 11 | if utils.is_torch_available(): 12 | import torch 13 | 14 | logger = utils.get_logger(__name__) 15 | utils.get_logger("transformers") 16 | 17 | 18 | @dataclass 19 | class TransformersEmbedderOutput(tr.file_utils.ModelOutput): 20 | """Class for model's outputs.""" 21 | 22 | word_embeddings: Optional[torch.FloatTensor] = None 23 | last_hidden_state: Optional[torch.FloatTensor] = None 24 | pooler_output: Optional[torch.FloatTensor] = None 25 | hidden_states: Optional[Tuple[torch.FloatTensor]] = None 26 | attentions: 
Optional[Tuple[torch.FloatTensor]] = None 27 | 28 | 29 | class TransformersEmbedder(torch.nn.Module): 30 | """ 31 | Transformer Embedder class. 32 | 33 | Word level embeddings from various transformer architectures from the Huggingface Transformers API. 34 | 35 | Args: 36 | model (`str`, `tr.PreTrainedModel`): 37 | Transformer model to use (https://huggingface.co/models). 38 | layer_pooling_strategy (`str`, optional, defaults to `last`): 39 | What output to get from the transformer model. The last hidden state (``last``), 40 | the concatenation of the selected hidden layers (``concat``), the sum of the selected hidden 41 | layers (``sum``), the average of the selected hidden layers (``mean``), or a scalar mixture of 42 | the selected hidden layers (``scalar_mix``). 43 | subword_pooling_strategy (`str`, optional, defaults to `sparse`): 44 | What pooling strategy to use for the sub-word embeddings. Methods available are ``sparse``, 45 | ``scatter`` and ``none``. The ``scatter`` strategy is ONNX compatible but uses ``scatter_add_`` 46 | that is not deterministic. The ``sparse`` strategy is deterministic but it is not compatible 47 | with ONNX. When ``subword_pooling_strategy`` is ``none``, the sub-word embeddings are not 48 | pooled. 49 | output_layers (`tuple`, `list`, `str`, optional, defaults to `(-4, -3, -2, -1)`): 50 | Which hidden layers to get from the transformer model. If ``output_layers`` is ``all``, 51 | all the hidden layers are returned. If ``output_layers`` is a tuple or a list, the hidden 52 | layers are selected according to the indexes in the tuple or list. If ``output_layers`` is 53 | a string, it must be ``all``. 54 | fine_tune (`bool`, optional, defaults to `True`): 55 | If ``True``, the transformer model is fine-tuned during training. 56 | return_all (`bool`, optional, defaults to `False`): 57 | If ``True``, returns all the outputs from the HuggingFace model. 58 | from_pretrained (`bool`, optional, defaults to `True`): 59 | If ``True``, the model is loaded from a pre-trained model, otherwise it is initialized with 60 | random weights. Useful when you want to load a model from a specific checkpoint, without 61 | having to download the entire model. 
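    Example (a minimal sketch; it mirrors the README quick start and assumes the
    ``bert-base-cased`` weights are available locally or can be downloaded):

        >>> import transformers_embedder as tre
        >>> tokenizer = tre.Tokenizer("bert-base-cased")
        >>> model = TransformersEmbedder("bert-base-cased", subword_pooling_strategy="sparse")
        >>> inputs = tokenizer("This is a sample sentence", return_tensors=True)
        >>> outputs = model(**inputs)
        >>> # outputs.word_embeddings holds one vector per word (special tokens included)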
62 | """ 63 | 64 | def __init__( 65 | self, 66 | model: Union[str, tr.PreTrainedModel], 67 | layer_pooling_strategy: str = "last", 68 | subword_pooling_strategy: str = "scatter", 69 | output_layers: Union[Sequence[int], str] = (-4, -3, -2, -1), 70 | fine_tune: bool = True, 71 | return_all: bool = False, 72 | from_pretrained: bool = True, 73 | *args, 74 | **kwargs, 75 | ) -> None: 76 | super().__init__() 77 | if isinstance(model, str): 78 | self.config = tr.AutoConfig.from_pretrained( 79 | model, 80 | output_hidden_states=True, 81 | output_attentions=True, 82 | *args, 83 | **kwargs, 84 | ) 85 | if from_pretrained: 86 | self.transformer_model = tr.AutoModel.from_pretrained( 87 | model, config=self.config, *args, **kwargs 88 | ) 89 | else: 90 | self.transformer_model = tr.AutoModel.from_config( 91 | self.config, *args, **kwargs 92 | ) 93 | else: 94 | self.transformer_model = model 95 | 96 | # pooling strategy parameters 97 | self.layer_pooling_strategy = layer_pooling_strategy 98 | self.subword_pooling_strategy = subword_pooling_strategy 99 | 100 | if output_layers == "all": 101 | output_layers = tuple( 102 | range(self.transformer_model.config.num_hidden_layers) 103 | ) 104 | 105 | # check output_layers is well defined 106 | if ( 107 | max(map(abs, output_layers)) 108 | >= self.transformer_model.config.num_hidden_layers 109 | ): 110 | raise ValueError( 111 | f"`output_layers` parameter not valid, choose between 0 and " 112 | f"{self.transformer_model.config.num_hidden_layers - 1}. " 113 | f"Current value is `{output_layers}`" 114 | ) 115 | self.output_layers = output_layers 116 | 117 | self._scalar_mix: Optional[ScalarMix] = None 118 | if layer_pooling_strategy == "scalar_mix": 119 | self._scalar_mix = ScalarMix(len(output_layers)) 120 | 121 | # check if return all transformer outputs 122 | self.return_all = return_all 123 | 124 | # if fine_tune is False, freeze all the transformer's parameters 125 | if not fine_tune: 126 | for param in self.transformer_model.parameters(): 127 | param.requires_grad = False 128 | 129 | def forward( 130 | self, 131 | input_ids: torch.Tensor, 132 | attention_mask: Optional[torch.Tensor] = None, 133 | token_type_ids: Optional[torch.Tensor] = None, 134 | scatter_offsets: Optional[torch.Tensor] = None, 135 | sparse_offsets: Optional[Mapping[str, Any]] = None, 136 | **kwargs, 137 | ) -> TransformersEmbedderOutput: 138 | """ 139 | Forward method of the PyTorch module. 140 | 141 | Args: 142 | input_ids (`torch.Tensor`): 143 | Input ids for the transformer model. 144 | attention_mask (`torch.Tensor`, optional): 145 | Attention mask for the transformer model. 146 | token_type_ids (`torch.Tensor`, optional): 147 | Token type ids for the transformer model. 148 | scatter_offsets (`torch.Tensor`, optional): 149 | Offsets of the sub-word, used to reconstruct the word embeddings using 150 | the ``scatter`` method. 151 | sparse_offsets (`Mapping[str, Any]`, optional): 152 | Offsets of the sub-word, used to reconstruct the word embeddings using 153 | the ``sparse`` method. 154 | 155 | Returns: 156 | `TransformersEmbedderOutput`: 157 | Word level embeddings plus the output of the transformer model. 158 | """ 159 | # Some HuggingFace models don't have the 160 | # token_type_ids parameter and fail even when it's given as None. 161 | inputs = {"input_ids": input_ids, "attention_mask": attention_mask} 162 | if token_type_ids is not None: 163 | inputs["token_type_ids"] = token_type_ids 164 | 165 | # Shape: [batch_size, num_sub-words, embedding_size]. 
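        # hidden_states (and attentions) are populated below because, when the model is
        # built from a string name in __init__, its config is created with
        # output_hidden_states=True and output_attentions=True; every layer pooling
        # strategy other than "last" reads from hidden_states.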
166 | transformer_outputs = self.transformer_model(**inputs) 167 | if self.layer_pooling_strategy == "last": 168 | word_embeddings = transformer_outputs.last_hidden_state 169 | elif self.layer_pooling_strategy == "concat": 170 | word_embeddings = [ 171 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 172 | ] 173 | word_embeddings = torch.cat(word_embeddings, dim=-1) 174 | elif self.layer_pooling_strategy == "sum": 175 | word_embeddings = [ 176 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 177 | ] 178 | word_embeddings = torch.stack(word_embeddings, dim=0).sum(dim=0) 179 | elif self.layer_pooling_strategy == "mean": 180 | word_embeddings = [ 181 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 182 | ] 183 | word_embeddings = torch.stack(word_embeddings, dim=0).mean( 184 | dim=0, dtype=torch.float 185 | ) 186 | elif self.layer_pooling_strategy == "scalar_mix": 187 | word_embeddings = [ 188 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 189 | ] 190 | word_embeddings = self._scalar_mix(word_embeddings) 191 | else: 192 | raise ValueError( 193 | "`layer_pooling_strategy` parameter not valid, choose between `last`, `concat`, " 194 | f"`sum`, `mean` and `scalar_mix`. Current value `{self.layer_pooling_strategy}`" 195 | ) 196 | 197 | if ( 198 | self.subword_pooling_strategy != "none" 199 | and scatter_offsets is None 200 | and sparse_offsets is None 201 | ): 202 | raise ValueError( 203 | "`subword_pooling_strategy` is not `none` but neither `scatter_offsets` not `sparse_offsets` " 204 | "were passed to the model. Cannot compute word embeddings.\nTo solve:\n" 205 | "- Set `subword_pooling_strategy` to `none` or\n" 206 | "- Pass `scatter_offsets` to the model during forward or\n" 207 | "- Pass `sparse_offsets` to the model during forward." 208 | ) 209 | 210 | if self.subword_pooling_strategy not in ["none", "scatter", "sparse"]: 211 | raise ValueError( 212 | "`subword_pooling_strategy` parameter not valid, choose between `scatter`, `sparse`" 213 | f" and `none`. Current value is `{self.subword_pooling_strategy}`." 214 | ) 215 | if self.subword_pooling_strategy == "scatter": 216 | if scatter_offsets is None: 217 | raise ValueError( 218 | "`subword_pooling_strategy` is `scatter` but `scatter_offsets` " 219 | "were not passed to the model. Cannot compute word embeddings.\nTo solve:\n" 220 | "- Set `subword_pooling_strategy` to `none` or\n" 221 | "- Pass `scatter_offsets` to the model during forward." 222 | ) 223 | word_embeddings = self.merge_scatter( 224 | word_embeddings, indices=scatter_offsets 225 | ) 226 | if self.subword_pooling_strategy == "sparse": 227 | if sparse_offsets is None: 228 | raise ValueError( 229 | "`subword_pooling_strategy` is `sparse` but `sparse_offsets` " 230 | "were not passed to the model. Cannot compute word embeddings.\nTo solve:\n" 231 | "- Set `subword_pooling_strategy` to `none` or\n" 232 | "- Pass `sparse_offsets` to the model during forward." 
233 | ) 234 | word_embeddings = self.merge_sparse(word_embeddings, sparse_offsets) 235 | 236 | if self.return_all: 237 | return TransformersEmbedderOutput( 238 | word_embeddings=word_embeddings, 239 | last_hidden_state=transformer_outputs.last_hidden_state, 240 | hidden_states=transformer_outputs.hidden_states, 241 | pooler_output=transformer_outputs.pooler_output 242 | if hasattr(transformer_outputs, "pooler_output") 243 | else None, 244 | attentions=transformer_outputs.attentions, 245 | ) 246 | return TransformersEmbedderOutput(word_embeddings=word_embeddings) 247 | 248 | @staticmethod 249 | def merge_scatter(embeddings: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: 250 | """ 251 | Minimal version of ``scatter_mean``, from `pytorch_scatter 252 | `_ 253 | library, that is compatible for ONNX but works only for our case. 254 | It is used to compute word level embeddings from the transformer output. 255 | 256 | Args: 257 | embeddings (`torch.Tensor`): 258 | The embeddings tensor. 259 | indices (`torch.Tensor`): 260 | The sub-word indices. 261 | 262 | Returns: 263 | `torch.Tensor` 264 | """ 265 | 266 | def broadcast(src: torch.Tensor, other: torch.Tensor): 267 | """ 268 | Broadcast ``src`` to match the shape of ``other``. 269 | 270 | Args: 271 | src (`torch.Tensor`): 272 | The tensor to broadcast. 273 | other (`torch.Tensor`): 274 | The tensor to match the shape of. 275 | 276 | Returns: 277 | `torch.Tensor`: The broadcasted tensor. 278 | """ 279 | for _ in range(src.dim(), other.dim()): 280 | src = src.unsqueeze(-1) 281 | src = src.expand_as(other) 282 | return src 283 | 284 | def scatter_sum(src: torch.Tensor, index: torch.Tensor) -> torch.Tensor: 285 | """ 286 | Sums the elements in ``src`` that have the same indices as in ``index``. 287 | 288 | Args: 289 | src (`torch.Tensor`): 290 | The tensor to sum. 291 | index (`torch.Tensor`): 292 | The indices to sum. 293 | 294 | Returns: 295 | `torch.Tensor`: The summed tensor. 296 | """ 297 | index = broadcast(index, src) 298 | size = list(src.size()) 299 | size[1] = index.max() + 1 300 | out = torch.zeros(size, dtype=src.dtype, device=src.device) 301 | return out.scatter_add_(1, index, src) 302 | 303 | # replace padding indices with the maximum value inside the batch 304 | indices[indices == -1] = torch.max(indices) 305 | merged = scatter_sum(embeddings, indices) 306 | ones = torch.ones( 307 | indices.size(), dtype=embeddings.dtype, device=embeddings.device 308 | ) 309 | count = scatter_sum(ones, indices) 310 | count.clamp_(1) 311 | count = broadcast(count, merged) 312 | merged.true_divide_(count) 313 | return merged 314 | 315 | @staticmethod 316 | def merge_sparse( 317 | embeddings: torch.Tensor, bpe_info: Optional[Mapping[str, Any]] 318 | ) -> torch.Tensor: 319 | """ 320 | Merges the subword embeddings into a single tensor, using sparse indices. 321 | 322 | Args: 323 | embeddings (`torch.Tensor`): 324 | The embeddings tensor. 325 | bpe_info (`Mapping[str, Any]`, `optional`): 326 | The BPE info. 327 | 328 | Returns: 329 | `torch.Tensor`: The merged embeddings. 
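            For example, a word that was split into three sub-words has three entries in the
            sparse weight matrix, each with weight ``1/3``, so the batched matrix multiplication
            below averages the three sub-word vectors into a single word vector.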
330 | """ 331 | # it is constructed here and not in the tokenizer/collate because pin_memory is not sparse-compatible 332 | bpe_weights = torch.sparse_coo_tensor( 333 | indices=bpe_info["sparse_indices"], 334 | values=bpe_info["sparse_values"], 335 | size=bpe_info["sparse_size"], 336 | ) 337 | # (sentence, word, bpe) x (sentence, bpe, transformer_dim) -> (sentence, word, transformer_dim) 338 | merged = torch.bmm(bpe_weights.to_dense(), embeddings) 339 | return merged 340 | 341 | def resize_token_embeddings( 342 | self, new_num_tokens: Optional[int] = None 343 | ) -> torch.nn.Embedding: 344 | """ 345 | Resizes input token embeddings' matrix of the model if `new_num_tokens != config.vocab_size`. 346 | 347 | Args: 348 | new_num_tokens (`int`): 349 | The number of new tokens in the embedding matrix. 350 | 351 | Returns: 352 | `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. 353 | """ 354 | return self.transformer_model.resize_token_embeddings(new_num_tokens) 355 | 356 | def save_pretrained(self, save_directory: Union[str, Path]): 357 | """ 358 | Save a model and its configuration file to a directory. 359 | 360 | Args: 361 | save_directory (`str`, `Path`): 362 | Directory to which to save. 363 | """ 364 | self.transformer_model.save_pretrained(save_directory) 365 | 366 | @property 367 | def hidden_size(self) -> int: 368 | """ 369 | Returns the hidden size of TransformersEmbedder. 370 | 371 | Returns: 372 | `int`: Hidden size of ``self.transformer_model``. 373 | """ 374 | multiplier = ( 375 | len(self.output_layers) if self.layer_pooling_strategy == "concat" else 1 376 | ) 377 | return self.transformer_model.config.hidden_size * multiplier 378 | 379 | @property 380 | def transformer_hidden_size(self) -> int: 381 | """ 382 | Returns the hidden size of the inner transformer. 383 | 384 | Returns: 385 | `int`: Hidden size of ``self.transformer_model``. 386 | """ 387 | multiplier = ( 388 | len(self.output_layers) if self.layer_pooling_strategy == "concat" else 1 389 | ) 390 | return self.transformer_model.config.hidden_size * multiplier 391 | 392 | 393 | class TransformersEncoder(TransformersEmbedder): 394 | """ 395 | Transformer Embedder class. 396 | 397 | Word level embeddings from various transformer architectures from Huggingface Transformers API. 398 | 399 | Args: 400 | model (`str`, `tr.PreTrainedModel`): 401 | Transformer model to use (https://huggingface.co/models). 402 | layer_pooling_strategy (`str`, optional, defaults to `last`): 403 | What output to get from the transformer model. The last hidden state (``last``), 404 | the concatenation of the selected hidden layers (``concat``), the sum of the selected hidden 405 | layers (``sum``), the average of the selected hidden layers (``mean``). 406 | subword_pooling_strategy (`str`, optional, defaults to `scatter`): 407 | What pooling strategy to use for the sub-word embeddings. Methods available are ``scatter``, 408 | ``sparse`` and ``none``. The ``scatter`` strategy is ONNX comptabile but uses ``scatter_add`` 409 | that is not deterministic. The ``sparse`` strategy is deterministic but it is not comptabile 410 | with ONNX. 411 | output_layers (`tuple`, optional, defaults to `(-4, -3, -2, -1)`): 412 | Which hidden layers to get from the transformer model. 413 | fine_tune (`bool`, optional, defaults to `True`): 414 | If ``True``, the transformer model is fine-tuned during training. 415 | return_all (`bool`, optional, defaults to `False`): 416 | If ``True``, returns all the outputs from the HuggingFace model. 
417 | projection_size (`int`, optional, defaults to `None`): 418 | If not ``None``, the output of the transformer is projected to this size. 419 | activation_layer (`torch.nn.Module`, optional, defaults to `None`): 420 | Activation layer to use. If ``None``, no activation layer is used. 421 | dropout (`float`, optional, defaults to `0.1`): 422 | The dropout probability. 423 | bias (`bool`, optional, defaults to `True`): 424 | If ``True``, the projection layer of the encoder head has a bias term. 425 | """ 426 | 427 | def __init__( 428 | self, 429 | model: Union[str, tr.PreTrainedModel], 430 | layer_pooling_strategy: str = "last", 431 | subword_pooling_strategy: str = "sparse", 432 | output_layers: Sequence[int] = (-4, -3, -2, -1), 433 | fine_tune: bool = True, 434 | return_all: bool = False, 435 | projection_size: Optional[int] = None, 436 | activation_layer: Optional[torch.nn.Module] = None, 437 | dropout: float = 0.1, 438 | bias: bool = True, 439 | *args, 440 | **kwargs, 441 | ) -> None: 442 | super().__init__( 443 | model, 444 | layer_pooling_strategy, 445 | subword_pooling_strategy, 446 | output_layers, 447 | fine_tune, 448 | return_all, 449 | *args, 450 | **kwargs, 451 | ) 452 | self.encoder = Encoder( 453 | self.transformer_hidden_size, 454 | projection_size, 455 | activation_layer, 456 | dropout, 457 | bias, 458 | ) 459 | 460 | def forward( 461 | self, 462 | input_ids: torch.Tensor, 463 | attention_mask: Optional[torch.Tensor] = None, 464 | token_type_ids: Optional[torch.Tensor] = None, 465 | scatter_offsets: Optional[torch.Tensor] = None, 466 | sparse_offsets: Optional[Mapping[str, Any]] = None, 467 | **kwargs, 468 | ) -> TransformersEmbedderOutput: 469 | """ 470 | Forward method of the PyTorch module. 471 | 472 | Args: 473 | input_ids (`torch.Tensor`): 474 | Input ids for the transformer model. 475 | attention_mask (`torch.Tensor`, optional): 476 | Attention mask for the transformer model. 477 | token_type_ids (`torch.Tensor`, optional): 478 | Token type ids for the transformer model. 479 | scatter_offsets (`torch.Tensor`, optional): 480 | Offsets of the sub-word, used to reconstruct the word embeddings using the ``scatter`` method. sparse_offsets (`Mapping[str, Any]`, optional): Offsets of the sub-word, used to reconstruct the word embeddings using the ``sparse`` method. 481 | 482 | Returns: 483 | `TransformersEmbedderOutput`: 484 | Word level embeddings plus the output of the transformer model. 485 | """ 486 | transformers_kwargs = { 487 | "input_ids": input_ids, 488 | "attention_mask": attention_mask, 489 | "token_type_ids": token_type_ids, 490 | "scatter_offsets": scatter_offsets, 491 | "sparse_offsets": sparse_offsets, 492 | **kwargs, 493 | } 494 | transformer_output = super().forward(**transformers_kwargs) 495 | encoder_output = self.encoder(transformer_output.word_embeddings) 496 | transformer_output.word_embeddings = encoder_output 497 | return transformer_output 498 | 499 | @property 500 | def hidden_size(self) -> int: 501 | """ 502 | Returns the hidden size of the encoder head output. 503 | 504 | Returns: 505 | `int`: The encoder projection size (``self.encoder.projection_size``). 
506 | """ 507 | return self.encoder.projection_size 508 | -------------------------------------------------------------------------------- /transformers_embedder/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riccorl/transformers-embedder/bacf4c5c89fb0fa6b550b1b60174cf15fd03d875/transformers_embedder/modules/__init__.py -------------------------------------------------------------------------------- /transformers_embedder/modules/encoder.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | 6 | class Encoder(torch.nn.Module): 7 | """ 8 | An encoder module for the `TransformersEmbedder` class. 9 | 10 | Args: 11 | transformer_hidden_size (`int`): 12 | The hidden size of the inner transformer. 13 | projection_size (`int`, `optional`, defaults to `None`): 14 | The size of the projection layer. 15 | activation_layer (`torch.nn.Module`, optional, defaults to `None`): 16 | Activation layer to use. If ``None``, no activation layer is used. 17 | dropout (`float`, `optional`, defaults to `0.1`): 18 | The dropout value. 19 | bias (`bool`, `optional`, defaults to `True`): 20 | Whether to use a bias. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | transformer_hidden_size: int, 26 | projection_size: Optional[int] = None, 27 | activation_layer: Optional[torch.nn.Module] = None, 28 | dropout: float = 0.1, 29 | bias: bool = True, 30 | ): 31 | super().__init__() 32 | self.projection_size = projection_size or transformer_hidden_size 33 | self.projection_layer = torch.nn.Linear( 34 | transformer_hidden_size, self.projection_size, bias=bias 35 | ) 36 | self.dropout_layer = torch.nn.Dropout(dropout) 37 | self.activation_layer = activation_layer 38 | 39 | def forward(self, x: torch.Tensor) -> torch.Tensor: 40 | """ 41 | Forward pass of the encoder. 42 | 43 | Args: 44 | x (`torch.Tensor`): 45 | The input tensor. 46 | 47 | Returns: 48 | `torch.Tensor`: The encoded tensor. 49 | """ 50 | x = self.projection_layer(self.dropout_layer(x)) 51 | if self.activation_layer is not None: 52 | x = self.activation_layer(x) 53 | return x 54 | -------------------------------------------------------------------------------- /transformers_embedder/modules/scalar_mix.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from torch.nn import ParameterList, Parameter 5 | 6 | # This code is taken from AllenNLP 7 | # https://github.com/allenai/allennlp/blob/main/allennlp/modules/scalar_mix.py 8 | 9 | 10 | class ScalarMix(torch.nn.Module): 11 | """ 12 | Computes a parameterised scalar mixture of N tensors, `mixture = gamma * sum(s_k * tensor_k)` 13 | where `s = softmax(w)`, with `w` and `gamma` scalar parameters. 14 | In addition, if `do_layer_norm=True` then apply layer normalization to each tensor 15 | before weighting. 
16 | """ 17 | 18 | def __init__( 19 | self, 20 | mixture_size: int, 21 | do_layer_norm: bool = False, 22 | initial_scalar_parameters: List[float] = None, 23 | trainable: bool = True, 24 | ) -> None: 25 | super().__init__() 26 | self.mixture_size = mixture_size 27 | self.do_layer_norm = do_layer_norm 28 | 29 | if initial_scalar_parameters is None: 30 | initial_scalar_parameters = [0.0] * mixture_size 31 | elif len(initial_scalar_parameters) != mixture_size: 32 | raise ValueError( 33 | f"Length of `initial_scalar_parameters` {initial_scalar_parameters} differs " 34 | f"from `mixture_size` {mixture_size}" 35 | ) 36 | 37 | self.scalar_parameters = ParameterList( 38 | [ 39 | Parameter( 40 | torch.FloatTensor([initial_scalar_parameters[i]]), 41 | requires_grad=trainable, 42 | ) 43 | for i in range(mixture_size) 44 | ] 45 | ) 46 | self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable) 47 | 48 | def forward( 49 | self, tensors: List[torch.Tensor], mask: torch.BoolTensor = None 50 | ) -> torch.Tensor: 51 | """ 52 | Compute a weighted average of the `tensors`. The input tensors caa be any shape 53 | with at least two dimensions, but must all be the same shape. 54 | When `do_layer_norm=True`, the `mask` is a required input. If the `tensors` are 55 | dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned 56 | `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape 57 | `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`. 58 | When `do_layer_norm=False` the `mask` is ignored. 59 | """ 60 | if len(tensors) != self.mixture_size: 61 | raise ValueError( 62 | f"{len(tensors)} tensors were passed, but the module was initialized to " 63 | f"mix {self.mixture_size} tensors." 64 | ) 65 | 66 | def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): 67 | tensor_masked = tensor * broadcast_mask 68 | mean = torch.sum(tensor_masked) / num_elements_not_masked 69 | variance = ( 70 | torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) 71 | / num_elements_not_masked 72 | ) 73 | return (tensor - mean) / torch.sqrt(variance + 1e-4) 74 | 75 | normed_weights = torch.nn.functional.softmax( 76 | torch.cat([parameter for parameter in self.scalar_parameters]), dim=0 77 | ) 78 | normed_weights = torch.split(normed_weights, split_size_or_sections=1) 79 | 80 | if not self.do_layer_norm: 81 | pieces = [] 82 | for weight, tensor in zip(normed_weights, tensors): 83 | pieces.append(weight * tensor) 84 | return self.gamma * sum(pieces) 85 | 86 | else: 87 | assert mask is not None 88 | broadcast_mask = mask.unsqueeze(-1) 89 | input_dim = tensors[0].size(-1) 90 | num_elements_not_masked = torch.sum(mask) * input_dim 91 | 92 | pieces = [] 93 | for weight, tensor in zip(normed_weights, tensors): 94 | pieces.append( 95 | weight 96 | * _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked) 97 | ) 98 | return self.gamma * sum(pieces) 99 | -------------------------------------------------------------------------------- /transformers_embedder/tokenizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import UserDict 4 | from functools import partial 5 | from typing import List, Dict, Union, Any, Optional, Tuple, Set, Sequence, Mapping 6 | 7 | import transformers as tr 8 | from transformers import BatchEncoding 9 | from transformers.file_utils import PaddingStrategy 10 | from transformers.tokenization_utils_base import TruncationStrategy 
11 | 12 | from transformers_embedder import MODELS_WITH_STARTING_TOKEN, MODELS_WITH_DOUBLE_SEP 13 | from transformers_embedder import utils 14 | from transformers_embedder.utils import is_torch_available 15 | 16 | if is_torch_available(): 17 | import torch 18 | 19 | logger = utils.get_logger(__name__) 20 | utils.get_logger("transformers") 21 | 22 | 23 | class Tokenizer: 24 | """ 25 | A wrapper class for HuggingFace Tokenizer. 26 | 27 | Args: 28 | model (`str`, `transformers.PreTrainedTokenizer`): 29 | Language model name (or a transformer `PreTrainedTokenizer`. 30 | return_sparse_offsets (`bool`, optional, defaults to `True`): 31 | If `True`, the sparse offsets of the tokens in the input text are returned. To reduce 32 | memory usage, set this to `False` if you don't need them, e.g. you set the 33 | `subword_pooling_strategy` to `scatter` in the `TransformersEmbedder` model. 34 | """ 35 | 36 | def __init__( 37 | self, 38 | model: Union[str, tr.PreTrainedTokenizer], 39 | return_sparse_offsets: bool = True, 40 | *args, 41 | **kwargs, 42 | ): 43 | if isinstance(model, str): 44 | # init HuggingFace tokenizer 45 | self.huggingface_tokenizer = tr.AutoTokenizer.from_pretrained( 46 | model, *args, **kwargs 47 | ) 48 | # get config 49 | self.config = tr.AutoConfig.from_pretrained(model, *args, **kwargs) 50 | else: 51 | self.huggingface_tokenizer = model 52 | self.config = tr.AutoConfig.from_pretrained( 53 | self.huggingface_tokenizer.name_or_path, *args, **kwargs 54 | ) 55 | 56 | self.return_sparse_offsets = return_sparse_offsets 57 | 58 | # padding stuff 59 | # default, batch length is model max length 60 | self.subword_max_batch_len = self.huggingface_tokenizer.model_max_length 61 | self.word_max_batch_len = self.huggingface_tokenizer.model_max_length 62 | # padding ops 63 | self.padding_ops = {} 64 | # keys that will be converted in tensors 65 | self.to_tensor_inputs = set() 66 | 67 | def __len__(self): 68 | """Size of the full vocabulary with the added tokens.""" 69 | return len(self.huggingface_tokenizer) 70 | 71 | def __call__( 72 | self, 73 | text: Union[str, List[str], List[List[str]]], 74 | text_pair: Union[str, List[str], List[List[str]], None] = None, 75 | padding: Union[bool, str, PaddingStrategy] = False, 76 | truncation: Union[bool, str, TruncationStrategy] = False, 77 | max_length: Optional[int] = None, 78 | return_tensors: Optional[Union[bool, str]] = None, 79 | is_split_into_words: bool = False, 80 | additional_inputs: Optional[Dict[str, Any]] = None, 81 | *args, 82 | **kwargs, 83 | ) -> ModelInputs: 84 | """ 85 | Prepare the text in input for models that uses HuggingFace as embeddings. 86 | 87 | Args: 88 | text (`str`, `List[str]`, `List[List[str]]`, `List[List[Word]]`, `List[Word]`): 89 | Text or batch of text to be encoded. 90 | text_pair (`str`, `List[str]`, `List[List[str]]`, `List[List[Word]]`, `List[Word]`): 91 | Text or batch of text to be encoded. 92 | padding (`bool`, optional, defaults to `False`): 93 | If `True`, applies padding to the batch based on the maximum length of the batch. 94 | max_length (`int`, optional, defaults to `None`): 95 | If specified, truncates the input sequence to that value. Otherwise, 96 | uses the model max length. 97 | return_tensors (`bool`, optional, defaults to `None`): 98 | If `True`, the outputs is converted to `torch.Tensor` 99 | is_split_into_words (`bool`, optional, defaults to `False`): 100 | If `True` and the input is a string, the input is split on spaces. 
101 | additional_inputs (`Dict[str, Any]`, optional, defaults to `None`): 102 | Additional inputs to be passed to the model. 103 | 104 | Returns: 105 | `ModelInputs`: The inputs to the transformer model. 106 | """ 107 | # some checks before starting 108 | if return_tensors == "tf": 109 | raise ValueError( 110 | "`return_tensors='tf'` is not supported. Please use `return_tensors='pt'` " 111 | "or `return_tensors=True`." 112 | ) 113 | if return_tensors is True: 114 | return_tensors = "pt" 115 | if return_tensors is False: 116 | return_tensors = None 117 | 118 | # check if input is batched or a single sample 119 | is_batched = bool( 120 | isinstance(text, (list, tuple)) 121 | and text 122 | and ( 123 | (isinstance(text[0], (list, tuple)) and is_split_into_words) 124 | or isinstance(text[0], str) 125 | ) 126 | ) 127 | if not is_batched: # batch it 128 | text = [text] 129 | text_pair = [text_pair] if text_pair is not None else None 130 | 131 | # use huggingface tokenizer to encode the text 132 | model_inputs = self.huggingface_tokenizer( 133 | text, 134 | text_pair=text_pair, 135 | padding=padding, 136 | truncation=truncation, 137 | max_length=max_length, 138 | is_split_into_words=is_split_into_words, 139 | return_tensors=return_tensors, 140 | *args, 141 | **kwargs, 142 | ) 143 | # build the offsets used to pool the subwords 144 | scatter_offsets, sentence_lengths = self.build_scatter_offsets( 145 | model_inputs, 146 | return_tensors=return_tensors, 147 | there_is_text_pair=text_pair is not None, 148 | ) 149 | 150 | # convert to ModelInputs 151 | model_inputs = ModelInputs(**model_inputs) 152 | # add the offsets to the model inputs 153 | model_inputs.update( 154 | {"scatter_offsets": scatter_offsets, "sentence_lengths": sentence_lengths} 155 | ) 156 | 157 | if self.return_sparse_offsets: 158 | # build the data used to pool the subwords when in sparse mode 159 | bpe_info: Mapping[str, Any] = self.build_sparse_offsets( 160 | offsets=scatter_offsets, 161 | bpe_mask=model_inputs.attention_mask, 162 | words_per_sentence=sentence_lengths, 163 | ) 164 | # add the bpe info to the model inputs 165 | model_inputs["sparse_offsets"] = ModelInputs(**bpe_info) 166 | 167 | # we also update the maximum batch length, 168 | # both for subword and word level 169 | self.subword_max_batch_len = max(len(x) for x in model_inputs.input_ids) 170 | self.word_max_batch_len = max(x for x in model_inputs.sentence_lengths) 171 | 172 | # check if we need to convert other stuff to tensors 173 | if additional_inputs: 174 | model_inputs.update(additional_inputs) 175 | # check if there is a padding strategy 176 | if padding: 177 | missing_keys = set(additional_inputs.keys()) - set( 178 | self.padding_ops.keys() 179 | ) 180 | if missing_keys: 181 | raise ValueError( 182 | f"There are no padding strategies for the following keys: {missing_keys}. " 183 | "Please add one with `tokenizer.add_padding_ops()`." 184 | ) 185 | self.pad_batch(model_inputs) 186 | # convert them to tensors 187 | if return_tensors == "pt": 188 | self.to_tensor(model_inputs) 189 | 190 | return model_inputs 191 | 192 | def build_scatter_offsets( 193 | self, 194 | model_inputs: BatchEncoding, 195 | return_tensors: bool = True, 196 | there_is_text_pair: bool = False, 197 | ) -> Tuple: 198 | """ 199 | Build the offset tensor for the batch of inputs. 200 | 201 | Args: 202 | model_inputs (`BatchEncoding`): 203 | The inputs to the transformer model. 
204 | return_tensors (`bool`, optional, defaults to `True`): 205 | If `True`, the outputs is converted to `torch.Tensor` 206 | there_is_text_pair (`bool`, optional, defaults to `False`): 207 | If `True` `text_pair` is not None. 208 | 209 | Returns: 210 | `List[List[int]]` or `torch.Tensor`: The offsets of the sub-tokens. 211 | """ 212 | # output data structure 213 | offsets = [] 214 | sentence_lengths = [] 215 | # model_inputs should be the output of the HuggingFace tokenizer 216 | # it contains the word offsets to reconstruct the original tokens from the 217 | # sub-tokens 218 | for batch_index in range(len(model_inputs.input_ids)): 219 | word_ids = model_inputs.word_ids(batch_index) 220 | # it is slightly different from what we need, so here we make it compatible 221 | # with our subword pooling strategy 222 | # if the first token is a special token, we need to take it into account 223 | if self.has_starting_token: 224 | word_offsets = [0] + [ 225 | w + 1 if w is not None else w for w in word_ids[1:] 226 | ] 227 | # otherwise, we can just use word_ids as is 228 | else: 229 | word_offsets = word_ids 230 | 231 | # replace first None occurrence with sep_offset 232 | sep_index = word_offsets.index(None) 233 | 234 | # here we retrieve the max offset for the sample, which will be used as SEP offset 235 | # and also as padding value for the offsets 236 | sep_offset_value = max([w for w in word_offsets[:sep_index] if w is not None]) + 1 237 | 238 | word_offsets[sep_index] = sep_offset_value 239 | # if there is a text pair, we need to adjust the offsets for the second text 240 | if there_is_text_pair: 241 | # some models have two SEP tokens in between the two texts 242 | if self.has_double_sep: 243 | sep_index += 1 244 | sep_offset_value += 1 245 | word_offsets[sep_index] = sep_offset_value 246 | # keep the first offsets as is, adjust the second ones 247 | word_offsets = word_offsets[: sep_index + 1] + [ 248 | w + sep_offset_value if w is not None else w 249 | for w in word_offsets[sep_index + 1 :] 250 | ] 251 | # update again the sep_offset 252 | sep_offset_value = max([w for w in word_offsets if w is not None]) + 1 253 | # replace first None occurrence with sep_offset, it should be the last SEP 254 | sep_index = word_offsets.index(None) 255 | word_offsets[sep_index] = sep_offset_value 256 | # keep track of the maximum offset for padding 257 | offsets.append(word_offsets) 258 | sentence_lengths.append(sep_offset_value + 1) 259 | # replace remaining None occurrences with -1 260 | # the remaining None occurrences are the padding values 261 | offsets = [[o if o is not None else -1 for o in offset] for offset in offsets] 262 | # if return_tensor is True, we need to convert the offsets to tensors 263 | if return_tensors: 264 | offsets = torch.as_tensor(offsets) 265 | return offsets, sentence_lengths 266 | 267 | @staticmethod 268 | def build_sparse_offsets( 269 | offsets: torch.Tensor | Sequence[Sequence[int]], 270 | bpe_mask: torch.Tensor | Sequence[Sequence[int]], 271 | words_per_sentence: Sequence[int], 272 | ) -> Mapping[str, Any]: 273 | """Build tensors used as info for BPE pooling, starting from the BPE offsets. 274 | 275 | Args: 276 | offsets (`torch.Tensor` or `List[List[int]]`): 277 | The offsets to compute lengths from. 278 | bpe_mask (`torch.Tensor` or `List[List[int]]`): 279 | The attention mask at BPE level. 280 | words_per_sentence (`List[int]`): 281 | The sentence lengths, word-wise. 
282 | 283 | Returns: 284 | `Mapping[str, Any]`: Tensors used to construct the sparse one which pools the 285 | transformer encoding word-wise. 286 | """ 287 | if not isinstance(offsets, torch.Tensor): 288 | offsets: torch.Tensor = torch.as_tensor(offsets) 289 | if not isinstance(bpe_mask, torch.Tensor): 290 | bpe_mask: torch.Tensor = torch.as_tensor(bpe_mask) 291 | 292 | sentence_lengths: torch.Tensor = bpe_mask.sum(dim=1) 293 | 294 | # We want to build triplets as coordinates (document, word, bpe) 295 | # We start by creating the document index for each triplet 296 | document_indices = torch.arange(offsets.size(0)).repeat_interleave( 297 | sentence_lengths 298 | ) 299 | # then the word indices 300 | word_indices = offsets[offsets != -1] 301 | # lastly the bpe indices 302 | max_range: torch.Tensor = torch.arange(bpe_mask.shape[1]) 303 | bpe_indices: torch.LongTensor = torch.cat( 304 | [max_range[:i] for i in bpe_mask.sum(dim=1)], dim=0 305 | ).long() 306 | 307 | unique_words, word_lengths = torch.unique_consecutive( 308 | offsets, return_counts=True 309 | ) 310 | unpadded_word_lengths = word_lengths[unique_words != -1] 311 | 312 | # and their weight to be used as multiplication factors 313 | bpe_weights: torch.FloatTensor = ( 314 | (1 / unpadded_word_lengths).repeat_interleave(unpadded_word_lengths).float() 315 | ) 316 | 317 | sparse_indices = torch.stack( 318 | [document_indices, word_indices, bpe_indices], dim=0 319 | ) 320 | 321 | bpe_shape = torch.Size( 322 | ( 323 | bpe_mask.size(0), # batch_size 324 | max(words_per_sentence), # max number of words per sentence 325 | bpe_mask.size(1), # max bpe_number in batch wrt the sentence 326 | ) 327 | ) 328 | 329 | return dict( 330 | sparse_indices=sparse_indices, 331 | sparse_values=bpe_weights, 332 | sparse_size=bpe_shape, 333 | ) 334 | 335 | def pad_batch( 336 | self, 337 | batch: Union[ModelInputs, Dict[str, list]], 338 | max_length: Optional[int] = None, 339 | ) -> ModelInputs: 340 | """ 341 | Pad the batch to its maximum length or to the specified `max_length`. 342 | 343 | Args: 344 | batch (`Dict[str, list]`): 345 | The batch to pad. 346 | max_length (`int`, optional): 347 | Override maximum length of the batch. 348 | 349 | Returns: 350 | `Dict[str, list]`: The padded batch. 351 | """ 352 | if max_length: 353 | self.subword_max_batch_len = max_length 354 | self.word_max_batch_len = max_length 355 | else: 356 | # get maximum len inside a batch 357 | self.subword_max_batch_len = max(len(x) for x in batch["input_ids"]) 358 | self.word_max_batch_len = max(x for x in batch["sentence_lengths"]) 359 | 360 | for key in batch: 361 | if key in self.padding_ops: 362 | batch[key] = [self.padding_ops[key](b) for b in batch[key]] 363 | 364 | return ModelInputs(batch) 365 | 366 | def pad_sequence( 367 | self, 368 | sequence: Union[List, torch.Tensor], 369 | value: int, 370 | length: Union[int, str] = "subword", 371 | pad_to_left: bool = False, 372 | ) -> Union[List, torch.Tensor]: 373 | """ 374 | Pad the input to the specified length with the given value. 375 | 376 | Args: 377 | sequence (`List`, `torch.Tensor`): 378 | Element to pad, it can be either a `List` or a `torch.Tensor`. 379 | value (`int`): 380 | Value to use as padding. 381 | length (`int`, `str`, optional, defaults to `subword`): 382 | Length after pad. 383 | pad_to_left (`bool`, optional, defaults to `False`): 384 | If `True`, pads to the left, right otherwise. 385 | 386 | Returns: 387 | `List`, `torch.Tensor`: The padded sequence. 
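            For example, with ``value=0`` and ``length=5``, the sequence ``[1, 2, 3]`` becomes
            ``[1, 2, 3, 0, 0]`` (or ``[0, 0, 1, 2, 3]`` when ``pad_to_left=True``).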
388 | """ 389 | if length == "subword": 390 | length = self.subword_max_batch_len 391 | elif length == "word": 392 | length = self.word_max_batch_len 393 | else: 394 | if not isinstance(length, int): 395 | raise ValueError( 396 | f"`length` must be an `int`, `subword` or `word`. Current value is `{length}`" 397 | ) 398 | padding = [value] * abs(length - len(sequence)) 399 | if isinstance(sequence, torch.Tensor): 400 | if len(sequence.shape) > 1: 401 | raise ValueError( 402 | f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`" 403 | ) 404 | padding = torch.as_tensor(padding) 405 | if pad_to_left: 406 | if isinstance(sequence, torch.Tensor): 407 | return torch.cat((padding, sequence), -1) 408 | return padding + sequence 409 | if isinstance(sequence, torch.Tensor): 410 | return torch.cat((sequence, padding), -1) 411 | return sequence + padding 412 | 413 | def add_special_tokens( 414 | self, special_tokens_dict: Dict[str, Union[str, tr.AddedToken]] 415 | ) -> int: 416 | """ 417 | Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder. 418 | If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last 419 | index of the current vocabulary). 420 | 421 | Args: 422 | special_tokens_dict (`Dict`): 423 | The dictionary containing special tokens. Keys should be in 424 | the list of predefined special attributes: [``bos_token``, ``eos_token``, 425 | ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, 426 | ``additional_special_tokens``]. 427 | 428 | Returns: 429 | `int`: Number of tokens added to the vocabulary. 430 | """ 431 | return self.huggingface_tokenizer.add_special_tokens(special_tokens_dict) 432 | 433 | def add_padding_ops(self, key: str, value: Any, length: Union[int, str]): 434 | """ 435 | Add padding logic to custom fields. 436 | If the field is not in `self.to_tensor_inputs`, this method will add the key to it. 437 | 438 | Args: 439 | key (`str`): 440 | Name of the field in the tokenizer input. 441 | value (`Any`): 442 | Value to use for padding. 443 | length (`int`, `str`): 444 | Length to pad. It can be an `int`, or two string value 445 | - ``subword``: the element is padded to the batch max length relative to the subwords length 446 | - ``word``: the element is padded to the batch max length relative to the original word length 447 | """ 448 | if key not in self.to_tensor_inputs: 449 | self.to_tensor_inputs.add(key) 450 | self.padding_ops[key] = partial(self.pad_sequence, value=value, length=length) 451 | 452 | def add_to_tensor_inputs(self, names: Union[str, Sequence[str]]) -> Set[str]: 453 | """ 454 | Add these keys to the ones that will be converted in Tensors. 455 | 456 | Args: 457 | names (`str`, `set`): 458 | Name of the field (or fields) to convert to tensors. 459 | 460 | Returns: 461 | `set`: The set of keys that will be converted to tensors. 462 | """ 463 | if isinstance(names, str): 464 | names = {names} 465 | if not isinstance(names, set): 466 | names = set(names) 467 | self.to_tensor_inputs |= names 468 | return self.to_tensor_inputs 469 | 470 | def to_tensor(self, batch: Union[ModelInputs, List[dict], dict]) -> ModelInputs: 471 | """ 472 | Return the batch in input as Pytorch tensors. The fields that are converted in tensors are in 473 | `self.to_tensor_inputs`. By default, only the standard model inputs are converted. Use 474 | `self.add_to_tensor_inputs` to add custom fields. 475 | 476 | Args: 477 | batch (`List[dict]`, `dict`): 478 | Batch in input. 
479 | 480 | Returns: 481 | `ModelInputs`: The batch as tensor. 482 | """ 483 | # convert to tensor 484 | batch = { 485 | k: torch.as_tensor(v) 486 | if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor) 487 | else v 488 | for k, v in batch.items() 489 | } 490 | return ModelInputs(batch) 491 | 492 | @staticmethod 493 | def _clean_output(output: Union[List, Dict]) -> Dict: 494 | """ 495 | Clean before output. 496 | 497 | Args: 498 | output (:obj`List[dict]`, `dict`): 499 | The output to clean. 500 | 501 | Returns: 502 | `dict`: The cleaned output. 503 | """ 504 | # single sentence case, generalize 505 | if isinstance(output, dict): 506 | output = [output] 507 | # convert list to dict 508 | output = {k: [d[k] for d in output] for k in output[0]} 509 | return output 510 | 511 | @staticmethod 512 | def _get_token_type_id(config: tr.PretrainedConfig) -> int: 513 | """ 514 | Get token type id. Useful when dealing with models that don't accept 1 as type id. 515 | Args: 516 | config (`transformers.PretrainedConfig`): 517 | Transformer config. 518 | 519 | Returns: 520 | `int`: Correct token type id for that model. 521 | """ 522 | if hasattr(config, "type_vocab_size"): 523 | return 1 if config.type_vocab_size == 2 else 0 524 | return 0 525 | 526 | @staticmethod 527 | def _type_checking(text: Any, text_pair: Any): 528 | """ 529 | Checks type of the inputs. 530 | 531 | Args: 532 | text (`Any`): 533 | Text to check. 534 | text_pair (`Any`): 535 | Text pair to check. 536 | 537 | Returns: 538 | """ 539 | 540 | def is_type_correct(text_to_check: Any) -> bool: 541 | """ 542 | Check if input type is correct, returning a boolean value. 543 | 544 | Args: 545 | text_to_check (`Any`): 546 | text to check. 547 | 548 | Returns: 549 | :obj`bool`: :obj`True` if the type is correct. 550 | """ 551 | return ( 552 | text_to_check is None 553 | or isinstance(text_to_check, str) 554 | or ( 555 | isinstance(text_to_check, (list, tuple)) 556 | and ( 557 | len(text_to_check) == 0 558 | or ( 559 | isinstance(text_to_check[0], str) 560 | or ( 561 | isinstance(text_to_check[0], (list, tuple)) 562 | and ( 563 | len(text_to_check[0]) == 0 564 | or isinstance(text_to_check[0][0], str) 565 | ) 566 | ) 567 | ) 568 | ) 569 | ) 570 | ) 571 | 572 | if not is_type_correct(text): 573 | raise AssertionError( 574 | "text input must of type `str` (single example), `List[str]` (batch or single " 575 | "pre-tokenized example) or `List[List[str]]` (batch of pre-tokenized examples)." 576 | ) 577 | 578 | if not is_type_correct(text_pair): 579 | raise AssertionError( 580 | "text_pair input must be `str` (single example), `List[str]` (batch or single " 581 | "pre-tokenized example) or `List[List[str]]` (batch of pre-tokenized examples)." 582 | ) 583 | 584 | @property 585 | def num_special_tokens(self) -> int: 586 | """ 587 | Return the number of special tokens the model needs. 588 | It assumes the input contains both sentences (`text` and `text_pair`). 589 | 590 | Returns: 591 | `int`: the number of special tokens. 
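            For example, BERT-style inputs ``[CLS] text [SEP] text_pair [SEP]`` need 3 special
            tokens, while RoBERTa-style inputs ``<s> text </s> </s> text_pair </s>``, which use a
            double separator between the two texts, need 4.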
592 | """ 593 | if isinstance( 594 | self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP 595 | ) and isinstance(self.huggingface_tokenizer, MODELS_WITH_STARTING_TOKEN): 596 | return 4 597 | if isinstance( 598 | self.huggingface_tokenizer, 599 | (MODELS_WITH_DOUBLE_SEP, MODELS_WITH_STARTING_TOKEN), 600 | ): 601 | return 3 602 | return 2 603 | 604 | @property 605 | def has_double_sep(self): 606 | """True if tokenizer uses two SEP tokens.""" 607 | return isinstance(self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP) 608 | 609 | @property 610 | def has_starting_token(self): 611 | """True if tokenizer uses a starting token.""" 612 | return isinstance(self.huggingface_tokenizer, MODELS_WITH_STARTING_TOKEN) 613 | 614 | @property 615 | def token_type_id(self): 616 | """Padding token.""" 617 | return self._get_token_type_id(self.config) 618 | 619 | @property 620 | def pad_token(self): 621 | """Padding token.""" 622 | return self.huggingface_tokenizer.pad_token 623 | 624 | @property 625 | def pad_token_id(self): 626 | """Padding token id.""" 627 | return self.huggingface_tokenizer.pad_token_id 628 | 629 | @property 630 | def unk_token(self): 631 | """Unknown token.""" 632 | return self.huggingface_tokenizer.unk_token 633 | 634 | @property 635 | def unk_token_id(self): 636 | """Unknown token id.""" 637 | return self.huggingface_tokenizer.unk_token_id 638 | 639 | @property 640 | def cls_token(self): 641 | """ 642 | Classification token. 643 | To extract a summary of an input sequence leveraging self-attention along the 644 | full depth of the model. 645 | """ 646 | return self.huggingface_tokenizer.cls_token 647 | 648 | @property 649 | def cls_token_id(self): 650 | """ 651 | Classification token id. 652 | To extract a summary of an input sequence leveraging self-attention along the 653 | full depth of the model. 
654 | """ 655 | return self.huggingface_tokenizer.cls_token_id 656 | 657 | @property 658 | def sep_token(self): 659 | """Separation token, to separate context and query in an input sequence.""" 660 | return self.huggingface_tokenizer.sep_token 661 | 662 | @property 663 | def sep_token_id(self): 664 | """Separation token id, to separate context and query in an input sequence.""" 665 | return self.huggingface_tokenizer.sep_token_id 666 | 667 | @property 668 | def bos_token(self): 669 | """Beginning of sentence token.""" 670 | return self.huggingface_tokenizer.bos_token 671 | 672 | @property 673 | def bos_token_id(self): 674 | """Beginning of sentence token id.""" 675 | return self.huggingface_tokenizer.bos_token_id 676 | 677 | @property 678 | def eos_token(self): 679 | """End of sentence token.""" 680 | return self.huggingface_tokenizer.eos_token 681 | 682 | @property 683 | def eos_token_id(self): 684 | """End of sentence token id.""" 685 | return self.huggingface_tokenizer.eos_token_id 686 | 687 | 688 | class ModelInputs(UserDict): 689 | """Model input dictionary wrapper.""" 690 | 691 | def __getattr__(self, item: str): 692 | try: 693 | return self.data[item] 694 | except KeyError: 695 | raise AttributeError(f"`ModelInputs` has no attribute `{item}`") 696 | 697 | def __getitem__(self, item: str) -> Any: 698 | return self.data[item] 699 | 700 | def __getstate__(self): 701 | return {"data": self.data} 702 | 703 | def __setstate__(self, state): 704 | if "data" in state: 705 | self.data = state["data"] 706 | 707 | def keys(self): 708 | """A set-like object providing a view on D's keys.""" 709 | return self.data.keys() 710 | 711 | def values(self): 712 | """An object providing a view on D's values.""" 713 | return self.data.values() 714 | 715 | def items(self): 716 | """A set-like object providing a view on D's items.""" 717 | return self.data.items() 718 | 719 | def to(self, device: Union[str, torch.device]) -> ModelInputs: 720 | """ 721 | Send all tensors values to device. 722 | 723 | Args: 724 | device (`str` or `torch.device`): The device to put the tensors on. 725 | 726 | Returns: 727 | :class:`tokenizers.ModelInputs`: The same instance of :class:`~tokenizers.ModelInputs` 728 | after modification. 729 | """ 730 | if isinstance(device, (str, torch.device, int)): 731 | self.data = { 732 | k: v.to(device=device) if hasattr(v, "to") else v 733 | for k, v in self.data.items() 734 | } 735 | else: 736 | logger.warning( 737 | f"Attempting to cast to another type, {str(device)}. This is not supported." 738 | ) 739 | return self 740 | -------------------------------------------------------------------------------- /transformers_embedder/utils.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import logging 3 | 4 | _torch_available = importlib.util.find_spec("torch") is not None 5 | 6 | 7 | def is_torch_available(): 8 | """Check if PyTorch is available.""" 9 | return _torch_available 10 | 11 | 12 | def get_logger(name: str) -> logging.Logger: 13 | """ 14 | Return the logger of the given name. 15 | 16 | Args: 17 | name (`str`): The name of the logger. 18 | 19 | Returns: 20 | `logging.Logger`: The logger of the given name. 21 | """ 22 | return logging.getLogger(name) 23 | --------------------------------------------------------------------------------