├── .deepsource.toml ├── .flake8 ├── .github ├── conda │ ├── build.sh │ └── meta.yaml ├── dependabot.yml └── workflows │ ├── black.yml │ ├── python-package.yml │ ├── python-publish-conda.yml │ ├── python-publish-pypi.yml │ └── website.yml ├── .gitignore ├── .pre-commit-config.yaml ├── MANIFEST.in ├── README.md ├── docs ├── gen_ref_pages.py └── index.md ├── mkdocs.yml ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── transformers_embedder ├── __init__.py ├── embedder.py ├── modules ├── __init__.py ├── encoder.py └── scalar_mix.py ├── tokenizer.py └── utils.py /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | [[analyzers]] 4 | name = "python" 5 | enabled = true 6 | 7 | [analyzers.meta] 8 | runtime_version = "3.x.x" 9 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401, E402 3 | max-line-length = 88 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /.github/conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install # Python command to install the script. -------------------------------------------------------------------------------- /.github/conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "transformers-embedder" %} 2 | {% set data = load_setup_py_data() %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ TRANSFORMERS_EMBEDDER_VERSION }}" 7 | 8 | about: 9 | home: {{ data['url'] }} 10 | license: {{ data['license'] }} 11 | summary: {{ data['description'] }} 12 | 13 | requirements: 14 | build: 15 | - python 16 | - transformers>=4.3,<4.12 17 | - spacy>=3.0,<3.2 18 | run: 19 | - python 20 | - transformers>=4.3,<4.12 21 | - spacy>=3.0,<3.2 22 | 23 | source: 24 | path: ../../ 25 | 26 | build: 27 | noarch: python 28 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "04:00" 8 | open-pull-requests-limit: 10 9 | -------------------------------------------------------------------------------- /.github/workflows/black.yml: -------------------------------------------------------------------------------- 1 | name: Check Code Quality 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: psf/black@stable 11 | with: 12 | options: "-l 110" 13 | - uses: actions/checkout@v2 14 | - uses: actions/setup-python@v2 15 | with: 16 | python-version: "3.9" 17 | - name: Run flake8 18 | uses: julianwachholz/flake8-action@v2 19 | with: 20 | checkName: "Python Lint" 21 | path: ./transformers_embedder 22 | plugins: "pep8-naming==0.12.1 flake8-comprehensions==3.6.1" 23 | config: .flake8 24 | env: 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow 
will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build Python package 5 | 6 | on: 7 | push: 8 | branches: [$default-branch] 9 | pull_request: 10 | branches: [$default-branch] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: [3.6, 3.7, 3.8, 3.9] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | # - name: Test with pytest 38 | # run: | 39 | # pytest 40 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-conda.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to Conda 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | env: 8 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_TOKEN }} 9 | 10 | jobs: 11 | publish: 12 | runs-on: ubuntu-latest 13 | defaults: 14 | run: 15 | shell: bash -l {0} 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | 21 | - name: Install miniconda 22 | uses: conda-incubator/setup-miniconda@v2 23 | with: 24 | auto-update-conda: true 25 | auto-activate-base: false 26 | python-version: 3.9 27 | activate-environment: "build-transformers-embedder" 28 | channels: riccorl 29 | 30 | - name: Setup conda env 31 | run: | 32 | conda install -c defaults anaconda-client conda-build 33 | - name: Extract version 34 | run: echo "TRANSFORMERS_EMBEDDER_VERSION=`python setup.py --version`" >> $GITHUB_ENV 35 | 36 | - name: Build conda packages 37 | run: | 38 | conda info 39 | conda list 40 | conda-build -c riccorl -c conda-forge -c huggingface .github/conda 41 | 42 | - name: Upload to Anaconda 43 | run: anaconda upload `conda-build .github/conda --output` --force 44 | -------------------------------------------------------------------------------- /.github/workflows/python-publish-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: "3.x" 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install build 22 | 23 | - name: Extract version 24 | run: echo "TRANSFORMERS_EMBEDDER_VERSION=`python setup.py --version`" >> $GITHUB_ENV 25 | 26 | - name: Build package 27 | run: python -m build 28 | 29 | - name: Publish package 30 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 31 | with: 
32 | user: ${{ secrets.PYPI_USERNAME }} 33 | password: ${{ secrets.PYPI_PASSWORD }} 34 | -------------------------------------------------------------------------------- /.github/workflows/website.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.x 15 | - run: pip install mkdocs-material mkdocs-literate-nav mkdocstrings[python] mkdocs-section-index mkdocs-gen-files 16 | - run: mkdocs gh-deploy --force 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # custom 2 | stuff 3 | /test.ipynb 4 | /test.py 5 | 6 | # Fleet 7 | .fleet 8 | 9 | # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+all,vscode,macos,linux,windows 10 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+all,vscode,macos,linux,windows 11 | 12 | ### Linux ### 13 | *~ 14 | 15 | # temporary files which can be created if a process still has a handle open of a deleted file 16 | .fuse_hidden* 17 | 18 | # KDE directory preferences 19 | .directory 20 | 21 | # Linux trash folder which might appear on any partition or disk 22 | .Trash-* 23 | 24 | # .nfs files are created when an open file is removed but is still being accessed 25 | .nfs* 26 | 27 | ### macOS ### 28 | # General 29 | .DS_Store 30 | .AppleDouble 31 | .LSOverride 32 | 33 | # Icon must end with two \r 34 | Icon 35 | 36 | 37 | # Thumbnails 38 | ._* 39 | 40 | # Files that might appear in the root of a volume 41 | .DocumentRevisions-V100 42 | .fseventsd 43 | .Spotlight-V100 44 | .TemporaryItems 45 | .Trashes 46 | .VolumeIcon.icns 47 | .com.apple.timemachine.donotpresent 48 | 49 | # Directories potentially created on remote AFP share 50 | .AppleDB 51 | .AppleDesktop 52 | Network Trash Folder 53 | Temporary Items 54 | .apdisk 55 | 56 | ### PyCharm+all ### 57 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 58 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 59 | 60 | # User-specific stuff 61 | .idea/**/workspace.xml 62 | .idea/**/tasks.xml 63 | .idea/**/usage.statistics.xml 64 | .idea/**/dictionaries 65 | .idea/**/shelf 66 | 67 | # Generated files 68 | .idea/**/contentModel.xml 69 | 70 | # Sensitive or high-churn files 71 | .idea/**/dataSources/ 72 | .idea/**/dataSources.ids 73 | .idea/**/dataSources.local.xml 74 | .idea/**/sqlDataSources.xml 75 | .idea/**/dynamic.xml 76 | .idea/**/uiDesigner.xml 77 | .idea/**/dbnavigator.xml 78 | 79 | # Gradle 80 | .idea/**/gradle.xml 81 | .idea/**/libraries 82 | 83 | # Gradle and Maven with auto-import 84 | # When using Gradle or Maven with auto-import, you should exclude module files, 85 | # since they will be recreated, and may cause churn. Uncomment if using 86 | # auto-import. 
87 | # .idea/artifacts 88 | # .idea/compiler.xml 89 | # .idea/jarRepositories.xml 90 | # .idea/modules.xml 91 | # .idea/*.iml 92 | # .idea/modules 93 | # *.iml 94 | # *.ipr 95 | 96 | # CMake 97 | cmake-build-*/ 98 | 99 | # Mongo Explorer plugin 100 | .idea/**/mongoSettings.xml 101 | 102 | # File-based project format 103 | *.iws 104 | 105 | # IntelliJ 106 | out/ 107 | 108 | # mpeltonen/sbt-idea plugin 109 | .idea_modules/ 110 | 111 | # JIRA plugin 112 | atlassian-ide-plugin.xml 113 | 114 | # Cursive Clojure plugin 115 | .idea/replstate.xml 116 | 117 | # Crashlytics plugin (for Android Studio and IntelliJ) 118 | com_crashlytics_export_strings.xml 119 | crashlytics.properties 120 | crashlytics-build.properties 121 | fabric.properties 122 | 123 | # Editor-based Rest Client 124 | .idea/httpRequests 125 | 126 | # Android studio 3.1+ serialized cache file 127 | .idea/caches/build_file_checksums.ser 128 | 129 | ### PyCharm+all Patch ### 130 | # Ignores the whole .idea folder and all .iml files 131 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 132 | 133 | .idea/ 134 | 135 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 136 | 137 | *.iml 138 | modules.xml 139 | .idea/misc.xml 140 | *.ipr 141 | 142 | # Sonarlint plugin 143 | .idea/sonarlint 144 | 145 | ### Python ### 146 | # Byte-compiled / optimized / DLL files 147 | __pycache__/ 148 | *.py[cod] 149 | *$py.class 150 | 151 | # C extensions 152 | *.so 153 | 154 | # Distribution / packaging 155 | .Python 156 | build/ 157 | develop-eggs/ 158 | dist/ 159 | downloads/ 160 | eggs/ 161 | .eggs/ 162 | lib/ 163 | lib64/ 164 | parts/ 165 | sdist/ 166 | var/ 167 | wheels/ 168 | pip-wheel-metadata/ 169 | share/python-wheels/ 170 | *.egg-info/ 171 | .installed.cfg 172 | *.egg 173 | MANIFEST 174 | 175 | # PyInstaller 176 | # Usually these files are written by a python script from a template 177 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 178 | *.manifest 179 | *.spec 180 | 181 | # Installer logs 182 | pip-log.txt 183 | pip-delete-this-directory.txt 184 | 185 | # Unit test / coverage reports 186 | htmlcov/ 187 | .tox/ 188 | .nox/ 189 | .coverage 190 | .coverage.* 191 | .cache 192 | nosetests.xml 193 | coverage.xml 194 | *.cover 195 | *.py,cover 196 | .hypothesis/ 197 | .pytest_cache/ 198 | pytestdebug.log 199 | 200 | # Translations 201 | *.mo 202 | *.pot 203 | 204 | # Django stuff: 205 | *.log 206 | local_settings.py 207 | db.sqlite3 208 | db.sqlite3-journal 209 | 210 | # Flask stuff: 211 | instance/ 212 | .webassets-cache 213 | 214 | # Scrapy stuff: 215 | .scrapy 216 | 217 | # Sphinx documentation 218 | docs/_build/ 219 | doc/_build/ 220 | 221 | # PyBuilder 222 | target/ 223 | 224 | # Jupyter Notebook 225 | .ipynb_checkpoints 226 | 227 | # IPython 228 | profile_default/ 229 | ipython_config.py 230 | 231 | # pyenv 232 | .python-version 233 | 234 | # pipenv 235 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 236 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 237 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 238 | # install all needed dependencies. 239 | #Pipfile.lock 240 | 241 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 242 | __pypackages__/ 243 | 244 | # Celery stuff 245 | celerybeat-schedule 246 | celerybeat.pid 247 | 248 | # SageMath parsed files 249 | *.sage.py 250 | 251 | # Environments 252 | .env 253 | .venv 254 | env/ 255 | venv/ 256 | ENV/ 257 | env.bak/ 258 | venv.bak/ 259 | pythonenv* 260 | 261 | # Spyder project settings 262 | .spyderproject 263 | .spyproject 264 | 265 | # Rope project settings 266 | .ropeproject 267 | 268 | # mkdocs documentation 269 | /site 270 | 271 | # mypy 272 | .mypy_cache/ 273 | .dmypy.json 274 | dmypy.json 275 | 276 | # Pyre type checker 277 | .pyre/ 278 | 279 | # pytype static type analyzer 280 | .pytype/ 281 | 282 | # profiling data 283 | .prof 284 | 285 | ### vscode ### 286 | .vscode 287 | .vscode/* 288 | !.vscode/settings.json 289 | !.vscode/tasks.json 290 | !.vscode/launch.json 291 | !.vscode/extensions.json 292 | *.code-workspace 293 | 294 | ### Windows ### 295 | # Windows thumbnail cache files 296 | Thumbs.db 297 | Thumbs.db:encryptable 298 | ehthumbs.db 299 | ehthumbs_vista.db 300 | 301 | # Dump file 302 | *.stackdump 303 | 304 | # Folder config file 305 | [Dd]esktop.ini 306 | 307 | # Recycle Bin used on file shares 308 | $RECYCLE.BIN/ 309 | 310 | # Windows Installer files 311 | *.cab 312 | *.msi 313 | *.msix 314 | *.msm 315 | *.msp 316 | 317 | # Windows shortcuts 318 | *.lnk 319 | 320 | # End of https://www.toptal.com/developers/gitignore/api/python,pycharm+all,vscode,macos,linux,windows 321 | 322 | /stuff/ 323 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 21.9b0 4 | hooks: 5 | - id: black 6 | - repo: https://gitlab.com/pycqa/flake8 7 | rev: 3.9.2 8 | hooks: 9 | - id: flake8 10 | 11 | default_language_version: 12 | python: python3 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Transformers Embedder 4 | 5 | [![Open in Visual Studio Code](https://img.shields.io/badge/preview%20in-vscode.dev-blue)](https://github.dev/Riccorl/transformers-embedder) 6 | [![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/) 7 | [![Transformers](https://img.shields.io/badge/4.34-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black) 9 | 10 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml) 11 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml) 12 | [![PyPi Version](https://img.shields.io/github/v/release/Riccorl/transformers-embedder)](https://github.com/Riccorl/transformers-embedder/releases) 13 | [![Anaconda-Server Badge](https://anaconda.org/riccorl/transformers-embedder/badges/version.svg)](https://anaconda.org/riccorl/transformers-embedder) 14 | [![DeepSource](https://deepsource.io/gh/Riccorl/transformers-embedder.svg/?label=active+issues)](https://deepsource.io/gh/Riccorl/transformers-embedder/?ref=repository-badge) 15 | 16 |
17 | 18 | A Word Level Transformer layer based on PyTorch and 🤗 Transformers. 19 | 20 | ## How to use 21 | 22 | Install the library from [PyPI](https://pypi.org/project/transformers-embedder): 23 | 24 | ```bash 25 | pip install transformers-embedder 26 | ``` 27 | 28 | or from [Conda](https://anaconda.org/riccorl/transformers-embedder): 29 | 30 | ```bash 31 | conda install -c riccorl transformers-embedder 32 | ``` 33 | 34 | It offers a PyTorch layer and a tokenizer that support almost every pretrained model from Huggingface 35 | [🤗Transformers](https://huggingface.co/transformers/) library. Here is a quick example: 36 | 37 | ```python 38 | import transformers_embedder as tre 39 | 40 | tokenizer = tre.Tokenizer("bert-base-cased") 41 | 42 | model = tre.TransformersEmbedder( 43 | "bert-base-cased", subword_pooling_strategy="sparse", layer_pooling_strategy="mean" 44 | ) 45 | 46 | example = "This is a sample sentence" 47 | inputs = tokenizer(example, return_tensors=True) 48 | ``` 49 | 50 | ```text 51 | { 52 | 'input_ids': tensor([[ 101, 1188, 1110, 170, 6876, 5650, 102]]), 53 | 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 54 | 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]) 55 | 'scatter_offsets': tensor([[0, 1, 2, 3, 4, 5, 6]]), 56 | 'sparse_offsets': { 57 | 'sparse_indices': tensor( 58 | [ 59 | [0, 0, 0, 0, 0, 0, 0], 60 | [0, 1, 2, 3, 4, 5, 6], 61 | [0, 1, 2, 3, 4, 5, 6] 62 | ] 63 | ), 64 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 65 | 'sparse_size': torch.Size([1, 7, 7]) 66 | }, 67 | 'sentence_length': 7 # with special tokens included 68 | } 69 | ``` 70 | 71 | ```python 72 | outputs = model(**inputs) 73 | ``` 74 | 75 | ```text 76 | # outputs.word_embeddings.shape[1:-1] # remove [CLS] and [SEP] 77 | torch.Size([1, 5, 768]) 78 | # len(example) 79 | 5 80 | ``` 81 | 82 | ## Info 83 | 84 | One of the annoyance of using transformer-based models is that it is not trivial to compute word embeddings 85 | from the sub-token embeddings they output. With this API it's as easy as using 🤗Transformers to get 86 | word-level embeddings from theoretically every transformer model it supports. 87 | 88 | ### Model 89 | 90 | #### Subword Pooling Strategy 91 | 92 | The `TransformersEmbedder` class offers 3 ways to get the embeddings: 93 | 94 | - `subword_pooling_strategy="sparse"`: computes the mean of the embeddings of the sub-tokens of each word 95 | (i.e. the embeddings of the sub-tokens are pooled together) using a sparse matrix multiplication. This 96 | strategy is the default one. 97 | - `subword_pooling_strategy="scatter"`: computes the mean of the embeddings of the sub-tokens of each word 98 | using a scatter-gather operation. It is not deterministic, but it works with ONNX export. 99 | - `subword_pooling_strategy="none"`: returns the raw output of the transformer model without sub-token pooling. 
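As a quick illustration (a minimal sketch that reuses the `bert-base-cased` checkpoint from the example above), the strategy is simply chosen at construction time:

```python
import transformers_embedder as tre

# deterministic sub-word pooling via a sparse matrix multiplication (not ONNX-exportable)
sparse_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="sparse")

# scatter-based pooling: ONNX-friendly, but scatter_add_ is not deterministic
scatter_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="scatter")

# no pooling: raw sub-token embeddings, one vector per sub-token
raw_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="none")
```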
100 | 101 | Here a little feature table: 102 | 103 | | | Pooling | Deterministic | ONNX | 104 | |-------------|:------------------:|:------------------:|:------------------:| 105 | | **Sparse** | :white_check_mark: | :white_check_mark: | :x: | 106 | | **Scatter** | :white_check_mark: | :x: | :white_check_mark: | 107 | | **None** | :x: | :white_check_mark: | :white_check_mark: | 108 | 109 | #### Layer Pooling Strategy 110 | 111 | There are also multiple type of outputs you can get using `layer_pooling_strategy` parameter: 112 | 113 | - `layer_pooling_strategy="last"`: returns the last hidden state of the transformer model 114 | - `layer_pooling_strategy="concat"`: returns the concatenation of the selected `output_layers` of the 115 | transformer model 116 | - `layer_pooling_strategy="sum"`: returns the sum of the selected `output_layers` of the transformer model 117 | - `layer_pooling_strategy="mean"`: returns the average of the selected `output_layers` of the transformer model 118 | - `layer_pooling_strategy="scalar_mix"`: returns the output of a parameterised scalar mixture layer of the 119 | selected `output_layers` of the transformer model 120 | 121 | If you also want all the outputs from the HuggingFace model, you can set `return_all=True` to get them. 122 | 123 | ```python 124 | class TransformersEmbedder(torch.nn.Module): 125 | def __init__( 126 | self, 127 | model: Union[str, tr.PreTrainedModel], 128 | subword_pooling_strategy: str = "sparse", 129 | layer_pooling_strategy: str = "last", 130 | output_layers: Tuple[int] = (-4, -3, -2, -1), 131 | fine_tune: bool = True, 132 | return_all: bool = True, 133 | ) 134 | ``` 135 | 136 | ### Tokenizer 137 | 138 | The `Tokenizer` class provides the `tokenize` method to preprocess the input for the `TransformersEmbedder` 139 | layer. You can pass raw sentences, pre-tokenized sentences and sentences in batch. It will preprocess them 140 | returning a dictionary with the inputs for the model. By passing `return_tensors=True` it will return the 141 | inputs as `torch.Tensor`. 142 | 143 | By default, if you pass text (or batch) as strings, it uses the HuggingFace tokenizer to tokenize them. 
144 | 145 | ```python 146 | text = "This is a sample sentence" 147 | tokenizer(text) 148 | 149 | text = ["This is a sample sentence", "This is another sample sentence"] 150 | tokenizer(text) 151 | ``` 152 | 153 | You can pass a pre-tokenized sentence (or batch of sentences) by setting `is_split_into_words=True` 154 | 155 | ```python 156 | text = ["This", "is", "a", "sample", "sentence"] 157 | tokenizer(text, is_split_into_words=True) 158 | 159 | text = [ 160 | ["This", "is", "a", "sample", "sentence", "1"], 161 | ["This", "is", "sample", "sentence", "2"], 162 | ] 163 | tokenizer(text, is_split_into_words=True) 164 | ``` 165 | 166 | #### Examples 167 | 168 | First, initialize the tokenizer 169 | 170 | ```python 171 | import transformers_embedder as tre 172 | 173 | tokenizer = tre.Tokenizer("bert-base-cased") 174 | ``` 175 | 176 | - You can pass a single sentence as a string: 177 | 178 | ```python 179 | text = "This is a sample sentence" 180 | tokenizer(text) 181 | ``` 182 | 183 | ```text 184 | { 185 | { 186 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 102]], 187 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 188 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 189 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6]], 190 | 'sparse_offsets': { 191 | 'sparse_indices': tensor( 192 | [ 193 | [0, 0, 0, 0, 0, 0, 0], 194 | [0, 1, 2, 3, 4, 5, 6], 195 | [0, 1, 2, 3, 4, 5, 6] 196 | ] 197 | ), 198 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 199 | 'sparse_size': torch.Size([1, 7, 7]) 200 | }, 201 | 'sentence_lengths': [7], 202 | } 203 | ``` 204 | 205 | - A sentence pair 206 | 207 | ```python 208 | text = "This is a sample sentence A" 209 | text_pair = "This is a sample sentence B" 210 | tokenizer(text, text_pair) 211 | ``` 212 | 213 | ```text 214 | { 215 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 138, 102, 1188, 1110, 170, 6876, 5650, 139, 102]], 216 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]], 217 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 218 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]], 219 | 'sparse_offsets': { 220 | 'sparse_indices': tensor( 221 | [ 222 | [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 223 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 224 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 225 | ] 226 | ), 227 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 228 | 'sparse_size': torch.Size([1, 15, 15]) 229 | }, 230 | 'sentence_lengths': [15], 231 | } 232 | ``` 233 | 234 | - A batch of sentences or sentence pairs. Using `padding=True` and `return_tensors=True`, the tokenizer 235 | returns the text ready for the model 236 | 237 | ```python 238 | batch = [ 239 | ["This", "is", "a", "sample", "sentence", "1"], 240 | ["This", "is", "sample", "sentence", "2"], 241 | ["This", "is", "a", "sample", "sentence", "3"], 242 | # ... 243 | ["This", "is", "a", "sample", "sentence", "n", "for", "batch"], 244 | ] 245 | tokenizer(batch, padding=True, return_tensors=True) 246 | 247 | batch_pair = [ 248 | ["This", "is", "a", "sample", "sentence", "pair", "1"], 249 | ["This", "is", "sample", "sentence", "pair", "2"], 250 | ["This", "is", "a", "sample", "sentence", "pair", "3"], 251 | # ... 
252 | ["This", "is", "a", "sample", "sentence", "pair", "n", "for", "batch"], 253 | ] 254 | tokenizer(batch, batch_pair, padding=True, return_tensors=True) 255 | ``` 256 | 257 | #### Custom fields 258 | 259 | It is possible to add custom fields to the model input and tell the `tokenizer` how to pad them using 260 | `add_padding_ops`. Start by initializing the tokenizer with the model name: 261 | 262 | ```python 263 | import transformers_embedder as tre 264 | 265 | tokenizer = tre.Tokenizer("bert-base-cased") 266 | ``` 267 | 268 | Then add the custom fields to it: 269 | 270 | ```python 271 | custom_fields = { 272 | "custom_filed_1": [ 273 | [0, 0, 0, 0, 1, 0, 0], 274 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0] 275 | ] 276 | } 277 | ``` 278 | 279 | Now we can add the padding logic for our custom field `custom_filed_1`. `add_padding_ops` method takes in 280 | input 281 | 282 | - `key`: name of the field in the tokenizer input 283 | - `value`: value to use for padding 284 | - `length`: length to pad. It can be an `int`, or two string value, `subword` in which the element is padded 285 | to match the length of the subwords, and `word` where the element is padded relative to the length of the 286 | batch after the merge of the subwords. 287 | 288 | ```python 289 | tokenizer.add_padding_ops("custom_filed_1", 0, "word") 290 | ``` 291 | 292 | Finally, we can tokenize the input with the custom field: 293 | 294 | ```python 295 | text = [ 296 | "This is a sample sentence", 297 | "This is another example sentence just make it longer, with a comma too!" 298 | ] 299 | 300 | tokenizer(text, padding=True, return_tensors=True, additional_inputs=custom_fields) 301 | ``` 302 | 303 | The inputs are ready for the model, including the custom filed. 304 | 305 | ```text 306 | >>> inputs 307 | 308 | { 309 | 'input_ids': tensor( 310 | [ 311 | [ 101, 1188, 1110, 170, 6876, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 312 | [ 101, 1188, 1110, 1330, 1859, 5650, 1198, 1294, 1122, 2039, 117, 1114, 170, 3254, 1918, 1315, 106, 102] 313 | ] 314 | ), 315 | 'token_type_ids': tensor( 316 | [ 317 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 318 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 319 | ] 320 | ), 321 | 'attention_mask': tensor( 322 | [ 323 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 324 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 325 | ] 326 | ), 327 | 'scatter_offsets': tensor( 328 | [ 329 | [ 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 330 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16] 331 | ] 332 | ), 333 | 'sparse_offsets': { 334 | 'sparse_indices': tensor( 335 | [ 336 | [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 337 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16], 338 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] 339 | ] 340 | ), 341 | 'sparse_values': tensor( 342 | [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 343 | 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 344 | 1.0000, 1.0000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000] 345 | ), 346 | 'sparse_size': torch.Size([2, 17, 18]) 347 | } 348 | 'sentence_lengths': [7, 17], 349 | } 350 | ``` 351 | 352 | ## Acknowledgements 353 | 354 | Some code in the `TransformersEmbedder` class is taken from the [PyTorch Scatter](https://github.com/rusty1s/pytorch_scatter/) 355 | library. 
The pretrained models and the core of the tokenizer is from [🤗 Transformers](https://huggingface.co/transformers/). 356 | -------------------------------------------------------------------------------- /docs/gen_ref_pages.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages and navigation.""" 2 | 3 | from pathlib import Path 4 | 5 | import os 6 | 7 | import mkdocs_gen_files 8 | 9 | nav = mkdocs_gen_files.Nav() 10 | 11 | ROOT_DIR = Path(__file__).parent.parent 12 | SRC_DIR = ROOT_DIR / "transformers_embedder" 13 | DOC_DIR = ROOT_DIR / "references" 14 | 15 | for path in sorted(Path("transformers_embedder").glob("**/*.py")): 16 | module_path = path.with_suffix("") 17 | doc_path = path.with_suffix(".md").name 18 | full_doc_path = DOC_DIR / doc_path 19 | parts = tuple(module_path.parts) 20 | 21 | if parts[-1] == "__init__": 22 | parts = parts[:-1] 23 | # doc_path = doc_path.with_name("index.md") 24 | # full_doc_path = full_doc_path.with_name("index.md") 25 | elif parts[-1] == "__main__": 26 | continue 27 | 28 | nav[parts] = doc_path 29 | 30 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 31 | ident = ".".join(parts) 32 | fd.write(f"::: {ident}") 33 | 34 | mkdocs_gen_files.set_edit_path(full_doc_path, path) 35 | 36 | with mkdocs_gen_files.open(DOC_DIR / "main.md", "w") as nav_file: 37 | nav_file.writelines(nav.build_literate_nav()) 38 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # Transformers Embedder 4 | 5 | [![Open in Visual Studio Code](https://img.shields.io/badge/preview%20in-vscode.dev-blue)](https://github.dev/Riccorl/transformers-embedder) 6 | [![PyTorch](https://img.shields.io/badge/PyTorch-orange?logo=pytorch)](https://pytorch.org/) 7 | [![Transformers](https://img.shields.io/badge/4.34-🤗%20Transformers-6670ff)](https://huggingface.co/transformers/) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000)](https://github.com/psf/black) 9 | 10 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml) 11 | [![Upload to PyPi](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml/badge.svg)](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml) 12 | [![PyPi Version](https://img.shields.io/github/v/release/Riccorl/transformers-embedder)](https://github.com/Riccorl/transformers-embedder/releases) 13 | [![Anaconda-Server Badge](https://anaconda.org/riccorl/transformers-embedder/badges/version.svg)](https://anaconda.org/riccorl/transformers-embedder) 14 | [![DeepSource](https://deepsource.io/gh/Riccorl/transformers-embedder.svg/?label=active+issues)](https://deepsource.io/gh/Riccorl/transformers-embedder/?ref=repository-badge) 15 | 16 |
17 | 18 | A Word Level Transformer layer based on PyTorch and 🤗 Transformers. 19 | 20 | ## How to use 21 | 22 | Install the library from [PyPI](https://pypi.org/project/transformers-embedder): 23 | 24 | ```bash 25 | pip install transformers-embedder 26 | ``` 27 | 28 | or from [Conda](https://anaconda.org/riccorl/transformers-embedder): 29 | 30 | ```bash 31 | conda install -c riccorl transformers-embedder 32 | ``` 33 | 34 | It offers a PyTorch layer and a tokenizer that support almost every pretrained model from Huggingface 35 | [🤗Transformers](https://huggingface.co/transformers/) library. Here is a quick example: 36 | 37 | ```python 38 | import transformers_embedder as tre 39 | 40 | tokenizer = tre.Tokenizer("bert-base-cased") 41 | 42 | model = tre.TransformersEmbedder( 43 | "bert-base-cased", subword_pooling_strategy="sparse", layer_pooling_strategy="mean" 44 | ) 45 | 46 | example = "This is a sample sentence" 47 | inputs = tokenizer(example, return_tensors=True) 48 | ``` 49 | 50 | ```text 51 | { 52 | 'input_ids': tensor([[ 101, 1188, 1110, 170, 6876, 5650, 102]]), 53 | 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 54 | 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]) 55 | 'scatter_offsets': tensor([[0, 1, 2, 3, 4, 5, 6]]), 56 | 'sparse_offsets': { 57 | 'sparse_indices': tensor( 58 | [ 59 | [0, 0, 0, 0, 0, 0, 0], 60 | [0, 1, 2, 3, 4, 5, 6], 61 | [0, 1, 2, 3, 4, 5, 6] 62 | ] 63 | ), 64 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 65 | 'sparse_size': torch.Size([1, 7, 7]) 66 | }, 67 | 'sentence_length': 7 # with special tokens included 68 | } 69 | ``` 70 | 71 | ```python 72 | outputs = model(**inputs) 73 | ``` 74 | 75 | ```text 76 | # outputs.word_embeddings.shape[1:-1] # remove [CLS] and [SEP] 77 | torch.Size([1, 5, 768]) 78 | # len(example) 79 | 5 80 | ``` 81 | 82 | ## Info 83 | 84 | One of the annoyance of using transformer-based models is that it is not trivial to compute word embeddings 85 | from the sub-token embeddings they output. With this API it's as easy as using 🤗Transformers to get 86 | word-level embeddings from theoretically every transformer model it supports. 87 | 88 | ### Model 89 | 90 | #### Subword Pooling Strategy 91 | 92 | The `TransformersEmbedder` class offers 3 ways to get the embeddings: 93 | 94 | - `subword_pooling_strategy="sparse"`: computes the mean of the embeddings of the sub-tokens of each word 95 | (i.e. the embeddings of the sub-tokens are pooled together) using a sparse matrix multiplication. This 96 | strategy is the default one. 97 | - `subword_pooling_strategy="scatter"`: computes the mean of the embeddings of the sub-tokens of each word 98 | using a scatter-gather operation. It is not deterministic, but it works with ONNX export. 99 | - `subword_pooling_strategy="none"`: returns the raw output of the transformer model without sub-token pooling. 
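As a quick illustration (a minimal sketch that reuses the `bert-base-cased` checkpoint from the example above), the strategy is simply chosen at construction time:

```python
import transformers_embedder as tre

# deterministic sub-word pooling via a sparse matrix multiplication (not ONNX-exportable)
sparse_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="sparse")

# scatter-based pooling: ONNX-friendly, but scatter_add_ is not deterministic
scatter_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="scatter")

# no pooling: raw sub-token embeddings, one vector per sub-token
raw_model = tre.TransformersEmbedder("bert-base-cased", subword_pooling_strategy="none")
```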
100 | 101 | Here a little feature table: 102 | 103 | | | Pooling | Deterministic | ONNX | 104 | |-------------|:------------------:|:------------------:|:------------------:| 105 | | **Sparse** | :white_check_mark: | :white_check_mark: | :x: | 106 | | **Scatter** | :white_check_mark: | :x: | :white_check_mark: | 107 | | **None** | :x: | :white_check_mark: | :white_check_mark: | 108 | 109 | #### Layer Pooling Strategy 110 | 111 | There are also multiple type of outputs you can get using `layer_pooling_strategy` parameter: 112 | 113 | - `layer_pooling_strategy="last"`: returns the last hidden state of the transformer model 114 | - `layer_pooling_strategy="concat"`: returns the concatenation of the selected `output_layers` of the 115 | transformer model 116 | - `layer_pooling_strategy="sum"`: returns the sum of the selected `output_layers` of the transformer model 117 | - `layer_pooling_strategy="mean"`: returns the average of the selected `output_layers` of the transformer model 118 | - `layer_pooling_strategy="scalar_mix"`: returns the output of a parameterised scalar mixture layer of the 119 | selected `output_layers` of the transformer model 120 | 121 | If you also want all the outputs from the HuggingFace model, you can set `return_all=True` to get them. 122 | 123 | ```python 124 | class TransformersEmbedder(torch.nn.Module): 125 | def __init__( 126 | self, 127 | model: Union[str, tr.PreTrainedModel], 128 | subword_pooling_strategy: str = "sparse", 129 | layer_pooling_strategy: str = "last", 130 | output_layers: Tuple[int] = (-4, -3, -2, -1), 131 | fine_tune: bool = True, 132 | return_all: bool = True, 133 | ) 134 | ``` 135 | 136 | ### Tokenizer 137 | 138 | The `Tokenizer` class provides the `tokenize` method to preprocess the input for the `TransformersEmbedder` 139 | layer. You can pass raw sentences, pre-tokenized sentences and sentences in batch. It will preprocess them 140 | returning a dictionary with the inputs for the model. By passing `return_tensors=True` it will return the 141 | inputs as `torch.Tensor`. 142 | 143 | By default, if you pass text (or batch) as strings, it uses the HuggingFace tokenizer to tokenize them. 
144 | 145 | ```python 146 | text = "This is a sample sentence" 147 | tokenizer(text) 148 | 149 | text = ["This is a sample sentence", "This is another sample sentence"] 150 | tokenizer(text) 151 | ``` 152 | 153 | You can pass a pre-tokenized sentence (or batch of sentences) by setting `is_split_into_words=True` 154 | 155 | ```python 156 | text = ["This", "is", "a", "sample", "sentence"] 157 | tokenizer(text, is_split_into_words=True) 158 | 159 | text = [ 160 | ["This", "is", "a", "sample", "sentence", "1"], 161 | ["This", "is", "sample", "sentence", "2"], 162 | ] 163 | tokenizer(text, is_split_into_words=True) 164 | ``` 165 | 166 | #### Examples 167 | 168 | First, initialize the tokenizer 169 | 170 | ```python 171 | import transformers_embedder as tre 172 | 173 | tokenizer = tre.Tokenizer("bert-base-cased") 174 | ``` 175 | 176 | - You can pass a single sentence as a string: 177 | 178 | ```python 179 | text = "This is a sample sentence" 180 | tokenizer(text) 181 | ``` 182 | 183 | ```text 184 | { 185 | { 186 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 102]], 187 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 188 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 189 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6]], 190 | 'sparse_offsets': { 191 | 'sparse_indices': tensor( 192 | [ 193 | [0, 0, 0, 0, 0, 0, 0], 194 | [0, 1, 2, 3, 4, 5, 6], 195 | [0, 1, 2, 3, 4, 5, 6] 196 | ] 197 | ), 198 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]), 199 | 'sparse_size': torch.Size([1, 7, 7]) 200 | }, 201 | 'sentence_lengths': [7], 202 | } 203 | ``` 204 | 205 | - A sentence pair 206 | 207 | ```python 208 | text = "This is a sample sentence A" 209 | text_pair = "This is a sample sentence B" 210 | tokenizer(text, text_pair) 211 | ``` 212 | 213 | ```text 214 | { 215 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 138, 102, 1188, 1110, 170, 6876, 5650, 139, 102]], 216 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]], 217 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 218 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]], 219 | 'sparse_offsets': { 220 | 'sparse_indices': tensor( 221 | [ 222 | [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 223 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 224 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 225 | ] 226 | ), 227 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 228 | 'sparse_size': torch.Size([1, 15, 15]) 229 | }, 230 | 'sentence_lengths': [15], 231 | } 232 | ``` 233 | 234 | - A batch of sentences or sentence pairs. Using `padding=True` and `return_tensors=True`, the tokenizer 235 | returns the text ready for the model 236 | 237 | ```python 238 | batch = [ 239 | ["This", "is", "a", "sample", "sentence", "1"], 240 | ["This", "is", "sample", "sentence", "2"], 241 | ["This", "is", "a", "sample", "sentence", "3"], 242 | # ... 243 | ["This", "is", "a", "sample", "sentence", "n", "for", "batch"], 244 | ] 245 | tokenizer(batch, padding=True, return_tensors=True) 246 | 247 | batch_pair = [ 248 | ["This", "is", "a", "sample", "sentence", "pair", "1"], 249 | ["This", "is", "sample", "sentence", "pair", "2"], 250 | ["This", "is", "a", "sample", "sentence", "pair", "3"], 251 | # ... 
252 | ["This", "is", "a", "sample", "sentence", "pair", "n", "for", "batch"], 253 | ] 254 | tokenizer(batch, batch_pair, padding=True, return_tensors=True) 255 | ``` 256 | 257 | #### Custom fields 258 | 259 | It is possible to add custom fields to the model input and tell the `tokenizer` how to pad them using 260 | `add_padding_ops`. Start by initializing the tokenizer with the model name: 261 | 262 | ```python 263 | import transformers_embedder as tre 264 | 265 | tokenizer = tre.Tokenizer("bert-base-cased") 266 | ``` 267 | 268 | Then add the custom fields to it: 269 | 270 | ```python 271 | custom_fields = { 272 | "custom_filed_1": [ 273 | [0, 0, 0, 0, 1, 0, 0], 274 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0] 275 | ] 276 | } 277 | ``` 278 | 279 | Now we can add the padding logic for our custom field `custom_filed_1`. `add_padding_ops` method takes in 280 | input 281 | 282 | - `key`: name of the field in the tokenizer input 283 | - `value`: value to use for padding 284 | - `length`: length to pad. It can be an `int`, or two string value, `subword` in which the element is padded 285 | to match the length of the subwords, and `word` where the element is padded relative to the length of the 286 | batch after the merge of the subwords. 287 | 288 | ```python 289 | tokenizer.add_padding_ops("custom_filed_1", 0, "word") 290 | ``` 291 | 292 | Finally, we can tokenize the input with the custom field: 293 | 294 | ```python 295 | text = [ 296 | "This is a sample sentence", 297 | "This is another example sentence just make it longer, with a comma too!" 298 | ] 299 | 300 | tokenizer(text, padding=True, return_tensors=True, additional_inputs=custom_fields) 301 | ``` 302 | 303 | The inputs are ready for the model, including the custom filed. 304 | 305 | ```text 306 | >>> inputs 307 | 308 | { 309 | 'input_ids': tensor( 310 | [ 311 | [ 101, 1188, 1110, 170, 6876, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 312 | [ 101, 1188, 1110, 1330, 1859, 5650, 1198, 1294, 1122, 2039, 117, 1114, 170, 3254, 1918, 1315, 106, 102] 313 | ] 314 | ), 315 | 'token_type_ids': tensor( 316 | [ 317 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 318 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 319 | ] 320 | ), 321 | 'attention_mask': tensor( 322 | [ 323 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 324 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 325 | ] 326 | ), 327 | 'scatter_offsets': tensor( 328 | [ 329 | [ 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 330 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16] 331 | ] 332 | ), 333 | 'sparse_offsets': { 334 | 'sparse_indices': tensor( 335 | [ 336 | [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 337 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16], 338 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] 339 | ] 340 | ), 341 | 'sparse_values': tensor( 342 | [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 343 | 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 344 | 1.0000, 1.0000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000] 345 | ), 346 | 'sparse_size': torch.Size([2, 17, 18]) 347 | } 348 | 'sentence_lengths': [7, 17], 349 | } 350 | ``` 351 | 352 | ## Acknowledgements 353 | 354 | Some code in the `TransformersEmbedder` class is taken from the [PyTorch Scatter](https://github.com/rusty1s/pytorch_scatter/) 355 | library. 
The pretrained models and the core of the tokenizer is from [🤗 Transformers](https://huggingface.co/transformers/). 356 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Transformers Embedder 2 | repo_url: https://github.com/riccorl/transformers-embedder 3 | 4 | plugins: 5 | - search 6 | - gen-files: 7 | scripts: 8 | - docs/gen_ref_pages.py 9 | - literate-nav: 10 | nav_file: main.md 11 | - section-index 12 | - mkdocstrings: 13 | custom_templates: templates 14 | default_handler: python 15 | handlers: 16 | python: 17 | options: 18 | docstring_style: google 19 | watch: 20 | - transformers_embedder 21 | 22 | theme: 23 | name: material 24 | features: 25 | - search.suggest 26 | - search.highlight 27 | icon: 28 | repo: fontawesome/brands/github 29 | palette: 30 | # Palette toggle for light mode 31 | - media: "(prefers-color-scheme: light)" 32 | primary: deep purple 33 | accent: yellow 34 | scheme: default 35 | font: 36 | text: Work Sans 37 | code: Fira Mono 38 | toggle: 39 | icon: material/brightness-7 40 | name: Switch to dark mode 41 | # Palette toggle for dark mode 42 | - media: "(prefers-color-scheme: dark)" 43 | primary: deep purple 44 | accent: yellow 45 | scheme: slate 46 | font: 47 | text: Work Sans 48 | code: Fira Mono 49 | toggle: 50 | icon: material/brightness-4 51 | name: Switch to light mode 52 | 53 | nav: 54 | - API References: references/ 55 | 56 | extra: 57 | # version: 58 | # provider: mike 59 | 60 | social: 61 | - icon: fontawesome/brands/twitter 62 | link: https://twitter.com/RiccrdoRicOrl 63 | - icon: fontawesome/brands/github 64 | link: https://github.com/riccorl 65 | 66 | markdown_extensions: 67 | - admonition 68 | - codehilite 69 | - pymdownx.superfences -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | include = '\.pyi?$' 3 | exclude = ''' 4 | /( 5 | \.git 6 | | \.hg 7 | | \.mypy_cache 8 | | \.tox 9 | | \.venv 10 | | _build 11 | | buck-out 12 | | build 13 | | dist 14 | )/ 15 | ''' -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | datasets 3 | mkdocs-material 4 | mkdocstrings[python] 5 | mkdocs-literate-nav 6 | mkdocs-section-index 7 | mkdocs-gen-files 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.7 2 | transformers>=4.14,<4.35 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | extras = {} 7 | extras["torch"] = ["torch>=1.5,<2.2"] 8 | extras["all"] = extras["torch"] 9 | extras["docs"] = ["mkdocs-material"] 10 | 11 | install_requires = ["transformers>=4.14,<4.35"] 12 | 13 | setuptools.setup( 14 | name="transformers_embedder", 15 | 
version="3.0.11", 16 | author="Riccardo Orlando", 17 | author_email="orlandoricc@gmail.com", 18 | description="Word level transformer based embeddings", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | url="https://github.com/Riccorl/transformers-embedder", 22 | keywords="NLP deep learning transformer pytorch BERT google subtoken wordpieces embeddings", 23 | packages=setuptools.find_packages(), 24 | include_package_data=True, 25 | license="Apache", 26 | classifiers=[ 27 | "Programming Language :: Python :: 3", 28 | "License :: OSI Approved :: Apache Software License", 29 | "Operating System :: OS Independent", 30 | ], 31 | extras_require=extras, 32 | install_requires=install_requires, 33 | python_requires=">=3.6", 34 | ) 35 | -------------------------------------------------------------------------------- /transformers_embedder/__init__.py: -------------------------------------------------------------------------------- 1 | from transformers_embedder import utils 2 | 3 | if utils.is_torch_available(): 4 | from transformers_embedder.embedder import TransformersEmbedder, TransformersEncoder 5 | 6 | from transformers import ( 7 | BertTokenizer, 8 | BertTokenizerFast, 9 | BertweetTokenizer, 10 | CamembertTokenizer, 11 | CamembertTokenizerFast, 12 | DebertaTokenizer, 13 | DebertaTokenizerFast, 14 | DebertaV2Tokenizer, 15 | DebertaV2TokenizerFast, 16 | DistilBertTokenizer, 17 | DistilBertTokenizerFast, 18 | MobileBertTokenizer, 19 | MobileBertTokenizerFast, 20 | RobertaTokenizer, 21 | RobertaTokenizerFast, 22 | XLMRobertaTokenizer, 23 | XLMRobertaTokenizerFast, 24 | XLMTokenizer, 25 | ) 26 | 27 | 28 | MODELS_WITH_STARTING_TOKEN = ( 29 | BertTokenizer, 30 | BertTokenizerFast, 31 | DistilBertTokenizer, 32 | DistilBertTokenizerFast, 33 | MobileBertTokenizer, 34 | MobileBertTokenizerFast, 35 | BertweetTokenizer, 36 | CamembertTokenizer, 37 | CamembertTokenizerFast, 38 | DebertaTokenizer, 39 | DebertaTokenizerFast, 40 | DebertaV2Tokenizer, 41 | DebertaV2TokenizerFast, 42 | RobertaTokenizer, 43 | RobertaTokenizerFast, 44 | XLMRobertaTokenizer, 45 | XLMRobertaTokenizerFast, 46 | XLMTokenizer, 47 | ) 48 | 49 | MODELS_WITH_DOUBLE_SEP = ( 50 | CamembertTokenizer, 51 | CamembertTokenizerFast, 52 | BertweetTokenizer, 53 | RobertaTokenizer, 54 | RobertaTokenizerFast, 55 | XLMRobertaTokenizer, 56 | XLMRobertaTokenizerFast, 57 | ) 58 | 59 | from transformers_embedder.tokenizer import Tokenizer 60 | -------------------------------------------------------------------------------- /transformers_embedder/embedder.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from pathlib import Path 3 | from typing import Optional, Union, Tuple, Sequence, Any, Mapping 4 | 5 | import transformers as tr 6 | 7 | from transformers_embedder import utils 8 | from transformers_embedder.modules.scalar_mix import ScalarMix 9 | from transformers_embedder.modules.encoder import Encoder 10 | 11 | if utils.is_torch_available(): 12 | import torch 13 | 14 | logger = utils.get_logger(__name__) 15 | utils.get_logger("transformers") 16 | 17 | 18 | @dataclass 19 | class TransformersEmbedderOutput(tr.file_utils.ModelOutput): 20 | """Class for model's outputs.""" 21 | 22 | word_embeddings: Optional[torch.FloatTensor] = None 23 | last_hidden_state: Optional[torch.FloatTensor] = None 24 | pooler_output: Optional[torch.FloatTensor] = None 25 | hidden_states: Optional[Tuple[torch.FloatTensor]] = None 26 | attentions: 
Optional[Tuple[torch.FloatTensor]] = None 27 | 28 | 29 | class TransformersEmbedder(torch.nn.Module): 30 | """ 31 | Transformer Embedder class. 32 | 33 | Word level embeddings from various transformer architectures from the Huggingface Transformers API. 34 | 35 | Args: 36 | model (`str`, `tr.PreTrainedModel`): 37 | Transformer model to use (https://huggingface.co/models). 38 | layer_pooling_strategy (`str`, optional, defaults to `last`): 39 | What output to get from the transformer model. The last hidden state (``last``), 40 | the concatenation of the selected hidden layers (``concat``), the sum of the selected hidden 41 | layers (``sum``), the average of the selected hidden layers (``mean``), or a scalar mixture of 42 | the selected hidden layers (``scalar_mix``). 43 | subword_pooling_strategy (`str`, optional, defaults to `sparse`): 44 | What pooling strategy to use for the sub-word embeddings. Methods available are ``sparse``, 45 | ``scatter`` and ``none``. The ``scatter`` strategy is ONNX compatible but uses ``scatter_add_`` 46 | that is not deterministic. The ``sparse`` strategy is deterministic but it is not compatible 47 | with ONNX. When ``subword_pooling_strategy`` is ``none``, the sub-word embeddings are not 48 | pooled. 49 | output_layers (`tuple`, `list`, `str`, optional, defaults to `(-4, -3, -2, -1)`): 50 | Which hidden layers to get from the transformer model. If ``output_layers`` is ``all``, 51 | all the hidden layers are returned. If ``output_layers`` is a tuple or a list, the hidden 52 | layers are selected according to the indexes in the tuple or list. If ``output_layers`` is 53 | a string, it must be ``all``. 54 | fine_tune (`bool`, optional, defaults to `True`): 55 | If ``True``, the transformer model is fine-tuned during training. 56 | return_all (`bool`, optional, defaults to `False`): 57 | If ``True``, returns all the outputs from the HuggingFace model. 58 | from_pretrained (`bool`, optional, defaults to `True`): 59 | If ``True``, the model is loaded from a pre-trained model, otherwise it is initialized with 60 | random weights. Useful when you want to load a model from a specific checkpoint, without 61 | having to download the entire model. 
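    Example (a minimal sketch; it mirrors the README quick start and assumes the
    ``bert-base-cased`` weights are available locally or can be downloaded):

        >>> import transformers_embedder as tre
        >>> tokenizer = tre.Tokenizer("bert-base-cased")
        >>> model = TransformersEmbedder("bert-base-cased", subword_pooling_strategy="sparse")
        >>> inputs = tokenizer("This is a sample sentence", return_tensors=True)
        >>> outputs = model(**inputs)
        >>> # outputs.word_embeddings holds one vector per word (special tokens included)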
62 | """ 63 | 64 | def __init__( 65 | self, 66 | model: Union[str, tr.PreTrainedModel], 67 | layer_pooling_strategy: str = "last", 68 | subword_pooling_strategy: str = "scatter", 69 | output_layers: Union[Sequence[int], str] = (-4, -3, -2, -1), 70 | fine_tune: bool = True, 71 | return_all: bool = False, 72 | from_pretrained: bool = True, 73 | *args, 74 | **kwargs, 75 | ) -> None: 76 | super().__init__() 77 | if isinstance(model, str): 78 | self.config = tr.AutoConfig.from_pretrained( 79 | model, 80 | output_hidden_states=True, 81 | output_attentions=True, 82 | *args, 83 | **kwargs, 84 | ) 85 | if from_pretrained: 86 | self.transformer_model = tr.AutoModel.from_pretrained( 87 | model, config=self.config, *args, **kwargs 88 | ) 89 | else: 90 | self.transformer_model = tr.AutoModel.from_config( 91 | self.config, *args, **kwargs 92 | ) 93 | else: 94 | self.transformer_model = model 95 | 96 | # pooling strategy parameters 97 | self.layer_pooling_strategy = layer_pooling_strategy 98 | self.subword_pooling_strategy = subword_pooling_strategy 99 | 100 | if output_layers == "all": 101 | output_layers = tuple( 102 | range(self.transformer_model.config.num_hidden_layers) 103 | ) 104 | 105 | # check output_layers is well defined 106 | if ( 107 | max(map(abs, output_layers)) 108 | >= self.transformer_model.config.num_hidden_layers 109 | ): 110 | raise ValueError( 111 | f"`output_layers` parameter not valid, choose between 0 and " 112 | f"{self.transformer_model.config.num_hidden_layers - 1}. " 113 | f"Current value is `{output_layers}`" 114 | ) 115 | self.output_layers = output_layers 116 | 117 | self._scalar_mix: Optional[ScalarMix] = None 118 | if layer_pooling_strategy == "scalar_mix": 119 | self._scalar_mix = ScalarMix(len(output_layers)) 120 | 121 | # check if return all transformer outputs 122 | self.return_all = return_all 123 | 124 | # if fine_tune is False, freeze all the transformer's parameters 125 | if not fine_tune: 126 | for param in self.transformer_model.parameters(): 127 | param.requires_grad = False 128 | 129 | def forward( 130 | self, 131 | input_ids: torch.Tensor, 132 | attention_mask: Optional[torch.Tensor] = None, 133 | token_type_ids: Optional[torch.Tensor] = None, 134 | scatter_offsets: Optional[torch.Tensor] = None, 135 | sparse_offsets: Optional[Mapping[str, Any]] = None, 136 | **kwargs, 137 | ) -> TransformersEmbedderOutput: 138 | """ 139 | Forward method of the PyTorch module. 140 | 141 | Args: 142 | input_ids (`torch.Tensor`): 143 | Input ids for the transformer model. 144 | attention_mask (`torch.Tensor`, optional): 145 | Attention mask for the transformer model. 146 | token_type_ids (`torch.Tensor`, optional): 147 | Token type ids for the transformer model. 148 | scatter_offsets (`torch.Tensor`, optional): 149 | Offsets of the sub-word, used to reconstruct the word embeddings using 150 | the ``scatter`` method. 151 | sparse_offsets (`Mapping[str, Any]`, optional): 152 | Offsets of the sub-word, used to reconstruct the word embeddings using 153 | the ``sparse`` method. 154 | 155 | Returns: 156 | `TransformersEmbedderOutput`: 157 | Word level embeddings plus the output of the transformer model. 158 | """ 159 | # Some HuggingFace models don't have the 160 | # token_type_ids parameter and fail even when it's given as None. 161 | inputs = {"input_ids": input_ids, "attention_mask": attention_mask} 162 | if token_type_ids is not None: 163 | inputs["token_type_ids"] = token_type_ids 164 | 165 | # Shape: [batch_size, num_sub-words, embedding_size]. 
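        # hidden_states (and attentions) are populated below because, when the model is
        # built from a string name in __init__, its config is created with
        # output_hidden_states=True and output_attentions=True; every layer pooling
        # strategy other than "last" reads from hidden_states.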
166 | transformer_outputs = self.transformer_model(**inputs) 167 | if self.layer_pooling_strategy == "last": 168 | word_embeddings = transformer_outputs.last_hidden_state 169 | elif self.layer_pooling_strategy == "concat": 170 | word_embeddings = [ 171 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 172 | ] 173 | word_embeddings = torch.cat(word_embeddings, dim=-1) 174 | elif self.layer_pooling_strategy == "sum": 175 | word_embeddings = [ 176 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 177 | ] 178 | word_embeddings = torch.stack(word_embeddings, dim=0).sum(dim=0) 179 | elif self.layer_pooling_strategy == "mean": 180 | word_embeddings = [ 181 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 182 | ] 183 | word_embeddings = torch.stack(word_embeddings, dim=0).mean( 184 | dim=0, dtype=torch.float 185 | ) 186 | elif self.layer_pooling_strategy == "scalar_mix": 187 | word_embeddings = [ 188 | transformer_outputs.hidden_states[layer] for layer in self.output_layers 189 | ] 190 | word_embeddings = self._scalar_mix(word_embeddings) 191 | else: 192 | raise ValueError( 193 | "`layer_pooling_strategy` parameter not valid, choose between `last`, `concat`, " 194 | f"`sum`, `mean` and `scalar_mix`. Current value `{self.layer_pooling_strategy}`" 195 | ) 196 | 197 | if ( 198 | self.subword_pooling_strategy != "none" 199 | and scatter_offsets is None 200 | and sparse_offsets is None 201 | ): 202 | raise ValueError( 203 | "`subword_pooling_strategy` is not `none` but neither `scatter_offsets` not `sparse_offsets` " 204 | "were passed to the model. Cannot compute word embeddings.\nTo solve:\n" 205 | "- Set `subword_pooling_strategy` to `none` or\n" 206 | "- Pass `scatter_offsets` to the model during forward or\n" 207 | "- Pass `sparse_offsets` to the model during forward." 208 | ) 209 | 210 | if self.subword_pooling_strategy not in ["none", "scatter", "sparse"]: 211 | raise ValueError( 212 | "`subword_pooling_strategy` parameter not valid, choose between `scatter`, `sparse`" 213 | f" and `none`. Current value is `{self.subword_pooling_strategy}`." 214 | ) 215 | if self.subword_pooling_strategy == "scatter": 216 | if scatter_offsets is None: 217 | raise ValueError( 218 | "`subword_pooling_strategy` is `scatter` but `scatter_offsets` " 219 | "were not passed to the model. Cannot compute word embeddings.\nTo solve:\n" 220 | "- Set `subword_pooling_strategy` to `none` or\n" 221 | "- Pass `scatter_offsets` to the model during forward." 222 | ) 223 | word_embeddings = self.merge_scatter( 224 | word_embeddings, indices=scatter_offsets 225 | ) 226 | if self.subword_pooling_strategy == "sparse": 227 | if sparse_offsets is None: 228 | raise ValueError( 229 | "`subword_pooling_strategy` is `sparse` but `sparse_offsets` " 230 | "were not passed to the model. Cannot compute word embeddings.\nTo solve:\n" 231 | "- Set `subword_pooling_strategy` to `none` or\n" 232 | "- Pass `sparse_offsets` to the model during forward." 
233 | ) 234 | word_embeddings = self.merge_sparse(word_embeddings, sparse_offsets) 235 | 236 | if self.return_all: 237 | return TransformersEmbedderOutput( 238 | word_embeddings=word_embeddings, 239 | last_hidden_state=transformer_outputs.last_hidden_state, 240 | hidden_states=transformer_outputs.hidden_states, 241 | pooler_output=transformer_outputs.pooler_output 242 | if hasattr(transformer_outputs, "pooler_output") 243 | else None, 244 | attentions=transformer_outputs.attentions, 245 | ) 246 | return TransformersEmbedderOutput(word_embeddings=word_embeddings) 247 | 248 | @staticmethod 249 | def merge_scatter(embeddings: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: 250 | """ 251 | Minimal version of ``scatter_mean``, from `pytorch_scatter 252 | `_ 253 | library, that is compatible for ONNX but works only for our case. 254 | It is used to compute word level embeddings from the transformer output. 255 | 256 | Args: 257 | embeddings (`torch.Tensor`): 258 | The embeddings tensor. 259 | indices (`torch.Tensor`): 260 | The sub-word indices. 261 | 262 | Returns: 263 | `torch.Tensor` 264 | """ 265 | 266 | def broadcast(src: torch.Tensor, other: torch.Tensor): 267 | """ 268 | Broadcast ``src`` to match the shape of ``other``. 269 | 270 | Args: 271 | src (`torch.Tensor`): 272 | The tensor to broadcast. 273 | other (`torch.Tensor`): 274 | The tensor to match the shape of. 275 | 276 | Returns: 277 | `torch.Tensor`: The broadcasted tensor. 278 | """ 279 | for _ in range(src.dim(), other.dim()): 280 | src = src.unsqueeze(-1) 281 | src = src.expand_as(other) 282 | return src 283 | 284 | def scatter_sum(src: torch.Tensor, index: torch.Tensor) -> torch.Tensor: 285 | """ 286 | Sums the elements in ``src`` that have the same indices as in ``index``. 287 | 288 | Args: 289 | src (`torch.Tensor`): 290 | The tensor to sum. 291 | index (`torch.Tensor`): 292 | The indices to sum. 293 | 294 | Returns: 295 | `torch.Tensor`: The summed tensor. 296 | """ 297 | index = broadcast(index, src) 298 | size = list(src.size()) 299 | size[1] = index.max() + 1 300 | out = torch.zeros(size, dtype=src.dtype, device=src.device) 301 | return out.scatter_add_(1, index, src) 302 | 303 | # replace padding indices with the maximum value inside the batch 304 | indices[indices == -1] = torch.max(indices) 305 | merged = scatter_sum(embeddings, indices) 306 | ones = torch.ones( 307 | indices.size(), dtype=embeddings.dtype, device=embeddings.device 308 | ) 309 | count = scatter_sum(ones, indices) 310 | count.clamp_(1) 311 | count = broadcast(count, merged) 312 | merged.true_divide_(count) 313 | return merged 314 | 315 | @staticmethod 316 | def merge_sparse( 317 | embeddings: torch.Tensor, bpe_info: Optional[Mapping[str, Any]] 318 | ) -> torch.Tensor: 319 | """ 320 | Merges the subword embeddings into a single tensor, using sparse indices. 321 | 322 | Args: 323 | embeddings (`torch.Tensor`): 324 | The embeddings tensor. 325 | bpe_info (`Mapping[str, Any]`, `optional`): 326 | The BPE info. 327 | 328 | Returns: 329 | `torch.Tensor`: The merged embeddings. 
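            For example, a word that was split into three sub-words has three entries in the
            sparse weight matrix, each with weight ``1/3``, so the batched matrix multiplication
            below averages the three sub-word vectors into a single word vector.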
330 | """ 331 | # it is constructed here and not in the tokenizer/collate because pin_memory is not sparse-compatible 332 | bpe_weights = torch.sparse_coo_tensor( 333 | indices=bpe_info["sparse_indices"], 334 | values=bpe_info["sparse_values"], 335 | size=bpe_info["sparse_size"], 336 | ) 337 | # (sentence, word, bpe) x (sentence, bpe, transformer_dim) -> (sentence, word, transformer_dim) 338 | merged = torch.bmm(bpe_weights.to_dense(), embeddings) 339 | return merged 340 | 341 | def resize_token_embeddings( 342 | self, new_num_tokens: Optional[int] = None 343 | ) -> torch.nn.Embedding: 344 | """ 345 | Resizes input token embeddings' matrix of the model if `new_num_tokens != config.vocab_size`. 346 | 347 | Args: 348 | new_num_tokens (`int`): 349 | The number of new tokens in the embedding matrix. 350 | 351 | Returns: 352 | `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. 353 | """ 354 | return self.transformer_model.resize_token_embeddings(new_num_tokens) 355 | 356 | def save_pretrained(self, save_directory: Union[str, Path]): 357 | """ 358 | Save a model and its configuration file to a directory. 359 | 360 | Args: 361 | save_directory (`str`, `Path`): 362 | Directory to which to save. 363 | """ 364 | self.transformer_model.save_pretrained(save_directory) 365 | 366 | @property 367 | def hidden_size(self) -> int: 368 | """ 369 | Returns the hidden size of TransformersEmbedder. 370 | 371 | Returns: 372 | `int`: Hidden size of ``self.transformer_model``. 373 | """ 374 | multiplier = ( 375 | len(self.output_layers) if self.layer_pooling_strategy == "concat" else 1 376 | ) 377 | return self.transformer_model.config.hidden_size * multiplier 378 | 379 | @property 380 | def transformer_hidden_size(self) -> int: 381 | """ 382 | Returns the hidden size of the inner transformer. 383 | 384 | Returns: 385 | `int`: Hidden size of ``self.transformer_model``. 386 | """ 387 | multiplier = ( 388 | len(self.output_layers) if self.layer_pooling_strategy == "concat" else 1 389 | ) 390 | return self.transformer_model.config.hidden_size * multiplier 391 | 392 | 393 | class TransformersEncoder(TransformersEmbedder): 394 | """ 395 | Transformer Embedder class. 396 | 397 | Word level embeddings from various transformer architectures from Huggingface Transformers API. 398 | 399 | Args: 400 | model (`str`, `tr.PreTrainedModel`): 401 | Transformer model to use (https://huggingface.co/models). 402 | layer_pooling_strategy (`str`, optional, defaults to `last`): 403 | What output to get from the transformer model. The last hidden state (``last``), 404 | the concatenation of the selected hidden layers (``concat``), the sum of the selected hidden 405 | layers (``sum``), the average of the selected hidden layers (``mean``). 406 | subword_pooling_strategy (`str`, optional, defaults to `scatter`): 407 | What pooling strategy to use for the sub-word embeddings. Methods available are ``scatter``, 408 | ``sparse`` and ``none``. The ``scatter`` strategy is ONNX comptabile but uses ``scatter_add`` 409 | that is not deterministic. The ``sparse`` strategy is deterministic but it is not comptabile 410 | with ONNX. 411 | output_layers (`tuple`, optional, defaults to `(-4, -3, -2, -1)`): 412 | Which hidden layers to get from the transformer model. 413 | fine_tune (`bool`, optional, defaults to `True`): 414 | If ``True``, the transformer model is fine-tuned during training. 415 | return_all (`bool`, optional, defaults to `False`): 416 | If ``True``, returns all the outputs from the HuggingFace model. 
417 | projection_size (`int`, optional, defaults to `None`): 418 | If not ``None``, the output of the transformer is projected to this size. 419 | activation_layer (`torch.nn.Module`, optional, defaults to `None`): 420 | Activation layer to use. If ``None``, no activation layer is used. 421 | dropout (`float`, optional, defaults to `0.1`): 422 | The dropout probability. 423 | bias (`bool`, optional, defaults to `True`): 424 | If ``True``, the projection layer of the encoder head has a bias term. 425 | """ 426 | 427 | def __init__( 428 | self, 429 | model: Union[str, tr.PreTrainedModel], 430 | layer_pooling_strategy: str = "last", 431 | subword_pooling_strategy: str = "sparse", 432 | output_layers: Sequence[int] = (-4, -3, -2, -1), 433 | fine_tune: bool = True, 434 | return_all: bool = False, 435 | projection_size: Optional[int] = None, 436 | activation_layer: Optional[torch.nn.Module] = None, 437 | dropout: float = 0.1, 438 | bias: bool = True, 439 | *args, 440 | **kwargs, 441 | ) -> None: 442 | super().__init__( 443 | model, 444 | layer_pooling_strategy, 445 | subword_pooling_strategy, 446 | output_layers, 447 | fine_tune, 448 | return_all, 449 | *args, 450 | **kwargs, 451 | ) 452 | self.encoder = Encoder( 453 | self.transformer_hidden_size, 454 | projection_size, 455 | activation_layer, 456 | dropout, 457 | bias, 458 | ) 459 | 460 | def forward( 461 | self, 462 | input_ids: torch.Tensor, 463 | attention_mask: Optional[torch.Tensor] = None, 464 | token_type_ids: Optional[torch.Tensor] = None, 465 | scatter_offsets: Optional[torch.Tensor] = None, 466 | sparse_offsets: Optional[Mapping[str, Any]] = None, 467 | **kwargs, 468 | ) -> TransformersEmbedderOutput: 469 | """ 470 | Forward method of the PyTorch module. 471 | 472 | Args: 473 | input_ids (`torch.Tensor`): 474 | Input ids for the transformer model. 475 | attention_mask (`torch.Tensor`, optional): 476 | Attention mask for the transformer model. 477 | token_type_ids (`torch.Tensor`, optional): 478 | Token type ids for the transformer model. 479 | scatter_offsets (`torch.Tensor`, optional): 480 | Offsets of the sub-word, used to reconstruct the word embeddings using the ``scatter`` method. sparse_offsets (`Mapping[str, Any]`, optional): Offsets of the sub-word, used to reconstruct the word embeddings using the ``sparse`` method. 481 | 482 | Returns: 483 | `TransformersEmbedderOutput`: 484 | Word level embeddings plus the output of the transformer model. 485 | """ 486 | transformers_kwargs = { 487 | "input_ids": input_ids, 488 | "attention_mask": attention_mask, 489 | "token_type_ids": token_type_ids, 490 | "scatter_offsets": scatter_offsets, 491 | "sparse_offsets": sparse_offsets, 492 | **kwargs, 493 | } 494 | transformer_output = super().forward(**transformers_kwargs) 495 | encoder_output = self.encoder(transformer_output.word_embeddings) 496 | transformer_output.word_embeddings = encoder_output 497 | return transformer_output 498 | 499 | @property 500 | def hidden_size(self) -> int: 501 | """ 502 | Returns the hidden size of the encoder head output. 503 | 504 | Returns: 505 | `int`: The encoder projection size (``self.encoder.projection_size``). 
506 | """ 507 | return self.encoder.projection_size 508 | -------------------------------------------------------------------------------- /transformers_embedder/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riccorl/transformers-embedder/bacf4c5c89fb0fa6b550b1b60174cf15fd03d875/transformers_embedder/modules/__init__.py -------------------------------------------------------------------------------- /transformers_embedder/modules/encoder.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | 6 | class Encoder(torch.nn.Module): 7 | """ 8 | An encoder module for the `TransformersEmbedder` class. 9 | 10 | Args: 11 | transformer_hidden_size (`int`): 12 | The hidden size of the inner transformer. 13 | projection_size (`int`, `optional`, defaults to `None`): 14 | The size of the projection layer. 15 | activation_layer (`torch.nn.Module`, optional, defaults to `None`): 16 | Activation layer to use. If ``None``, no activation layer is used. 17 | dropout (`float`, `optional`, defaults to `0.1`): 18 | The dropout value. 19 | bias (`bool`, `optional`, defaults to `True`): 20 | Whether to use a bias. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | transformer_hidden_size: int, 26 | projection_size: Optional[int] = None, 27 | activation_layer: Optional[torch.nn.Module] = None, 28 | dropout: float = 0.1, 29 | bias: bool = True, 30 | ): 31 | super().__init__() 32 | self.projection_size = projection_size or transformer_hidden_size 33 | self.projection_layer = torch.nn.Linear( 34 | transformer_hidden_size, self.projection_size, bias=bias 35 | ) 36 | self.dropout_layer = torch.nn.Dropout(dropout) 37 | self.activation_layer = activation_layer 38 | 39 | def forward(self, x: torch.Tensor) -> torch.Tensor: 40 | """ 41 | Forward pass of the encoder. 42 | 43 | Args: 44 | x (`torch.Tensor`): 45 | The input tensor. 46 | 47 | Returns: 48 | `torch.Tensor`: The encoded tensor. 49 | """ 50 | x = self.projection_layer(self.dropout_layer(x)) 51 | if self.activation_layer is not None: 52 | x = self.activation_layer(x) 53 | return x 54 | -------------------------------------------------------------------------------- /transformers_embedder/modules/scalar_mix.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from torch.nn import ParameterList, Parameter 5 | 6 | # This code is taken from AllenNLP 7 | # https://github.com/allenai/allennlp/blob/main/allennlp/modules/scalar_mix.py 8 | 9 | 10 | class ScalarMix(torch.nn.Module): 11 | """ 12 | Computes a parameterised scalar mixture of N tensors, `mixture = gamma * sum(s_k * tensor_k)` 13 | where `s = softmax(w)`, with `w` and `gamma` scalar parameters. 14 | In addition, if `do_layer_norm=True` then apply layer normalization to each tensor 15 | before weighting. 
16 | """ 17 | 18 | def __init__( 19 | self, 20 | mixture_size: int, 21 | do_layer_norm: bool = False, 22 | initial_scalar_parameters: List[float] = None, 23 | trainable: bool = True, 24 | ) -> None: 25 | super().__init__() 26 | self.mixture_size = mixture_size 27 | self.do_layer_norm = do_layer_norm 28 | 29 | if initial_scalar_parameters is None: 30 | initial_scalar_parameters = [0.0] * mixture_size 31 | elif len(initial_scalar_parameters) != mixture_size: 32 | raise ValueError( 33 | f"Length of `initial_scalar_parameters` {initial_scalar_parameters} differs " 34 | f"from `mixture_size` {mixture_size}" 35 | ) 36 | 37 | self.scalar_parameters = ParameterList( 38 | [ 39 | Parameter( 40 | torch.FloatTensor([initial_scalar_parameters[i]]), 41 | requires_grad=trainable, 42 | ) 43 | for i in range(mixture_size) 44 | ] 45 | ) 46 | self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable) 47 | 48 | def forward( 49 | self, tensors: List[torch.Tensor], mask: torch.BoolTensor = None 50 | ) -> torch.Tensor: 51 | """ 52 | Compute a weighted average of the `tensors`. The input tensors caa be any shape 53 | with at least two dimensions, but must all be the same shape. 54 | When `do_layer_norm=True`, the `mask` is a required input. If the `tensors` are 55 | dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned 56 | `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape 57 | `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`. 58 | When `do_layer_norm=False` the `mask` is ignored. 59 | """ 60 | if len(tensors) != self.mixture_size: 61 | raise ValueError( 62 | f"{len(tensors)} tensors were passed, but the module was initialized to " 63 | f"mix {self.mixture_size} tensors." 64 | ) 65 | 66 | def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked): 67 | tensor_masked = tensor * broadcast_mask 68 | mean = torch.sum(tensor_masked) / num_elements_not_masked 69 | variance = ( 70 | torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) 71 | / num_elements_not_masked 72 | ) 73 | return (tensor - mean) / torch.sqrt(variance + 1e-4) 74 | 75 | normed_weights = torch.nn.functional.softmax( 76 | torch.cat([parameter for parameter in self.scalar_parameters]), dim=0 77 | ) 78 | normed_weights = torch.split(normed_weights, split_size_or_sections=1) 79 | 80 | if not self.do_layer_norm: 81 | pieces = [] 82 | for weight, tensor in zip(normed_weights, tensors): 83 | pieces.append(weight * tensor) 84 | return self.gamma * sum(pieces) 85 | 86 | else: 87 | assert mask is not None 88 | broadcast_mask = mask.unsqueeze(-1) 89 | input_dim = tensors[0].size(-1) 90 | num_elements_not_masked = torch.sum(mask) * input_dim 91 | 92 | pieces = [] 93 | for weight, tensor in zip(normed_weights, tensors): 94 | pieces.append( 95 | weight 96 | * _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked) 97 | ) 98 | return self.gamma * sum(pieces) 99 | -------------------------------------------------------------------------------- /transformers_embedder/tokenizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import UserDict 4 | from functools import partial 5 | from typing import List, Dict, Union, Any, Optional, Tuple, Set, Sequence, Mapping 6 | 7 | import transformers as tr 8 | from transformers import BatchEncoding 9 | from transformers.file_utils import PaddingStrategy 10 | from transformers.tokenization_utils_base import TruncationStrategy 
11 | 12 | from transformers_embedder import MODELS_WITH_STARTING_TOKEN, MODELS_WITH_DOUBLE_SEP 13 | from transformers_embedder import utils 14 | from transformers_embedder.utils import is_torch_available 15 | 16 | if is_torch_available(): 17 | import torch 18 | 19 | logger = utils.get_logger(__name__) 20 | utils.get_logger("transformers") 21 | 22 | 23 | class Tokenizer: 24 | """ 25 | A wrapper class for HuggingFace Tokenizer. 26 | 27 | Args: 28 | model (`str`, `transformers.PreTrainedTokenizer`): 29 | Language model name (or a transformer `PreTrainedTokenizer`. 30 | return_sparse_offsets (`bool`, optional, defaults to `True`): 31 | If `True`, the sparse offsets of the tokens in the input text are returned. To reduce 32 | memory usage, set this to `False` if you don't need them, e.g. you set the 33 | `subword_pooling_strategy` to `scatter` in the `TransformersEmbedder` model. 34 | """ 35 | 36 | def __init__( 37 | self, 38 | model: Union[str, tr.PreTrainedTokenizer], 39 | return_sparse_offsets: bool = True, 40 | *args, 41 | **kwargs, 42 | ): 43 | if isinstance(model, str): 44 | # init HuggingFace tokenizer 45 | self.huggingface_tokenizer = tr.AutoTokenizer.from_pretrained( 46 | model, *args, **kwargs 47 | ) 48 | # get config 49 | self.config = tr.AutoConfig.from_pretrained(model, *args, **kwargs) 50 | else: 51 | self.huggingface_tokenizer = model 52 | self.config = tr.AutoConfig.from_pretrained( 53 | self.huggingface_tokenizer.name_or_path, *args, **kwargs 54 | ) 55 | 56 | self.return_sparse_offsets = return_sparse_offsets 57 | 58 | # padding stuff 59 | # default, batch length is model max length 60 | self.subword_max_batch_len = self.huggingface_tokenizer.model_max_length 61 | self.word_max_batch_len = self.huggingface_tokenizer.model_max_length 62 | # padding ops 63 | self.padding_ops = {} 64 | # keys that will be converted in tensors 65 | self.to_tensor_inputs = set() 66 | 67 | def __len__(self): 68 | """Size of the full vocabulary with the added tokens.""" 69 | return len(self.huggingface_tokenizer) 70 | 71 | def __call__( 72 | self, 73 | text: Union[str, List[str], List[List[str]]], 74 | text_pair: Union[str, List[str], List[List[str]], None] = None, 75 | padding: Union[bool, str, PaddingStrategy] = False, 76 | truncation: Union[bool, str, TruncationStrategy] = False, 77 | max_length: Optional[int] = None, 78 | return_tensors: Optional[Union[bool, str]] = None, 79 | is_split_into_words: bool = False, 80 | additional_inputs: Optional[Dict[str, Any]] = None, 81 | *args, 82 | **kwargs, 83 | ) -> ModelInputs: 84 | """ 85 | Prepare the text in input for models that uses HuggingFace as embeddings. 86 | 87 | Args: 88 | text (`str`, `List[str]`, `List[List[str]]`, `List[List[Word]]`, `List[Word]`): 89 | Text or batch of text to be encoded. 90 | text_pair (`str`, `List[str]`, `List[List[str]]`, `List[List[Word]]`, `List[Word]`): 91 | Text or batch of text to be encoded. 92 | padding (`bool`, optional, defaults to `False`): 93 | If `True`, applies padding to the batch based on the maximum length of the batch. 94 | max_length (`int`, optional, defaults to `None`): 95 | If specified, truncates the input sequence to that value. Otherwise, 96 | uses the model max length. 97 | return_tensors (`bool`, optional, defaults to `None`): 98 | If `True`, the outputs is converted to `torch.Tensor` 99 | is_split_into_words (`bool`, optional, defaults to `False`): 100 | If `True` and the input is a string, the input is split on spaces. 
101 | additional_inputs (`Dict[str, Any]`, optional, defaults to `None`): 102 | Additional inputs to be passed to the model. 103 | 104 | Returns: 105 | `ModelInputs`: The inputs to the transformer model. 106 | """ 107 | # some checks before starting 108 | if return_tensors == "tf": 109 | raise ValueError( 110 | "`return_tensors='tf'` is not supported. Please use `return_tensors='pt'` " 111 | "or `return_tensors=True`." 112 | ) 113 | if return_tensors is True: 114 | return_tensors = "pt" 115 | if return_tensors is False: 116 | return_tensors = None 117 | 118 | # check if input is batched or a single sample 119 | is_batched = bool( 120 | isinstance(text, (list, tuple)) 121 | and text 122 | and ( 123 | (isinstance(text[0], (list, tuple)) and is_split_into_words) 124 | or isinstance(text[0], str) 125 | ) 126 | ) 127 | if not is_batched: # batch it 128 | text = [text] 129 | text_pair = [text_pair] if text_pair is not None else None 130 | 131 | # use huggingface tokenizer to encode the text 132 | model_inputs = self.huggingface_tokenizer( 133 | text, 134 | text_pair=text_pair, 135 | padding=padding, 136 | truncation=truncation, 137 | max_length=max_length, 138 | is_split_into_words=is_split_into_words, 139 | return_tensors=return_tensors, 140 | *args, 141 | **kwargs, 142 | ) 143 | # build the offsets used to pool the subwords 144 | scatter_offsets, sentence_lengths = self.build_scatter_offsets( 145 | model_inputs, 146 | return_tensors=return_tensors, 147 | there_is_text_pair=text_pair is not None, 148 | ) 149 | 150 | # convert to ModelInputs 151 | model_inputs = ModelInputs(**model_inputs) 152 | # add the offsets to the model inputs 153 | model_inputs.update( 154 | {"scatter_offsets": scatter_offsets, "sentence_lengths": sentence_lengths} 155 | ) 156 | 157 | if self.return_sparse_offsets: 158 | # build the data used to pool the subwords when in sparse mode 159 | bpe_info: Mapping[str, Any] = self.build_sparse_offsets( 160 | offsets=scatter_offsets, 161 | bpe_mask=model_inputs.attention_mask, 162 | words_per_sentence=sentence_lengths, 163 | ) 164 | # add the bpe info to the model inputs 165 | model_inputs["sparse_offsets"] = ModelInputs(**bpe_info) 166 | 167 | # we also update the maximum batch length, 168 | # both for subword and word level 169 | self.subword_max_batch_len = max(len(x) for x in model_inputs.input_ids) 170 | self.word_max_batch_len = max(x for x in model_inputs.sentence_lengths) 171 | 172 | # check if we need to convert other stuff to tensors 173 | if additional_inputs: 174 | model_inputs.update(additional_inputs) 175 | # check if there is a padding strategy 176 | if padding: 177 | missing_keys = set(additional_inputs.keys()) - set( 178 | self.padding_ops.keys() 179 | ) 180 | if missing_keys: 181 | raise ValueError( 182 | f"There are no padding strategies for the following keys: {missing_keys}. " 183 | "Please add one with `tokenizer.add_padding_ops()`." 184 | ) 185 | self.pad_batch(model_inputs) 186 | # convert them to tensors 187 | if return_tensors == "pt": 188 | self.to_tensor(model_inputs) 189 | 190 | return model_inputs 191 | 192 | def build_scatter_offsets( 193 | self, 194 | model_inputs: BatchEncoding, 195 | return_tensors: bool = True, 196 | there_is_text_pair: bool = False, 197 | ) -> Tuple: 198 | """ 199 | Build the offset tensor for the batch of inputs. 200 | 201 | Args: 202 | model_inputs (`BatchEncoding`): 203 | The inputs to the transformer model. 
204 | return_tensors (`bool`, optional, defaults to `True`): 205 | If `True`, the outputs is converted to `torch.Tensor` 206 | there_is_text_pair (`bool`, optional, defaults to `False`): 207 | If `True` `text_pair` is not None. 208 | 209 | Returns: 210 | `List[List[int]]` or `torch.Tensor`: The offsets of the sub-tokens. 211 | """ 212 | # output data structure 213 | offsets = [] 214 | sentence_lengths = [] 215 | # model_inputs should be the output of the HuggingFace tokenizer 216 | # it contains the word offsets to reconstruct the original tokens from the 217 | # sub-tokens 218 | for batch_index in range(len(model_inputs.input_ids)): 219 | word_ids = model_inputs.word_ids(batch_index) 220 | # it is slightly different from what we need, so here we make it compatible 221 | # with our subword pooling strategy 222 | # if the first token is a special token, we need to take it into account 223 | if self.has_starting_token: 224 | word_offsets = [0] + [ 225 | w + 1 if w is not None else w for w in word_ids[1:] 226 | ] 227 | # otherwise, we can just use word_ids as is 228 | else: 229 | word_offsets = word_ids 230 | 231 | # replace first None occurrence with sep_offset 232 | sep_index = word_offsets.index(None) 233 | 234 | # here we retrieve the max offset for the sample, which will be used as SEP offset 235 | # and also as padding value for the offsets 236 | sep_offset_value = max([w for w in word_offsets[:sep_index] if w is not None]) + 1 237 | 238 | word_offsets[sep_index] = sep_offset_value 239 | # if there is a text pair, we need to adjust the offsets for the second text 240 | if there_is_text_pair: 241 | # some models have two SEP tokens in between the two texts 242 | if self.has_double_sep: 243 | sep_index += 1 244 | sep_offset_value += 1 245 | word_offsets[sep_index] = sep_offset_value 246 | # keep the first offsets as is, adjust the second ones 247 | word_offsets = word_offsets[: sep_index + 1] + [ 248 | w + sep_offset_value if w is not None else w 249 | for w in word_offsets[sep_index + 1 :] 250 | ] 251 | # update again the sep_offset 252 | sep_offset_value = max([w for w in word_offsets if w is not None]) + 1 253 | # replace first None occurrence with sep_offset, it should be the last SEP 254 | sep_index = word_offsets.index(None) 255 | word_offsets[sep_index] = sep_offset_value 256 | # keep track of the maximum offset for padding 257 | offsets.append(word_offsets) 258 | sentence_lengths.append(sep_offset_value + 1) 259 | # replace remaining None occurrences with -1 260 | # the remaining None occurrences are the padding values 261 | offsets = [[o if o is not None else -1 for o in offset] for offset in offsets] 262 | # if return_tensor is True, we need to convert the offsets to tensors 263 | if return_tensors: 264 | offsets = torch.as_tensor(offsets) 265 | return offsets, sentence_lengths 266 | 267 | @staticmethod 268 | def build_sparse_offsets( 269 | offsets: torch.Tensor | Sequence[Sequence[int]], 270 | bpe_mask: torch.Tensor | Sequence[Sequence[int]], 271 | words_per_sentence: Sequence[int], 272 | ) -> Mapping[str, Any]: 273 | """Build tensors used as info for BPE pooling, starting from the BPE offsets. 274 | 275 | Args: 276 | offsets (`torch.Tensor` or `List[List[int]]`): 277 | The offsets to compute lengths from. 278 | bpe_mask (`torch.Tensor` or `List[List[int]]`): 279 | The attention mask at BPE level. 280 | words_per_sentence (`List[int]`): 281 | The sentence lengths, word-wise. 
282 | 283 | Returns: 284 | `Mapping[str, Any]`: Tensors used to construct the sparse one which pools the 285 | transformer encoding word-wise. 286 | """ 287 | if not isinstance(offsets, torch.Tensor): 288 | offsets: torch.Tensor = torch.as_tensor(offsets) 289 | if not isinstance(bpe_mask, torch.Tensor): 290 | bpe_mask: torch.Tensor = torch.as_tensor(bpe_mask) 291 | 292 | sentence_lengths: torch.Tensor = bpe_mask.sum(dim=1) 293 | 294 | # We want to build triplets as coordinates (document, word, bpe) 295 | # We start by creating the document index for each triplet 296 | document_indices = torch.arange(offsets.size(0)).repeat_interleave( 297 | sentence_lengths 298 | ) 299 | # then the word indices 300 | word_indices = offsets[offsets != -1] 301 | # lastly the bpe indices 302 | max_range: torch.Tensor = torch.arange(bpe_mask.shape[1]) 303 | bpe_indices: torch.LongTensor = torch.cat( 304 | [max_range[:i] for i in bpe_mask.sum(dim=1)], dim=0 305 | ).long() 306 | 307 | unique_words, word_lengths = torch.unique_consecutive( 308 | offsets, return_counts=True 309 | ) 310 | unpadded_word_lengths = word_lengths[unique_words != -1] 311 | 312 | # and their weight to be used as multiplication factors 313 | bpe_weights: torch.FloatTensor = ( 314 | (1 / unpadded_word_lengths).repeat_interleave(unpadded_word_lengths).float() 315 | ) 316 | 317 | sparse_indices = torch.stack( 318 | [document_indices, word_indices, bpe_indices], dim=0 319 | ) 320 | 321 | bpe_shape = torch.Size( 322 | ( 323 | bpe_mask.size(0), # batch_size 324 | max(words_per_sentence), # max number of words per sentence 325 | bpe_mask.size(1), # max bpe_number in batch wrt the sentence 326 | ) 327 | ) 328 | 329 | return dict( 330 | sparse_indices=sparse_indices, 331 | sparse_values=bpe_weights, 332 | sparse_size=bpe_shape, 333 | ) 334 | 335 | def pad_batch( 336 | self, 337 | batch: Union[ModelInputs, Dict[str, list]], 338 | max_length: Optional[int] = None, 339 | ) -> ModelInputs: 340 | """ 341 | Pad the batch to its maximum length or to the specified `max_length`. 342 | 343 | Args: 344 | batch (`Dict[str, list]`): 345 | The batch to pad. 346 | max_length (`int`, optional): 347 | Override maximum length of the batch. 348 | 349 | Returns: 350 | `Dict[str, list]`: The padded batch. 351 | """ 352 | if max_length: 353 | self.subword_max_batch_len = max_length 354 | self.word_max_batch_len = max_length 355 | else: 356 | # get maximum len inside a batch 357 | self.subword_max_batch_len = max(len(x) for x in batch["input_ids"]) 358 | self.word_max_batch_len = max(x for x in batch["sentence_lengths"]) 359 | 360 | for key in batch: 361 | if key in self.padding_ops: 362 | batch[key] = [self.padding_ops[key](b) for b in batch[key]] 363 | 364 | return ModelInputs(batch) 365 | 366 | def pad_sequence( 367 | self, 368 | sequence: Union[List, torch.Tensor], 369 | value: int, 370 | length: Union[int, str] = "subword", 371 | pad_to_left: bool = False, 372 | ) -> Union[List, torch.Tensor]: 373 | """ 374 | Pad the input to the specified length with the given value. 375 | 376 | Args: 377 | sequence (`List`, `torch.Tensor`): 378 | Element to pad, it can be either a `List` or a `torch.Tensor`. 379 | value (`int`): 380 | Value to use as padding. 381 | length (`int`, `str`, optional, defaults to `subword`): 382 | Length after pad. 383 | pad_to_left (`bool`, optional, defaults to `False`): 384 | If `True`, pads to the left, right otherwise. 385 | 386 | Returns: 387 | `List`, `torch.Tensor`: The padded sequence. 
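            For example, with ``value=0`` and ``length=5``, the sequence ``[1, 2, 3]`` becomes
            ``[1, 2, 3, 0, 0]`` (or ``[0, 0, 1, 2, 3]`` when ``pad_to_left=True``).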
388 | """ 389 | if length == "subword": 390 | length = self.subword_max_batch_len 391 | elif length == "word": 392 | length = self.word_max_batch_len 393 | else: 394 | if not isinstance(length, int): 395 | raise ValueError( 396 | f"`length` must be an `int`, `subword` or `word`. Current value is `{length}`" 397 | ) 398 | padding = [value] * abs(length - len(sequence)) 399 | if isinstance(sequence, torch.Tensor): 400 | if len(sequence.shape) > 1: 401 | raise ValueError( 402 | f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`" 403 | ) 404 | padding = torch.as_tensor(padding) 405 | if pad_to_left: 406 | if isinstance(sequence, torch.Tensor): 407 | return torch.cat((padding, sequence), -1) 408 | return padding + sequence 409 | if isinstance(sequence, torch.Tensor): 410 | return torch.cat((sequence, padding), -1) 411 | return sequence + padding 412 | 413 | def add_special_tokens( 414 | self, special_tokens_dict: Dict[str, Union[str, tr.AddedToken]] 415 | ) -> int: 416 | """ 417 | Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder. 418 | If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last 419 | index of the current vocabulary). 420 | 421 | Args: 422 | special_tokens_dict (`Dict`): 423 | The dictionary containing special tokens. Keys should be in 424 | the list of predefined special attributes: [``bos_token``, ``eos_token``, 425 | ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, 426 | ``additional_special_tokens``]. 427 | 428 | Returns: 429 | `int`: Number of tokens added to the vocabulary. 430 | """ 431 | return self.huggingface_tokenizer.add_special_tokens(special_tokens_dict) 432 | 433 | def add_padding_ops(self, key: str, value: Any, length: Union[int, str]): 434 | """ 435 | Add padding logic to custom fields. 436 | If the field is not in `self.to_tensor_inputs`, this method will add the key to it. 437 | 438 | Args: 439 | key (`str`): 440 | Name of the field in the tokenizer input. 441 | value (`Any`): 442 | Value to use for padding. 443 | length (`int`, `str`): 444 | Length to pad. It can be an `int`, or two string value 445 | - ``subword``: the element is padded to the batch max length relative to the subwords length 446 | - ``word``: the element is padded to the batch max length relative to the original word length 447 | """ 448 | if key not in self.to_tensor_inputs: 449 | self.to_tensor_inputs.add(key) 450 | self.padding_ops[key] = partial(self.pad_sequence, value=value, length=length) 451 | 452 | def add_to_tensor_inputs(self, names: Union[str, Sequence[str]]) -> Set[str]: 453 | """ 454 | Add these keys to the ones that will be converted in Tensors. 455 | 456 | Args: 457 | names (`str`, `set`): 458 | Name of the field (or fields) to convert to tensors. 459 | 460 | Returns: 461 | `set`: The set of keys that will be converted to tensors. 462 | """ 463 | if isinstance(names, str): 464 | names = {names} 465 | if not isinstance(names, set): 466 | names = set(names) 467 | self.to_tensor_inputs |= names 468 | return self.to_tensor_inputs 469 | 470 | def to_tensor(self, batch: Union[ModelInputs, List[dict], dict]) -> ModelInputs: 471 | """ 472 | Return the batch in input as Pytorch tensors. The fields that are converted in tensors are in 473 | `self.to_tensor_inputs`. By default, only the standard model inputs are converted. Use 474 | `self.add_to_tensor_inputs` to add custom fields. 475 | 476 | Args: 477 | batch (`List[dict]`, `dict`): 478 | Batch in input. 
479 | 480 | Returns: 481 | `ModelInputs`: The batch as tensor. 482 | """ 483 | # convert to tensor 484 | batch = { 485 | k: torch.as_tensor(v) 486 | if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor) 487 | else v 488 | for k, v in batch.items() 489 | } 490 | return ModelInputs(batch) 491 | 492 | @staticmethod 493 | def _clean_output(output: Union[List, Dict]) -> Dict: 494 | """ 495 | Clean before output. 496 | 497 | Args: 498 | output (:obj`List[dict]`, `dict`): 499 | The output to clean. 500 | 501 | Returns: 502 | `dict`: The cleaned output. 503 | """ 504 | # single sentence case, generalize 505 | if isinstance(output, dict): 506 | output = [output] 507 | # convert list to dict 508 | output = {k: [d[k] for d in output] for k in output[0]} 509 | return output 510 | 511 | @staticmethod 512 | def _get_token_type_id(config: tr.PretrainedConfig) -> int: 513 | """ 514 | Get token type id. Useful when dealing with models that don't accept 1 as type id. 515 | Args: 516 | config (`transformers.PretrainedConfig`): 517 | Transformer config. 518 | 519 | Returns: 520 | `int`: Correct token type id for that model. 521 | """ 522 | if hasattr(config, "type_vocab_size"): 523 | return 1 if config.type_vocab_size == 2 else 0 524 | return 0 525 | 526 | @staticmethod 527 | def _type_checking(text: Any, text_pair: Any): 528 | """ 529 | Checks type of the inputs. 530 | 531 | Args: 532 | text (`Any`): 533 | Text to check. 534 | text_pair (`Any`): 535 | Text pair to check. 536 | 537 | Returns: 538 | """ 539 | 540 | def is_type_correct(text_to_check: Any) -> bool: 541 | """ 542 | Check if input type is correct, returning a boolean value. 543 | 544 | Args: 545 | text_to_check (`Any`): 546 | text to check. 547 | 548 | Returns: 549 | :obj`bool`: :obj`True` if the type is correct. 550 | """ 551 | return ( 552 | text_to_check is None 553 | or isinstance(text_to_check, str) 554 | or ( 555 | isinstance(text_to_check, (list, tuple)) 556 | and ( 557 | len(text_to_check) == 0 558 | or ( 559 | isinstance(text_to_check[0], str) 560 | or ( 561 | isinstance(text_to_check[0], (list, tuple)) 562 | and ( 563 | len(text_to_check[0]) == 0 564 | or isinstance(text_to_check[0][0], str) 565 | ) 566 | ) 567 | ) 568 | ) 569 | ) 570 | ) 571 | 572 | if not is_type_correct(text): 573 | raise AssertionError( 574 | "text input must of type `str` (single example), `List[str]` (batch or single " 575 | "pre-tokenized example) or `List[List[str]]` (batch of pre-tokenized examples)." 576 | ) 577 | 578 | if not is_type_correct(text_pair): 579 | raise AssertionError( 580 | "text_pair input must be `str` (single example), `List[str]` (batch or single " 581 | "pre-tokenized example) or `List[List[str]]` (batch of pre-tokenized examples)." 582 | ) 583 | 584 | @property 585 | def num_special_tokens(self) -> int: 586 | """ 587 | Return the number of special tokens the model needs. 588 | It assumes the input contains both sentences (`text` and `text_pair`). 589 | 590 | Returns: 591 | `int`: the number of special tokens. 
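            For example, BERT-style inputs ``[CLS] text [SEP] text_pair [SEP]`` need 3 special
            tokens, while RoBERTa-style inputs ``<s> text </s> </s> text_pair </s>``, which use a
            double separator between the two texts, need 4.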
592 | """ 593 | if isinstance( 594 | self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP 595 | ) and isinstance(self.huggingface_tokenizer, MODELS_WITH_STARTING_TOKEN): 596 | return 4 597 | if isinstance( 598 | self.huggingface_tokenizer, 599 | (MODELS_WITH_DOUBLE_SEP, MODELS_WITH_STARTING_TOKEN), 600 | ): 601 | return 3 602 | return 2 603 | 604 | @property 605 | def has_double_sep(self): 606 | """True if tokenizer uses two SEP tokens.""" 607 | return isinstance(self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP) 608 | 609 | @property 610 | def has_starting_token(self): 611 | """True if tokenizer uses a starting token.""" 612 | return isinstance(self.huggingface_tokenizer, MODELS_WITH_STARTING_TOKEN) 613 | 614 | @property 615 | def token_type_id(self): 616 | """Padding token.""" 617 | return self._get_token_type_id(self.config) 618 | 619 | @property 620 | def pad_token(self): 621 | """Padding token.""" 622 | return self.huggingface_tokenizer.pad_token 623 | 624 | @property 625 | def pad_token_id(self): 626 | """Padding token id.""" 627 | return self.huggingface_tokenizer.pad_token_id 628 | 629 | @property 630 | def unk_token(self): 631 | """Unknown token.""" 632 | return self.huggingface_tokenizer.unk_token 633 | 634 | @property 635 | def unk_token_id(self): 636 | """Unknown token id.""" 637 | return self.huggingface_tokenizer.unk_token_id 638 | 639 | @property 640 | def cls_token(self): 641 | """ 642 | Classification token. 643 | To extract a summary of an input sequence leveraging self-attention along the 644 | full depth of the model. 645 | """ 646 | return self.huggingface_tokenizer.cls_token 647 | 648 | @property 649 | def cls_token_id(self): 650 | """ 651 | Classification token id. 652 | To extract a summary of an input sequence leveraging self-attention along the 653 | full depth of the model. 
654 | """ 655 | return self.huggingface_tokenizer.cls_token_id 656 | 657 | @property 658 | def sep_token(self): 659 | """Separation token, to separate context and query in an input sequence.""" 660 | return self.huggingface_tokenizer.sep_token 661 | 662 | @property 663 | def sep_token_id(self): 664 | """Separation token id, to separate context and query in an input sequence.""" 665 | return self.huggingface_tokenizer.sep_token_id 666 | 667 | @property 668 | def bos_token(self): 669 | """Beginning of sentence token.""" 670 | return self.huggingface_tokenizer.bos_token 671 | 672 | @property 673 | def bos_token_id(self): 674 | """Beginning of sentence token id.""" 675 | return self.huggingface_tokenizer.bos_token_id 676 | 677 | @property 678 | def eos_token(self): 679 | """End of sentence token.""" 680 | return self.huggingface_tokenizer.eos_token 681 | 682 | @property 683 | def eos_token_id(self): 684 | """End of sentence token id.""" 685 | return self.huggingface_tokenizer.eos_token_id 686 | 687 | 688 | class ModelInputs(UserDict): 689 | """Model input dictionary wrapper.""" 690 | 691 | def __getattr__(self, item: str): 692 | try: 693 | return self.data[item] 694 | except KeyError: 695 | raise AttributeError(f"`ModelInputs` has no attribute `{item}`") 696 | 697 | def __getitem__(self, item: str) -> Any: 698 | return self.data[item] 699 | 700 | def __getstate__(self): 701 | return {"data": self.data} 702 | 703 | def __setstate__(self, state): 704 | if "data" in state: 705 | self.data = state["data"] 706 | 707 | def keys(self): 708 | """A set-like object providing a view on D's keys.""" 709 | return self.data.keys() 710 | 711 | def values(self): 712 | """An object providing a view on D's values.""" 713 | return self.data.values() 714 | 715 | def items(self): 716 | """A set-like object providing a view on D's items.""" 717 | return self.data.items() 718 | 719 | def to(self, device: Union[str, torch.device]) -> ModelInputs: 720 | """ 721 | Send all tensors values to device. 722 | 723 | Args: 724 | device (`str` or `torch.device`): The device to put the tensors on. 725 | 726 | Returns: 727 | :class:`tokenizers.ModelInputs`: The same instance of :class:`~tokenizers.ModelInputs` 728 | after modification. 729 | """ 730 | if isinstance(device, (str, torch.device, int)): 731 | self.data = { 732 | k: v.to(device=device) if hasattr(v, "to") else v 733 | for k, v in self.data.items() 734 | } 735 | else: 736 | logger.warning( 737 | f"Attempting to cast to another type, {str(device)}. This is not supported." 738 | ) 739 | return self 740 | -------------------------------------------------------------------------------- /transformers_embedder/utils.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import logging 3 | 4 | _torch_available = importlib.util.find_spec("torch") is not None 5 | 6 | 7 | def is_torch_available(): 8 | """Check if PyTorch is available.""" 9 | return _torch_available 10 | 11 | 12 | def get_logger(name: str) -> logging.Logger: 13 | """ 14 | Return the logger of the given name. 15 | 16 | Args: 17 | name (`str`): The name of the logger. 18 | 19 | Returns: 20 | `logging.Logger`: The logger of the given name. 21 | """ 22 | return logging.getLogger(name) 23 | --------------------------------------------------------------------------------