├── .deepsource.toml
├── .flake8
├── .github
│   ├── conda
│   │   ├── build.sh
│   │   └── meta.yaml
│   ├── dependabot.yml
│   └── workflows
│       ├── black.yml
│       ├── python-package.yml
│       ├── python-publish-conda.yml
│       ├── python-publish-pypi.yml
│       └── website.yml
├── .gitignore
├── .pre-commit-config.yaml
├── MANIFEST.in
├── README.md
├── docs
│   ├── gen_ref_pages.py
│   └── index.md
├── mkdocs.yml
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
├── setup.py
└── transformers_embedder
    ├── __init__.py
    ├── embedder.py
    ├── modules
    │   ├── __init__.py
    │   ├── encoder.py
    │   └── scalar_mix.py
    ├── tokenizer.py
    └── utils.py
/.deepsource.toml:
--------------------------------------------------------------------------------
1 | version = 1
2 |
3 | [[analyzers]]
4 | name = "python"
5 | enabled = true
6 |
7 | [analyzers.meta]
8 | runtime_version = "3.x.x"
9 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E266, E501, W503, F403, F401, E402
3 | max-line-length = 88
4 | max-complexity = 18
5 | select = B,C,E,F,W,T4,B9
6 |
--------------------------------------------------------------------------------
/.github/conda/build.sh:
--------------------------------------------------------------------------------
1 | $PYTHON setup.py install # Python command to install the script.
--------------------------------------------------------------------------------
/.github/conda/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "transformers-embedder" %}
2 | {% set data = load_setup_py_data() %}
3 |
4 | package:
5 | name: "{{ name|lower }}"
6 | version: "{{ TRANSFORMERS_EMBEDDER_VERSION }}"
7 |
8 | about:
9 | home: {{ data['url'] }}
10 | license: {{ data['license'] }}
11 | summary: {{ data['description'] }}
12 |
13 | requirements:
14 | build:
15 | - python
16 | - transformers>=4.3,<4.12
17 | - spacy>=3.0,<3.2
18 | run:
19 | - python
20 | - transformers>=4.3,<4.12
21 | - spacy>=3.0,<3.2
22 |
23 | source:
24 | path: ../../
25 |
26 | build:
27 | noarch: python
28 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: pip
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | time: "04:00"
8 | open-pull-requests-limit: 10
9 |
--------------------------------------------------------------------------------
/.github/workflows/black.yml:
--------------------------------------------------------------------------------
1 | name: Check Code Quality
2 |
3 | on: pull_request
4 |
5 | jobs:
6 | lint:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v2
10 | - uses: psf/black@stable
11 | with:
12 | options: "-l 110"
13 | - uses: actions/checkout@v2
14 | - uses: actions/setup-python@v2
15 | with:
16 | python-version: "3.9"
17 | - name: Run flake8
18 | uses: julianwachholz/flake8-action@v2
19 | with:
20 | checkName: "Python Lint"
21 | path: ./transformers_embedder
22 | plugins: "pep8-naming==0.12.1 flake8-comprehensions==3.6.1"
23 | config: .flake8
24 | env:
25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
26 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Build Python package
5 |
6 | on:
7 | push:
8 | branches: [$default-branch]
9 | pull_request:
10 | branches: [$default-branch]
11 |
12 | jobs:
13 | build:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | python-version: [3.6, 3.7, 3.8, 3.9]
19 |
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Set up Python ${{ matrix.python-version }}
23 | uses: actions/setup-python@v2
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | python -m pip install flake8 pytest
30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31 | - name: Lint with flake8
32 | run: |
33 | # stop the build if there are Python syntax errors or undefined names
34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
37 | # - name: Test with pytest
38 | # run: |
39 | # pytest
40 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish-conda.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package to Conda
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | env:
8 | ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
9 |
10 | jobs:
11 | publish:
12 | runs-on: ubuntu-latest
13 | defaults:
14 | run:
15 | shell: bash -l {0}
16 |
17 | steps:
18 | - name: Checkout repository
19 | uses: actions/checkout@v2
20 |
21 | - name: Install miniconda
22 | uses: conda-incubator/setup-miniconda@v2
23 | with:
24 | auto-update-conda: true
25 | auto-activate-base: false
26 | python-version: 3.9
27 | activate-environment: "build-transformers-embedder"
28 | channels: riccorl
29 |
30 | - name: Setup conda env
31 | run: |
32 | conda install -c defaults anaconda-client conda-build
33 | - name: Extract version
34 | run: echo "TRANSFORMERS_EMBEDDER_VERSION=`python setup.py --version`" >> $GITHUB_ENV
35 |
36 | - name: Build conda packages
37 | run: |
38 | conda info
39 | conda list
40 | conda-build -c riccorl -c conda-forge -c huggingface .github/conda
41 |
42 | - name: Upload to Anaconda
43 | run: anaconda upload `conda-build .github/conda --output` --force
44 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish-pypi.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package to PyPi
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | publish:
9 | runs-on: ubuntu-latest
10 |
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: "3.x"
17 |
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | pip install build
22 |
23 | - name: Extract version
24 | run: echo "TRANSFORMERS_EMBEDDER_VERSION=`python setup.py --version`" >> $GITHUB_ENV
25 |
26 | - name: Build package
27 | run: python -m build
28 |
29 | - name: Publish package
30 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
31 | with:
32 | user: ${{ secrets.PYPI_USERNAME }}
33 | password: ${{ secrets.PYPI_PASSWORD }}
34 |
--------------------------------------------------------------------------------
/.github/workflows/website.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 | on:
3 | push:
4 | branches:
5 | - master
6 | - main
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - uses: actions/setup-python@v2
13 | with:
14 | python-version: 3.x
15 | - run: pip install mkdocs-material mkdocs-literate-nav mkdocstrings[python] mkdocs-section-index mkdocs-gen-files
16 | - run: mkdocs gh-deploy --force
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # custom
2 | stuff
3 | /test.ipynb
4 | /test.py
5 |
6 | # Fleet
7 | .fleet
8 |
9 | # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+all,vscode,macos,linux,windows
10 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+all,vscode,macos,linux,windows
11 |
12 | ### Linux ###
13 | *~
14 |
15 | # temporary files which can be created if a process still has a handle open of a deleted file
16 | .fuse_hidden*
17 |
18 | # KDE directory preferences
19 | .directory
20 |
21 | # Linux trash folder which might appear on any partition or disk
22 | .Trash-*
23 |
24 | # .nfs files are created when an open file is removed but is still being accessed
25 | .nfs*
26 |
27 | ### macOS ###
28 | # General
29 | .DS_Store
30 | .AppleDouble
31 | .LSOverride
32 |
33 | # Icon must end with two \r
34 | Icon
35 |
36 |
37 | # Thumbnails
38 | ._*
39 |
40 | # Files that might appear in the root of a volume
41 | .DocumentRevisions-V100
42 | .fseventsd
43 | .Spotlight-V100
44 | .TemporaryItems
45 | .Trashes
46 | .VolumeIcon.icns
47 | .com.apple.timemachine.donotpresent
48 |
49 | # Directories potentially created on remote AFP share
50 | .AppleDB
51 | .AppleDesktop
52 | Network Trash Folder
53 | Temporary Items
54 | .apdisk
55 |
56 | ### PyCharm+all ###
57 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
58 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
59 |
60 | # User-specific stuff
61 | .idea/**/workspace.xml
62 | .idea/**/tasks.xml
63 | .idea/**/usage.statistics.xml
64 | .idea/**/dictionaries
65 | .idea/**/shelf
66 |
67 | # Generated files
68 | .idea/**/contentModel.xml
69 |
70 | # Sensitive or high-churn files
71 | .idea/**/dataSources/
72 | .idea/**/dataSources.ids
73 | .idea/**/dataSources.local.xml
74 | .idea/**/sqlDataSources.xml
75 | .idea/**/dynamic.xml
76 | .idea/**/uiDesigner.xml
77 | .idea/**/dbnavigator.xml
78 |
79 | # Gradle
80 | .idea/**/gradle.xml
81 | .idea/**/libraries
82 |
83 | # Gradle and Maven with auto-import
84 | # When using Gradle or Maven with auto-import, you should exclude module files,
85 | # since they will be recreated, and may cause churn. Uncomment if using
86 | # auto-import.
87 | # .idea/artifacts
88 | # .idea/compiler.xml
89 | # .idea/jarRepositories.xml
90 | # .idea/modules.xml
91 | # .idea/*.iml
92 | # .idea/modules
93 | # *.iml
94 | # *.ipr
95 |
96 | # CMake
97 | cmake-build-*/
98 |
99 | # Mongo Explorer plugin
100 | .idea/**/mongoSettings.xml
101 |
102 | # File-based project format
103 | *.iws
104 |
105 | # IntelliJ
106 | out/
107 |
108 | # mpeltonen/sbt-idea plugin
109 | .idea_modules/
110 |
111 | # JIRA plugin
112 | atlassian-ide-plugin.xml
113 |
114 | # Cursive Clojure plugin
115 | .idea/replstate.xml
116 |
117 | # Crashlytics plugin (for Android Studio and IntelliJ)
118 | com_crashlytics_export_strings.xml
119 | crashlytics.properties
120 | crashlytics-build.properties
121 | fabric.properties
122 |
123 | # Editor-based Rest Client
124 | .idea/httpRequests
125 |
126 | # Android studio 3.1+ serialized cache file
127 | .idea/caches/build_file_checksums.ser
128 |
129 | ### PyCharm+all Patch ###
130 | # Ignores the whole .idea folder and all .iml files
131 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
132 |
133 | .idea/
134 |
135 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
136 |
137 | *.iml
138 | modules.xml
139 | .idea/misc.xml
140 | *.ipr
141 |
142 | # Sonarlint plugin
143 | .idea/sonarlint
144 |
145 | ### Python ###
146 | # Byte-compiled / optimized / DLL files
147 | __pycache__/
148 | *.py[cod]
149 | *$py.class
150 |
151 | # C extensions
152 | *.so
153 |
154 | # Distribution / packaging
155 | .Python
156 | build/
157 | develop-eggs/
158 | dist/
159 | downloads/
160 | eggs/
161 | .eggs/
162 | lib/
163 | lib64/
164 | parts/
165 | sdist/
166 | var/
167 | wheels/
168 | pip-wheel-metadata/
169 | share/python-wheels/
170 | *.egg-info/
171 | .installed.cfg
172 | *.egg
173 | MANIFEST
174 |
175 | # PyInstaller
176 | # Usually these files are written by a python script from a template
177 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
178 | *.manifest
179 | *.spec
180 |
181 | # Installer logs
182 | pip-log.txt
183 | pip-delete-this-directory.txt
184 |
185 | # Unit test / coverage reports
186 | htmlcov/
187 | .tox/
188 | .nox/
189 | .coverage
190 | .coverage.*
191 | .cache
192 | nosetests.xml
193 | coverage.xml
194 | *.cover
195 | *.py,cover
196 | .hypothesis/
197 | .pytest_cache/
198 | pytestdebug.log
199 |
200 | # Translations
201 | *.mo
202 | *.pot
203 |
204 | # Django stuff:
205 | *.log
206 | local_settings.py
207 | db.sqlite3
208 | db.sqlite3-journal
209 |
210 | # Flask stuff:
211 | instance/
212 | .webassets-cache
213 |
214 | # Scrapy stuff:
215 | .scrapy
216 |
217 | # Sphinx documentation
218 | docs/_build/
219 | doc/_build/
220 |
221 | # PyBuilder
222 | target/
223 |
224 | # Jupyter Notebook
225 | .ipynb_checkpoints
226 |
227 | # IPython
228 | profile_default/
229 | ipython_config.py
230 |
231 | # pyenv
232 | .python-version
233 |
234 | # pipenv
235 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
236 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
237 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
238 | # install all needed dependencies.
239 | #Pipfile.lock
240 |
241 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
242 | __pypackages__/
243 |
244 | # Celery stuff
245 | celerybeat-schedule
246 | celerybeat.pid
247 |
248 | # SageMath parsed files
249 | *.sage.py
250 |
251 | # Environments
252 | .env
253 | .venv
254 | env/
255 | venv/
256 | ENV/
257 | env.bak/
258 | venv.bak/
259 | pythonenv*
260 |
261 | # Spyder project settings
262 | .spyderproject
263 | .spyproject
264 |
265 | # Rope project settings
266 | .ropeproject
267 |
268 | # mkdocs documentation
269 | /site
270 |
271 | # mypy
272 | .mypy_cache/
273 | .dmypy.json
274 | dmypy.json
275 |
276 | # Pyre type checker
277 | .pyre/
278 |
279 | # pytype static type analyzer
280 | .pytype/
281 |
282 | # profiling data
283 | .prof
284 |
285 | ### vscode ###
286 | .vscode
287 | .vscode/*
288 | !.vscode/settings.json
289 | !.vscode/tasks.json
290 | !.vscode/launch.json
291 | !.vscode/extensions.json
292 | *.code-workspace
293 |
294 | ### Windows ###
295 | # Windows thumbnail cache files
296 | Thumbs.db
297 | Thumbs.db:encryptable
298 | ehthumbs.db
299 | ehthumbs_vista.db
300 |
301 | # Dump file
302 | *.stackdump
303 |
304 | # Folder config file
305 | [Dd]esktop.ini
306 |
307 | # Recycle Bin used on file shares
308 | $RECYCLE.BIN/
309 |
310 | # Windows Installer files
311 | *.cab
312 | *.msi
313 | *.msix
314 | *.msm
315 | *.msp
316 |
317 | # Windows shortcuts
318 | *.lnk
319 |
320 | # End of https://www.toptal.com/developers/gitignore/api/python,pycharm+all,vscode,macos,linux,windows
321 |
322 | /stuff/
323 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/ambv/black
3 | rev: 21.9b0
4 | hooks:
5 | - id: black
6 | - repo: https://gitlab.com/pycqa/flake8
7 | rev: 3.9.2
8 | hooks:
9 | - id: flake8
10 |
11 | default_language_version:
12 | python: python3
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Transformers Embedder
4 |
5 | [](https://github.dev/Riccorl/transformers-embedder)
6 | [](https://pytorch.org/)
7 | [](https://huggingface.co/transformers/)
8 | [](https://github.com/psf/black)
9 |
10 | [](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml)
11 | [](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml)
12 | [](https://github.com/Riccorl/transformers-embedder/releases)
13 | [](https://anaconda.org/riccorl/transformers-embedder)
14 | [](https://deepsource.io/gh/Riccorl/transformers-embedder/?ref=repository-badge)
15 |
16 |
17 |
18 | A Word Level Transformer layer based on PyTorch and 🤗 Transformers.
19 |
20 | ## How to use
21 |
22 | Install the library from [PyPI](https://pypi.org/project/transformers-embedder):
23 |
24 | ```bash
25 | pip install transformers-embedder
26 | ```
27 |
28 | or from [Conda](https://anaconda.org/riccorl/transformers-embedder):
29 |
30 | ```bash
31 | conda install -c riccorl transformers-embedder
32 | ```
33 |
34 | It offers a PyTorch layer and a tokenizer that support almost every pretrained model from the Hugging Face
35 | [🤗 Transformers](https://huggingface.co/transformers/) library. Here is a quick example:
36 |
37 | ```python
38 | import transformers_embedder as tre
39 |
40 | tokenizer = tre.Tokenizer("bert-base-cased")
41 |
42 | model = tre.TransformersEmbedder(
43 | "bert-base-cased", subword_pooling_strategy="sparse", layer_pooling_strategy="mean"
44 | )
45 |
46 | example = "This is a sample sentence"
47 | inputs = tokenizer(example, return_tensors=True)
48 | ```
49 |
50 | ```text
51 | {
52 | 'input_ids': tensor([[ 101, 1188, 1110, 170, 6876, 5650, 102]]),
53 | 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]),
54 | 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]])
55 | 'scatter_offsets': tensor([[0, 1, 2, 3, 4, 5, 6]]),
56 | 'sparse_offsets': {
57 | 'sparse_indices': tensor(
58 | [
59 | [0, 0, 0, 0, 0, 0, 0],
60 | [0, 1, 2, 3, 4, 5, 6],
61 | [0, 1, 2, 3, 4, 5, 6]
62 | ]
63 | ),
64 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]),
65 | 'sparse_size': torch.Size([1, 7, 7])
66 | },
67 | 'sentence_length': 7 # with special tokens included
68 | }
69 | ```
70 |
71 | ```python
72 | outputs = model(**inputs)
73 | ```
74 |
75 | ```text
76 | # outputs.word_embeddings[:, 1:-1].shape  # remove [CLS] and [SEP]
77 | torch.Size([1, 5, 768])
78 | # len(example.split())
79 | 5
80 | ```
81 |
82 | ## Info
83 |
84 | One of the annoyances of using transformer-based models is that it is not trivial to compute word embeddings
85 | from the sub-token embeddings they output. With this API it is as easy as using 🤗 Transformers directly to get
86 | word-level embeddings from virtually every transformer model it supports.
87 |
88 | ### Model
89 |
90 | #### Subword Pooling Strategy
91 |
92 | The `TransformersEmbedder` class offers 3 ways to get the embeddings:
93 |
94 | - `subword_pooling_strategy="sparse"`: computes the mean of the embeddings of the sub-tokens of each word
95 | (i.e. the embeddings of the sub-tokens are pooled together) using a sparse matrix multiplication. This is
96 | the default strategy (a sketch of the idea follows the table below).
97 | - `subword_pooling_strategy="scatter"`: computes the mean of the embeddings of the sub-tokens of each word
98 | using a scatter-gather operation. It is not deterministic, but it works with ONNX export.
99 | - `subword_pooling_strategy="none"`: returns the raw output of the transformer model without sub-token pooling.
100 |
101 | Here is a little feature table:
102 |
103 | | | Pooling | Deterministic | ONNX |
104 | |-------------|:------------------:|:------------------:|:------------------:|
105 | | **Sparse** | :white_check_mark: | :white_check_mark: | :x: |
106 | | **Scatter** | :white_check_mark: | :x: | :white_check_mark: |
107 | | **None** | :x: | :white_check_mark: | :white_check_mark: |
108 |
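A minimal sketch of the idea behind the `sparse` strategy (not the library's exact implementation): the tokenizer's `sparse_offsets` describe a `[batch, num_words, num_subwords]` sparse pooling matrix whose rows average the sub-token vectors belonging to each word, so word embeddings come from a single batched sparse-dense matrix multiplication. The indices and values below reuse the example output above; the random `768`-dimensional tensor stands in for the transformer output.

```python
import torch

# values taken from the tokenizer output shown above
sparse_indices = torch.tensor(
    [
        [0, 0, 0, 0, 0, 0, 0],  # batch index
        [0, 1, 2, 3, 4, 5, 6],  # word index
        [0, 1, 2, 3, 4, 5, 6],  # sub-token index
    ]
)
sparse_values = torch.ones(7)  # 1 / num_subtokens per entry (here every word is a single sub-token)
pooling_matrix = torch.sparse_coo_tensor(sparse_indices, sparse_values, (1, 7, 7))

subword_embeddings = torch.randn(1, 7, 768)  # stand-in for the transformer output
word_embeddings = torch.bmm(pooling_matrix, subword_embeddings)
print(word_embeddings.shape)  # torch.Size([1, 7, 768])
```

When a word is split into several sub-tokens, the corresponding `sparse_values` become fractions (see the `0.5` entries in the custom-fields example further down), so the multiplication averages the sub-token vectors of that word.
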
109 | #### Layer Pooling Strategy
110 |
111 | There are also multiple types of output you can get using the `layer_pooling_strategy` parameter:
112 |
113 | - `layer_pooling_strategy="last"`: returns the last hidden state of the transformer model
114 | - `layer_pooling_strategy="concat"`: returns the concatenation of the selected `output_layers` of the
115 | transformer model
116 | - `layer_pooling_strategy="sum"`: returns the sum of the selected `output_layers` of the transformer model
117 | - `layer_pooling_strategy="mean"`: returns the average of the selected `output_layers` of the transformer model
118 | - `layer_pooling_strategy="scalar_mix"`: returns the output of a parameterised scalar mixture layer of the
119 | selected `output_layers` of the transformer model
120 |
121 | If you also want all the outputs from the HuggingFace model, you can set `return_all=True` to get them, as shown in the example after the class signature below.
122 |
123 | ```python
124 | class TransformersEmbedder(torch.nn.Module):
125 | def __init__(
126 | self,
127 | model: Union[str, tr.PreTrainedModel],
128 | subword_pooling_strategy: str = "sparse",
129 | layer_pooling_strategy: str = "last",
130 | output_layers: Tuple[int] = (-4, -3, -2, -1),
131 | fine_tune: bool = True,
132 | return_all: bool = True,
133 | )
134 | ```
135 |
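For example, a possible configuration (the parameter values here are purely illustrative) that mixes the last four hidden layers with a learned scalar mixture and also keeps the raw HuggingFace outputs:

```python
import transformers_embedder as tre

model = tre.TransformersEmbedder(
    "bert-base-cased",
    subword_pooling_strategy="sparse",
    layer_pooling_strategy="scalar_mix",
    output_layers=(-4, -3, -2, -1),  # the four hidden layers to mix
    fine_tune=False,                 # freeze the transformer weights
    return_all=True,                 # also return hidden_states, attentions, etc.
)
```
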
136 | ### Tokenizer
137 |
138 | The `Tokenizer` class provides the `tokenize` method to preprocess the input for the `TransformersEmbedder`
139 | layer. You can pass raw sentences, pre-tokenized sentences, and batches of sentences. It preprocesses them
140 | and returns a dictionary with the inputs for the model. By passing `return_tensors=True` it returns the
141 | inputs as `torch.Tensor`.
142 |
143 | By default, if you pass text (or batch) as strings, it uses the HuggingFace tokenizer to tokenize them.
144 |
145 | ```python
146 | text = "This is a sample sentence"
147 | tokenizer(text)
148 |
149 | text = ["This is a sample sentence", "This is another sample sentence"]
150 | tokenizer(text)
151 | ```
152 |
153 | You can pass a pre-tokenized sentence (or batch of sentences) by setting `is_split_into_words=True`
154 |
155 | ```python
156 | text = ["This", "is", "a", "sample", "sentence"]
157 | tokenizer(text, is_split_into_words=True)
158 |
159 | text = [
160 | ["This", "is", "a", "sample", "sentence", "1"],
161 | ["This", "is", "sample", "sentence", "2"],
162 | ]
163 | tokenizer(text, is_split_into_words=True)
164 | ```
165 |
166 | #### Examples
167 |
168 | First, initialize the tokenizer
169 |
170 | ```python
171 | import transformers_embedder as tre
172 |
173 | tokenizer = tre.Tokenizer("bert-base-cased")
174 | ```
175 |
176 | - You can pass a single sentence as a string:
177 |
178 | ```python
179 | text = "This is a sample sentence"
180 | tokenizer(text)
181 | ```
182 |
183 | ```text
184 | {
186 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 102]],
187 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]],
188 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]],
189 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6]],
190 | 'sparse_offsets': {
191 | 'sparse_indices': tensor(
192 | [
193 | [0, 0, 0, 0, 0, 0, 0],
194 | [0, 1, 2, 3, 4, 5, 6],
195 | [0, 1, 2, 3, 4, 5, 6]
196 | ]
197 | ),
198 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]),
199 | 'sparse_size': torch.Size([1, 7, 7])
200 | },
201 | 'sentence_lengths': [7],
202 | }
203 | ```
204 |
205 | - A sentence pair
206 |
207 | ```python
208 | text = "This is a sample sentence A"
209 | text_pair = "This is a sample sentence B"
210 | tokenizer(text, text_pair)
211 | ```
212 |
213 | ```text
214 | {
215 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 138, 102, 1188, 1110, 170, 6876, 5650, 139, 102]],
216 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]],
217 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
218 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]],
219 | 'sparse_offsets': {
220 | 'sparse_indices': tensor(
221 | [
222 | [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
223 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
224 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
225 | ]
226 | ),
227 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
228 | 'sparse_size': torch.Size([1, 15, 15])
229 | },
230 | 'sentence_lengths': [15],
231 | }
232 | ```
233 |
234 | - A batch of sentences or sentence pairs. Using `padding=True` and `return_tensors=True`, the tokenizer
235 | returns the text ready for the model
236 |
237 | ```python
238 | batch = [
239 | ["This", "is", "a", "sample", "sentence", "1"],
240 | ["This", "is", "sample", "sentence", "2"],
241 | ["This", "is", "a", "sample", "sentence", "3"],
242 | # ...
243 | ["This", "is", "a", "sample", "sentence", "n", "for", "batch"],
244 | ]
245 | tokenizer(batch, padding=True, return_tensors=True)
246 |
247 | batch_pair = [
248 | ["This", "is", "a", "sample", "sentence", "pair", "1"],
249 | ["This", "is", "sample", "sentence", "pair", "2"],
250 | ["This", "is", "a", "sample", "sentence", "pair", "3"],
251 | # ...
252 | ["This", "is", "a", "sample", "sentence", "pair", "n", "for", "batch"],
253 | ]
254 | tokenizer(batch, batch_pair, padding=True, return_tensors=True)
255 | ```
256 |
257 | #### Custom fields
258 |
259 | It is possible to add custom fields to the model input and tell the `tokenizer` how to pad them using
260 | `add_padding_ops`. Start by initializing the tokenizer with the model name:
261 |
262 | ```python
263 | import transformers_embedder as tre
264 |
265 | tokenizer = tre.Tokenizer("bert-base-cased")
266 | ```
267 |
268 | Then add the custom fields to it:
269 |
270 | ```python
271 | custom_fields = {
272 | "custom_filed_1": [
273 | [0, 0, 0, 0, 1, 0, 0],
274 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]
275 | ]
276 | }
277 | ```
278 |
279 | Now we can add the padding logic for our custom field `custom_filed_1`. The `add_padding_ops` method takes
280 | as input:
281 |
282 | - `key`: name of the field in the tokenizer input
283 | - `value`: value to use for padding
284 | - `length`: the length to pad to. It can be an `int`, or one of two string values: `subword`, where the element
285 | is padded to match the sub-word length of the batch, and `word`, where the element is padded to the word-level
286 | length of the batch, i.e. after the sub-words are merged.
287 |
288 | ```python
289 | tokenizer.add_padding_ops("custom_filed_1", 0, "word")
290 | ```
291 |
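If a custom field were aligned with sub-tokens instead of words (a hypothetical case, e.g. per-sub-token labels), the `subword` option could be used instead:

```python
# hypothetical field aligned to sub-tokens, padded with -1 up to the sub-token length
tokenizer.add_padding_ops("custom_subword_field", -1, "subword")
```
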
292 | Finally, we can tokenize the input with the custom field:
293 |
294 | ```python
295 | text = [
296 | "This is a sample sentence",
297 | "This is another example sentence just make it longer, with a comma too!"
298 | ]
299 |
300 | tokenizer(text, padding=True, return_tensors=True, additional_inputs=custom_fields)
301 | ```
302 |
303 | The inputs are now ready for the model, including the custom field.
304 |
305 | ```text
306 | >>> inputs
307 |
308 | {
309 | 'input_ids': tensor(
310 | [
311 | [ 101, 1188, 1110, 170, 6876, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
312 | [ 101, 1188, 1110, 1330, 1859, 5650, 1198, 1294, 1122, 2039, 117, 1114, 170, 3254, 1918, 1315, 106, 102]
313 | ]
314 | ),
315 | 'token_type_ids': tensor(
316 | [
317 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
318 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
319 | ]
320 | ),
321 | 'attention_mask': tensor(
322 | [
323 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
324 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
325 | ]
326 | ),
327 | 'scatter_offsets': tensor(
328 | [
329 | [ 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
330 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16]
331 | ]
332 | ),
333 | 'sparse_offsets': {
334 | 'sparse_indices': tensor(
335 | [
336 | [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
337 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16],
338 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
339 | ]
340 | ),
341 | 'sparse_values': tensor(
342 | [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
343 | 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
344 | 1.0000, 1.0000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000]
345 | ),
346 | 'sparse_size': torch.Size([2, 17, 18])
347 | }
348 | 'sentence_lengths': [7, 17],
349 | }
350 | ```
351 |
352 | ## Acknowledgements
353 |
354 | Some code in the `TransformersEmbedder` class is taken from the [PyTorch Scatter](https://github.com/rusty1s/pytorch_scatter/)
355 | library. The pretrained models and the core of the tokenizer are from [🤗 Transformers](https://huggingface.co/transformers/).
356 |
--------------------------------------------------------------------------------
/docs/gen_ref_pages.py:
--------------------------------------------------------------------------------
1 | """Generate the code reference pages and navigation."""
2 |
3 | from pathlib import Path
4 |
5 | import os
6 |
7 | import mkdocs_gen_files
8 |
9 | nav = mkdocs_gen_files.Nav()
10 |
11 | ROOT_DIR = Path(__file__).parent.parent
12 | SRC_DIR = ROOT_DIR / "transformers_embedder"
13 | DOC_DIR = ROOT_DIR / "references"
14 |
15 | for path in sorted(Path("transformers_embedder").glob("**/*.py")):
16 | module_path = path.with_suffix("")
17 | doc_path = path.with_suffix(".md").name
18 | full_doc_path = DOC_DIR / doc_path
19 | parts = tuple(module_path.parts)
20 |
21 | if parts[-1] == "__init__":
22 | parts = parts[:-1]
23 | # doc_path = doc_path.with_name("index.md")
24 | # full_doc_path = full_doc_path.with_name("index.md")
25 | elif parts[-1] == "__main__":
26 | continue
27 |
28 | nav[parts] = doc_path
29 |
30 | with mkdocs_gen_files.open(full_doc_path, "w") as fd:
31 | ident = ".".join(parts)
32 | fd.write(f"::: {ident}")
33 |
34 | mkdocs_gen_files.set_edit_path(full_doc_path, path)
35 |
36 | with mkdocs_gen_files.open(DOC_DIR / "main.md", "w") as nav_file:
37 | nav_file.writelines(nav.build_literate_nav())
38 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Transformers Embedder
4 |
5 | [](https://github.dev/Riccorl/transformers-embedder)
6 | [](https://pytorch.org/)
7 | [](https://huggingface.co/transformers/)
8 | [](https://github.com/psf/black)
9 |
10 | [](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-pypi.yml)
11 | [](https://github.com/Riccorl/transformers-embedder/actions/workflows/python-publish-conda.yml)
12 | [](https://github.com/Riccorl/transformers-embedder/releases)
13 | [](https://anaconda.org/riccorl/transformers-embedder)
14 | [](https://deepsource.io/gh/Riccorl/transformers-embedder/?ref=repository-badge)
15 |
16 |
17 |
18 | A Word Level Transformer layer based on PyTorch and 🤗 Transformers.
19 |
20 | ## How to use
21 |
22 | Install the library from [PyPI](https://pypi.org/project/transformers-embedder):
23 |
24 | ```bash
25 | pip install transformers-embedder
26 | ```
27 |
28 | or from [Conda](https://anaconda.org/riccorl/transformers-embedder):
29 |
30 | ```bash
31 | conda install -c riccorl transformers-embedder
32 | ```
33 |
34 | It offers a PyTorch layer and a tokenizer that support almost every pretrained model from the Hugging Face
35 | [🤗 Transformers](https://huggingface.co/transformers/) library. Here is a quick example:
36 |
37 | ```python
38 | import transformers_embedder as tre
39 |
40 | tokenizer = tre.Tokenizer("bert-base-cased")
41 |
42 | model = tre.TransformersEmbedder(
43 | "bert-base-cased", subword_pooling_strategy="sparse", layer_pooling_strategy="mean"
44 | )
45 |
46 | example = "This is a sample sentence"
47 | inputs = tokenizer(example, return_tensors=True)
48 | ```
49 |
50 | ```text
51 | {
52 | 'input_ids': tensor([[ 101, 1188, 1110, 170, 6876, 5650, 102]]),
53 | 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]),
54 | 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]])
55 | 'scatter_offsets': tensor([[0, 1, 2, 3, 4, 5, 6]]),
56 | 'sparse_offsets': {
57 | 'sparse_indices': tensor(
58 | [
59 | [0, 0, 0, 0, 0, 0, 0],
60 | [0, 1, 2, 3, 4, 5, 6],
61 | [0, 1, 2, 3, 4, 5, 6]
62 | ]
63 | ),
64 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]),
65 | 'sparse_size': torch.Size([1, 7, 7])
66 | },
67 | 'sentence_length': 7 # with special tokens included
68 | }
69 | ```
70 |
71 | ```python
72 | outputs = model(**inputs)
73 | ```
74 |
75 | ```text
76 | # outputs.word_embeddings[:, 1:-1].shape  # remove [CLS] and [SEP]
77 | torch.Size([1, 5, 768])
78 | # len(example.split())
79 | 5
80 | ```
81 |
82 | ## Info
83 |
84 | One of the annoyances of using transformer-based models is that it is not trivial to compute word embeddings
85 | from the sub-token embeddings they output. With this API it is as easy as using 🤗 Transformers directly to get
86 | word-level embeddings from virtually every transformer model it supports.
87 |
88 | ### Model
89 |
90 | #### Subword Pooling Strategy
91 |
92 | The `TransformersEmbedder` class offers 3 ways to get the embeddings:
93 |
94 | - `subword_pooling_strategy="sparse"`: computes the mean of the embeddings of the sub-tokens of each word
95 | (i.e. the embeddings of the sub-tokens are pooled together) using a sparse matrix multiplication. This
96 | strategy is the default one.
97 | - `subword_pooling_strategy="scatter"`: computes the mean of the embeddings of the sub-tokens of each word
98 | using a scatter-gather operation. It is not deterministic, but it works with ONNX export.
99 | - `subword_pooling_strategy="none"`: returns the raw output of the transformer model without sub-token pooling.
100 |
101 | Here is a little feature table:
102 |
103 | | | Pooling | Deterministic | ONNX |
104 | |-------------|:------------------:|:------------------:|:------------------:|
105 | | **Sparse** | :white_check_mark: | :white_check_mark: | :x: |
106 | | **Scatter** | :white_check_mark: | :x: | :white_check_mark: |
107 | | **None** | :x: | :white_check_mark: | :white_check_mark: |
108 |
109 | #### Layer Pooling Strategy
110 |
111 | There are also multiple types of output you can get using the `layer_pooling_strategy` parameter:
112 |
113 | - `layer_pooling_strategy="last"`: returns the last hidden state of the transformer model
114 | - `layer_pooling_strategy="concat"`: returns the concatenation of the selected `output_layers` of the
115 | transformer model
116 | - `layer_pooling_strategy="sum"`: returns the sum of the selected `output_layers` of the transformer model
117 | - `layer_pooling_strategy="mean"`: returns the average of the selected `output_layers` of the transformer model
118 | - `layer_pooling_strategy="scalar_mix"`: returns the output of a parameterised scalar mixture layer of the
119 | selected `output_layers` of the transformer model
120 |
121 | If you also want all the outputs from the HuggingFace model, you can set `return_all=True` to get them.
122 |
123 | ```python
124 | class TransformersEmbedder(torch.nn.Module):
125 | def __init__(
126 | self,
127 | model: Union[str, tr.PreTrainedModel],
128 | subword_pooling_strategy: str = "sparse",
129 | layer_pooling_strategy: str = "last",
130 | output_layers: Tuple[int] = (-4, -3, -2, -1),
131 | fine_tune: bool = True,
132 | return_all: bool = True,
133 | )
134 | ```
135 |
136 | ### Tokenizer
137 |
138 | The `Tokenizer` class provides the `tokenize` method to preprocess the input for the `TransformersEmbedder`
139 | layer. You can pass raw sentences, pre-tokenized sentences, and batches of sentences. It preprocesses them
140 | and returns a dictionary with the inputs for the model. By passing `return_tensors=True` it returns the
141 | inputs as `torch.Tensor`.
142 |
143 | By default, if you pass text (or batch) as strings, it uses the HuggingFace tokenizer to tokenize them.
144 |
145 | ```python
146 | text = "This is a sample sentence"
147 | tokenizer(text)
148 |
149 | text = ["This is a sample sentence", "This is another sample sentence"]
150 | tokenizer(text)
151 | ```
152 |
153 | You can pass a pre-tokenized sentence (or batch of sentences) by setting `is_split_into_words=True`
154 |
155 | ```python
156 | text = ["This", "is", "a", "sample", "sentence"]
157 | tokenizer(text, is_split_into_words=True)
158 |
159 | text = [
160 | ["This", "is", "a", "sample", "sentence", "1"],
161 | ["This", "is", "sample", "sentence", "2"],
162 | ]
163 | tokenizer(text, is_split_into_words=True)
164 | ```
165 |
166 | #### Examples
167 |
168 | First, initialize the tokenizer
169 |
170 | ```python
171 | import transformers_embedder as tre
172 |
173 | tokenizer = tre.Tokenizer("bert-base-cased")
174 | ```
175 |
176 | - You can pass a single sentence as a string:
177 |
178 | ```python
179 | text = "This is a sample sentence"
180 | tokenizer(text)
181 | ```
182 |
183 | ```text
184 | {
186 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 102]],
187 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]],
188 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]],
189 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6]],
190 | 'sparse_offsets': {
191 | 'sparse_indices': tensor(
192 | [
193 | [0, 0, 0, 0, 0, 0, 0],
194 | [0, 1, 2, 3, 4, 5, 6],
195 | [0, 1, 2, 3, 4, 5, 6]
196 | ]
197 | ),
198 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1.]),
199 | 'sparse_size': torch.Size([1, 7, 7])
200 | },
201 | 'sentence_lengths': [7],
202 | }
203 | ```
204 |
205 | - A sentence pair
206 |
207 | ```python
208 | text = "This is a sample sentence A"
209 | text_pair = "This is a sample sentence B"
210 | tokenizer(text, text_pair)
211 | ```
212 |
213 | ```text
214 | {
215 | 'input_ids': [[101, 1188, 1110, 170, 6876, 5650, 138, 102, 1188, 1110, 170, 6876, 5650, 139, 102]],
216 | 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]],
217 | 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
218 | 'scatter_offsets': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]],
219 | 'sparse_offsets': {
220 | 'sparse_indices': tensor(
221 | [
222 | [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
223 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
224 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
225 | ]
226 | ),
227 | 'sparse_values': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
228 | 'sparse_size': torch.Size([1, 15, 15])
229 | },
230 | 'sentence_lengths': [15],
231 | }
232 | ```
233 |
234 | - A batch of sentences or sentence pairs. Using `padding=True` and `return_tensors=True`, the tokenizer
235 | returns the text ready for the model
236 |
237 | ```python
238 | batch = [
239 | ["This", "is", "a", "sample", "sentence", "1"],
240 | ["This", "is", "sample", "sentence", "2"],
241 | ["This", "is", "a", "sample", "sentence", "3"],
242 | # ...
243 | ["This", "is", "a", "sample", "sentence", "n", "for", "batch"],
244 | ]
245 | tokenizer(batch, padding=True, return_tensors=True)
246 |
247 | batch_pair = [
248 | ["This", "is", "a", "sample", "sentence", "pair", "1"],
249 | ["This", "is", "sample", "sentence", "pair", "2"],
250 | ["This", "is", "a", "sample", "sentence", "pair", "3"],
251 | # ...
252 | ["This", "is", "a", "sample", "sentence", "pair", "n", "for", "batch"],
253 | ]
254 | tokenizer(batch, batch_pair, padding=True, return_tensors=True)
255 | ```
256 |
257 | #### Custom fields
258 |
259 | It is possible to add custom fields to the model input and tell the `tokenizer` how to pad them using
260 | `add_padding_ops`. Start by initializing the tokenizer with the model name:
261 |
262 | ```python
263 | import transformers_embedder as tre
264 |
265 | tokenizer = tre.Tokenizer("bert-base-cased")
266 | ```
267 |
268 | Then add the custom fields to it:
269 |
270 | ```python
271 | custom_fields = {
272 | "custom_filed_1": [
273 | [0, 0, 0, 0, 1, 0, 0],
274 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]
275 | ]
276 | }
277 | ```
278 |
279 | Now we can add the padding logic for our custom field `custom_filed_1`. The `add_padding_ops` method takes
280 | as input:
281 |
282 | - `key`: name of the field in the tokenizer input
283 | - `value`: value to use for padding
284 | - `length`: the length to pad to. It can be an `int`, or one of two string values: `subword`, where the element
285 | is padded to match the sub-word length of the batch, and `word`, where the element is padded to the word-level
286 | length of the batch, i.e. after the sub-words are merged.
287 |
288 | ```python
289 | tokenizer.add_padding_ops("custom_filed_1", 0, "word")
290 | ```
291 |
292 | Finally, we can tokenize the input with the custom field:
293 |
294 | ```python
295 | text = [
296 | "This is a sample sentence",
297 | "This is another example sentence just make it longer, with a comma too!"
298 | ]
299 |
300 | tokenizer(text, padding=True, return_tensors=True, additional_inputs=custom_fields)
301 | ```
302 |
303 | The inputs are now ready for the model, including the custom field.
304 |
305 | ```text
306 | >>> inputs
307 |
308 | {
309 | 'input_ids': tensor(
310 | [
311 | [ 101, 1188, 1110, 170, 6876, 5650, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
312 | [ 101, 1188, 1110, 1330, 1859, 5650, 1198, 1294, 1122, 2039, 117, 1114, 170, 3254, 1918, 1315, 106, 102]
313 | ]
314 | ),
315 | 'token_type_ids': tensor(
316 | [
317 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
318 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
319 | ]
320 | ),
321 | 'attention_mask': tensor(
322 | [
323 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
324 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
325 | ]
326 | ),
327 | 'scatter_offsets': tensor(
328 | [
329 | [ 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
330 | [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16]
331 | ]
332 | ),
333 | 'sparse_offsets': {
334 | 'sparse_indices': tensor(
335 | [
336 | [ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
337 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16],
338 | [ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
339 | ]
340 | ),
341 | 'sparse_values': tensor(
342 | [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
343 | 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
344 | 1.0000, 1.0000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000]
345 | ),
346 | 'sparse_size': torch.Size([2, 17, 18])
347 | }
348 | 'sentence_lengths': [7, 17],
349 | }
350 | ```
351 |
352 | ## Acknowledgements
353 |
354 | Some code in the `TransformersEmbedder` class is taken from the [PyTorch Scatter](https://github.com/rusty1s/pytorch_scatter/)
355 | library. The pretrained models and the core of the tokenizer are from [🤗 Transformers](https://huggingface.co/transformers/).
356 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Transformers Embedder
2 | repo_url: https://github.com/riccorl/transformers-embedder
3 |
4 | plugins:
5 | - search
6 | - gen-files:
7 | scripts:
8 | - docs/gen_ref_pages.py
9 | - literate-nav:
10 | nav_file: main.md
11 | - section-index
12 | - mkdocstrings:
13 | custom_templates: templates
14 | default_handler: python
15 | handlers:
16 | python:
17 | options:
18 | docstring_style: google
19 | watch:
20 | - transformers_embedder
21 |
22 | theme:
23 | name: material
24 | features:
25 | - search.suggest
26 | - search.highlight
27 | icon:
28 | repo: fontawesome/brands/github
29 | palette:
30 | # Palette toggle for light mode
31 | - media: "(prefers-color-scheme: light)"
32 | primary: deep purple
33 | accent: yellow
34 | scheme: default
35 | font:
36 | text: Work Sans
37 | code: Fira Mono
38 | toggle:
39 | icon: material/brightness-7
40 | name: Switch to dark mode
41 | # Palette toggle for dark mode
42 | - media: "(prefers-color-scheme: dark)"
43 | primary: deep purple
44 | accent: yellow
45 | scheme: slate
46 | font:
47 | text: Work Sans
48 | code: Fira Mono
49 | toggle:
50 | icon: material/brightness-4
51 | name: Switch to light mode
52 |
53 | nav:
54 | - API References: references/
55 |
56 | extra:
57 | # version:
58 | # provider: mike
59 |
60 | social:
61 | - icon: fontawesome/brands/twitter
62 | link: https://twitter.com/RiccrdoRicOrl
63 | - icon: fontawesome/brands/github
64 | link: https://github.com/riccorl
65 |
66 | markdown_extensions:
67 | - admonition
68 | - codehilite
69 | - pymdownx.superfences
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | include = '\.pyi?$'
3 | exclude = '''
4 | /(
5 | \.git
6 | | \.hg
7 | | \.mypy_cache
8 | | \.tox
9 | | \.venv
10 | | _build
11 | | buck-out
12 | | build
13 | | dist
14 | )/
15 | '''
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pre-commit
2 | datasets
3 | mkdocs-material
4 | mkdocstrings[python]
5 | mkdocs-literate-nav
6 | mkdocs-section-index
7 | mkdocs-gen-files
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=1.7
2 | transformers>=4.14,<4.35
3 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | extras = {}
7 | extras["torch"] = ["torch>=1.5,<2.2"]
8 | extras["all"] = extras["torch"]
9 | extras["docs"] = ["mkdocs-material"]
10 |
11 | install_requires = ["transformers>=4.14,<4.35"]
12 |
13 | setuptools.setup(
14 | name="transformers_embedder",
15 | version="3.0.11",
16 | author="Riccardo Orlando",
17 | author_email="orlandoricc@gmail.com",
18 | description="Word level transformer based embeddings",
19 | long_description=long_description,
20 | long_description_content_type="text/markdown",
21 | url="https://github.com/Riccorl/transformers-embedder",
22 | keywords="NLP deep learning transformer pytorch BERT google subtoken wordpieces embeddings",
23 | packages=setuptools.find_packages(),
24 | include_package_data=True,
25 | license="Apache",
26 | classifiers=[
27 | "Programming Language :: Python :: 3",
28 | "License :: OSI Approved :: Apache Software License",
29 | "Operating System :: OS Independent",
30 | ],
31 | extras_require=extras,
32 | install_requires=install_requires,
33 | python_requires=">=3.6",
34 | )
35 |
--------------------------------------------------------------------------------
/transformers_embedder/__init__.py:
--------------------------------------------------------------------------------
1 | from transformers_embedder import utils
2 |
3 | if utils.is_torch_available():
4 | from transformers_embedder.embedder import TransformersEmbedder, TransformersEncoder
5 |
6 | from transformers import (
7 | BertTokenizer,
8 | BertTokenizerFast,
9 | BertweetTokenizer,
10 | CamembertTokenizer,
11 | CamembertTokenizerFast,
12 | DebertaTokenizer,
13 | DebertaTokenizerFast,
14 | DebertaV2Tokenizer,
15 | DebertaV2TokenizerFast,
16 | DistilBertTokenizer,
17 | DistilBertTokenizerFast,
18 | MobileBertTokenizer,
19 | MobileBertTokenizerFast,
20 | RobertaTokenizer,
21 | RobertaTokenizerFast,
22 | XLMRobertaTokenizer,
23 | XLMRobertaTokenizerFast,
24 | XLMTokenizer,
25 | )
26 |
27 |
28 | MODELS_WITH_STARTING_TOKEN = (
29 | BertTokenizer,
30 | BertTokenizerFast,
31 | DistilBertTokenizer,
32 | DistilBertTokenizerFast,
33 | MobileBertTokenizer,
34 | MobileBertTokenizerFast,
35 | BertweetTokenizer,
36 | CamembertTokenizer,
37 | CamembertTokenizerFast,
38 | DebertaTokenizer,
39 | DebertaTokenizerFast,
40 | DebertaV2Tokenizer,
41 | DebertaV2TokenizerFast,
42 | RobertaTokenizer,
43 | RobertaTokenizerFast,
44 | XLMRobertaTokenizer,
45 | XLMRobertaTokenizerFast,
46 | XLMTokenizer,
47 | )
48 |
49 | MODELS_WITH_DOUBLE_SEP = (
50 | CamembertTokenizer,
51 | CamembertTokenizerFast,
52 | BertweetTokenizer,
53 | RobertaTokenizer,
54 | RobertaTokenizerFast,
55 | XLMRobertaTokenizer,
56 | XLMRobertaTokenizerFast,
57 | )
58 |
59 | from transformers_embedder.tokenizer import Tokenizer
60 |
--------------------------------------------------------------------------------
/transformers_embedder/embedder.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from pathlib import Path
3 | from typing import Optional, Union, Tuple, Sequence, Any, Mapping
4 |
5 | import transformers as tr
6 |
7 | from transformers_embedder import utils
8 | from transformers_embedder.modules.scalar_mix import ScalarMix
9 | from transformers_embedder.modules.encoder import Encoder
10 |
11 | if utils.is_torch_available():
12 | import torch
13 |
14 | logger = utils.get_logger(__name__)
15 | utils.get_logger("transformers")
16 |
17 |
18 | @dataclass
19 | class TransformersEmbedderOutput(tr.file_utils.ModelOutput):
20 | """Class for model's outputs."""
21 |
22 | word_embeddings: Optional[torch.FloatTensor] = None
23 | last_hidden_state: Optional[torch.FloatTensor] = None
24 | pooler_output: Optional[torch.FloatTensor] = None
25 | hidden_states: Optional[Tuple[torch.FloatTensor]] = None
26 | attentions: Optional[Tuple[torch.FloatTensor]] = None
27 |
28 |
29 | class TransformersEmbedder(torch.nn.Module):
30 | """
31 | Transformer Embedder class.
32 |
33 | Word level embeddings from various transformer architectures from Huggingface Transformers API.
34 |
35 | Args:
36 | model (`str`, `tr.PreTrainedModel`):
37 | Transformer model to use (https://huggingface.co/models).
38 | layer_pooling_strategy (`str`, optional, defaults to `last`):
39 | What output to get from the transformer model. The last hidden state (``last``),
40 | the concatenation of the selected hidden layers (``concat``), the sum of the selected hidden
41 | layers (``sum``), the average of the selected hidden layers (``mean``), or a scalar mixture of
42 | the selected hidden layers (``scalar_mix``).
43 | subword_pooling_strategy (`str`, optional, defaults to `sparse`):
44 | What pooling strategy to use for the sub-word embeddings. Methods available are ``sparse``,
45 | ``scatter`` and ``none``. The ``scatter`` strategy is ONNX compatible but uses ``scatter_add_``
46 | that is not deterministic. The ``sparse`` strategy is deterministic but it is not compatible
47 | with ONNX. When ``subword_pooling_strategy`` is ``none``, the sub-word embeddings are not
48 | pooled.
49 | output_layers (`tuple`, `list`, `str`, optional, defaults to `(-4, -3, -2, -1)`):
50 | Which hidden layers to get from the transformer model. If ``output_layers`` is ``all``,
51 | all the hidden layers are returned. If ``output_layers`` is a tuple or a list, the hidden
52 | layers are selected according to the indexes in the tuple or list. If ``output_layers`` is
53 | a string, it must be ``all``.
54 | fine_tune (`bool`, optional, defaults to `True`):
55 | If ``True``, the transformer model is fine-tuned during training.
56 | return_all (`bool`, optional, defaults to `False`):
57 | If ``True``, returns all the outputs from the HuggingFace model.
58 | from_pretrained (`bool`, optional, defaults to `True`):
59 | If ``True``, the model is loaded from a pre-trained model, otherwise it is initialized with
60 | random weights. Useful when you want to load a model from a specific checkpoint, without
61 | having to download the entire model.
62 | """
63 |
64 | def __init__(
65 | self,
66 | model: Union[str, tr.PreTrainedModel],
67 | layer_pooling_strategy: str = "last",
68 | subword_pooling_strategy: str = "scatter",
69 | output_layers: Union[Sequence[int], str] = (-4, -3, -2, -1),
70 | fine_tune: bool = True,
71 | return_all: bool = False,
72 | from_pretrained: bool = True,
73 | *args,
74 | **kwargs,
75 | ) -> None:
76 | super().__init__()
77 | if isinstance(model, str):
78 | self.config = tr.AutoConfig.from_pretrained(
79 | model,
80 | output_hidden_states=True,
81 | output_attentions=True,
82 | *args,
83 | **kwargs,
84 | )
85 | if from_pretrained:
86 | self.transformer_model = tr.AutoModel.from_pretrained(
87 | model, config=self.config, *args, **kwargs
88 | )
89 | else:
90 | self.transformer_model = tr.AutoModel.from_config(
91 | self.config, *args, **kwargs
92 | )
93 | else:
94 | self.transformer_model = model
95 |
96 | # pooling strategy parameters
97 | self.layer_pooling_strategy = layer_pooling_strategy
98 | self.subword_pooling_strategy = subword_pooling_strategy
99 |
100 | if output_layers == "all":
101 | output_layers = tuple(
102 | range(self.transformer_model.config.num_hidden_layers)
103 | )
104 |
105 | # check output_layers is well defined
106 | if (
107 | max(map(abs, output_layers))
108 | >= self.transformer_model.config.num_hidden_layers
109 | ):
110 | raise ValueError(
111 | f"`output_layers` parameter not valid, choose between 0 and "
112 | f"{self.transformer_model.config.num_hidden_layers - 1}. "
113 | f"Current value is `{output_layers}`"
114 | )
115 | self.output_layers = output_layers
116 |
117 | self._scalar_mix: Optional[ScalarMix] = None
118 | if layer_pooling_strategy == "scalar_mix":
119 | self._scalar_mix = ScalarMix(len(output_layers))
120 |
121 | # check if return all transformer outputs
122 | self.return_all = return_all
123 |
124 | # if fine_tune is False, freeze all the transformer's parameters
125 | if not fine_tune:
126 | for param in self.transformer_model.parameters():
127 | param.requires_grad = False
128 |
129 | def forward(
130 | self,
131 | input_ids: torch.Tensor,
132 | attention_mask: Optional[torch.Tensor] = None,
133 | token_type_ids: Optional[torch.Tensor] = None,
134 | scatter_offsets: Optional[torch.Tensor] = None,
135 | sparse_offsets: Optional[Mapping[str, Any]] = None,
136 | **kwargs,
137 | ) -> TransformersEmbedderOutput:
138 | """
139 | Forward method of the PyTorch module.
140 |
141 | Args:
142 | input_ids (`torch.Tensor`):
143 | Input ids for the transformer model.
144 | attention_mask (`torch.Tensor`, optional):
145 | Attention mask for the transformer model.
146 | token_type_ids (`torch.Tensor`, optional):
147 | Token type ids for the transformer model.
148 | scatter_offsets (`torch.Tensor`, optional):
149 | Offsets of the sub-word, used to reconstruct the word embeddings using
150 | the ``scatter`` method.
151 | sparse_offsets (`Mapping[str, Any]`, optional):
152 | Offsets of the sub-word, used to reconstruct the word embeddings using
153 | the ``sparse`` method.
154 |
155 | Returns:
156 | `TransformersEmbedderOutput`:
157 | Word level embeddings plus the output of the transformer model.
158 | """
159 | # Some HuggingFace models don't have the
160 | # token_type_ids parameter and fail even when it's given as None.
161 | inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
162 | if token_type_ids is not None:
163 | inputs["token_type_ids"] = token_type_ids
164 |
165 | # Shape: [batch_size, num_sub-words, embedding_size].
166 | transformer_outputs = self.transformer_model(**inputs)
167 | if self.layer_pooling_strategy == "last":
168 | word_embeddings = transformer_outputs.last_hidden_state
169 | elif self.layer_pooling_strategy == "concat":
170 | word_embeddings = [
171 | transformer_outputs.hidden_states[layer] for layer in self.output_layers
172 | ]
173 | word_embeddings = torch.cat(word_embeddings, dim=-1)
174 | elif self.layer_pooling_strategy == "sum":
175 | word_embeddings = [
176 | transformer_outputs.hidden_states[layer] for layer in self.output_layers
177 | ]
178 | word_embeddings = torch.stack(word_embeddings, dim=0).sum(dim=0)
179 | elif self.layer_pooling_strategy == "mean":
180 | word_embeddings = [
181 | transformer_outputs.hidden_states[layer] for layer in self.output_layers
182 | ]
183 | word_embeddings = torch.stack(word_embeddings, dim=0).mean(
184 | dim=0, dtype=torch.float
185 | )
186 | elif self.layer_pooling_strategy == "scalar_mix":
187 | word_embeddings = [
188 | transformer_outputs.hidden_states[layer] for layer in self.output_layers
189 | ]
190 | word_embeddings = self._scalar_mix(word_embeddings)
191 | else:
192 | raise ValueError(
193 | "`layer_pooling_strategy` parameter not valid, choose between `last`, `concat`, "
194 | f"`sum`, `mean` and `scalar_mix`. Current value `{self.layer_pooling_strategy}`"
195 | )
196 |
197 | if (
198 | self.subword_pooling_strategy != "none"
199 | and scatter_offsets is None
200 | and sparse_offsets is None
201 | ):
202 | raise ValueError(
203 | "`subword_pooling_strategy` is not `none` but neither `scatter_offsets` not `sparse_offsets` "
204 | "were passed to the model. Cannot compute word embeddings.\nTo solve:\n"
205 | "- Set `subword_pooling_strategy` to `none` or\n"
206 | "- Pass `scatter_offsets` to the model during forward or\n"
207 | "- Pass `sparse_offsets` to the model during forward."
208 | )
209 |
210 | if self.subword_pooling_strategy not in ["none", "scatter", "sparse"]:
211 | raise ValueError(
212 | "`subword_pooling_strategy` parameter not valid, choose between `scatter`, `sparse`"
213 | f" and `none`. Current value is `{self.subword_pooling_strategy}`."
214 | )
215 | if self.subword_pooling_strategy == "scatter":
216 | if scatter_offsets is None:
217 | raise ValueError(
218 | "`subword_pooling_strategy` is `scatter` but `scatter_offsets` "
219 | "were not passed to the model. Cannot compute word embeddings.\nTo solve:\n"
220 | "- Set `subword_pooling_strategy` to `none` or\n"
221 | "- Pass `scatter_offsets` to the model during forward."
222 | )
223 | word_embeddings = self.merge_scatter(
224 | word_embeddings, indices=scatter_offsets
225 | )
226 | if self.subword_pooling_strategy == "sparse":
227 | if sparse_offsets is None:
228 | raise ValueError(
229 | "`subword_pooling_strategy` is `sparse` but `sparse_offsets` "
230 | "were not passed to the model. Cannot compute word embeddings.\nTo solve:\n"
231 | "- Set `subword_pooling_strategy` to `none` or\n"
232 | "- Pass `sparse_offsets` to the model during forward."
233 | )
234 | word_embeddings = self.merge_sparse(word_embeddings, sparse_offsets)
235 |
236 | if self.return_all:
237 | return TransformersEmbedderOutput(
238 | word_embeddings=word_embeddings,
239 | last_hidden_state=transformer_outputs.last_hidden_state,
240 | hidden_states=transformer_outputs.hidden_states,
241 | pooler_output=transformer_outputs.pooler_output
242 | if hasattr(transformer_outputs, "pooler_output")
243 | else None,
244 | attentions=transformer_outputs.attentions,
245 | )
246 | return TransformersEmbedderOutput(word_embeddings=word_embeddings)
247 |
248 | @staticmethod
249 | def merge_scatter(embeddings: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
250 | """
251 | Minimal version of ``scatter_mean`` from the `pytorch_scatter
252 | <https://github.com/rusty1s/pytorch_scatter>`_
253 | library; it is ONNX-compatible but only covers our use case.
254 | It is used to compute word level embeddings from the transformer output.
255 |
256 | Args:
257 | embeddings (`torch.Tensor`):
258 | The embeddings tensor.
259 | indices (`torch.Tensor`):
260 | The sub-word indices.
261 |
262 | Returns:
263 | `torch.Tensor`
264 | """
265 |
266 | def broadcast(src: torch.Tensor, other: torch.Tensor):
267 | """
268 | Broadcast ``src`` to match the shape of ``other``.
269 |
270 | Args:
271 | src (`torch.Tensor`):
272 | The tensor to broadcast.
273 | other (`torch.Tensor`):
274 | The tensor to match the shape of.
275 |
276 | Returns:
277 | `torch.Tensor`: The broadcasted tensor.
278 | """
279 | for _ in range(src.dim(), other.dim()):
280 | src = src.unsqueeze(-1)
281 | src = src.expand_as(other)
282 | return src
283 |
284 | def scatter_sum(src: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
285 | """
286 | Sums the elements in ``src`` that have the same indices as in ``index``.
287 |
288 | Args:
289 | src (`torch.Tensor`):
290 | The tensor to sum.
291 | index (`torch.Tensor`):
292 | The indices to sum.
293 |
294 | Returns:
295 | `torch.Tensor`: The summed tensor.
296 | """
297 | index = broadcast(index, src)
298 | size = list(src.size())
299 | size[1] = index.max() + 1
300 | out = torch.zeros(size, dtype=src.dtype, device=src.device)
301 | return out.scatter_add_(1, index, src)
302 |
303 | # replace padding indices with the maximum value inside the batch
304 | indices[indices == -1] = torch.max(indices)
305 | merged = scatter_sum(embeddings, indices)
306 | ones = torch.ones(
307 | indices.size(), dtype=embeddings.dtype, device=embeddings.device
308 | )
309 | count = scatter_sum(ones, indices)
310 | count.clamp_(1)
311 | count = broadcast(count, merged)
312 | merged.true_divide_(count)
313 | return merged
314 |
315 | @staticmethod
316 | def merge_sparse(
317 | embeddings: torch.Tensor, bpe_info: Optional[Mapping[str, Any]]
318 | ) -> torch.Tensor:
319 | """
320 | Merges the subword embeddings into a single tensor, using sparse indices.
321 |
322 | Args:
323 | embeddings (`torch.Tensor`):
324 | The embeddings tensor.
325 | bpe_info (`Mapping[str, Any]`, `optional`):
326 | The BPE info.
327 |
328 | Returns:
329 | `torch.Tensor`: The merged embeddings.
330 | """
331 | # it is constructed here and not in the tokenizer/collate because pin_memory is not sparse-compatible
332 | bpe_weights = torch.sparse_coo_tensor(
333 | indices=bpe_info["sparse_indices"],
334 | values=bpe_info["sparse_values"],
335 | size=bpe_info["sparse_size"],
336 | )
337 | # (sentence, word, bpe) x (sentence, bpe, transformer_dim) -> (sentence, word, transformer_dim)
338 | merged = torch.bmm(bpe_weights.to_dense(), embeddings)
339 | return merged
340 |
341 | def resize_token_embeddings(
342 | self, new_num_tokens: Optional[int] = None
343 | ) -> torch.nn.Embedding:
344 | """
345 | Resizes input token embeddings' matrix of the model if `new_num_tokens != config.vocab_size`.
346 |
347 | Args:
348 | new_num_tokens (`int`):
349 | The number of new tokens in the embedding matrix.
350 |
351 | Returns:
352 | `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
353 | """
354 | return self.transformer_model.resize_token_embeddings(new_num_tokens)
355 |
356 | def save_pretrained(self, save_directory: Union[str, Path]):
357 | """
358 | Save a model and its configuration file to a directory.
359 |
360 | Args:
361 | save_directory (`str`, `Path`):
362 | Directory to which to save.
363 | """
364 | self.transformer_model.save_pretrained(save_directory)
365 |
366 | @property
367 | def hidden_size(self) -> int:
368 | """
369 | Returns the hidden size of TransformersEmbedder.
370 |
371 | Returns:
372 | `int`: Hidden size of ``self.transformer_model``.
373 | """
374 | multiplier = (
375 | len(self.output_layers) if self.layer_pooling_strategy == "concat" else 1
376 | )
377 | return self.transformer_model.config.hidden_size * multiplier
378 |
379 | @property
380 | def transformer_hidden_size(self) -> int:
381 | """
382 | Returns the hidden size of the inner transformer.
383 |
384 | Returns:
385 | `int`: Hidden size of ``self.transformer_model``.
386 | """
387 | multiplier = (
388 | len(self.output_layers) if self.layer_pooling_strategy == "concat" else 1
389 | )
390 | return self.transformer_model.config.hidden_size * multiplier
391 |
392 |
393 | class TransformersEncoder(TransformersEmbedder):
394 | """
395 | Transformers Encoder class.
396 |
397 | Word-level embeddings from the Huggingface Transformers API, followed by a trainable ``Encoder`` projection head.
398 |
399 | Args:
400 | model (`str`, `tr.PreTrainedModel`):
401 | Transformer model to use (https://huggingface.co/models).
402 | layer_pooling_strategy (`str`, optional, defaults to `last`):
403 | What output to get from the transformer model. The last hidden state (``last``),
404 | the concatenation of the selected hidden layers (``concat``), the sum of the selected hidden
405 | layers (``sum``), their average (``mean``), or a learned scalar mixture of them (``scalar_mix``).
406 | subword_pooling_strategy (`str`, optional, defaults to `sparse`):
407 | What pooling strategy to use for the sub-word embeddings. Available methods are ``scatter``,
408 | ``sparse`` and ``none``. The ``scatter`` strategy is ONNX-compatible but uses ``scatter_add``,
409 | which is not deterministic. The ``sparse`` strategy is deterministic but it is not compatible
410 | with ONNX.
411 | output_layers (`tuple`, optional, defaults to `(-4, -3, -2, -1)`):
412 | Which hidden layers to get from the transformer model.
413 | fine_tune (`bool`, optional, defaults to `True`):
414 | If ``True``, the transformer model is fine-tuned during training.
415 | return_all (`bool`, optional, defaults to `False`):
416 | If ``True``, returns all the outputs from the HuggingFace model.
417 | projection_size (`int`, optional, defaults to `None`):
418 | If not ``None``, the output of the transformer is projected to this size.
419 | activation_layer (`torch.nn.Module`, optional, defaults to `None`):
420 | Activation layer to use. If ``None``, no activation layer is used.
421 | dropout (`float`, optional, defaults to `0.1`):
422 | The dropout probability.
423 | bias (`bool`, optional, defaults to `True`):
424 | If ``True``, the projection layer of the encoder head uses a bias.
425 | """
426 |
427 | def __init__(
428 | self,
429 | model: Union[str, tr.PreTrainedModel],
430 | layer_pooling_strategy: str = "last",
431 | subword_pooling_strategy: str = "sparse",
432 | output_layers: Sequence[int] = (-4, -3, -2, -1),
433 | fine_tune: bool = True,
434 | return_all: bool = False,
435 | projection_size: Optional[int] = None,
436 | activation_layer: Optional[torch.nn.Module] = None,
437 | dropout: float = 0.1,
438 | bias: bool = True,
439 | *args,
440 | **kwargs,
441 | ) -> None:
442 | super().__init__(
443 | model,
444 | layer_pooling_strategy,
445 | subword_pooling_strategy,
446 | output_layers,
447 | fine_tune,
448 | return_all,
449 | *args,
450 | **kwargs,
451 | )
452 | self.encoder = Encoder(
453 | self.transformer_hidden_size,
454 | projection_size,
455 | activation_layer,
456 | dropout,
457 | bias,
458 | )
459 |
460 | def forward(
461 | self,
462 | input_ids: torch.Tensor,
463 | attention_mask: Optional[torch.Tensor] = None,
464 | token_type_ids: Optional[torch.Tensor] = None,
465 | scatter_offsets: Optional[torch.Tensor] = None,
466 | sparse_offsets: Optional[Mapping[str, Any]] = None,
467 | **kwargs,
468 | ) -> TransformersEmbedderOutput:
469 | """
470 | Forward method of the PyTorch module.
471 |
472 | Args:
473 | input_ids (`torch.Tensor`):
474 | Input ids for the transformer model.
475 | attention_mask (`torch.Tensor`, optional):
476 | Attention mask for the transformer model.
477 | token_type_ids (`torch.Tensor`, optional):
478 | Token type ids for the transformer model.
479 | scatter_offsets (`torch.Tensor`, optional):
480 | Offsets of the sub-words, used to reconstruct the word embeddings; ``sparse_offsets`` plays the same role for the ``sparse`` pooling strategy.
481 |
482 | Returns:
483 | `TransformersEmbedderOutput`:
484 | Word level embeddings plus the output of the transformer model.
485 | """
486 | transformers_kwargs = {
487 | "input_ids": input_ids,
488 | "attention_mask": attention_mask,
489 | "token_type_ids": token_type_ids,
490 | "scatter_offsets": scatter_offsets,
491 | "sparse_offsets": sparse_offsets,
492 | **kwargs,
493 | }
494 | transformer_output = super().forward(**transformers_kwargs)
495 | encoder_output = self.encoder(transformer_output.word_embeddings)
496 | transformer_output.word_embeddings = encoder_output
497 | return transformer_output
498 |
499 | @property
500 | def hidden_size(self) -> int:
501 | """
502 | Returns the hidden size of the transformer.
503 |
504 | Returns:
505 | `int`: Hidden size of ``self.transformer_model``.
506 | """
507 | return self.encoder.projection_size
508 |
--------------------------------------------------------------------------------
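
The snippet below is a usage sketch, not part of the repository sources: it shows how the embedder above is typically driven by the `Tokenizer` defined later in `transformers_embedder/tokenizer.py`, which builds the `scatter_offsets`/`sparse_offsets` that `forward` expects. The `bert-base-cased` checkpoint is only an assumed example.

```python
# Usage sketch (not part of the library sources). Assumes the package is
# installed and the `bert-base-cased` checkpoint can be downloaded.
import torch

from transformers_embedder.embedder import TransformersEmbedder
from transformers_embedder.tokenizer import Tokenizer

model_name = "bert-base-cased"
tokenizer = Tokenizer(model_name)
embedder = TransformersEmbedder(
    model_name,
    subword_pooling_strategy="sparse",  # deterministic, but not ONNX-exportable
    fine_tune=False,
)

batch = [
    ["The", "quick", "brown", "fox"],
    ["Transformers", "embeddings", "for", "every", "word"],
]
inputs = tokenizer(batch, padding=True, return_tensors=True, is_split_into_words=True)

with torch.no_grad():
    outputs = embedder(**inputs)

# one vector per original word (plus special tokens), not per sub-word
print(outputs.word_embeddings.shape)  # e.g. torch.Size([2, 7, 768])
```
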
/transformers_embedder/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Riccorl/transformers-embedder/bacf4c5c89fb0fa6b550b1b60174cf15fd03d875/transformers_embedder/modules/__init__.py
--------------------------------------------------------------------------------
/transformers_embedder/modules/encoder.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import torch
4 |
5 |
6 | class Encoder(torch.nn.Module):
7 | """
8 | An encoder module for the `TransformersEmbedder` class.
9 |
10 | Args:
11 | transformer_hidden_size (`int`):
12 | The hidden size of the inner transformer.
13 | projection_size (`int`, `optional`, defaults to `None`):
14 | The size of the projection layer.
15 | activation_layer (`torch.nn.Module`, optional, defaults to `None`):
16 | Activation layer to use. If ``None``, no activation layer is used.
17 | dropout (`float`, `optional`, defaults to `0.1`):
18 | The dropout value.
19 | bias (`bool`, `optional`, defaults to `True`):
20 | Whether to use a bias.
21 | """
22 |
23 | def __init__(
24 | self,
25 | transformer_hidden_size: int,
26 | projection_size: Optional[int] = None,
27 | activation_layer: Optional[torch.nn.Module] = None,
28 | dropout: float = 0.1,
29 | bias: bool = True,
30 | ):
31 | super().__init__()
32 | self.projection_size = projection_size or transformer_hidden_size
33 | self.projection_layer = torch.nn.Linear(
34 | transformer_hidden_size, self.projection_size, bias=bias
35 | )
36 | self.dropout_layer = torch.nn.Dropout(dropout)
37 | self.activation_layer = activation_layer
38 |
39 | def forward(self, x: torch.Tensor) -> torch.Tensor:
40 | """
41 | Forward pass of the encoder.
42 |
43 | Args:
44 | x (`torch.Tensor`):
45 | The input tensor.
46 |
47 | Returns:
48 | `torch.Tensor`: The encoded tensor.
49 | """
50 | x = self.projection_layer(self.dropout_layer(x))
51 | if self.activation_layer is not None:
52 | x = self.activation_layer(x)
53 | return x
54 |
--------------------------------------------------------------------------------
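
As a quick illustration of the module above, the sketch below (not part of the repository) projects a batch of word embeddings from 768 to 256 dimensions; the sizes, the GELU activation and the random input are assumptions made for the example.

```python
# Illustrative sketch of the projection head in isolation.
import torch

from transformers_embedder.modules.encoder import Encoder

encoder = Encoder(
    transformer_hidden_size=768,      # size of the incoming word embeddings
    projection_size=256,              # target size after the projection
    activation_layer=torch.nn.GELU(),
    dropout=0.1,
)

word_embeddings = torch.randn(2, 7, 768)  # (batch, words, hidden)
projected = encoder(word_embeddings)      # dropout -> linear -> GELU
print(projected.shape)                    # torch.Size([2, 7, 256])
```
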
/transformers_embedder/modules/scalar_mix.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import torch
4 | from torch.nn import ParameterList, Parameter
5 |
6 | # This code is taken from AllenNLP
7 | # https://github.com/allenai/allennlp/blob/main/allennlp/modules/scalar_mix.py
8 |
9 |
10 | class ScalarMix(torch.nn.Module):
11 | """
12 | Computes a parameterised scalar mixture of N tensors, `mixture = gamma * sum(s_k * tensor_k)`
13 | where `s = softmax(w)`, with `w` and `gamma` scalar parameters.
14 | In addition, if `do_layer_norm=True` then apply layer normalization to each tensor
15 | before weighting.
16 | """
17 |
18 | def __init__(
19 | self,
20 | mixture_size: int,
21 | do_layer_norm: bool = False,
22 | initial_scalar_parameters: List[float] = None,
23 | trainable: bool = True,
24 | ) -> None:
25 | super().__init__()
26 | self.mixture_size = mixture_size
27 | self.do_layer_norm = do_layer_norm
28 |
29 | if initial_scalar_parameters is None:
30 | initial_scalar_parameters = [0.0] * mixture_size
31 | elif len(initial_scalar_parameters) != mixture_size:
32 | raise ValueError(
33 | f"Length of `initial_scalar_parameters` {initial_scalar_parameters} differs "
34 | f"from `mixture_size` {mixture_size}"
35 | )
36 |
37 | self.scalar_parameters = ParameterList(
38 | [
39 | Parameter(
40 | torch.FloatTensor([initial_scalar_parameters[i]]),
41 | requires_grad=trainable,
42 | )
43 | for i in range(mixture_size)
44 | ]
45 | )
46 | self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable)
47 |
48 | def forward(
49 | self, tensors: List[torch.Tensor], mask: torch.BoolTensor = None
50 | ) -> torch.Tensor:
51 | """
52 | Compute a weighted average of the `tensors`. The input tensors can be of any shape
53 | with at least two dimensions, but must all be the same shape.
54 | When `do_layer_norm=True`, the `mask` is a required input. If the `tensors` are
55 | dimensioned `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned
56 | `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape
57 | `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`.
58 | When `do_layer_norm=False` the `mask` is ignored.
59 | """
60 | if len(tensors) != self.mixture_size:
61 | raise ValueError(
62 | f"{len(tensors)} tensors were passed, but the module was initialized to "
63 | f"mix {self.mixture_size} tensors."
64 | )
65 |
66 | def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
67 | tensor_masked = tensor * broadcast_mask
68 | mean = torch.sum(tensor_masked) / num_elements_not_masked
69 | variance = (
70 | torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2)
71 | / num_elements_not_masked
72 | )
73 | return (tensor - mean) / torch.sqrt(variance + 1e-4)
74 |
75 | normed_weights = torch.nn.functional.softmax(
76 | torch.cat([parameter for parameter in self.scalar_parameters]), dim=0
77 | )
78 | normed_weights = torch.split(normed_weights, split_size_or_sections=1)
79 |
80 | if not self.do_layer_norm:
81 | pieces = []
82 | for weight, tensor in zip(normed_weights, tensors):
83 | pieces.append(weight * tensor)
84 | return self.gamma * sum(pieces)
85 |
86 | else:
87 | assert mask is not None
88 | broadcast_mask = mask.unsqueeze(-1)
89 | input_dim = tensors[0].size(-1)
90 | num_elements_not_masked = torch.sum(mask) * input_dim
91 |
92 | pieces = []
93 | for weight, tensor in zip(normed_weights, tensors):
94 | pieces.append(
95 | weight
96 | * _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked)
97 | )
98 | return self.gamma * sum(pieces)
99 |
--------------------------------------------------------------------------------
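
The sketch below (again not part of the repository) shows `ScalarMix` on its own: four same-shaped random tensors stand in for four hidden layers and are combined with softmax-normalised learnable weights and a global `gamma`.

```python
# Illustrative sketch: random tensors stand in for four hidden layers.
import torch

from transformers_embedder.modules.scalar_mix import ScalarMix

layers = [torch.randn(2, 7, 768) for _ in range(4)]  # e.g. the last four hidden states
scalar_mix = ScalarMix(mixture_size=4)

# mixture = gamma * sum_k softmax(w)_k * layers[k]
mixture = scalar_mix(layers)
print(mixture.shape)  # torch.Size([2, 7, 768])
```
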
/transformers_embedder/tokenizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from collections import UserDict
4 | from functools import partial
5 | from typing import List, Dict, Union, Any, Optional, Tuple, Set, Sequence, Mapping
6 |
7 | import transformers as tr
8 | from transformers import BatchEncoding
9 | from transformers.file_utils import PaddingStrategy
10 | from transformers.tokenization_utils_base import TruncationStrategy
11 |
12 | from transformers_embedder import MODELS_WITH_STARTING_TOKEN, MODELS_WITH_DOUBLE_SEP
13 | from transformers_embedder import utils
14 | from transformers_embedder.utils import is_torch_available
15 |
16 | if is_torch_available():
17 | import torch
18 |
19 | logger = utils.get_logger(__name__)
20 | utils.get_logger("transformers")
21 |
22 |
23 | class Tokenizer:
24 | """
25 | A wrapper class for HuggingFace Tokenizer.
26 |
27 | Args:
28 | model (`str`, `transformers.PreTrainedTokenizer`):
29 | Language model name (or a transformer `PreTrainedTokenizer`).
30 | return_sparse_offsets (`bool`, optional, defaults to `True`):
31 | If `True`, the sparse offsets of the tokens in the input text are returned. To reduce
32 | memory usage, set this to `False` if you don't need them, e.g. you set the
33 | `subword_pooling_strategy` to `scatter` in the `TransformersEmbedder` model.
34 | """
35 |
36 | def __init__(
37 | self,
38 | model: Union[str, tr.PreTrainedTokenizer],
39 | return_sparse_offsets: bool = True,
40 | *args,
41 | **kwargs,
42 | ):
43 | if isinstance(model, str):
44 | # init HuggingFace tokenizer
45 | self.huggingface_tokenizer = tr.AutoTokenizer.from_pretrained(
46 | model, *args, **kwargs
47 | )
48 | # get config
49 | self.config = tr.AutoConfig.from_pretrained(model, *args, **kwargs)
50 | else:
51 | self.huggingface_tokenizer = model
52 | self.config = tr.AutoConfig.from_pretrained(
53 | self.huggingface_tokenizer.name_or_path, *args, **kwargs
54 | )
55 |
56 | self.return_sparse_offsets = return_sparse_offsets
57 |
58 | # padding stuff
59 | # default, batch length is model max length
60 | self.subword_max_batch_len = self.huggingface_tokenizer.model_max_length
61 | self.word_max_batch_len = self.huggingface_tokenizer.model_max_length
62 | # padding ops
63 | self.padding_ops = {}
64 | # keys that will be converted in tensors
65 | self.to_tensor_inputs = set()
66 |
67 | def __len__(self):
68 | """Size of the full vocabulary with the added tokens."""
69 | return len(self.huggingface_tokenizer)
70 |
71 | def __call__(
72 | self,
73 | text: Union[str, List[str], List[List[str]]],
74 | text_pair: Union[str, List[str], List[List[str]], None] = None,
75 | padding: Union[bool, str, PaddingStrategy] = False,
76 | truncation: Union[bool, str, TruncationStrategy] = False,
77 | max_length: Optional[int] = None,
78 | return_tensors: Optional[Union[bool, str]] = None,
79 | is_split_into_words: bool = False,
80 | additional_inputs: Optional[Dict[str, Any]] = None,
81 | *args,
82 | **kwargs,
83 | ) -> ModelInputs:
84 | """
85 | Prepare the input text for models that use HuggingFace transformers as embedders.
86 |
87 | Args:
88 | text (`str`, `List[str]`, `List[List[str]]`, `List[List[Word]]`, `List[Word]`):
89 | Text or batch of text to be encoded.
90 | text_pair (`str`, `List[str]`, `List[List[str]]`, `List[List[Word]]`, `List[Word]`):
91 | Text or batch of text to be encoded.
92 | padding (`bool`, optional, defaults to `False`):
93 | If `True`, applies padding to the batch based on the maximum length of the batch.
94 | max_length (`int`, optional, defaults to `None`):
95 | If specified, truncates the input sequence to that value. Otherwise,
96 | uses the model max length.
97 | return_tensors (`bool`, optional, defaults to `None`):
98 | If `True`, the output is converted to `torch.Tensor`.
99 | is_split_into_words (`bool`, optional, defaults to `False`):
100 | If `True` and the input is a string, the input is split on spaces.
101 | additional_inputs (`Dict[str, Any]`, optional, defaults to `None`):
102 | Additional inputs to be passed to the model.
103 |
104 | Returns:
105 | `ModelInputs`: The inputs to the transformer model.
106 | """
107 | # some checks before starting
108 | if return_tensors == "tf":
109 | raise ValueError(
110 | "`return_tensors='tf'` is not supported. Please use `return_tensors='pt'` "
111 | "or `return_tensors=True`."
112 | )
113 | if return_tensors is True:
114 | return_tensors = "pt"
115 | if return_tensors is False:
116 | return_tensors = None
117 |
118 | # check if input is batched or a single sample
119 | is_batched = bool(
120 | isinstance(text, (list, tuple))
121 | and text
122 | and (
123 | (isinstance(text[0], (list, tuple)) and is_split_into_words)
124 | or isinstance(text[0], str)
125 | )
126 | )
127 | if not is_batched: # batch it
128 | text = [text]
129 | text_pair = [text_pair] if text_pair is not None else None
130 |
131 | # use huggingface tokenizer to encode the text
132 | model_inputs = self.huggingface_tokenizer(
133 | text,
134 | text_pair=text_pair,
135 | padding=padding,
136 | truncation=truncation,
137 | max_length=max_length,
138 | is_split_into_words=is_split_into_words,
139 | return_tensors=return_tensors,
140 | *args,
141 | **kwargs,
142 | )
143 | # build the offsets used to pool the subwords
144 | scatter_offsets, sentence_lengths = self.build_scatter_offsets(
145 | model_inputs,
146 | return_tensors=return_tensors,
147 | there_is_text_pair=text_pair is not None,
148 | )
149 |
150 | # convert to ModelInputs
151 | model_inputs = ModelInputs(**model_inputs)
152 | # add the offsets to the model inputs
153 | model_inputs.update(
154 | {"scatter_offsets": scatter_offsets, "sentence_lengths": sentence_lengths}
155 | )
156 |
157 | if self.return_sparse_offsets:
158 | # build the data used to pool the subwords when in sparse mode
159 | bpe_info: Mapping[str, Any] = self.build_sparse_offsets(
160 | offsets=scatter_offsets,
161 | bpe_mask=model_inputs.attention_mask,
162 | words_per_sentence=sentence_lengths,
163 | )
164 | # add the bpe info to the model inputs
165 | model_inputs["sparse_offsets"] = ModelInputs(**bpe_info)
166 |
167 | # we also update the maximum batch length,
168 | # both for subword and word level
169 | self.subword_max_batch_len = max(len(x) for x in model_inputs.input_ids)
170 | self.word_max_batch_len = max(x for x in model_inputs.sentence_lengths)
171 |
172 | # check if we need to convert other stuff to tensors
173 | if additional_inputs:
174 | model_inputs.update(additional_inputs)
175 | # check if there is a padding strategy
176 | if padding:
177 | missing_keys = set(additional_inputs.keys()) - set(
178 | self.padding_ops.keys()
179 | )
180 | if missing_keys:
181 | raise ValueError(
182 | f"There are no padding strategies for the following keys: {missing_keys}. "
183 | "Please add one with `tokenizer.add_padding_ops()`."
184 | )
185 | self.pad_batch(model_inputs)
186 | # convert them to tensors
187 | if return_tensors == "pt":
188 | model_inputs = self.to_tensor(model_inputs)  # `to_tensor` returns a new `ModelInputs`, it does not modify in place
189 |
190 | return model_inputs
191 |
192 | def build_scatter_offsets(
193 | self,
194 | model_inputs: BatchEncoding,
195 | return_tensors: bool = True,
196 | there_is_text_pair: bool = False,
197 | ) -> Tuple:
198 | """
199 | Build the offset tensor for the batch of inputs.
200 |
201 | Args:
202 | model_inputs (`BatchEncoding`):
203 | The inputs to the transformer model.
204 | return_tensors (`bool`, optional, defaults to `True`):
205 | If `True`, the offsets are converted to `torch.Tensor`.
206 | there_is_text_pair (`bool`, optional, defaults to `False`):
207 | If `True` `text_pair` is not None.
208 |
209 | Returns:
210 | `Tuple`: The offsets of the sub-tokens and the word-level sentence lengths.
211 | """
212 | # output data structure
213 | offsets = []
214 | sentence_lengths = []
215 | # model_inputs should be the output of the HuggingFace tokenizer
216 | # it contains the word offsets to reconstruct the original tokens from the
217 | # sub-tokens
218 | for batch_index in range(len(model_inputs.input_ids)):
219 | word_ids = model_inputs.word_ids(batch_index)
220 | # it is slightly different from what we need, so here we make it compatible
221 | # with our subword pooling strategy
222 | # if the first token is a special token, we need to take it into account
223 | if self.has_starting_token:
224 | word_offsets = [0] + [
225 | w + 1 if w is not None else w for w in word_ids[1:]
226 | ]
227 | # otherwise, we can just use word_ids as is
228 | else:
229 | word_offsets = word_ids
230 |
231 | # replace first None occurrence with sep_offset
232 | sep_index = word_offsets.index(None)
233 |
234 | # here we retrieve the max offset for the sample, which will be used as SEP offset
235 | # and also as padding value for the offsets
236 | sep_offset_value = max([w for w in word_offsets[:sep_index] if w is not None]) + 1
237 |
238 | word_offsets[sep_index] = sep_offset_value
239 | # if there is a text pair, we need to adjust the offsets for the second text
240 | if there_is_text_pair:
241 | # some models have two SEP tokens in between the two texts
242 | if self.has_double_sep:
243 | sep_index += 1
244 | sep_offset_value += 1
245 | word_offsets[sep_index] = sep_offset_value
246 | # keep the first offsets as is, adjust the second ones
247 | word_offsets = word_offsets[: sep_index + 1] + [
248 | w + sep_offset_value if w is not None else w
249 | for w in word_offsets[sep_index + 1 :]
250 | ]
251 | # update again the sep_offset
252 | sep_offset_value = max([w for w in word_offsets if w is not None]) + 1
253 | # replace first None occurrence with sep_offset, it should be the last SEP
254 | sep_index = word_offsets.index(None)
255 | word_offsets[sep_index] = sep_offset_value
256 | # keep track of the maximum offset for padding
257 | offsets.append(word_offsets)
258 | sentence_lengths.append(sep_offset_value + 1)
259 | # replace remaining None occurrences with -1
260 | # the remaining None occurrences are the padding values
261 | offsets = [[o if o is not None else -1 for o in offset] for offset in offsets]
262 | # if return_tensor is True, we need to convert the offsets to tensors
263 | if return_tensors:
264 | offsets = torch.as_tensor(offsets)
265 | return offsets, sentence_lengths
266 |
267 | @staticmethod
268 | def build_sparse_offsets(
269 | offsets: torch.Tensor | Sequence[Sequence[int]],
270 | bpe_mask: torch.Tensor | Sequence[Sequence[int]],
271 | words_per_sentence: Sequence[int],
272 | ) -> Mapping[str, Any]:
273 | """Build tensors used as info for BPE pooling, starting from the BPE offsets.
274 |
275 | Args:
276 | offsets (`torch.Tensor` or `List[List[int]]`):
277 | The offsets to compute lengths from.
278 | bpe_mask (`torch.Tensor` or `List[List[int]]`):
279 | The attention mask at BPE level.
280 | words_per_sentence (`List[int]`):
281 | The sentence lengths, word-wise.
282 |
283 | Returns:
284 | `Mapping[str, Any]`: Tensors used to construct the sparse matrix that pools the
285 | transformer encoding word-wise.
286 | """
287 | if not isinstance(offsets, torch.Tensor):
288 | offsets: torch.Tensor = torch.as_tensor(offsets)
289 | if not isinstance(bpe_mask, torch.Tensor):
290 | bpe_mask: torch.Tensor = torch.as_tensor(bpe_mask)
291 |
292 | sentence_lengths: torch.Tensor = bpe_mask.sum(dim=1)
293 |
294 | # We want to build triplets as coordinates (document, word, bpe)
295 | # We start by creating the document index for each triplet
296 | document_indices = torch.arange(offsets.size(0)).repeat_interleave(
297 | sentence_lengths
298 | )
299 | # then the word indices
300 | word_indices = offsets[offsets != -1]
301 | # lastly the bpe indices
302 | max_range: torch.Tensor = torch.arange(bpe_mask.shape[1])
303 | bpe_indices: torch.LongTensor = torch.cat(
304 | [max_range[:i] for i in bpe_mask.sum(dim=1)], dim=0
305 | ).long()
306 |
307 | unique_words, word_lengths = torch.unique_consecutive(
308 | offsets, return_counts=True
309 | )
310 | unpadded_word_lengths = word_lengths[unique_words != -1]
311 |
312 | # and their weight to be used as multiplication factors
313 | bpe_weights: torch.FloatTensor = (
314 | (1 / unpadded_word_lengths).repeat_interleave(unpadded_word_lengths).float()
315 | )
316 |
317 | sparse_indices = torch.stack(
318 | [document_indices, word_indices, bpe_indices], dim=0
319 | )
320 |
321 | bpe_shape = torch.Size(
322 | (
323 | bpe_mask.size(0), # batch_size
324 | max(words_per_sentence), # max number of words per sentence
325 | bpe_mask.size(1), # max bpe_number in batch wrt the sentence
326 | )
327 | )
328 |
329 | return dict(
330 | sparse_indices=sparse_indices,
331 | sparse_values=bpe_weights,
332 | sparse_size=bpe_shape,
333 | )
334 |
335 | def pad_batch(
336 | self,
337 | batch: Union[ModelInputs, Dict[str, list]],
338 | max_length: Optional[int] = None,
339 | ) -> ModelInputs:
340 | """
341 | Pad the batch to its maximum length or to the specified `max_length`.
342 |
343 | Args:
344 | batch (`Dict[str, list]`):
345 | The batch to pad.
346 | max_length (`int`, optional):
347 | Override maximum length of the batch.
348 |
349 | Returns:
350 | `Dict[str, list]`: The padded batch.
351 | """
352 | if max_length:
353 | self.subword_max_batch_len = max_length
354 | self.word_max_batch_len = max_length
355 | else:
356 | # get maximum len inside a batch
357 | self.subword_max_batch_len = max(len(x) for x in batch["input_ids"])
358 | self.word_max_batch_len = max(x for x in batch["sentence_lengths"])
359 |
360 | for key in batch:
361 | if key in self.padding_ops:
362 | batch[key] = [self.padding_ops[key](b) for b in batch[key]]
363 |
364 | return ModelInputs(batch)
365 |
366 | def pad_sequence(
367 | self,
368 | sequence: Union[List, torch.Tensor],
369 | value: int,
370 | length: Union[int, str] = "subword",
371 | pad_to_left: bool = False,
372 | ) -> Union[List, torch.Tensor]:
373 | """
374 | Pad the input to the specified length with the given value.
375 |
376 | Args:
377 | sequence (`List`, `torch.Tensor`):
378 | Element to pad, it can be either a `List` or a `torch.Tensor`.
379 | value (`int`):
380 | Value to use as padding.
381 | length (`int`, `str`, optional, defaults to `subword`):
382 | Length after pad.
383 | pad_to_left (`bool`, optional, defaults to `False`):
384 | If `True`, pads to the left, right otherwise.
385 |
386 | Returns:
387 | `List`, `torch.Tensor`: The padded sequence.
388 | """
389 | if length == "subword":
390 | length = self.subword_max_batch_len
391 | elif length == "word":
392 | length = self.word_max_batch_len
393 | else:
394 | if not isinstance(length, int):
395 | raise ValueError(
396 | f"`length` must be an `int`, `subword` or `word`. Current value is `{length}`"
397 | )
398 | padding = [value] * abs(length - len(sequence))
399 | if isinstance(sequence, torch.Tensor):
400 | if len(sequence.shape) > 1:
401 | raise ValueError(
402 | f"Sequence tensor must be 1D. Current shape is `{len(sequence.shape)}`"
403 | )
404 | padding = torch.as_tensor(padding)
405 | if pad_to_left:
406 | if isinstance(sequence, torch.Tensor):
407 | return torch.cat((padding, sequence), -1)
408 | return padding + sequence
409 | if isinstance(sequence, torch.Tensor):
410 | return torch.cat((sequence, padding), -1)
411 | return sequence + padding
412 |
413 | def add_special_tokens(
414 | self, special_tokens_dict: Dict[str, Union[str, tr.AddedToken]]
415 | ) -> int:
416 | """
417 | Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder.
418 | If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last
419 | index of the current vocabulary).
420 |
421 | Args:
422 | special_tokens_dict (`Dict`):
423 | The dictionary containing special tokens. Keys should be in
424 | the list of predefined special attributes: [``bos_token``, ``eos_token``,
425 | ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
426 | ``additional_special_tokens``].
427 |
428 | Returns:
429 | `int`: Number of tokens added to the vocabulary.
430 | """
431 | return self.huggingface_tokenizer.add_special_tokens(special_tokens_dict)
432 |
433 | def add_padding_ops(self, key: str, value: Any, length: Union[int, str]):
434 | """
435 | Add padding logic to custom fields.
436 | If the field is not in `self.to_tensor_inputs`, this method will add the key to it.
437 |
438 | Args:
439 | key (`str`):
440 | Name of the field in the tokenizer input.
441 | value (`Any`):
442 | Value to use for padding.
443 | length (`int`, `str`):
444 | Length to pad. It can be an `int`, or one of two string values:
445 | - ``subword``: the element is padded to the batch max length relative to the subwords length
446 | - ``word``: the element is padded to the batch max length relative to the original word length
447 | """
448 | if key not in self.to_tensor_inputs:
449 | self.to_tensor_inputs.add(key)
450 | self.padding_ops[key] = partial(self.pad_sequence, value=value, length=length)
451 |
452 | def add_to_tensor_inputs(self, names: Union[str, Sequence[str]]) -> Set[str]:
453 | """
454 | Add these keys to the ones that will be converted in Tensors.
455 |
456 | Args:
457 | names (`str`, `set`):
458 | Name of the field (or fields) to convert to tensors.
459 |
460 | Returns:
461 | `set`: The set of keys that will be converted to tensors.
462 | """
463 | if isinstance(names, str):
464 | names = {names}
465 | if not isinstance(names, set):
466 | names = set(names)
467 | self.to_tensor_inputs |= names
468 | return self.to_tensor_inputs
469 |
470 | def to_tensor(self, batch: Union[ModelInputs, List[dict], dict]) -> ModelInputs:
471 | """
472 | Return the input batch as PyTorch tensors. The fields that are converted to tensors are in
473 | `self.to_tensor_inputs`. By default, only the standard model inputs are converted. Use
474 | `self.add_to_tensor_inputs` to add custom fields.
475 |
476 | Args:
477 | batch (`List[dict]`, `dict`):
478 | Batch in input.
479 |
480 | Returns:
481 | `ModelInputs`: The batch as tensor.
482 | """
483 | # convert to tensor
484 | batch = {
485 | k: torch.as_tensor(v)
486 | if k in self.to_tensor_inputs and not isinstance(v, torch.Tensor)
487 | else v
488 | for k, v in batch.items()
489 | }
490 | return ModelInputs(batch)
491 |
492 | @staticmethod
493 | def _clean_output(output: Union[List, Dict]) -> Dict:
494 | """
495 | Clean before output.
496 |
497 | Args:
498 | output (`List[dict]`, `dict`):
499 | The output to clean.
500 |
501 | Returns:
502 | `dict`: The cleaned output.
503 | """
504 | # single sentence case, generalize
505 | if isinstance(output, dict):
506 | output = [output]
507 | # convert list to dict
508 | output = {k: [d[k] for d in output] for k in output[0]}
509 | return output
510 |
511 | @staticmethod
512 | def _get_token_type_id(config: tr.PretrainedConfig) -> int:
513 | """
514 | Get token type id. Useful when dealing with models that don't accept 1 as type id.
515 | Args:
516 | config (`transformers.PretrainedConfig`):
517 | Transformer config.
518 |
519 | Returns:
520 | `int`: Correct token type id for that model.
521 | """
522 | if hasattr(config, "type_vocab_size"):
523 | return 1 if config.type_vocab_size == 2 else 0
524 | return 0
525 |
526 | @staticmethod
527 | def _type_checking(text: Any, text_pair: Any):
528 | """
529 | Checks type of the inputs.
530 |
531 | Args:
532 | text (`Any`):
533 | Text to check.
534 | text_pair (`Any`):
535 | Text pair to check.
536 |
537 | Returns:
538 | """
539 |
540 | def is_type_correct(text_to_check: Any) -> bool:
541 | """
542 | Check if input type is correct, returning a boolean value.
543 |
544 | Args:
545 | text_to_check (`Any`):
546 | text to check.
547 |
548 | Returns:
549 | `bool`: `True` if the type is correct.
550 | """
551 | return (
552 | text_to_check is None
553 | or isinstance(text_to_check, str)
554 | or (
555 | isinstance(text_to_check, (list, tuple))
556 | and (
557 | len(text_to_check) == 0
558 | or (
559 | isinstance(text_to_check[0], str)
560 | or (
561 | isinstance(text_to_check[0], (list, tuple))
562 | and (
563 | len(text_to_check[0]) == 0
564 | or isinstance(text_to_check[0][0], str)
565 | )
566 | )
567 | )
568 | )
569 | )
570 | )
571 |
572 | if not is_type_correct(text):
573 | raise AssertionError(
574 | "text input must be of type `str` (single example), `List[str]` (batch or single "
575 | "pre-tokenized example) or `List[List[str]]` (batch of pre-tokenized examples)."
576 | )
577 |
578 | if not is_type_correct(text_pair):
579 | raise AssertionError(
580 | "text_pair input must be `str` (single example), `List[str]` (batch or single "
581 | "pre-tokenized example) or `List[List[str]]` (batch of pre-tokenized examples)."
582 | )
583 |
584 | @property
585 | def num_special_tokens(self) -> int:
586 | """
587 | Return the number of special tokens the model needs.
588 | It assumes the input contains both sentences (`text` and `text_pair`).
589 |
590 | Returns:
591 | `int`: the number of special tokens.
592 | """
593 | if isinstance(
594 | self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP
595 | ) and isinstance(self.huggingface_tokenizer, MODELS_WITH_STARTING_TOKEN):
596 | return 4
597 | if isinstance(
598 | self.huggingface_tokenizer,
599 | (MODELS_WITH_DOUBLE_SEP, MODELS_WITH_STARTING_TOKEN),
600 | ):
601 | return 3
602 | return 2
603 |
604 | @property
605 | def has_double_sep(self):
606 | """True if tokenizer uses two SEP tokens."""
607 | return isinstance(self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP)
608 |
609 | @property
610 | def has_starting_token(self):
611 | """True if tokenizer uses a starting token."""
612 | return isinstance(self.huggingface_tokenizer, MODELS_WITH_STARTING_TOKEN)
613 |
614 | @property
615 | def token_type_id(self):
616 | """Token type id used by the model."""
617 | return self._get_token_type_id(self.config)
618 |
619 | @property
620 | def pad_token(self):
621 | """Padding token."""
622 | return self.huggingface_tokenizer.pad_token
623 |
624 | @property
625 | def pad_token_id(self):
626 | """Padding token id."""
627 | return self.huggingface_tokenizer.pad_token_id
628 |
629 | @property
630 | def unk_token(self):
631 | """Unknown token."""
632 | return self.huggingface_tokenizer.unk_token
633 |
634 | @property
635 | def unk_token_id(self):
636 | """Unknown token id."""
637 | return self.huggingface_tokenizer.unk_token_id
638 |
639 | @property
640 | def cls_token(self):
641 | """
642 | Classification token.
643 | To extract a summary of an input sequence leveraging self-attention along the
644 | full depth of the model.
645 | """
646 | return self.huggingface_tokenizer.cls_token
647 |
648 | @property
649 | def cls_token_id(self):
650 | """
651 | Classification token id.
652 | To extract a summary of an input sequence leveraging self-attention along the
653 | full depth of the model.
654 | """
655 | return self.huggingface_tokenizer.cls_token_id
656 |
657 | @property
658 | def sep_token(self):
659 | """Separation token, to separate context and query in an input sequence."""
660 | return self.huggingface_tokenizer.sep_token
661 |
662 | @property
663 | def sep_token_id(self):
664 | """Separation token id, to separate context and query in an input sequence."""
665 | return self.huggingface_tokenizer.sep_token_id
666 |
667 | @property
668 | def bos_token(self):
669 | """Beginning of sentence token."""
670 | return self.huggingface_tokenizer.bos_token
671 |
672 | @property
673 | def bos_token_id(self):
674 | """Beginning of sentence token id."""
675 | return self.huggingface_tokenizer.bos_token_id
676 |
677 | @property
678 | def eos_token(self):
679 | """End of sentence token."""
680 | return self.huggingface_tokenizer.eos_token
681 |
682 | @property
683 | def eos_token_id(self):
684 | """End of sentence token id."""
685 | return self.huggingface_tokenizer.eos_token_id
686 |
687 |
688 | class ModelInputs(UserDict):
689 | """Model input dictionary wrapper."""
690 |
691 | def __getattr__(self, item: str):
692 | try:
693 | return self.data[item]
694 | except KeyError:
695 | raise AttributeError(f"`ModelInputs` has no attribute `{item}`")
696 |
697 | def __getitem__(self, item: str) -> Any:
698 | return self.data[item]
699 |
700 | def __getstate__(self):
701 | return {"data": self.data}
702 |
703 | def __setstate__(self, state):
704 | if "data" in state:
705 | self.data = state["data"]
706 |
707 | def keys(self):
708 | """A set-like object providing a view on D's keys."""
709 | return self.data.keys()
710 |
711 | def values(self):
712 | """An object providing a view on D's values."""
713 | return self.data.values()
714 |
715 | def items(self):
716 | """A set-like object providing a view on D's items."""
717 | return self.data.items()
718 |
719 | def to(self, device: Union[str, torch.device]) -> ModelInputs:
720 | """
721 | Send all tensors values to device.
722 |
723 | Args:
724 | device (`str` or `torch.device`): The device to put the tensors on.
725 |
726 | Returns:
727 | `ModelInputs`: The same instance of `ModelInputs`
728 | after moving its tensors to `device`.
729 | """
730 | if isinstance(device, (str, torch.device, int)):
731 | self.data = {
732 | k: v.to(device=device) if hasattr(v, "to") else v
733 | for k, v in self.data.items()
734 | }
735 | else:
736 | logger.warning(
737 | f"Attempting to cast to another type, {str(device)}. This is not supported."
738 | )
739 | return self
740 |
--------------------------------------------------------------------------------
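
The sketch below (not part of the repository) shows the tokenizer wrapper end to end, including a hypothetical `labels` field padded through `add_padding_ops` and passed via `additional_inputs`; the `bert-base-cased` checkpoint is an assumed example.

```python
# Usage sketch; `labels` is a made-up per-word field used only to show
# `add_padding_ops` and `additional_inputs`.
from transformers_embedder.tokenizer import Tokenizer

tokenizer = Tokenizer("bert-base-cased")

batch = [
    ["The", "quick", "brown", "fox"],
    ["Transformers", "embeddings", "for", "every", "word"],
]
labels = [[0, 1, 1, 0], [1, 0, 0, 1, 0]]

# pad the custom `labels` field word-wise with -100 and mark it for tensor conversion
tokenizer.add_padding_ops("labels", value=-100, length="word")

inputs = tokenizer(
    batch,
    padding=True,
    return_tensors=True,
    is_split_into_words=True,
    additional_inputs={"labels": labels},
)

print(inputs.input_ids.shape)        # (batch, max sub-words in the batch)
print(inputs.scatter_offsets.shape)  # same sub-word-level shape as input_ids
print(inputs.labels)                 # word-level labels, padded with -100
```
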
/transformers_embedder/utils.py:
--------------------------------------------------------------------------------
1 | import importlib.util
2 | import logging
3 |
4 | _torch_available = importlib.util.find_spec("torch") is not None
5 |
6 |
7 | def is_torch_available():
8 | """Check if PyTorch is available."""
9 | return _torch_available
10 |
11 |
12 | def get_logger(name: str) -> logging.Logger:
13 | """
14 | Return the logger of the given name.
15 |
16 | Args:
17 | name (`str`): The name of the logger.
18 |
19 | Returns:
20 | `logging.Logger`: The logger of the given name.
21 | """
22 | return logging.getLogger(name)
23 |
--------------------------------------------------------------------------------
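
Finally, a minimal sketch (not part of the repository) of how the two helpers in `utils.py` are meant to be combined, mirroring the guard used at the top of `tokenizer.py`:

```python
# Minimal sketch: guard torch-dependent code behind `is_torch_available`.
from transformers_embedder.utils import get_logger, is_torch_available

logger = get_logger(__name__)

if is_torch_available():
    import torch

    logger.info("PyTorch %s detected, tensor outputs are enabled.", torch.__version__)
else:
    logger.warning("PyTorch is not installed, tensor outputs are disabled.")
```
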