├── .github
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── no-response.yml
│   └── workflows
│       ├── cibuildwheel.yml
│       ├── explosionbot.yml
│       ├── issue-manager.yml
│       ├── publish_pypi.yml
│       └── tests.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── bin
│   ├── get-version.sh
│   └── push-tag.sh
├── build-constraints.txt
├── examples
│   └── configs
│       ├── joint-core-bert.cfg
│       └── ner-albert.cfg
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
└── spacy_transformers
    ├── __init__.py
    ├── align.pyi
    ├── align.pyx
    ├── annotation_setters.py
    ├── architectures.py
    ├── data_classes.py
    ├── layers
    │   ├── __init__.py
    │   ├── _util.py
    │   ├── hf_shim.py
    │   ├── hf_wrapper.py
    │   ├── listener.py
    │   ├── split_trf.py
    │   ├── transformer_model.py
    │   └── trfs2arrays.py
    ├── pipeline_component.py
    ├── py.typed
    ├── span_getters.py
    ├── tests
    │   ├── __init__.py
    │   ├── enable_gpu.py
    │   ├── regression
    │   │   ├── __init__.py
    │   │   ├── test_spacy_issue6401.py
    │   │   └── test_spacy_issue7029.py
    │   ├── test_alignment.py
    │   ├── test_configs.py
    │   ├── test_data_classes.py
    │   ├── test_deprecations.py
    │   ├── test_model_sequence_classification.py
    │   ├── test_model_wrapper.py
    │   ├── test_pipeline_component.py
    │   ├── test_serialize.py
    │   ├── test_spanners.py
    │   ├── test_textcatcnn.py
    │   ├── test_tok2vectransformer.py
    │   ├── test_truncation.py
    │   └── util.py
    ├── truncate.py
    └── util.py
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Description
4 |
5 |
10 |
11 | ### Types of change
12 |
13 |
15 |
16 | ## Checklist
17 |
18 |
20 |
21 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license.
22 | - [ ] I ran the tests, and all new and existing tests passed.
23 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
24 |
--------------------------------------------------------------------------------
/.github/no-response.yml:
--------------------------------------------------------------------------------
1 | # Configuration for probot-no-response - https://github.com/probot/no-response
2 |
3 | # Number of days of inactivity before an Issue is closed for lack of response
4 | daysUntilClose: 14
5 | # Label requiring a response
6 | responseRequiredLabel: more-info-needed
7 | # Comment to post when closing an Issue for lack of response. Set to `false` to disable
8 | closeComment: >
9 | This issue has been automatically closed because there has been no response
10 | to a request for more information from the original author. With only the
11 | information that is currently in the issue, there's not enough information
12 | to take action. If you're the original author, feel free to reopen the issue
13 | if you have or find the answers needed to investigate further.
14 |
--------------------------------------------------------------------------------
/.github/workflows/cibuildwheel.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | tags:
6 | # ytf did they invent their own syntax that's almost regex?
7 | # ** matches 'zero or more of any character'
8 | - 'release-v[0-9]+.[0-9]+.[0-9]+**'
9 | - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
10 | jobs:
11 | build_wheels:
12 | name: Build wheels on ${{ matrix.os }}
13 | runs-on: ${{ matrix.os }}
14 | strategy:
15 | matrix:
16 | # macos-13 is an intel runner, macos-14 is apple silicon
17 | os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 | - name: Build wheels
22 | uses: pypa/cibuildwheel@v2.21.3
23 | env:
24 | CIBW_SOME_OPTION: value
25 | with:
26 | package-dir: .
27 | output-dir: wheelhouse
28 | config-file: "{package}/pyproject.toml"
29 | - uses: actions/upload-artifact@v4
30 | with:
31 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
32 | path: ./wheelhouse/*.whl
33 |
34 | build_sdist:
35 | name: Build source distribution
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v4
39 |
40 | - name: Build sdist
41 | run: pipx run build --sdist
42 | - uses: actions/upload-artifact@v4
43 | with:
44 | name: cibw-sdist
45 | path: dist/*.tar.gz
46 | create_release:
47 | needs: [build_wheels, build_sdist]
48 | runs-on: ubuntu-latest
49 | permissions:
50 | contents: write
51 | checks: write
52 | actions: read
53 | issues: read
54 | packages: write
55 | pull-requests: read
56 | repository-projects: read
57 | statuses: read
58 | steps:
59 | - name: Get the tag name and determine if it's a prerelease
60 | id: get_tag_info
61 | run: |
62 | FULL_TAG=${GITHUB_REF#refs/tags/}
63 | if [[ $FULL_TAG == release-* ]]; then
64 | TAG_NAME=${FULL_TAG#release-}
65 | IS_PRERELEASE=false
66 | elif [[ $FULL_TAG == prerelease-* ]]; then
67 | TAG_NAME=${FULL_TAG#prerelease-}
68 | IS_PRERELEASE=true
69 | else
70 | echo "Tag does not match expected patterns" >&2
71 | exit 1
72 | fi
73 | echo "FULL_TAG=$FULL_TAG" >> $GITHUB_ENV
74 | echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
75 | echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
76 | - uses: actions/download-artifact@v4
77 | with:
78 | # unpacks all CIBW artifacts into dist/
79 | pattern: cibw-*
80 | path: dist
81 | merge-multiple: true
82 | - name: Create Draft Release
83 | id: create_release
84 | uses: softprops/action-gh-release@v2
85 | if: startsWith(github.ref, 'refs/tags/')
86 | env:
87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
88 | with:
89 | name: ${{ env.TAG_NAME }}
90 | draft: true
91 | prerelease: ${{ env.IS_PRERELEASE }}
92 | files: "./dist/*"
93 |
--------------------------------------------------------------------------------
/.github/workflows/explosionbot.yml:
--------------------------------------------------------------------------------
1 | name: Explosion Bot
2 |
3 | on:
4 | issue_comment:
5 | types:
6 | - created
7 | - edited
8 |
9 | jobs:
10 | explosion-bot:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - name: Dump GitHub context
14 | env:
15 | GITHUB_CONTEXT: ${{ toJson(github) }}
16 | run: echo "$GITHUB_CONTEXT"
17 | - uses: actions/checkout@v3
18 | - uses: actions/setup-python@v4
19 | - name: Install and run explosion-bot
20 | run: |
21 | pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
22 | python -m explosionbot
23 | env:
24 | INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
25 | INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
26 | ENABLED_COMMANDS: "test_gpu"
27 | ALLOWED_TEAMS: "spacy-maintainers"
28 |
--------------------------------------------------------------------------------
/.github/workflows/issue-manager.yml:
--------------------------------------------------------------------------------
1 | name: Issue Manager
2 |
3 | on:
4 | schedule:
5 | - cron: "0 0 * * *"
6 | issue_comment:
7 | types:
8 | - created
9 | - edited
10 | issues:
11 | types:
12 | - labeled
13 |
14 | jobs:
15 | issue-manager:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: tiangolo/issue-manager@0.2.1
19 | with:
20 | token: ${{ secrets.GITHUB_TOKEN }}
21 | config: >
22 | {
23 | "resolved": {
24 | "delay": "P7D",
25 | "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
26 | "remove_label_on_comment": true,
27 | "remove_label_on_close": true
28 | }
29 | }
--------------------------------------------------------------------------------
/.github/workflows/publish_pypi.yml:
--------------------------------------------------------------------------------
1 | # The cibuildwheel action triggers on creation of a release; this workflow
2 | # triggers on publication.
3 | # The expected workflow is to create a draft release and let the wheels
4 | # upload, and then hit 'publish', which uploads to PyPI.
5 |
6 | on:
7 | release:
8 | types:
9 | - published
10 |
11 | jobs:
12 | upload_pypi:
13 | runs-on: ubuntu-latest
14 | environment:
15 | name: pypi
16 | url: https://pypi.org/p/spacy-transformers
17 | permissions:
18 | id-token: write
19 | contents: read
20 | if: github.event_name == 'release' && github.event.action == 'published'
21 | # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
22 | # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
23 | steps:
24 | - uses: robinraju/release-downloader@v1
25 | with:
26 | tag: ${{ github.event.release.tag_name }}
27 | fileName: '*'
28 | out-file-path: 'dist'
29 | - uses: pypa/gh-action-pypi-publish@release/v1
30 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | paths-ignore:
6 | - "*.md"
7 | pull_request:
8 | types: [opened, synchronize, reopened, edited]
9 | paths-ignore:
10 | - "*.md"
11 |
12 | env:
13 | MODULE_NAME: "spacy_transformers"
14 | RUN_MYPY: "true"
15 |
16 | jobs:
17 | tests:
18 | name: Test
19 | if: github.repository_owner == 'explosion'
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | os: [ubuntu-latest, windows-latest, macos-latest]
24 | python_version: ["3.12"]
25 | include:
26 | - os: macos-13
27 | python_version: "3.10"
28 | - os: windows-latest
29 | python_version: "3.11"
30 | - os: ubuntu-latest
31 | python_version: "3.12"
32 | - os: macos-13
33 | python_version: "3.12"
34 | - os: windows-latest
35 | python_version: "3.12"
36 |
37 | runs-on: ${{ matrix.os }}
38 |
39 | steps:
40 | - name: Check out repo
41 | uses: actions/checkout@v3
42 |
43 | - name: Configure Python version
44 | uses: actions/setup-python@v4
45 | with:
46 | python-version: ${{ matrix.python_version }}
47 |
48 | - name: Install dependencies
49 | run: |
50 | python -m pip install -U build pip setuptools wheel
51 | python -m pip install -r requirements.txt --force-reinstall
52 |
53 | - name: Build sdist
54 | run: |
55 | python -m build --sdist
56 |
57 | - name: Run mypy
58 | if: env.RUN_MYPY == 'true' && matrix.python_version != '3.6'
59 | shell: bash
60 | run: |
61 | python -m mypy $MODULE_NAME
62 |
63 | - name: Delete source directory
64 | shell: bash
65 | run: |
66 | rm -rf $MODULE_NAME
67 |
68 | - name: Uninstall all packages
69 | run: |
70 | python -m pip freeze --exclude pywin32 --exclude torch
71 | python -m pip freeze --exclude pywin32 --exclude torch > installed.txt
72 | python -m pip uninstall -y -r installed.txt
73 |
74 | - name: Install newest torch for python 3.7+
75 | if: matrix.python_version != '3.6'
76 | run: |
77 | python -m pip install torch --index-url https://download.pytorch.org/whl/cpu --force-reinstall
78 |
79 | - name: Install from sdist
80 | shell: bash
81 | run: |
82 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
83 | python -m pip install dist/$SDIST
84 |
85 | - name: Run tests
86 | shell: bash
87 | run: |
88 | python -m pip install -r requirements.txt --force-reinstall
89 | # The version of pytorch being used here requires numpy v2, but because of the way we're doing the
90 | # requirements installation here it's not being resolved that way. So just install numpy 1 here.
91 | python -m pip install "numpy<2"
92 | python -m pytest --pyargs $MODULE_NAME --cov=$MODULE_NAME
93 |
94 | - name: Test backwards compatibility for v1.0 models
95 | if: matrix.python_version == '3.9'
96 | run: |
97 | python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0-py3-none-any.whl --no-deps
98 | python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')"
99 |
100 | - name: Test backwards compatibility for v1.1 models
101 | if: matrix.python_version == '3.9'
102 | run: |
103 | python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl --no-deps
104 | python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')"
105 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | tmp/
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # vim
13 | .*.sw*
14 |
15 | # Cython / C extensions
16 | cythonize.json
17 | spacy_transformers/*.html
18 | *.cpp
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | pip-wheel-metadata/
36 | share/python-wheels/
37 | *.egg-info/
38 | .installed.cfg
39 | *.egg
40 | MANIFEST
41 |
42 | # PyInstaller
43 | # Usually these files are written by a python script from a template
44 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
45 | *.manifest
46 | *.spec
47 |
48 | # Installer logs
49 | pip-log.txt
50 | pip-delete-this-directory.txt
51 |
52 | # Unit test / coverage reports
53 | htmlcov/
54 | .tox/
55 | .nox/
56 | .coverage
57 | .coverage.*
58 | .cache
59 | nosetests.xml
60 | coverage.xml
61 | *.cover
62 | .hypothesis/
63 | .pytest_cache/
64 |
65 | # Translations
66 | *.mo
67 | *.pot
68 |
69 | # Django stuff:
70 | *.log
71 | local_settings.py
72 | db.sqlite3
73 | db.sqlite3-journal
74 |
75 | # Flask stuff:
76 | instance/
77 | .webassets-cache
78 |
79 | # Scrapy stuff:
80 | .scrapy
81 |
82 | # Sphinx documentation
83 | docs/_build/
84 |
85 | # PyBuilder
86 | target/
87 |
88 | # Jupyter Notebook
89 | .ipynb_checkpoints
90 |
91 | # IPython
92 | profile_default/
93 | ipython_config.py
94 |
95 | # pyenv
96 | .python-version
97 |
98 | # pipenv
99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | # install all needed dependencies.
103 | #Pipfile.lock
104 |
105 | # celery beat schedule file
106 | celerybeat-schedule
107 |
108 | # SageMath parsed files
109 | *.sage.py
110 |
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 |
135 | # Pyre type checker
136 | .pyre/
137 |
138 | # Pycharm project files
139 | *.idea
140 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 ExplosionAI GmbH
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include spacy_transformers *.pyi *.pyx *.pxd
2 | recursive-exclude spacy_transformers *.cpp
3 | include LICENSE
4 | include README.md
5 | include pyproject.toml
6 | include spacy_transformers/py.typed
7 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SHELL := /bin/bash
2 | PYVER := 3.6
3 | VENV := ./env$(PYVER)
4 |
5 | version := $(shell "bin/get-version.sh")
6 |
7 | dist/spacy-trf-$(version).pex : wheelhouse/spacy-trf-$(version).stamp
8 | $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -o $@ spacy_transformers==$(version) jsonschema
9 | chmod a+rx $@
10 |
11 | wheelhouse/spacy-trf-$(version).stamp : $(VENV)/bin/pex setup.py spacy_transformers/*.py* spacy_transformers/*/*.py*
12 | $(VENV)/bin/pip wheel . -w ./wheelhouse
13 | $(VENV)/bin/pip wheel jsonschema -w ./wheelhouse
14 | touch $@
15 |
16 | $(VENV)/bin/pex :
17 | python$(PYVER) -m venv $(VENV)
18 | $(VENV)/bin/pip install -U pip setuptools pex wheel
19 |
20 | .PHONY : clean
21 |
22 | clean : setup.py
23 | rm -rf dist/*
24 | rm -rf ./wheelhouse
25 | rm -rf $(VENV)
26 | python setup.py clean --all
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # spacy-transformers: Use pretrained transformers like BERT, XLNet and GPT-2 in spaCy
4 |
5 | This package provides [spaCy](https://github.com/explosion/spaCy) components and
6 | architectures to use transformer models via
7 | [Hugging Face's `transformers`](https://github.com/huggingface/transformers) in
8 | spaCy. The result is convenient access to state-of-the-art transformer
9 | architectures, such as BERT, GPT-2, XLNet, etc.
10 |
11 | > **This release requires [spaCy v3](https://spacy.io/usage/v3).** For the
12 | > previous version of this library, see the
13 | > [`v0.6.x` branch](https://github.com/explosion/spacy-transformers/tree/v0.6.x).
14 |
15 | [](https://github.com/explosion/spacy-transformers/actions/workflows/tests.yml)
16 | [](https://pypi.python.org/pypi/spacy-transformers)
17 | [](https://github.com/explosion/spacy-transformers/releases)
18 | [](https://github.com/ambv/black)
19 |
20 | ## Features
21 |
22 | - Use pretrained transformer models like **BERT**, **RoBERTa** and **XLNet** to
23 | power your spaCy pipeline.
24 | - Easy **multi-task learning**: backprop to one transformer model from several
25 | pipeline components.
26 | - Train using spaCy v3's powerful and extensible config system.
27 | - Automatic alignment of transformer output to spaCy's tokenization.
28 | - Easily customize what transformer data is saved in the `Doc` object.
29 | - Easily customize how long documents are processed.
30 | - Out-of-the-box serialization and model packaging.
31 |
32 | ## 🚀 Installation
33 |
34 | Installing the package from pip will automatically install all dependencies,
35 | including PyTorch and spaCy. Make sure you install this package **before** you
36 | install the models. Also note that this package requires **Python 3.6+**,
37 | **PyTorch v1.5+** and **spaCy v3.0+**.
38 |
39 | ```bash
40 | pip install 'spacy[transformers]'
41 | ```
42 |
43 | For GPU installation, find your CUDA version using `nvcc --version` and add the
44 | [version in brackets](https://spacy.io/usage/#gpu), e.g.
45 | `spacy[transformers,cuda92]` for CUDA9.2 or `spacy[transformers,cuda100]` for
46 | CUDA10.0.
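
For reference, here is a minimal sketch of enabling the GPU from Python once a
matching `cupy` build is installed via one of the extras above. It assumes
nothing beyond spaCy itself; `spacy.prefer_gpu()` and `spacy.require_gpu()` are
spaCy's standard helpers and should be called before loading a pipeline.

```python
import spacy

# Returns True if a GPU was activated; spacy.require_gpu() raises instead of
# returning False when no GPU is available.
is_using_gpu = spacy.prefer_gpu()
print("GPU enabled:", is_using_gpu)
```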
47 |
48 | If you are having trouble installing PyTorch, follow the
49 | [instructions](https://pytorch.org/get-started/locally/) on the official website
50 | for your specific operating system and requirements.
51 |
52 | ## 📖 Documentation
53 |
54 | > ⚠️ **Important note:** This package has been extensively refactored to take
55 | > advantage of [spaCy v3.0](https://spacy.io). Previous versions that were built
56 | > for [spaCy v2.x](https://v2.spacy.io) worked considerably differently. Please
57 | > see previous tagged versions of this README for documentation on prior
58 | > versions.
59 |
60 | - 📘
61 | [Embeddings, Transformers and Transfer Learning](https://spacy.io/usage/embeddings-transformers):
62 | How to use transformers in spaCy
63 | - 📘 [Training Pipelines and Models](https://spacy.io/usage/training): Train and
64 | update components on your own data and integrate custom models
65 | - 📘
66 | [Layers and Model Architectures](https://spacy.io/usage/layers-architectures):
67 | Power spaCy components with custom neural networks
68 | - 📗 [`Transformer`](https://spacy.io/api/transformer): Pipeline component API
69 | reference
70 | - 📗
71 | [Transformer architectures](https://spacy.io/api/architectures#transformers):
72 | Architectures and registered functions
73 |
74 | ## Applying pretrained text and token classification models
75 |
76 | Note that the `transformer` component from `spacy-transformers` does not support
77 | task-specific heads like token or text classification. A task-specific
78 | transformer model can be used as a source of features to train spaCy components
79 | like `ner` or `textcat`, but the `transformer` component does not provide access
80 | to task-specific heads for training or inference.
81 |
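As a minimal sketch of what this looks like in practice, the snippet below
assumes the `en_core_web_trf` pipeline (which is built on the `transformer`
component from this package) has been downloaded with
`python -m spacy download en_core_web_trf`:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")

# Predictions come from spaCy components trained on top of the shared features:
print([(ent.text, ent.label_) for ent in doc.ents])

# The transformer output itself is stored on the Doc: hidden-state tensors and
# the wordpiece-to-token alignment, but no task-specific classification head.
trf_data = doc._.trf_data
print(trf_data.wordpieces.strings[0][:10])
print([tensor.shape for tensor in trf_data.tensors])
```
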
82 | Alternatively, if you only want to use the **predictions** from an existing
83 | Hugging Face text or token classification model, you can use the wrappers from
84 | [`spacy-huggingface-pipelines`](https://github.com/explosion/spacy-huggingface-pipelines)
85 | to incorporate task-specific transformer models into your spaCy pipelines.
86 |
87 | ## Bug reports and other issues
88 |
89 | Please use [spaCy's issue tracker](https://github.com/explosion/spaCy/issues) to
90 | report a bug, or open a new thread on the
91 | [discussion board](https://github.com/explosion/spaCy/discussions) for any other
92 | issue.
93 |
--------------------------------------------------------------------------------
/bin/get-version.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | version=$(grep "version = " setup.cfg)
6 | version=${version/version = }
7 | version=${version/\'/}
8 | version=${version/\'/}
9 | version=${version/\"/}
10 | version=${version/\"/}
11 |
12 | echo $version
13 |
--------------------------------------------------------------------------------
/bin/push-tag.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | # Insist repository is clean
6 | git diff-index --quiet HEAD
7 |
8 | git checkout $1
9 | git pull origin $1
10 | git push origin $1
11 |
12 | version=$(grep "version = " setup.cfg)
13 | version=${version/version = }
14 | version=${version/\'/}
15 | version=${version/\'/}
16 | version=${version/\"/}
17 | version=${version/\"/}
18 | git tag "v$version"
19 | git push origin "v$version"
20 |
--------------------------------------------------------------------------------
/build-constraints.txt:
--------------------------------------------------------------------------------
1 | # build version constraints for use with wheelwright + multibuild
2 | numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
3 | numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
4 | numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
5 | numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
6 | numpy>=1.25.0; python_version>='3.9'
7 |
--------------------------------------------------------------------------------
/examples/configs/joint-core-bert.cfg:
--------------------------------------------------------------------------------
1 | [training]
2 | seed = 0
3 | gold_preproc = false
4 | # Limitations on training document length or number of examples.
5 | max_length = 500
6 | limit = 0
7 | patience = 10000
8 | eval_frequency = 400
9 | dropout = 0.1
10 | init_tok2vec = null
11 | max_epochs = 0
12 | max_steps = 0
13 | orth_variant_level = 0.0
14 |
15 | scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
16 | score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
17 |
18 | base_model = null
19 | use_pytorch_for_gpu_memory = true
20 | omit_extra_lookups = false
21 | raw_text = null
22 | tag_map = null
23 | vectors = null
24 | morph_rules = null
25 |
26 | batch_by = "padded"
27 | batch_size = 2000
28 | accumulate_gradient = 3
29 | discard_oversize = true
30 | eval_batch_size = 256
31 |
32 | [training.optimizer]
33 | @optimizers = "Adam.v1"
34 | beta1 = 0.9
35 | beta2 = 0.999
36 | eps = 1e-8
37 | L2_is_weight_decay = true
38 | L2 = 0.01
39 | grad_clip = 1.0
40 | use_averages = false
41 |
42 | [training.optimizer.learn_rate]
43 | @schedules = "warmup_linear.v1"
44 | warmup_steps = 250
45 | total_steps = 20000
46 | initial_rate = 5e-5
47 |
48 |
49 | [nlp]
50 | lang = "en"
51 | stop_words = []
52 | lex_attr_getters = {}
53 | pipeline = ["transformer", "tagger", "parser", "ner"]
54 |
55 | [nlp.tokenizer]
56 | @tokenizers = "spacy.Tokenizer.v1"
57 |
58 | [nlp.lemmatizer]
59 | @lemmatizers = "spacy.Lemmatizer.v1"
60 |
61 | [nlp.writing_system]
62 | direction = "ltr"
63 | has_case = true
64 | has_letters = true
65 |
66 | [components]
67 |
68 | [components.transformer]
69 | factory = "transformer"
70 | max_batch_items = 4096
71 |
72 | [components.tagger]
73 | factory = "tagger"
74 |
75 | [components.parser]
76 | factory = "parser"
77 | learn_tokens = false
78 | min_action_freq = 1
79 |
80 | [components.ner]
81 | factory = "ner"
82 | learn_tokens = false
83 | min_action_freq = 1
84 |
85 | # This loads the Huggingface Transformers model. The transformer is applied
86 | # to a batch of Doc objects, which are preprocessed into Span objects to support
87 | # longer documents.
88 | [components.transformer.model]
89 | @architectures = "spacy-transformers.TransformerModel.v3"
90 | name = "roberta-base"
91 | tokenizer_config = {"use_fast": true}
92 | transformer_config = {"output_attentions": false}
93 |
94 | [components.transformer.model.get_spans]
95 | # You can set a custom strategy for preparing spans from the batch, e.g. you
96 | # can predict over sentences. Here we predict over the whole document.
97 | @span_getters = "strided_spans.v1"
98 | window = 128
99 | stride = 96
100 |
101 | [components.tagger.model]
102 | @architectures = "spacy.Tagger.v1"
103 |
104 | [components.parser.model]
105 | @architectures = "spacy.TransitionBasedParser.v1"
106 | nr_feature_tokens = 8
107 | hidden_width = 64
108 | maxout_pieces = 2
109 | use_upper = false
110 |
111 | [components.ner.model]
112 | @architectures = "spacy.TransitionBasedParser.v1"
113 | nr_feature_tokens = 3
114 | hidden_width = 64
115 | maxout_pieces = 2
116 | use_upper = false
117 |
118 | # These "listener" layers are connected to the transformer pipeline component
119 | # in order to achieve multi-task learning across the pipeline.
120 | # They rely on the transformer to predict over the batch and cache the result
121 | # and callback. The gradient for the transformers will be accumulated by
122 | # the listeners, and then the last listener will call the backprop callback.
123 | [components.tagger.model.tok2vec]
124 | @architectures = "spacy-transformers.TransformerListener.v1"
125 | grad_factor = 1.0
126 |
127 | [components.parser.model.tok2vec]
128 | @architectures = "spacy-transformers.TransformerListener.v1"
129 | grad_factor = 1.0
130 |
131 | [components.ner.model.tok2vec]
132 | @architectures = "spacy-transformers.TransformerListener.v1"
133 | grad_factor = 1.0
134 |
135 | # These pooling layers control how the token vectors are calculated from
136 | # the word pieces. The reduce_mean layer averages the wordpieces, so if you
137 | # have one token aligned to multiple wordpieces (as is expected), the token's
138 | # vector will be the average of the wordpieces. The most obvious alternative
139 | # is reduce_last.v1, which would just use the last wordpiece. You could also
140 | # try reduce_first, reduce_sum or even reduce_max.
141 |
142 | [components.tagger.model.tok2vec.pooling]
143 | @layers = "reduce_mean.v1"
144 |
145 | [components.parser.model.tok2vec.pooling]
146 | @layers = "reduce_mean.v1"
147 |
148 | [components.ner.model.tok2vec.pooling]
149 | @layers = "reduce_mean.v1"
150 |
--------------------------------------------------------------------------------
/examples/configs/ner-albert.cfg:
--------------------------------------------------------------------------------
1 | [training]
2 | patience = 10000
3 | eval_frequency = 200
4 | dropout = 0.1
5 | init_tok2vec = null
6 | vectors = null
7 | max_epochs = 10000
8 | orth_variant_level = 0.3
9 | gold_preproc = true
10 | max_length = 0
11 | scores = ["speed", "ents_p", "ents_r", "ents_f"]
12 | score_weights = {"ents_f": 1.0}
13 | limit = 0
14 | width = 768
15 | accumulate_gradient = 2
16 | seed = 0
17 | use_pytorch_for_gpu_memory = true
18 |
19 |
20 | [training.batch_size]
21 | @schedules = "compounding.v1"
22 | start = 500
23 | stop = 500
24 | compound = 1.001
25 |
26 | [optimizer]
27 | @optimizers = "Adam.v1"
28 | beta1 = 0.9
29 | beta2 = 0.999
30 | L2_is_weight_decay = true
31 | L2 = 0.01
32 | grad_clip = 1.0
33 | use_averages = false
34 | eps = 1e-8
35 |
36 | [optimizer.learn_rate]
37 | @schedules = "warmup_linear.v1"
38 | initial_rate = 5e-5
39 | warmup_steps = 250
40 | total_steps = 5000
41 |
42 | [nlp]
43 | lang = "en"
44 | vectors = ${training:vectors}
45 |
46 | [nlp.pipeline.ner]
47 | factory = "ner"
48 |
49 | [nlp.pipeline.ner.model]
50 | @architectures = "spacy.TransitionBasedParser.v1"
51 | nr_feature_tokens = 3
52 | hidden_width = 128
53 | maxout_pieces = 3
54 | use_upper = false
55 |
56 | [nlp.pipeline.ner.model.tok2vec]
57 | @architectures = "spacy.Tok2VecTransformer.v3"
58 | name = "albert-base-v2"
59 | tokenizer_config = {"use_fast": false}
60 | transformer_config = {"output_attentions": false}
61 | grad_factor = 1.0
62 |
63 | [nlp.pipeline.ner.model.tok2vec.get_spans]
64 | @span_getters = "spacy-transformers.strided_spans.v1"
65 | window = 256
66 | stride = 256
67 |
68 | [nlp.pipeline.ner.model.tok2vec.pooling]
69 | @layers = "reduce_mean.v1"
70 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools",
4 | "cython>=0.25",
5 | "numpy>=2.0.0,<3.0.0"
6 | ]
7 | build-backend = "setuptools.build_meta"
8 |
9 | [tool.cibuildwheel]
10 | build = "*"
11 | skip = "pp* cp36* cp37* cp38*"
12 | test-skip = ""
13 | free-threaded-support = false
14 |
15 | archs = ["native"]
16 |
17 | build-frontend = "default"
18 | config-settings = {}
19 | dependency-versions = "pinned"
20 | environment = {}
21 | environment-pass = []
22 | build-verbosity = 0
23 |
24 | before-all = ""
25 | before-build = ""
26 | repair-wheel-command = ""
27 |
28 | test-command = ""
29 | before-test = ""
30 | test-requires = []
31 | test-extras = []
32 |
33 | container-engine = "docker"
34 |
35 | manylinux-x86_64-image = "manylinux2014"
36 | manylinux-i686-image = "manylinux2014"
37 | manylinux-aarch64-image = "manylinux2014"
38 | manylinux-ppc64le-image = "manylinux2014"
39 | manylinux-s390x-image = "manylinux2014"
40 | manylinux-pypy_x86_64-image = "manylinux2014"
41 | manylinux-pypy_i686-image = "manylinux2014"
42 | manylinux-pypy_aarch64-image = "manylinux2014"
43 |
44 | musllinux-x86_64-image = "musllinux_1_2"
45 | musllinux-i686-image = "musllinux_1_2"
46 | musllinux-aarch64-image = "musllinux_1_2"
47 | musllinux-ppc64le-image = "musllinux_1_2"
48 | musllinux-s390x-image = "musllinux_1_2"
49 |
50 |
51 | [tool.cibuildwheel.linux]
52 | repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
53 |
54 | [tool.cibuildwheel.macos]
55 | repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
56 |
57 | [tool.cibuildwheel.windows]
58 |
59 | [tool.cibuildwheel.pyodide]
60 |
61 | [tool.isort]
62 | profile = "black"
63 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | spacy>=3.5.0,<4.1.0
2 | numpy>=1.15.0
3 | transformers[sentencepiece]>=3.4.0,<4.42.0
4 | torch>=1.8.0
5 | srsly>=2.4.0,<3.0.0
6 | dataclasses>=0.6,<1.0; python_version < "3.7"
7 | spacy-alignments>=0.7.2,<1.0.0
8 | # Development dependencies
9 | cython>=0.25
10 | pytest>=5.2.0
11 | pytest-cov>=2.7.0,<5.0.0
12 | mypy>=1.0.0,<1.6.0; platform_machine!='aarch64' and python_version >= "3.7"
13 | types-contextvars>=0.1.2; python_version < "3.7"
14 | types-dataclasses>=0.1.3; python_version < "3.7"
15 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | version = 1.3.9
3 | description = spaCy pipelines for pre-trained BERT and other transformers
4 | url = https://spacy.io
5 | author = Explosion
6 | author_email = contact@explosion.ai
7 | license = MIT
8 | long_description = file: README.md
9 | long_description_content_type = text/markdown
10 | classifiers =
11 | Development Status :: 5 - Production/Stable
12 | Environment :: Console
13 | Intended Audience :: Developers
14 | Intended Audience :: Science/Research
15 | Topic :: Scientific/Engineering
16 | Topic :: Scientific/Engineering :: Artificial Intelligence
17 | License :: OSI Approved :: MIT License
18 | Operating System :: POSIX :: Linux
19 | Operating System :: MacOS :: MacOS X
20 | Operating System :: Microsoft :: Windows
21 | Programming Language :: Python :: 3
22 | Programming Language :: Python :: 3.7
23 | Programming Language :: Python :: 3.8
24 | Programming Language :: Python :: 3.9
25 | Programming Language :: Python :: 3.10
26 | Programming Language :: Python :: 3.11
27 |
28 | [options]
29 | zip_safe = false
30 | include_package_data = true
31 | python_requires = >=3.9,<3.14
32 | install_requires =
33 | spacy>=3.5.0,<4.1.0
34 | numpy>=1.15.0; python_version < "3.9"
35 | numpy>=1.19.0; python_version >= "3.9"
36 | transformers>=3.4.0,<4.50.0
37 | torch>=1.8.0
38 | srsly>=2.4.0,<3.0.0
39 | dataclasses>=0.6,<1.0; python_version < "3.7"
40 | spacy-alignments>=0.7.2,<1.0.0
41 |
42 | [options.extras_require]
43 | cuda =
44 | cupy>=5.0.0b4
45 | cuda80 =
46 | cupy-cuda80>=5.0.0b4
47 | cuda90 =
48 | cupy-cuda90>=5.0.0b4
49 | cuda91 =
50 | cupy-cuda91>=5.0.0b4
51 | cuda92 =
52 | cupy-cuda92>=5.0.0b4
53 | cuda100 =
54 | cupy-cuda100>=5.0.0b4
55 | cuda101 =
56 | cupy-cuda101>=5.0.0b4
57 | cuda102 =
58 | cupy-cuda102>=5.0.0b4
59 | cuda110 =
60 | cupy-cuda110>=5.0.0b4
61 | cuda111 =
62 | cupy-cuda111>=5.0.0b4
63 | cuda112 =
64 | cupy-cuda112>=5.0.0b4
65 |
66 | [options.entry_points]
67 | spacy_factories =
68 | transformer = spacy_transformers.pipeline_component:make_transformer
69 |
70 | spacy_architectures =
71 | spacy-transformers.TransformerListener.v1 = spacy_transformers:architectures.transformer_listener_tok2vec_v1
72 | spacy-transformers.Tok2VecTransformer.v1 = spacy_transformers:architectures.transformer_tok2vec_v1
73 | spacy-transformers.Tok2VecTransformer.v2 = spacy_transformers:architectures.transformer_tok2vec_v2
74 | spacy-transformers.Tok2VecTransformer.v3 = spacy_transformers:architectures.transformer_tok2vec_v3
75 | spacy-transformers.TransformerModel.v1 = spacy_transformers:architectures.create_TransformerModel_v1
76 | spacy-transformers.TransformerModel.v2 = spacy_transformers:architectures.create_TransformerModel_v2
77 | spacy-transformers.TransformerModel.v3 = spacy_transformers:architectures.create_TransformerModel_v3
78 |
79 | [bdist_wheel]
80 | universal = true
81 |
82 | [sdist]
83 | formats = gztar
84 |
85 | [flake8]
86 | ignore = E203, E266, E501, E731, W503
87 | max-line-length = 80
88 | select = B,C,E,F,W,T4,B9
89 | exclude =
90 | .env,
91 | .git,
92 | __pycache__,
93 |
94 | [mypy]
95 | ignore_missing_imports = True
96 | no_implicit_optional = True
97 | plugins = pydantic.mypy, thinc.mypy
98 |
99 | [coverage:run]
100 |
101 | [coverage:report]
102 | omit =
103 | **/tests/*
104 | **/_vendorized/*
105 | **/about.py
106 | exclude_lines =
107 | pragma: no cover
108 | # Don't complain about missing debug-only code:
109 | def __unicode__
110 | def __repr__
111 | if self\.debug
112 | # Don't complain if tests don't hit defensive assertion code:
113 | raise AssertionError
114 | raise NotImplementedError
115 | # Don't complain if non-runnable code isn't run:
116 | if 0:
117 | if __name__ == .__main__.:
118 | show_missing = True
119 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, Extension, find_packages
2 | from setuptools.command.build_ext import build_ext
3 | from Cython.Build import cythonize
4 | from Cython.Compiler import Options
5 | import numpy
6 |
7 |
8 | # Preserve `__doc__` on functions and classes
9 | # http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
10 | Options.docstrings = True
11 |
12 | COMPILE_OPTIONS = {
13 | "msvc": ["/Ox", "/EHsc"],
14 | "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
15 | "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
16 | }
17 | LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []}
18 | COMPILER_DIRECTIVES = {
19 | "language_level": -3,
20 | "embedsignature": True,
21 | "annotation_typing": False,
22 | }
23 |
24 |
25 | # By overriding build_extensions we can access the actual compiler that will be used, which is only known after finalize_options
26 | # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
27 | class build_ext_options:
28 | def build_options(self):
29 | for e in self.extensions:
30 | e.extra_compile_args += COMPILE_OPTIONS.get(
31 | self.compiler.compiler_type, COMPILE_OPTIONS["other"]
32 | )
33 | for e in self.extensions:
34 | e.extra_link_args += LINK_OPTIONS.get(
35 | self.compiler.compiler_type, LINK_OPTIONS["other"]
36 | )
37 |
38 |
39 | class build_ext_subclass(build_ext, build_ext_options):
40 | def build_extensions(self):
41 | build_ext_options.build_options(self)
42 | build_ext.build_extensions(self)
43 |
44 |
45 | def setup_package():
46 | ext_modules = [
47 | Extension(
48 | "spacy_transformers.align",
49 | ["spacy_transformers/align.pyx"],
50 | language="c++",
51 | include_dirs=[numpy.get_include()],
52 | extra_compile_args=["-std=c++11"],
53 | ),
54 | ]
55 |
56 | ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
57 |
58 | setup(
59 | name="spacy-transformers",
60 | packages=find_packages(),
61 | ext_modules=ext_modules,
62 | cmdclass={"build_ext": build_ext_subclass},
63 | package_data={"": ["*.pyx", "*.pxd", "*.pxi"]},
64 | )
65 |
66 |
67 | if __name__ == "__main__":
68 | setup_package()
69 |
--------------------------------------------------------------------------------
/spacy_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from . import architectures
2 | from . import annotation_setters
3 | from . import span_getters
4 | from .layers import TransformerModel
5 | from .pipeline_component import Transformer, install_extensions
6 | from .data_classes import TransformerData, FullTransformerBatch
7 | from .util import registry
8 |
9 |
10 | __all__ = [
11 | "install_extensions",
12 | "Transformer",
13 | "TransformerModel",
14 | "TransformerData",
15 | "FullTransformerBatch",
16 | "architectures",
17 | "annotation_setters",
18 | "span_getters",
19 | "registry",
20 | ]
21 |
--------------------------------------------------------------------------------
/spacy_transformers/align.pyi:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple, Callable, Optional
2 | from spacy.tokens import Span, Token
3 | from thinc.api import Ops
4 | from thinc.types import Ragged, Floats2d, Ints2d
5 |
6 | def apply_alignment(
7 | ops: Ops, align: Ragged, X: Floats2d
8 | ) -> Tuple[Ragged, Callable]: ...
9 | def get_token_positions(spans: List[Span]) -> Dict[Token, int]: ...
10 | def get_alignment_via_offset_mapping(
11 | spans: List[Span],
12 | offset_mapping: Ints2d,
13 | ) -> Ragged: ...
14 | def get_alignment(
15 | spans: List[Span],
16 | wordpieces: List[List[str]],
17 | special_tokens: Optional[List[str]] = None,
18 | ) -> Ragged: ...
19 | def get_span2wp_from_offset_mapping(
20 | span: Span,
21 | wp_char_offsets: Tuple[int],
22 | ) -> List[List[int]]: ...
23 |
--------------------------------------------------------------------------------
/spacy_transformers/align.pyx:
--------------------------------------------------------------------------------
1 | # cython: infer_types=True, boundscheck=False
2 | from typing import cast, Dict, List, Tuple, Callable, Set, Optional
3 | import numpy
4 | from spacy_alignments.tokenizations import get_alignments
5 | from spacy.tokens import Span, Token
6 | from thinc.api import Ops
7 | from thinc.types import Ragged, Floats2d, Ints1d, Ints2d
8 |
9 | from cython.operator cimport dereference as deref
10 | from cython.operator cimport preincrement as preinc
11 | from libc.stdint cimport uint32_t, int32_t, int64_t
12 | from libc.stdlib cimport free
13 | from libcpp.unordered_set cimport unordered_set
14 | from libcpp.vector cimport vector
15 |
16 | ctypedef unordered_set[uint32_t]* unordered_set_uint32_t_ptr
17 |
18 |
19 | def apply_alignment(ops: Ops, align: Ragged, X: Floats2d) -> Tuple[Ragged, Callable]:
20 | """Align wordpiece data (X) to match tokens, and provide a callback to
21 | reverse it.
22 |
23 | This function returns a Ragged array, which represents the fact that one
24 | token may be aligned against multiple wordpieces. It's a nested list,
25 | concatenated with a lengths array to indicate the nested structure.
26 |
27 | The alignment is also a Ragged array, where the lengths indicate how many
28 | wordpieces each token is aligned against. The output ragged therefore has
29 | the same lengths as the alignment ragged, which means the output data
30 | also has the same number of data rows as the alignment. The size of the
31 | lengths array indicates the number of tokens in the batch.
32 |
33 | The actual alignment is a simple indexing operation:
34 |
35 | for i, index in enumerate(align.data):
36 | Y[i] = X[index]
37 |
38 | Which is vectorized via numpy advanced indexing:
39 |
40 | Y = X[align.data]
41 |
42 | The inverse operation, for the backward pass, uses the 'scatter_add' op
43 | because one wordpiece may be aligned against multiple tokens. So we need:
44 |
45 | for i, index in enumerate(align.data):
46 | X[index] += Y[i]
47 |
48 | The addition wouldn't occur if we simply did `X[index] = Y`, so we use
49 | the scatter_add op.
50 | """
51 | if not align.lengths.sum():
52 | return _apply_empty_alignment(ops, align, X)
53 | shape = X.shape
54 | indices = cast(Ints1d, align.dataXd)
55 | Y = Ragged(X[indices], cast(Ints1d, ops.asarray(align.lengths)))
56 |
57 | def backprop_apply_alignment(dY: Ragged) -> Floats2d:
58 | assert dY.data.shape[0] == indices.shape[0]
59 | dX = ops.alloc2f(*shape)
60 | ops.scatter_add(dX, indices, cast(Floats2d, dY.dataXd))
61 | return dX
62 |
63 | return Y, backprop_apply_alignment
64 |
65 |
66 | def _apply_empty_alignment(ops, align, X):
67 | shape = X.shape
68 | Y = Ragged(
69 | ops.alloc2f(align.lengths.shape[0], X.shape[1]),
70 | ops.alloc1i(align.lengths.shape[0]) + 1,
71 | )
72 |
73 | def backprop_null_alignment(dY: Ragged) -> Floats2d:
74 | return ops.alloc2f(*shape)
75 |
76 | return Y, backprop_null_alignment
77 |
78 |
79 | def get_token_positions(spans: List[Span]) -> Dict[Token, int]:
80 | token_positions: Dict[Token, int] = {}
81 | seen_docs = set()
82 | for span in spans:
83 | if span.doc in seen_docs:
84 | continue
85 | seen_docs.add(span.doc)
86 | for token in span.doc:
87 | if token not in token_positions:
88 | token_positions[token] = len(token_positions)
89 | return token_positions
90 |
91 |
92 | def get_alignment_via_offset_mapping(
93 | spans: List[Span],
94 | offset_mapping: Ints2d,
95 | ) -> Ragged:
96 | if len(spans) != len(offset_mapping):
97 | raise ValueError("Cannot align batches of different sizes.")
98 | # Tokens can occur more than once, and we need the alignment of each token
99 | # to its place in the concatenated wordpieces array.
100 | token_positions = get_token_positions(spans)
101 | alignment: List[Set[int]] = [set() for _ in range(len(token_positions))]
102 | wp_start = 0
103 | for i, span in enumerate(spans):
104 | span_offset_mapping = offset_mapping[i]
105 | span2wp = get_span2wp_from_offset_mapping(span, span_offset_mapping)
106 | for token, wp_js in zip(span, span2wp):
107 | position = token_positions[token]
108 | alignment[position].update(wp_start + j for j in wp_js)
109 | wp_start += span_offset_mapping.shape[0]
110 | lengths: List[int] = []
111 | flat: List[int] = []
112 | for a in alignment:
113 | lengths.append(len(a))
114 | flat.extend(sorted(a))
115 | align = Ragged(
116 | cast(Ints1d, numpy.array(flat, dtype="i")),
117 | cast(Ints1d, numpy.array(lengths, dtype="i")),
118 | )
119 | return align
120 |
121 |
122 | def get_alignment(
123 | spans: List[Span],
124 | wordpieces: List[List[str]],
125 | special_tokens: Optional[List[str]] = None,
126 | ) -> Ragged:
127 | """Compute a ragged alignment array that records, for each unique token in
128 | `spans`, the corresponding indices in the flattened `wordpieces` array.
129 | For instance, imagine you have two overlapping spans:
130 |
131 | [[I, like, walking], [walking, outdoors]]
132 |
133 | And their wordpieces are:
134 |
135 | [[I, like, walk, ing], [walk, ing, out, doors]]
136 |
137 | We want to align "walking" against [walk, ing, walk, ing], which have
138 | indices [2, 3, 4, 5] once the nested wordpieces list is flattened.
139 |
140 | The nested alignment list would be:
141 |
142 | [[0], [1], [2, 3, 4, 5], [6, 7]]
143 | I like walking outdoors
144 |
145 | Which gets flattened into the ragged array:
146 |
147 | [0, 1, 2, 3, 4, 5, 6, 7]
148 | [1, 1, 4, 2]
149 |
150 | The ragged format allows the aligned data to be computed via:
151 |
152 | tokens = Ragged(wp_tensor[align.data], align.lengths)
153 |
154 | This produces a ragged format, indicating which tokens need to be collapsed
155 | to make the aligned array. The reduction is deferred for a later step, so
156 | the user can configure it. The indexing is especially efficient in trivial
157 | cases like this where the indexing array is completely continuous.
158 | """
159 | if len(spans) != len(wordpieces):
160 | raise ValueError("Cannot align batches of different sizes.")
161 | if special_tokens is None:
162 | special_tokens = []
163 | # Tokens can occur more than once, and we need the alignment of each token
164 | # to its place in the concatenated wordpieces array.
165 | token_positions = get_token_positions(spans)
166 | alignment: List[Set[int]] = [set() for _ in range(len(token_positions))]
167 | wp_start = 0
168 | for i, (span, wp_toks) in enumerate(zip(spans, wordpieces)):
169 | sp_toks = [token.text for token in span]
170 | wp_toks_filtered = wp_toks
171 | # In the case that the special tokens do not appear in the text, filter
172 | # them out for alignment purposes so that special tokens like "" are
173 | # not aligned to the character "s" in the text. (If the special tokens
174 | # appear in the text, it's not possible to distinguish them from the
175 | # added special tokens, so they may be aligned incorrectly.)
176 | if not any([special in span.text for special in special_tokens]):
177 | wp_toks_filtered = [
178 | tok if tok not in special_tokens else "" for tok in wp_toks
179 | ]
180 | span2wp, wp2span = get_alignments(sp_toks, wp_toks_filtered)
181 | for token, wp_js in zip(span, span2wp):
182 | position = token_positions[token]
183 | alignment[position].update(wp_start + j for j in wp_js)
184 | wp_start += len(wp_toks)
185 | lengths: List[int] = []
186 | flat: List[int] = []
187 | for a in alignment:
188 | lengths.append(len(a))
189 | flat.extend(sorted(a))
190 | align = Ragged(
191 | cast(Ints1d, numpy.array(flat, dtype="i")),
192 | cast(Ints1d, numpy.array(lengths, dtype="i")),
193 | )
194 | return align
195 |
196 |
197 | def get_span2wp_from_offset_mapping(span, wp_char_offsets):
198 | # create a mapping of char indices to spacy token indices
199 | cdef int span_idx = span[0].idx
200 | cdef int span_i = span[0].i
201 | cdef int char_idx, rel_token_i
202 | # size is +1 so we don't have to check whether the text has a trailing space
203 | char_to_sp_token = numpy.full((len(span.text) + 1,), -1, dtype="int32")
204 | for token in span:
205 | rel_token_i = token.i - span_i
206 | for char_idx in range(
207 | token.idx - span_idx,
208 | token.idx - span_idx + len(token) + 1,
209 | ):
210 | char_to_sp_token[char_idx] = rel_token_i
211 |
212 | # align all wordpiece tokens to one or more spacy token indices
213 | cdef vector[unordered_set_uint32_t_ptr] alignment
214 | for _ in range(len(span)):
215 | alignment.push_back(new unordered_set[uint32_t]())
216 | _get_span2wp_alignment(
217 | &alignment,
218 | numpy.ascontiguousarray(char_to_sp_token),
219 | char_to_sp_token.size,
220 | numpy.ascontiguousarray(wp_char_offsets, dtype="int64"),
221 | wp_char_offsets.shape[0],
222 | )
223 |
224 | # convert the alignment into a list of aligned wordpiece indices per spacy
225 | # token index (unsorted at this point)
226 | cdef unordered_set_uint32_t_ptr s
227 | cdef vector[unordered_set_uint32_t_ptr].iterator it_v = alignment.begin()
228 | cdef unordered_set[uint32_t].iterator it_s
229 | result: List[List[int]] = []
230 | while it_v != alignment.end():
231 | result.append([])
232 | s = deref(it_v)
233 | it_s = s.begin()
234 | while it_s != s.end():
235 | result[-1].append(deref(it_s))
236 | preinc(it_s)
237 | del s
238 | preinc(it_v)
239 | return result
240 |
241 |
242 | cdef int _get_span2wp_alignment(
243 | vector[unordered_set_uint32_t_ptr]* alignment,
244 | int32_t[::1] char_to_sp_token,
245 | int char_to_sp_token_length,
246 | int64_t[:, ::1] wp_char_offsets,
247 | int wp_char_offsets_length,
248 | ) nogil:
249 | cdef int char_idx, start_idx, end_idx, token_i
250 | cdef int wp_j = 0
251 | cdef int alignment_size = alignment.size()
252 | while wp_j < wp_char_offsets_length:
253 | start_idx = wp_char_offsets[wp_j][0]
254 | end_idx = wp_char_offsets[wp_j][1]
255 | char_idx = start_idx
256 | while char_idx < end_idx:
257 | if 0 <= char_idx < char_to_sp_token_length:
258 | token_i = char_to_sp_token[char_idx]
259 | else:
260 | token_i = -1
261 | if 0 <= token_i < alignment_size:
262 | deref(alignment.at(token_i)).insert(wp_j)
263 | char_idx += 1
264 | wp_j += 1
265 | return 0
266 |
--------------------------------------------------------------------------------
/spacy_transformers/annotation_setters.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, List
2 | from spacy.tokens import Doc
3 |
4 | from .util import registry
5 | from .data_classes import FullTransformerBatch
6 |
7 |
8 | def null_annotation_setter(docs: List[Doc], trf_data: FullTransformerBatch) -> None:
9 | """Set no additional annotations on the Doc objects."""
10 | pass
11 |
12 |
13 | @registry.annotation_setters("spacy-transformers.null_annotation_setter.v1") # type: ignore
14 | def configure_null_annotation_setter() -> Callable[
15 | [List[Doc], FullTransformerBatch], None
16 | ]:
17 | return null_annotation_setter
18 |
19 |
20 | __all__ = ["null_annotation_setter", "configure_null_annotation_setter"]
21 |
--------------------------------------------------------------------------------
/spacy_transformers/architectures.py:
--------------------------------------------------------------------------------
1 | from typing import List, Callable
2 | from thinc.api import Model, chain
3 | from thinc.types import Ragged, Floats2d
4 | from spacy.tokens import Doc
5 |
6 | from .layers import TransformerModel, TransformerListener
7 | from .layers import trfs2arrays, split_trf_batch
8 | from .util import registry
9 | from .data_classes import FullTransformerBatch
10 |
11 |
12 | @registry.architectures.register("spacy-transformers.TransformerListener.v1")
13 | def transformer_listener_tok2vec_v1(
14 | pooling: Model[Ragged, Floats2d], grad_factor: float = 1.0, upstream: str = "*"
15 | ) -> Model[List[Doc], List[Floats2d]]:
16 | """Create a 'TransformerListener' layer, which will connect to a Transformer
17 | component earlier in the pipeline.
18 |
19 | The layer takes a list of Doc objects as input, and produces a list of
20 | 2d arrays as output, with each array having one row per token. Most spaCy
21 | models expect a sublayer with this signature, making it easy to connect them
22 | to a transformer model via this sublayer.
23 | Transformer models usually operate over wordpieces, which usually don't align
24 | one-to-one against spaCy tokens. The layer therefore requires a reduction
25 | operation in order to calculate a single token vector given zero or more
26 | wordpiece vectors.
27 |
28 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
29 | the token vectors based on zero or more wordpiece vectors. If in doubt,
30 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
31 | grad_factor (float): Reweight gradients from the component before passing
32 | them upstream. You can set this to 0 to "freeze" the transformer weights
33 | with respect to the component, or use it to make some components more
34 | significant than others. Leaving it at 1.0 is usually fine.
35 | upstream (str): A string to identify the 'upstream' Transformer
36 | to communicate with. The upstream name should either be the wildcard
37 | string '*', or the name of the `Transformer` component. You'll almost
38 | never have multiple upstream Transformer components, so the wildcard
39 | string will almost always be fine.
40 | """
41 | listener = TransformerListener(upstream_name=upstream)
42 | model: Model = chain(listener, trfs2arrays(pooling, grad_factor))
43 | model.set_ref("listener", listener)
44 | return model
45 |
46 |
47 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v1")
48 | def transformer_tok2vec_v1(
49 | name: str,
50 | get_spans,
51 | tokenizer_config: dict,
52 | pooling: Model[Ragged, Floats2d],
53 | grad_factor: float = 1.0,
54 | ) -> Model[List[Doc], List[Floats2d]]:
55 | """Use a transformer as a "Tok2Vec" layer directly. This does not allow
56 | multiple components to share the transformer weights, and does not allow
57 | the transformer to set annotations into the `Doc` object, but it's a
58 | simpler solution if you only need the transformer within one component.
59 |
60 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
61 | spans from the batch of Doc objects. See the "TransformerModel" layer
62 | for details.
63 | tokenizer_config (dict): Settings to pass to the transformers tokenizer.
64 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
65 | the token vectors based on zero or more wordpiece vectors. If in doubt,
66 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
67 | grad_factor (float): Reweight gradients from the component before passing
68 | them to the transformer. You can set this to 0 to "freeze" the transformer
69 | weights with respect to the component, or to make it learn more slowly.
70 | Leaving it at 1.0 is usually fine.
71 | """
72 | return chain(
73 | TransformerModel(name, get_spans, tokenizer_config),
74 | split_trf_batch(),
75 | trfs2arrays(pooling, grad_factor),
76 | )
77 |
78 |
79 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v2")
80 | def transformer_tok2vec_v2(
81 | name: str,
82 | get_spans,
83 | tokenizer_config: dict,
84 | pooling: Model[Ragged, Floats2d],
85 | grad_factor: float = 1.0,
86 | transformer_config: dict = {},
87 | ) -> Model[List[Doc], List[Floats2d]]:
88 | """Use a transformer as a "Tok2Vec" layer directly. This does not allow
89 | multiple components to share the transformer weights, and does not allow
90 | the transformer to set annotations into the `Doc` object, but it's a
91 | simpler solution if you only need the transformer within one component.
92 |
93 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
94 | spans from the batch of Doc objects. See the "TransformerModel" layer
95 | for details.
96 | tokenizer_config (dict): Settings to pass to the transformers tokenizer.
97 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
98 | the token vectors based on zero or more wordpiece vectors. If in doubt,
99 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
100 | grad_factor (float): Reweight gradients from the component before passing
101 | them to the transformer. You can set this to 0 to "freeze" the transformer
102 | weights with respect to the component, or to make it learn more slowly.
103 | Leaving it at 1.0 is usually fine.
104 |     transformer_config (dict): Settings to pass to the transformer's
105 |         forward pass.
106 | """
107 | return chain(
108 | TransformerModel(name, get_spans, tokenizer_config, transformer_config),
109 | split_trf_batch(),
110 | trfs2arrays(pooling, grad_factor),
111 | )
112 |
113 |
114 | # Note: when updating, also make sure to update 'replace_listener_cfg' in _util.py
115 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v3")
116 | def transformer_tok2vec_v3(
117 | name: str,
118 | get_spans,
119 | tokenizer_config: dict,
120 | pooling: Model[Ragged, Floats2d],
121 | grad_factor: float = 1.0,
122 | transformer_config: dict = {},
123 | mixed_precision: bool = False,
124 | grad_scaler_config: dict = {},
125 | ) -> Model[List[Doc], List[Floats2d]]:
126 | """Use a transformer as a "Tok2Vec" layer directly. This does not allow
127 | multiple components to share the transformer weights, and does not allow
128 | the transformer to set annotations into the `Doc` object, but it's a
129 | simpler solution if you only need the transformer within one component.
130 |
131 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
132 | spans from the batch of Doc objects. See the "TransformerModel" layer
133 | for details.
134 | tokenizer_config (dict): Settings to pass to the transformers tokenizer.
135 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
136 | the token vectors based on zero or more wordpiece vectors. If in doubt,
137 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
138 | grad_factor (float): Reweight gradients from the component before passing
139 | them to the transformer. You can set this to 0 to "freeze" the transformer
140 | weights with respect to the component, or to make it learn more slowly.
141 | Leaving it at 1.0 is usually fine.
142 |     transformer_config (dict): Settings to pass to the transformer's
143 |         forward pass.
144 | mixed_precision (bool): Enable mixed-precision. Mixed-precision replaces
145 |         whitelisted ops with half-precision counterparts. This speeds up training
146 | and prediction on modern GPUs and reduces GPU memory use.
147 | grad_scaler_config (dict): Configuration for gradient scaling in mixed-precision
148 | training. Gradient scaling is enabled automatically when mixed-precision
149 | training is used.
150 |
151 | Setting `enabled` to `False` in the gradient scaling configuration disables
152 | gradient scaling. The `init_scale` (default: `2 ** 16`) determines the
153 | initial scale. `backoff_factor` (default: `0.5`) specifies the factor
154 | by which the scale should be reduced when gradients overflow.
155 | `growth_interval` (default: `2000`) configures the number of steps
156 | without gradient overflows after which the scale should be increased.
157 | Finally, `growth_factor` (default: `2.0`) determines the factor by which
158 | the scale should be increased when no overflows were found for
159 | `growth_interval` steps.
160 | """
161 |     # Note that this is deliberately a chain wrapped in another chain, to match the
162 |     # structure of TransformerListener.v1 after it is run through replace_listener (cf PR #310)
163 | return chain( # type: ignore
164 | chain(
165 | TransformerModel(
166 | name,
167 | get_spans,
168 | tokenizer_config,
169 | transformer_config,
170 | mixed_precision,
171 | grad_scaler_config,
172 | ),
173 | split_trf_batch(),
174 | ),
175 | trfs2arrays(pooling, grad_factor),
176 | )
177 |
178 |
179 | @registry.architectures.register("spacy-transformers.TransformerModel.v1")
180 | def create_TransformerModel_v1(
181 | name: str,
182 | get_spans: Callable,
183 | tokenizer_config: dict = {},
184 | ) -> Model[List[Doc], FullTransformerBatch]:
185 | model = TransformerModel(name, get_spans, tokenizer_config)
186 | return model
187 |
188 |
189 | @registry.architectures.register("spacy-transformers.TransformerModel.v2")
190 | def create_TransformerModel_v2(
191 | name: str,
192 | get_spans: Callable,
193 | tokenizer_config: dict = {},
194 | transformer_config: dict = {},
195 | ) -> Model[List[Doc], FullTransformerBatch]:
196 | model = TransformerModel(name, get_spans, tokenizer_config, transformer_config)
197 | return model
198 |
199 |
200 | @registry.architectures.register("spacy-transformers.TransformerModel.v3")
201 | def create_TransformerModel_v3(
202 | name: str,
203 | get_spans: Callable,
204 | tokenizer_config: dict = {},
205 | transformer_config: dict = {},
206 | mixed_precision: bool = False,
207 | grad_scaler_config: dict = {},
208 | ) -> Model[List[Doc], FullTransformerBatch]:
209 | """Pretrained transformer model that can be finetuned for downstream tasks.
210 |
211 | name (str): Name of the pretrained Huggingface model to use.
212 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
213 | spans from the batch of Doc objects. See the "TransformerModel" layer
214 | for details.
215 | tokenizer_config (dict): Settings to pass to the transformers tokenizer.
216 |     transformer_config (dict): Settings to pass to the transformer's
217 |         forward pass.
218 | mixed_precision (bool): Enable mixed-precision. Mixed-precision replaces
219 |         whitelisted ops with half-precision counterparts. This speeds up training
220 | and prediction on modern GPUs and reduces GPU memory use.
221 | grad_scaler_config (dict): Configuration for gradient scaling in mixed-precision
222 | training. Gradient scaling is enabled automatically when mixed-precision
223 | training is used.
224 |
225 | Setting `enabled` to `False` in the gradient scaling configuration disables
226 | gradient scaling. The `init_scale` (default: `2 ** 16`) determines the
227 | initial scale. `backoff_factor` (default: `0.5`) specifies the factor
228 | by which the scale should be reduced when gradients overflow.
229 | `growth_interval` (default: `2000`) configures the number of steps
230 | without gradient overflows after which the scale should be increased.
231 | Finally, `growth_factor` (default: `2.0`) determines the factor by which
232 | the scale should be increased when no overflows were found for
233 | `growth_interval` steps.
234 | """
235 | model = TransformerModel(
236 | name,
237 | get_spans,
238 | tokenizer_config,
239 | transformer_config,
240 | mixed_precision,
241 | grad_scaler_config,
242 | )
243 | return model
244 |
--------------------------------------------------------------------------------
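The architectures registered above are normally referenced from a training config, but they can also be assembled directly in Python. Below is a minimal sketch (not part of the repo): the whole-doc span getter is a hypothetical stand-in for registered span getters such as "spacy-transformers.strided_spans.v1", and "roberta-base" is only an example model name.

```python
# Minimal sketch, assuming spacy-transformers and thinc are installed.
from thinc.api import reduce_mean
from spacy_transformers.architectures import transformer_tok2vec_v3

def get_whole_doc_spans(docs):
    # Hypothetical span getter: one span covering each full Doc.
    return [[doc[:]] for doc in docs]

tok2vec = transformer_tok2vec_v3(
    name="roberta-base",
    get_spans=get_whole_doc_spans,
    tokenizer_config={"use_fast": True},
    pooling=reduce_mean(),
    grad_factor=1.0,
    mixed_precision=False,
    grad_scaler_config={"init_scale": 2 ** 16},
)
# The Hugging Face weights are only downloaded when the layer is initialized,
# e.g. via tok2vec.initialize() or nlp.initialize() in a full pipeline.
```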
/spacy_transformers/data_classes.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Dict, Any, Union, Tuple, cast
2 | from dataclasses import dataclass, field
3 | import torch
4 | import numpy
5 | from transformers.tokenization_utils import BatchEncoding
6 | from transformers.file_utils import ModelOutput
7 | from transformers.modeling_outputs import BaseModelOutput
8 | from thinc.types import Ragged, Floats2d, Floats3d, FloatsXd, Ints1d, Ints2d
9 | from thinc.api import NumpyOps, get_array_module, xp2torch, torch2xp
10 | from spacy.tokens import Span
11 | import srsly
12 |
13 | from .util import transpose_list
14 | from .align import get_token_positions
15 |
16 |
17 | @dataclass
18 | class WordpieceBatch:
19 | """Holds data from the transformers BatchEncoding class.
20 |
21 | We would have preferred to use the BatchEncoding class directly, but
22 |     there are a few problems with that.
23 |
24 | 1. Some BatchEncoding functionality requires the tokenizers.Encoding object,
25 | and it's impossible for us to create or manipulate that object. This means
26 | we can't really create BatchEncoding objects, which limits what we can do.
27 | 2. We want some semantic differences, for instance the "lengths" data in the
28 | BatchEncoding is useless when the inputs are padded. We want it to tell
29 | us the *unpadded* lengths.
30 | 3. We want typed attributes, so that we can type-check properly.
31 | 4. We prefer to have numpy/cupy arrays rather than torch arrays.
32 | 5. The API around the BatchEncoding object has been changing a lot, so we
33 | want to minimize the places where we touch it.
34 | """
35 |
36 | strings: List[List[str]]
37 | input_ids: Ints2d
38 | attention_mask: Floats2d
39 | lengths: List[int]
40 | token_type_ids: Optional[Ints2d]
41 |
42 | def __len__(self) -> int:
43 | return len(self.strings)
44 |
45 | def __getitem__(self, index) -> "WordpieceBatch":
46 | if isinstance(index, int):
47 | slice_ = slice(index, index + 1)
48 | else:
49 | slice_ = index
50 | return WordpieceBatch(
51 | strings=self.strings[slice_],
52 | input_ids=self.input_ids[slice_],
53 | attention_mask=self.attention_mask[slice_],
54 | lengths=self.lengths[slice_],
55 | token_type_ids=(
56 | self.token_type_ids[slice_] if self.token_type_ids is not None else None
57 | ),
58 | )
59 |
60 | def to_hf_dict(self) -> Dict:
61 | """Return a dict similar to the format produced by the Huggingface
62 | tokenizer, converting arrays to pytorch tensors as well.
63 | """
64 | output = {
65 | "input_ids": xp2torch(self.input_ids),
66 | "attention_mask": xp2torch(self.attention_mask),
67 | "input_texts": self.strings,
68 | }
69 | if self.token_type_ids is not None:
70 | output["token_type_ids"] = xp2torch(self.token_type_ids)
71 | return output
72 |
73 | @classmethod
74 | def empty(cls, *, xp=numpy) -> "WordpieceBatch":
75 | return cls(
76 | strings=[],
77 | input_ids=xp.zeros((0, 0), dtype="i"),
78 | attention_mask=xp.ones((0, 0), dtype="bool"),
79 | lengths=[],
80 | token_type_ids=None,
81 | )
82 |
83 | @classmethod
84 | def zeros(cls, lengths: List[int], xp=numpy) -> "WordpieceBatch":
85 | return cls(
86 | strings=[[""] * length for length in lengths],
87 | input_ids=xp.array([[0] * length for length in lengths], dtype="i"),
88 | attention_mask=xp.ones((len(lengths), max(lengths)), dtype="bool"),
89 | lengths=lengths,
90 | token_type_ids=None,
91 | )
92 |
93 | @classmethod
94 | def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
95 |         assert isinstance(token_data, (BatchEncoding, dict))
96 | pad_token = token_data.get("pad_token", "[PAD]")
97 | lengths = [
98 | len([tok for tok in tokens if tok != pad_token])
99 | for tokens in token_data["input_texts"]
100 | ]
101 |
102 | # The following tensors are intentionally allocated on the CPU to reduce
103 | # host-to-device copies.
104 | numpy_ops = NumpyOps()
105 | input_ids = token_data["input_ids"]
106 | token_type_ids = token_data.get("token_type_ids")
107 |
108 | return cls(
109 | strings=token_data["input_texts"],
110 | input_ids=numpy_ops.asarray(input_ids, dtype=input_ids.dtype),
111 | attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]),
112 | lengths=lengths,
113 | token_type_ids=(
114 | numpy_ops.asarray(token_type_ids, dtype=token_type_ids.dtype)
115 | if token_type_ids is not None
116 | else None
117 | ),
118 | )
119 |
120 | def to_dict(self) -> Dict[str, Any]:
121 | return {
122 | "strings": self.strings,
123 | "input_ids": self.input_ids,
124 | "attention_mask": self.attention_mask,
125 | "lengths": self.lengths,
126 | "token_type_ids": self.token_type_ids,
127 | }
128 |
129 | def from_dict(self, msg: Dict[str, Any]) -> "WordpieceBatch":
130 | self.strings = msg["strings"]
131 | self.input_ids = msg["input_ids"]
132 | self.attention_mask = msg["attention_mask"]
133 | self.lengths = msg["lengths"]
134 | self.token_type_ids = msg["token_type_ids"]
135 | return self
136 |
137 |
138 | @dataclass
139 | class TransformerData:
140 | """Transformer tokens and outputs for one Doc object.
141 |
142 | The transformer models return tensors that refer to a whole padded batch
143 | of documents. These tensors are wrapped into the FullTransformerBatch object.
144 | The FullTransformerBatch then splits out the per-document data, which is
145 | handled by this class. Instances of this class are typically assigned to
146 | the doc._.trf_data extension attribute.
147 |
148 | Attributes
149 | ----------
150 | wordpieces (WordpieceBatch): A slice of the wordpiece token data produced
151 | by the Huggingface tokenizer.
152 | model_output (ModelOutput): The model output from the transformer model,
153 | determined by the model and transformer config.
154 | align (Ragged): Alignment from the Doc's tokenization to the wordpieces.
155 | This is a ragged array, where align.lengths[i] indicates the number of
156 | wordpiece tokens that token i aligns against. The actual indices are
157 | provided at align[i].dataXd.
158 | """
159 |
160 | wordpieces: WordpieceBatch
161 | model_output: ModelOutput
162 | align: Ragged
163 |
164 | @classmethod
165 | def empty(cls) -> "TransformerData":
166 | align = Ragged(
167 | cast(Ints1d, numpy.zeros((0,), dtype="i")),
168 | cast(Ints1d, numpy.zeros((0,), dtype="i")),
169 | )
170 | return cls(
171 | wordpieces=WordpieceBatch.empty(), model_output=ModelOutput(), align=align
172 | )
173 |
174 | @classmethod
175 | def zeros(cls, length: int, width: int, *, xp=numpy) -> "TransformerData":
176 | """Create a valid TransformerData container for a given shape, filled
177 | with zeros."""
178 | return cls(
179 | wordpieces=WordpieceBatch.zeros([length], xp=xp),
180 | model_output=ModelOutput(
181 | last_hidden_state=xp.zeros((1, length, width), dtype="f")
182 | ),
183 | align=Ragged(
184 | cast(Ints1d, numpy.arange(length)),
185 | cast(Ints1d, numpy.ones((length,), dtype="i")),
186 | ),
187 | )
188 |
189 | @property
190 | def tensors(self) -> Tuple[Union[FloatsXd, List[FloatsXd]]]:
191 | return self.model_output.to_tuple()
192 |
193 | @property
194 | def tokens(self) -> Dict[str, Any]:
195 | """Deprecated. A dict with the wordpiece token data."""
196 | return self.wordpieces.to_hf_dict()
197 |
198 | @property
199 | def width(self) -> int:
200 | if "last_hidden_state" in self.model_output:
201 | return cast(BaseModelOutput, self.model_output).last_hidden_state.shape[-1]
202 | else:
203 | raise ValueError("Cannot find last hidden state")
204 |
205 | def to_dict(self) -> Dict[str, Any]:
206 | return {
207 | "wordpieces": self.wordpieces.to_dict(),
208 | "model_output": self.model_output,
209 | "align": [self.align.dataXd, self.align.lengths],
210 | }
211 |
212 | def from_dict(self, msg: Dict[str, Any]) -> "TransformerData":
213 | self.wordpieces = WordpieceBatch.empty().from_dict(msg["wordpieces"])
214 | self.model_output = ModelOutput(msg["model_output"])
215 | self.align = Ragged(*msg["align"])
216 | return self
217 |
218 | def to_bytes(self) -> bytes:
219 | return srsly.msgpack_dumps(self.to_dict())
220 |
221 | def from_bytes(self, byte_string: bytes) -> "TransformerData":
222 | msg = srsly.msgpack_loads(byte_string)
223 | self.from_dict(msg)
224 | return self
225 |
226 |
227 | @srsly.msgpack_encoders("transformerdata")
228 | def serialize_transformer_data(obj, chain=None):
229 | if isinstance(obj, TransformerData):
230 | return {"__transformerdata__": obj.to_dict()}
231 | return obj if chain is None else chain(obj)
232 |
233 |
234 | @srsly.msgpack_decoders("transformerdata")
235 | def deserialize_transformer_data(obj, chain=None):
236 | if "__transformerdata__" in obj:
237 | return TransformerData.empty().from_dict(obj["__transformerdata__"])
238 | return obj if chain is None else chain(obj)
239 |
240 |
241 | @dataclass
242 | class FullTransformerBatch:
243 | """Holds a batch of input and output objects for a transformer model. The
244 | data can then be split to a list of `TransformerData` objects to associate
245 | the outputs to each `Doc` in the batch.
246 |
247 | Attributes
248 | ----------
249 | spans (List[List[Span]]): The batch of input spans. The outer list refers
250 | to the Doc objects in the batch, and the inner list are the spans for
251 | that `Doc`. Note that spans are allowed to overlap or exclude tokens,
252 | but each Span can only refer to one Doc (by definition). This means that
253 | within a Doc, the regions of the output tensors that correspond to each
254 | Span may overlap or have gaps, but for each Doc, there is a non-overlapping
255 | contiguous slice of the outputs.
256 | wordpieces (WordpieceBatch): Token data from the Huggingface tokenizer.
257 | model_output (ModelOutput): The output of the transformer model.
258 | align (Ragged): Alignment from the spaCy tokenization to the wordpieces.
259 | This is a ragged array, where align.lengths[i] indicates the number of
260 | wordpiece tokens that token i aligns against. The actual indices are
261 | provided at align[i].dataXd.
262 | """
263 |
264 | spans: List[List[Span]]
265 | wordpieces: WordpieceBatch
266 | model_output: ModelOutput
267 | align: Ragged
268 | cached_doc_data: Optional[List[TransformerData]] = None
269 |
270 | @classmethod
271 | def empty(cls, nr_docs) -> "FullTransformerBatch":
272 | spans: List[List[Span]] = [[] for _ in range(nr_docs)]
273 | doc_data = [TransformerData.empty() for _ in range(nr_docs)]
274 | align = Ragged(
275 | cast(Ints1d, numpy.zeros((0,), dtype="i")),
276 | cast(Ints1d, numpy.zeros((0,), dtype="i")),
277 | )
278 | return cls(
279 | spans=spans,
280 | wordpieces=WordpieceBatch.empty(),
281 | model_output=ModelOutput(),
282 | align=align,
283 | cached_doc_data=doc_data,
284 | )
285 |
286 | @property
287 | def tensors(self) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
288 | return self.model_output.to_tuple()
289 |
290 | @property
291 | def tokens(self) -> Dict[str, Any]:
292 | """Deprecated. Dict formatted version of the self.wordpieces data,
293 | with values converted to PyTorch tensors.
294 | """
295 | return self.wordpieces.to_hf_dict()
296 |
297 | @property
298 | def doc_data(self) -> List[TransformerData]:
299 | """The outputs, split per spaCy Doc object."""
300 | if self.cached_doc_data is None:
301 | self.cached_doc_data = self.split_by_doc()
302 | return self.cached_doc_data
303 |
304 | def unsplit_by_doc(self, arrays: List[List[Floats3d]]) -> "FullTransformerBatch":
305 | """Return a new FullTransformerBatch from a split batch of activations,
306 | using the current object's spans, wordpieces and alignment.
307 |
308 | This is used during the backward pass, in order to construct the gradients
309 | to pass back into the transformer model.
310 | """
311 | xp = get_array_module(arrays[0][0])
312 | # construct a dummy ModelOutput with the tensor values
313 | model_output = ModelOutput()
314 | for i, x in enumerate(transpose_list(arrays)):
315 | model_output[f"output_{i}"] = xp2torch(xp.vstack(x))
316 | return FullTransformerBatch(
317 | spans=self.spans,
318 | wordpieces=self.wordpieces,
319 | model_output=model_output,
320 | align=self.align,
321 | )
322 |
323 | def split_by_doc(self) -> List[TransformerData]:
324 | """Split a TransformerData that represents a batch into a list with
325 | one TransformerData per Doc.
326 | """
327 | flat_spans = []
328 | for doc_spans in self.spans:
329 | flat_spans.extend(doc_spans)
330 | token_positions = get_token_positions(flat_spans)
331 |
332 | # Convert all outputs to XP arrays.
333 | xp_model_output = ModelOutput()
334 | last_hidden_state = cast(BaseModelOutput, self.model_output).last_hidden_state
335 | for key, output in self.model_output.items():
336 | if isinstance(output, torch.Tensor):
337 | xp_model_output[key] = torch2xp(output)
338 | elif (
339 | isinstance(output, tuple)
340 | and all(isinstance(t, torch.Tensor) for t in output)
341 | and all(t.shape[0] == last_hidden_state.shape[0] for t in output)
342 | ):
343 | xp_model_output[key] = [torch2xp(t) for t in output]
344 |
345 | # Split outputs per Doc.
346 | outputs = []
347 | start = 0
348 | prev_tokens = 0
349 | for doc_spans in self.spans:
350 | if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
351 | outputs.append(TransformerData.empty())
352 | continue
353 | start_i = token_positions[doc_spans[0][0]]
354 | end_i = token_positions[doc_spans[-1][-1]] + 1
355 | end = start + len(doc_spans)
356 | doc_tokens = self.wordpieces[start:end]
357 | doc_align = self.align[start_i:end_i]
358 | doc_align.data = doc_align.data - prev_tokens
359 | model_output = ModelOutput()
360 | for key, output in xp_model_output.items():
361 | # After the torch2xp conversion above, we only have XP arrays
362 | # and lists of XP arrays.
363 | if not isinstance(output, list):
364 | model_output[key] = output[start:end]
365 | else:
366 | model_output[key] = [t[start:end] for t in output]
367 | outputs.append(
368 | TransformerData(
369 | wordpieces=doc_tokens,
370 | model_output=model_output,
371 | align=doc_align,
372 | )
373 | )
374 | prev_tokens += doc_tokens.input_ids.size
375 | start += len(doc_spans)
376 | return outputs
377 |
378 |
379 | @dataclass
380 | class HFObjects:
381 |
382 | tokenizer: Any
383 | transformer: Any
384 | vocab_file_contents: Any
385 | _init_tokenizer_config: Dict[str, Any] = field(default_factory=dict)
386 | _init_transformer_config: Dict[str, Any] = field(default_factory=dict)
387 |
--------------------------------------------------------------------------------
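As a usage note for the data classes above: `doc._.trf_data` holds a `TransformerData` instance per `Doc`. A hedged sketch of reading its alignment follows; it assumes a trained transformer pipeline such as `en_core_web_trf` is installed, which is not part of this repo.

```python
# Hedged sketch: inspect the wordpieces and alignment stored on doc._.trf_data.
import spacy

nlp = spacy.load("en_core_web_trf")  # assumes this pipeline package is installed
doc = nlp("Transformers operate over wordpieces.")
trf_data = doc._.trf_data  # a TransformerData instance

print(trf_data.wordpieces.strings[0])                 # wordpieces for the first span
print(trf_data.model_output.last_hidden_state.shape)  # (spans, seq_len, width)

# align.lengths[i] is how many wordpieces token i maps to; align[i].dataXd holds
# the row indices into the flattened hidden states for that token.
for token in doc:
    n_wp = int(trf_data.align.lengths[token.i])
    print(token.text, n_wp, trf_data.align[token.i].dataXd.ravel())
```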
/spacy_transformers/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .listener import TransformerListener
2 | from .transformer_model import TransformerModel
3 | from .split_trf import split_trf_batch
4 | from .trfs2arrays import trfs2arrays
5 |
6 |
7 | __all__ = ["TransformerListener", "TransformerModel", "split_trf_batch", "trfs2arrays"]
8 |
--------------------------------------------------------------------------------
/spacy_transformers/layers/_util.py:
--------------------------------------------------------------------------------
1 | from thinc.api import chain
2 | from .split_trf import split_trf_batch
3 |
4 |
5 | def replace_listener(model):
6 | return chain(model, split_trf_batch())
7 |
8 |
9 | def replace_listener_cfg(tok2vec_model_cfg, listener_model_cfg):
10 | result = tok2vec_model_cfg.copy()
11 | if (
12 | "TransformerModel" in tok2vec_model_cfg["@architectures"]
13 | and "TransformerListener" in listener_model_cfg["@architectures"]
14 | ):
15 | result["@architectures"] = "spacy-transformers.Tok2VecTransformer.v3"
16 | for key in ["pooling", "grad_factor"]:
17 | if key in listener_model_cfg and key not in result:
18 | result[key] = listener_model_cfg[key]
19 | return result
20 |
--------------------------------------------------------------------------------
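For illustration, a small sketch of what `replace_listener_cfg` does with typical (abbreviated, hypothetical) config dicts:

```python
# Hedged sketch of replace_listener_cfg with abbreviated, hypothetical configs.
from spacy_transformers.layers._util import replace_listener_cfg

tok2vec_cfg = {
    "@architectures": "spacy-transformers.TransformerModel.v3",
    "name": "roberta-base",
}
listener_cfg = {
    "@architectures": "spacy-transformers.TransformerListener.v1",
    "pooling": {"@layers": "reduce_mean.v1"},
    "grad_factor": 1.0,
}
result = replace_listener_cfg(tok2vec_cfg, listener_cfg)
print(result["@architectures"])                 # spacy-transformers.Tok2VecTransformer.v3
print(result["pooling"], result["grad_factor"]) # copied over from the listener config
```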
/spacy_transformers/layers/hf_shim.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | from io import BytesIO
3 | from pathlib import Path
4 | import srsly
5 | import torch
6 | import warnings
7 | from thinc.api import get_torch_default_device
8 | from spacy.util import SimpleFrozenDict
9 |
10 | from ..data_classes import HFObjects
11 | from ..util import make_tempdir
12 |
13 | from thinc.api import PyTorchGradScaler, PyTorchShim
14 |
15 | from transformers import AutoModel, AutoConfig, AutoTokenizer
16 |
17 |
18 | class HFShim(PyTorchShim):
19 | """Interface between a HF Pytorch model and a Thinc Model."""
20 |
21 | def __init__(
22 | self,
23 | model: HFObjects,
24 | config=None,
25 | optimizer: Any = None,
26 | mixed_precision: bool = False,
27 | grad_scaler_config: dict = {},
28 | config_cls=AutoConfig,
29 | model_cls=AutoModel,
30 | tokenizer_cls=AutoTokenizer,
31 | ):
32 | self._hfmodel = model
33 | self.config_cls = config_cls
34 | self.model_cls = model_cls
35 | self.tokenizer_cls = tokenizer_cls
36 |
37 | # Enable gradient scaling when mixed precision is enabled and gradient
38 | # scaling is not explicitly disabled in the configuration.
39 | if "enabled" not in grad_scaler_config:
40 | grad_scaler_config["enabled"] = mixed_precision
41 |
42 | super().__init__(
43 | model.transformer,
44 | config,
45 | optimizer,
46 | mixed_precision,
47 | grad_scaler=PyTorchGradScaler(**grad_scaler_config),
48 | )
49 |
50 | def to_bytes(self):
51 | config = {}
52 | tok_dict = {}
53 | weights_bytes = {}
54 | tok_cfg = {}
55 | trf_cfg = {}
56 | hf_model = self._hfmodel
57 | if hf_model.transformer is not None:
58 | tok_dict = {}
59 | config = hf_model.transformer.config.to_dict()
60 | tokenizer = hf_model.tokenizer
61 | with make_tempdir() as temp_dir:
62 | if hasattr(tokenizer, "vocab_file"):
63 | vocab_file_name = tokenizer.vocab_files_names["vocab_file"]
64 | vocab_file_path = str((temp_dir / vocab_file_name).absolute())
65 | with open(vocab_file_path, "wb") as fileh:
66 | fileh.write(hf_model.vocab_file_contents)
67 | tokenizer.vocab_file = vocab_file_path
68 | tok_dict["kwargs"] = {"use_fast": tokenizer.is_fast}
69 | tokenizer.save_pretrained(str(temp_dir.absolute()))
70 | for x in temp_dir.glob("**/*"):
71 | if x.is_file():
72 | tok_dict[x.name] = x.read_bytes()
73 | filelike = BytesIO()
74 | torch.save(self._model.state_dict(), filelike)
75 | filelike.seek(0)
76 | weights_bytes = filelike.getvalue()
77 | else:
78 | tok_cfg = hf_model._init_tokenizer_config
79 | trf_cfg = hf_model._init_transformer_config
80 | msg = {
81 | "config": config,
82 | "state": weights_bytes,
83 | "tokenizer": tok_dict,
84 | "_init_tokenizer_config": tok_cfg,
85 | "_init_transformer_config": trf_cfg,
86 | }
87 | return srsly.msgpack_dumps(msg)
88 |
89 | def from_bytes(self, bytes_data):
90 | msg = srsly.msgpack_loads(bytes_data)
91 | config_dict = msg["config"]
92 | tok_dict = msg["tokenizer"]
93 | if config_dict:
94 | with make_tempdir() as temp_dir:
95 | config_file = temp_dir / "config.json"
96 | srsly.write_json(config_file, config_dict)
97 | config = self.config_cls.from_pretrained(config_file)
98 | tok_kwargs = tok_dict.pop("kwargs", {})
99 | for x, x_bytes in tok_dict.items():
100 | Path(temp_dir / x).write_bytes(x_bytes)
101 | tokenizer = self.tokenizer_cls.from_pretrained(
102 | str(temp_dir.absolute()), **tok_kwargs
103 | )
104 | vocab_file_contents = None
105 | if hasattr(tokenizer, "vocab_file"):
106 | vocab_file_name = tokenizer.vocab_files_names["vocab_file"]
107 | vocab_file_path = str((temp_dir / vocab_file_name).absolute())
108 | with open(vocab_file_path, "rb") as fileh:
109 | vocab_file_contents = fileh.read()
110 |
111 | transformer = self.model_cls.from_config(config)
112 | self._hfmodel = HFObjects(
113 | tokenizer,
114 | transformer,
115 | vocab_file_contents,
116 | SimpleFrozenDict(),
117 | SimpleFrozenDict(),
118 | )
119 | self._model = transformer
120 | filelike = BytesIO(msg["state"])
121 | filelike.seek(0)
122 | device = get_torch_default_device()
123 | try:
124 | self._model.load_state_dict(torch.load(filelike, map_location=device))
125 | except RuntimeError:
126 | warn_msg = (
127 | "Error loading saved torch state_dict with strict=True, "
128 | "likely due to differences between 'transformers' "
129 | "versions. Attempting to load with strict=False as a "
130 | "fallback...\n\n"
131 | "If you see errors or degraded performance, download a "
132 | "newer compatible model or retrain your custom model with "
133 | "the current 'transformers' and 'spacy-transformers' "
134 | "versions. For more details and available updates, run: "
135 | "python -m spacy validate"
136 | )
137 | warnings.warn(warn_msg)
138 | filelike.seek(0)
139 | b = torch.load(filelike, map_location=device)
140 | self._model.load_state_dict(b, strict=False)
141 | self._model.to(device)
142 | else:
143 | self._hfmodel = HFObjects(
144 | None,
145 | None,
146 | None,
147 | msg["_init_tokenizer_config"],
148 | msg["_init_transformer_config"],
149 | )
150 | return self
151 |
--------------------------------------------------------------------------------
/spacy_transformers/layers/hf_wrapper.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Optional, Any
2 | from thinc.layers.pytorchwrapper import forward as pt_forward
3 | from thinc.layers.pytorchwrapper import convert_pytorch_default_inputs
4 | from thinc.layers.pytorchwrapper import convert_pytorch_default_outputs
5 | from thinc.api import registry, Model
6 |
7 | from transformers import AutoConfig, AutoModel, AutoTokenizer
8 |
9 | from ..data_classes import HFObjects
10 | from .hf_shim import HFShim
11 |
12 |
13 | @registry.layers("HFWrapper.v1")
14 | def HFWrapper(
15 | hf_model: HFObjects,
16 | convert_inputs: Optional[Callable] = None,
17 | convert_outputs: Optional[Callable] = None,
18 | mixed_precision: bool = False,
19 | grad_scaler_config: dict = {},
20 | config_cls=AutoConfig,
21 | model_cls=AutoModel,
22 | tokenizer_cls=AutoTokenizer,
23 | ) -> Model[Any, Any]:
24 | """Wrap a PyTorch HF model, so that it has the same API as Thinc models.
25 | To optimize the model, you'll need to create a PyTorch optimizer and call
26 | optimizer.step() after each batch. See examples/wrap_pytorch.py
27 |
28 | Your PyTorch model's forward method can take arbitrary args and kwargs,
29 | but must return either a single tensor as output or a tuple. You may find the
30 | PyTorch register_forward_hook helpful if you need to adapt the output.
31 |
32 | The convert functions are used to map inputs and outputs to and from your
33 | PyTorch model. Each function should return the converted output, and a callback
34 | to use during the backward pass. So:
35 |
36 | Xtorch, get_dX = convert_inputs(X)
37 | Ytorch, torch_backprop = model.shims[0](Xtorch, is_train)
38 | Y, get_dYtorch = convert_outputs(Ytorch)
39 |
40 | To allow maximum flexibility, the PyTorchShim expects ArgsKwargs objects
41 |     on the way into the forward and backward passes. The ArgsKwargs objects
42 | will be passed straight into the model in the forward pass, and straight
43 | into `torch.autograd.backward` during the backward pass.
44 | """
45 | if convert_inputs is None:
46 | convert_inputs = convert_pytorch_default_inputs
47 | if convert_outputs is None:
48 | convert_outputs = convert_pytorch_default_outputs
49 |
50 | return Model(
51 | "hf-pytorch",
52 | pt_forward,
53 | attrs={"convert_inputs": convert_inputs, "convert_outputs": convert_outputs},
54 | shims=[
55 | HFShim(
56 | hf_model,
57 | mixed_precision=mixed_precision,
58 | grad_scaler_config=grad_scaler_config,
59 | config_cls=config_cls,
60 | model_cls=model_cls,
61 | tokenizer_cls=tokenizer_cls,
62 | )
63 | ],
64 | dims={"nI": None, "nO": None},
65 | )
66 |
--------------------------------------------------------------------------------
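The converter contract described in the `HFWrapper` docstring can be sketched as follows. These converters are illustrative stand-ins, not the ones `TransformerModel` uses (those live in `transformer_model.py`), and the plain input-ids format is an assumption made for the sketch.

```python
# Hedged sketch of the convert_inputs / convert_outputs contract for HFWrapper.
from thinc.api import ArgsKwargs, xp2torch, torch2xp

def convert_inputs(model, input_ids, is_train):
    # Assumption for this sketch: the input is a 2d array of wordpiece ids.
    kwargs = {"input_ids": xp2torch(input_ids)}
    def backprop_inputs(d_inputs):
        return []  # nothing to backprop into the integer inputs
    return ArgsKwargs(args=(), kwargs=kwargs), backprop_inputs

def convert_outputs(model, inputs_outputs, is_train):
    _, model_output = inputs_outputs
    Y = torch2xp(model_output.last_hidden_state)
    def backprop_outputs(dY):
        return ArgsKwargs(
            args=(model_output.last_hidden_state,),
            kwargs={"grad_tensors": [xp2torch(dY)]},
        )
    return Y, backprop_outputs
```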
/spacy_transformers/layers/listener.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Callable, List
2 | from thinc.api import Model
3 | from spacy.errors import Errors
4 | from spacy.tokens import Doc
5 | from ..data_classes import TransformerData
6 |
7 |
8 | class TransformerListener(Model):
9 | """A layer that gets fed its answers from an upstream connection,
10 | for instance from a component earlier in the pipeline.
11 | """
12 |
13 | name = "transformer-listener"
14 |
15 | _batch_id: Optional[int]
16 | _outputs: Optional[List[TransformerData]]
17 | _backprop: Optional[Callable[[List[TransformerData]], List[Doc]]]
18 |
19 | def __init__(self, upstream_name: str):
20 | Model.__init__(self, name=self.name, forward=forward, dims={"nO": None})
21 | self.upstream_name = upstream_name
22 | self._batch_id = None
23 | self._outputs = None
24 | self._backprop = None
25 |
26 | @classmethod
27 | def get_batch_id(cls, inputs: List[Doc]):
28 | return sum(sum(token.orth for token in doc) for doc in inputs)
29 |
30 | def receive(self, batch_id, outputs, backprop):
31 | self._batch_id = batch_id
32 | self._outputs = outputs
33 | self._backprop = backprop
34 |
35 | def backprop_and_clear(self, *args, **kwargs):
36 | """Call the stored _backprop callback, and then
37 |         clear it. This saves memory, as otherwise we hold onto that callback
38 | until the next batch.
39 | """
40 | if self._backprop is not None:
41 | result = self._backprop(*args, **kwargs)
42 | else:
43 | result = None
44 | self._batch_id = None
45 | self._outputs = None
46 | self._backprop = None
47 | return result
48 |
49 | def verify_inputs(self, inputs):
50 | if self._batch_id is None and self._outputs is None:
51 | raise ValueError
52 | else:
53 | batch_id = self.get_batch_id(inputs)
54 | if batch_id != self._batch_id:
55 | raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}")
56 | else:
57 | return True
58 |
59 |
60 | def forward(model: TransformerListener, docs, is_train):
61 | if is_train:
62 | # This might occur during training when the transformer layer is frozen / hasn't been updated.
63 | # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
64 | if model._batch_id is None:
65 | outputs = []
66 | for doc in docs:
67 | if doc._.trf_data is None:
68 | raise ValueError(Errors.E203.format(name="transformer"))
69 | else:
70 | outputs.append(doc._.trf_data)
71 | return outputs, _empty_backprop
72 | else:
73 | model.verify_inputs(docs)
74 | return model._outputs, model.backprop_and_clear
75 | else:
76 | width = model.get_dim("nO")
77 | outputs = []
78 | for doc in docs:
79 | if doc._.trf_data is None:
80 | outputs.append(TransformerData.zeros(len(doc), width, xp=model.ops.xp))
81 | else:
82 | outputs.append(doc._.trf_data)
83 | return outputs, _empty_backprop
84 |
85 |
86 | def _empty_backprop(dX):
87 | return []
88 |
--------------------------------------------------------------------------------
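As a usage sketch, a downstream component is connected to the shared transformer by using the listener architecture as its `tok2vec` sublayer. The example below is hedged: the tagger architecture string comes from spaCy's registry, not this repo, and the pipeline still needs training data before it can be initialized.

```python
# Hedged sketch: wire a tagger to the shared transformer via the listener layer.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("transformer")  # uses the defaults shown in pipeline_component.py
nlp.add_pipe(
    "tagger",
    config={
        "model": {
            "@architectures": "spacy.Tagger.v2",
            "tok2vec": {
                "@architectures": "spacy-transformers.TransformerListener.v1",
                "pooling": {"@layers": "reduce_mean.v1"},
                "grad_factor": 1.0,
                "upstream": "*",
            },
        }
    },
)
# When the pipeline is initialized, the Transformer component finds this listener
# via find_listeners() and feeds it TransformerData batches at runtime.
```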
/spacy_transformers/layers/split_trf.py:
--------------------------------------------------------------------------------
1 | from thinc.api import Model
2 | from typing import List
3 | from ..data_classes import FullTransformerBatch, TransformerData
4 |
5 |
6 | def split_trf_batch() -> Model[FullTransformerBatch, List[TransformerData]]:
7 | return Model("split-trf-batch", forward)
8 |
9 |
10 | def forward(model, trf_full, is_train):
11 | def backprop(d_trf_datas):
12 | return trf_full.unsplit_by_doc([x.tensors for x in d_trf_datas])
13 |
14 | return trf_full.doc_data, backprop
15 |
--------------------------------------------------------------------------------
/spacy_transformers/layers/transformer_model.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Callable, Union, Dict
2 | import copy
3 | from pathlib import Path
4 | from transformers.file_utils import ModelOutput
5 | from transformers import AutoConfig, AutoModel, AutoTokenizer
6 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
7 | from transformers.tokenization_utils import BatchEncoding
8 |
9 | from spacy.tokens import Doc
10 | from thinc.api import Model, get_torch_default_device, xp2torch
11 | from thinc.types import ArgsKwargs
12 |
13 | import logging
14 |
15 | from ..data_classes import FullTransformerBatch, WordpieceBatch, HFObjects
16 | from ..util import maybe_flush_pytorch_cache
17 | from ..util import log_gpu_memory, log_batch_size
18 | from ..layers._util import replace_listener, replace_listener_cfg
19 | from ..truncate import truncate_oversize_splits
20 | from ..align import get_alignment, get_alignment_via_offset_mapping
21 | from .hf_wrapper import HFWrapper
22 |
23 |
24 | class TransformerModel(Model):
25 | def __init__(
26 | self,
27 | name: str,
28 | get_spans: Callable,
29 | tokenizer_config: dict = {},
30 | transformer_config: dict = {},
31 | mixed_precision: bool = False,
32 | grad_scaler_config: dict = {},
33 | ):
34 | """
35 |         get_spans (Callable[[List[Doc]], List[List[Span]]]):
36 | A function to extract spans from the batch of Doc objects.
37 | This is used to manage long documents, by cutting them into smaller
38 | sequences before running the transformer. The spans are allowed to
39 | overlap, and you can also omit sections of the Doc if they are not
40 | relevant.
41 | tokenizer_config (dict): Settings to pass to the transformers tokenizer.
42 | transformer_config (dict): Settings to pass to the transformers forward pass.
43 | """
44 | hf_model = HFObjects(None, None, None, tokenizer_config, transformer_config)
45 | wrapper = HFWrapper(
46 | hf_model,
47 | convert_inputs=_convert_transformer_inputs,
48 | convert_outputs=_convert_transformer_outputs,
49 | mixed_precision=mixed_precision,
50 | grad_scaler_config=grad_scaler_config,
51 | )
52 | super().__init__(
53 | "transformer",
54 | forward,
55 | init=init,
56 | layers=[wrapper],
57 | dims={"nO": None},
58 | attrs={
59 | "get_spans": get_spans,
60 | "name": name,
61 | "set_transformer": set_pytorch_transformer,
62 | "has_transformer": False,
63 | "flush_cache_chance": 0.0,
64 | "replace_listener": replace_listener,
65 | "replace_listener_cfg": replace_listener_cfg,
66 | },
67 | )
68 |
69 | @property
70 | def tokenizer(self):
71 | return self.layers[0].shims[0]._hfmodel.tokenizer
72 |
73 | @property
74 | def transformer(self):
75 | return self.layers[0].shims[0]._hfmodel.transformer
76 |
77 | @property
78 | def _init_tokenizer_config(self):
79 | return self.layers[0].shims[0]._hfmodel._init_tokenizer_config
80 |
81 | @property
82 | def _init_transformer_config(self):
83 | return self.layers[0].shims[0]._hfmodel._init_transformer_config
84 |
85 | def copy(self):
86 | """
87 | Create a copy of the model, its attributes, and its parameters. Any child
88 | layers will also be deep-copied. The copy will receive a distinct `model.id`
89 | value.
90 | """
91 | copied = TransformerModel(self.name, self.attrs["get_spans"])
92 | params = {}
93 | for name in self.param_names:
94 | params[name] = self.get_param(name) if self.has_param(name) else None
95 | copied.params = copy.deepcopy(params)
96 | copied.dims = copy.deepcopy(self._dims)
97 | copied.layers[0] = copy.deepcopy(self.layers[0])
98 | for name in self.grad_names:
99 | copied.set_grad(name, self.get_grad(name).copy())
100 | return copied
101 |
102 |
103 | def set_logger(model, out_file):
104 | """Add a logger that will log memory usage to the given file.
105 |
106 | Used to debug OOM errors.
107 | """
108 | logging.basicConfig(
109 | level="INFO", format="%(asctime)s:%(levelname)s: %(message)s", stream=out_file
110 | )
111 | model.attrs["logger"] = logging.getLogger(__name__)
112 |
113 |
114 | def set_pytorch_transformer(model, hf_model: HFObjects):
115 | if model.attrs["has_transformer"]:
116 | raise ValueError("Cannot set second transformer.")
117 | model.layers[0].shims[0]._model = hf_model.transformer
118 | model.layers[0].shims[0]._hfmodel.tokenizer = hf_model.tokenizer
119 | model.layers[0].shims[0]._hfmodel.transformer = hf_model.transformer
120 | model.layers[0].shims[0]._hfmodel.vocab_file_contents = hf_model.vocab_file_contents
121 | model.attrs["has_transformer"] = True
122 | model.set_dim("nO", hf_model.transformer.config.hidden_size)
123 |
124 |
125 | def init(model: TransformerModel, X=None, Y=None):
126 | if model.attrs["has_transformer"]:
127 | return
128 | name = model.attrs["name"]
129 | tok_cfg = model._init_tokenizer_config
130 | trf_cfg = model._init_transformer_config
131 | hf_model = huggingface_from_pretrained(name, tok_cfg, trf_cfg)
132 | model.attrs["set_transformer"](model, hf_model)
133 | tokenizer = model.tokenizer
134 | # Call the model with a batch of inputs to infer the width
135 | if X:
136 | # If we're dealing with actual texts, do the work to setup the wordpieces
137 | # batch properly
138 | docs = X
139 | get_spans = model.attrs["get_spans"]
140 | nested_spans = get_spans(docs)
141 | flat_spans = []
142 | for doc_spans in nested_spans:
143 | flat_spans.extend(doc_spans)
144 | token_data = huggingface_tokenize(tokenizer, [span.text for span in flat_spans])
145 | wordpieces = WordpieceBatch.from_batch_encoding(token_data)
146 | if "offset_mapping" in token_data:
147 | align = get_alignment_via_offset_mapping(
148 | flat_spans,
149 | token_data["offset_mapping"],
150 | )
151 | else:
152 | align = get_alignment(
153 | flat_spans, wordpieces.strings, tokenizer.all_special_tokens
154 | )
155 | wordpieces, align = truncate_oversize_splits(
156 | wordpieces, align, tokenizer.model_max_length
157 | )
158 | else:
159 | texts = ["hello world", "foo bar"]
160 | token_data = huggingface_tokenize(tokenizer, texts)
161 | wordpieces = WordpieceBatch.from_batch_encoding(token_data)
162 | model.layers[0].initialize(X=wordpieces)
163 | model_output = model.layers[0].predict(wordpieces)
164 | model.set_dim("nO", model_output.last_hidden_state.shape[-1])
165 |
166 |
167 | def forward(
168 | model: TransformerModel, docs: List[Doc], is_train: bool
169 | ) -> Tuple[FullTransformerBatch, Callable]:
170 | tokenizer = model.tokenizer
171 | get_spans = model.attrs["get_spans"]
172 | transformer = model.layers[0]
173 |
174 | nested_spans = get_spans(docs)
175 | flat_spans = []
176 | for doc_spans in nested_spans:
177 | flat_spans.extend(doc_spans)
178 | # Flush the PyTorch cache every so often. It seems to help with memory :(
179 | # This shouldn't be necessary, I'm not sure what I'm doing wrong?
180 | maybe_flush_pytorch_cache(chance=model.attrs.get("flush_cache_chance", 0))
181 | if "logger" in model.attrs:
182 | log_gpu_memory(model.attrs["logger"], "begin forward")
183 | batch_encoding = huggingface_tokenize(tokenizer, [span.text for span in flat_spans])
184 | wordpieces = WordpieceBatch.from_batch_encoding(batch_encoding)
185 | if "logger" in model.attrs:
186 | log_batch_size(model.attrs["logger"], wordpieces, is_train)
187 | if "offset_mapping" in batch_encoding:
188 | align = get_alignment_via_offset_mapping(
189 | flat_spans,
190 | batch_encoding["offset_mapping"],
191 | )
192 | else:
193 | align = get_alignment(
194 | flat_spans, wordpieces.strings, tokenizer.all_special_tokens
195 | )
196 | wordpieces, align = truncate_oversize_splits(
197 | wordpieces, align, tokenizer.model_max_length
198 | )
199 | model_output, bp_tensors = transformer(wordpieces, is_train)
200 | if "logger" in model.attrs:
201 | log_gpu_memory(model.attrs["logger"], "after forward")
202 | output = FullTransformerBatch(
203 | spans=nested_spans,
204 | wordpieces=wordpieces,
205 | model_output=model_output,
206 | align=align,
207 | )
208 | if "logger" in model.attrs:
209 | log_gpu_memory(model.attrs["logger"], "return from forward")
210 |
211 | def backprop_transformer(d_output: FullTransformerBatch) -> List[Doc]:
212 | if "logger" in model.attrs:
213 | log_gpu_memory(model.attrs["logger"], "Begin backprop")
214 | _ = bp_tensors(d_output.model_output)
215 | if "logger" in model.attrs:
216 | log_gpu_memory(model.attrs["logger"], "After backprop")
217 | return docs
218 |
219 | return output, backprop_transformer
220 |
221 |
222 | def _convert_transformer_inputs(model, wps: WordpieceBatch, is_train):
223 | # Adapter for the HFWrapper. See https://thinc.ai/docs/usage-frameworks
224 |
225 | hf_device = model.shims[0]._hfmodel.transformer.device
226 | kwargs = {
227 | "input_ids": xp2torch(wps.input_ids, device=hf_device),
228 | "attention_mask": xp2torch(wps.attention_mask, device=hf_device),
229 | }
230 | if wps.token_type_ids is not None:
231 | kwargs["token_type_ids"] = xp2torch(wps.token_type_ids, device=hf_device)
232 | return ArgsKwargs(args=(), kwargs=kwargs), lambda dX: []
233 |
234 |
235 | def _convert_transformer_outputs(model, inputs_outputs, is_train):
236 | _, model_output = inputs_outputs
237 |
238 | def backprop(d_model_output: ModelOutput) -> ArgsKwargs:
239 | return ArgsKwargs(
240 | args=(model_output.last_hidden_state,),
241 | kwargs={"grad_tensors": d_model_output.values()},
242 | )
243 |
244 | return model_output, backprop
245 |
246 |
247 | def huggingface_from_pretrained(
248 | source: Union[Path, str],
249 | tok_config: Dict,
250 | trf_config: Dict,
251 | config_cls=AutoConfig,
252 | model_cls=AutoModel,
253 | tokenizer_cls=AutoTokenizer,
254 | ) -> HFObjects:
255 | """Create a Huggingface transformer model from pretrained weights. Will
256 | download the model if it is not already downloaded.
257 |
258 | source (Union[str, Path]): The name of the model or a path to it, such as
259 | 'bert-base-cased'.
260 | tok_config (dict): Settings to pass to the tokenizer.
261 | trf_config (dict): Settings to pass to the transformer.
262 | """
263 | if isinstance(source, Path):
264 | str_path = str(source.absolute())
265 | else:
266 | str_path = source
267 | tokenizer = tokenizer_cls.from_pretrained(str_path, **tok_config)
268 | vocab_file_contents = None
269 | if hasattr(tokenizer, "vocab_file"):
270 | with open(tokenizer.vocab_file, "rb") as fileh:
271 | vocab_file_contents = fileh.read()
272 | trf_config["return_dict"] = True
273 | config = config_cls.from_pretrained(str_path, **trf_config)
274 | transformer = model_cls.from_pretrained(str_path, config=config)
275 | torch_device = get_torch_default_device()
276 | transformer.to(torch_device)
277 | return HFObjects(tokenizer, transformer, vocab_file_contents)
278 |
279 |
280 | def huggingface_tokenize(tokenizer, texts: List[str]) -> BatchEncoding:
281 | """Apply a Huggingface tokenizer to a batch of texts."""
282 |
283 | # Use NumPy arrays rather than PyTorch tensors to avoid a lot of
284 | # host <-> device transfers during tokenization and post-processing
285 | # when a GPU is used.
286 | token_data = tokenizer(
287 | texts,
288 | add_special_tokens=True,
289 | return_attention_mask=True,
290 | return_offsets_mapping=isinstance(tokenizer, PreTrainedTokenizerFast),
291 | return_tensors="np",
292 | return_token_type_ids=None, # Sets to model default
293 | padding="longest",
294 | )
295 | token_data["input_texts"] = []
296 | for i in range(len(token_data["input_ids"])):
297 | wp_texts = tokenizer.convert_ids_to_tokens(token_data["input_ids"][i])
298 | token_data["input_texts"].append(wp_texts)
299 | token_data["pad_token"] = tokenizer.pad_token
300 | return token_data
301 |
--------------------------------------------------------------------------------
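The two helpers at the end of this module can also be used on their own. A minimal sketch follows (it downloads the named model on first use; "distilbert-base-uncased" is only an example name):

```python
# Hedged sketch of huggingface_from_pretrained + huggingface_tokenize.
from spacy_transformers.layers.transformer_model import (
    huggingface_from_pretrained,
    huggingface_tokenize,
)
from spacy_transformers.data_classes import WordpieceBatch

hf = huggingface_from_pretrained("distilbert-base-uncased", {"use_fast": True}, {})
token_data = huggingface_tokenize(hf.tokenizer, ["hello world", "foo bar baz"])
wordpieces = WordpieceBatch.from_batch_encoding(token_data)
print(wordpieces.strings[0])                    # wordpiece strings for the first text
print(wordpieces.input_ids.shape, wordpieces.lengths)
```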
/spacy_transformers/layers/trfs2arrays.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, List, Optional, Tuple, cast
2 | import numpy
3 | from spacy.util import all_equal
4 | from transformers.file_utils import ModelOutput
5 | from transformers.modeling_outputs import BaseModelOutput
6 | from thinc.api import Model
7 | from thinc.types import Ragged, Floats2d
8 | from ..data_classes import TransformerData
9 | from ..align import apply_alignment
10 |
11 |
12 | def trfs2arrays(
13 | pooling: Model[Ragged, Floats2d], grad_factor: float
14 | ) -> Model[List[TransformerData], List[Floats2d]]:
15 | """Pool transformer data into token-aligned tensors."""
16 | return Model(
17 | "trfs2arrays",
18 | forward,
19 | layers=[pooling],
20 | attrs={"grad_factor": grad_factor},
21 | )
22 |
23 |
24 | def forward(model: Model, trf_datas: List[TransformerData], is_train: bool):
25 | pooling: Model[Ragged, Floats2d] = model.layers[0]
26 | grad_factor = model.attrs["grad_factor"]
27 | zero_outputs: List[Tuple[int, Floats2d]] = []
28 | backprops_alignment: List[Optional[Callable]] = []
29 | aligned_outputs: List[Tuple[int, Ragged]] = []
30 |
31 | # For zero-length documents, we could cache the output width by iterating
32 | # through the batch outputs and retrieving the shape of a non-zero length
33 | # Doc. This, however, is not fool-proof as one can pass an entire batch of
34 | # zero-length Docs to the transformer model (at least during prediction).
35 | # Instead of being conditionally correct, we'll explicitly leave the width as
36 | # zero in these cases as the effective length of the resultant tensor is zero anyway.
37 | output_width = 0
38 |
39 | for i, trf_data in enumerate(trf_datas):
40 | if not isinstance(trf_data, TransformerData):
41 | raise ValueError(
42 | "Expected spacy_transformers.data_classes.TransformerData "
43 | f"in trf_data, got: {type(trf_data)}\n"
44 | "Check that your pipeline contains a transformer component "
45 | "with a spacy-transformers TransformerModel architecture."
46 | )
47 | if "last_hidden_state" in trf_data.model_output:
48 | tensor_t_i = cast(BaseModelOutput, trf_data.model_output).last_hidden_state
49 | if tensor_t_i.size == 0:
50 | # This can happen during prediction/initialization if the transformer pipe was disabled/not executed and one of the inputs
51 |             # was of length zero. This causes the listener to generate a zero-sized (in the sequence length dim) TransformerData
52 | # output and pass it downstream.
53 | zero_outputs.append((i, model.ops.alloc2f(0, output_width)))
54 | backprops_alignment.append(None)
55 | else:
56 | # This is the general case for non-zero length documents.
57 | src = model.ops.reshape2f(tensor_t_i, -1, trf_data.width) # type: ignore
58 | dst, get_d_src = apply_alignment(model.ops, trf_data.align, src)
59 | aligned_outputs.append((i, dst))
60 | backprops_alignment.append(get_d_src)
61 | else:
62 | # This can happen during prediction/training for zero-length documents. Since zero-length docs
63 | # are implicitly ignored in the span generation stage, the transformer model does not return any
64 | # predictions for them and subsequently, FullTransformerBatch.split_by_doc() generates an empty
65 | # TransformerData.
66 | zero_outputs.append((i, model.ops.alloc2f(0, output_width)))
67 | backprops_alignment.append(None)
68 |
69 | pooling_outputs, backprop_pooling = concat_pooling_forward(
70 | pooling, [dst for _, dst in aligned_outputs], is_train
71 | )
72 |
73 | # Interleave the zero and non-zero outputs into the final result.
74 | outputs: List[Optional[Floats2d]] = [None] * (
75 | len(zero_outputs) + len(aligned_outputs)
76 | )
77 | for i, zero_output in zero_outputs:
78 | outputs[i] = zero_output
79 | for (i, _), pooling_output in zip(aligned_outputs, pooling_outputs):
80 | outputs[i] = pooling_output
81 |
82 | def backprop_trf_to_tensor(d_outputs: List[Floats2d]) -> List[TransformerData]:
83 | d_trf_datas: List[TransformerData] = []
84 |
85 | # Only update the gradients that are relevant for pooling.
86 | d_pooling = backprop_pooling([d_outputs[i] for i, _ in aligned_outputs])
87 | for (i, _), d_pooling_i in zip(aligned_outputs, d_pooling):
88 | d_outputs[i] = d_pooling_i
89 |
90 | to_zip = (trf_datas, d_outputs, backprops_alignment)
91 | assert all_equal(len(x) for x in to_zip) # type: ignore
92 | zipped = zip(*to_zip)
93 | for trf_data, d_output, get_d_src in zipped:
94 | if "last_hidden_state" not in trf_data.model_output:
95 | # This gradient belongs to a zero-length doc and must be ignored as it doesn't have a corresponding
96 | # output from the transformer model (due to empty documents being skipped during the span generation
97 | # stage in the forward pass).
98 | assert len(d_output) == 0
99 | assert get_d_src is None
100 | continue
101 |
102 | assert get_d_src is not None
103 | d_model_output = ModelOutput(
104 | last_hidden_state=model.ops.alloc(
105 | trf_data.model_output.last_hidden_state.shape, # type: ignore
106 | dtype=trf_data.model_output.last_hidden_state.dtype, # type: ignore
107 | )
108 | )
109 | d_src = get_d_src(d_output)
110 | d_src *= grad_factor
111 | d_model_output["last_hidden_state"] = d_src.reshape(
112 | cast(BaseModelOutput, trf_data.model_output).last_hidden_state.shape
113 | )
114 | d_trf_datas.append(
115 | TransformerData(
116 | model_output=d_model_output,
117 | wordpieces=trf_data.wordpieces,
118 | align=trf_data.align,
119 | )
120 | )
121 | return d_trf_datas
122 |
123 | assert len(outputs) == len(trf_datas)
124 | return outputs, backprop_trf_to_tensor
125 |
126 |
127 | def concat_pooling_forward(
128 | pooling: Model[Ragged, Floats2d], X: List[Ragged], is_train: bool
129 | ):
130 | xp = pooling.ops.xp
131 |
132 | datas = []
133 | lens = []
134 | doc_lens = []
135 | for X_doc_data in X:
136 | datas.append(X_doc_data.dataXd)
137 | lens.append(X_doc_data.lengths)
138 | doc_lens.append(len(X_doc_data.lengths))
139 |
140 | X_flat = Ragged(xp.concatenate(datas, axis=0), xp.concatenate(lens, axis=0))
141 | Y_pooled, pooling_backprop = pooling(X_flat, is_train)
142 | Y = xp.split(Y_pooled, numpy.cumsum(doc_lens)[:-1])
143 |
144 | def backprop(dY):
145 | dY_pooled_flat = xp.concatenate(dY)
146 | dY_flat = pooling_backprop(dY_pooled_flat).dataXd
147 |
148 | dY = []
149 | for X_doc_data in X:
150 | doc_unpooled_len = X_doc_data.dataXd.shape[0]
151 | dY.append(Ragged(dY_flat[:doc_unpooled_len], X_doc_data.lengths))
152 | dY_flat = dY_flat[doc_unpooled_len:]
153 |
154 | return dY
155 |
156 | return Y, backprop
157 |
--------------------------------------------------------------------------------
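The per-token pooling that `trfs2arrays` and `concat_pooling_forward` perform can be shown in isolation. A toy sketch with made-up numbers (in the pipeline the `Ragged` comes from `apply_alignment`):

```python
# Toy sketch of per-token pooling over wordpiece rows with thinc's reduce_mean.
import numpy
from thinc.api import reduce_mean
from thinc.types import Ragged

data = numpy.arange(12, dtype="f").reshape(6, 2)  # 6 wordpiece rows, width 2
lengths = numpy.asarray([1, 2, 3], dtype="i")     # 3 tokens with 1, 2 and 3 wordpieces
pooled = reduce_mean().predict(Ragged(data, lengths))
print(pooled.shape)  # (3, 2): one mean vector per spaCy token
```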
/spacy_transformers/pipeline_component.py:
--------------------------------------------------------------------------------
1 | from typing import List, Callable, Iterable, Iterator, Optional, Dict, Union
2 | import warnings
3 | from spacy.language import Language
4 | from spacy.pipeline.trainable_pipe import TrainablePipe
5 | from spacy.pipeline.pipe import deserialize_config
6 | from spacy.tokens import Doc
7 | from spacy.vocab import Vocab
8 | from spacy.training import Example, validate_examples
9 | from spacy import util, Errors
10 | from spacy.util import minibatch
11 | from thinc.api import Model, Config, set_dropout_rate, Optimizer
12 | import srsly
13 | from pathlib import Path
14 |
15 | from .layers.transformer_model import huggingface_from_pretrained
16 | from .util import batch_by_length
17 | from .annotation_setters import null_annotation_setter
18 | from .data_classes import FullTransformerBatch, TransformerData
19 | from .layers import TransformerListener
20 |
21 |
22 | DEFAULT_CONFIG_STR = """
23 | [transformer]
24 | max_batch_items = 4096
25 |
26 | [transformer.set_extra_annotations]
27 | @annotation_setters = "spacy-transformers.null_annotation_setter.v1"
28 |
29 | [transformer.model]
30 | @architectures = "spacy-transformers.TransformerModel.v3"
31 | name = "roberta-base"
32 | tokenizer_config = {"use_fast": true}
33 | transformer_config = {}
34 | mixed_precision = false
35 | grad_scaler_config = {}
36 |
37 | [transformer.model.get_spans]
38 | @span_getters = "spacy-transformers.strided_spans.v1"
39 | window = 128
40 | stride = 96
41 | """
42 |
43 | DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)
44 | DOC_EXT_ATTR = "trf_data"
45 |
46 |
47 | @Language.factory(
48 | "transformer",
49 | assigns=[f"doc._.{DOC_EXT_ATTR}"],
50 | default_config=DEFAULT_CONFIG["transformer"],
51 | )
52 | def make_transformer(
53 | nlp: Language,
54 | name: str,
55 | model: Model[List[Doc], FullTransformerBatch],
56 | set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None],
57 | max_batch_items: int,
58 | ):
59 | """Construct a Transformer component, which lets you plug a model from the
60 | Huggingface transformers library into spaCy so you can use it in your
61 | pipeline. One or more subsequent spaCy components can use the transformer
62 |     outputs as features in their models, with gradients backpropagated to the
63 |     shared weights.
64 |
65 | model (Model[List[Doc], FullTransformerBatch]): A thinc Model object wrapping
66 | the transformer. Usually you will want to use the TransformerModel
67 | layer for this.
68 | set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]): A
69 | callback to set additional information onto the batch of `Doc` objects.
70 | The doc._.trf_data attribute is set prior to calling the callback.
71 | By default, no additional annotations are set.
72 | """
73 | return Transformer(
74 | nlp.vocab,
75 | model,
76 | set_extra_annotations,
77 | max_batch_items=max_batch_items,
78 | name=name,
79 | )
80 |
81 |
82 | def install_extensions() -> None:
83 | if not Doc.has_extension(DOC_EXT_ATTR):
84 | Doc.set_extension(DOC_EXT_ATTR, default=None)
85 |
86 |
87 | class Transformer(TrainablePipe):
88 | """spaCy pipeline component that provides access to a transformer model from
89 | the Huggingface transformers library. Usually you will connect subsequent
90 | components to the shared transformer using the TransformerListener layer.
91 | This works similarly to spaCy's Tok2Vec component and Tok2VecListener
92 | sublayer.
93 |
94 | The activations from the transformer are saved in the doc._.trf_data extension
95 | attribute. You can also provide a callback to set additional annotations.
96 |
97 | vocab (Vocab): The Vocab object for the pipeline.
98 | model (Model[List[Doc], FullTransformerBatch]): A thinc Model object wrapping
99 | the transformer. Usually you will want to use the TransformerModel
100 | layer for this.
101 | set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]): A
102 | callback to set additional information onto the batch of `Doc` objects.
103 | The doc._.trf_data attribute is set prior to calling the callback.
104 | By default, no additional annotations are set.
105 | """
106 |
107 | def __init__(
108 | self,
109 | vocab: Vocab,
110 | model: Model[List[Doc], FullTransformerBatch],
111 | set_extra_annotations: Callable = null_annotation_setter,
112 | *,
113 | name: str = "transformer",
114 | max_batch_items: int = 128 * 32, # Max size of padded batch
115 | ):
116 | """Initialize the transformer component."""
117 | self.name = name
118 | self.vocab = vocab
119 | self.model = model
120 | if not isinstance(self.model, Model):
121 | raise ValueError(f"Expected Thinc Model, got: {type(self.model)}")
122 | self.set_extra_annotations = set_extra_annotations
123 | self.cfg = {"max_batch_items": max_batch_items}
124 | self.listener_map: Dict[str, List[TransformerListener]] = {}
125 | install_extensions()
126 |
127 | @property
128 | def listeners(self) -> List[TransformerListener]:
129 | """RETURNS (List[TransformerListener]): The listener models listening
130 | to this component. Usually internals.
131 | """
132 | return [m for c in self.listening_components for m in self.listener_map[c]]
133 |
134 | @property
135 | def listening_components(self) -> List[str]:
136 | """RETURNS (List[str]): The downstream components listening to this
137 | component. Usually internals.
138 | """
139 | return list(self.listener_map.keys())
140 |
141 | def add_listener(self, listener: TransformerListener, component_name: str) -> None:
142 | """Add a listener for a downstream component. Usually internals."""
143 | self.listener_map.setdefault(component_name, [])
144 | if listener not in self.listener_map[component_name]:
145 | self.listener_map[component_name].append(listener)
146 | if self.model.has_dim("nO") and listener.has_dim("nO") is None:
147 | listener.set_dim("nO", self.model.get_dim("nO"))
148 |
149 | def remove_listener(
150 | self, listener: TransformerListener, component_name: str
151 | ) -> bool:
152 | """Remove a listener for a downstream component. Usually internals."""
153 | if component_name in self.listener_map:
154 | if listener in self.listener_map[component_name]:
155 | self.listener_map[component_name].remove(listener)
156 | # If no listeners are left, remove entry
157 | if not self.listener_map[component_name]:
158 | del self.listener_map[component_name]
159 | return True
160 | return False
161 |
162 | def find_listeners(self, component) -> None:
163 | """Walk over a model of a processing component, looking for layers that
164 | are TransformerListener subclasses that have an upstream_name that
165 | matches this component.
166 | Listeners can also set their upstream_name attribute to the wildcard
167 | string '*' to match any `Transformer`.
168 |
169 | You're unlikely to ever need multiple `Transformer` components, so it's
170 |         fine to leave your listeners' upstream_name set to '*'.
171 | """
172 | names = ("*", self.name)
173 | if isinstance(getattr(component, "model", None), Model):
174 | for node in component.model.walk():
175 | if (
176 | isinstance(node, TransformerListener)
177 | and node.upstream_name in names
178 | ):
179 | self.add_listener(node, component.name)
180 |
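    # Illustrative config sketch: a listener that names a specific upstream
    # transformer component instead of relying on the '*' wildcard. Mirrors the
    # TransformerListener config used in this repo's tests.
    #
    #     [components.tagger.model.tok2vec]
    #     @architectures = "spacy-transformers.TransformerListener.v1"
    #     grad_factor = 1.0
    #     upstream = "custom_transformer"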
181 | def __call__(self, doc: Doc) -> Doc:
182 | """Apply the pipe to one document. The document is modified in place,
183 | and returned. This usually happens under the hood when the nlp object
184 | is called on a text and all components are applied to the Doc.
185 |
186 |         doc (Doc): The Doc to process.
187 | RETURNS (Doc): The processed Doc.
188 |
189 | DOCS: https://spacy.io/api/transformer#call
190 | """
191 | install_extensions()
192 | outputs = self.predict([doc])
193 | self.set_annotations([doc], outputs)
194 | return doc
195 |
196 | def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
197 | """Apply the pipe to a stream of documents. This usually happens under
198 | the hood when the nlp object is called on a text and all components are
199 | applied to the Doc.
200 |
201 | stream (Iterable[Doc]): A stream of documents.
202 | batch_size (int): The number of documents to buffer.
203 | YIELDS (Doc): Processed documents in order.
204 |
205 | DOCS: https://spacy.io/api/transformer#pipe
206 | """
207 | install_extensions()
208 | for outer_batch in minibatch(stream, batch_size):
209 | outer_batch = list(outer_batch)
210 | for indices in batch_by_length(outer_batch, self.cfg["max_batch_items"]):
211 | subbatch = [outer_batch[i] for i in indices]
212 | self.set_annotations(subbatch, self.predict(subbatch))
213 | yield from outer_batch
214 |
215 | def predict(self, docs: Iterable[Doc]) -> FullTransformerBatch:
216 | """Apply the pipeline's model to a batch of docs, without modifying them.
217 | Returns the extracted features as the FullTransformerBatch dataclass.
218 |
219 | docs (Iterable[Doc]): The documents to predict.
220 | RETURNS (FullTransformerBatch): The extracted features.
221 |
222 | DOCS: https://spacy.io/api/transformer#predict
223 | """
224 | docs = list(docs)
225 | if not any(len(doc) for doc in docs):
226 | # Handle cases where there are no tokens in any docs.
227 | activations = FullTransformerBatch.empty(len(docs))
228 | else:
229 | activations = self.model.predict(docs)
230 | return activations
231 |
232 | def set_annotations(
233 | self, docs: Iterable[Doc], predictions: FullTransformerBatch
234 | ) -> None:
235 | """Assign the extracted features to the Doc objects. By default, the
236 | TransformerData object is written to the doc._.trf_data attribute. Your
237 | set_extra_annotations callback is then called, if provided.
238 |
239 | docs (Iterable[Doc]): The documents to modify.
240 |         predictions (FullTransformerBatch): A batch of activations.
241 |
242 | DOCS: https://spacy.io/api/pipe#set_annotations
243 | """
244 | doc_data = list(predictions.doc_data)
245 | for doc, data in zip(docs, doc_data):
246 | doc._.trf_data = data
247 | self.set_extra_annotations(list(docs), predictions)
248 |
249 | def update(
250 | self,
251 | examples: Iterable[Example],
252 | *,
253 | drop: float = 0.0,
254 | sgd: Optional[Optimizer] = None,
255 | losses: Optional[Dict[str, float]] = None,
256 | ) -> Dict[str, float]:
257 | """Prepare for an update to the transformer.
258 |
259 | Like the `Tok2Vec` component, the `Transformer` component is unusual
260 | in that it does not receive "gold standard" annotations to calculate
261 |         a weight update. The optimal output of the transformer is unknown;
262 | it's a hidden layer inside the network that is updated by backpropagating
263 | from output layers.
264 |
265 | The `Transformer` component therefore does not perform a weight update
266 | during its own `update` method. Instead, it runs its transformer model
267 | and communicates the output and the backpropagation callback to any
268 | downstream components that have been connected to it via the
269 | TransformerListener sublayer. If there are multiple listeners, the last
270 | layer will actually backprop to the transformer and call the optimizer,
271 | while the others simply increment the gradients.
272 |
273 | examples (Iterable[Example]):
274 |             A batch of Example objects. Only the `predicted` doc object is used;
275 |             the reference doc is ignored.
276 | drop (float): The dropout rate.
277 | sgd (thinc.api.Optimizer): The optimizer.
278 | losses (Dict[str, float]): Optional record of the loss during training.
279 | Updated using the component name as the key.
280 | RETURNS (Dict[str, float]): The updated losses dictionary.
281 |
282 | DOCS: https://spacy.io/api/transformer#update
283 | """
284 | validate_examples(examples, "Transformer.update")
285 | if losses is None:
286 | losses = {}
287 | docs = [eg.predicted for eg in examples]
288 | if isinstance(docs, Doc):
289 | docs = [docs]
290 | if not any(len(doc) for doc in docs):
291 | # Handle cases where there are no tokens in any docs.
292 | return losses
293 | set_dropout_rate(self.model, drop)
294 | trf_full, bp_trf_full = self.model.begin_update(docs)
295 | d_tensors: List = []
296 | losses.setdefault(self.name, 0.0)
297 |
298 | def accumulate_gradient(d_trf_datas: List[TransformerData]):
299 | """Accumulate tok2vec loss and gradient. This is passed as a callback
300 | to all but the last listener. Only the last one does the backprop.
301 | """
302 | nonlocal d_tensors
303 | for i, d_trf_data in enumerate(d_trf_datas):
304 | for d_tensor in d_trf_data.tensors:
305 | losses[self.name] += float((d_tensor**2).sum()) # type:ignore
306 | if i >= len(d_tensors):
307 | d_tensors.append(list(d_trf_data.tensors))
308 | else:
309 | for j, d_tensor in enumerate(d_trf_data.tensors):
310 | d_tensors[i][j] += d_tensor
311 |
312 | def backprop(d_trf_datas: List[TransformerData]):
313 | """Callback to actually do the backprop. Passed to last listener."""
314 | nonlocal d_tensors
315 | accumulate_gradient(d_trf_datas)
316 | d_trf_full = trf_full.unsplit_by_doc(d_tensors)
317 | d_docs = bp_trf_full(d_trf_full) # type: ignore
318 | if sgd is not None:
319 | self.model.finish_update(sgd)
320 | d_tensors = []
321 | return d_docs
322 |
323 | batch_id = TransformerListener.get_batch_id(docs)
324 | for listener in self.listeners[:-1]:
325 | listener.receive(batch_id, trf_full.doc_data, accumulate_gradient)
326 | if self.listeners:
327 | self.listeners[-1].receive(batch_id, trf_full.doc_data, backprop)
328 | return losses
329 |
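    # Illustrative sketch: how `update` participates in training. The
    # transformer defers its own weight update; gradients arrive later through
    # the listener callbacks registered above, and only the last listener's
    # callback calls `finish_update(sgd)` on the shared model.
    #
    #     losses = {}
    #     transformer.update(examples, sgd=optimizer, losses=losses)
    #     # ... a downstream component (e.g. a tagger) then backprops through
    #     # its TransformerListener, which triggers the transformer update.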
330 | def get_loss(self, docs, golds, scores):
331 | """A noop function, for compatibility with the Pipe API. See the `update`
332 | method for an explanation of the loss mechanics of the component.
333 | """
334 | pass
335 |
336 | def initialize(
337 | self,
338 | get_examples: Callable[[], Iterable[Example]],
339 | *,
340 | nlp: Optional[Language] = None,
341 | ):
342 | """Initialize the pipe for training, using data examples if available.
343 |
344 | get_examples (Callable[[], Iterable[Example]]): Optional function that
345 | returns gold-standard Example objects.
346 | nlp (Language): The current nlp object.
347 |
348 | DOCS: https://spacy.io/api/transformer#initialize
349 | """
350 | docs = [Doc(Vocab(), words=["hello"])]
351 | self.model.initialize(X=docs)
352 | if nlp is not None:
353 | for i, (name1, proc1) in enumerate(nlp.pipeline):
354 | if proc1 is self:
355 | for name2, proc2 in nlp.pipeline[i:]:
356 | self.find_listeners(proc2)
357 | break
358 |
359 | def to_disk(
360 | self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
361 | ) -> None:
362 | """Serialize the pipe to disk.
363 |
364 | path (str / Path): Path to a directory.
365 | exclude (Iterable[str]): String names of serialization fields to exclude.
366 |
367 | DOCS: https://spacy.io/api/transformer#to_disk
368 | """
369 | serialize = {}
370 | serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
371 | serialize["vocab"] = lambda p: self.vocab.to_disk(p)
372 | serialize["model"] = lambda p: self.model.to_disk(p)
373 | util.to_disk(path, serialize, exclude)
374 |
375 | def from_disk(
376 | self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
377 | ) -> "Transformer":
378 | """Load the pipe from disk.
379 |
380 | path (str / Path): Path to a directory.
381 | exclude (Iterable[str]): String names of serialization fields to exclude.
382 | RETURNS (Transformer): The loaded object.
383 |
384 | DOCS: https://spacy.io/api/transformer#from_disk
385 | """
386 |
387 | def load_model(p):
388 | try:
389 | with open(p, "rb") as mfile:
390 | self.model.from_bytes(mfile.read())
391 | except AttributeError:
392 | raise ValueError(Errors.E149) from None
393 | except (IsADirectoryError, PermissionError):
394 | warn_msg = (
395 | "Automatically converting a transformer component "
396 | "from spacy-transformers v1.0 to v1.1+. If you see errors "
397 | "or degraded performance, download a newer compatible "
398 | "model or retrain your custom model with the current "
399 | "spacy-transformers version. For more details and "
400 | "available updates, run: python -m spacy validate"
401 | )
402 | warnings.warn(warn_msg)
403 | p = Path(p).absolute()
404 | hf_model = huggingface_from_pretrained(
405 | p,
406 | self.model._init_tokenizer_config,
407 | self.model._init_transformer_config,
408 | )
409 | self.model.attrs["set_transformer"](self.model, hf_model)
410 |
411 | deserialize = {
412 | "vocab": self.vocab.from_disk,
413 | "cfg": lambda p: self.cfg.update(deserialize_config(p)),
414 | "model": load_model,
415 | }
416 | util.from_disk(path, deserialize, exclude) # type: ignore
417 | return self
418 |
--------------------------------------------------------------------------------
/spacy_transformers/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/py.typed
--------------------------------------------------------------------------------
/spacy_transformers/span_getters.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Iterable, List
2 | from functools import partial
3 | from spacy.tokens import Doc, Span
4 |
5 | from .util import registry
6 |
7 | SpannerT = Callable[[List[Doc]], List[List[Span]]]
8 |
9 |
10 | def get_strided_spans(
11 | docs: Iterable[Doc], window: int, stride: int
12 | ) -> List[List[Span]]:
13 | spans: List[List[Span]] = []
14 | for doc in docs:
15 | start = 0
16 | spans.append([])
17 | for i in range(len(doc) // stride):
18 | spans[-1].append(doc[start : start + window])
19 | if (start + window) >= len(doc):
20 | break
21 | start += stride
22 | else:
23 | if start < len(doc):
24 | spans[-1].append(doc[start:])
25 | return spans
26 |
27 |
28 | @registry.span_getters("spacy-transformers.strided_spans.v1") # type: ignore
29 | def configure_strided_spans(window: int, stride: int) -> SpannerT:
30 | """
31 | Set the 'window' and 'stride' options for getting strided spans.
32 |
33 | If you set the window and stride to the same value, the spans will cover
34 | each token once. Setting 'stride' lower than 'window' will allow for an
35 | overlap, so that some tokens are counted twice. This can be desirable,
36 | because it allows all tokens to have both a left and right context.
37 | """
38 | return partial(get_strided_spans, window=window, stride=stride)
39 |
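# Worked example (illustrative): with window=4 and stride=3, a 10-token Doc is
# split into the overlapping spans doc[0:4], doc[3:7] and doc[6:10], so every
# token has some left and right context. The helper below is hypothetical and
# never called here.
def _example_strided_spans(doc):
    get_spans = configure_strided_spans(window=4, stride=3)
    return get_spans([doc])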
40 |
41 | def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
42 | return [list(doc.sents) for doc in docs]
43 |
44 |
45 | @registry.span_getters("spacy-transformers.sent_spans.v1") # type: ignore
46 | def configure_get_sent_spans() -> Callable:
47 | """
48 | Create a `span_getter` that uses sentence boundary markers to extract
49 | the spans. This requires sentence boundaries to be set, and may result
50 | in somewhat uneven batches, depending on the sentence lengths. However,
51 | it does provide the transformer with more meaningful windows to attend over.
52 | """
53 | return get_sent_spans
54 |
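# Illustrative config sketch: selecting the sentence span getter for the
# transformer component instead of the strided default. This assumes a
# component that sets sentence boundaries runs earlier in the pipeline.
#
#     [components.transformer.model.get_spans]
#     @span_getters = "spacy-transformers.sent_spans.v1"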
55 |
56 | def get_doc_spans(docs: Iterable[Doc]) -> List[List[Span]]:
57 | return [[doc[:]] for doc in docs]
58 |
59 |
60 | @registry.span_getters("spacy-transformers.doc_spans.v1") # type: ignore
61 | def configure_get_doc_spans() -> Callable:
62 | """
63 | Create a `span_getter` that uses the whole document as its spans. This is
64 | the best approach if your `Doc` objects already refer to relatively short
65 | texts.
66 | """
67 | return get_doc_spans
68 |
69 |
70 | __all__ = [
71 | "get_sent_spans",
72 | "get_doc_spans",
73 | "configure_get_doc_spans",
74 | "configure_get_sent_spans",
75 | "configure_strided_spans",
76 | ]
77 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/tests/__init__.py
--------------------------------------------------------------------------------
/spacy_transformers/tests/enable_gpu.py:
--------------------------------------------------------------------------------
1 | from spacy import require_gpu
2 |
3 | require_gpu()
4 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/regression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/tests/regression/__init__.py
--------------------------------------------------------------------------------
/spacy_transformers/tests/regression/test_spacy_issue6401.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | from spacy.training.example import Example
3 | from spacy.util import make_tempdir
4 | from spacy import util
5 | from thinc.api import Config
6 |
7 |
8 | TRAIN_DATA = [
9 | ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
10 | ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
11 | ]
12 |
13 |
14 | cfg_string = """
15 | [nlp]
16 | lang = "en"
17 | pipeline = ["transformer","textcat"]
18 |
19 | [components]
20 |
21 | [components.textcat]
22 | factory = "textcat"
23 |
24 | [components.textcat.model]
25 | @architectures = "spacy.TextCatEnsemble.v2"
26 |
27 | [components.textcat.model.tok2vec]
28 | @architectures = "spacy-transformers.TransformerListener.v1"
29 | grad_factor = 1.0
30 |
31 | [components.textcat.model.tok2vec.pooling]
32 | @layers = "reduce_mean.v1"
33 |
34 | [components.transformer]
35 | factory = "transformer"
36 |
37 | [components.transformer.model]
38 | name = "distilbert-base-uncased"
39 | """
40 |
41 |
42 | def test_transformer_pipeline_textcat():
43 | """Test that a pipeline with just a transformer+textcat runs and trains properly.
44 | This used to throw an error because of shape inference issues -
45 | cf https://github.com/explosion/spaCy/issues/6401"""
46 | orig_config = Config().from_str(cfg_string)
47 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
48 | assert nlp.pipe_names == ["transformer", "textcat"]
49 | train_examples = []
50 |
51 | for text, annotations in TRAIN_DATA:
52 | train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
53 | optimizer = nlp.initialize(get_examples=lambda: train_examples)
54 |
55 | for i in range(2):
56 | losses = {}
57 | nlp.update(train_examples, sgd=optimizer, losses=losses)
58 |
59 | doc = nlp("We're interested at underwater basket weaving.")
60 | cats1 = doc.cats
61 |
62 | # ensure IO goes OK
63 | with make_tempdir() as d:
64 | file_path = d / "trained_nlp"
65 | nlp.to_disk(file_path)
66 | nlp2 = spacy.load(file_path)
67 | doc2 = nlp2("We're interested at underwater basket weaving.")
68 | cats2 = doc2.cats
69 | assert cats1 == cats2
70 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/regression/test_spacy_issue7029.py:
--------------------------------------------------------------------------------
1 | from spacy.lang.en import English
2 | from spacy.training import Example
3 | from spacy.util import load_config_from_str
4 |
5 | CONFIG = """
6 | [nlp]
7 | lang = "en"
8 | pipeline = ["transformer", "tagger"]
9 |
10 | [components]
11 |
12 | [components.transformer]
13 | factory = "transformer"
14 |
15 | [components.transformer.model]
16 | name = "distilbert-base-uncased"
17 |
18 | [components.tagger]
19 | factory = "tagger"
20 |
21 | [components.tagger.model]
22 | @architectures = "spacy.Tagger.v1"
23 | nO = null
24 |
25 | [components.tagger.model.tok2vec]
26 | @architectures = "spacy-transformers.TransformerListener.v1"
27 | grad_factor = 1.0
28 |
29 | [components.tagger.model.tok2vec.pooling]
30 | @layers = "reduce_mean.v1"
31 | """
32 |
33 |
34 | TRAIN_DATA = [
35 | ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
36 | ("", {}),
37 | ("Eat blue ham", {"tags": ["V", "J", "N"]}),
38 | ]
39 |
40 |
41 | def test_empty_doc():
42 | """Test that an empty document gets processed correctly"""
43 | nlp = English.from_config(load_config_from_str(CONFIG))
44 | train_examples = []
45 | for t in TRAIN_DATA:
46 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
47 | optimizer = nlp.initialize(get_examples=lambda: train_examples)
48 | for i in range(2):
49 | losses = {}
50 | nlp.update(train_examples, sgd=optimizer, losses=losses)
51 | texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
52 |
53 | # run as normal
54 | nlp.select_pipes(enable=["transformer", "tagger"])
55 | docs1 = list(nlp.pipe(texts, batch_size=1))
56 | docs2 = list(nlp.pipe(texts, batch_size=4))
57 | assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
58 |
59 | # disable the transformer (the listener will produce random output)
60 | nlp.select_pipes(enable=["tagger"])
61 | docs1 = list(nlp.pipe(texts, batch_size=1))
62 | docs2 = list(nlp.pipe(texts, batch_size=4))
63 | assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
64 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_alignment.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from typing import List
3 | import numpy
4 | from spacy.tokens import Doc
5 | from spacy.vocab import Vocab
6 | from thinc.api import NumpyOps
7 | from thinc.types import Ragged
8 | from ..align import get_alignment, apply_alignment
9 | from ..align import get_span2wp_from_offset_mapping
10 |
11 |
12 | def get_ragged(ops, nested: List[List[int]]):
13 | nested = [ops.asarray(x) for x in nested]
14 | return Ragged(ops.flatten(nested), ops.asarray([len(x) for x in nested]))
15 |
16 |
17 | def get_spans(word_seqs):
18 | vocab = Vocab()
19 | docs = [Doc(vocab, words=words) for words in word_seqs]
20 | return [doc[:] for doc in docs]
21 |
22 |
23 | def flatten_strings(words1, words2):
24 | flat1 = []
25 | flat2 = []
26 | for seq in words1:
27 | flat1.extend(seq)
28 | stride = max((len(seq) for seq in words2), default=0)
29 | for seq in words2:
30 | flat2.extend(seq)
31 | flat2.extend([""] * (stride - len(seq)))
32 | return flat1, flat2
33 |
34 |
35 | @pytest.mark.parametrize(
36 | "words1,words2",
37 | [
38 | ([["a", "b"]], [["a", "b"]]),
39 | ([["ab"]], [["a", "b"]]),
40 | ([["a", "b"]], [["ab"]]),
41 | ([["ab", "c"]], [["a", "bc"]]),
42 | ([["ab", "cd"]], [["a", "bc", "d"]]),
43 | ],
44 | )
45 | def test_alignments_match(words1, words2):
46 | spans = get_spans(words1)
47 | align = get_alignment(spans, words2)
48 | unique_tokens = set()
49 | for span in spans:
50 | for token in span:
51 | unique_tokens.add((id(token.doc), token.idx))
52 | assert len(unique_tokens) == align.lengths.shape[0]
53 | flat_words1, flat_words2 = flatten_strings(words1, words2)
54 | for i, word in enumerate(flat_words1):
55 | wp_word = "".join([flat_words2[int(j[0])] for j in align[i].data])
56 | if len(word) < len(wp_word):
57 | assert word in wp_word
58 | elif len(word) > len(wp_word):
59 | assert wp_word in word
60 | else:
61 | assert word == wp_word
62 |
63 |
64 | @pytest.mark.parametrize(
65 | "nested_align,X_cols",
66 | [
67 | ([[0, 1, 2], [3], [4]], 4),
68 | ([[], [1], [1], [2]], 2),
69 | ([[0, 1], [1, 2], [], [4]], 2),
70 | ],
71 | )
72 | def test_apply_alignment(nested_align, X_cols):
73 | ops = NumpyOps()
74 | align = get_ragged(ops, nested_align)
75 | X_shape = (align.data.max() + 1, X_cols)
76 | X = ops.alloc2f(*X_shape)
77 | Y, get_dX = apply_alignment(ops, align, X)
78 | assert isinstance(Y, Ragged)
79 | assert Y.data.shape[0] == align.data.shape[0]
80 | assert Y.lengths.shape[0] == len(nested_align)
81 | dX = get_dX(Y)
82 | assert dX.shape == X.shape
83 |
84 |
85 | @pytest.mark.parametrize(
86 | # fmt: off
87 | # roberta-base offset_mapping and expected alignment
88 | "words,offset_mapping,alignment",
89 | [
90 | (
91 | ["Áaaa"],
92 | numpy.asarray([(0, 0), (0, 1), (0, 1), (1, 4), (0, 0)], dtype="i"),
93 | [[1, 2, 3]],
94 | ),
95 | (
96 | ["INGG", "á", "aäa"],
97 | numpy.asarray([(0, 0), (0, 3), (3, 4), (5, 6), (5, 6), (7, 8), (8, 9), (9, 10), (0, 0)], dtype="i"),
98 | [[1, 2], [3, 4], [5, 6, 7]],
99 | ),
100 | ],
101 | # fmt: on
102 | )
103 | def test_offset_alignment(words, offset_mapping, alignment):
104 | spans = get_spans([words])
105 | result = get_span2wp_from_offset_mapping(spans[0], offset_mapping)
106 | assert all(sorted(r) == a for r, a in zip(result, alignment))
107 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_configs.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | import pytest
4 | import spacy
5 | from spacy.training import Example
6 | from spacy.training.initialize import init_nlp
7 | from spacy.util import CONFIG_SECTION_ORDER
8 | from spacy.language import DEFAULT_CONFIG
9 | from thinc.config import Config
10 |
11 |
12 | TRAIN_TAGGER_DATA = [
13 | ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
14 | ("Eat blue ham", {"tags": ["V", "J", "N"]}),
15 | ]
16 |
17 |
18 | cfg_string = """
19 | [nlp]
20 | lang = "en"
21 | pipeline = ["custom_transformer","tagger"]
22 |
23 | [components]
24 |
25 | [components.tagger]
26 | factory = "tagger"
27 |
28 | [components.tagger.model]
29 | @architectures = "spacy.Tagger.v1"
30 | nO = null
31 |
32 | [components.tagger.model.tok2vec]
33 | @architectures = "spacy-transformers.TransformerListener.v1"
34 | grad_factor = 1.0
35 | upstream = "custom_transformer"
36 |
37 | [components.tagger.model.tok2vec.pooling]
38 | @layers = "reduce_mean.v1"
39 |
40 | [components.custom_transformer]
41 | factory = "transformer"
42 |
43 | [corpora]
44 | @readers = "toy_tagger_data.v1"
45 |
46 | [initialize]
47 |
48 | [initialize.components]
49 |
50 | [initialize.components.tagger]
51 | labels = ["LABEL"]
52 | """
53 |
54 |
55 | @pytest.mark.parametrize("config_string", [cfg_string])
56 | def test_init_nlp(config_string):
57 | @spacy.registry.readers.register("toy_tagger_data.v1")
58 | def read_tagger_data():
59 | def parse_data(nlp, index):
60 | ex = TRAIN_TAGGER_DATA[index]
61 | yield Example.from_dict(nlp.make_doc(ex[0]), ex[1])
62 |
63 | return {
64 | "train": partial(parse_data, index=0),
65 | "dev": partial(parse_data, index=1),
66 | }
67 |
68 | config = spacy.util.load_config_from_str(config_string, interpolate=False)
69 | config = Config(DEFAULT_CONFIG, section_order=CONFIG_SECTION_ORDER).merge(config)
70 | nlp = init_nlp(config, use_gpu=False)
71 | assert nlp is not None
72 |
73 | tagger = nlp.get_pipe("tagger")
74 | transformer = nlp.get_pipe("custom_transformer")
75 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
76 | assert tagger_trf.upstream_name == "custom_transformer"
77 | assert transformer.listeners[0] == tagger_trf
78 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_data_classes.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy
3 | from numpy.testing import assert_equal
4 | from spacy_transformers.data_classes import WordpieceBatch
5 |
6 |
7 | @pytest.fixture
8 | def wordpieces():
9 | strings = [["some", "random", "strings"], ["are"], ["added", "here"]]
10 | shape = (len(strings), max(len(seq) for seq in strings))
11 | wordpieces = WordpieceBatch(
12 | strings=strings,
13 | input_ids=numpy.zeros(shape, dtype="i"),
14 | token_type_ids=numpy.zeros(shape, dtype="i"),
15 | attention_mask=numpy.zeros((shape[0], shape[1]), dtype="bool"),
16 | lengths=[len(seq) for seq in strings],
17 | )
18 | return wordpieces
19 |
20 |
21 | def test_wordpieces_IO(wordpieces):
22 | wp_dict = wordpieces.to_dict()
23 | wordpieces_2 = WordpieceBatch.empty().from_dict(wp_dict)
24 | for key, value in wordpieces_2.to_dict().items():
25 | assert_equal(value, wp_dict[key])
26 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_deprecations.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from spacy_transformers.util import huggingface_from_pretrained
3 | from spacy_transformers.util import huggingface_tokenize
4 |
5 |
6 | def test_deprecation_warnings():
7 | with pytest.warns(DeprecationWarning):
8 | tokenizer, transformer = huggingface_from_pretrained(
9 | "distilbert-base-uncased", {}
10 | )
11 | with pytest.warns(DeprecationWarning):
12 | token_data = huggingface_tokenize(tokenizer, ["a", "b", "c"])
13 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_model_sequence_classification.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | from functools import partial
3 | import copy
4 |
5 | import torch
6 | from transformers import AutoModelForSequenceClassification
7 | from transformers.models.distilbert.modeling_distilbert import (
8 | DistilBertForSequenceClassification,
9 | )
10 | from transformers.modeling_outputs import SequenceClassifierOutput
11 |
12 | import spacy
13 | from thinc.api import Model
14 |
15 | from spacy_transformers.data_classes import HFObjects, WordpieceBatch
16 | from spacy_transformers.layers.hf_wrapper import HFWrapper
17 | from spacy_transformers.layers.transformer_model import _convert_transformer_inputs
18 | from spacy_transformers.layers.transformer_model import _convert_transformer_outputs
19 | from spacy_transformers.layers.transformer_model import forward
20 | from spacy_transformers.layers.transformer_model import huggingface_from_pretrained
21 | from spacy_transformers.layers.transformer_model import huggingface_tokenize
22 | from spacy_transformers.layers.transformer_model import set_pytorch_transformer
23 | from spacy_transformers.span_getters import get_strided_spans
24 |
25 |
26 | def test_model_for_sequence_classification():
27 | # adapted from https://github.com/KennethEnevoldsen/spacy-wrap/
28 | class ClassificationTransformerModel(Model):
29 | def __init__(
30 | self,
31 | name: str,
32 | get_spans: Callable,
33 | tokenizer_config: dict = {},
34 | transformer_config: dict = {},
35 | mixed_precision: bool = False,
36 | grad_scaler_config: dict = {},
37 | ):
38 | hf_model = HFObjects(None, None, None, tokenizer_config, transformer_config)
39 | wrapper = HFWrapper(
40 | hf_model,
41 | convert_inputs=_convert_transformer_inputs,
42 | convert_outputs=_convert_transformer_outputs,
43 | mixed_precision=mixed_precision,
44 | grad_scaler_config=grad_scaler_config,
45 | model_cls=AutoModelForSequenceClassification,
46 | )
47 | super().__init__(
48 | "clf_transformer",
49 | forward,
50 | init=init,
51 | layers=[wrapper],
52 | dims={"nO": None},
53 | attrs={
54 | "get_spans": get_spans,
55 | "name": name,
56 | "set_transformer": set_pytorch_transformer,
57 | "has_transformer": False,
58 | "flush_cache_chance": 0.0,
59 | },
60 | )
61 |
62 | @property
63 | def tokenizer(self):
64 | return self.layers[0].shims[0]._hfmodel.tokenizer
65 |
66 | @property
67 | def transformer(self):
68 | return self.layers[0].shims[0]._hfmodel.transformer
69 |
70 | @property
71 | def _init_tokenizer_config(self):
72 | return self.layers[0].shims[0]._hfmodel._init_tokenizer_config
73 |
74 | @property
75 | def _init_transformer_config(self):
76 | return self.layers[0].shims[0]._hfmodel._init_transformer_config
77 |
78 | def copy(self):
79 | """
80 | Create a copy of the model, its attributes, and its parameters. Any child
81 | layers will also be deep-copied. The copy will receive a distinct `model.id`
82 | value.
83 | """
84 | copied = ClassificationTransformerModel(self.name, self.attrs["get_spans"])
85 | params = {}
86 | for name in self.param_names:
87 | params[name] = self.get_param(name) if self.has_param(name) else None
88 | copied.params = copy.deepcopy(params)
89 | copied.dims = copy.deepcopy(self._dims)
90 | copied.layers[0] = copy.deepcopy(self.layers[0])
91 | for name in self.grad_names:
92 | copied.set_grad(name, self.get_grad(name).copy())
93 | return copied
94 |
95 | def init(model: ClassificationTransformerModel, X=None, Y=None):
96 | if model.attrs["has_transformer"]:
97 | return
98 | name = model.attrs["name"]
99 | tok_cfg = model._init_tokenizer_config
100 | trf_cfg = model._init_transformer_config
101 | hf_model = huggingface_from_pretrained(
102 | name, tok_cfg, trf_cfg, model_cls=AutoModelForSequenceClassification
103 | )
104 | model.attrs["set_transformer"](model, hf_model)
105 | tokenizer = model.tokenizer
106 | texts = ["hello world", "foo bar"]
107 | token_data = huggingface_tokenize(tokenizer, texts)
108 | wordpieces = WordpieceBatch.from_batch_encoding(token_data)
109 | model.layers[0].initialize(X=wordpieces)
110 |
111 | model = ClassificationTransformerModel(
112 | "sgugger/tiny-distilbert-classification",
113 | get_spans=partial(get_strided_spans, window=128, stride=96),
114 | )
115 | model.initialize()
116 |
117 | assert isinstance(model.transformer, DistilBertForSequenceClassification)
118 | nlp = spacy.blank("en")
119 | doc = nlp.make_doc("some text")
120 | assert isinstance(model.predict([doc]).model_output, SequenceClassifierOutput)
121 |
122 | b = model.to_bytes()
123 | model_re = ClassificationTransformerModel(
124 | "sgugger/tiny-distilbert-classification",
125 | get_spans=partial(get_strided_spans, window=128, stride=96),
126 | ).from_bytes(b)
127 | assert isinstance(model_re.transformer, DistilBertForSequenceClassification)
128 | assert isinstance(model_re.predict([doc]).model_output, SequenceClassifierOutput)
129 | assert torch.equal(
130 | model.predict([doc]).model_output.logits,
131 | model_re.predict([doc]).model_output.logits,
132 | )
133 |     # Note that model.to_bytes() != model_re.to_bytes(), but the same is true
134 |     # for the default models.
135 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_model_wrapper.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import spacy
3 | from thinc.api import Model
4 | from ..layers import TransformerModel
5 | from ..data_classes import FullTransformerBatch
6 | from ..span_getters import get_doc_spans
7 |
8 |
9 | MODEL_NAMES = [
10 | "distilbert-base-uncased",
11 | "hf-internal-testing/tiny-random-gpt2",
12 | "hf-internal-testing/tiny-random-xlnet",
13 | ]
14 |
15 |
16 | @pytest.fixture
17 | def nlp():
18 | return spacy.blank("en")
19 |
20 |
21 | @pytest.fixture
22 | def docs(nlp):
23 | texts = ["the cat sat on the mat.", "hello world."]
24 | return [nlp(text) for text in texts]
25 |
26 |
27 | @pytest.fixture(scope="module", params=MODEL_NAMES)
28 | def name(request):
29 | return request.param
30 |
31 |
32 | @pytest.fixture(scope="module", params=[True, False])
33 | def output_attentions(request):
34 | return request.param
35 |
36 |
37 | @pytest.fixture(scope="module", params=[True, False])
38 | def output_hidden_states(request):
39 | return request.param
40 |
41 |
42 | @pytest.fixture(scope="module")
43 | def trf_model(name, output_attentions, output_hidden_states):
44 | if "gpt2" in name:
45 | model = TransformerModel(
46 | name,
47 | get_doc_spans,
48 | {"use_fast": True, "pad_token": "<|endoftext|>"},
49 | {
50 | "output_attentions": output_attentions,
51 | "output_hidden_states": output_hidden_states,
52 | },
53 | )
54 |
55 | else:
56 | # test slow tokenizers with distilbert-base-uncased (parameterizing
57 | # for all models blows up the memory usage during the test suite)
58 | if name == "distilbert-base-uncased":
59 | use_fast = False
60 | else:
61 | use_fast = True
62 | model = TransformerModel(
63 | name,
64 | get_doc_spans,
65 | {"use_fast": use_fast},
66 | {
67 | "output_attentions": output_attentions,
68 | "output_hidden_states": output_hidden_states,
69 | },
70 | )
71 | model.initialize()
72 | return model
73 |
74 |
75 | def test_model_init(name, trf_model):
76 | assert isinstance(trf_model, Model)
77 | if name == "distilbert-base-uncased":
78 | assert not trf_model.tokenizer.is_fast
79 | else:
80 | assert trf_model.tokenizer.is_fast
81 |
82 |
83 | def test_model_predict(nlp, docs, trf_model):
84 | outputs = trf_model.predict(docs)
85 | shape = outputs.model_output.last_hidden_state.shape
86 | if trf_model.transformer.config.output_attentions is True:
87 | assert outputs.model_output.attentions is not None
88 | assert all([t.shape[0] == shape[0] for t in outputs.model_output.attentions])
89 | else:
90 | assert outputs.model_output.attentions is None
91 | if trf_model.transformer.config.output_hidden_states is True:
92 | assert outputs.model_output.hidden_states is not None
93 | assert all([t.shape[0] == shape[0] for t in outputs.model_output.hidden_states])
94 | else:
95 | assert outputs.model_output.hidden_states is None
96 | assert isinstance(outputs, FullTransformerBatch)
97 |
98 | # for a fast tokenizer check that all non-special wordpieces are aligned
99 | # (which is not necessarily true for the slow tokenizers)
100 | if trf_model.tokenizer.is_fast:
101 | outputs = trf_model.predict([nlp.make_doc("\tÁaaa \n\n")])
102 | aligned_wps = outputs.align.data.flatten()
103 | for i in range(len(outputs.wordpieces.strings[0])):
104 | if (
105 | outputs.wordpieces.strings[0][i]
106 | not in trf_model.tokenizer.all_special_tokens
107 | ):
108 | assert i in aligned_wps
109 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_pipeline_component.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from packaging.version import Version
3 | import torch
4 | import spacy
5 | from spacy.language import Language
6 | from spacy.training.example import Example
7 | from spacy.util import make_tempdir
8 | from spacy.vocab import Vocab
9 | from spacy.tokens import Doc
10 | from spacy import util
11 | from thinc.api import Model, Config, get_current_ops, NumpyOps
12 | from spacy.tests.util import assert_docs_equal
13 |
14 | from .util import DummyTransformer, _assert_equal_tensors
15 | from .. import TransformerModel
16 | from ..pipeline_component import Transformer
17 | from ..layers import TransformerListener
18 | from ..data_classes import TransformerData, FullTransformerBatch
19 |
20 |
21 | torch.set_num_threads(1)
22 |
23 |
24 | @pytest.fixture
25 | def vocab():
26 | return Vocab()
27 |
28 |
29 | @pytest.fixture
30 | def docs(vocab):
31 | return [
32 | Doc(vocab, words=["hello", "world"]),
33 | Doc(vocab, words=["this", "is", "another"]),
34 | ]
35 |
36 |
37 | @pytest.fixture
38 | def component(vocab):
39 | return Transformer(Vocab(), DummyTransformer())
40 |
41 |
42 | @pytest.fixture(scope="module")
43 | def simple_nlp():
44 | nlp = Language()
45 | nlp.add_pipe("transformer")
46 | train_examples = []
47 | for t in TRAIN_DATA:
48 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
49 |
50 | optimizer = nlp.initialize()
51 | for i in range(2):
52 | losses = {}
53 | nlp.update(train_examples, sgd=optimizer, losses=losses)
54 |
55 | return nlp
56 |
57 |
58 | def test_init(component):
59 | assert isinstance(component.vocab, Vocab)
60 | assert isinstance(component.model, Model)
61 | assert hasattr(component.set_extra_annotations, "__call__")
62 | assert component.listeners == []
63 | assert component.cfg == {"max_batch_items": 4096}
64 |
65 |
66 | def test_predict(component, docs):
67 | trf_data = component.predict(docs)
68 | n_tokens = trf_data.wordpieces.input_ids.shape[1]
69 | width = component.model.layers[0].attrs["width"]
70 | assert isinstance(trf_data, FullTransformerBatch)
71 | assert (
72 | len(trf_data.model_output.last_hidden_state)
73 | == component.model.layers[0].attrs["depth"]
74 | )
75 | assert trf_data.model_output.last_hidden_state[0].shape == (
76 | len(docs),
77 | n_tokens,
78 | width,
79 | )
80 |
81 |
82 | def test_set_annotations(component, docs):
83 | trf_data = component.predict(docs)
84 | component.set_annotations(docs, trf_data)
85 | for doc in docs:
86 | assert isinstance(doc._.trf_data, TransformerData)
87 |
88 |
89 | def test_set_extra_annotations(component, docs):
90 | Doc.set_extension("custom_attr", default="")
91 |
92 | def custom_annotation_setter(docs, trf_data):
93 | doc_data = list(trf_data.doc_data)
94 | for doc, data in zip(docs, doc_data):
95 | doc._.custom_attr = data
96 |
97 | component.set_extra_annotations = custom_annotation_setter
98 | trf_data = component.predict(docs)
99 | component.set_annotations(docs, trf_data)
100 | for doc in docs:
101 | assert isinstance(doc._.custom_attr, TransformerData)
102 |
103 |
104 | def test_listeners(component, docs):
105 | docs = list(component.pipe(docs))
106 | for listener in component.listeners:
107 | assert listener.verify_inputs(docs)
108 |
109 |
110 | TRAIN_DATA = [
111 | (
112 | "I like green eggs",
113 | {"tags": ["N", "V", "J", "N"], "sent_starts": [True, False, True, False]},
114 | ),
115 | ("Eat blue ham", {"tags": ["V", "J", "N"], "sent_starts": [True, False, False]}),
116 | ]
117 |
118 |
119 | def test_transformer_pipeline_simple(simple_nlp):
120 | """Test that a simple pipeline with just a transformer at least runs"""
121 | doc = simple_nlp("We're interested at underwater basket weaving.")
122 | assert doc
123 |
124 |
125 | def test_transformer_pipeline_long_token(simple_nlp):
126 | """Test that a simple pipeline does not raise an error on texts that exceeds
127 | the model max length. We should truncate instead.
128 | """
129 | doc = simple_nlp("https://example.com/" + "a/" * 1000)
130 | assert len(doc) == 1
131 |
132 |
133 | cfg_string = """
134 | [nlp]
135 | lang = "en"
136 | pipeline = ["transformer","tagger","senter"]
137 |
138 | [components]
139 |
140 | [components.senter]
141 | factory = "senter"
142 |
143 | [components.senter.model]
144 | @architectures = "spacy.Tagger.v1"
145 | nO = null
146 |
147 | [components.senter.model.tok2vec]
148 | @architectures = "spacy-transformers.TransformerListener.v1"
149 | grad_factor = 1.0
150 | upstream = "transformer"
151 |
152 | [components.senter.model.tok2vec.pooling]
153 | @layers = "reduce_mean.v1"
154 |
155 | [components.tagger]
156 | factory = "tagger"
157 |
158 | [components.tagger.model]
159 | @architectures = "spacy.Tagger.v1"
160 | nO = null
161 |
162 | [components.tagger.model.tok2vec]
163 | @architectures = "spacy-transformers.TransformerListener.v1"
164 | grad_factor = 1.0
165 | upstream = "transformer"
166 |
167 | [components.tagger.model.tok2vec.pooling]
168 | @layers = "reduce_mean.v1"
169 |
170 | [components.transformer]
171 | factory = "transformer"
172 |
173 | [components.transformer.model]
174 | @architectures = "spacy-transformers.TransformerModel.v3"
175 | name = "albert-base-v2"
176 |
177 | [components.transformer.model.transformer_config]
178 | output_attentions = true
179 | """
180 |
181 |
182 | def test_transformer_pipeline_tagger_senter_listener():
183 | """Test that a pipeline with just a transformer+tagger+senter runs and
184 | trains properly"""
185 | orig_config = Config().from_str(cfg_string)
186 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
187 | assert nlp.pipe_names == ["transformer", "tagger", "senter"]
188 | tagger = nlp.get_pipe("tagger")
189 | transformer = nlp.get_pipe("transformer")
190 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
191 | assert isinstance(transformer, Transformer)
192 | assert isinstance(tagger_trf, TransformerListener)
193 | train_examples = []
194 | for t in TRAIN_DATA:
195 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
196 | for tag in t[1]["tags"]:
197 | tagger.add_label(tag)
198 |
199 |     # Check that the Transformer component finds its listeners
200 | optimizer = nlp.initialize(lambda: train_examples)
201 | assert tagger_trf in transformer.listeners
202 |
203 | for i in range(2):
204 | losses = {}
205 | nlp.update(train_examples, sgd=optimizer, losses=losses)
206 |
207 | text = "We're interested at underwater basket weaving."
208 | doc = nlp(text)
209 | doc_tensor = tagger_trf.predict([doc])
210 | _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)
211 |
212 | # ensure IO goes OK
213 | with make_tempdir() as d:
214 | file_path = d / "trained_nlp"
215 | nlp.to_disk(file_path)
216 | nlp2 = util.load_model_from_path(file_path)
217 | doc2 = nlp2(text)
218 | tagger2 = nlp2.get_pipe("tagger")
219 | tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
220 | doc_tensor2 = tagger_trf2.predict([doc2])
221 | _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)
222 |
223 |         # make sure that this can be saved to a directory once more
224 | file_path_2 = d / "trained_nlp_2"
225 | nlp2.to_disk(file_path_2)
226 |
227 | # ensure to_bytes / from_bytes works
228 | nlp_bytes = nlp.to_bytes()
229 | nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
230 | nlp3.from_bytes(nlp_bytes)
231 | doc3 = nlp3(text)
232 | tagger3 = nlp3.get_pipe("tagger")
233 | tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
234 | doc_tensor3 = tagger_trf3.predict([doc3])
235 | _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
236 |
237 |
238 | def test_transformer_sentencepiece_IO():
239 | """Test that a transformer using sentencepiece trains + IO goes OK"""
240 | orig_config = Config().from_str(cfg_string)
241 | orig_config["components"]["transformer"]["model"]["name"] = "hf-internal-testing/tiny-xlm-roberta"
242 | orig_config["components"]["transformer"]["model"]["tokenizer_config"] = {"use_fast": False}
243 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
244 | tagger = nlp.get_pipe("tagger")
245 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
246 | train_examples = []
247 | for t in TRAIN_DATA:
248 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
249 | for tag in t[1]["tags"]:
250 | tagger.add_label(tag)
251 |
252 | optimizer = nlp.initialize(lambda: train_examples)
253 | for i in range(2):
254 | losses = {}
255 | nlp.update(train_examples, sgd=optimizer, losses=losses)
256 |
257 | text = "We're interested at underwater basket weaving."
258 | doc = nlp(text)
259 | doc_tensor = tagger_trf.predict([doc])
260 |
261 | # ensure IO goes OK
262 | with make_tempdir() as d:
263 | file_path = d / "trained_nlp"
264 | nlp.to_disk(file_path)
265 | nlp2 = util.load_model_from_path(file_path)
266 | doc2 = nlp2(text)
267 | tagger2 = nlp2.get_pipe("tagger")
268 | tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
269 | doc_tensor2 = tagger_trf2.predict([doc2])
270 | _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)
271 |
272 |         # make sure that this can be saved to a directory once more
273 | file_path_2 = d / "trained_nlp_2"
274 | nlp2.to_disk(file_path_2)
275 |
276 | # ensure to_bytes / from_bytes works
277 | nlp_bytes = nlp.to_bytes()
278 | nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
279 | nlp3.from_bytes(nlp_bytes)
280 | doc3 = nlp3(text)
281 | tagger3 = nlp3.get_pipe("tagger")
282 | tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
283 | doc_tensor3 = tagger_trf3.predict([doc3])
284 | _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
285 |
286 |
287 | def test_transformer_pipeline_empty():
288 | """Test that the pipeline doesn't fail with empty input"""
289 | orig_config = Config().from_str(cfg_string)
290 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
291 | tagger = nlp.get_pipe("tagger")
292 | train_examples = []
293 | for t in TRAIN_DATA:
294 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
295 | for tag in t[1]["tags"]:
296 | tagger.add_label(tag)
297 |
298 | # train on empty doc
299 | optimizer = nlp.initialize()
300 | losses = {}
301 | empty_train_example = Example.from_dict(nlp.make_doc(""), {})
302 | nlp.update(train_examples, sgd=optimizer, losses=losses)
303 | nlp.update([empty_train_example], sgd=optimizer, losses=losses)
304 | train_examples.append(empty_train_example)
305 | nlp.update(train_examples, sgd=optimizer, losses=losses)
306 | # Interleave an empty doc between non-empty ones
307 | train_examples.insert(1, Example.from_dict(nlp.make_doc(""), {}))
308 | nlp.update(train_examples, sgd=optimizer, losses=losses)
309 |
310 | # predict empty doc
311 | doc = nlp("")
312 | _assert_empty(doc._.trf_data)
313 | docs = nlp.pipe(["", ""])
314 | for doc in docs:
315 | _assert_empty(doc._.trf_data)
316 | nlp.pipe([])
317 |
318 | # predict combination of empty and non-empty
319 | doc = nlp("This is a sentence")
320 | normal_tags = [t.tag_ for t in doc]
321 |
322 | docs = list(nlp.pipe(["", "This is a sentence", "", ""]))
323 | _assert_empty(docs[0]._.trf_data)
324 | assert [t.tag_ for t in docs[0]] == []
325 | assert [t.tag_ for t in docs[1]] == normal_tags
326 | _assert_empty(docs[2]._.trf_data)
327 | _assert_empty(docs[3]._.trf_data)
328 |
329 |
330 | def _assert_empty(trf_data):
331 | assert trf_data.wordpieces.strings == []
332 | assert trf_data.wordpieces.input_ids.size == 0
333 | assert trf_data.wordpieces.attention_mask.size == 0
334 | assert trf_data.tensors == ()
335 | assert len(trf_data.align.data) == 0
336 |
337 |
338 | def test_replace_listeners():
339 | orig_config = Config().from_str(cfg_string)
340 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
341 | text = "This is awesome"
342 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
343 | optimizer = nlp.initialize(lambda: examples)
344 | # verify correct configuration with transformer listener
345 | transformer = nlp.get_pipe("transformer")
346 | tagger = nlp.get_pipe("tagger")
347 | tagger_tok2vec = tagger.model.get_ref("tok2vec")
348 | tagger_listener = tagger_tok2vec.get_ref("listener")
349 | assert isinstance(tagger_listener, TransformerListener)
350 | assert transformer.listener_map["tagger"][0] == tagger_listener
351 | assert isinstance(transformer.model, TransformerModel)
352 | assert (
353 | nlp.config["components"]["transformer"]["model"]["@architectures"]
354 | == "spacy-transformers.TransformerModel.v3"
355 | )
356 | assert (
357 | nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
358 | == "spacy-transformers.TransformerListener.v1"
359 | )
360 | # train pipe before replacing listeners
361 | for i in range(2):
362 | losses = {}
363 | nlp.update(examples, sgd=optimizer, losses=losses)
364 | doc = nlp(text)
365 |
366 | preds = [t.tag_ for t in doc]
367 | doc_tensor = tagger_tok2vec.predict([doc])
368 |
369 | # replace listener and verify predictions are still the same
370 | nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
371 | tagger = nlp.get_pipe("tagger")
372 | tagger_tok2vec = tagger.model.get_ref("tok2vec")
373 | assert isinstance(tagger_tok2vec, Model)
374 | assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
375 | assert (
376 | nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
377 | == "spacy-transformers.Tok2VecTransformer.v3"
378 | )
379 | doc2 = nlp(text)
380 | assert preds == [t.tag_ for t in doc2]
381 | pred_tensor = tagger_tok2vec.predict([doc2])
382 | _assert_equal_tensors(doc_tensor, pred_tensor)
383 |
384 | # attempt training with the new pipeline
385 | optimizer = nlp.resume_training()
386 | for i in range(2):
387 | losses = {}
388 | nlp.update(examples, sgd=optimizer, losses=losses)
389 | assert losses["tagger"] > 0.0
390 |
391 | # check for presence of additional fields in model_output
392 | assert doc2._.trf_data.model_output.pooler_output is not None
393 | assert doc2._.trf_data.model_output.attentions is not None
394 |
395 | # ensure IO goes OK
396 | doc_tensor_trained = tagger_tok2vec.predict([doc])
397 | with make_tempdir() as d:
398 | file_path = d / "trained_nlp"
399 | nlp.to_disk(file_path)
400 | nlp2 = util.load_model_from_path(file_path)
401 | doc3 = nlp2(text)
402 | tagger2 = nlp2.get_pipe("tagger")
403 | tagger_tok2vec2 = tagger2.model.get_ref("tok2vec")
404 | pred_tensor = tagger_tok2vec2.predict([doc3])
405 | _assert_equal_tensors(doc_tensor_trained, pred_tensor)
406 |
407 |
408 | def test_replace_listeners_invalid():
409 | orig_config = Config().from_str(cfg_string)
410 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
411 | text = "This is awesome"
412 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
413 | optimizer = nlp.initialize(lambda: examples)
414 | for i in range(2):
415 | losses = {}
416 | nlp.update(examples, sgd=optimizer, losses=losses)
417 | with pytest.raises(ValueError):
418 | nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"])
419 | with pytest.raises(ValueError):
420 | nlp.replace_listeners("transformer", "parser", ["model.tok2vec"])
421 | with pytest.raises(ValueError):
422 | nlp.replace_listeners("transformer", "tagger", ["model.yolo"])
423 | with pytest.raises(ValueError):
424 | nlp.replace_listeners("transformer", "tagger", ["model.tok2vec", "model.yolo"])
425 |
426 |
427 | @pytest.fixture
428 | def texts():
429 | data = [
430 | "Hello world.",
431 | "This is spacy.",
432 | "You can use multiprocessing with pipe method.",
433 | "Please try!",
434 | ]
435 | return data
436 |
437 |
438 | def test_multiprocessing(simple_nlp, texts):
439 | ops = get_current_ops()
440 | if isinstance(ops, NumpyOps):
441 | texts = texts * 3
442 | expecteds = [simple_nlp(text) for text in texts]
443 | docs = simple_nlp.pipe(texts, n_process=2, batch_size=2)
444 |
445 | for doc, expected_doc in zip(docs, expecteds):
446 | assert_docs_equal(doc, expected_doc)
447 |
448 |
449 | def test_frozen_listener():
450 | orig_config = Config().from_str(cfg_string)
451 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
452 | text = "This is awesome"
453 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
454 | optimizer = nlp.initialize(lambda: examples)
455 | # train pipe before freezing listener
456 | for i in range(2):
457 | losses = {}
458 | nlp.update(examples, sgd=optimizer, losses=losses)
459 | doc = nlp(text)
460 |
461 | transformer_bytes = nlp.get_pipe("transformer").to_bytes()
462 | tagger_bytes = nlp.get_pipe("tagger").to_bytes()
463 |
464 | # train further with frozen listener
465 | for i in range(2):
466 | losses = {}
467 | nlp.update(
468 | examples,
469 | sgd=optimizer,
470 | losses=losses,
471 | exclude=["transformer"],
472 | annotates=["transformer"],
473 | )
474 | doc = nlp(text)
475 |
476 | # only tagger was updated
477 | assert nlp.get_pipe("transformer").to_bytes() == transformer_bytes
478 | assert nlp.get_pipe("tagger").to_bytes() != tagger_bytes
479 |
480 |
481 | def test_no_update_listener_in_predict():
482 | orig_config = Config().from_str(cfg_string)
483 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
484 | listener = nlp.get_pipe("tagger").model.get_ref("tok2vec").get_ref("listener")
485 | transformer = nlp.get_pipe("transformer")
486 |
487 | text = "This is awesome"
488 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
489 | docs = [eg.predicted for eg in examples]
490 | nlp.initialize(lambda: examples)
491 |
492 | transformer.update(examples)
493 | assert listener._backprop is not None
494 |
495 | transformer.predict(docs)
496 | assert listener._backprop is not None
497 |
498 |
499 | @pytest.mark.skipif(
500 | Version(spacy.__version__) < Version("3.5.4"), reason="Bug fixed in spaCy v3.5.4"
501 | )
502 | def test_source_replace_listeners():
503 | """Test that a pipeline with a transformer+tagger+senter and some replaced
504 | listeners runs and trains properly"""
505 | orig_config = """
506 | [nlp]
507 | lang = "en"
508 | pipeline = ["transformer","tagger","senter"]
509 |
510 | [components]
511 |
512 | [components.senter]
513 | factory = "senter"
514 |
515 | [components.senter.model]
516 | @architectures = "spacy.Tagger.v1"
517 | nO = null
518 |
519 | [components.senter.model.tok2vec]
520 | @architectures = "spacy-transformers.TransformerListener.v1"
521 | grad_factor = 1.0
522 | upstream = "transformer"
523 |
524 | [components.senter.model.tok2vec.pooling]
525 | @layers = "reduce_mean.v1"
526 |
527 | [components.tagger]
528 | factory = "tagger"
529 |
530 | [components.tagger.model]
531 | @architectures = "spacy.Tagger.v1"
532 | nO = null
533 |
534 | [components.tagger.model.tok2vec]
535 | @architectures = "spacy-transformers.TransformerListener.v1"
536 | grad_factor = 1.0
537 | upstream = "transformer"
538 |
539 | [components.tagger.model.tok2vec.pooling]
540 | @layers = "reduce_mean.v1"
541 |
542 | [components.transformer]
543 | factory = "transformer"
544 |
545 | [components.transformer.model]
546 | @architectures = "spacy-transformers.TransformerModel.v3"
547 | name = "distilbert-base-uncased"
548 | """
549 | orig_config = Config().from_str(orig_config)
550 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
551 | assert nlp.pipe_names == ["transformer", "tagger", "senter"]
552 | tagger = nlp.get_pipe("tagger")
553 | train_examples = []
554 | for t in TRAIN_DATA:
555 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
556 | for tag in t[1]["tags"]:
557 | tagger.add_label(tag)
558 | optimizer = nlp.initialize(lambda: train_examples)
559 | assert nlp.get_pipe("transformer").listening_components == ["tagger", "senter"]
560 | for i in range(2):
561 | losses = {}
562 | nlp.update(train_examples, sgd=optimizer, losses=losses)
563 |
564 | with make_tempdir() as dir_path:
565 | nlp.to_disk(dir_path)
566 | base_model = str(dir_path)
567 | new_config = {
568 | "nlp": {
569 | "lang": "en",
570 | "pipeline": ["transformer", "tagger", "senter", "ner"],
571 | },
572 | "components": {
573 | "transformer": {"source": base_model},
574 | "tagger": {
575 | "source": base_model,
576 | "replace_listeners": ["model.tok2vec"],
577 | },
578 | "senter": {
579 | "source": base_model,
580 | "replace_listeners": ["model.tok2vec"],
581 | },
582 | "ner": {
583 | "factory": "ner",
584 | "model": {
585 | "@architectures": "spacy.TransitionBasedParser.v2",
586 | "state_type": "ner",
587 | "tok2vec": {
588 | "@architectures": "spacy-transformers.TransformerListener.v1",
589 | "grad_factor": 1.0,
590 | "upstream": "transformer",
591 | "pooling": {"@layers": "reduce_mean.v1"},
592 | },
593 | },
594 | },
595 | },
596 | }
597 | new_nlp = util.load_model_from_config(new_config, auto_fill=True)
598 | for component in ("tagger", "senter"):
599 | assert (
600 | new_nlp.config["components"][component]["model"]["tok2vec"][
601 | "@architectures"
602 | ]
603 | == "spacy-transformers.Tok2VecTransformer.v3"
604 | )
605 | assert new_nlp.get_pipe("transformer").listening_components == ["ner"]
606 |
607 | with make_tempdir() as new_dir_path:
608 | new_nlp.to_disk(new_dir_path)
609 | new_nlp_re = spacy.load(new_dir_path)
610 | for component in ("tagger", "senter"):
611 | assert (
612 | new_nlp.config["components"][component]["model"]["tok2vec"][
613 | "@architectures"
614 | ]
615 | == "spacy-transformers.Tok2VecTransformer.v3"
616 | )
617 | assert new_nlp_re.get_pipe("transformer").listening_components == ["ner"]
618 |
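A hedged usage sketch to complement test_source_replace_listeners above: the same listener replacement can be applied programmatically with nlp.replace_listeners on a trained pipeline. The helper name is illustrative and the snippet reuses cfg_string and TRAIN_DATA from this module; it is an annotation, not part of the test suite.

    from thinc.api import Config
    from spacy import util
    from spacy.training.example import Example

    def replace_listeners_sketch():
        nlp = util.load_model_from_config(
            Config().from_str(cfg_string), auto_fill=True, validate=True
        )
        tagger = nlp.get_pipe("tagger")
        examples = []
        for text, annots in TRAIN_DATA:
            examples.append(Example.from_dict(nlp.make_doc(text), annots))
            for tag in annots["tags"]:
                tagger.add_label(tag)
        nlp.initialize(lambda: examples)
        # Swap the tagger's TransformerListener for an inline copy of the transformer.
        nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
        # The shared transformer should no longer report the tagger as a listener.
        print(nlp.get_pipe("transformer").listening_components)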
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_serialize.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import copy
3 | import spacy
4 | from spacy import Language
5 | from spacy.lang.en import English
6 | from spacy.tests.util import assert_docs_equal
7 | from spacy.tokens import Doc
8 | from spacy.util import make_tempdir
9 | from spacy import util
10 | import srsly
11 | from thinc.api import Config, get_current_ops
12 | from numpy.testing import assert_array_equal
13 |
14 | from .. import TransformerData
15 |
16 |
17 | DEFAULT_CONFIG = {
18 | "model": {
19 | "@architectures": "spacy-transformers.TransformerModel.v3",
20 | "name": "hf-internal-testing/tiny-random-DistilBertModel",
21 | "tokenizer_config": {"use_fast": False},
22 | }
23 | }
24 |
25 |
26 | def test_serialize_transformer_data():
27 | data = {"x": TransformerData.empty()}
28 | bytes_data = srsly.msgpack_dumps(data)
29 | new_data = srsly.msgpack_loads(bytes_data)
30 | assert isinstance(new_data["x"], TransformerData)
31 |
32 | nlp = Language()
33 | nlp.add_pipe(
34 | "transformer",
35 | config={
36 | "model": {
37 | "name": "hf-internal-testing/tiny-random-DistilBertModel",
38 | "transformer_config": {"output_attentions": True},
39 | }
40 | },
41 | )
42 | nlp.initialize()
43 | doc = nlp("This is a test.")
44 | b = doc.to_bytes()
45 | reloaded_doc = Doc(nlp.vocab)
46 | reloaded_doc.from_bytes(b)
47 | assert_docs_equal(doc, reloaded_doc)
48 | ops = get_current_ops()
49 | for key in doc._.trf_data.model_output:
50 | assert_array_equal(
51 | ops.to_numpy(ops.asarray(doc._.trf_data.model_output[key])),
52 | ops.to_numpy(ops.asarray(reloaded_doc._.trf_data.model_output[key])),
53 | )
54 |
55 |
56 | def test_transformer_tobytes():
57 | nlp = Language()
58 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
59 | trf_bytes = trf.to_bytes()
60 |
61 | nlp2 = Language()
62 | trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
63 | trf2.from_bytes(trf_bytes)
64 |
65 |
66 | def test_initialized_transformer_tobytes():
67 | nlp = Language()
68 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
69 | nlp.initialize()
70 | trf_bytes = trf.to_bytes()
71 |
72 | nlp2 = Language()
73 | trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
74 | trf2.from_bytes(trf_bytes)
75 |
76 | assert trf2.model.tokenizer.is_fast is False
77 |
78 |
79 | def test_initialized_transformer_todisk():
80 | nlp = Language()
81 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
82 | nlp.initialize()
83 | with make_tempdir() as d:
84 | trf.to_disk(d)
85 | nlp2 = Language()
86 | trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
87 | trf2.from_disk(d)
88 |
89 | assert trf2.model.tokenizer.is_fast is False
90 |
91 | fast_config = copy.deepcopy(DEFAULT_CONFIG)
92 | fast_config["model"]["tokenizer_config"]["use_fast"] = True
93 | nlp = Language()
94 | trf = nlp.add_pipe("transformer", config=fast_config)
95 | nlp.initialize()
96 | with make_tempdir() as d:
97 | trf.to_disk(d)
98 | nlp2 = Language()
99 | trf2 = nlp2.add_pipe("transformer", config=fast_config)
100 | trf2.from_disk(d)
101 |
102 | assert trf2.model.tokenizer.is_fast is True
103 |
104 |
105 | def test_transformer_pipeline_tobytes():
106 | nlp = Language()
107 | nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
108 | nlp.initialize()
109 | assert nlp.pipe_names == ["transformer"]
110 | nlp_bytes = nlp.to_bytes()
111 |
112 | nlp2 = Language()
113 | nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
114 | nlp2.from_bytes(nlp_bytes)
115 | assert nlp2.pipe_names == ["transformer"]
116 |
117 |
118 | def test_transformer_pipeline_todisk():
119 | nlp = English()
120 | nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
121 | nlp.initialize()
122 | with make_tempdir() as d:
123 | nlp.to_disk(d)
124 | nlp2 = spacy.load(d)
125 | assert nlp2.pipe_names == ["transformer"]
126 |
127 |
128 | def test_transformer_pipeline_todisk_settings():
129 | nlp = English()
130 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
131 | nlp.initialize()
132 | # initially no attentions
133 | assert trf.model.tokenizer.model_max_length == 512
134 | assert trf.model.transformer.config.output_attentions is False
135 | assert "attentions" not in nlp("test")._.trf_data.model_output
136 | # modify model_max_length (note that modifications to
137 | # tokenizer.model_max_length for transformers<4.25 are not serialized by
138 | # save_pretrained, see: https://github.com/explosion/spaCy/discussions/7393)
139 | trf.model.tokenizer.init_kwargs["model_max_length"] = 499
140 | # for transformers>=4.25, model_max_length is saved and init_kwargs changes are
141 | # clobbered, so set both for this test
142 | trf.model.tokenizer.model_max_length = 499
143 | # add attentions on-the-fly
144 | trf.model.transformer.config.output_attentions = True
145 | assert nlp("test")._.trf_data.model_output.attentions is not None
146 | with make_tempdir() as d:
147 | nlp.to_disk(d)
148 | nlp2 = spacy.load(d)
149 | assert nlp2.pipe_names == ["transformer"]
150 | trf2 = nlp2.get_pipe("transformer")
151 | # model_max_length is preserved
152 | assert trf2.model.tokenizer.model_max_length == 499
153 | # output_attentions setting is preserved
154 | assert trf2.model.transformer.config.output_attentions is True
155 | assert nlp2("test")._.trf_data.model_output.attentions is not None
156 | # the init configs are empty SimpleFrozenDicts
157 | assert trf2.model._init_tokenizer_config == {}
158 | with pytest.raises(NotImplementedError):
159 | trf2.model._init_tokenizer_config["use_fast"] = False
160 |
161 |
162 | def test_transformer_pipeline_todisk_before_initialize():
163 | nlp = English()
164 | nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
165 | with make_tempdir() as d:
166 | # serialize before initialization
167 | nlp.to_disk(d)
168 | nlp2 = spacy.load(d)
169 | nlp2.initialize()
170 | assert "last_hidden_state" in nlp2("test")._.trf_data.model_output
171 |
172 |
173 | inline_cfg_string = """
174 | [nlp]
175 | lang = "en"
176 | pipeline = ["tagger"]
177 |
178 | [components]
179 |
180 | [components.tagger]
181 | factory = "tagger"
182 |
183 | [components.tagger.model]
184 | @architectures = "spacy.Tagger.v1"
185 | nO = null
186 |
187 | [components.tagger.model.tok2vec]
188 | @architectures = "spacy-transformers.Tok2VecTransformer.v3"
189 | name = "hf-internal-testing/tiny-random-DistilBertModel"
190 | tokenizer_config = {"use_fast": true}
191 | transformer_config = {"output_attentions": false}
192 | grad_factor = 1.0
193 |
194 | [components.tagger.model.tok2vec.get_spans]
195 | @span_getters = "spacy-transformers.strided_spans.v1"
196 | window = 256
197 | stride = 256
198 |
199 | [components.tagger.model.tok2vec.pooling]
200 | @layers = "reduce_mean.v1"
201 | """
202 |
203 |
204 | def test_inline_transformer_tobytes():
205 | orig_config = Config().from_str(inline_cfg_string)
206 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
207 | tagger = nlp.get_pipe("tagger")
208 | tagger_bytes = tagger.to_bytes()
209 |
210 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
211 | tagger2 = nlp2.get_pipe("tagger")
212 | tagger2.from_bytes(tagger_bytes)
213 |
214 |
215 | def test_initialized_inline_transformer_tobytes():
216 | orig_config = Config().from_str(inline_cfg_string)
217 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
218 | assert nlp.pipe_names == ["tagger"]
219 | tagger = nlp.get_pipe("tagger")
220 | tagger.add_label("V")
221 | nlp.initialize()
222 | tagger_bytes = tagger.to_bytes()
223 |
224 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
225 | tagger2 = nlp2.get_pipe("tagger")
226 | tagger2.from_bytes(tagger_bytes)
227 | assert list(tagger2.labels) == ["V"]
228 |
229 |
230 | def test_inline_transformer_todisk():
231 | orig_config = Config().from_str(inline_cfg_string)
232 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
233 | assert nlp.pipe_names == ["tagger"]
234 | tagger = nlp.get_pipe("tagger")
235 | tagger.add_label("V")
236 | with make_tempdir() as d:
237 | tagger.to_disk(d)
238 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
239 | tagger2 = nlp2.get_pipe("tagger")
240 | tagger2.from_disk(d)
241 | assert list(tagger2.labels) == ["V"]
242 |
243 |
244 | def test_initialized_inline_transformer_todisk():
245 | orig_config = Config().from_str(inline_cfg_string)
246 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
247 | assert nlp.pipe_names == ["tagger"]
248 | tagger = nlp.get_pipe("tagger")
249 | tagger.add_label("V")
250 | nlp.initialize()
251 | with make_tempdir() as d:
252 | tagger.to_disk(d)
253 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
254 | tagger2 = nlp2.get_pipe("tagger")
255 | tagger2.from_disk(d)
256 | assert list(tagger2.labels) == ["V"]
257 |
258 |
259 | def test_inline_transformer_pipeline_tobytes():
260 | orig_config = Config().from_str(inline_cfg_string)
261 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
262 | assert nlp.pipe_names == ["tagger"]
263 | tagger = nlp.get_pipe("tagger")
264 | tagger.add_label("V")
265 | nlp.initialize()
266 | nlp_bytes = nlp.to_bytes()
267 |
268 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
269 | nlp2.from_bytes(nlp_bytes)
270 | assert nlp2.pipe_names == ["tagger"]
271 |
272 |
273 | def test_inline_transformer_pipeline_todisk():
274 | orig_config = Config().from_str(inline_cfg_string)
275 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
276 | assert nlp.pipe_names == ["tagger"]
277 | with make_tempdir() as d:
278 | nlp.to_disk(d)
279 | nlp2 = spacy.load(d)
280 | assert nlp2.pipe_names == ["tagger"]
281 |
282 |
283 | def test_initialized_inline_transformer_pipeline_todisk():
284 | orig_config = Config().from_str(inline_cfg_string)
285 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
286 | assert nlp.pipe_names == ["tagger"]
287 | tagger = nlp.get_pipe("tagger")
288 | tagger.add_label("V")
289 | nlp.initialize()
290 | with make_tempdir() as d:
291 | nlp.to_disk(d)
292 | nlp2 = spacy.load(d)
293 | assert nlp2.pipe_names == ["tagger"]
294 | tagger2 = nlp2.get_pipe("tagger")
295 | assert list(tagger2.labels) == ["V"]
296 |
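The tests above all exercise the same round trip: build a pipeline with a transformer component, initialize it, write it out, and load it back. A minimal standalone sketch of that pattern (the on-disk path is illustrative; the tiny HF model name is reused from DEFAULT_CONFIG above):

    import spacy
    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(
        "transformer",
        config={"model": {"name": "hf-internal-testing/tiny-random-DistilBertModel"}},
    )
    nlp.initialize()
    nlp.to_disk("/tmp/trf_pipeline")  # illustrative path
    nlp2 = spacy.load("/tmp/trf_pipeline")
    assert nlp2.pipe_names == ["transformer"]
    assert "last_hidden_state" in nlp2("reloaded pipelines still set trf_data")._.trf_data.model_output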
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_spanners.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from spacy.lang.en import English
3 |
4 | from ..span_getters import configure_strided_spans, configure_get_sent_spans
5 |
6 |
7 | @pytest.mark.parametrize(
8 | "window,stride,docs,result",
9 | [
10 | (4, 3, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "89a"]]),
11 | (4, 4, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "9a"]]),
12 | (4, 2, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "789a"]]),
13 | ],
14 | )
15 | def test_get_strided_spans(window, stride, docs, result):
16 | get_strided = configure_strided_spans(window, stride)
17 | spans = get_strided(docs)
18 | assert spans == result
19 |
20 |
21 | def test_get_sent_spans():
22 | nlp = English()
23 | nlp.add_pipe("sentencizer")
24 | doc = nlp("One. One more. Three sentences in total.")
25 | assert len(list(doc.sents)) == 3
26 | get_sent_spans = configure_get_sent_spans()
27 | spans = get_sent_spans([doc])[0]
28 | assert len(spans) == 3
29 | assert spans[0].text == "One."
30 | assert spans[1].text == "One more."
31 | assert spans[2].text == "Three sentences in total."
32 |
33 |
34 | def test_get_custom_spans():
35 | def configure_custom_sent_spans(max_length: int):
36 | def get_custom_sent_spans(docs):
37 | spans = []
38 | for doc in docs:
39 | spans.append([])
40 | for sent in doc.sents:
41 | start = 0
42 | end = max_length
43 | while end <= len(sent):
44 | spans[-1].append(sent[start:end])
45 | start += max_length
46 | end += max_length
47 | if start < len(sent):
48 | spans[-1].append(sent[start : len(sent)])
49 | return spans
50 |
51 | return get_custom_sent_spans
52 |
53 | nlp = English()
54 | nlp.add_pipe("sentencizer")
55 | doc = nlp(
56 | "One. And one more. So that makes three sentences and this one is a bit longer."
57 | )
58 | assert len(list(doc.sents)) == 3
59 | get_sent_spans = configure_custom_sent_spans(max_length=4)
60 | spans = get_sent_spans([doc])[0]
61 | assert len(spans) == 6
62 | assert spans[0].text == "One."
63 | assert spans[1].text == "And one more."
64 | assert spans[2].text == "So that makes three"
65 | assert spans[3].text == "sentences and this one"
66 | assert spans[4].text == "is a bit longer"
67 | assert spans[5].text == "."
68 |
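To use a custom span getter like the one in test_get_custom_spans from a training config, it has to be registered in the span_getters registry (created in spacy_transformers/util.py). A hedged sketch; the registry name "custom_sent_spans.v1" and the function names are illustrative:

    import spacy_transformers  # noqa: F401  (ensures registry.span_getters exists)
    from spacy.util import registry

    @registry.span_getters("custom_sent_spans.v1")
    def configure_chunked_sent_spans(max_length: int = 4):
        def get_chunked_sent_spans(docs):
            spans = []
            for doc in docs:
                doc_spans = []
                for sent in doc.sents:
                    # Split each sentence into chunks of at most max_length tokens.
                    for start in range(0, len(sent), max_length):
                        doc_spans.append(sent[start : start + max_length])
                spans.append(doc_spans)
            return spans

        return get_chunked_sent_spans

Once registered, a config block can reference it the same way the built-in getters are referenced elsewhere in this repo, e.g. @span_getters = "custom_sent_spans.v1" with max_length = 4.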
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_textcatcnn.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from packaging.version import Version
3 |
4 | from spacy.training.example import Example
5 | from spacy import util
6 | import thinc
7 | from thinc.api import Model, Config
8 |
9 | # fmt: off
10 | cfg_string = """
11 | [nlp]
12 | lang = "en"
13 | pipeline = ["textcat"]
14 |
15 | [components]
16 |
17 | [components.textcat]
18 | factory = "textcat"
19 |
20 | [components.textcat.model]
21 | @architectures = "spacy.TextCatCNN.v2"
22 | nO = null
23 | exclusive_classes = false
24 |
25 | [components.textcat.model.tok2vec]
26 | @architectures = "spacy-transformers.Tok2VecTransformer.v1"
27 | name = "roberta-base"
28 | tokenizer_config = {"use_fast": false}
29 | grad_factor = 1.0
30 |
31 | [components.textcat.model.tok2vec.get_spans]
32 | @span_getters = "spacy-transformers.strided_spans.v1"
33 | window = 256
34 | stride = 256
35 |
36 | [components.textcat.model.tok2vec.pooling]
37 | @layers = "reduce_mean.v1"
38 | """
39 | # fmt: on
40 |
41 |
42 | # TODO: remove this skip after requiring spacy>=3.5.1; at the very latest, after
43 | # dropping Python 3.7, switch to importlib.metadata.version("thinc")
44 | @pytest.mark.skipif(
45 | Version(thinc.__version__) < Version("8.1.8"), reason="Requires thinc>=8.1.8"
46 | )
47 | def test_textcatcnn():
48 | orig_config = Config().from_str(cfg_string)
49 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
50 | assert nlp.pipe_names == ["textcat"]
51 |
52 | textcat = nlp.get_pipe("textcat")
53 | assert textcat.is_resizable is True
54 |
55 | train_examples = []
56 | doc = nlp.make_doc("ok")
57 | doc.cats["X"] = 1.0
58 | doc.cats["Y"] = 0.0
59 | train_examples.append(Example(doc, doc))
60 |
61 | nlp.initialize(lambda: train_examples)
62 |
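Because the test asserts that the textcat is resizable, labels can in principle still be added after initialization. A hedged continuation sketch (lines that could be appended inside test_textcatcnn; the behavior of the freshly added label's scores is an assumption, not something this test checks):

    # Resizable models accept new labels after initialization; predictions
    # should then include a (so far untrained) score for the new label.
    textcat.add_label("Z")
    assert len(textcat.labels) == 3
    doc = nlp("a fresh example")
    assert set(doc.cats) == {"X", "Y", "Z"}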
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_tok2vectransformer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from spacy.training.example import Example
3 | from spacy.util import make_tempdir
4 | from spacy import util
5 | from thinc.api import Model, Config
6 | from .util import _assert_equal_tensors
7 |
8 | # fmt: off
9 | TRAIN_DATA = [
10 | ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
11 | ("Eat blue ham", {"tags": ["V", "J", "N"]}),
12 | ]
13 |
14 |
15 | cfg_string = """
16 | [nlp]
17 | lang = "en"
18 | pipeline = ["tagger"]
19 |
20 | [components]
21 |
22 | [components.tagger]
23 | factory = "tagger"
24 |
25 | [components.tagger.model]
26 | @architectures = "spacy.Tagger.v1"
27 | nO = null
28 |
29 | [components.tagger.model.tok2vec]
30 | @architectures = "spacy-transformers.Tok2VecTransformer.v1"
31 | name = "distilbert-base-uncased"
32 | tokenizer_config = {"use_fast": false}
33 | grad_factor = 1.0
34 |
35 | [components.tagger.model.tok2vec.get_spans]
36 | @span_getters = "spacy-transformers.strided_spans.v1"
37 | window = 256
38 | stride = 256
39 |
40 | [components.tagger.model.tok2vec.pooling]
41 | @layers = "reduce_mean.v1"
42 | """
43 | # fmt: on
44 |
45 |
46 | def test_transformer_pipeline_tagger_internal():
47 | """Test that a tagger with internal transformer runs and trains properly"""
48 | orig_config = Config().from_str(cfg_string)
49 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
50 | assert nlp.pipe_names == ["tagger"]
51 | tagger = nlp.get_pipe("tagger")
52 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
53 | assert isinstance(tagger_trf, Model)
54 | train_examples = []
55 | for t in TRAIN_DATA:
56 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
57 | for tag in t[1]["tags"]:
58 | tagger.add_label(tag)
59 |
60 | optimizer = nlp.initialize(lambda: train_examples)
61 | for i in range(2):
62 | losses = {}
63 | nlp.update(train_examples, sgd=optimizer, losses=losses)
64 |
65 | doc = nlp("We're interested at underwater basket weaving.")
66 | doc_tensor = tagger_trf.predict([doc])
67 |
68 | # ensure IO goes OK
69 | with make_tempdir() as d:
70 | file_path = d / "trained_nlp"
71 | nlp.to_disk(file_path)
72 |
73 | # results are not the same if we don't call from_disk
74 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
75 | nlp2.initialize(lambda: train_examples)
76 | doc2 = nlp2("We're interested at underwater basket weaving.")
77 | tagger2 = nlp2.get_pipe("tagger")
78 | tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
79 | doc_tensor2 = tagger_trf2.predict([doc2])
80 | with pytest.raises(AssertionError):
81 | _assert_equal_tensors(
82 | doc_tensor2.doc_data[0].tensors, doc_tensor.doc_data[0].tensors
83 | )
84 |
85 | # results ARE the same if we call from_disk
86 | nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
87 | nlp3.from_disk(file_path)
88 | doc3 = nlp3("We're interested at underwater basket weaving.")
89 | tagger3 = nlp3.get_pipe("tagger")
90 | tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
91 | doc_tensor3 = tagger_trf3.predict([doc3])
92 | _assert_equal_tensors(
93 | doc_tensor3.doc_data[0].tensors, doc_tensor.doc_data[0].tensors
94 | )
95 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/test_truncation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy
3 | from thinc.types import Ragged
4 | from thinc.api import NumpyOps
5 | from ..data_classes import WordpieceBatch
6 | from ..truncate import _truncate_tokens, _truncate_alignment
7 |
8 |
9 | @pytest.fixture
10 | def sequences():
11 | # Each sequence is a list of tokens, and each token is a number of wordpieces
12 | return [
13 | [1, 3, 1], # So 5 wordpieces this sequence
14 | [3, 7, 1, 1], # 12
15 | [1], # 1
16 | [20, 1], # 21
17 | ]
18 |
19 |
20 | @pytest.fixture
21 | def shape(sequences):
22 | # Get the shape of the input_ids, which includes the padding.
23 | maximum = max(sum(lengths) for lengths in sequences)
24 | return (len(sequences), maximum)
25 |
26 |
27 | @pytest.fixture
28 | def seq_lengths(sequences):
29 | return numpy.array([sum(seq) for seq in sequences], dtype="i")
30 |
31 |
32 | @pytest.fixture
33 | def wordpieces(sequences):
34 | strings = []
35 | for token_lengths in sequences:
36 | strings.append([])
37 | for length in token_lengths:
38 | strings[-1].extend(str(i) for i in range(length))
39 | shape = (len(strings), max(len(seq) for seq in strings))
40 | wordpieces = WordpieceBatch(
41 | strings=strings,
42 | input_ids=numpy.zeros(shape, dtype="i"),
43 | token_type_ids=numpy.zeros(shape, dtype="i"),
44 | attention_mask=numpy.zeros((shape[0], shape[1]), dtype="bool"),
45 | lengths=[len(seq) for seq in strings],
46 | )
47 | return wordpieces
48 |
49 |
50 | @pytest.fixture
51 | def align(sequences):
52 | lengths = []
53 | indices = []
54 | offset = 0
55 | for seq in sequences:
56 | for token_length in seq:
57 | lengths.append(token_length)
58 | indices.extend(i + offset for i in range(token_length))
59 | offset += token_length
60 | return Ragged(numpy.array(indices, dtype="i"), numpy.array(lengths, dtype="i"))
61 |
62 |
63 | @pytest.fixture
64 | def max_length():
65 | return 6
66 |
67 |
68 | @pytest.fixture
69 | def mask_from_end(shape, max_length):
70 | n_seq, length = shape
71 | bools = [
72 | numpy.array([i < max_length for i in range(length)], dtype="bool")
73 | for _ in range(n_seq)
74 | ]
75 | return numpy.concatenate(bools)
76 |
77 |
78 | def test_truncate_wordpieces(wordpieces, max_length, mask_from_end):
79 | truncated = _truncate_tokens(wordpieces, mask_from_end)
80 | for i, seq in enumerate(truncated.strings):
81 | assert len(seq) <= max_length
82 | assert seq == wordpieces.strings[i][:max_length]
83 | assert truncated.input_ids[i].shape[0] <= max_length
84 | assert truncated.token_type_ids[i].shape[0] <= max_length
85 | assert truncated.attention_mask[i].shape[0] <= max_length
86 |
87 |
88 | def test_truncate_alignment_from_end(sequences, max_length, align, mask_from_end):
89 | # print("Max length", max_length)
90 | # print("Sequences", sequences)
91 | # print("Mask", mask_from_end)
92 | ops = NumpyOps()
93 | truncated = _truncate_alignment(align, mask_from_end)
94 | # print(truncated.dataXd.shape, truncated.lengths.sum())
95 | # print("Before", list(map(list, ops.unflatten(align.dataXd, align.lengths))))
96 | # print("After", list(map(list, ops.unflatten(truncated.dataXd, truncated.lengths))))
97 | # Check that the number of tokens hasn't changed. We still need to have
98 | # alignment for every token.
99 | assert truncated.lengths.shape[0] == align.lengths.shape[0]
100 | start = 0
101 | for i, seq in enumerate(sequences):
102 | end = start + len(seq)
103 | # Get the alignment for this sequence of tokens. Each length in the
104 | # alignment indicates the number of wordpiece tokens, so we need to
105 | # check that the sum of the lengths doesn't exceed the maximum.
106 | wp_indices = truncated[start:end]
107 | assert wp_indices.lengths.sum() <= max_length
108 | # We're truncating from the end, so we shouldn't see different values
109 | # except at the end of the sequence.
110 | seen_zero = False
111 | before = align[start:end]
112 | for length_now, length_before in zip(wp_indices.lengths, before.lengths):
113 | if seen_zero:
114 | assert length_now == 0, wp_indices.lengths
115 | elif length_now == 0:
116 | seen_zero = True
117 | else:
118 | assert length_now == length_before
119 |
--------------------------------------------------------------------------------
/spacy_transformers/tests/util.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Union
2 | import numpy
3 | import torch
4 | import copy
5 | from transformers.file_utils import ModelOutput
6 | from numpy.testing import assert_array_equal
7 |
8 | from spacy.tokens import Doc
9 | from thinc.api import Model, get_current_ops
10 |
11 | from ..data_classes import FullTransformerBatch, HFObjects
12 | from ..span_getters import get_doc_spans
13 | from ..layers.transformer_model import forward as transformer_forward
14 |
15 |
16 | def _assert_equal_tensors(tensors1, tensors2):
17 | ops = get_current_ops()
18 | for i in range(len(tensors1)):
19 | t1 = ops.asarray(tensors1[i])
20 | t2 = ops.asarray(tensors2[i])
21 | assert_array_equal(ops.to_numpy(t1), ops.to_numpy(t2))
22 |
23 |
24 | class DummyTokenizer:
25 | def __init__(self):
26 | self.str2int = {}
27 | self.int2str = {}
28 | self.start_symbol = "<s>"
29 | self.end_symbol = "</s>"
30 | self.model_max_length = 512
31 | self.pad_token = "[PAD]"
32 |
33 | @property
34 | def all_special_tokens(self):
35 | return [self.start_symbol, self.end_symbol]
36 |
37 | def __call__(
38 | self,
39 | texts,
40 | add_special_tokens=True,
41 | max_length=None,
42 | stride: int = 0,
43 | truncation_strategy="longest_first",
44 | padding=False,
45 | truncation=False,
46 | is_pretokenized=False,
47 | return_tensors=None,
48 | return_token_type_ids=None,
49 | return_attention_mask=None,
50 | return_overflowing_tokens=False,
51 | return_special_tokens_masks=False,
52 | return_offsets_mapping=False,
53 | return_length=False,
54 | ):
55 | output: Dict = {
56 | "input_ids": [],
57 | "attention_mask": [],
58 | "token_type_ids": [],
59 | } # type: ignore
60 |
61 | for text in texts:
62 | words, offsets, mask, type_ids = self._tokenize(text)
63 | ids = self._encode_words(words)
64 | output["input_ids"].append(ids)
65 | output["attention_mask"].append(mask)
66 | output["token_type_ids"].append(type_ids)
67 | if padding:
68 | output = self._pad(output)
69 | if return_tensors == "pt":
70 | output["input_ids"] = torch.tensor(output["input_ids"]) # type: ignore
71 | output["attention_mask"] = torch.tensor(output["attention_mask"]) # type: ignore
72 | output["token_type_ids"] = torch.tensor(output["token_type_ids"]) # type: ignore
73 | elif return_tensors == "np":
74 | output["input_ids"] = numpy.asarray(output["input_ids"]) # type: ignore
75 | output["attention_mask"] = numpy.asarray(output["attention_mask"]) # type: ignore
76 | output["token_type_ids"] = numpy.asarray(output["token_type_ids"]) # type: ignore
77 | if return_length:
78 | output["length"] = torch.tensor([len(x) for x in output["input_ids"]]) # type: ignore
79 | return output
80 |
81 | def convert_ids_to_tokens(self, ids: Union[List[int], torch.Tensor]) -> List[str]:
82 | return [self.int2str[int(id_)] for id_ in ids] # type: ignore
83 |
84 | def _pad(self, batch):
85 | batch = copy.deepcopy(batch)
86 | longest = max(len(ids) for ids in batch["input_ids"])
87 | for i in range(len(batch["input_ids"])):
88 | length = len(batch["input_ids"][i])
89 | difference = longest - length
90 | batch["attention_mask"][i] = [1] * length + [0] * difference
91 | batch["input_ids"][i].extend([0] * difference)
92 | batch["token_type_ids"][i].extend([2] * difference)
93 | return batch
94 |
95 | def _tokenize(self, text):
96 | offsets = []
97 | start = 0
98 | for i, char in enumerate(text):
99 | if char == " ":
100 | offsets.append((start, i))
101 | start = i + 1
102 | if start < len(text):
103 | offsets.append((start, len(text)))
104 | words = [text[start:end] for start, end in offsets]
105 | type_ids = [0] + [1] * len(words) + [0]
106 | words = [self.start_symbol] + words + [self.end_symbol]
107 | offsets = [None] + offsets + [None]
108 | mask = [1] * len(words)
109 | return words, offsets, mask, type_ids
110 |
111 | def _encode_words(self, words):
112 | ids = []
113 | for word in words:
114 | if word not in self.str2int:
115 | self.int2str[len(self.str2int)] = word
116 | self.str2int[word] = len(self.str2int)
117 | ids.append(self.str2int[word])
118 | return ids
119 |
120 |
121 | def DummyTransformerModel(width: int, depth: int):
122 | def _forward(model, tokens, is_train):
123 | width = model.attrs["width"]
124 | depth = model.attrs["depth"]
125 | shape = (depth, tokens.input_ids.shape[0], tokens.input_ids.shape[1], width)
126 | tensors = torch.zeros(*shape)
127 | return ModelOutput(last_hidden_state=tensors), lambda d_tensors: tokens
128 |
129 | return Model(
130 | "dummy-transformer",
131 | _forward,
132 | attrs={"width": width, "depth": depth},
133 | )
134 |
135 |
136 | def DummyTransformer(
137 | depth: int = 2, width: int = 4, get_spans=get_doc_spans
138 | ) -> Model[List[Doc], FullTransformerBatch]:
139 | """Create a test model that produces a FullTransformerBatch object."""
140 | hf_model = HFObjects(DummyTokenizer(), None, None)
141 |
142 | return DummyModel(
143 | "dummy-transformer",
144 | transformer_forward,
145 | layers=[DummyTransformerModel(width=width, depth=depth)],
146 | attrs={
147 | "get_spans": get_spans,
148 | "hf_model": hf_model,
149 | "grad_factor": 1.0,
150 | "flush_cache_chance": 0.0,
151 | "transformer_config": {},
152 | },
153 | dims={"nO": width},
154 | )
155 |
156 |
157 | class DummyModel(Model):
158 | @property
159 | def tokenizer(self):
160 | return DummyTokenizer()
161 |
162 | @property
163 | def transformer(self):
164 | return None
165 |
166 | @property
167 | def tokenizer_config(self):
168 | return {}
169 |
170 | @property
171 | def transformer_config(self):
172 | return {}
173 |
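A hedged sketch of how the dummies above are typically driven in tests: the dummy model maps a batch of Docs to the same FullTransformerBatch container as the real transformer layer, without downloading any weights. It assumes it runs in this module's namespace, where DummyTransformer and FullTransformerBatch are already available:

    from spacy.lang.en import English

    def dummy_transformer_sketch():
        nlp = English()
        docs = [nlp("hello world"), nlp("this is only a test")]
        model = DummyTransformer(depth=2, width=4)
        batch = model.predict(docs)
        # Same output type as the real TransformerModel layer.
        assert isinstance(batch, FullTransformerBatch)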
--------------------------------------------------------------------------------
/spacy_transformers/truncate.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, List, Union, TypeVar
2 | import numpy
3 | from thinc.types import Ragged, Ints2d, Floats2d
4 | from .data_classes import WordpieceBatch
5 |
6 | ArrayT = TypeVar("ArrayT", bound=Union[Ints2d, Floats2d])
7 |
8 |
9 | def truncate_oversize_splits(
10 | wordpieces: WordpieceBatch, align: Ragged, max_length: int
11 | ) -> Tuple[WordpieceBatch, Ragged]:
12 | """Drop wordpieces from inputs that are too long. This can happen because
13 | the splitter is based on linguistic tokens, and the number of wordpieces
14 | that each token is split into is unpredictable, so we can end up with splits
15 | that have more wordpieces than the model's maximum.
16 |
17 | To solve this, we calculate a score for each wordpiece in the split,
18 | and drop the wordpieces with the highest scores. I can think of a few
19 | scoring schemes we could use:
20 |
21 | a) Drop the ends of longest wordpieces. This scoring would be just:
22 | position_in_token
23 | b) Drop the middles of longest wordpieces. The score would be:
24 | min(length - position_in_token, position_in_token)
25 | c) Drop all wordpieces from longest tokens. This would be:
26 | length
27 | d) Drop wordpieces from the end of the split. This would be:
28 | position_in_split
29 |
30 | The advantage of a) and b) is that they give some representation to each
31 | token. The advantage of c) is that it leaves a higher % of tokens with intact
32 | representations. The advantage of d) is that it leaves contiguous chunks of
33 | wordpieces intact, and drops from the end.
34 |
35 | I find b) most appealing, but it's also the most complicated. Let's just do
36 | d) for now.
37 | """
38 | if wordpieces.input_ids.shape[1] < max_length:
39 | return wordpieces, align
40 | mask = _get_truncation_mask_drop_from_end(
41 | wordpieces.input_ids.shape, wordpieces.lengths, align, max_length
42 | )
43 | return _truncate_tokens(wordpieces, mask), _truncate_alignment(align, mask)
44 |
45 |
46 | def _get_truncation_mask_drop_from_end(
47 | shape: Tuple[int, int], split_lengths: List[int], align: Ragged, max_length: int
48 | ) -> numpy.ndarray:
49 | """Return a two-dimensional boolean mask, indicating whether wordpieces
50 | are dropped from their sequences.
51 |
52 | Drop wordpieces from the end of the sequence.
53 | """
54 | mask = numpy.ones(shape, dtype="bool")
55 | mask[:, max_length:] = 0
56 | return mask
57 |
58 |
59 | def _truncate_tokens(wordpieces: WordpieceBatch, mask: numpy.ndarray) -> WordpieceBatch:
60 | n_seq = len(wordpieces)
61 | mask1d = mask.ravel()
62 | mask = mask.reshape((n_seq, -1))
63 |
64 | strings: List[List[str]] = []
65 | for i, seq in enumerate(wordpieces.strings):
66 | strings.append([])
67 | for j, token in enumerate(seq):
68 | if mask[i, j]:
69 | strings[-1].append(token)
70 |
71 | def filter_array(data: ArrayT) -> ArrayT:
72 | data1d = data.reshape((-1,))
73 | return data1d[mask1d].reshape((n_seq, -1)) # type: ignore
74 |
75 | filtered_token_type_ids = None
76 | if wordpieces.token_type_ids is not None:
77 | filtered_token_type_ids = filter_array(wordpieces.token_type_ids)
78 |
79 | return WordpieceBatch(
80 | strings=strings,
81 | input_ids=filter_array(wordpieces.input_ids),
82 | attention_mask=filter_array(wordpieces.attention_mask),
83 | lengths=[len(seq) for seq in strings],
84 | token_type_ids=filtered_token_type_ids,
85 | )
86 |
87 |
88 | def _truncate_alignment(align: Ragged, mask: numpy.ndarray) -> Ragged:
89 | # We're going to have fewer wordpieces in the new array, so all of our
90 | # wordpiece indices in the alignment table will be off --- they'll point
91 | # to the wrong row. So we need to do three things here:
92 | #
93 | # 1) Adjust all the indices in align.dataXd to account for the dropped data
94 | # 2) Remove the dropped indices from the align.dataXd
95 | # 3) Calculate new align.lengths
96 | #
97 | # The wordpiece mapping is easily calculated by the cumulative sum of the
98 | # mask table.
99 | # Let's say we have [True, False, False, True]. The mapping of the dropped
100 | # wordpieces doesn't matter, because we can filter it with the mask. So we
101 | # have [0, 0, 0, 1], i.e. the wordpiece that was
102 | # at 0 is still at 0, and the wordpiece that was at 3 is now at 1.
103 | mask = mask.ravel()
104 | idx_map = mask.cumsum() - 1
105 | idx_map[~mask] = -1
106 | # Step 1: Adjust all the indices in align.dataXd.
107 | new_align = idx_map[align.data.ravel()]
108 | # Step 2: Remove the dropped indices
109 | new_align = new_align[new_align >= 0]
110 | # Step 3: Calculate new align.lengths
111 | new_lengths = align.lengths.copy()
112 | for i in range(len(align.lengths)):
113 | drops = ~mask[align[i].data.ravel()]
114 | new_lengths[i] -= drops.sum()
115 | return Ragged(new_align, new_lengths)
116 |
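A standalone numpy walk-through (not part of the module) of the two building blocks above: the drop-from-end mask and the cumulative-sum index remapping used by _truncate_alignment:

    import numpy

    # Keep at most 3 wordpiece columns per sequence (scheme d: drop from the end).
    shape = (2, 5)                # 2 sequences, padded to 5 wordpieces each
    max_length = 3
    mask = numpy.ones(shape, dtype="bool")
    mask[:, max_length:] = 0      # same construction as _get_truncation_mask_drop_from_end

    # Remap old wordpiece indices to their positions after dropping columns.
    flat = mask.ravel()
    idx_map = flat.cumsum() - 1   # position of each kept wordpiece in the new array
    idx_map[~flat] = -1           # dropped wordpieces map to -1 and are filtered out
    old = numpy.array([0, 1, 3, 4, 5, 6])  # example alignment indices
    new = idx_map[old]
    print(new[new >= 0])          # [0 1 3 4]: indices shifted past the dropped columns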
--------------------------------------------------------------------------------
/spacy_transformers/util.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Union, Set
2 | from pathlib import Path
3 | import random
4 | from transformers import AutoModel, AutoTokenizer
5 | from transformers.tokenization_utils import BatchEncoding
6 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
7 | import catalogue
8 | from spacy.util import registry
9 | from thinc.api import get_torch_default_device
10 | import torch.cuda
11 | import tempfile
12 | import shutil
13 | import contextlib
14 | import warnings
15 |
16 |
17 | # fmt: off
18 | registry.span_getters = catalogue.create("spacy", "span_getters", entry_points=True) # type: ignore
19 | registry.annotation_setters = catalogue.create("spacy", "annotation_setters", entry_points=True) # type: ignore
20 | # fmt: on
21 |
22 |
23 | def huggingface_from_pretrained(source: Union[Path, str], config: Dict):
24 | """Create a Huggingface transformer model from pretrained weights. Will
25 | download the model if it is not already downloaded.
26 |
27 | source (Union[str, Path]): The name of the model or a path to it, such as
28 | 'bert-base-cased'.
29 | config (dict): Settings to pass to the tokenizer.
30 | """
31 | warnings.warn(
32 | "spacy_transformers.util.huggingface_from_pretrained has been moved to "
33 | "spacy_transformers.layers.transformer_model.huggingface_from_pretrained "
34 | "with an updated API:\n"
35 | "huggingface_from_pretrained(source, tok_config, trf_config) -> HFObjects",
36 | DeprecationWarning,
37 | )
38 | if isinstance(source, Path):
39 | str_path = str(source.absolute())
40 | else:
41 | str_path = source
42 | tokenizer = AutoTokenizer.from_pretrained(str_path, **config)
43 | transformer = AutoModel.from_pretrained(str_path)
44 | torch_device = get_torch_default_device()
45 | transformer.to(torch_device)
46 | return tokenizer, transformer
47 |
48 |
49 | def huggingface_tokenize(tokenizer, texts: List[str]) -> BatchEncoding:
50 | """Apply a Huggingface tokenizer to a batch of texts."""
51 |
52 | # Use NumPy arrays rather than PyTorch tensors to avoid a lot of
53 | # host <-> device transfers during tokenization and post-processing
54 | # when a GPU is used.
55 | warnings.warn(
56 | "spacy_transformers.util.huggingface_tokenize has been moved to "
57 | "spacy_transformers.layers.transformer_model.huggingface_tokenize.",
58 | DeprecationWarning,
59 | )
60 | token_data = tokenizer(
61 | texts,
62 | add_special_tokens=True,
63 | return_attention_mask=True,
64 | return_offsets_mapping=isinstance(tokenizer, PreTrainedTokenizerFast),
65 | return_tensors="np",
66 | return_token_type_ids=None, # Sets to model default
67 | padding="longest",
68 | )
69 | token_data["input_texts"] = []
70 | for i in range(len(token_data["input_ids"])):
71 | wp_texts = tokenizer.convert_ids_to_tokens(token_data["input_ids"][i])
72 | token_data["input_texts"].append(wp_texts)
73 | token_data["pad_token"] = tokenizer.pad_token
74 | return token_data
75 |
76 |
77 | def maybe_flush_pytorch_cache(chance: float = 1.0):
78 | """Flip a coin and decide whether to flush PyTorch's cache. This allows the
79 | cache to be flushed periodically without maintaining a counter.
80 |
81 | I'm not sure why this is necessary; it shouldn't be. But it definitely does
82 | help...
83 | """
84 | if random.random() < chance and torch.cuda.is_available():
85 | torch.cuda.empty_cache()
86 |
87 |
88 | def transpose_list(nested_list):
89 | output = []
90 | for i, entry in enumerate(nested_list):
91 | while len(output) < len(entry):
92 | output.append([None] * len(nested_list))
93 | for j, x in enumerate(entry):
94 | output[j][i] = x
95 | return output
96 |
97 |
98 | def batch_by_length(seqs, max_words: int) -> List[List[int]]:
99 | """Given a list of sequences, return a batched list of indices into the
100 | list, where the batches are grouped by length, in descending order.
101 |
102 | Each batch may contain at most max_words, where a batch's word count is defined as its longest sequence length * the number of sequences in the batch.
103 | """
104 | # Sort (length, index) pairs ascending; batches are reversed at the end so longer sequences come first.
105 | lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
106 | lengths_indices.sort()
107 | batches: List[List[int]] = []
108 | batch: List[int] = []
109 | for length, i in lengths_indices:
110 | if not batch:
111 | batch.append(i)
112 | elif length * (len(batch) + 1) <= max_words:
113 | batch.append(i)
114 | else:
115 | batches.append(batch)
116 | batch = [i]
117 | if batch:
118 | batches.append(batch)
119 | # Check lengths match
120 | assert sum(len(b) for b in batches) == len(seqs)
121 | # Check no duplicates
122 | seen: Set[int] = set()
123 | for b in batches:
124 | seen.update(b)
125 | assert len(seen) == len(seqs)
126 | batches = [list(sorted(batch)) for batch in batches]
127 | batches.reverse()
128 | return batches
129 |
130 |
131 | def log_gpu_memory(logger, context):
132 | mem = torch.cuda.memory_allocated() // 1024**2
133 | logger.info(f"{mem:.1f}: {context}")
134 |
135 |
136 | def log_batch_size(logger, token_data, is_train):
137 | batch_size = token_data["input_ids"].shape[0]
138 | seq_len = token_data["input_ids"].shape[1]
139 | squared = seq_len**2 * batch_size
140 |
141 | if is_train:
142 | logger.info(f"{batch_size} x {seq_len} ({squared}) update")
143 | else:
144 | logger.info(f"{batch_size} x {seq_len} ({squared}) predict")
145 |
146 |
147 | @contextlib.contextmanager
148 | def make_tempdir():
149 | """Execute a block in a temporary directory and remove the directory and
150 | its contents at the end of the with block.
151 |
152 | YIELDS (Path): The path of the temp directory.
153 | """
154 | d = Path(tempfile.mkdtemp())
155 | yield d
156 | shutil.rmtree(str(d))
157 |
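A small worked example of batch_by_length (inputs illustrative) showing how indices are grouped under the max_words budget described in the docstring:

    from spacy_transformers.util import batch_by_length

    seqs = ["aaaa", "bb", "cccccc", "ddd"]  # lengths 4, 2, 6, 3
    print(batch_by_length(seqs, max_words=8))
    # [[2], [0], [1, 3]]: longest sequences come first; "bb" and "ddd" share a
    # batch because 3 (longest in batch) * 2 (batch size) = 6 <= 8.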
--------------------------------------------------------------------------------