├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    ├── no-response.yml
    └── workflows
    │   ├── cibuildwheel.yml
    │   ├── explosionbot.yml
    │   ├── issue-manager.yml
    │   ├── publish_pypi.yml
    │   └── tests.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── bin
    ├── get-version.sh
    └── push-tag.sh
├── build-constraints.txt
├── examples
    └── configs
    │   ├── joint-core-bert.cfg
    │   └── ner-albert.cfg
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
└── spacy_transformers
    ├── __init__.py
    ├── align.pyi
    ├── align.pyx
    ├── annotation_setters.py
    ├── architectures.py
    ├── data_classes.py
    ├── layers
        ├── __init__.py
        ├── _util.py
        ├── hf_shim.py
        ├── hf_wrapper.py
        ├── listener.py
        ├── split_trf.py
        ├── transformer_model.py
        └── trfs2arrays.py
    ├── pipeline_component.py
    ├── py.typed
    ├── span_getters.py
    ├── tests
        ├── __init__.py
        ├── enable_gpu.py
        ├── regression
        │   ├── __init__.py
        │   ├── test_spacy_issue6401.py
        │   └── test_spacy_issue7029.py
        ├── test_alignment.py
        ├── test_configs.py
        ├── test_data_classes.py
        ├── test_deprecations.py
        ├── test_model_sequence_classification.py
        ├── test_model_wrapper.py
        ├── test_pipeline_component.py
        ├── test_serialize.py
        ├── test_spanners.py
        ├── test_textcatcnn.py
        ├── test_tok2vectransformer.py
        ├── test_truncation.py
        └── util.py
    ├── truncate.py
    └── util.py


/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | <!--- Provide a general summary of your changes in the title. -->
 2 | 
 3 | ## Description
 4 | 
 5 | <!--- Use this section to describe your changes. If your changes required
 6 | testing, include information about the testing environment and the tests you
 7 | ran. If your test fixes a bug reported in an issue, don't forget to include the
 8 | issue number. If your PR is still a work in progress, that's totally fine – just
 9 | include a note to let us know. -->
10 | 
11 | ### Types of change
12 | 
13 | <!-- What type of change does your PR cover? Is it a bug fix, an enhancement
14 | or new feature, or a change to the documentation? -->
15 | 
16 | ## Checklist
17 | 
18 | <!--- Before you submit the PR, go over this checklist and make sure you can
19 | tick off all the boxes. [] -> [x] -->
20 | 
21 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license.
22 | - [ ] I ran the tests, and all new and existing tests passed.
23 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
24 | 


--------------------------------------------------------------------------------
/.github/no-response.yml:
--------------------------------------------------------------------------------
 1 | # Configuration for probot-no-response - https://github.com/probot/no-response
 2 | 
 3 | # Number of days of inactivity before an Issue is closed for lack of response
 4 | daysUntilClose: 14
 5 | # Label requiring a response
 6 | responseRequiredLabel: more-info-needed
 7 | # Comment to post when closing an Issue for lack of response. Set to `false` to disable
 8 | closeComment: >
 9 |   This issue has been automatically closed because there has been no response
10 |   to a request for more information from the original author. With only the
11 |   information that is currently in the issue, there's not enough information
12 |   to take action. If you're the original author, feel free to reopen the issue
13 |   if you have or find the answers needed to investigate further.
14 | 


--------------------------------------------------------------------------------
/.github/workflows/cibuildwheel.yml:
--------------------------------------------------------------------------------
 1 | name: Build
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       # ytf did they invent their own syntax that's almost regex?
 7 |       # ** matches 'zero or more of any character'
 8 |       - 'release-v[0-9]+.[0-9]+.[0-9]+**'
 9 |       - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
10 | jobs:
11 |   build_wheels:
12 |     name: Build wheels on ${{ matrix.os }}
13 |     runs-on: ${{ matrix.os }}
14 |     strategy:
15 |       matrix:
16 |         # macos-13 is an intel runner, macos-14 is apple silicon
17 |         os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
18 | 
19 |     steps:
20 |       - uses: actions/checkout@v4
21 |       - name: Build wheels
22 |         uses: pypa/cibuildwheel@v2.21.3
23 |         env:
24 |           CIBW_SOME_OPTION: value
25 |         with:
26 |           package-dir: .
27 |           output-dir: wheelhouse
28 |           config-file: "{package}/pyproject.toml"
29 |       - uses: actions/upload-artifact@v4
30 |         with:
31 |           name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
32 |           path: ./wheelhouse/*.whl
33 | 
34 |   build_sdist:
35 |     name: Build source distribution
36 |     runs-on: ubuntu-latest
37 |     steps:
38 |       - uses: actions/checkout@v4
39 | 
40 |       - name: Build sdist
41 |         run: pipx run build --sdist
42 |       - uses: actions/upload-artifact@v4
43 |         with:
44 |           name: cibw-sdist
45 |           path: dist/*.tar.gz
46 |   create_release:
47 |     needs: [build_wheels, build_sdist]
48 |     runs-on: ubuntu-latest
49 |     permissions:
50 |       contents: write
51 |       checks: write
52 |       actions: read
53 |       issues: read
54 |       packages: write
55 |       pull-requests: read
56 |       repository-projects: read
57 |       statuses: read
58 |     steps:
59 |       - name: Get the tag name and determine if it's a prerelease
60 |         id: get_tag_info
61 |         run: |
62 |           FULL_TAG=${GITHUB_REF#refs/tags/}
63 |           if [[ $FULL_TAG == release-* ]]; then
64 |             TAG_NAME=${FULL_TAG#release-}
65 |             IS_PRERELEASE=false
66 |           elif [[ $FULL_TAG == prerelease-* ]]; then
67 |             TAG_NAME=${FULL_TAG#prerelease-}
68 |             IS_PRERELEASE=true
69 |           else
70 |             echo "Tag does not match expected patterns" >&2
71 |             exit 1
72 |           fi
73 |           echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
74 |           echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
75 |           echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
76 |       - uses: actions/download-artifact@v4
77 |         with:
78 |           # unpacks all CIBW artifacts into dist/
79 |           pattern: cibw-*
80 |           path: dist
81 |           merge-multiple: true
82 |       - name: Create Draft Release
83 |         id: create_release
84 |         uses: softprops/action-gh-release@v2
85 |         if: startsWith(github.ref, 'refs/tags/')
86 |         env:
87 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
88 |         with:
89 |           name: ${{ env.TAG_NAME }}
90 |           draft: true
91 |           prerelease: ${{ env.IS_PRERELEASE }}
92 |           files: "./dist/*" 
93 | 


--------------------------------------------------------------------------------
/.github/workflows/explosionbot.yml:
--------------------------------------------------------------------------------
 1 | name: Explosion Bot
 2 | 
 3 | on:
 4 |   issue_comment:
 5 |     types:
 6 |       - created
 7 |       - edited
 8 | 
 9 | jobs:
10 |   explosion-bot:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Dump GitHub context
14 |         env:
15 |           GITHUB_CONTEXT: ${{ toJson(github) }}
16 |         run: echo "$GITHUB_CONTEXT"
17 |       - uses: actions/checkout@v3
18 |       - uses: actions/setup-python@v4
19 |       - name: Install and run explosion-bot
20 |         run: |
21 |           pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
22 |           python -m explosionbot
23 |         env:
24 |           INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
25 |           INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
26 |           ENABLED_COMMANDS: "test_gpu"
27 |           ALLOWED_TEAMS: "spacy-maintainers"
28 | 


--------------------------------------------------------------------------------
/.github/workflows/issue-manager.yml:
--------------------------------------------------------------------------------
 1 | name: Issue Manager
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: "0 0 * * *"
 6 |   issue_comment:
 7 |     types:
 8 |       - created
 9 |       - edited
10 |   issues:
11 |     types:
12 |       - labeled
13 | 
14 | jobs:
15 |   issue-manager:
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - uses: tiangolo/issue-manager@0.2.1
19 |         with:
20 |           token: ${{ secrets.GITHUB_TOKEN }}
21 |           config: >
22 |             {
23 |               "resolved": {
24 |                 "delay": "P7D",
25 |                 "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
26 |                 "remove_label_on_comment": true,
27 |                 "remove_label_on_close": true
28 |               }
29 |             }


--------------------------------------------------------------------------------
/.github/workflows/publish_pypi.yml:
--------------------------------------------------------------------------------
 1 | # The cibuildwheel action triggers on creation of a release, this
 2 | # triggers on publication.
 3 | # The expected workflow is to create a draft release and let the wheels
 4 | # upload, and then hit 'publish', which uploads to PyPi.
 5 | 
 6 | on:
 7 |   release:
 8 |     types:
 9 |       - published
10 | 
11 | jobs:
12 |   upload_pypi:
13 |     runs-on: ubuntu-latest
14 |     environment:
15 |       name: pypi
16 |       url: https://pypi.org/p/spacy-transformers
17 |     permissions:
18 |       id-token: write
19 |       contents: read
20 |     if: github.event_name == 'release' && github.event.action == 'published'
21 |     # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
22 |     # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
23 |     steps:
24 |       - uses: robinraju/release-downloader@v1
25 |         with:
26 |           tag: ${{ github.event.release.tag_name }}
27 |           fileName: '*'
28 |           out-file-path: 'dist'
29 |       - uses: pypa/gh-action-pypi-publish@release/v1
30 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
  1 | name: tests
  2 | 
  3 | on:
  4 |   push:
  5 |     paths-ignore:
  6 |       - "*.md"
  7 |   pull_request:
  8 |     types: [opened, synchronize, reopened, edited]
  9 |     paths-ignore:
 10 |       - "*.md"
 11 | 
 12 | env:
 13 |   MODULE_NAME: "spacy_transformers"
 14 |   RUN_MYPY: "true"
 15 | 
 16 | jobs:
 17 |   tests:
 18 |     name: Test
 19 |     if: github.repository_owner == 'explosion'
 20 |     strategy:
 21 |       fail-fast: false
 22 |       matrix:
 23 |         os: [ubuntu-latest, windows-latest, macos-latest]
 24 |         python_version: ["3.12"]
 25 |         include:
 26 |           - os: macos-13
 27 |             python_version: "3.10"
 28 |           - os: windows-latest
 29 |             python_version: "3.11"
 30 |           - os: ubuntu-latest
 31 |             python_version: "3.12"
 32 |           - os: macos-13
 33 |             python_version: "3.12"
 34 |           - os: windows-latest
 35 |             python_version: "3.12"
 36 | 
 37 |     runs-on: ${{ matrix.os }}
 38 | 
 39 |     steps:
 40 |       - name: Check out repo
 41 |         uses: actions/checkout@v3
 42 | 
 43 |       - name: Configure Python version
 44 |         uses: actions/setup-python@v4
 45 |         with:
 46 |           python-version: ${{ matrix.python_version }}
 47 | 
 48 |       - name: Install dependencies
 49 |         run: |
 50 |           python -m pip install -U build pip setuptools wheel
 51 |           python -m pip install -r requirements.txt --force-reinstall
 52 | 
 53 |       - name: Build sdist
 54 |         run: |
 55 |           python -m build --sdist
 56 | 
 57 |       - name: Run mypy
 58 |         if: env.RUN_MYPY == 'true' && matrix.python_version != '3.6'
 59 |         shell: bash
 60 |         run: |
 61 |           python -m mypy $MODULE_NAME
 62 | 
 63 |       - name: Delete source directory
 64 |         shell: bash
 65 |         run: |
 66 |           rm -rf $MODULE_NAME
 67 | 
 68 |       - name: Uninstall all packages
 69 |         run: |
 70 |           python -m pip freeze --exclude pywin32 --exclude torch
 71 |           python -m pip freeze --exclude pywin32 --exclude torch > installed.txt
 72 |           python -m pip uninstall -y -r installed.txt
 73 | 
 74 |       - name: Install newest torch for python 3.7+
 75 |         if: matrix.python_version != '3.6'
 76 |         run: |
 77 |           python -m pip install torch --index-url https://download.pytorch.org/whl/cpu --force-reinstall
 78 | 
 79 |       - name: Install from sdist
 80 |         shell: bash
 81 |         run: |
 82 |           SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
 83 |           python -m pip install dist/$SDIST
 84 | 
 85 |       - name: Run tests
 86 |         shell: bash
 87 |         run: |
 88 |           python -m pip install -r requirements.txt --force-reinstall
 89 |           # The version of pytorch being used here requires numpy v2, but because of the way we're doing the
 90 |           # requirements installation here it's not being resolved that way. So just install numpy 1 here.
 91 |           python -m pip install "numpy<2"
 92 |           python -m pytest --pyargs $MODULE_NAME --cov=$MODULE_NAME
 93 | 
 94 |       - name: Test backwards compatibility for v1.0 models
 95 |         if: matrix.python_version == '3.9'
 96 |         run: |
 97 |           python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0-py3-none-any.whl --no-deps
 98 |           python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')"
 99 | 
100 |       - name: Test backwards compatibility for v1.1 models
101 |         if: matrix.python_version == '3.9'
102 |         run: |
103 |           python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl --no-deps
104 |           python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')"
105 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .vscode
  2 | tmp/
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # vim
 13 | .*.sw*
 14 | 
 15 | # Cython / C extensions
 16 | cythonize.json
 17 | spacy_transformers/*.html
 18 | *.cpp
 19 | *.so
 20 | 
 21 | # Distribution / packaging
 22 | .Python
 23 | build/
 24 | develop-eggs/
 25 | dist/
 26 | downloads/
 27 | eggs/
 28 | .eggs/
 29 | lib/
 30 | lib64/
 31 | parts/
 32 | sdist/
 33 | var/
 34 | wheels/
 35 | pip-wheel-metadata/
 36 | share/python-wheels/
 37 | *.egg-info/
 38 | .installed.cfg
 39 | *.egg
 40 | MANIFEST
 41 | 
 42 | # PyInstaller
 43 | #  Usually these files are written by a python script from a template
 44 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 45 | *.manifest
 46 | *.spec
 47 | 
 48 | # Installer logs
 49 | pip-log.txt
 50 | pip-delete-this-directory.txt
 51 | 
 52 | # Unit test / coverage reports
 53 | htmlcov/
 54 | .tox/
 55 | .nox/
 56 | .coverage
 57 | .coverage.*
 58 | .cache
 59 | nosetests.xml
 60 | coverage.xml
 61 | *.cover
 62 | .hypothesis/
 63 | .pytest_cache/
 64 | 
 65 | # Translations
 66 | *.mo
 67 | *.pot
 68 | 
 69 | # Django stuff:
 70 | *.log
 71 | local_settings.py
 72 | db.sqlite3
 73 | db.sqlite3-journal
 74 | 
 75 | # Flask stuff:
 76 | instance/
 77 | .webassets-cache
 78 | 
 79 | # Scrapy stuff:
 80 | .scrapy
 81 | 
 82 | # Sphinx documentation
 83 | docs/_build/
 84 | 
 85 | # PyBuilder
 86 | target/
 87 | 
 88 | # Jupyter Notebook
 89 | .ipynb_checkpoints
 90 | 
 91 | # IPython
 92 | profile_default/
 93 | ipython_config.py
 94 | 
 95 | # pyenv
 96 | .python-version
 97 | 
 98 | # pipenv
 99 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | #   install all needed dependencies.
103 | #Pipfile.lock
104 | 
105 | # celery beat schedule file
106 | celerybeat-schedule
107 | 
108 | # SageMath parsed files
109 | *.sage.py
110 | 
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 | 
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 | 
124 | # Rope project settings
125 | .ropeproject
126 | 
127 | # mkdocs documentation
128 | /site
129 | 
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 | 
135 | # Pyre type checker
136 | .pyre/
137 | 
138 | # Pycharm project files
139 | *.idea
140 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 ExplosionAI GmbH
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include spacy_transformers *.pyi *.pyx *.pxd
2 | recursive-exclude spacy_transformers *.cpp
3 | include LICENSE
4 | include README.md
5 | include pyproject.toml
6 | include spacy_transformers/py.typed
7 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | SHELL := /bin/bash
 2 | PYVER := 3.6
 3 | VENV := ./env$(PYVER)
 4 | 
 5 | version := $(shell "bin/get-version.sh")
 6 | 
 7 | dist/spacy-trf-$(version).pex : wheelhouse/spacy-trf-$(version).stamp
 8 | 	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -o $@ spacy_transformers==$(version) jsonschema
 9 | 	chmod a+rx $@
10 | 
11 | wheelhouse/spacy-trf-$(version).stamp : $(VENV)/bin/pex setup.py spacy_transformers/*.py* spacy_transformers/*/*.py*
12 | 	$(VENV)/bin/pip wheel . -w ./wheelhouse
13 | 	$(VENV)/bin/pip wheel jsonschema -w ./wheelhouse
14 | 	touch $@
15 | 
16 | $(VENV)/bin/pex :
17 | 	python$(PYVER) -m venv $(VENV)
18 | 	$(VENV)/bin/pip install -U pip setuptools pex wheel
19 | 
20 | .PHONY : clean
21 | 
22 | clean : setup.py
23 | 	rm -rf dist/*
24 | 	rm -rf ./wheelhouse
25 | 	rm -rf $(VENV)
26 | 	python setup.py clean --all
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
 2 | 
 3 | # spacy-transformers: Use pretrained transformers like BERT, XLNet and GPT-2 in spaCy
 4 | 
 5 | This package provides [spaCy](https://github.com/explosion/spaCy) components and
 6 | architectures to use transformer models via
 7 | [Hugging Face's `transformers`](https://github.com/huggingface/transformers) in
 8 | spaCy. The result is convenient access to state-of-the-art transformer
 9 | architectures, such as BERT, GPT-2, XLNet, etc.
10 | 
11 | > **This release requires [spaCy v3](https://spacy.io/usage/v3).** For the
12 | > previous version of this library, see the
13 | > [`v0.6.x` branch](https://github.com/explosion/spacy-transformers/tree/v0.6.x).
14 | 
15 | [![tests](https://github.com/explosion/spacy-transformers/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spacy-transformers/actions/workflows/tests.yml)
16 | [![PyPi](https://img.shields.io/pypi/v/spacy-transformers.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/spacy-transformers)
17 | [![GitHub](https://img.shields.io/github/release/explosion/spacy-transformers/all.svg?style=flat-square&logo=github)](https://github.com/explosion/spacy-transformers/releases)
18 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black)
19 | 
20 | ## Features
21 | 
22 | - Use pretrained transformer models like **BERT**, **RoBERTa** and **XLNet** to
23 |   power your spaCy pipeline.
24 | - Easy **multi-task learning**: backprop to one transformer model from several
25 |   pipeline components.
26 | - Train using spaCy v3's powerful and extensible config system.
27 | - Automatic alignment of transformer output to spaCy's tokenization.
28 | - Easily customize what transformer data is saved in the `Doc` object.
29 | - Easily customize how long documents are processed.
30 | - Out-of-the-box serialization and model packaging.
31 | 
32 | ## 🚀 Installation
33 | 
34 | Installing the package from pip will automatically install all dependencies,
35 | including PyTorch and spaCy. Make sure you install this package **before** you
36 | install the models. Also note that this package requires **Python 3.6+**,
37 | **PyTorch v1.5+** and **spaCy v3.0+**.
38 | 
39 | ```bash
40 | pip install 'spacy[transformers]'
41 | ```
42 | 
43 | For GPU installation, find your CUDA version using `nvcc --version` and add the
44 | [version in brackets](https://spacy.io/usage/#gpu), e.g.
45 | `spacy[transformers,cuda92]` for CUDA9.2 or `spacy[transformers,cuda100]` for
46 | CUDA10.0.
47 | 
48 | If you are having trouble installing PyTorch, follow the
49 | [instructions](https://pytorch.org/get-started/locally/) on the official website
50 | for your specific operating system and requirements.
51 | 
52 | ## 📖 Documentation
53 | 
54 | > ⚠️ **Important note:** This package has been extensively refactored to take
55 | > advantage of [spaCy v3.0](https://spacy.io). Previous versions that were built
56 | > for [spaCy v2.x](https://v2.spacy.io) worked considerably differently. Please
57 | > see previous tagged versions of this README for documentation on prior
58 | > versions.
59 | 
60 | - 📘
61 |   [Embeddings, Transformers and Transfer Learning](https://spacy.io/usage/embeddings-transformers):
62 |   How to use transformers in spaCy
63 | - 📘 [Training Pipelines and Models](https://spacy.io/usage/training): Train and
64 |   update components on your own data and integrate custom models
65 | - 📘
66 |   [Layers and Model Architectures](https://spacy.io/usage/layers-architectures):
67 |   Power spaCy components with custom neural networks
68 | - 📗 [`Transformer`](https://spacy.io/api/transformer): Pipeline component API
69 |   reference
70 | - 📗
71 |   [Transformer architectures](https://spacy.io/api/architectures#transformers):
72 |   Architectures and registered functions
73 | 
74 | ## Applying pretrained text and token classification models
75 | 
76 | Note that the `transformer` component from `spacy-transformers` does not support
77 | task-specific heads like token or text classification. A task-specific
78 | transformer model can be used as a source of features to train spaCy components
79 | like `ner` or `textcat`, but the `transformer` component does not provide access
80 | to task-specific heads for training or inference.
81 | 
82 | Alternatively, if you only want use to the **predictions** from an existing
83 | Hugging Face text or token classification model, you can use the wrappers from
84 | [`spacy-huggingface-pipelines`](https://github.com/explosion/spacy-huggingface-pipelines)
85 | to incorporate task-specific transformer models into your spaCy pipelines.
86 | 
87 | ## Bug reports and other issues
88 | 
89 | Please use [spaCy's issue tracker](https://github.com/explosion/spaCy/issues) to
90 | report a bug, or open a new thread on the
91 | [discussion board](https://github.com/explosion/spaCy/discussions) for any other
92 | issue.
93 | 


--------------------------------------------------------------------------------
/bin/get-version.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | version=$(grep "version = " setup.cfg)
 6 | version=${version/version = }
 7 | version=${version/\'/}
 8 | version=${version/\'/}
 9 | version=${version/\"/}
10 | version=${version/\"/}
11 | 
12 | echo $version
13 | 


--------------------------------------------------------------------------------
/bin/push-tag.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | # Insist repository is clean
 6 | git diff-index --quiet HEAD
 7 | 
 8 | git checkout $1
 9 | git pull origin $1
10 | git push origin $1
11 | 
12 | version=$(grep "version = " setup.cfg)
13 | version=${version/version = }
14 | version=${version/\'/}
15 | version=${version/\'/}
16 | version=${version/\"/}
17 | version=${version/\"/}
18 | git tag "v$version"
19 | git push origin "v$version"
20 | 


--------------------------------------------------------------------------------
/build-constraints.txt:
--------------------------------------------------------------------------------
1 | # build version constraints for use with wheelwright + multibuild
2 | numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
3 | numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
4 | numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
5 | numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
6 | numpy>=1.25.0; python_version>='3.9'
7 | 


--------------------------------------------------------------------------------
/examples/configs/joint-core-bert.cfg:
--------------------------------------------------------------------------------
  1 | [training]
  2 | seed = 0
  3 | gold_preproc = false
  4 | # Limitations on training document length or number of examples.
  5 | max_length = 500
  6 | limit = 0
  7 | patience = 10000
  8 | eval_frequency = 400
  9 | dropout = 0.1
 10 | init_tok2vec = null
 11 | max_epochs = 0
 12 | max_steps = 0
 13 | orth_variant_level = 0.0
 14 | 
 15 | scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
 16 | score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
 17 | 
 18 | base_model = null          
 19 | use_pytorch_for_gpu_memory = true
 20 | omit_extra_lookups = false
 21 | raw_text = null
 22 | tag_map = null
 23 | vectors = null
 24 | morph_rules = null
 25 | 
 26 | batch_by = "padded"
 27 | batch_size = 2000
 28 | accumulate_gradient = 3
 29 | discard_oversize = true
 30 | eval_batch_size = 256
 31 | 
 32 | [training.optimizer]
 33 | @optimizers = "Adam.v1"
 34 | beta1 = 0.9
 35 | beta2 = 0.999
 36 | eps = 1e-8
 37 | L2_is_weight_decay = true
 38 | L2 = 0.01
 39 | grad_clip = 1.0
 40 | use_averages = false
 41 | 
 42 | [training.optimizer.learn_rate]
 43 | @schedules = "warmup_linear.v1"
 44 | warmup_steps = 250
 45 | total_steps = 20000
 46 | initial_rate = 5e-5
 47 | 
 48 | 
 49 | [nlp]
 50 | lang = "en"
 51 | stop_words = []
 52 | lex_attr_getters = {}
 53 | pipeline = ["transformer", "tagger", "parser", "ner"]
 54 | 
 55 | [nlp.tokenizer]
 56 | @tokenizers = "spacy.Tokenizer.v1"
 57 | 
 58 | [nlp.lemmatizer]
 59 | @lemmatizers = "spacy.Lemmatizer.v1"
 60 | 
 61 | [nlp.writing_system]
 62 | direction = "ltr"
 63 | has_case = true
 64 | has_letters = true
 65 | 
 66 | [components]
 67 | 
 68 | [components.transformer]
 69 | factory = "transformer"
 70 | max_batch_items = 4096
 71 | 
 72 | [components.tagger]
 73 | factory = "tagger"
 74 | 
 75 | [components.parser]
 76 | factory = "parser"
 77 | learn_tokens = false
 78 | min_action_freq = 1
 79 | 
 80 | [components.ner]
 81 | factory = "ner"
 82 | learn_tokens = false
 83 | min_action_freq = 1
 84 | 
 85 | # This loads the Huggingface Transformers model. The transformer is applied
 86 | # to a batch of Doc objects, which are preprocessed into Span objects to support
 87 | # longer documents.
 88 | [components.transformer.model]
 89 | @architectures = "spacy-transformers.TransformerModel.v3"
 90 | name = "roberta-base"
 91 | tokenizer_config = {"use_fast": true}
 92 | transformer_config = {"output_attentions": false}
 93 | 
 94 | [components.transformer.model.get_spans]
 95 | # You can set a custom strategy for preparing spans from the batch, e.g. you
 96 | # can predict over sentences. Here we predict over the whole document.
 97 | @span_getters = "strided_spans.v1"
 98 | window = 128
 99 | stride = 96
100 | 
101 | [components.tagger.model]
102 | @architectures = "spacy.Tagger.v1"
103 | 
104 | [components.parser.model]
105 | @architectures = "spacy.TransitionBasedParser.v1"
106 | nr_feature_tokens = 8
107 | hidden_width = 64
108 | maxout_pieces = 2
109 | use_upper = false
110 | 
111 | [components.ner.model]
112 | @architectures = "spacy.TransitionBasedParser.v1"
113 | nr_feature_tokens = 3
114 | hidden_width = 64
115 | maxout_pieces = 2
116 | use_upper = false
117 | 
118 | # These "listener" layers are connected to the transformer pipeline component
119 | # in order to achieve multi-task learning across the pipeline.
120 | # They rely on the transformer to predict over the batch and cache the result
121 | # and callback. The gradient for the transformers will be accumulated by
122 | # the listeners, and then the last listener will call the backprop callback.
123 | [components.tagger.model.tok2vec]
124 | @architectures = "spacy-transformers.TransformerListener.v1"
125 | grad_factor = 1.0
126 | 
127 | [components.parser.model.tok2vec]
128 | @architectures = "spacy-transformers.TransformerListener.v1"
129 | grad_factor = 1.0
130 | 
131 | [components.ner.model.tok2vec]
132 | @architectures = "spacy-transformers.TransformerListener.v1"
133 | grad_factor = 1.0
134 | 
135 | # These pooling layers control how the token vectors are calculated from
136 | # the word pieces. The reduce_mean layer averages the wordpieces, so if you
137 | # have one token aligned to multiple wordpieces (as is expected), the token's
138 | # vector will be the average of the wordpieces. The most obvious alternative
139 | # is reduce_last.v1, which would just use the last wordpiece. You could also
140 | # try reduce_first, reduce_sum or even reduce_max.
141 | 
142 | [components.tagger.model.tok2vec.pooling]
143 | @layers = "reduce_mean.v1"
144 | 
145 | [components.parser.model.tok2vec.pooling]
146 | @layers = "reduce_mean.v1"
147 | 
148 | [components.ner.model.tok2vec.pooling]
149 | @layers = "reduce_mean.v1"
150 | 


--------------------------------------------------------------------------------
/examples/configs/ner-albert.cfg:
--------------------------------------------------------------------------------
 1 | [training]
 2 | patience = 10000
 3 | eval_frequency = 200
 4 | dropout = 0.1
 5 | init_tok2vec = null
 6 | vectors = null
 7 | max_epochs = 10000
 8 | orth_variant_level = 0.3
 9 | gold_preproc = true
10 | max_length = 0
11 | scores = ["speed", "ents_p", "ents_r", "ents_f"]
12 | score_weights = {"ents_f": 1.0}
13 | limit = 0
14 | width = 768
15 | accumulate_gradient = 2
16 | seed = 0
17 | use_pytorch_for_gpu_memory = true
18 | 
19 | 
20 | [training.batch_size]
21 | @schedules = "compounding.v1"
22 | start = 500
23 | stop = 500
24 | compound = 1.001
25 | 
26 | [optimizer]
27 | @optimizers = "Adam.v1"
28 | beta1 = 0.9
29 | beta2 = 0.999
30 | L2_is_weight_decay = true
31 | L2 = 0.01
32 | grad_clip = 1.0
33 | use_averages = false
34 | eps = 1e-8
35 | 
36 | [optimizer.learn_rate]
37 | @schedules = "warmup_linear.v1"
38 | initial_rate = 5e-5
39 | warmup_steps = 250
40 | total_steps = 5000
41 | 
42 | [nlp]
43 | lang = "en"
44 | vectors = ${training:vectors}
45 | 
46 | [nlp.pipeline.ner]
47 | factory = "ner"
48 | 
49 | [nlp.pipeline.ner.model]
50 | @architectures = "spacy.TransitionBasedParser.v1"
51 | nr_feature_tokens = 3
52 | hidden_width = 128
53 | maxout_pieces = 3
54 | use_upper = false
55 | 
56 | [nlp.pipeline.ner.model.tok2vec]
57 | @architectures = "spacy.Tok2VecTransformer.v3"
58 | name = "albert-base-v2"
59 | tokenizer_config = {"use_fast": false}
60 | transformer_config = {"output_attentions": false}
61 | grad_factor = 1.0
62 | 
63 | [nlp.pipeline.ner.model.tok2vec.get_spans]
64 | @span_getters = "spacy-transformers.strided_spans.v1"
65 | window = 256
66 | stride = 256
67 | 
68 | [nlp.pipeline.ner.model.tok2vec.pooling]
69 | @layers = "reduce_mean.v1"
70 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = [
 3 |     "setuptools",
 4 |     "cython>=0.25",
 5 |     "numpy>=2.0.0,<3.0.0"
 6 | ]
 7 | build-backend = "setuptools.build_meta"
 8 | 
 9 | [tool.cibuildwheel]
10 | build = "*"
11 | skip = "pp* cp36* cp37* cp38*"
12 | test-skip = ""
13 | free-threaded-support = false
14 | 
15 | archs = ["native"]
16 | 
17 | build-frontend = "default"
18 | config-settings = {}
19 | dependency-versions = "pinned"
20 | environment = {}
21 | environment-pass = []
22 | build-verbosity = 0
23 | 
24 | before-all = ""
25 | before-build = ""
26 | repair-wheel-command = ""
27 | 
28 | test-command = ""
29 | before-test = ""
30 | test-requires = []
31 | test-extras = []
32 | 
33 | container-engine = "docker"
34 | 
35 | manylinux-x86_64-image = "manylinux2014"
36 | manylinux-i686-image = "manylinux2014"
37 | manylinux-aarch64-image = "manylinux2014"
38 | manylinux-ppc64le-image = "manylinux2014"
39 | manylinux-s390x-image = "manylinux2014"
40 | manylinux-pypy_x86_64-image = "manylinux2014"
41 | manylinux-pypy_i686-image = "manylinux2014"
42 | manylinux-pypy_aarch64-image = "manylinux2014"
43 | 
44 | musllinux-x86_64-image = "musllinux_1_2"
45 | musllinux-i686-image = "musllinux_1_2"
46 | musllinux-aarch64-image = "musllinux_1_2"
47 | musllinux-ppc64le-image = "musllinux_1_2"
48 | musllinux-s390x-image = "musllinux_1_2"
49 | 
50 | 
51 | [tool.cibuildwheel.linux]
52 | repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
53 | 
54 | [tool.cibuildwheel.macos]
55 | repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
56 | 
57 | [tool.cibuildwheel.windows]
58 | 
59 | [tool.cibuildwheel.pyodide]
60 | 
61 | [tool.isort]
62 | profile = "black"
63 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | spacy>=3.5.0,<4.1.0
 2 | numpy>=1.15.0
 3 | transformers[sentencepiece]>=3.4.0,<4.42.0
 4 | torch>=1.8.0
 5 | srsly>=2.4.0,<3.0.0
 6 | dataclasses>=0.6,<1.0; python_version < "3.7"
 7 | spacy-alignments>=0.7.2,<1.0.0
 8 | # Development dependencies
 9 | cython>=0.25
10 | pytest>=5.2.0
11 | pytest-cov>=2.7.0,<5.0.0
12 | mypy>=1.0.0,<1.6.0; platform_machine!='aarch64' and python_version >= "3.7"
13 | types-contextvars>=0.1.2; python_version < "3.7"
14 | types-dataclasses>=0.1.3; python_version < "3.7"
15 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
  1 | [metadata]
  2 | version = 1.3.9
  3 | description = spaCy pipelines for pre-trained BERT and other transformers
  4 | url = https://spacy.io
  5 | author = Explosion
  6 | author_email = contact@explosion.ai
  7 | license = MIT
  8 | long_description = file: README.md
  9 | long_description_content_type = text/markdown
 10 | classifiers =
 11 |     Development Status :: 5 - Production/Stable
 12 |     Environment :: Console
 13 |     Intended Audience :: Developers
 14 |     Intended Audience :: Science/Research
 15 |     Topic :: Scientific/Engineering
 16 |     Topic :: Scientific/Engineering :: Artificial Intelligence
 17 |     License :: OSI Approved :: MIT License
 18 |     Operating System :: POSIX :: Linux
 19 |     Operating System :: MacOS :: MacOS X
 20 |     Operating System :: Microsoft :: Windows
 21 |     Programming Language :: Python :: 3
 22 |     Programming Language :: Python :: 3.7
 23 |     Programming Language :: Python :: 3.8
 24 |     Programming Language :: Python :: 3.9
 25 |     Programming Language :: Python :: 3.10
 26 |     Programming Language :: Python :: 3.11
 27 | 
 28 | [options]
 29 | zip_safe = false
 30 | include_package_data = true
 31 | python_requires = >=3.9,<3.14
 32 | install_requires =
 33 |     spacy>=3.5.0,<4.1.0
 34 |     numpy>=1.15.0; python_version < "3.9"
 35 |     numpy>=1.19.0; python_version >= "3.9"
 36 |     transformers>=3.4.0,<4.50.0
 37 |     torch>=1.8.0
 38 |     srsly>=2.4.0,<3.0.0
 39 |     dataclasses>=0.6,<1.0; python_version < "3.7"
 40 |     spacy-alignments>=0.7.2,<1.0.0
 41 | 
 42 | [options.extras_require]
 43 | cuda =
 44 |     cupy>=5.0.0b4
 45 | cuda80 =
 46 |     cupy-cuda80>=5.0.0b4
 47 | cuda90 =
 48 |     cupy-cuda90>=5.0.0b4
 49 | cuda91 =
 50 |     cupy-cuda91>=5.0.0b4
 51 | cuda92 =
 52 |     cupy-cuda92>=5.0.0b4
 53 | cuda100 =
 54 |     cupy-cuda100>=5.0.0b4
 55 | cuda101 =
 56 |     cupy-cuda101>=5.0.0b4
 57 | cuda102 =
 58 |     cupy-cuda102>=5.0.0b4
 59 | cuda110 =
 60 |     cupy-cuda110>=5.0.0b4
 61 | cuda111 =
 62 |     cupy-cuda111>=5.0.0b4
 63 | cuda112 =
 64 |     cupy-cuda112>=5.0.0b4
 65 | 
 66 | [options.entry_points]
 67 | spacy_factories =
 68 |     transformer = spacy_transformers.pipeline_component:make_transformer
 69 | 
 70 | spacy_architectures =
 71 |     spacy-transformers.TransformerListener.v1 = spacy_transformers:architectures.transformer_listener_tok2vec_v1
 72 |     spacy-transformers.Tok2VecTransformer.v1 = spacy_transformers:architectures.transformer_tok2vec_v1
 73 |     spacy-transformers.Tok2VecTransformer.v2 = spacy_transformers:architectures.transformer_tok2vec_v2
 74 |     spacy-transformers.Tok2VecTransformer.v3 = spacy_transformers:architectures.transformer_tok2vec_v3
 75 |     spacy-transformers.TransformerModel.v1 = spacy_transformers:architectures.create_TransformerModel_v1
 76 |     spacy-transformers.TransformerModel.v2 = spacy_transformers:architectures.create_TransformerModel_v2
 77 |     spacy-transformers.TransformerModel.v3 = spacy_transformers:architectures.create_TransformerModel_v3
 78 | 
 79 | [bdist_wheel]
 80 | universal = true
 81 | 
 82 | [sdist]
 83 | formats = gztar
 84 | 
 85 | [flake8]
 86 | ignore = E203, E266, E501, E731, W503
 87 | max-line-length = 80
 88 | select = B,C,E,F,W,T4,B9
 89 | exclude =
 90 |     .env,
 91 |     .git,
 92 |     __pycache__,
 93 | 
 94 | [mypy]
 95 | ignore_missing_imports = True
 96 | no_implicit_optional = True
 97 | plugins = pydantic.mypy, thinc.mypy
 98 | 
 99 | [coverage:run]
100 | 
101 | [coverage:report]
102 | omit =
103 |     **/tests/*
104 |     **/_vendorized/*
105 |     **/about.py
106 | exclude_lines =
107 |     pragma: no cover
108 |     # Don't complain about missing debug-only code:
109 |     def __unicode__
110 |     def __repr__
111 |     if self\.debug
112 |     # Don't complain if tests don't hit defensive assertion code:
113 |     raise AssertionError
114 |     raise NotImplementedError
115 |     # Don't complain if non-runnable code isn't run:
116 |     if 0:
117 |     if __name__ == .__main__.:
118 | show_missing = True
119 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, Extension, find_packages
 2 | from setuptools.command.build_ext import build_ext
 3 | from Cython.Build import cythonize
 4 | from Cython.Compiler import Options
 5 | import numpy
 6 | 
 7 | 
 8 | # Preserve `__doc__` on functions and classes
 9 | # http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
10 | Options.docstrings = True
11 | 
12 | COMPILE_OPTIONS = {
13 |     "msvc": ["/Ox", "/EHsc"],
14 |     "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
15 |     "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
16 | }
17 | LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []}
18 | COMPILER_DIRECTIVES = {
19 |     "language_level": -3,
20 |     "embedsignature": True,
21 |     "annotation_typing": False,
22 | }
23 | 
24 | 
25 | # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
26 | # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
27 | class build_ext_options:
28 |     def build_options(self):
29 |         for e in self.extensions:
30 |             e.extra_compile_args += COMPILE_OPTIONS.get(
31 |                 self.compiler.compiler_type, COMPILE_OPTIONS["other"]
32 |             )
33 |         for e in self.extensions:
34 |             e.extra_link_args += LINK_OPTIONS.get(
35 |                 self.compiler.compiler_type, LINK_OPTIONS["other"]
36 |             )
37 | 
38 | 
39 | class build_ext_subclass(build_ext, build_ext_options):
40 |     def build_extensions(self):
41 |         build_ext_options.build_options(self)
42 |         build_ext.build_extensions(self)
43 | 
44 | 
45 | def setup_package():
46 |     ext_modules = [
47 |         Extension(
48 |             "spacy_transformers.align",
49 |             ["spacy_transformers/align.pyx"],
50 |             language="c++",
51 |             include_dirs=[numpy.get_include()],
52 |             extra_compile_args=["-std=c++11"],
53 |         ),
54 |     ]
55 | 
56 |     ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
57 | 
58 |     setup(
59 |         name="spacy-transformers",
60 |         packages=find_packages(),
61 |         ext_modules=ext_modules,
62 |         cmdclass={"build_ext": build_ext_subclass},
63 |         package_data={"": ["*.pyx", "*.pxd", "*.pxi"]},
64 |     )
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     setup_package()
69 | 


--------------------------------------------------------------------------------
/spacy_transformers/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import architectures
 2 | from . import annotation_setters
 3 | from . import span_getters
 4 | from .layers import TransformerModel
 5 | from .pipeline_component import Transformer, install_extensions
 6 | from .data_classes import TransformerData, FullTransformerBatch
 7 | from .util import registry
 8 | 
 9 | 
10 | __all__ = [
11 |     "install_extensions",
12 |     "Transformer",
13 |     "TransformerModel",
14 |     "TransformerData",
15 |     "FullTransformerBatch",
16 |     "architectures",
17 |     "annotation_setters",
18 |     "span_getters",
19 |     "registry",
20 | ]
21 | 


--------------------------------------------------------------------------------
/spacy_transformers/align.pyi:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List, Tuple, Callable, Optional
 2 | from spacy.tokens import Span, Token
 3 | from thinc.api import Ops
 4 | from thinc.types import Ragged, Floats2d, Ints2d
 5 | 
 6 | def apply_alignment(
 7 |     ops: Ops, align: Ragged, X: Floats2d
 8 | ) -> Tuple[Ragged, Callable]: ...
 9 | def get_token_positions(spans: List[Span]) -> Dict[Token, int]: ...
10 | def get_alignment_via_offset_mapping(
11 |     spans: List[Span],
12 |     offset_mapping: Ints2d,
13 | ) -> Ragged: ...
14 | def get_alignment(
15 |     spans: List[Span],
16 |     wordpieces: List[List[str]],
17 |     special_tokens: Optional[List[str]] = None,
18 | ) -> Ragged: ...
19 | def get_span2wp_from_offset_mapping(
20 |     span: Span,
21 |     wp_char_offsets: Tuple[int],
22 | ) -> List[List[int]]: ...
23 | 


--------------------------------------------------------------------------------
/spacy_transformers/align.pyx:
--------------------------------------------------------------------------------
  1 | # cython: infer_types=True, boundscheck=False
  2 | from typing import cast, Dict, List, Tuple, Callable, Set, Optional
  3 | import numpy
  4 | from spacy_alignments.tokenizations import get_alignments
  5 | from spacy.tokens import Span, Token
  6 | from thinc.api import Ops
  7 | from thinc.types import Ragged, Floats2d, Ints1d, Ints2d
  8 | 
  9 | from cython.operator cimport dereference as deref
 10 | from cython.operator cimport preincrement as preinc
 11 | from libc.stdint cimport uint32_t, int32_t, int64_t
 12 | from libc.stdlib cimport free
 13 | from libcpp.unordered_set cimport unordered_set
 14 | from libcpp.vector cimport vector
 15 | 
 16 | ctypedef unordered_set[uint32_t]* unordered_set_uint32_t_ptr
 17 | 
 18 | 
 19 | def apply_alignment(ops: Ops, align: Ragged, X: Floats2d) -> Tuple[Ragged, Callable]:
 20 |     """Align wordpiece data (X) to match tokens, and provide a callback to
 21 |     reverse it.
 22 | 
 23 |     This function returns a Ragged array, which represents the fact that one
 24 |     token may be aligned against multiple wordpieces. It's a nested list,
 25 |     concatenated with a lengths array to indicate the nested structure.
 26 | 
 27 |     The alignment is also a Ragged array, where the lengths indicate how many
 28 |     wordpieces each token is aligned against. The output ragged therefore has
 29 |     the same lengths as the alignment ragged, which means the output data
 30 |     also has the same number of data rows as the alignment. The size of the
 31 |     lengths array indicates the number of tokens in the batch.
 32 | 
 33 |     The actual alignment is a simple indexing operation:
 34 | 
 35 |         for i, index in enumerate(align.data):
 36 |             Y[i] = X[index]
 37 | 
 38 |     Which is vectorized via numpy advanced indexing:
 39 | 
 40 |         Y = X[align.data]
 41 | 
 42 |     The inverse operation, for the backward pass, uses the 'scatter_add' op
 43 |     because one wordpiece may be aligned against multiple tokens. So we need:
 44 | 
 45 |         for i, index in enumerate(align.data):
 46 |             X[index] += Y[i]
 47 | 
 48 |     The addition wouldn't occur if we simply did `X[index] = Y`, so we use
 49 |     the scatter_add op.
 50 |     """
 51 |     if not align.lengths.sum():
 52 |         return _apply_empty_alignment(ops, align, X)
 53 |     shape = X.shape
 54 |     indices = cast(Ints1d, align.dataXd)
 55 |     Y = Ragged(X[indices], cast(Ints1d, ops.asarray(align.lengths)))
 56 | 
 57 |     def backprop_apply_alignment(dY: Ragged) -> Floats2d:
 58 |         assert dY.data.shape[0] == indices.shape[0]
 59 |         dX = ops.alloc2f(*shape)
 60 |         ops.scatter_add(dX, indices, cast(Floats2d, dY.dataXd))
 61 |         return dX
 62 | 
 63 |     return Y, backprop_apply_alignment
 64 | 
 65 | 
 66 | def _apply_empty_alignment(ops, align, X):
 67 |     shape = X.shape
 68 |     Y = Ragged(
 69 |         ops.alloc2f(align.lengths.shape[0], X.shape[1]),
 70 |         ops.alloc1i(align.lengths.shape[0]) + 1,
 71 |     )
 72 | 
 73 |     def backprop_null_alignment(dY: Ragged) -> Floats2d:
 74 |         return ops.alloc2f(*shape)
 75 | 
 76 |     return Y, backprop_null_alignment
 77 | 
 78 | 
 79 | def get_token_positions(spans: List[Span]) -> Dict[Token, int]:
 80 |     token_positions: Dict[Token, int] = {}
 81 |     seen_docs = set()
 82 |     for span in spans:
 83 |         if span.doc in seen_docs:
 84 |             continue
 85 |         seen_docs.add(span.doc)
 86 |         for token in span.doc:
 87 |             if token not in token_positions:
 88 |                 token_positions[token] = len(token_positions)
 89 |     return token_positions
 90 | 
 91 | 
 92 | def get_alignment_via_offset_mapping(
 93 |     spans: List[Span],
 94 |     offset_mapping: Ints2d,
 95 | ) -> Ragged:
 96 |     if len(spans) != len(offset_mapping):
 97 |         raise ValueError("Cannot align batches of different sizes.")
 98 |     # Tokens can occur more than once, and we need the alignment of each token
 99 |     # to its place in the concatenated wordpieces array.
100 |     token_positions = get_token_positions(spans)
101 |     alignment: List[Set[int]] = [set() for _ in range(len(token_positions))]
102 |     wp_start = 0
103 |     for i, span in enumerate(spans):
104 |         span_offset_mapping = offset_mapping[i]
105 |         span2wp = get_span2wp_from_offset_mapping(span, span_offset_mapping)
106 |         for token, wp_js in zip(span, span2wp):
107 |             position = token_positions[token]
108 |             alignment[position].update(wp_start + j for j in wp_js)
109 |         wp_start += span_offset_mapping.shape[0]
110 |     lengths: List[int] = []
111 |     flat: List[int] = []
112 |     for a in alignment:
113 |         lengths.append(len(a))
114 |         flat.extend(sorted(a))
115 |     align = Ragged(
116 |         cast(Ints1d, numpy.array(flat, dtype="i")),
117 |         cast(Ints1d, numpy.array(lengths, dtype="i")),
118 |     )
119 |     return align
120 | 
121 | 
122 | def get_alignment(
123 |     spans: List[Span],
124 |     wordpieces: List[List[str]],
125 |     special_tokens: Optional[List[str]] = None,
126 | ) -> Ragged:
127 |     """Compute a ragged alignment array that records, for each unique token in
128 |     `spans`, the corresponding indices in the flattened `wordpieces` array.
129 |     For instance, imagine you have two overlapping spans:
130 | 
131 |         [[I, like, walking], [walking, outdoors]]
132 | 
133 |     And their wordpieces are:
134 | 
135 |         [[I, like, walk, ing], [walk, ing, out, doors]]
136 | 
137 |     We want to align "walking" against [walk, ing, walk, ing], which have
138 |     indices [2, 3, 4, 5] once the nested wordpieces list is flattened.
139 | 
140 |     The nested alignment list would be:
141 | 
142 |     [[0], [1], [2, 3, 4, 5], [6, 7]]
143 |       I   like    walking    outdoors
144 | 
145 |     Which gets flattened into the ragged array:
146 | 
147 |     [0, 1, 2, 3, 4, 5, 6, 7]
148 |     [1, 1, 4, 2]
149 | 
150 |     The ragged format allows the aligned data to be computed via:
151 | 
152 |     tokens = Ragged(wp_tensor[align.data], align.lengths)
153 | 
154 |     This produces a ragged format, indicating which tokens need to be collapsed
155 |     to make the aligned array. The reduction is deferred for a later step, so
156 |     the user can configure it. The indexing is especially efficient in trivial
157 |     cases like this where the indexing array is completely continuous.
158 |     """
159 |     if len(spans) != len(wordpieces):
160 |         raise ValueError("Cannot align batches of different sizes.")
161 |     if special_tokens is None:
162 |         special_tokens = []
163 |     # Tokens can occur more than once, and we need the alignment of each token
164 |     # to its place in the concatenated wordpieces array.
165 |     token_positions = get_token_positions(spans)
166 |     alignment: List[Set[int]] = [set() for _ in range(len(token_positions))]
167 |     wp_start = 0
168 |     for i, (span, wp_toks) in enumerate(zip(spans, wordpieces)):
169 |         sp_toks = [token.text for token in span]
170 |         wp_toks_filtered = wp_toks
171 |         # In the case that the special tokens do not appear in the text, filter
172 |         # them out for alignment purposes so that special tokens like "<s>" are
173 |         # not aligned to the character "s" in the text. (If the special tokens
174 |         # appear in the text, it's not possible to distinguish them from the
175 |         # added special tokens, so they may be aligned incorrectly.)
176 |         if not any([special in span.text for special in special_tokens]):
177 |             wp_toks_filtered = [
178 |                 tok if tok not in special_tokens else "" for tok in wp_toks
179 |             ]
180 |         span2wp, wp2span = get_alignments(sp_toks, wp_toks_filtered)
181 |         for token, wp_js in zip(span, span2wp):
182 |             position = token_positions[token]
183 |             alignment[position].update(wp_start + j for j in wp_js)
184 |         wp_start += len(wp_toks)
185 |     lengths: List[int] = []
186 |     flat: List[int] = []
187 |     for a in alignment:
188 |         lengths.append(len(a))
189 |         flat.extend(sorted(a))
190 |     align = Ragged(
191 |         cast(Ints1d, numpy.array(flat, dtype="i")),
192 |         cast(Ints1d, numpy.array(lengths, dtype="i")),
193 |     )
194 |     return align
195 | 
196 | 
197 | def get_span2wp_from_offset_mapping(span, wp_char_offsets):
198 |     # create a mapping of char indices to spacy token indices
199 |     cdef int span_idx = span[0].idx
200 |     cdef int span_i = span[0].i
201 |     cdef int char_idx, rel_token_i
202 |     # size is +1 so we don't have to check whether the text has a trailing space
203 |     char_to_sp_token = numpy.full((len(span.text) + 1,), -1, dtype="int32")
204 |     for token in span:
205 |         rel_token_i = token.i - span_i
206 |         for char_idx in range(
207 |                 token.idx - span_idx,
208 |                 token.idx - span_idx + len(token) + 1,
209 |         ):
210 |             char_to_sp_token[char_idx] = rel_token_i
211 | 
212 |     # align all wordpiece tokens to one or more spacy token indices
213 |     cdef vector[unordered_set_uint32_t_ptr] alignment
214 |     for _ in range(len(span)):
215 |         alignment.push_back(new unordered_set[uint32_t]())
216 |     _get_span2wp_alignment(
217 |         &alignment,
218 |         numpy.ascontiguousarray(char_to_sp_token),
219 |         char_to_sp_token.size,
220 |         numpy.ascontiguousarray(wp_char_offsets, dtype="int64"),
221 |         wp_char_offsets.shape[0],
222 |     )
223 | 
224 |     # convert the alignment into a list of aligned wordpiece indices per spacy
225 |     # token index (unsorted at this point)
226 |     cdef unordered_set_uint32_t_ptr s
227 |     cdef vector[unordered_set_uint32_t_ptr].iterator it_v = alignment.begin()
228 |     cdef unordered_set[uint32_t].iterator it_s
229 |     result: List[List[int]] = []
230 |     while it_v != alignment.end():
231 |         result.append([])
232 |         s = deref(it_v)
233 |         it_s = s.begin()
234 |         while it_s != s.end():
235 |             result[-1].append(deref(it_s))
236 |             preinc(it_s)
237 |         del s
238 |         preinc(it_v)
239 |     return result
240 | 
241 | 
242 | cdef int _get_span2wp_alignment(
243 |         vector[unordered_set_uint32_t_ptr]* alignment,
244 |         int32_t[::1] char_to_sp_token,
245 |         int char_to_sp_token_length,
246 |         int64_t[:, ::1] wp_char_offsets,
247 |         int wp_char_offsets_length,
248 |     ) nogil:
249 |     cdef int char_idx, start_idx, end_idx, token_i
250 |     cdef int wp_j = 0
251 |     cdef int alignment_size = alignment.size()
252 |     while wp_j < wp_char_offsets_length:
253 |         start_idx = wp_char_offsets[wp_j][0]
254 |         end_idx = wp_char_offsets[wp_j][1]
255 |         char_idx = start_idx
256 |         while char_idx < end_idx:
257 |             if 0 <= char_idx < char_to_sp_token_length:
258 |                 token_i = char_to_sp_token[char_idx]
259 |             else:
260 |                 token_i = -1
261 |             if 0 <= token_i < alignment_size:
262 |                 deref(alignment.at(token_i)).insert(wp_j)
263 |             char_idx += 1
264 |         wp_j += 1
265 |     return 0
266 | 


--------------------------------------------------------------------------------
/spacy_transformers/annotation_setters.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable, List
 2 | from spacy.tokens import Doc
 3 | 
 4 | from .util import registry
 5 | from .data_classes import FullTransformerBatch
 6 | 
 7 | 
 8 | def null_annotation_setter(docs: List[Doc], trf_data: FullTransformerBatch) -> None:
 9 |     """Set no additional annotations on the Doc objects."""
10 |     pass
11 | 
12 | 
13 | @registry.annotation_setters("spacy-transformers.null_annotation_setter.v1")  # type: ignore
14 | def configure_null_annotation_setter() -> Callable[
15 |     [List[Doc], FullTransformerBatch], None
16 | ]:
17 |     return null_annotation_setter
18 | 
19 | 
20 | __all__ = ["null_annotation_setter", "configure_null_annotation_setter"]
21 | 


--------------------------------------------------------------------------------
/spacy_transformers/architectures.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Callable
  2 | from thinc.api import Model, chain
  3 | from thinc.types import Ragged, Floats2d
  4 | from spacy.tokens import Doc
  5 | 
  6 | from .layers import TransformerModel, TransformerListener
  7 | from .layers import trfs2arrays, split_trf_batch
  8 | from .util import registry
  9 | from .data_classes import FullTransformerBatch
 10 | 
 11 | 
 12 | @registry.architectures.register("spacy-transformers.TransformerListener.v1")
 13 | def transformer_listener_tok2vec_v1(
 14 |     pooling: Model[Ragged, Floats2d], grad_factor: float = 1.0, upstream: str = "*"
 15 | ) -> Model[List[Doc], List[Floats2d]]:
 16 |     """Create a 'TransformerListener' layer, which will connect to a Transformer
 17 |     component earlier in the pipeline.
 18 | 
 19 |     The layer takes a list of Doc objects as input, and produces a list of
 20 |     2d arrays as output, with each array having one row per token. Most spaCy
 21 |     models expect a sublayer with this signature, making it easy to connect them
 22 |     to a transformer model via this sublayer.
 23 |     Transformer models usually operate over wordpieces, which usually don't align
 24 |     one-to-one against spaCy tokens. The layer therefore requires a reduction
 25 |     operation in order to calculate a single token vector given zero or more
 26 |     wordpiece vectors.
 27 | 
 28 |     pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
 29 |         the token vectors based on zero or more wordpiece vectors. If in doubt,
 30 |         mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
 31 |     grad_factor (float): Reweight gradients from the component before passing
 32 |         them upstream. You can set this to 0 to "freeze" the transformer weights
 33 |         with respect to the component, or use it to make some components more
 34 |         significant than others. Leaving it at 1.0 is usually fine.
 35 |     upstream (str): A string to identify the 'upstream' Transformer
 36 |         to communicate with. The upstream name should either be the wildcard
 37 |         string '*', or the name of the `Transformer` component. You'll almost
 38 |         never have multiple upstream Transformer components, so the wildcard
 39 |         string will almost always be fine.
 40 |     """
 41 |     listener = TransformerListener(upstream_name=upstream)
 42 |     model: Model = chain(listener, trfs2arrays(pooling, grad_factor))
 43 |     model.set_ref("listener", listener)
 44 |     return model
 45 | 
 46 | 
 47 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v1")
 48 | def transformer_tok2vec_v1(
 49 |     name: str,
 50 |     get_spans,
 51 |     tokenizer_config: dict,
 52 |     pooling: Model[Ragged, Floats2d],
 53 |     grad_factor: float = 1.0,
 54 | ) -> Model[List[Doc], List[Floats2d]]:
 55 |     """Use a transformer as a "Tok2Vec" layer directly. This does not allow
 56 |     multiple components to share the transformer weights, and does not allow
 57 |     the transformer to set annotations into the `Doc` object, but it's a
 58 |     simpler solution if you only need the transformer within one component.
 59 | 
 60 |     get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
 61 |         spans from the batch of Doc objects. See the "TransformerModel" layer
 62 |         for details.
 63 |     tokenizer_config (dict): Settings to pass to the transformers tokenizer.
 64 |     pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
 65 |         the token vectors based on zero or more wordpiece vectors. If in doubt,
 66 |         mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
 67 |      grad_factor (float): Reweight gradients from the component before passing
 68 |         them to the transformer. You can set this to 0 to "freeze" the transformer
 69 |         weights with respect to the component, or to make it learn more slowly.
 70 |         Leaving it at 1.0 is usually fine.
 71 |     """
 72 |     return chain(
 73 |         TransformerModel(name, get_spans, tokenizer_config),
 74 |         split_trf_batch(),
 75 |         trfs2arrays(pooling, grad_factor),
 76 |     )
 77 | 
 78 | 
 79 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v2")
 80 | def transformer_tok2vec_v2(
 81 |     name: str,
 82 |     get_spans,
 83 |     tokenizer_config: dict,
 84 |     pooling: Model[Ragged, Floats2d],
 85 |     grad_factor: float = 1.0,
 86 |     transformer_config: dict = {},
 87 | ) -> Model[List[Doc], List[Floats2d]]:
 88 |     """Use a transformer as a "Tok2Vec" layer directly. This does not allow
 89 |     multiple components to share the transformer weights, and does not allow
 90 |     the transformer to set annotations into the `Doc` object, but it's a
 91 |     simpler solution if you only need the transformer within one component.
 92 | 
 93 |     get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
 94 |         spans from the batch of Doc objects. See the "TransformerModel" layer
 95 |         for details.
 96 |     tokenizer_config (dict): Settings to pass to the transformers tokenizer.
 97 |     pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
 98 |         the token vectors based on zero or more wordpiece vectors. If in doubt,
 99 |         mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
100 |     grad_factor (float): Reweight gradients from the component before passing
101 |         them to the transformer. You can set this to 0 to "freeze" the transformer
102 |         weights with respect to the component, or to make it learn more slowly.
103 |         Leaving it at 1.0 is usually fine.
104 |     transformers_config (dict): Settings to pass to the transformers forward pass
105 |         of the transformer.
106 |     """
107 |     return chain(
108 |         TransformerModel(name, get_spans, tokenizer_config, transformer_config),
109 |         split_trf_batch(),
110 |         trfs2arrays(pooling, grad_factor),
111 |     )
112 | 
113 | 
114 | # Note: when updating, also make sure to update 'replace_listener_cfg' in _util.py
115 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v3")
116 | def transformer_tok2vec_v3(
117 |     name: str,
118 |     get_spans,
119 |     tokenizer_config: dict,
120 |     pooling: Model[Ragged, Floats2d],
121 |     grad_factor: float = 1.0,
122 |     transformer_config: dict = {},
123 |     mixed_precision: bool = False,
124 |     grad_scaler_config: dict = {},
125 | ) -> Model[List[Doc], List[Floats2d]]:
126 |     """Use a transformer as a "Tok2Vec" layer directly. This does not allow
127 |     multiple components to share the transformer weights, and does not allow
128 |     the transformer to set annotations into the `Doc` object, but it's a
129 |     simpler solution if you only need the transformer within one component.
130 | 
131 |     get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
132 |         spans from the batch of Doc objects. See the "TransformerModel" layer
133 |         for details.
134 |     tokenizer_config (dict): Settings to pass to the transformers tokenizer.
135 |     pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate
136 |         the token vectors based on zero or more wordpiece vectors. If in doubt,
137 |         mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice.
138 |     grad_factor (float): Reweight gradients from the component before passing
139 |         them to the transformer. You can set this to 0 to "freeze" the transformer
140 |         weights with respect to the component, or to make it learn more slowly.
141 |         Leaving it at 1.0 is usually fine.
142 |     transformers_config (dict): Settings to pass to the transformers forward pass
143 |         of the transformer.
144 |     mixed_precision (bool): Enable mixed-precision. Mixed-precision replaces
145 |         whitelisted ops to half-precision counterparts. This speeds up training
146 |         and prediction on modern GPUs and reduces GPU memory use.
147 |     grad_scaler_config (dict): Configuration for gradient scaling in mixed-precision
148 |         training. Gradient scaling is enabled automatically when mixed-precision
149 |         training is used.
150 | 
151 |         Setting `enabled` to `False` in the gradient scaling configuration disables
152 |         gradient scaling. The `init_scale` (default: `2 ** 16`) determines the
153 |         initial scale. `backoff_factor` (default: `0.5`) specifies the factor
154 |         by which the scale should be reduced when gradients overflow.
155 |         `growth_interval` (default: `2000`) configures the number of steps
156 |         without gradient overflows after which the scale should be increased.
157 |         Finally, `growth_factor` (default: `2.0`) determines the factor by which
158 |         the scale should be increased when no overflows were found for
159 |         `growth_interval` steps.
160 |     """
161 |     # Note that this is a chain of chain on purpose, to match the structure of
162 |     # TransformerListener.v1 after it is run through replace_listener (cf PR #310)
163 |     return chain(  # type: ignore
164 |         chain(
165 |             TransformerModel(
166 |                 name,
167 |                 get_spans,
168 |                 tokenizer_config,
169 |                 transformer_config,
170 |                 mixed_precision,
171 |                 grad_scaler_config,
172 |             ),
173 |             split_trf_batch(),
174 |         ),
175 |         trfs2arrays(pooling, grad_factor),
176 |     )
177 | 
178 | 
179 | @registry.architectures.register("spacy-transformers.TransformerModel.v1")
180 | def create_TransformerModel_v1(
181 |     name: str,
182 |     get_spans: Callable,
183 |     tokenizer_config: dict = {},
184 | ) -> Model[List[Doc], FullTransformerBatch]:
185 |     model = TransformerModel(name, get_spans, tokenizer_config)
186 |     return model
187 | 
188 | 
189 | @registry.architectures.register("spacy-transformers.TransformerModel.v2")
190 | def create_TransformerModel_v2(
191 |     name: str,
192 |     get_spans: Callable,
193 |     tokenizer_config: dict = {},
194 |     transformer_config: dict = {},
195 | ) -> Model[List[Doc], FullTransformerBatch]:
196 |     model = TransformerModel(name, get_spans, tokenizer_config, transformer_config)
197 |     return model
198 | 
199 | 
200 | @registry.architectures.register("spacy-transformers.TransformerModel.v3")
201 | def create_TransformerModel_v3(
202 |     name: str,
203 |     get_spans: Callable,
204 |     tokenizer_config: dict = {},
205 |     transformer_config: dict = {},
206 |     mixed_precision: bool = False,
207 |     grad_scaler_config: dict = {},
208 | ) -> Model[List[Doc], FullTransformerBatch]:
209 |     """Pretrained transformer model that can be finetuned for downstream tasks.
210 | 
211 |     name (str): Name of the pretrained Huggingface model to use.
212 |     get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract
213 |         spans from the batch of Doc objects. See the "TransformerModel" layer
214 |         for details.
215 |     tokenizer_config (dict): Settings to pass to the transformers tokenizer.
216 |     transformers_config (dict): Settings to pass to the transformers forward pass
217 |         of the transformer.
218 |     mixed_precision (bool): Enable mixed-precision. Mixed-precision replaces
219 |         whitelisted ops to half-precision counterparts. This speeds up training
220 |         and prediction on modern GPUs and reduces GPU memory use.
221 |     grad_scaler_config (dict): Configuration for gradient scaling in mixed-precision
222 |         training. Gradient scaling is enabled automatically when mixed-precision
223 |         training is used.
224 | 
225 |         Setting `enabled` to `False` in the gradient scaling configuration disables
226 |         gradient scaling. The `init_scale` (default: `2 ** 16`) determines the
227 |         initial scale. `backoff_factor` (default: `0.5`) specifies the factor
228 |         by which the scale should be reduced when gradients overflow.
229 |         `growth_interval` (default: `2000`) configures the number of steps
230 |         without gradient overflows after which the scale should be increased.
231 |         Finally, `growth_factor` (default: `2.0`) determines the factor by which
232 |         the scale should be increased when no overflows were found for
233 |         `growth_interval` steps.
234 |     """
235 |     model = TransformerModel(
236 |         name,
237 |         get_spans,
238 |         tokenizer_config,
239 |         transformer_config,
240 |         mixed_precision,
241 |         grad_scaler_config,
242 |     )
243 |     return model
244 | 


--------------------------------------------------------------------------------
/spacy_transformers/data_classes.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional, List, Dict, Any, Union, Tuple, cast
  2 | from dataclasses import dataclass, field
  3 | import torch
  4 | import numpy
  5 | from transformers.tokenization_utils import BatchEncoding
  6 | from transformers.file_utils import ModelOutput
  7 | from transformers.modeling_outputs import BaseModelOutput
  8 | from thinc.types import Ragged, Floats2d, Floats3d, FloatsXd, Ints1d, Ints2d
  9 | from thinc.api import NumpyOps, get_array_module, xp2torch, torch2xp
 10 | from spacy.tokens import Span
 11 | import srsly
 12 | 
 13 | from .util import transpose_list
 14 | from .align import get_token_positions
 15 | 
 16 | 
 17 | @dataclass
 18 | class WordpieceBatch:
 19 |     """Holds data from the transformers BatchEncoding class.
 20 | 
 21 |     We would have preferred to use the BatchEncoding class directly, but
 22 |     there's a few problems with that.
 23 | 
 24 |     1. Some BatchEncoding functionality requires the tokenizers.Encoding object,
 25 |         and it's impossible for us to create or manipulate that object. This means
 26 |         we can't really create BatchEncoding objects, which limits what we can do.
 27 |     2. We want some semantic differences, for instance the "lengths" data in the
 28 |         BatchEncoding is useless when the inputs are padded. We want it to tell
 29 |         us the *unpadded* lengths.
 30 |     3. We want typed attributes, so that we can type-check properly.
 31 |     4. We prefer to have numpy/cupy arrays rather than torch arrays.
 32 |     5. The API around the BatchEncoding object has been changing a lot, so we
 33 |         want to minimize the places where we touch it.
 34 |     """
 35 | 
 36 |     strings: List[List[str]]
 37 |     input_ids: Ints2d
 38 |     attention_mask: Floats2d
 39 |     lengths: List[int]
 40 |     token_type_ids: Optional[Ints2d]
 41 | 
 42 |     def __len__(self) -> int:
 43 |         return len(self.strings)
 44 | 
 45 |     def __getitem__(self, index) -> "WordpieceBatch":
 46 |         if isinstance(index, int):
 47 |             slice_ = slice(index, index + 1)
 48 |         else:
 49 |             slice_ = index
 50 |         return WordpieceBatch(
 51 |             strings=self.strings[slice_],
 52 |             input_ids=self.input_ids[slice_],
 53 |             attention_mask=self.attention_mask[slice_],
 54 |             lengths=self.lengths[slice_],
 55 |             token_type_ids=(
 56 |                 self.token_type_ids[slice_] if self.token_type_ids is not None else None
 57 |             ),
 58 |         )
 59 | 
 60 |     def to_hf_dict(self) -> Dict:
 61 |         """Return a dict similar to the format produced by the Huggingface
 62 |         tokenizer, converting arrays to pytorch tensors as well.
 63 |         """
 64 |         output = {
 65 |             "input_ids": xp2torch(self.input_ids),
 66 |             "attention_mask": xp2torch(self.attention_mask),
 67 |             "input_texts": self.strings,
 68 |         }
 69 |         if self.token_type_ids is not None:
 70 |             output["token_type_ids"] = xp2torch(self.token_type_ids)
 71 |         return output
 72 | 
 73 |     @classmethod
 74 |     def empty(cls, *, xp=numpy) -> "WordpieceBatch":
 75 |         return cls(
 76 |             strings=[],
 77 |             input_ids=xp.zeros((0, 0), dtype="i"),
 78 |             attention_mask=xp.ones((0, 0), dtype="bool"),
 79 |             lengths=[],
 80 |             token_type_ids=None,
 81 |         )
 82 | 
 83 |     @classmethod
 84 |     def zeros(cls, lengths: List[int], xp=numpy) -> "WordpieceBatch":
 85 |         return cls(
 86 |             strings=[[""] * length for length in lengths],
 87 |             input_ids=xp.array([[0] * length for length in lengths], dtype="i"),
 88 |             attention_mask=xp.ones((len(lengths), max(lengths)), dtype="bool"),
 89 |             lengths=lengths,
 90 |             token_type_ids=None,
 91 |         )
 92 | 
 93 |     @classmethod
 94 |     def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
 95 |         assert isinstance(token_data, BatchEncoding) or isinstance(token_data, dict)
 96 |         pad_token = token_data.get("pad_token", "[PAD]")
 97 |         lengths = [
 98 |             len([tok for tok in tokens if tok != pad_token])
 99 |             for tokens in token_data["input_texts"]
100 |         ]
101 | 
102 |         # The following tensors are intentionally allocated on the CPU to reduce
103 |         # host-to-device copies.
104 |         numpy_ops = NumpyOps()
105 |         input_ids = token_data["input_ids"]
106 |         token_type_ids = token_data.get("token_type_ids")
107 | 
108 |         return cls(
109 |             strings=token_data["input_texts"],
110 |             input_ids=numpy_ops.asarray(input_ids, dtype=input_ids.dtype),
111 |             attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]),
112 |             lengths=lengths,
113 |             token_type_ids=(
114 |                 numpy_ops.asarray(token_type_ids, dtype=token_type_ids.dtype)
115 |                 if token_type_ids is not None
116 |                 else None
117 |             ),
118 |         )
119 | 
120 |     def to_dict(self) -> Dict[str, Any]:
121 |         return {
122 |             "strings": self.strings,
123 |             "input_ids": self.input_ids,
124 |             "attention_mask": self.attention_mask,
125 |             "lengths": self.lengths,
126 |             "token_type_ids": self.token_type_ids,
127 |         }
128 | 
129 |     def from_dict(self, msg: Dict[str, Any]) -> "WordpieceBatch":
130 |         self.strings = msg["strings"]
131 |         self.input_ids = msg["input_ids"]
132 |         self.attention_mask = msg["attention_mask"]
133 |         self.lengths = msg["lengths"]
134 |         self.token_type_ids = msg["token_type_ids"]
135 |         return self
136 | 
137 | 
138 | @dataclass
139 | class TransformerData:
140 |     """Transformer tokens and outputs for one Doc object.
141 | 
142 |     The transformer models return tensors that refer to a whole padded batch
143 |     of documents. These tensors are wrapped into the FullTransformerBatch object.
144 |     The FullTransformerBatch then splits out the per-document data, which is
145 |     handled by this class. Instances of this class are typically assigned to
146 |     the doc._.trf_data extension attribute.
147 | 
148 |     Attributes
149 |     ----------
150 |     wordpieces (WordpieceBatch): A slice of the wordpiece token data produced
151 |         by the Huggingface tokenizer.
152 |     model_output (ModelOutput): The model output from the transformer model,
153 |         determined by the model and transformer config.
154 |     align (Ragged): Alignment from the Doc's tokenization to the wordpieces.
155 |         This is a ragged array, where align.lengths[i] indicates the number of
156 |         wordpiece tokens that token i aligns against. The actual indices are
157 |         provided at align[i].dataXd.
158 |     """
159 | 
160 |     wordpieces: WordpieceBatch
161 |     model_output: ModelOutput
162 |     align: Ragged
163 | 
164 |     @classmethod
165 |     def empty(cls) -> "TransformerData":
166 |         align = Ragged(
167 |             cast(Ints1d, numpy.zeros((0,), dtype="i")),
168 |             cast(Ints1d, numpy.zeros((0,), dtype="i")),
169 |         )
170 |         return cls(
171 |             wordpieces=WordpieceBatch.empty(), model_output=ModelOutput(), align=align
172 |         )
173 | 
174 |     @classmethod
175 |     def zeros(cls, length: int, width: int, *, xp=numpy) -> "TransformerData":
176 |         """Create a valid TransformerData container for a given shape, filled
177 |         with zeros."""
178 |         return cls(
179 |             wordpieces=WordpieceBatch.zeros([length], xp=xp),
180 |             model_output=ModelOutput(
181 |                 last_hidden_state=xp.zeros((1, length, width), dtype="f")
182 |             ),
183 |             align=Ragged(
184 |                 cast(Ints1d, numpy.arange(length)),
185 |                 cast(Ints1d, numpy.ones((length,), dtype="i")),
186 |             ),
187 |         )
188 | 
189 |     @property
190 |     def tensors(self) -> Tuple[Union[FloatsXd, List[FloatsXd]]]:
191 |         return self.model_output.to_tuple()
192 | 
193 |     @property
194 |     def tokens(self) -> Dict[str, Any]:
195 |         """Deprecated. A dict with the wordpiece token data."""
196 |         return self.wordpieces.to_hf_dict()
197 | 
198 |     @property
199 |     def width(self) -> int:
200 |         if "last_hidden_state" in self.model_output:
201 |             return cast(BaseModelOutput, self.model_output).last_hidden_state.shape[-1]
202 |         else:
203 |             raise ValueError("Cannot find last hidden state")
204 | 
205 |     def to_dict(self) -> Dict[str, Any]:
206 |         return {
207 |             "wordpieces": self.wordpieces.to_dict(),
208 |             "model_output": self.model_output,
209 |             "align": [self.align.dataXd, self.align.lengths],
210 |         }
211 | 
212 |     def from_dict(self, msg: Dict[str, Any]) -> "TransformerData":
213 |         self.wordpieces = WordpieceBatch.empty().from_dict(msg["wordpieces"])
214 |         self.model_output = ModelOutput(msg["model_output"])
215 |         self.align = Ragged(*msg["align"])
216 |         return self
217 | 
218 |     def to_bytes(self) -> bytes:
219 |         return srsly.msgpack_dumps(self.to_dict())
220 | 
221 |     def from_bytes(self, byte_string: bytes) -> "TransformerData":
222 |         msg = srsly.msgpack_loads(byte_string)
223 |         self.from_dict(msg)
224 |         return self
225 | 
226 | 
227 | @srsly.msgpack_encoders("transformerdata")
228 | def serialize_transformer_data(obj, chain=None):
229 |     if isinstance(obj, TransformerData):
230 |         return {"__transformerdata__": obj.to_dict()}
231 |     return obj if chain is None else chain(obj)
232 | 
233 | 
234 | @srsly.msgpack_decoders("transformerdata")
235 | def deserialize_transformer_data(obj, chain=None):
236 |     if "__transformerdata__" in obj:
237 |         return TransformerData.empty().from_dict(obj["__transformerdata__"])
238 |     return obj if chain is None else chain(obj)
239 | 
240 | 
241 | @dataclass
242 | class FullTransformerBatch:
243 |     """Holds a batch of input and output objects for a transformer model. The
244 |     data can then be split to a list of `TransformerData` objects to associate
245 |     the outputs to each `Doc` in the batch.
246 | 
247 |     Attributes
248 |     ----------
249 |     spans (List[List[Span]]): The batch of input spans. The outer list refers
250 |         to the Doc objects in the batch, and the inner list are the spans for
251 |         that `Doc`. Note that spans are allowed to overlap or exclude tokens,
252 |         but each Span can only refer to one Doc (by definition). This means that
253 |         within a Doc, the regions of the output tensors that correspond to each
254 |         Span may overlap or have gaps, but for each Doc, there is a non-overlapping
255 |         contiguous slice of the outputs.
256 |     wordpieces (WordpieceBatch): Token data from the Huggingface tokenizer.
257 |     model_output (ModelOutput): The output of the transformer model.
258 |     align (Ragged): Alignment from the spaCy tokenization to the wordpieces.
259 |         This is a ragged array, where align.lengths[i] indicates the number of
260 |         wordpiece tokens that token i aligns against. The actual indices are
261 |         provided at align[i].dataXd.
262 |     """
263 | 
264 |     spans: List[List[Span]]
265 |     wordpieces: WordpieceBatch
266 |     model_output: ModelOutput
267 |     align: Ragged
268 |     cached_doc_data: Optional[List[TransformerData]] = None
269 | 
270 |     @classmethod
271 |     def empty(cls, nr_docs) -> "FullTransformerBatch":
272 |         spans: List[List[Span]] = [[] for _ in range(nr_docs)]
273 |         doc_data = [TransformerData.empty() for _ in range(nr_docs)]
274 |         align = Ragged(
275 |             cast(Ints1d, numpy.zeros((0,), dtype="i")),
276 |             cast(Ints1d, numpy.zeros((0,), dtype="i")),
277 |         )
278 |         return cls(
279 |             spans=spans,
280 |             wordpieces=WordpieceBatch.empty(),
281 |             model_output=ModelOutput(),
282 |             align=align,
283 |             cached_doc_data=doc_data,
284 |         )
285 | 
286 |     @property
287 |     def tensors(self) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
288 |         return self.model_output.to_tuple()
289 | 
290 |     @property
291 |     def tokens(self) -> Dict[str, Any]:
292 |         """Deprecated. Dict formatted version of the self.wordpieces data,
293 |         with values converted to PyTorch tensors.
294 |         """
295 |         return self.wordpieces.to_hf_dict()
296 | 
297 |     @property
298 |     def doc_data(self) -> List[TransformerData]:
299 |         """The outputs, split per spaCy Doc object."""
300 |         if self.cached_doc_data is None:
301 |             self.cached_doc_data = self.split_by_doc()
302 |         return self.cached_doc_data
303 | 
304 |     def unsplit_by_doc(self, arrays: List[List[Floats3d]]) -> "FullTransformerBatch":
305 |         """Return a new FullTransformerBatch from a split batch of activations,
306 |         using the current object's spans, wordpieces and alignment.
307 | 
308 |         This is used during the backward pass, in order to construct the gradients
309 |         to pass back into the transformer model.
310 |         """
311 |         xp = get_array_module(arrays[0][0])
312 |         # construct a dummy ModelOutput with the tensor values
313 |         model_output = ModelOutput()
314 |         for i, x in enumerate(transpose_list(arrays)):
315 |             model_output[f"output_{i}"] = xp2torch(xp.vstack(x))
316 |         return FullTransformerBatch(
317 |             spans=self.spans,
318 |             wordpieces=self.wordpieces,
319 |             model_output=model_output,
320 |             align=self.align,
321 |         )
322 | 
323 |     def split_by_doc(self) -> List[TransformerData]:
324 |         """Split a TransformerData that represents a batch into a list with
325 |         one TransformerData per Doc.
326 |         """
327 |         flat_spans = []
328 |         for doc_spans in self.spans:
329 |             flat_spans.extend(doc_spans)
330 |         token_positions = get_token_positions(flat_spans)
331 | 
332 |         # Convert all outputs to XP arrays.
333 |         xp_model_output = ModelOutput()
334 |         last_hidden_state = cast(BaseModelOutput, self.model_output).last_hidden_state
335 |         for key, output in self.model_output.items():
336 |             if isinstance(output, torch.Tensor):
337 |                 xp_model_output[key] = torch2xp(output)
338 |             elif (
339 |                 isinstance(output, tuple)
340 |                 and all(isinstance(t, torch.Tensor) for t in output)
341 |                 and all(t.shape[0] == last_hidden_state.shape[0] for t in output)
342 |             ):
343 |                 xp_model_output[key] = [torch2xp(t) for t in output]
344 | 
345 |         # Split outputs per Doc.
346 |         outputs = []
347 |         start = 0
348 |         prev_tokens = 0
349 |         for doc_spans in self.spans:
350 |             if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
351 |                 outputs.append(TransformerData.empty())
352 |                 continue
353 |             start_i = token_positions[doc_spans[0][0]]
354 |             end_i = token_positions[doc_spans[-1][-1]] + 1
355 |             end = start + len(doc_spans)
356 |             doc_tokens = self.wordpieces[start:end]
357 |             doc_align = self.align[start_i:end_i]
358 |             doc_align.data = doc_align.data - prev_tokens
359 |             model_output = ModelOutput()
360 |             for key, output in xp_model_output.items():
361 |                 # After the torch2xp conversion above, we only have XP arrays
362 |                 # and lists of XP arrays.
363 |                 if not isinstance(output, list):
364 |                     model_output[key] = output[start:end]
365 |                 else:
366 |                     model_output[key] = [t[start:end] for t in output]
367 |             outputs.append(
368 |                 TransformerData(
369 |                     wordpieces=doc_tokens,
370 |                     model_output=model_output,
371 |                     align=doc_align,
372 |                 )
373 |             )
374 |             prev_tokens += doc_tokens.input_ids.size
375 |             start += len(doc_spans)
376 |         return outputs
377 | 
378 | 
379 | @dataclass
380 | class HFObjects:
381 | 
382 |     tokenizer: Any
383 |     transformer: Any
384 |     vocab_file_contents: Any
385 |     _init_tokenizer_config: Dict[str, Any] = field(default_factory=dict)
386 |     _init_transformer_config: Dict[str, Any] = field(default_factory=dict)
387 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .listener import TransformerListener
2 | from .transformer_model import TransformerModel
3 | from .split_trf import split_trf_batch
4 | from .trfs2arrays import trfs2arrays
5 | 
6 | 
7 | __all__ = ["TransformerListener", "TransformerModel", "split_trf_batch", "trfs2arrays"]
8 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/_util.py:
--------------------------------------------------------------------------------
 1 | from thinc.api import chain
 2 | from .split_trf import split_trf_batch
 3 | 
 4 | 
 5 | def replace_listener(model):
 6 |     return chain(model, split_trf_batch())
 7 | 
 8 | 
 9 | def replace_listener_cfg(tok2vec_model_cfg, listener_model_cfg):
10 |     result = tok2vec_model_cfg.copy()
11 |     if (
12 |         "TransformerModel" in tok2vec_model_cfg["@architectures"]
13 |         and "TransformerListener" in listener_model_cfg["@architectures"]
14 |     ):
15 |         result["@architectures"] = "spacy-transformers.Tok2VecTransformer.v3"
16 |         for key in ["pooling", "grad_factor"]:
17 |             if key in listener_model_cfg and key not in result:
18 |                 result[key] = listener_model_cfg[key]
19 |     return result
20 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/hf_shim.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict
  2 | from io import BytesIO
  3 | from pathlib import Path
  4 | import srsly
  5 | import torch
  6 | import warnings
  7 | from thinc.api import get_torch_default_device
  8 | from spacy.util import SimpleFrozenDict
  9 | 
 10 | from ..data_classes import HFObjects
 11 | from ..util import make_tempdir
 12 | 
 13 | from thinc.api import PyTorchGradScaler, PyTorchShim
 14 | 
 15 | from transformers import AutoModel, AutoConfig, AutoTokenizer
 16 | 
 17 | 
 18 | class HFShim(PyTorchShim):
 19 |     """Interface between a HF Pytorch model and a Thinc Model."""
 20 | 
 21 |     def __init__(
 22 |         self,
 23 |         model: HFObjects,
 24 |         config=None,
 25 |         optimizer: Any = None,
 26 |         mixed_precision: bool = False,
 27 |         grad_scaler_config: dict = {},
 28 |         config_cls=AutoConfig,
 29 |         model_cls=AutoModel,
 30 |         tokenizer_cls=AutoTokenizer,
 31 |     ):
 32 |         self._hfmodel = model
 33 |         self.config_cls = config_cls
 34 |         self.model_cls = model_cls
 35 |         self.tokenizer_cls = tokenizer_cls
 36 | 
 37 |         # Enable gradient scaling when mixed precision is enabled and gradient
 38 |         # scaling is not explicitly disabled in the configuration.
 39 |         if "enabled" not in grad_scaler_config:
 40 |             grad_scaler_config["enabled"] = mixed_precision
 41 | 
 42 |         super().__init__(
 43 |             model.transformer,
 44 |             config,
 45 |             optimizer,
 46 |             mixed_precision,
 47 |             grad_scaler=PyTorchGradScaler(**grad_scaler_config),
 48 |         )
 49 | 
 50 |     def to_bytes(self):
 51 |         config = {}
 52 |         tok_dict = {}
 53 |         weights_bytes = {}
 54 |         tok_cfg = {}
 55 |         trf_cfg = {}
 56 |         hf_model = self._hfmodel
 57 |         if hf_model.transformer is not None:
 58 |             tok_dict = {}
 59 |             config = hf_model.transformer.config.to_dict()
 60 |             tokenizer = hf_model.tokenizer
 61 |             with make_tempdir() as temp_dir:
 62 |                 if hasattr(tokenizer, "vocab_file"):
 63 |                     vocab_file_name = tokenizer.vocab_files_names["vocab_file"]
 64 |                     vocab_file_path = str((temp_dir / vocab_file_name).absolute())
 65 |                     with open(vocab_file_path, "wb") as fileh:
 66 |                         fileh.write(hf_model.vocab_file_contents)
 67 |                     tokenizer.vocab_file = vocab_file_path
 68 |                 tok_dict["kwargs"] = {"use_fast": tokenizer.is_fast}
 69 |                 tokenizer.save_pretrained(str(temp_dir.absolute()))
 70 |                 for x in temp_dir.glob("**/*"):
 71 |                     if x.is_file():
 72 |                         tok_dict[x.name] = x.read_bytes()
 73 |             filelike = BytesIO()
 74 |             torch.save(self._model.state_dict(), filelike)
 75 |             filelike.seek(0)
 76 |             weights_bytes = filelike.getvalue()
 77 |         else:
 78 |             tok_cfg = hf_model._init_tokenizer_config
 79 |             trf_cfg = hf_model._init_transformer_config
 80 |         msg = {
 81 |             "config": config,
 82 |             "state": weights_bytes,
 83 |             "tokenizer": tok_dict,
 84 |             "_init_tokenizer_config": tok_cfg,
 85 |             "_init_transformer_config": trf_cfg,
 86 |         }
 87 |         return srsly.msgpack_dumps(msg)
 88 | 
 89 |     def from_bytes(self, bytes_data):
 90 |         msg = srsly.msgpack_loads(bytes_data)
 91 |         config_dict = msg["config"]
 92 |         tok_dict = msg["tokenizer"]
 93 |         if config_dict:
 94 |             with make_tempdir() as temp_dir:
 95 |                 config_file = temp_dir / "config.json"
 96 |                 srsly.write_json(config_file, config_dict)
 97 |                 config = self.config_cls.from_pretrained(config_file)
 98 |                 tok_kwargs = tok_dict.pop("kwargs", {})
 99 |                 for x, x_bytes in tok_dict.items():
100 |                     Path(temp_dir / x).write_bytes(x_bytes)
101 |                 tokenizer = self.tokenizer_cls.from_pretrained(
102 |                     str(temp_dir.absolute()), **tok_kwargs
103 |                 )
104 |                 vocab_file_contents = None
105 |                 if hasattr(tokenizer, "vocab_file"):
106 |                     vocab_file_name = tokenizer.vocab_files_names["vocab_file"]
107 |                     vocab_file_path = str((temp_dir / vocab_file_name).absolute())
108 |                     with open(vocab_file_path, "rb") as fileh:
109 |                         vocab_file_contents = fileh.read()
110 | 
111 |             transformer = self.model_cls.from_config(config)
112 |             self._hfmodel = HFObjects(
113 |                 tokenizer,
114 |                 transformer,
115 |                 vocab_file_contents,
116 |                 SimpleFrozenDict(),
117 |                 SimpleFrozenDict(),
118 |             )
119 |             self._model = transformer
120 |             filelike = BytesIO(msg["state"])
121 |             filelike.seek(0)
122 |             device = get_torch_default_device()
123 |             try:
124 |                 self._model.load_state_dict(torch.load(filelike, map_location=device))
125 |             except RuntimeError:
126 |                 warn_msg = (
127 |                     "Error loading saved torch state_dict with strict=True, "
128 |                     "likely due to differences between 'transformers' "
129 |                     "versions. Attempting to load with strict=False as a "
130 |                     "fallback...\n\n"
131 |                     "If you see errors or degraded performance, download a "
132 |                     "newer compatible model or retrain your custom model with "
133 |                     "the current 'transformers' and 'spacy-transformers' "
134 |                     "versions. For more details and available updates, run: "
135 |                     "python -m spacy validate"
136 |                 )
137 |                 warnings.warn(warn_msg)
138 |                 filelike.seek(0)
139 |                 b = torch.load(filelike, map_location=device)
140 |                 self._model.load_state_dict(b, strict=False)
141 |             self._model.to(device)
142 |         else:
143 |             self._hfmodel = HFObjects(
144 |                 None,
145 |                 None,
146 |                 None,
147 |                 msg["_init_tokenizer_config"],
148 |                 msg["_init_transformer_config"],
149 |             )
150 |         return self
151 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/hf_wrapper.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable, Optional, Any
 2 | from thinc.layers.pytorchwrapper import forward as pt_forward
 3 | from thinc.layers.pytorchwrapper import convert_pytorch_default_inputs
 4 | from thinc.layers.pytorchwrapper import convert_pytorch_default_outputs
 5 | from thinc.api import registry, Model
 6 | 
 7 | from transformers import AutoConfig, AutoModel, AutoTokenizer
 8 | 
 9 | from ..data_classes import HFObjects
10 | from .hf_shim import HFShim
11 | 
12 | 
13 | @registry.layers("HFWrapper.v1")
14 | def HFWrapper(
15 |     hf_model: HFObjects,
16 |     convert_inputs: Optional[Callable] = None,
17 |     convert_outputs: Optional[Callable] = None,
18 |     mixed_precision: bool = False,
19 |     grad_scaler_config: dict = {},
20 |     config_cls=AutoConfig,
21 |     model_cls=AutoModel,
22 |     tokenizer_cls=AutoTokenizer,
23 | ) -> Model[Any, Any]:
24 |     """Wrap a PyTorch HF model, so that it has the same API as Thinc models.
25 |     To optimize the model, you'll need to create a PyTorch optimizer and call
26 |     optimizer.step() after each batch. See examples/wrap_pytorch.py
27 | 
28 |     Your PyTorch model's forward method can take arbitrary args and kwargs,
29 |     but must return either a single tensor as output or a tuple. You may find the
30 |     PyTorch register_forward_hook helpful if you need to adapt the output.
31 | 
32 |     The convert functions are used to map inputs and outputs to and from your
33 |     PyTorch model. Each function should return the converted output, and a callback
34 |     to use during the backward pass. So:
35 | 
36 |         Xtorch, get_dX = convert_inputs(X)
37 |         Ytorch, torch_backprop = model.shims[0](Xtorch, is_train)
38 |         Y, get_dYtorch = convert_outputs(Ytorch)
39 | 
40 |     To allow maximum flexibility, the PyTorchShim expects ArgsKwargs objects
41 |     on the way into the forward and backward passed. The ArgsKwargs objects
42 |     will be passed straight into the model in the forward pass, and straight
43 |     into `torch.autograd.backward` during the backward pass.
44 |     """
45 |     if convert_inputs is None:
46 |         convert_inputs = convert_pytorch_default_inputs
47 |     if convert_outputs is None:
48 |         convert_outputs = convert_pytorch_default_outputs
49 | 
50 |     return Model(
51 |         "hf-pytorch",
52 |         pt_forward,
53 |         attrs={"convert_inputs": convert_inputs, "convert_outputs": convert_outputs},
54 |         shims=[
55 |             HFShim(
56 |                 hf_model,
57 |                 mixed_precision=mixed_precision,
58 |                 grad_scaler_config=grad_scaler_config,
59 |                 config_cls=config_cls,
60 |                 model_cls=model_cls,
61 |                 tokenizer_cls=tokenizer_cls,
62 |             )
63 |         ],
64 |         dims={"nI": None, "nO": None},
65 |     )
66 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/listener.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Callable, List
 2 | from thinc.api import Model
 3 | from spacy.errors import Errors
 4 | from spacy.tokens import Doc
 5 | from ..data_classes import TransformerData
 6 | 
 7 | 
 8 | class TransformerListener(Model):
 9 |     """A layer that gets fed its answers from an upstream connection,
10 |     for instance from a component earlier in the pipeline.
11 |     """
12 | 
13 |     name = "transformer-listener"
14 | 
15 |     _batch_id: Optional[int]
16 |     _outputs: Optional[List[TransformerData]]
17 |     _backprop: Optional[Callable[[List[TransformerData]], List[Doc]]]
18 | 
19 |     def __init__(self, upstream_name: str):
20 |         Model.__init__(self, name=self.name, forward=forward, dims={"nO": None})
21 |         self.upstream_name = upstream_name
22 |         self._batch_id = None
23 |         self._outputs = None
24 |         self._backprop = None
25 | 
26 |     @classmethod
27 |     def get_batch_id(cls, inputs: List[Doc]):
28 |         return sum(sum(token.orth for token in doc) for doc in inputs)
29 | 
30 |     def receive(self, batch_id, outputs, backprop):
31 |         self._batch_id = batch_id
32 |         self._outputs = outputs
33 |         self._backprop = backprop
34 | 
35 |     def backprop_and_clear(self, *args, **kwargs):
36 |         """Call the stored _backprop callback, and then
37 |         clears it. This saves memory, as otherwise we hold onto that callback
38 |         until the next batch.
39 |         """
40 |         if self._backprop is not None:
41 |             result = self._backprop(*args, **kwargs)
42 |         else:
43 |             result = None
44 |         self._batch_id = None
45 |         self._outputs = None
46 |         self._backprop = None
47 |         return result
48 | 
49 |     def verify_inputs(self, inputs):
50 |         if self._batch_id is None and self._outputs is None:
51 |             raise ValueError
52 |         else:
53 |             batch_id = self.get_batch_id(inputs)
54 |             if batch_id != self._batch_id:
55 |                 raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}")
56 |             else:
57 |                 return True
58 | 
59 | 
60 | def forward(model: TransformerListener, docs, is_train):
61 |     if is_train:
62 |         # This might occur during training when the transformer layer is frozen / hasn't been updated.
63 |         # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
64 |         if model._batch_id is None:
65 |             outputs = []
66 |             for doc in docs:
67 |                 if doc._.trf_data is None:
68 |                     raise ValueError(Errors.E203.format(name="transformer"))
69 |                 else:
70 |                     outputs.append(doc._.trf_data)
71 |             return outputs, _empty_backprop
72 |         else:
73 |             model.verify_inputs(docs)
74 |             return model._outputs, model.backprop_and_clear
75 |     else:
76 |         width = model.get_dim("nO")
77 |         outputs = []
78 |         for doc in docs:
79 |             if doc._.trf_data is None:
80 |                 outputs.append(TransformerData.zeros(len(doc), width, xp=model.ops.xp))
81 |             else:
82 |                 outputs.append(doc._.trf_data)
83 |         return outputs, _empty_backprop
84 | 
85 | 
86 | def _empty_backprop(dX):
87 |     return []
88 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/split_trf.py:
--------------------------------------------------------------------------------
 1 | from thinc.api import Model
 2 | from typing import List
 3 | from ..data_classes import FullTransformerBatch, TransformerData
 4 | 
 5 | 
 6 | def split_trf_batch() -> Model[FullTransformerBatch, List[TransformerData]]:
 7 |     return Model("split-trf-batch", forward)
 8 | 
 9 | 
10 | def forward(model, trf_full, is_train):
11 |     def backprop(d_trf_datas):
12 |         return trf_full.unsplit_by_doc([x.tensors for x in d_trf_datas])
13 | 
14 |     return trf_full.doc_data, backprop
15 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/transformer_model.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Tuple, Callable, Union, Dict
  2 | import copy
  3 | from pathlib import Path
  4 | from transformers.file_utils import ModelOutput
  5 | from transformers import AutoConfig, AutoModel, AutoTokenizer
  6 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
  7 | from transformers.tokenization_utils import BatchEncoding
  8 | 
  9 | from spacy.tokens import Doc
 10 | from thinc.api import Model, get_torch_default_device, xp2torch
 11 | from thinc.types import ArgsKwargs
 12 | 
 13 | import logging
 14 | 
 15 | from ..data_classes import FullTransformerBatch, WordpieceBatch, HFObjects
 16 | from ..util import maybe_flush_pytorch_cache
 17 | from ..util import log_gpu_memory, log_batch_size
 18 | from ..layers._util import replace_listener, replace_listener_cfg
 19 | from ..truncate import truncate_oversize_splits
 20 | from ..align import get_alignment, get_alignment_via_offset_mapping
 21 | from .hf_wrapper import HFWrapper
 22 | 
 23 | 
 24 | class TransformerModel(Model):
 25 |     def __init__(
 26 |         self,
 27 |         name: str,
 28 |         get_spans: Callable,
 29 |         tokenizer_config: dict = {},
 30 |         transformer_config: dict = {},
 31 |         mixed_precision: bool = False,
 32 |         grad_scaler_config: dict = {},
 33 |     ):
 34 |         """
 35 |         get_spans (Callable[[List[Doc]], List[Span]]):
 36 |             A function to extract spans from the batch of Doc objects.
 37 |             This is used to manage long documents, by cutting them into smaller
 38 |             sequences before running the transformer. The spans are allowed to
 39 |             overlap, and you can also omit sections of the Doc if they are not
 40 |             relevant.
 41 |         tokenizer_config (dict): Settings to pass to the transformers tokenizer.
 42 |         transformer_config (dict): Settings to pass to the transformers forward pass.
 43 |         """
 44 |         hf_model = HFObjects(None, None, None, tokenizer_config, transformer_config)
 45 |         wrapper = HFWrapper(
 46 |             hf_model,
 47 |             convert_inputs=_convert_transformer_inputs,
 48 |             convert_outputs=_convert_transformer_outputs,
 49 |             mixed_precision=mixed_precision,
 50 |             grad_scaler_config=grad_scaler_config,
 51 |         )
 52 |         super().__init__(
 53 |             "transformer",
 54 |             forward,
 55 |             init=init,
 56 |             layers=[wrapper],
 57 |             dims={"nO": None},
 58 |             attrs={
 59 |                 "get_spans": get_spans,
 60 |                 "name": name,
 61 |                 "set_transformer": set_pytorch_transformer,
 62 |                 "has_transformer": False,
 63 |                 "flush_cache_chance": 0.0,
 64 |                 "replace_listener": replace_listener,
 65 |                 "replace_listener_cfg": replace_listener_cfg,
 66 |             },
 67 |         )
 68 | 
 69 |     @property
 70 |     def tokenizer(self):
 71 |         return self.layers[0].shims[0]._hfmodel.tokenizer
 72 | 
 73 |     @property
 74 |     def transformer(self):
 75 |         return self.layers[0].shims[0]._hfmodel.transformer
 76 | 
 77 |     @property
 78 |     def _init_tokenizer_config(self):
 79 |         return self.layers[0].shims[0]._hfmodel._init_tokenizer_config
 80 | 
 81 |     @property
 82 |     def _init_transformer_config(self):
 83 |         return self.layers[0].shims[0]._hfmodel._init_transformer_config
 84 | 
 85 |     def copy(self):
 86 |         """
 87 |         Create a copy of the model, its attributes, and its parameters. Any child
 88 |         layers will also be deep-copied. The copy will receive a distinct `model.id`
 89 |         value.
 90 |         """
 91 |         copied = TransformerModel(self.name, self.attrs["get_spans"])
 92 |         params = {}
 93 |         for name in self.param_names:
 94 |             params[name] = self.get_param(name) if self.has_param(name) else None
 95 |         copied.params = copy.deepcopy(params)
 96 |         copied.dims = copy.deepcopy(self._dims)
 97 |         copied.layers[0] = copy.deepcopy(self.layers[0])
 98 |         for name in self.grad_names:
 99 |             copied.set_grad(name, self.get_grad(name).copy())
100 |         return copied
101 | 
102 | 
103 | def set_logger(model, out_file):
104 |     """Add a logger that will log memory usage to the given file.
105 | 
106 |     Used to debug OOM errors.
107 |     """
108 |     logging.basicConfig(
109 |         level="INFO", format="%(asctime)s:%(levelname)s: %(message)s", stream=out_file
110 |     )
111 |     model.attrs["logger"] = logging.getLogger(__name__)
112 | 
113 | 
114 | def set_pytorch_transformer(model, hf_model: HFObjects):
115 |     if model.attrs["has_transformer"]:
116 |         raise ValueError("Cannot set second transformer.")
117 |     model.layers[0].shims[0]._model = hf_model.transformer
118 |     model.layers[0].shims[0]._hfmodel.tokenizer = hf_model.tokenizer
119 |     model.layers[0].shims[0]._hfmodel.transformer = hf_model.transformer
120 |     model.layers[0].shims[0]._hfmodel.vocab_file_contents = hf_model.vocab_file_contents
121 |     model.attrs["has_transformer"] = True
122 |     model.set_dim("nO", hf_model.transformer.config.hidden_size)
123 | 
124 | 
125 | def init(model: TransformerModel, X=None, Y=None):
126 |     if model.attrs["has_transformer"]:
127 |         return
128 |     name = model.attrs["name"]
129 |     tok_cfg = model._init_tokenizer_config
130 |     trf_cfg = model._init_transformer_config
131 |     hf_model = huggingface_from_pretrained(name, tok_cfg, trf_cfg)
132 |     model.attrs["set_transformer"](model, hf_model)
133 |     tokenizer = model.tokenizer
134 |     # Call the model with a batch of inputs to infer the width
135 |     if X:
136 |         # If we're dealing with actual texts, do the work to setup the wordpieces
137 |         # batch properly
138 |         docs = X
139 |         get_spans = model.attrs["get_spans"]
140 |         nested_spans = get_spans(docs)
141 |         flat_spans = []
142 |         for doc_spans in nested_spans:
143 |             flat_spans.extend(doc_spans)
144 |         token_data = huggingface_tokenize(tokenizer, [span.text for span in flat_spans])
145 |         wordpieces = WordpieceBatch.from_batch_encoding(token_data)
146 |         if "offset_mapping" in token_data:
147 |             align = get_alignment_via_offset_mapping(
148 |                 flat_spans,
149 |                 token_data["offset_mapping"],
150 |             )
151 |         else:
152 |             align = get_alignment(
153 |                 flat_spans, wordpieces.strings, tokenizer.all_special_tokens
154 |             )
155 |         wordpieces, align = truncate_oversize_splits(
156 |             wordpieces, align, tokenizer.model_max_length
157 |         )
158 |     else:
159 |         texts = ["hello world", "foo bar"]
160 |         token_data = huggingface_tokenize(tokenizer, texts)
161 |         wordpieces = WordpieceBatch.from_batch_encoding(token_data)
162 |     model.layers[0].initialize(X=wordpieces)
163 |     model_output = model.layers[0].predict(wordpieces)
164 |     model.set_dim("nO", model_output.last_hidden_state.shape[-1])
165 | 
166 | 
167 | def forward(
168 |     model: TransformerModel, docs: List[Doc], is_train: bool
169 | ) -> Tuple[FullTransformerBatch, Callable]:
170 |     tokenizer = model.tokenizer
171 |     get_spans = model.attrs["get_spans"]
172 |     transformer = model.layers[0]
173 | 
174 |     nested_spans = get_spans(docs)
175 |     flat_spans = []
176 |     for doc_spans in nested_spans:
177 |         flat_spans.extend(doc_spans)
178 |     # Flush the PyTorch cache every so often. It seems to help with memory :(
179 |     # This shouldn't be necessary, I'm not sure what I'm doing wrong?
180 |     maybe_flush_pytorch_cache(chance=model.attrs.get("flush_cache_chance", 0))
181 |     if "logger" in model.attrs:
182 |         log_gpu_memory(model.attrs["logger"], "begin forward")
183 |     batch_encoding = huggingface_tokenize(tokenizer, [span.text for span in flat_spans])
184 |     wordpieces = WordpieceBatch.from_batch_encoding(batch_encoding)
185 |     if "logger" in model.attrs:
186 |         log_batch_size(model.attrs["logger"], wordpieces, is_train)
187 |     if "offset_mapping" in batch_encoding:
188 |         align = get_alignment_via_offset_mapping(
189 |             flat_spans,
190 |             batch_encoding["offset_mapping"],
191 |         )
192 |     else:
193 |         align = get_alignment(
194 |             flat_spans, wordpieces.strings, tokenizer.all_special_tokens
195 |         )
196 |     wordpieces, align = truncate_oversize_splits(
197 |         wordpieces, align, tokenizer.model_max_length
198 |     )
199 |     model_output, bp_tensors = transformer(wordpieces, is_train)
200 |     if "logger" in model.attrs:
201 |         log_gpu_memory(model.attrs["logger"], "after forward")
202 |     output = FullTransformerBatch(
203 |         spans=nested_spans,
204 |         wordpieces=wordpieces,
205 |         model_output=model_output,
206 |         align=align,
207 |     )
208 |     if "logger" in model.attrs:
209 |         log_gpu_memory(model.attrs["logger"], "return from forward")
210 | 
211 |     def backprop_transformer(d_output: FullTransformerBatch) -> List[Doc]:
212 |         if "logger" in model.attrs:
213 |             log_gpu_memory(model.attrs["logger"], "Begin backprop")
214 |         _ = bp_tensors(d_output.model_output)
215 |         if "logger" in model.attrs:
216 |             log_gpu_memory(model.attrs["logger"], "After backprop")
217 |         return docs
218 | 
219 |     return output, backprop_transformer
220 | 
221 | 
222 | def _convert_transformer_inputs(model, wps: WordpieceBatch, is_train):
223 |     # Adapter for the HFWrapper. See https://thinc.ai/docs/usage-frameworks
224 | 
225 |     hf_device = model.shims[0]._hfmodel.transformer.device
226 |     kwargs = {
227 |         "input_ids": xp2torch(wps.input_ids, device=hf_device),
228 |         "attention_mask": xp2torch(wps.attention_mask, device=hf_device),
229 |     }
230 |     if wps.token_type_ids is not None:
231 |         kwargs["token_type_ids"] = xp2torch(wps.token_type_ids, device=hf_device)
232 |     return ArgsKwargs(args=(), kwargs=kwargs), lambda dX: []
233 | 
234 | 
235 | def _convert_transformer_outputs(model, inputs_outputs, is_train):
236 |     _, model_output = inputs_outputs
237 | 
238 |     def backprop(d_model_output: ModelOutput) -> ArgsKwargs:
239 |         return ArgsKwargs(
240 |             args=(model_output.last_hidden_state,),
241 |             kwargs={"grad_tensors": d_model_output.values()},
242 |         )
243 | 
244 |     return model_output, backprop
245 | 
246 | 
247 | def huggingface_from_pretrained(
248 |     source: Union[Path, str],
249 |     tok_config: Dict,
250 |     trf_config: Dict,
251 |     config_cls=AutoConfig,
252 |     model_cls=AutoModel,
253 |     tokenizer_cls=AutoTokenizer,
254 | ) -> HFObjects:
255 |     """Create a Huggingface transformer model from pretrained weights. Will
256 |     download the model if it is not already downloaded.
257 | 
258 |     source (Union[str, Path]): The name of the model or a path to it, such as
259 |         'bert-base-cased'.
260 |     tok_config (dict): Settings to pass to the tokenizer.
261 |     trf_config (dict): Settings to pass to the transformer.
262 |     """
263 |     if isinstance(source, Path):
264 |         str_path = str(source.absolute())
265 |     else:
266 |         str_path = source
267 |     tokenizer = tokenizer_cls.from_pretrained(str_path, **tok_config)
268 |     vocab_file_contents = None
269 |     if hasattr(tokenizer, "vocab_file"):
270 |         with open(tokenizer.vocab_file, "rb") as fileh:
271 |             vocab_file_contents = fileh.read()
272 |     trf_config["return_dict"] = True
273 |     config = config_cls.from_pretrained(str_path, **trf_config)
274 |     transformer = model_cls.from_pretrained(str_path, config=config)
275 |     torch_device = get_torch_default_device()
276 |     transformer.to(torch_device)
277 |     return HFObjects(tokenizer, transformer, vocab_file_contents)
278 | 
279 | 
280 | def huggingface_tokenize(tokenizer, texts: List[str]) -> BatchEncoding:
281 |     """Apply a Huggingface tokenizer to a batch of texts."""
282 | 
283 |     # Use NumPy arrays rather than PyTorch tensors to avoid a lot of
284 |     # host <-> device transfers during tokenization and post-processing
285 |     # when a GPU is used.
286 |     token_data = tokenizer(
287 |         texts,
288 |         add_special_tokens=True,
289 |         return_attention_mask=True,
290 |         return_offsets_mapping=isinstance(tokenizer, PreTrainedTokenizerFast),
291 |         return_tensors="np",
292 |         return_token_type_ids=None,  # Sets to model default
293 |         padding="longest",
294 |     )
295 |     token_data["input_texts"] = []
296 |     for i in range(len(token_data["input_ids"])):
297 |         wp_texts = tokenizer.convert_ids_to_tokens(token_data["input_ids"][i])
298 |         token_data["input_texts"].append(wp_texts)
299 |     token_data["pad_token"] = tokenizer.pad_token
300 |     return token_data
301 | 


--------------------------------------------------------------------------------
/spacy_transformers/layers/trfs2arrays.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable, List, Optional, Tuple, cast
  2 | import numpy
  3 | from spacy.util import all_equal
  4 | from transformers.file_utils import ModelOutput
  5 | from transformers.modeling_outputs import BaseModelOutput
  6 | from thinc.api import Model
  7 | from thinc.types import Ragged, Floats2d
  8 | from ..data_classes import TransformerData
  9 | from ..align import apply_alignment
 10 | 
 11 | 
 12 | def trfs2arrays(
 13 |     pooling: Model[Ragged, Floats2d], grad_factor: float
 14 | ) -> Model[List[TransformerData], List[Floats2d]]:
 15 |     """Pool transformer data into token-aligned tensors."""
 16 |     return Model(
 17 |         "trfs2arrays",
 18 |         forward,
 19 |         layers=[pooling],
 20 |         attrs={"grad_factor": grad_factor},
 21 |     )
 22 | 
 23 | 
 24 | def forward(model: Model, trf_datas: List[TransformerData], is_train: bool):
 25 |     pooling: Model[Ragged, Floats2d] = model.layers[0]
 26 |     grad_factor = model.attrs["grad_factor"]
 27 |     zero_outputs: List[Tuple[int, Floats2d]] = []
 28 |     backprops_alignment: List[Optional[Callable]] = []
 29 |     aligned_outputs: List[Tuple[int, Ragged]] = []
 30 | 
 31 |     # For zero-length documents, we could cache the output width by iterating
 32 |     # through the batch outputs and retrieving the shape of a non-zero length
 33 |     # Doc. This, however, is not fool-proof as one can pass an entire batch of
 34 |     # zero-length Docs to the transformer model (at least during prediction).
 35 |     # Instead of being conditionally correct, we'll explicitly leave the width as
 36 |     # zero in these cases as the effective length of the resultant tensor is zero anyway.
 37 |     output_width = 0
 38 | 
 39 |     for i, trf_data in enumerate(trf_datas):
 40 |         if not isinstance(trf_data, TransformerData):
 41 |             raise ValueError(
 42 |                 "Expected spacy_transformers.data_classes.TransformerData "
 43 |                 f"in trf_data, got: {type(trf_data)}\n"
 44 |                 "Check that your pipeline contains a transformer component "
 45 |                 "with a spacy-transformers TransformerModel architecture."
 46 |             )
 47 |         if "last_hidden_state" in trf_data.model_output:
 48 |             tensor_t_i = cast(BaseModelOutput, trf_data.model_output).last_hidden_state
 49 |             if tensor_t_i.size == 0:
 50 |                 # This can happen during prediction/initialization if the transformer pipe was disabled/not executed and one of the inputs
 51 |                 # was of length zero. This causes the listenener to generate a zero-sized (in the sequence length dim) TransformerData
 52 |                 # output and pass it downstream.
 53 |                 zero_outputs.append((i, model.ops.alloc2f(0, output_width)))
 54 |                 backprops_alignment.append(None)
 55 |             else:
 56 |                 # This is the general case for non-zero length documents.
 57 |                 src = model.ops.reshape2f(tensor_t_i, -1, trf_data.width)  # type: ignore
 58 |                 dst, get_d_src = apply_alignment(model.ops, trf_data.align, src)
 59 |                 aligned_outputs.append((i, dst))
 60 |                 backprops_alignment.append(get_d_src)
 61 |         else:
 62 |             # This can happen during prediction/training for zero-length documents. Since zero-length docs
 63 |             # are implicitly ignored in the span generation stage, the transformer model does not return any
 64 |             # predictions for them and subsequently, FullTransformerBatch.split_by_doc() generates an empty
 65 |             # TransformerData.
 66 |             zero_outputs.append((i, model.ops.alloc2f(0, output_width)))
 67 |             backprops_alignment.append(None)
 68 | 
 69 |     pooling_outputs, backprop_pooling = concat_pooling_forward(
 70 |         pooling, [dst for _, dst in aligned_outputs], is_train
 71 |     )
 72 | 
 73 |     # Interleave the zero and non-zero outputs into the final result.
 74 |     outputs: List[Optional[Floats2d]] = [None] * (
 75 |         len(zero_outputs) + len(aligned_outputs)
 76 |     )
 77 |     for i, zero_output in zero_outputs:
 78 |         outputs[i] = zero_output
 79 |     for (i, _), pooling_output in zip(aligned_outputs, pooling_outputs):
 80 |         outputs[i] = pooling_output
 81 | 
 82 |     def backprop_trf_to_tensor(d_outputs: List[Floats2d]) -> List[TransformerData]:
 83 |         d_trf_datas: List[TransformerData] = []
 84 | 
 85 |         # Only update the gradients that are relevant for pooling.
 86 |         d_pooling = backprop_pooling([d_outputs[i] for i, _ in aligned_outputs])
 87 |         for (i, _), d_pooling_i in zip(aligned_outputs, d_pooling):
 88 |             d_outputs[i] = d_pooling_i
 89 | 
 90 |         to_zip = (trf_datas, d_outputs, backprops_alignment)
 91 |         assert all_equal(len(x) for x in to_zip)  # type: ignore
 92 |         zipped = zip(*to_zip)
 93 |         for trf_data, d_output, get_d_src in zipped:
 94 |             if "last_hidden_state" not in trf_data.model_output:
 95 |                 # This gradient belongs to a zero-length doc and must be ignored as it doesn't have a corresponding
 96 |                 # output from the transformer model (due to empty documents being skipped during the span generation
 97 |                 # stage in the forward pass).
 98 |                 assert len(d_output) == 0
 99 |                 assert get_d_src is None
100 |                 continue
101 | 
102 |             assert get_d_src is not None
103 |             d_model_output = ModelOutput(
104 |                 last_hidden_state=model.ops.alloc(
105 |                     trf_data.model_output.last_hidden_state.shape,  # type: ignore
106 |                     dtype=trf_data.model_output.last_hidden_state.dtype,  # type: ignore
107 |                 )
108 |             )
109 |             d_src = get_d_src(d_output)
110 |             d_src *= grad_factor
111 |             d_model_output["last_hidden_state"] = d_src.reshape(
112 |                 cast(BaseModelOutput, trf_data.model_output).last_hidden_state.shape
113 |             )
114 |             d_trf_datas.append(
115 |                 TransformerData(
116 |                     model_output=d_model_output,
117 |                     wordpieces=trf_data.wordpieces,
118 |                     align=trf_data.align,
119 |                 )
120 |             )
121 |         return d_trf_datas
122 | 
123 |     assert len(outputs) == len(trf_datas)
124 |     return outputs, backprop_trf_to_tensor
125 | 
126 | 
127 | def concat_pooling_forward(
128 |     pooling: Model[Ragged, Floats2d], X: List[Ragged], is_train: bool
129 | ):
130 |     xp = pooling.ops.xp
131 | 
132 |     datas = []
133 |     lens = []
134 |     doc_lens = []
135 |     for X_doc_data in X:
136 |         datas.append(X_doc_data.dataXd)
137 |         lens.append(X_doc_data.lengths)
138 |         doc_lens.append(len(X_doc_data.lengths))
139 | 
140 |     X_flat = Ragged(xp.concatenate(datas, axis=0), xp.concatenate(lens, axis=0))
141 |     Y_pooled, pooling_backprop = pooling(X_flat, is_train)
142 |     Y = xp.split(Y_pooled, numpy.cumsum(doc_lens)[:-1])
143 | 
144 |     def backprop(dY):
145 |         dY_pooled_flat = xp.concatenate(dY)
146 |         dY_flat = pooling_backprop(dY_pooled_flat).dataXd
147 | 
148 |         dY = []
149 |         for X_doc_data in X:
150 |             doc_unpooled_len = X_doc_data.dataXd.shape[0]
151 |             dY.append(Ragged(dY_flat[:doc_unpooled_len], X_doc_data.lengths))
152 |             dY_flat = dY_flat[doc_unpooled_len:]
153 | 
154 |         return dY
155 | 
156 |     return Y, backprop
157 | 


--------------------------------------------------------------------------------
/spacy_transformers/pipeline_component.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Callable, Iterable, Iterator, Optional, Dict, Union
  2 | import warnings
  3 | from spacy.language import Language
  4 | from spacy.pipeline.trainable_pipe import TrainablePipe
  5 | from spacy.pipeline.pipe import deserialize_config
  6 | from spacy.tokens import Doc
  7 | from spacy.vocab import Vocab
  8 | from spacy.training import Example, validate_examples
  9 | from spacy import util, Errors
 10 | from spacy.util import minibatch
 11 | from thinc.api import Model, Config, set_dropout_rate, Optimizer
 12 | import srsly
 13 | from pathlib import Path
 14 | 
 15 | from .layers.transformer_model import huggingface_from_pretrained
 16 | from .util import batch_by_length
 17 | from .annotation_setters import null_annotation_setter
 18 | from .data_classes import FullTransformerBatch, TransformerData
 19 | from .layers import TransformerListener
 20 | 
 21 | 
 22 | DEFAULT_CONFIG_STR = """
 23 | [transformer]
 24 | max_batch_items = 4096
 25 | 
 26 | [transformer.set_extra_annotations]
 27 | @annotation_setters = "spacy-transformers.null_annotation_setter.v1"
 28 | 
 29 | [transformer.model]
 30 | @architectures = "spacy-transformers.TransformerModel.v3"
 31 | name = "roberta-base"
 32 | tokenizer_config = {"use_fast": true}
 33 | transformer_config = {}
 34 | mixed_precision = false
 35 | grad_scaler_config = {}
 36 | 
 37 | [transformer.model.get_spans]
 38 | @span_getters = "spacy-transformers.strided_spans.v1"
 39 | window = 128
 40 | stride = 96
 41 | """
 42 | 
 43 | DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR)
 44 | DOC_EXT_ATTR = "trf_data"
 45 | 
 46 | 
 47 | @Language.factory(
 48 |     "transformer",
 49 |     assigns=[f"doc._.{DOC_EXT_ATTR}"],
 50 |     default_config=DEFAULT_CONFIG["transformer"],
 51 | )
 52 | def make_transformer(
 53 |     nlp: Language,
 54 |     name: str,
 55 |     model: Model[List[Doc], FullTransformerBatch],
 56 |     set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None],
 57 |     max_batch_items: int,
 58 | ):
 59 |     """Construct a Transformer component, which lets you plug a model from the
 60 |     Huggingface transformers library into spaCy so you can use it in your
 61 |     pipeline. One or more subsequent spaCy components can use the transformer
 62 |     outputs as features in its model, with gradients backpropagated to the single
 63 |     shared weights.
 64 | 
 65 |     model (Model[List[Doc], FullTransformerBatch]): A thinc Model object wrapping
 66 |         the transformer. Usually you will want to use the TransformerModel
 67 |         layer for this.
 68 |     set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]): A
 69 |         callback to set additional information onto the batch of `Doc` objects.
 70 |         The doc._.trf_data attribute is set prior to calling the callback.
 71 |         By default, no additional annotations are set.
 72 |     """
 73 |     return Transformer(
 74 |         nlp.vocab,
 75 |         model,
 76 |         set_extra_annotations,
 77 |         max_batch_items=max_batch_items,
 78 |         name=name,
 79 |     )
 80 | 
 81 | 
 82 | def install_extensions() -> None:
 83 |     if not Doc.has_extension(DOC_EXT_ATTR):
 84 |         Doc.set_extension(DOC_EXT_ATTR, default=None)
 85 | 
 86 | 
 87 | class Transformer(TrainablePipe):
 88 |     """spaCy pipeline component that provides access to a transformer model from
 89 |     the Huggingface transformers library. Usually you will connect subsequent
 90 |     components to the shared transformer using the TransformerListener layer.
 91 |     This works similarly to spaCy's Tok2Vec component and Tok2VecListener
 92 |     sublayer.
 93 | 
 94 |     The activations from the transformer are saved in the doc._.trf_data extension
 95 |     attribute. You can also provide a callback to set additional annotations.
 96 | 
 97 |     vocab (Vocab): The Vocab object for the pipeline.
 98 |     model (Model[List[Doc], FullTransformerBatch]): A thinc Model object wrapping
 99 |         the transformer. Usually you will want to use the TransformerModel
100 |         layer for this.
101 |     set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]): A
102 |         callback to set additional information onto the batch of `Doc` objects.
103 |         The doc._.trf_data attribute is set prior to calling the callback.
104 |         By default, no additional annotations are set.
105 |     """
106 | 
107 |     def __init__(
108 |         self,
109 |         vocab: Vocab,
110 |         model: Model[List[Doc], FullTransformerBatch],
111 |         set_extra_annotations: Callable = null_annotation_setter,
112 |         *,
113 |         name: str = "transformer",
114 |         max_batch_items: int = 128 * 32,  # Max size of padded batch
115 |     ):
116 |         """Initialize the transformer component."""
117 |         self.name = name
118 |         self.vocab = vocab
119 |         self.model = model
120 |         if not isinstance(self.model, Model):
121 |             raise ValueError(f"Expected Thinc Model, got: {type(self.model)}")
122 |         self.set_extra_annotations = set_extra_annotations
123 |         self.cfg = {"max_batch_items": max_batch_items}
124 |         self.listener_map: Dict[str, List[TransformerListener]] = {}
125 |         install_extensions()
126 | 
127 |     @property
128 |     def listeners(self) -> List[TransformerListener]:
129 |         """RETURNS (List[TransformerListener]): The listener models listening
130 |         to this component. Usually internals.
131 |         """
132 |         return [m for c in self.listening_components for m in self.listener_map[c]]
133 | 
134 |     @property
135 |     def listening_components(self) -> List[str]:
136 |         """RETURNS (List[str]): The downstream components listening to this
137 |         component. Usually internals.
138 |         """
139 |         return list(self.listener_map.keys())
140 | 
141 |     def add_listener(self, listener: TransformerListener, component_name: str) -> None:
142 |         """Add a listener for a downstream component. Usually internals."""
143 |         self.listener_map.setdefault(component_name, [])
144 |         if listener not in self.listener_map[component_name]:
145 |             self.listener_map[component_name].append(listener)
146 |         if self.model.has_dim("nO") and listener.has_dim("nO") is None:
147 |             listener.set_dim("nO", self.model.get_dim("nO"))
148 | 
149 |     def remove_listener(
150 |         self, listener: TransformerListener, component_name: str
151 |     ) -> bool:
152 |         """Remove a listener for a downstream component. Usually internals."""
153 |         if component_name in self.listener_map:
154 |             if listener in self.listener_map[component_name]:
155 |                 self.listener_map[component_name].remove(listener)
156 |                 # If no listeners are left, remove entry
157 |                 if not self.listener_map[component_name]:
158 |                     del self.listener_map[component_name]
159 |                 return True
160 |         return False
161 | 
162 |     def find_listeners(self, component) -> None:
163 |         """Walk over a model of a processing component, looking for layers that
164 |         are TransformerListener subclasses that have an upstream_name that
165 |         matches this component.
166 |         Listeners can also set their upstream_name attribute to the wildcard
167 |         string '*' to match any `Transformer`.
168 | 
169 |         You're unlikely to ever need multiple `Transformer` components, so it's
170 |         fine to leave your listeners upstream_name on '*'.
171 |         """
172 |         names = ("*", self.name)
173 |         if isinstance(getattr(component, "model", None), Model):
174 |             for node in component.model.walk():
175 |                 if (
176 |                     isinstance(node, TransformerListener)
177 |                     and node.upstream_name in names
178 |                 ):
179 |                     self.add_listener(node, component.name)
180 | 
181 |     def __call__(self, doc: Doc) -> Doc:
182 |         """Apply the pipe to one document. The document is modified in place,
183 |         and returned. This usually happens under the hood when the nlp object
184 |         is called on a text and all components are applied to the Doc.
185 | 
186 |         docs (Doc): The Doc to process.
187 |         RETURNS (Doc): The processed Doc.
188 | 
189 |         DOCS: https://spacy.io/api/transformer#call
190 |         """
191 |         install_extensions()
192 |         outputs = self.predict([doc])
193 |         self.set_annotations([doc], outputs)
194 |         return doc
195 | 
196 |     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
197 |         """Apply the pipe to a stream of documents. This usually happens under
198 |         the hood when the nlp object is called on a text and all components are
199 |         applied to the Doc.
200 | 
201 |         stream (Iterable[Doc]): A stream of documents.
202 |         batch_size (int): The number of documents to buffer.
203 |         YIELDS (Doc): Processed documents in order.
204 | 
205 |         DOCS: https://spacy.io/api/transformer#pipe
206 |         """
207 |         install_extensions()
208 |         for outer_batch in minibatch(stream, batch_size):
209 |             outer_batch = list(outer_batch)
210 |             for indices in batch_by_length(outer_batch, self.cfg["max_batch_items"]):
211 |                 subbatch = [outer_batch[i] for i in indices]
212 |                 self.set_annotations(subbatch, self.predict(subbatch))
213 |             yield from outer_batch
214 | 
215 |     def predict(self, docs: Iterable[Doc]) -> FullTransformerBatch:
216 |         """Apply the pipeline's model to a batch of docs, without modifying them.
217 |         Returns the extracted features as the FullTransformerBatch dataclass.
218 | 
219 |         docs (Iterable[Doc]): The documents to predict.
220 |         RETURNS (FullTransformerBatch): The extracted features.
221 | 
222 |         DOCS: https://spacy.io/api/transformer#predict
223 |         """
224 |         docs = list(docs)
225 |         if not any(len(doc) for doc in docs):
226 |             # Handle cases where there are no tokens in any docs.
227 |             activations = FullTransformerBatch.empty(len(docs))
228 |         else:
229 |             activations = self.model.predict(docs)
230 |         return activations
231 | 
232 |     def set_annotations(
233 |         self, docs: Iterable[Doc], predictions: FullTransformerBatch
234 |     ) -> None:
235 |         """Assign the extracted features to the Doc objects. By default, the
236 |         TransformerData object is written to the doc._.trf_data attribute. Your
237 |         set_extra_annotations callback is then called, if provided.
238 | 
239 |         docs (Iterable[Doc]): The documents to modify.
240 |         predictions: (FullTransformerBatch): A batch of activations.
241 | 
242 |         DOCS: https://spacy.io/api/pipe#set_annotations
243 |         """
244 |         doc_data = list(predictions.doc_data)
245 |         for doc, data in zip(docs, doc_data):
246 |             doc._.trf_data = data
247 |         self.set_extra_annotations(list(docs), predictions)
248 | 
249 |     def update(
250 |         self,
251 |         examples: Iterable[Example],
252 |         *,
253 |         drop: float = 0.0,
254 |         sgd: Optional[Optimizer] = None,
255 |         losses: Optional[Dict[str, float]] = None,
256 |     ) -> Dict[str, float]:
257 |         """Prepare for an update to the transformer.
258 | 
259 |         Like the `Tok2Vec` component, the `Transformer` component is unusual
260 |         in that it does not receive "gold standard" annotations to calculate
261 |         a weight update. The optimal output of the transformer data is unknown;
262 |         it's a hidden layer inside the network that is updated by backpropagating
263 |         from output layers.
264 | 
265 |         The `Transformer` component therefore does not perform a weight update
266 |         during its own `update` method. Instead, it runs its transformer model
267 |         and communicates the output and the backpropagation callback to any
268 |         downstream components that have been connected to it via the
269 |         TransformerListener sublayer. If there are multiple listeners, the last
270 |         layer will actually backprop to the transformer and call the optimizer,
271 |         while the others simply increment the gradients.
272 | 
273 |         examples (Iterable[Example]):
274 |             A batch of Example objects. Only the `predicted` doc object is used,
275 |             the reference doc is ignored.
276 |         drop (float): The dropout rate.
277 |         sgd (thinc.api.Optimizer): The optimizer.
278 |         losses (Dict[str, float]): Optional record of the loss during training.
279 |             Updated using the component name as the key.
280 |         RETURNS (Dict[str, float]): The updated losses dictionary.
281 | 
282 |         DOCS: https://spacy.io/api/transformer#update
283 |         """
284 |         validate_examples(examples, "Transformer.update")
285 |         if losses is None:
286 |             losses = {}
287 |         docs = [eg.predicted for eg in examples]
288 |         if isinstance(docs, Doc):
289 |             docs = [docs]
290 |         if not any(len(doc) for doc in docs):
291 |             # Handle cases where there are no tokens in any docs.
292 |             return losses
293 |         set_dropout_rate(self.model, drop)
294 |         trf_full, bp_trf_full = self.model.begin_update(docs)
295 |         d_tensors: List = []
296 |         losses.setdefault(self.name, 0.0)
297 | 
298 |         def accumulate_gradient(d_trf_datas: List[TransformerData]):
299 |             """Accumulate tok2vec loss and gradient. This is passed as a callback
300 |             to all but the last listener. Only the last one does the backprop.
301 |             """
302 |             nonlocal d_tensors
303 |             for i, d_trf_data in enumerate(d_trf_datas):
304 |                 for d_tensor in d_trf_data.tensors:
305 |                     losses[self.name] += float((d_tensor**2).sum())  # type:ignore
306 |                 if i >= len(d_tensors):
307 |                     d_tensors.append(list(d_trf_data.tensors))
308 |                 else:
309 |                     for j, d_tensor in enumerate(d_trf_data.tensors):
310 |                         d_tensors[i][j] += d_tensor
311 | 
312 |         def backprop(d_trf_datas: List[TransformerData]):
313 |             """Callback to actually do the backprop. Passed to last listener."""
314 |             nonlocal d_tensors
315 |             accumulate_gradient(d_trf_datas)
316 |             d_trf_full = trf_full.unsplit_by_doc(d_tensors)
317 |             d_docs = bp_trf_full(d_trf_full)  # type: ignore
318 |             if sgd is not None:
319 |                 self.model.finish_update(sgd)
320 |             d_tensors = []
321 |             return d_docs
322 | 
323 |         batch_id = TransformerListener.get_batch_id(docs)
324 |         for listener in self.listeners[:-1]:
325 |             listener.receive(batch_id, trf_full.doc_data, accumulate_gradient)
326 |         if self.listeners:
327 |             self.listeners[-1].receive(batch_id, trf_full.doc_data, backprop)
328 |         return losses
329 | 
330 |     def get_loss(self, docs, golds, scores):
331 |         """A noop function, for compatibility with the Pipe API. See the `update`
332 |         method for an explanation of the loss mechanics of the component.
333 |         """
334 |         pass
335 | 
336 |     def initialize(
337 |         self,
338 |         get_examples: Callable[[], Iterable[Example]],
339 |         *,
340 |         nlp: Optional[Language] = None,
341 |     ):
342 |         """Initialize the pipe for training, using data examples if available.
343 | 
344 |         get_examples (Callable[[], Iterable[Example]]): Optional function that
345 |             returns gold-standard Example objects.
346 |         nlp (Language): The current nlp object.
347 | 
348 |         DOCS: https://spacy.io/api/transformer#initialize
349 |         """
350 |         docs = [Doc(Vocab(), words=["hello"])]
351 |         self.model.initialize(X=docs)
352 |         if nlp is not None:
353 |             for i, (name1, proc1) in enumerate(nlp.pipeline):
354 |                 if proc1 is self:
355 |                     for name2, proc2 in nlp.pipeline[i:]:
356 |                         self.find_listeners(proc2)
357 |                     break
358 | 
359 |     def to_disk(
360 |         self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
361 |     ) -> None:
362 |         """Serialize the pipe to disk.
363 | 
364 |         path (str / Path): Path to a directory.
365 |         exclude (Iterable[str]): String names of serialization fields to exclude.
366 | 
367 |         DOCS: https://spacy.io/api/transformer#to_disk
368 |         """
369 |         serialize = {}
370 |         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
371 |         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
372 |         serialize["model"] = lambda p: self.model.to_disk(p)
373 |         util.to_disk(path, serialize, exclude)
374 | 
375 |     def from_disk(
376 |         self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
377 |     ) -> "Transformer":
378 |         """Load the pipe from disk.
379 | 
380 |         path (str / Path): Path to a directory.
381 |         exclude (Iterable[str]): String names of serialization fields to exclude.
382 |         RETURNS (Transformer): The loaded object.
383 | 
384 |         DOCS: https://spacy.io/api/transformer#from_disk
385 |         """
386 | 
387 |         def load_model(p):
388 |             try:
389 |                 with open(p, "rb") as mfile:
390 |                     self.model.from_bytes(mfile.read())
391 |             except AttributeError:
392 |                 raise ValueError(Errors.E149) from None
393 |             except (IsADirectoryError, PermissionError):
394 |                 warn_msg = (
395 |                     "Automatically converting a transformer component "
396 |                     "from spacy-transformers v1.0 to v1.1+. If you see errors "
397 |                     "or degraded performance, download a newer compatible "
398 |                     "model or retrain your custom model with the current "
399 |                     "spacy-transformers version. For more details and "
400 |                     "available updates, run: python -m spacy validate"
401 |                 )
402 |                 warnings.warn(warn_msg)
403 |                 p = Path(p).absolute()
404 |                 hf_model = huggingface_from_pretrained(
405 |                     p,
406 |                     self.model._init_tokenizer_config,
407 |                     self.model._init_transformer_config,
408 |                 )
409 |                 self.model.attrs["set_transformer"](self.model, hf_model)
410 | 
411 |         deserialize = {
412 |             "vocab": self.vocab.from_disk,
413 |             "cfg": lambda p: self.cfg.update(deserialize_config(p)),
414 |             "model": load_model,
415 |         }
416 |         util.from_disk(path, deserialize, exclude)  # type: ignore
417 |         return self
418 | 


--------------------------------------------------------------------------------
/spacy_transformers/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/py.typed


--------------------------------------------------------------------------------
/spacy_transformers/span_getters.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable, Iterable, List
 2 | from functools import partial
 3 | from spacy.tokens import Doc, Span
 4 | 
 5 | from .util import registry
 6 | 
 7 | SpannerT = Callable[[List[Doc]], List[List[Span]]]
 8 | 
 9 | 
10 | def get_strided_spans(
11 |     docs: Iterable[Doc], window: int, stride: int
12 | ) -> List[List[Span]]:
13 |     spans: List[List[Span]] = []
14 |     for doc in docs:
15 |         start = 0
16 |         spans.append([])
17 |         for i in range(len(doc) // stride):
18 |             spans[-1].append(doc[start : start + window])
19 |             if (start + window) >= len(doc):
20 |                 break
21 |             start += stride
22 |         else:
23 |             if start < len(doc):
24 |                 spans[-1].append(doc[start:])
25 |     return spans
26 | 
27 | 
28 | @registry.span_getters("spacy-transformers.strided_spans.v1")  # type: ignore
29 | def configure_strided_spans(window: int, stride: int) -> SpannerT:
30 |     """
31 |     Set the 'window' and 'stride' options for getting strided spans.
32 | 
33 |     If you set the window and stride to the same value, the spans will cover
34 |     each token once. Setting 'stride' lower than 'window' will allow for an
35 |     overlap, so that some tokens are counted twice. This can be desirable,
36 |     because it allows all tokens to have both a left and right context.
37 |     """
38 |     return partial(get_strided_spans, window=window, stride=stride)
39 | 
40 | 
41 | def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
42 |     return [list(doc.sents) for doc in docs]
43 | 
44 | 
45 | @registry.span_getters("spacy-transformers.sent_spans.v1")  # type: ignore
46 | def configure_get_sent_spans() -> Callable:
47 |     """
48 |     Create a `span_getter` that uses sentence boundary markers to extract
49 |     the spans. This requires sentence boundaries to be set, and may result
50 |     in somewhat uneven batches, depending on the sentence lengths. However,
51 |     it does provide the transformer with more meaningful windows to attend over.
52 |     """
53 |     return get_sent_spans
54 | 
55 | 
56 | def get_doc_spans(docs: Iterable[Doc]) -> List[List[Span]]:
57 |     return [[doc[:]] for doc in docs]
58 | 
59 | 
60 | @registry.span_getters("spacy-transformers.doc_spans.v1")  # type: ignore
61 | def configure_get_doc_spans() -> Callable:
62 |     """
63 |     Create a `span_getter` that uses the whole document as its spans. This is
64 |     the best approach if your `Doc` objects already refer to relatively short
65 |     texts.
66 |     """
67 |     return get_doc_spans
68 | 
69 | 
70 | __all__ = [
71 |     "get_sent_spans",
72 |     "get_doc_spans",
73 |     "configure_get_doc_spans",
74 |     "configure_get_sent_spans",
75 |     "configure_strided_spans",
76 | ]
77 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/tests/__init__.py


--------------------------------------------------------------------------------
/spacy_transformers/tests/enable_gpu.py:
--------------------------------------------------------------------------------
1 | from spacy import require_gpu
2 | 
3 | require_gpu()
4 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/regression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/tests/regression/__init__.py


--------------------------------------------------------------------------------
/spacy_transformers/tests/regression/test_spacy_issue6401.py:
--------------------------------------------------------------------------------
 1 | import spacy
 2 | from spacy.training.example import Example
 3 | from spacy.util import make_tempdir
 4 | from spacy import util
 5 | from thinc.api import Config
 6 | 
 7 | 
 8 | TRAIN_DATA = [
 9 |     ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
10 |     ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
11 | ]
12 | 
13 | 
14 | cfg_string = """
15 |     [nlp]
16 |     lang = "en"
17 |     pipeline = ["transformer","textcat"]
18 | 
19 |     [components]
20 | 
21 |     [components.textcat]
22 |     factory = "textcat"
23 | 
24 |     [components.textcat.model]
25 |     @architectures = "spacy.TextCatEnsemble.v2"
26 | 
27 |     [components.textcat.model.tok2vec]
28 |     @architectures = "spacy-transformers.TransformerListener.v1"
29 |     grad_factor = 1.0
30 | 
31 |     [components.textcat.model.tok2vec.pooling]
32 |     @layers = "reduce_mean.v1"
33 | 
34 |     [components.transformer]
35 |     factory = "transformer"
36 | 
37 |     [components.transformer.model]
38 |     name = "distilbert-base-uncased"
39 |     """
40 | 
41 | 
42 | def test_transformer_pipeline_textcat():
43 |     """Test that a pipeline with just a transformer+textcat runs and trains properly.
44 |     This used to throw an error because of shape inference issues -
45 |     cf https://github.com/explosion/spaCy/issues/6401"""
46 |     orig_config = Config().from_str(cfg_string)
47 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
48 |     assert nlp.pipe_names == ["transformer", "textcat"]
49 |     train_examples = []
50 | 
51 |     for text, annotations in TRAIN_DATA:
52 |         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
53 |     optimizer = nlp.initialize(get_examples=lambda: train_examples)
54 | 
55 |     for i in range(2):
56 |         losses = {}
57 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
58 | 
59 |     doc = nlp("We're interested at underwater basket weaving.")
60 |     cats1 = doc.cats
61 | 
62 |     # ensure IO goes OK
63 |     with make_tempdir() as d:
64 |         file_path = d / "trained_nlp"
65 |         nlp.to_disk(file_path)
66 |         nlp2 = spacy.load(file_path)
67 |         doc2 = nlp2("We're interested at underwater basket weaving.")
68 |         cats2 = doc2.cats
69 |         assert cats1 == cats2
70 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/regression/test_spacy_issue7029.py:
--------------------------------------------------------------------------------
 1 | from spacy.lang.en import English
 2 | from spacy.training import Example
 3 | from spacy.util import load_config_from_str
 4 | 
 5 | CONFIG = """
 6 | [nlp]
 7 | lang = "en"
 8 | pipeline = ["transformer", "tagger"]
 9 | 
10 | [components]
11 | 
12 | [components.transformer]
13 | factory = "transformer"
14 | 
15 | [components.transformer.model]
16 | name = "distilbert-base-uncased"
17 | 
18 | [components.tagger]
19 | factory = "tagger"
20 | 
21 | [components.tagger.model]
22 | @architectures = "spacy.Tagger.v1"
23 | nO = null
24 | 
25 | [components.tagger.model.tok2vec]
26 | @architectures = "spacy-transformers.TransformerListener.v1"
27 | grad_factor = 1.0
28 | 
29 | [components.tagger.model.tok2vec.pooling]
30 | @layers = "reduce_mean.v1"
31 | """
32 | 
33 | 
34 | TRAIN_DATA = [
35 |     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
36 |     ("", {}),
37 |     ("Eat blue ham", {"tags": ["V", "J", "N"]}),
38 | ]
39 | 
40 | 
41 | def test_empty_doc():
42 |     """Test that an empty document gets processed correctly"""
43 |     nlp = English.from_config(load_config_from_str(CONFIG))
44 |     train_examples = []
45 |     for t in TRAIN_DATA:
46 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
47 |     optimizer = nlp.initialize(get_examples=lambda: train_examples)
48 |     for i in range(2):
49 |         losses = {}
50 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
51 |     texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
52 | 
53 |     # run as normal
54 |     nlp.select_pipes(enable=["transformer", "tagger"])
55 |     docs1 = list(nlp.pipe(texts, batch_size=1))
56 |     docs2 = list(nlp.pipe(texts, batch_size=4))
57 |     assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
58 | 
59 |     # disable the transformer (the listener will produce random output)
60 |     nlp.select_pipes(enable=["tagger"])
61 |     docs1 = list(nlp.pipe(texts, batch_size=1))
62 |     docs2 = list(nlp.pipe(texts, batch_size=4))
63 |     assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]
64 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_alignment.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from typing import List
  3 | import numpy
  4 | from spacy.tokens import Doc
  5 | from spacy.vocab import Vocab
  6 | from thinc.api import NumpyOps
  7 | from thinc.types import Ragged
  8 | from ..align import get_alignment, apply_alignment
  9 | from ..align import get_span2wp_from_offset_mapping
 10 | 
 11 | 
 12 | def get_ragged(ops, nested: List[List[int]]):
 13 |     nested = [ops.asarray(x) for x in nested]
 14 |     return Ragged(ops.flatten(nested), ops.asarray([len(x) for x in nested]))
 15 | 
 16 | 
 17 | def get_spans(word_seqs):
 18 |     vocab = Vocab()
 19 |     docs = [Doc(vocab, words=words) for words in word_seqs]
 20 |     return [doc[:] for doc in docs]
 21 | 
 22 | 
 23 | def flatten_strings(words1, words2):
 24 |     flat1 = []
 25 |     flat2 = []
 26 |     for seq in words1:
 27 |         flat1.extend(seq)
 28 |     stride = max((len(seq) for seq in words2), default=0)
 29 |     for seq in words2:
 30 |         flat2.extend(seq)
 31 |         flat2.extend([""] * (stride - len(seq)))
 32 |     return flat1, flat2
 33 | 
 34 | 
 35 | @pytest.mark.parametrize(
 36 |     "words1,words2",
 37 |     [
 38 |         ([["a", "b"]], [["a", "b"]]),
 39 |         ([["ab"]], [["a", "b"]]),
 40 |         ([["a", "b"]], [["ab"]]),
 41 |         ([["ab", "c"]], [["a", "bc"]]),
 42 |         ([["ab", "cd"]], [["a", "bc", "d"]]),
 43 |     ],
 44 | )
 45 | def test_alignments_match(words1, words2):
 46 |     spans = get_spans(words1)
 47 |     align = get_alignment(spans, words2)
 48 |     unique_tokens = set()
 49 |     for span in spans:
 50 |         for token in span:
 51 |             unique_tokens.add((id(token.doc), token.idx))
 52 |     assert len(unique_tokens) == align.lengths.shape[0]
 53 |     flat_words1, flat_words2 = flatten_strings(words1, words2)
 54 |     for i, word in enumerate(flat_words1):
 55 |         wp_word = "".join([flat_words2[int(j[0])] for j in align[i].data])
 56 |         if len(word) < len(wp_word):
 57 |             assert word in wp_word
 58 |         elif len(word) > len(wp_word):
 59 |             assert wp_word in word
 60 |         else:
 61 |             assert word == wp_word
 62 | 
 63 | 
 64 | @pytest.mark.parametrize(
 65 |     "nested_align,X_cols",
 66 |     [
 67 |         ([[0, 1, 2], [3], [4]], 4),
 68 |         ([[], [1], [1], [2]], 2),
 69 |         ([[0, 1], [1, 2], [], [4]], 2),
 70 |     ],
 71 | )
 72 | def test_apply_alignment(nested_align, X_cols):
 73 |     ops = NumpyOps()
 74 |     align = get_ragged(ops, nested_align)
 75 |     X_shape = (align.data.max() + 1, X_cols)
 76 |     X = ops.alloc2f(*X_shape)
 77 |     Y, get_dX = apply_alignment(ops, align, X)
 78 |     assert isinstance(Y, Ragged)
 79 |     assert Y.data.shape[0] == align.data.shape[0]
 80 |     assert Y.lengths.shape[0] == len(nested_align)
 81 |     dX = get_dX(Y)
 82 |     assert dX.shape == X.shape
 83 | 
 84 | 
 85 | @pytest.mark.parametrize(
 86 |     # fmt: off
 87 |     # roberta-base offset_mapping and expected alignment
 88 |     "words,offset_mapping,alignment",
 89 |     [
 90 |         (
 91 |             ["Áaaa"],
 92 |             numpy.asarray([(0, 0), (0, 1), (0, 1), (1, 4), (0, 0)], dtype="i"),
 93 |             [[1, 2, 3]],
 94 |         ),
 95 |         (
 96 |             ["INGG", "á", "aäa"],
 97 |             numpy.asarray([(0, 0), (0, 3), (3, 4), (5, 6), (5, 6), (7, 8), (8, 9), (9, 10), (0, 0)], dtype="i"),
 98 |             [[1, 2], [3, 4], [5, 6, 7]],
 99 |         ),
100 |     ],
101 |     # fmt: on
102 | )
103 | def test_offset_alignment(words, offset_mapping, alignment):
104 |     spans = get_spans([words])
105 |     result = get_span2wp_from_offset_mapping(spans[0], offset_mapping)
106 |     assert all(sorted(r) == a for r, a in zip(result, alignment))
107 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_configs.py:
--------------------------------------------------------------------------------
 1 | from functools import partial
 2 | 
 3 | import pytest
 4 | import spacy
 5 | from spacy.training import Example
 6 | from spacy.training.initialize import init_nlp
 7 | from spacy.util import CONFIG_SECTION_ORDER
 8 | from spacy.language import DEFAULT_CONFIG
 9 | from thinc.config import Config
10 | 
11 | 
12 | TRAIN_TAGGER_DATA = [
13 |     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
14 |     ("Eat blue ham", {"tags": ["V", "J", "N"]}),
15 | ]
16 | 
17 | 
18 | cfg_string = """
19 |     [nlp]
20 |     lang = "en"
21 |     pipeline = ["custom_transformer","tagger"]
22 | 
23 |     [components]
24 | 
25 |     [components.tagger]
26 |     factory = "tagger"
27 | 
28 |     [components.tagger.model]
29 |     @architectures = "spacy.Tagger.v1"
30 |     nO = null
31 | 
32 |     [components.tagger.model.tok2vec]
33 |     @architectures = "spacy-transformers.TransformerListener.v1"
34 |     grad_factor = 1.0
35 |     upstream = "custom_transformer"
36 | 
37 |     [components.tagger.model.tok2vec.pooling]
38 |     @layers = "reduce_mean.v1"
39 | 
40 |     [components.custom_transformer]
41 |     factory = "transformer"
42 | 
43 |     [corpora]
44 |     @readers = toy_tagger_data.v1
45 |     
46 |     [initialize]
47 |     
48 |     [initialize.components]
49 |     
50 |     [initialize.components.tagger]
51 |     labels = ["LABEL"]
52 |     """
53 | 
54 | 
55 | @pytest.mark.parametrize("config_string", [cfg_string])
56 | def test_init_nlp(config_string):
57 |     @spacy.registry.readers.register("toy_tagger_data.v1")
58 |     def read_tagger_data():
59 |         def parse_data(nlp, index):
60 |             ex = TRAIN_TAGGER_DATA[index]
61 |             yield Example.from_dict(nlp.make_doc(ex[0]), ex[1])
62 | 
63 |         return {
64 |             "train": partial(parse_data, index=0),
65 |             "dev": partial(parse_data, index=1),
66 |         }
67 | 
68 |     config = spacy.util.load_config_from_str(config_string, interpolate=False)
69 |     config = Config(DEFAULT_CONFIG, section_order=CONFIG_SECTION_ORDER).merge(config)
70 |     nlp = init_nlp(config, use_gpu=False)
71 |     assert nlp is not None
72 | 
73 |     tagger = nlp.get_pipe("tagger")
74 |     transformer = nlp.get_pipe("custom_transformer")
75 |     tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
76 |     assert tagger_trf.upstream_name == "custom_transformer"
77 |     assert transformer.listeners[0] == tagger_trf
78 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_data_classes.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import numpy
 3 | from numpy.testing import assert_equal
 4 | from spacy_transformers.data_classes import WordpieceBatch
 5 | 
 6 | 
 7 | @pytest.fixture
 8 | def wordpieces():
 9 |     strings = [["some", "random", "strings"], ["are"], ["added", "here"]]
10 |     shape = (len(strings), max(len(seq) for seq in strings))
11 |     wordpieces = WordpieceBatch(
12 |         strings=strings,
13 |         input_ids=numpy.zeros(shape, dtype="i"),
14 |         token_type_ids=numpy.zeros(shape, dtype="i"),
15 |         attention_mask=numpy.zeros((shape[0], shape[1]), dtype="bool"),
16 |         lengths=[len(seq) for seq in strings],
17 |     )
18 |     return wordpieces
19 | 
20 | 
21 | def test_wordpieces_IO(wordpieces):
22 |     wp_dict = wordpieces.to_dict()
23 |     wordpieces_2 = WordpieceBatch.empty().from_dict(wp_dict)
24 |     for key, value in wordpieces_2.to_dict().items():
25 |         assert_equal(value, wp_dict[key])
26 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_deprecations.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from spacy_transformers.util import huggingface_from_pretrained
 3 | from spacy_transformers.util import huggingface_tokenize
 4 | 
 5 | 
 6 | def test_deprecation_warnings():
 7 |     with pytest.warns(DeprecationWarning):
 8 |         tokenizer, transformer = huggingface_from_pretrained(
 9 |             "distilbert-base-uncased", {}
10 |         )
11 |     with pytest.warns(DeprecationWarning):
12 |         token_data = huggingface_tokenize(tokenizer, ["a", "b", "c"])
13 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_model_sequence_classification.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable
  2 | from functools import partial
  3 | import copy
  4 | 
  5 | import torch
  6 | from transformers import AutoModelForSequenceClassification
  7 | from transformers.models.distilbert.modeling_distilbert import (
  8 |     DistilBertForSequenceClassification,
  9 | )
 10 | from transformers.modeling_outputs import SequenceClassifierOutput
 11 | 
 12 | import spacy
 13 | from thinc.api import Model
 14 | 
 15 | from spacy_transformers.data_classes import HFObjects, WordpieceBatch
 16 | from spacy_transformers.layers.hf_wrapper import HFWrapper
 17 | from spacy_transformers.layers.transformer_model import _convert_transformer_inputs
 18 | from spacy_transformers.layers.transformer_model import _convert_transformer_outputs
 19 | from spacy_transformers.layers.transformer_model import forward
 20 | from spacy_transformers.layers.transformer_model import huggingface_from_pretrained
 21 | from spacy_transformers.layers.transformer_model import huggingface_tokenize
 22 | from spacy_transformers.layers.transformer_model import set_pytorch_transformer
 23 | from spacy_transformers.span_getters import get_strided_spans
 24 | 
 25 | 
 26 | def test_model_for_sequence_classification():
 27 |     # adapted from https://github.com/KennethEnevoldsen/spacy-wrap/
 28 |     class ClassificationTransformerModel(Model):
 29 |         def __init__(
 30 |             self,
 31 |             name: str,
 32 |             get_spans: Callable,
 33 |             tokenizer_config: dict = {},
 34 |             transformer_config: dict = {},
 35 |             mixed_precision: bool = False,
 36 |             grad_scaler_config: dict = {},
 37 |         ):
 38 |             hf_model = HFObjects(None, None, None, tokenizer_config, transformer_config)
 39 |             wrapper = HFWrapper(
 40 |                 hf_model,
 41 |                 convert_inputs=_convert_transformer_inputs,
 42 |                 convert_outputs=_convert_transformer_outputs,
 43 |                 mixed_precision=mixed_precision,
 44 |                 grad_scaler_config=grad_scaler_config,
 45 |                 model_cls=AutoModelForSequenceClassification,
 46 |             )
 47 |             super().__init__(
 48 |                 "clf_transformer",
 49 |                 forward,
 50 |                 init=init,
 51 |                 layers=[wrapper],
 52 |                 dims={"nO": None},
 53 |                 attrs={
 54 |                     "get_spans": get_spans,
 55 |                     "name": name,
 56 |                     "set_transformer": set_pytorch_transformer,
 57 |                     "has_transformer": False,
 58 |                     "flush_cache_chance": 0.0,
 59 |                 },
 60 |             )
 61 | 
 62 |         @property
 63 |         def tokenizer(self):
 64 |             return self.layers[0].shims[0]._hfmodel.tokenizer
 65 | 
 66 |         @property
 67 |         def transformer(self):
 68 |             return self.layers[0].shims[0]._hfmodel.transformer
 69 | 
 70 |         @property
 71 |         def _init_tokenizer_config(self):
 72 |             return self.layers[0].shims[0]._hfmodel._init_tokenizer_config
 73 | 
 74 |         @property
 75 |         def _init_transformer_config(self):
 76 |             return self.layers[0].shims[0]._hfmodel._init_transformer_config
 77 | 
 78 |         def copy(self):
 79 |             """
 80 |             Create a copy of the model, its attributes, and its parameters. Any child
 81 |             layers will also be deep-copied. The copy will receive a distinct `model.id`
 82 |             value.
 83 |             """
 84 |             copied = ClassificationTransformerModel(self.name, self.attrs["get_spans"])
 85 |             params = {}
 86 |             for name in self.param_names:
 87 |                 params[name] = self.get_param(name) if self.has_param(name) else None
 88 |             copied.params = copy.deepcopy(params)
 89 |             copied.dims = copy.deepcopy(self._dims)
 90 |             copied.layers[0] = copy.deepcopy(self.layers[0])
 91 |             for name in self.grad_names:
 92 |                 copied.set_grad(name, self.get_grad(name).copy())
 93 |             return copied
 94 | 
 95 |     def init(model: ClassificationTransformerModel, X=None, Y=None):
 96 |         if model.attrs["has_transformer"]:
 97 |             return
 98 |         name = model.attrs["name"]
 99 |         tok_cfg = model._init_tokenizer_config
100 |         trf_cfg = model._init_transformer_config
101 |         hf_model = huggingface_from_pretrained(
102 |             name, tok_cfg, trf_cfg, model_cls=AutoModelForSequenceClassification
103 |         )
104 |         model.attrs["set_transformer"](model, hf_model)
105 |         tokenizer = model.tokenizer
106 |         texts = ["hello world", "foo bar"]
107 |         token_data = huggingface_tokenize(tokenizer, texts)
108 |         wordpieces = WordpieceBatch.from_batch_encoding(token_data)
109 |         model.layers[0].initialize(X=wordpieces)
110 | 
111 |     model = ClassificationTransformerModel(
112 |         "sgugger/tiny-distilbert-classification",
113 |         get_spans=partial(get_strided_spans, window=128, stride=96),
114 |     )
115 |     model.initialize()
116 | 
117 |     assert isinstance(model.transformer, DistilBertForSequenceClassification)
118 |     nlp = spacy.blank("en")
119 |     doc = nlp.make_doc("some text")
120 |     assert isinstance(model.predict([doc]).model_output, SequenceClassifierOutput)
121 | 
122 |     b = model.to_bytes()
123 |     model_re = ClassificationTransformerModel(
124 |         "sgugger/tiny-distilbert-classification",
125 |         get_spans=partial(get_strided_spans, window=128, stride=96),
126 |     ).from_bytes(b)
127 |     assert isinstance(model_re.transformer, DistilBertForSequenceClassification)
128 |     assert isinstance(model_re.predict([doc]).model_output, SequenceClassifierOutput)
129 |     assert torch.equal(
130 |         model.predict([doc]).model_output.logits,
131 |         model_re.predict([doc]).model_output.logits,
132 |     )
133 |     # Note that model.to_bytes() != model_re.to_bytes(), but this is also not
134 |     # true for the default models.
135 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_model_wrapper.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import spacy
  3 | from thinc.api import Model
  4 | from ..layers import TransformerModel
  5 | from ..data_classes import FullTransformerBatch
  6 | from ..span_getters import get_doc_spans
  7 | 
  8 | 
  9 | MODEL_NAMES = [
 10 |     "distilbert-base-uncased",
 11 |     "hf-internal-testing/tiny-random-gpt2",
 12 |     "hf-internal-testing/tiny-random-xlnet",
 13 | ]
 14 | 
 15 | 
 16 | @pytest.fixture
 17 | def nlp():
 18 |     return spacy.blank("en")
 19 | 
 20 | 
 21 | @pytest.fixture
 22 | def docs(nlp):
 23 |     texts = ["the cat sat on the mat.", "hello world."]
 24 |     return [nlp(text) for text in texts]
 25 | 
 26 | 
 27 | @pytest.fixture(scope="module", params=MODEL_NAMES)
 28 | def name(request):
 29 |     return request.param
 30 | 
 31 | 
 32 | @pytest.fixture(scope="module", params=[True, False])
 33 | def output_attentions(request):
 34 |     return request.param
 35 | 
 36 | 
 37 | @pytest.fixture(scope="module", params=[True, False])
 38 | def output_hidden_states(request):
 39 |     return request.param
 40 | 
 41 | 
 42 | @pytest.fixture(scope="module")
 43 | def trf_model(name, output_attentions, output_hidden_states):
 44 |     if "gpt2" in name:
 45 |         model = TransformerModel(
 46 |             name,
 47 |             get_doc_spans,
 48 |             {"use_fast": True, "pad_token": "<|endoftext|>"},
 49 |             {
 50 |                 "output_attentions": output_attentions,
 51 |                 "output_hidden_states": output_hidden_states,
 52 |             },
 53 |         )
 54 | 
 55 |     else:
 56 |         # test slow tokenizers with distilbert-base-uncased (parameterizing
 57 |         # for all models blows up the memory usage during the test suite)
 58 |         if name == "distilbert-base-uncased":
 59 |             use_fast = False
 60 |         else:
 61 |             use_fast = True
 62 |         model = TransformerModel(
 63 |             name,
 64 |             get_doc_spans,
 65 |             {"use_fast": use_fast},
 66 |             {
 67 |                 "output_attentions": output_attentions,
 68 |                 "output_hidden_states": output_hidden_states,
 69 |             },
 70 |         )
 71 |     model.initialize()
 72 |     return model
 73 | 
 74 | 
 75 | def test_model_init(name, trf_model):
 76 |     assert isinstance(trf_model, Model)
 77 |     if name == "distilbert-base-uncased":
 78 |         assert not trf_model.tokenizer.is_fast
 79 |     else:
 80 |         assert trf_model.tokenizer.is_fast
 81 | 
 82 | 
 83 | def test_model_predict(nlp, docs, trf_model):
 84 |     outputs = trf_model.predict(docs)
 85 |     shape = outputs.model_output.last_hidden_state.shape
 86 |     if trf_model.transformer.config.output_attentions is True:
 87 |         assert outputs.model_output.attentions is not None
 88 |         assert all([t.shape[0] == shape[0] for t in outputs.model_output.attentions])
 89 |     else:
 90 |         assert outputs.model_output.attentions is None
 91 |     if trf_model.transformer.config.output_hidden_states is True:
 92 |         assert outputs.model_output.hidden_states is not None
 93 |         assert all([t.shape[0] == shape[0] for t in outputs.model_output.hidden_states])
 94 |     else:
 95 |         assert outputs.model_output.hidden_states is None
 96 |     assert isinstance(outputs, FullTransformerBatch)
 97 | 
 98 |     # for a fast tokenizer check that all non-special wordpieces are aligned
 99 |     # (which is not necessarily true for the slow tokenizers)
100 |     if trf_model.tokenizer.is_fast:
101 |         outputs = trf_model.predict([nlp.make_doc("\tÁaaa  \n\n")])
102 |         aligned_wps = outputs.align.data.flatten()
103 |         for i in range(len(outputs.wordpieces.strings[0])):
104 |             if (
105 |                 outputs.wordpieces.strings[0][i]
106 |                 not in trf_model.tokenizer.all_special_tokens
107 |             ):
108 |                 assert i in aligned_wps
109 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_pipeline_component.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from packaging.version import Version
  3 | import torch
  4 | import spacy
  5 | from spacy.language import Language
  6 | from spacy.training.example import Example
  7 | from spacy.util import make_tempdir
  8 | from spacy.vocab import Vocab
  9 | from spacy.tokens import Doc
 10 | from spacy import util
 11 | from thinc.api import Model, Config, get_current_ops, NumpyOps
 12 | from spacy.tests.util import assert_docs_equal
 13 | 
 14 | from .util import DummyTransformer, _assert_equal_tensors
 15 | from .. import TransformerModel
 16 | from ..pipeline_component import Transformer
 17 | from ..layers import TransformerListener
 18 | from ..data_classes import TransformerData, FullTransformerBatch
 19 | 
 20 | 
 21 | torch.set_num_threads(1)
 22 | 
 23 | 
 24 | @pytest.fixture
 25 | def vocab():
 26 |     return Vocab()
 27 | 
 28 | 
 29 | @pytest.fixture
 30 | def docs(vocab):
 31 |     return [
 32 |         Doc(vocab, words=["hello", "world"]),
 33 |         Doc(vocab, words=["this", "is", "another"]),
 34 |     ]
 35 | 
 36 | 
 37 | @pytest.fixture
 38 | def component(vocab):
 39 |     return Transformer(Vocab(), DummyTransformer())
 40 | 
 41 | 
 42 | @pytest.fixture(scope="module")
 43 | def simple_nlp():
 44 |     nlp = Language()
 45 |     nlp.add_pipe("transformer")
 46 |     train_examples = []
 47 |     for t in TRAIN_DATA:
 48 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
 49 | 
 50 |     optimizer = nlp.initialize()
 51 |     for i in range(2):
 52 |         losses = {}
 53 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
 54 | 
 55 |     return nlp
 56 | 
 57 | 
 58 | def test_init(component):
 59 |     assert isinstance(component.vocab, Vocab)
 60 |     assert isinstance(component.model, Model)
 61 |     assert hasattr(component.set_extra_annotations, "__call__")
 62 |     assert component.listeners == []
 63 |     assert component.cfg == {"max_batch_items": 4096}
 64 | 
 65 | 
 66 | def test_predict(component, docs):
 67 |     trf_data = component.predict(docs)
 68 |     n_tokens = trf_data.wordpieces.input_ids.shape[1]
 69 |     width = component.model.layers[0].attrs["width"]
 70 |     assert isinstance(trf_data, FullTransformerBatch)
 71 |     assert (
 72 |         len(trf_data.model_output.last_hidden_state)
 73 |         == component.model.layers[0].attrs["depth"]
 74 |     )
 75 |     assert trf_data.model_output.last_hidden_state[0].shape == (
 76 |         len(docs),
 77 |         n_tokens,
 78 |         width,
 79 |     )
 80 | 
 81 | 
 82 | def test_set_annotations(component, docs):
 83 |     trf_data = component.predict(docs)
 84 |     component.set_annotations(docs, trf_data)
 85 |     for doc in docs:
 86 |         assert isinstance(doc._.trf_data, TransformerData)
 87 | 
 88 | 
 89 | def test_set_extra_annotations(component, docs):
 90 |     Doc.set_extension("custom_attr", default="")
 91 | 
 92 |     def custom_annotation_setter(docs, trf_data):
 93 |         doc_data = list(trf_data.doc_data)
 94 |         for doc, data in zip(docs, doc_data):
 95 |             doc._.custom_attr = data
 96 | 
 97 |     component.set_extra_annotations = custom_annotation_setter
 98 |     trf_data = component.predict(docs)
 99 |     component.set_annotations(docs, trf_data)
100 |     for doc in docs:
101 |         assert isinstance(doc._.custom_attr, TransformerData)
102 | 
103 | 
104 | def test_listeners(component, docs):
105 |     docs = list(component.pipe(docs))
106 |     for listener in component.listeners:
107 |         assert listener.verify_inputs(docs)
108 | 
109 | 
110 | TRAIN_DATA = [
111 |     (
112 |         "I like green eggs",
113 |         {"tags": ["N", "V", "J", "N"], "sent_starts": [True, False, True, False]},
114 |     ),
115 |     ("Eat blue ham", {"tags": ["V", "J", "N"], "sent_starts": [True, False, False]}),
116 | ]
117 | 
118 | 
119 | def test_transformer_pipeline_simple(simple_nlp):
120 |     """Test that a simple pipeline with just a transformer at least runs"""
121 |     doc = simple_nlp("We're interested at underwater basket weaving.")
122 |     assert doc
123 | 
124 | 
125 | def test_transformer_pipeline_long_token(simple_nlp):
126 |     """Test that a simple pipeline does not raise an error on texts that exceeds
127 |     the model max length. We should truncate instead.
128 |     """
129 |     doc = simple_nlp("https://example.com/" + "a/" * 1000)
130 |     assert len(doc) == 1
131 | 
132 | 
133 | cfg_string = """
134 |     [nlp]
135 |     lang = "en"
136 |     pipeline = ["transformer","tagger","senter"]
137 | 
138 |     [components]
139 | 
140 |     [components.senter]
141 |     factory = "senter"
142 | 
143 |     [components.senter.model]
144 |     @architectures = "spacy.Tagger.v1"
145 |     nO = null
146 | 
147 |     [components.senter.model.tok2vec]
148 |     @architectures = "spacy-transformers.TransformerListener.v1"
149 |     grad_factor = 1.0
150 |     upstream = "transformer"
151 | 
152 |     [components.senter.model.tok2vec.pooling]
153 |     @layers = "reduce_mean.v1"
154 | 
155 |     [components.tagger]
156 |     factory = "tagger"
157 | 
158 |     [components.tagger.model]
159 |     @architectures = "spacy.Tagger.v1"
160 |     nO = null
161 | 
162 |     [components.tagger.model.tok2vec]
163 |     @architectures = "spacy-transformers.TransformerListener.v1"
164 |     grad_factor = 1.0
165 |     upstream = "transformer"
166 | 
167 |     [components.tagger.model.tok2vec.pooling]
168 |     @layers = "reduce_mean.v1"
169 | 
170 |     [components.transformer]
171 |     factory = "transformer"
172 | 
173 |     [components.transformer.model]
174 |     @architectures = "spacy-transformers.TransformerModel.v3"
175 |     name = "albert-base-v2"
176 | 
177 |     [components.transformer.model.transformer_config]
178 |     output_attentions = true
179 |     """
180 | 
181 | 
182 | def test_transformer_pipeline_tagger_senter_listener():
183 |     """Test that a pipeline with just a transformer+tagger+senter runs and
184 |     trains properly"""
185 |     orig_config = Config().from_str(cfg_string)
186 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
187 |     assert nlp.pipe_names == ["transformer", "tagger", "senter"]
188 |     tagger = nlp.get_pipe("tagger")
189 |     transformer = nlp.get_pipe("transformer")
190 |     tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
191 |     assert isinstance(transformer, Transformer)
192 |     assert isinstance(tagger_trf, TransformerListener)
193 |     train_examples = []
194 |     for t in TRAIN_DATA:
195 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
196 |         for tag in t[1]["tags"]:
197 |             tagger.add_label(tag)
198 | 
199 |     # Check that the Transformer component finds it listeners
200 |     optimizer = nlp.initialize(lambda: train_examples)
201 |     assert tagger_trf in transformer.listeners
202 | 
203 |     for i in range(2):
204 |         losses = {}
205 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
206 | 
207 |     text = "We're interested at underwater basket weaving."
208 |     doc = nlp(text)
209 |     doc_tensor = tagger_trf.predict([doc])
210 |     _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors)
211 | 
212 |     # ensure IO goes OK
213 |     with make_tempdir() as d:
214 |         file_path = d / "trained_nlp"
215 |         nlp.to_disk(file_path)
216 |         nlp2 = util.load_model_from_path(file_path)
217 |         doc2 = nlp2(text)
218 |         tagger2 = nlp2.get_pipe("tagger")
219 |         tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
220 |         doc_tensor2 = tagger_trf2.predict([doc2])
221 |         _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)
222 | 
223 |         # make sure that this can be saved to directory once more
224 |         file_path_2 = d / "trained_nlp_2"
225 |         nlp2.to_disk(file_path_2)
226 | 
227 |     # ensure to_bytes / from_bytes works
228 |     nlp_bytes = nlp.to_bytes()
229 |     nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
230 |     nlp3.from_bytes(nlp_bytes)
231 |     doc3 = nlp3(text)
232 |     tagger3 = nlp3.get_pipe("tagger")
233 |     tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
234 |     doc_tensor3 = tagger_trf3.predict([doc3])
235 |     _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
236 | 
237 | 
238 | def test_transformer_sentencepiece_IO():
239 |     """Test that a transformer using sentencepiece trains + IO goes OK"""
240 |     orig_config = Config().from_str(cfg_string)
241 |     orig_config["components"]["transformer"]["model"]["name"] = "hf-internal-testing/tiny-xlm-roberta"
242 |     orig_config["components"]["transformer"]["model"]["tokenizer_config"] = {"use_fast": False}
243 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
244 |     tagger = nlp.get_pipe("tagger")
245 |     tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
246 |     train_examples = []
247 |     for t in TRAIN_DATA:
248 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
249 |         for tag in t[1]["tags"]:
250 |             tagger.add_label(tag)
251 | 
252 |     optimizer = nlp.initialize(lambda: train_examples)
253 |     for i in range(2):
254 |         losses = {}
255 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
256 | 
257 |     text = "We're interested at underwater basket weaving."
258 |     doc = nlp(text)
259 |     doc_tensor = tagger_trf.predict([doc])
260 | 
261 |     # ensure IO goes OK
262 |     with make_tempdir() as d:
263 |         file_path = d / "trained_nlp"
264 |         nlp.to_disk(file_path)
265 |         nlp2 = util.load_model_from_path(file_path)
266 |         doc2 = nlp2(text)
267 |         tagger2 = nlp2.get_pipe("tagger")
268 |         tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
269 |         doc_tensor2 = tagger_trf2.predict([doc2])
270 |         _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors)
271 | 
272 |         # make sure that this can be saved to directory once more
273 |         file_path_2 = d / "trained_nlp_2"
274 |         nlp2.to_disk(file_path_2)
275 | 
276 |     # ensure to_bytes / from_bytes works
277 |     nlp_bytes = nlp.to_bytes()
278 |     nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
279 |     nlp3.from_bytes(nlp_bytes)
280 |     doc3 = nlp3(text)
281 |     tagger3 = nlp3.get_pipe("tagger")
282 |     tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
283 |     doc_tensor3 = tagger_trf3.predict([doc3])
284 |     _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors)
285 | 
286 | 
287 | def test_transformer_pipeline_empty():
288 |     """Test that the pipeline doesn't fail with empty input"""
289 |     orig_config = Config().from_str(cfg_string)
290 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
291 |     tagger = nlp.get_pipe("tagger")
292 |     train_examples = []
293 |     for t in TRAIN_DATA:
294 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
295 |         for tag in t[1]["tags"]:
296 |             tagger.add_label(tag)
297 | 
298 |     # train on empty doc
299 |     optimizer = nlp.initialize()
300 |     losses = {}
301 |     empty_train_example = Example.from_dict(nlp.make_doc(""), {})
302 |     nlp.update(train_examples, sgd=optimizer, losses=losses)
303 |     nlp.update([empty_train_example], sgd=optimizer, losses=losses)
304 |     train_examples.append(empty_train_example)
305 |     nlp.update(train_examples, sgd=optimizer, losses=losses)
306 |     # Interleave an empty doc between non-empty ones
307 |     train_examples.insert(1, Example.from_dict(nlp.make_doc(""), {}))
308 |     nlp.update(train_examples, sgd=optimizer, losses=losses)
309 | 
310 |     # predict empty doc
311 |     doc = nlp("")
312 |     _assert_empty(doc._.trf_data)
313 |     docs = nlp.pipe(["", ""])
314 |     for doc in docs:
315 |         _assert_empty(doc._.trf_data)
316 |     nlp.pipe([])
317 | 
318 |     # predict combination of empty and non-empty
319 |     doc = nlp("This is a sentence")
320 |     normal_tags = [t.tag_ for t in doc]
321 | 
322 |     docs = list(nlp.pipe(["", "This is a sentence", "", ""]))
323 |     _assert_empty(docs[0]._.trf_data)
324 |     assert [t.tag_ for t in docs[0]] == []
325 |     assert [t.tag_ for t in docs[1]] == normal_tags
326 |     _assert_empty(docs[2]._.trf_data)
327 |     _assert_empty(docs[3]._.trf_data)
328 | 
329 | 
330 | def _assert_empty(trf_data):
331 |     assert trf_data.wordpieces.strings == []
332 |     assert trf_data.wordpieces.input_ids.size == 0
333 |     assert trf_data.wordpieces.attention_mask.size == 0
334 |     assert trf_data.tensors == ()
335 |     assert len(trf_data.align.data) == 0
336 | 
337 | 
338 | def test_replace_listeners():
339 |     orig_config = Config().from_str(cfg_string)
340 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
341 |     text = "This is awesome"
342 |     examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
343 |     optimizer = nlp.initialize(lambda: examples)
344 |     # verify correct configuration with transformer listener
345 |     transformer = nlp.get_pipe("transformer")
346 |     tagger = nlp.get_pipe("tagger")
347 |     tagger_tok2vec = tagger.model.get_ref("tok2vec")
348 |     tagger_listener = tagger_tok2vec.get_ref("listener")
349 |     assert isinstance(tagger_listener, TransformerListener)
350 |     assert transformer.listener_map["tagger"][0] == tagger_listener
351 |     assert isinstance(transformer.model, TransformerModel)
352 |     assert (
353 |         nlp.config["components"]["transformer"]["model"]["@architectures"]
354 |         == "spacy-transformers.TransformerModel.v3"
355 |     )
356 |     assert (
357 |         nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
358 |         == "spacy-transformers.TransformerListener.v1"
359 |     )
360 |     # train pipe before replacing listeners
361 |     for i in range(2):
362 |         losses = {}
363 |         nlp.update(examples, sgd=optimizer, losses=losses)
364 |         doc = nlp(text)
365 | 
366 |     preds = [t.tag_ for t in doc]
367 |     doc_tensor = tagger_tok2vec.predict([doc])
368 | 
369 |     # replace listener and verify predictions are still the same
370 |     nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"])
371 |     tagger = nlp.get_pipe("tagger")
372 |     tagger_tok2vec = tagger.model.get_ref("tok2vec")
373 |     assert isinstance(tagger_tok2vec, Model)
374 |     assert tagger_tok2vec.layers[0].layers[0].name == "transformer"
375 |     assert (
376 |         nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"]
377 |         == "spacy-transformers.Tok2VecTransformer.v3"
378 |     )
379 |     doc2 = nlp(text)
380 |     assert preds == [t.tag_ for t in doc2]
381 |     pred_tensor = tagger_tok2vec.predict([doc2])
382 |     _assert_equal_tensors(doc_tensor, pred_tensor)
383 | 
384 |     # attempt training with the new pipeline
385 |     optimizer = nlp.resume_training()
386 |     for i in range(2):
387 |         losses = {}
388 |         nlp.update(examples, sgd=optimizer, losses=losses)
389 |         assert losses["tagger"] > 0.0
390 | 
391 |     # check for presence of additional fields in model_output
392 |     assert doc2._.trf_data.model_output.pooler_output is not None
393 |     assert doc2._.trf_data.model_output.attentions is not None
394 | 
395 |     # ensure IO goes OK
396 |     doc_tensor_trained = tagger_tok2vec.predict([doc])
397 |     with make_tempdir() as d:
398 |         file_path = d / "trained_nlp"
399 |         nlp.to_disk(file_path)
400 |         nlp2 = util.load_model_from_path(file_path)
401 |         doc3 = nlp2(text)
402 |         tagger2 = nlp2.get_pipe("tagger")
403 |         tagger_tok2vec2 = tagger2.model.get_ref("tok2vec")
404 |         pred_tensor = tagger_tok2vec2.predict([doc3])
405 |         _assert_equal_tensors(doc_tensor_trained, pred_tensor)
406 | 
407 | 
408 | def test_replace_listeners_invalid():
409 |     orig_config = Config().from_str(cfg_string)
410 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
411 |     text = "This is awesome"
412 |     examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
413 |     optimizer = nlp.initialize(lambda: examples)
414 |     for i in range(2):
415 |         losses = {}
416 |         nlp.update(examples, sgd=optimizer, losses=losses)
417 |     with pytest.raises(ValueError):
418 |         nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"])
419 |     with pytest.raises(ValueError):
420 |         nlp.replace_listeners("transformer", "parser", ["model.tok2vec"])
421 |     with pytest.raises(ValueError):
422 |         nlp.replace_listeners("transformer", "tagger", ["model.yolo"])
423 |     with pytest.raises(ValueError):
424 |         nlp.replace_listeners("transformer", "tagger", ["model.tok2vec", "model.yolo"])
425 | 
426 | 
427 | @pytest.fixture
428 | def texts():
429 |     data = [
430 |         "Hello world.",
431 |         "This is spacy.",
432 |         "You can use multiprocessing with pipe method.",
433 |         "Please try!",
434 |     ]
435 |     return data
436 | 
437 | 
438 | def test_multiprocessing(simple_nlp, texts):
439 |     ops = get_current_ops()
440 |     if isinstance(ops, NumpyOps):
441 |         texts = texts * 3
442 |         expecteds = [simple_nlp(text) for text in texts]
443 |         docs = simple_nlp.pipe(texts, n_process=2, batch_size=2)
444 | 
445 |         for doc, expected_doc in zip(docs, expecteds):
446 |             assert_docs_equal(doc, expected_doc)
447 | 
448 | 
449 | def test_frozen_listener():
450 |     orig_config = Config().from_str(cfg_string)
451 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
452 |     text = "This is awesome"
453 |     examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
454 |     optimizer = nlp.initialize(lambda: examples)
455 |     # train pipe before freezing listener
456 |     for i in range(2):
457 |         losses = {}
458 |         nlp.update(examples, sgd=optimizer, losses=losses)
459 |         doc = nlp(text)
460 | 
461 |     transformer_bytes = nlp.get_pipe("transformer").to_bytes()
462 |     tagger_bytes = nlp.get_pipe("tagger").to_bytes()
463 | 
464 |     # train further with frozen listener
465 |     for i in range(2):
466 |         losses = {}
467 |         nlp.update(
468 |             examples,
469 |             sgd=optimizer,
470 |             losses=losses,
471 |             exclude=["transformer"],
472 |             annotates=["transformer"],
473 |         )
474 |         doc = nlp(text)
475 | 
476 |     # only tagger was updated
477 |     assert nlp.get_pipe("transformer").to_bytes() == transformer_bytes
478 |     assert nlp.get_pipe("tagger").to_bytes() != tagger_bytes
479 | 
480 | 
481 | def test_no_update_listener_in_predict():
482 |     orig_config = Config().from_str(cfg_string)
483 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
484 |     listener = nlp.get_pipe("tagger").model.get_ref("tok2vec").get_ref("listener")
485 |     transformer = nlp.get_pipe("transformer")
486 | 
487 |     text = "This is awesome"
488 |     examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
489 |     docs = [eg.predicted for eg in examples]
490 |     nlp.initialize(lambda: examples)
491 | 
492 |     transformer.update(examples)
493 |     assert listener._backprop is not None
494 | 
495 |     transformer.predict(docs)
496 |     assert listener._backprop is not None
497 | 
498 | 
499 | @pytest.mark.skipif(
500 |     Version(spacy.__version__) < Version("3.5.4"), reason="Bug fixed in spaCy v3.5.4"
501 | )
502 | def test_source_replace_listeners():
503 |     """Test that a pipeline with a transformer+tagger+senter and some replaced
504 |     listeners runs and trains properly"""
505 |     orig_config = """
506 |     [nlp]
507 |     lang = "en"
508 |     pipeline = ["transformer","tagger","senter"]
509 | 
510 |     [components]
511 | 
512 |     [components.senter]
513 |     factory = "senter"
514 | 
515 |     [components.senter.model]
516 |     @architectures = "spacy.Tagger.v1"
517 |     nO = null
518 | 
519 |     [components.senter.model.tok2vec]
520 |     @architectures = "spacy-transformers.TransformerListener.v1"
521 |     grad_factor = 1.0
522 |     upstream = "transformer"
523 | 
524 |     [components.senter.model.tok2vec.pooling]
525 |     @layers = "reduce_mean.v1"
526 | 
527 |     [components.tagger]
528 |     factory = "tagger"
529 | 
530 |     [components.tagger.model]
531 |     @architectures = "spacy.Tagger.v1"
532 |     nO = null
533 | 
534 |     [components.tagger.model.tok2vec]
535 |     @architectures = "spacy-transformers.TransformerListener.v1"
536 |     grad_factor = 1.0
537 |     upstream = "transformer"
538 | 
539 |     [components.tagger.model.tok2vec.pooling]
540 |     @layers = "reduce_mean.v1"
541 | 
542 |     [components.transformer]
543 |     factory = "transformer"
544 | 
545 |     [components.transformer.model]
546 |     @architectures = "spacy-transformers.TransformerModel.v3"
547 |     name = "distilbert-base-uncased"
548 |     """
549 |     orig_config = Config().from_str(cfg_string)
550 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
551 |     assert nlp.pipe_names == ["transformer", "tagger", "senter"]
552 |     tagger = nlp.get_pipe("tagger")
553 |     train_examples = []
554 |     for t in TRAIN_DATA:
555 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
556 |         for tag in t[1]["tags"]:
557 |             tagger.add_label(tag)
558 |     optimizer = nlp.initialize(lambda: train_examples)
559 |     assert nlp.get_pipe("transformer").listening_components == ["tagger", "senter"]
560 |     for i in range(2):
561 |         losses = {}
562 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
563 | 
564 |     with make_tempdir() as dir_path:
565 |         nlp.to_disk(dir_path)
566 |         base_model = str(dir_path)
567 |         new_config = {
568 |             "nlp": {
569 |                 "lang": "en",
570 |                 "pipeline": ["transformer", "tagger", "senter", "ner"],
571 |             },
572 |             "components": {
573 |                 "transformer": {"source": base_model},
574 |                 "tagger": {
575 |                     "source": base_model,
576 |                     "replace_listeners": ["model.tok2vec"],
577 |                 },
578 |                 "senter": {
579 |                     "source": base_model,
580 |                     "replace_listeners": ["model.tok2vec"],
581 |                 },
582 |                 "ner": {
583 |                     "factory": "ner",
584 |                     "model": {
585 |                         "@architectures": "spacy.TransitionBasedParser.v2",
586 |                         "state_type": "ner",
587 |                         "tok2vec": {
588 |                             "@architectures": "spacy-transformers.TransformerListener.v1",
589 |                             "grad_factor": 1.0,
590 |                             "upstream": "transformer",
591 |                             "pooling": {"@layers": "reduce_mean.v1"},
592 |                         },
593 |                     },
594 |                 },
595 |             },
596 |         }
597 |         new_nlp = util.load_model_from_config(new_config, auto_fill=True)
598 |         for component in ("tagger", "senter"):
599 |             assert (
600 |                 new_nlp.config["components"][component]["model"]["tok2vec"][
601 |                     "@architectures"
602 |                 ]
603 |                 == "spacy-transformers.Tok2VecTransformer.v3"
604 |             )
605 |         assert new_nlp.get_pipe("transformer").listening_components == ["ner"]
606 | 
607 |         with make_tempdir() as new_dir_path:
608 |             new_nlp.to_disk(new_dir_path)
609 |             new_nlp_re = spacy.load(new_dir_path)
610 |             for component in ("tagger", "senter"):
611 |                 assert (
612 |                     new_nlp.config["components"][component]["model"]["tok2vec"][
613 |                         "@architectures"
614 |                     ]
615 |                     == "spacy-transformers.Tok2VecTransformer.v3"
616 |                 )
617 |             assert new_nlp_re.get_pipe("transformer").listening_components == ["ner"]
618 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_serialize.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import copy
  3 | import spacy
  4 | from spacy import Language
  5 | from spacy.lang.en import English
  6 | from spacy.tests.util import assert_docs_equal
  7 | from spacy.tokens import Doc
  8 | from spacy.util import make_tempdir
  9 | from spacy import util
 10 | import srsly
 11 | from thinc.api import Config, get_current_ops
 12 | from numpy.testing import assert_array_equal
 13 | 
 14 | from .. import TransformerData
 15 | 
 16 | 
 17 | DEFAULT_CONFIG = {
 18 |     "model": {
 19 |         "@architectures": "spacy-transformers.TransformerModel.v3",
 20 |         "name": "hf-internal-testing/tiny-random-DistilBertModel",
 21 |         "tokenizer_config": {"use_fast": False},
 22 |     }
 23 | }
 24 | 
 25 | 
 26 | def test_serialize_transformer_data():
 27 |     data = {"x": TransformerData.empty()}
 28 |     bytes_data = srsly.msgpack_dumps(data)
 29 |     new_data = srsly.msgpack_loads(bytes_data)
 30 |     assert isinstance(new_data["x"], TransformerData)
 31 | 
 32 |     nlp = Language()
 33 |     nlp.add_pipe(
 34 |         "transformer",
 35 |         config={
 36 |             "model": {
 37 |                 "name": "hf-internal-testing/tiny-random-DistilBertModel",
 38 |                 "transformer_config": {"output_attentions": True},
 39 |             }
 40 |         },
 41 |     )
 42 |     nlp.initialize()
 43 |     doc = nlp("This is a test.")
 44 |     b = doc.to_bytes()
 45 |     reloaded_doc = Doc(nlp.vocab)
 46 |     reloaded_doc.from_bytes(b)
 47 |     assert_docs_equal(doc, reloaded_doc)
 48 |     ops = get_current_ops()
 49 |     for key in doc._.trf_data.model_output:
 50 |         assert_array_equal(
 51 |             ops.to_numpy(ops.asarray(doc._.trf_data.model_output[key])),
 52 |             ops.to_numpy(ops.asarray(reloaded_doc._.trf_data.model_output[key])),
 53 |         )
 54 | 
 55 | 
 56 | def test_transformer_tobytes():
 57 |     nlp = Language()
 58 |     trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 59 |     trf_bytes = trf.to_bytes()
 60 | 
 61 |     nlp2 = Language()
 62 |     trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
 63 |     trf2.from_bytes(trf_bytes)
 64 | 
 65 | 
 66 | def test_initialized_transformer_tobytes():
 67 |     nlp = Language()
 68 |     trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 69 |     nlp.initialize()
 70 |     trf_bytes = trf.to_bytes()
 71 | 
 72 |     nlp2 = Language()
 73 |     trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
 74 |     trf2.from_bytes(trf_bytes)
 75 | 
 76 |     assert trf2.model.tokenizer.is_fast is False
 77 | 
 78 | 
 79 | def test_initialized_transformer_todisk():
 80 |     nlp = Language()
 81 |     trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 82 |     nlp.initialize()
 83 |     with make_tempdir() as d:
 84 |         trf.to_disk(d)
 85 |         nlp2 = Language()
 86 |         trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
 87 |         trf2.from_disk(d)
 88 | 
 89 |         assert trf2.model.tokenizer.is_fast is False
 90 | 
 91 |     fast_config = copy.deepcopy(DEFAULT_CONFIG)
 92 |     fast_config["model"]["tokenizer_config"]["use_fast"] = True
 93 |     nlp = Language()
 94 |     trf = nlp.add_pipe("transformer", config=fast_config)
 95 |     nlp.initialize()
 96 |     with make_tempdir() as d:
 97 |         trf.to_disk(d)
 98 |         nlp2 = Language()
 99 |         trf2 = nlp2.add_pipe("transformer", config=fast_config)
100 |         trf2.from_disk(d)
101 | 
102 |         assert trf2.model.tokenizer.is_fast is True
103 | 
104 | 
105 | def test_transformer_pipeline_tobytes():
106 |     nlp = Language()
107 |     nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
108 |     nlp.initialize()
109 |     assert nlp.pipe_names == ["transformer"]
110 |     nlp_bytes = nlp.to_bytes()
111 | 
112 |     nlp2 = Language()
113 |     nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
114 |     nlp2.from_bytes(nlp_bytes)
115 |     assert nlp2.pipe_names == ["transformer"]
116 | 
117 | 
118 | def test_transformer_pipeline_todisk():
119 |     nlp = English()
120 |     nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
121 |     nlp.initialize()
122 |     with make_tempdir() as d:
123 |         nlp.to_disk(d)
124 |         nlp2 = spacy.load(d)
125 |         assert nlp2.pipe_names == ["transformer"]
126 | 
127 | 
128 | def test_transformer_pipeline_todisk_settings():
129 |     nlp = English()
130 |     trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
131 |     nlp.initialize()
132 |     # initially no attentions
133 |     assert trf.model.tokenizer.model_max_length == 512
134 |     assert trf.model.transformer.config.output_attentions is False
135 |     assert "attentions" not in nlp("test")._.trf_data.model_output
136 |     # modify model_max_length (note that modifications to
137 |     # tokenizer.model_max_length for transformers<4.25 are not serialized by
138 |     # save_pretrained, see: https://github.com/explosion/spaCy/discussions/7393)
139 |     trf.model.tokenizer.init_kwargs["model_max_length"] = 499
140 |     # transformer>=4.25, model_max_length is saved and init_kwargs changes are
141 |     # clobbered, so do both for this test
142 |     trf.model.tokenizer.model_max_length = 499
143 |     # add attentions on-the-fly
144 |     trf.model.transformer.config.output_attentions = True
145 |     assert nlp("test")._.trf_data.model_output.attentions is not None
146 |     with make_tempdir() as d:
147 |         nlp.to_disk(d)
148 |         nlp2 = spacy.load(d)
149 |         assert nlp2.pipe_names == ["transformer"]
150 |         trf2 = nlp2.get_pipe("transformer")
151 |         # model_max_length is preserved
152 |         assert trf2.model.tokenizer.model_max_length == 499
153 |         # output_attentions setting is preserved
154 |         assert trf2.model.transformer.config.output_attentions is True
155 |         assert nlp2("test")._.trf_data.model_output.attentions is not None
156 |         # the init configs are empty SimpleFrozenDicts
157 |         assert trf2.model._init_tokenizer_config == {}
158 |         with pytest.raises(NotImplementedError):
159 |             trf2.model._init_tokenizer_config["use_fast"] = False
160 | 
161 | 
162 | def test_transformer_pipeline_todisk_before_initialize():
163 |     nlp = English()
164 |     nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
165 |     with make_tempdir() as d:
166 |         # serialize before initialization
167 |         nlp.to_disk(d)
168 |         nlp2 = spacy.load(d)
169 |         nlp2.initialize()
170 |         assert "last_hidden_state" in nlp2("test")._.trf_data.model_output
171 | 
172 | 
173 | inline_cfg_string = """
174 |     [nlp]
175 |     lang = "en"
176 |     pipeline = ["tagger"]
177 | 
178 |     [components]
179 | 
180 |     [components.tagger]
181 |     factory = "tagger"
182 | 
183 |     [components.tagger.model]
184 |     @architectures = "spacy.Tagger.v1"
185 |     nO = null
186 | 
187 |     [components.tagger.model.tok2vec]
188 |     @architectures = "spacy-transformers.Tok2VecTransformer.v3"
189 |     name = "hf-internal-testing/tiny-random-DistilBertModel"
190 |     tokenizer_config = {"use_fast": true}
191 |     transformer_config = {"output_attentions": false}
192 |     grad_factor = 1.0
193 | 
194 |     [components.tagger.model.tok2vec.get_spans]
195 |     @span_getters = "spacy-transformers.strided_spans.v1"
196 |     window = 256
197 |     stride = 256
198 | 
199 |     [components.tagger.model.tok2vec.pooling]
200 |     @layers = "reduce_mean.v1"
201 |     """
202 | 
203 | 
204 | def test_inline_transformer_tobytes():
205 |     orig_config = Config().from_str(inline_cfg_string)
206 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
207 |     tagger = nlp.get_pipe("tagger")
208 |     tagger_bytes = tagger.to_bytes()
209 | 
210 |     nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
211 |     tagger2 = nlp2.get_pipe("tagger")
212 |     tagger2.from_bytes(tagger_bytes)
213 | 
214 | 
215 | def test_initialized_inline_transformer_tobytes():
216 |     orig_config = Config().from_str(inline_cfg_string)
217 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
218 |     assert nlp.pipe_names == ["tagger"]
219 |     tagger = nlp.get_pipe("tagger")
220 |     tagger.add_label("V")
221 |     nlp.initialize()
222 |     tagger_bytes = tagger.to_bytes()
223 | 
224 |     nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
225 |     tagger2 = nlp2.get_pipe("tagger")
226 |     tagger2.from_bytes(tagger_bytes)
227 |     assert list(tagger2.labels) == ["V"]
228 | 
229 | 
230 | def test_inline_transformer_todisk():
231 |     orig_config = Config().from_str(inline_cfg_string)
232 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
233 |     assert nlp.pipe_names == ["tagger"]
234 |     tagger = nlp.get_pipe("tagger")
235 |     tagger.add_label("V")
236 |     with make_tempdir() as d:
237 |         tagger.to_disk(d)
238 |         nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
239 |         tagger2 = nlp2.get_pipe("tagger")
240 |         tagger2.from_disk(d)
241 |         assert list(tagger2.labels) == ["V"]
242 | 
243 | 
244 | def test_initialized_inline_transformer_todisk():
245 |     orig_config = Config().from_str(inline_cfg_string)
246 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
247 |     assert nlp.pipe_names == ["tagger"]
248 |     tagger = nlp.get_pipe("tagger")
249 |     tagger.add_label("V")
250 |     nlp.initialize()
251 |     with make_tempdir() as d:
252 |         tagger.to_disk(d)
253 |         nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
254 |         tagger2 = nlp2.get_pipe("tagger")
255 |         tagger2.from_disk(d)
256 |         assert list(tagger2.labels) == ["V"]
257 | 
258 | 
259 | def test_inline_transformer_pipeline_tobytes():
260 |     orig_config = Config().from_str(inline_cfg_string)
261 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
262 |     assert nlp.pipe_names == ["tagger"]
263 |     tagger = nlp.get_pipe("tagger")
264 |     tagger.add_label("V")
265 |     nlp.initialize()
266 |     nlp_bytes = nlp.to_bytes()
267 | 
268 |     nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
269 |     nlp2.from_bytes(nlp_bytes)
270 |     assert nlp2.pipe_names == ["tagger"]
271 | 
272 | 
273 | def test_inline_transformer_pipeline_todisk():
274 |     orig_config = Config().from_str(inline_cfg_string)
275 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
276 |     assert nlp.pipe_names == ["tagger"]
277 |     with make_tempdir() as d:
278 |         nlp.to_disk(d)
279 |         nlp2 = spacy.load(d)
280 |         assert nlp2.pipe_names == ["tagger"]
281 | 
282 | 
283 | def test_initialized_inline_transformer_pipeline_todisk():
284 |     orig_config = Config().from_str(inline_cfg_string)
285 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
286 |     assert nlp.pipe_names == ["tagger"]
287 |     tagger = nlp.get_pipe("tagger")
288 |     tagger.add_label("V")
289 |     nlp.initialize()
290 |     with make_tempdir() as d:
291 |         nlp.to_disk(d)
292 |         nlp2 = spacy.load(d)
293 |         assert nlp2.pipe_names == ["tagger"]
294 |         tagger2 = nlp2.get_pipe("tagger")
295 |         assert list(tagger2.labels) == ["V"]
296 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_spanners.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from spacy.lang.en import English
 3 | 
 4 | from ..span_getters import configure_strided_spans, configure_get_sent_spans
 5 | 
 6 | 
 7 | @pytest.mark.parametrize(
 8 |     "window,stride,docs,result",
 9 |     [
10 |         (4, 3, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "89a"]]),
11 |         (4, 4, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "9a"]]),
12 |         (4, 2, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "789a"]]),
13 |     ],
14 | )
15 | def test_get_strided_spans(window, stride, docs, result):
16 |     get_strided = configure_strided_spans(window, stride)
17 |     spans = get_strided(docs)
18 |     assert spans == result
19 | 
20 | 
21 | def test_get_sent_spans():
22 |     nlp = English()
23 |     nlp.add_pipe("sentencizer")
24 |     doc = nlp("One. One more. Three sentences in total.")
25 |     assert len(list(doc.sents)) == 3
26 |     get_sent_spans = configure_get_sent_spans()
27 |     spans = get_sent_spans([doc])[0]
28 |     assert len(spans) == 3
29 |     assert spans[0].text == "One."
30 |     assert spans[1].text == "One more."
31 |     assert spans[2].text == "Three sentences in total."
32 | 
33 | 
34 | def test_get_custom_spans():
35 |     def configure_custom_sent_spans(max_length: int):
36 |         def get_custom_sent_spans(docs):
37 |             spans = []
38 |             for doc in docs:
39 |                 spans.append([])
40 |                 for sent in doc.sents:
41 |                     start = 0
42 |                     end = max_length
43 |                     while end <= len(sent):
44 |                         spans[-1].append(sent[start:end])
45 |                         start += max_length
46 |                         end += max_length
47 |                     if start < len(sent):
48 |                         spans[-1].append(sent[start : len(sent)])
49 |             return spans
50 | 
51 |         return get_custom_sent_spans
52 | 
53 |     nlp = English()
54 |     nlp.add_pipe("sentencizer")
55 |     doc = nlp(
56 |         "One. And one more. So that makes three sentences and this one is a bit longer."
57 |     )
58 |     assert len(list(doc.sents)) == 3
59 |     get_sent_spans = configure_custom_sent_spans(max_length=4)
60 |     spans = get_sent_spans([doc])[0]
61 |     assert len(spans) == 6
62 |     assert spans[0].text == "One."
63 |     assert spans[1].text == "And one more."
64 |     assert spans[2].text == "So that makes three"
65 |     assert spans[3].text == "sentences and this one"
66 |     assert spans[4].text == "is a bit longer"
67 |     assert spans[5].text == "."
68 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_textcatcnn.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from packaging.version import Version
 3 | 
 4 | from spacy.training.example import Example
 5 | from spacy import util
 6 | import thinc
 7 | from thinc.api import Model, Config
 8 | 
 9 | # fmt: off
10 | cfg_string = """
11 |     [nlp]
12 |     lang = "en"
13 |     pipeline = ["textcat"]
14 | 
15 |     [components]
16 | 
17 |     [components.textcat]
18 |     factory = "textcat"
19 | 
20 |     [components.textcat.model]
21 |     @architectures = "spacy.TextCatCNN.v2"
22 |     nO = null
23 |     exclusive_classes = false
24 | 
25 |     [components.textcat.model.tok2vec]
26 |     @architectures = "spacy-transformers.Tok2VecTransformer.v1"
27 |     name = "roberta-base"
28 |     tokenizer_config = {"use_fast": false}
29 |     grad_factor = 1.0
30 | 
31 |     [components.textcat.model.tok2vec.get_spans]
32 |     @span_getters = "spacy-transformers.strided_spans.v1"
33 |     window = 256
34 |     stride = 256
35 | 
36 |     [components.textcat.model.tok2vec.pooling]
37 |     @layers = "reduce_mean.v1"
38 |     """
39 | # fmt: on
40 | 
41 | 
42 | # TODO: remove skip after requiring spacy>=3.5.1 or at the very latest, after
43 | # dropping python 3.7 switch to importlib.metadata.version("thinc")
44 | @pytest.mark.skipif(
45 |     Version(thinc.__version__) < Version("8.1.8"), reason="Requires thinc>=8.1.8"
46 | )
47 | def test_textcatcnn():
48 |     orig_config = Config().from_str(cfg_string)
49 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
50 |     assert nlp.pipe_names == ["textcat"]
51 | 
52 |     textcat = nlp.get_pipe("textcat")
53 |     assert textcat.is_resizable is True
54 | 
55 |     train_examples = []
56 |     doc = nlp.make_doc("ok")
57 |     doc.cats["X"] = 1.0
58 |     doc.cats["Y"] = 0.0
59 |     train_examples.append(Example(doc, doc))
60 | 
61 |     nlp.initialize(lambda: train_examples)
62 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_tok2vectransformer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from spacy.training.example import Example
 3 | from spacy.util import make_tempdir
 4 | from spacy import util
 5 | from thinc.api import Model, Config
 6 | from .util import _assert_equal_tensors
 7 | 
 8 | # fmt: off
 9 | TRAIN_DATA = [
10 |     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
11 |     ("Eat blue ham", {"tags": ["V", "J", "N"]}),
12 | ]
13 | 
14 | 
15 | cfg_string = """
16 |     [nlp]
17 |     lang = "en"
18 |     pipeline = ["tagger"]
19 | 
20 |     [components]
21 | 
22 |     [components.tagger]
23 |     factory = "tagger"
24 | 
25 |     [components.tagger.model]
26 |     @architectures = "spacy.Tagger.v1"
27 |     nO = null
28 | 
29 |     [components.tagger.model.tok2vec]
30 |     @architectures = "spacy-transformers.Tok2VecTransformer.v1"
31 |     name = "distilbert-base-uncased"
32 |     tokenizer_config = {"use_fast": false}
33 |     grad_factor = 1.0
34 | 
35 |     [components.tagger.model.tok2vec.get_spans]
36 |     @span_getters = "spacy-transformers.strided_spans.v1"
37 |     window = 256
38 |     stride = 256
39 | 
40 |     [components.tagger.model.tok2vec.pooling]
41 |     @layers = "reduce_mean.v1"
42 |     """
43 | # fmt: on
44 | 
45 | 
46 | def test_transformer_pipeline_tagger_internal():
47 |     """Test that a tagger with internal transformer runs and trains properly"""
48 |     orig_config = Config().from_str(cfg_string)
49 |     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
50 |     assert nlp.pipe_names == ["tagger"]
51 |     tagger = nlp.get_pipe("tagger")
52 |     tagger_trf = tagger.model.get_ref("tok2vec").layers[0]
53 |     assert isinstance(tagger_trf, Model)
54 |     train_examples = []
55 |     for t in TRAIN_DATA:
56 |         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
57 |         for tag in t[1]["tags"]:
58 |             tagger.add_label(tag)
59 | 
60 |     optimizer = nlp.initialize(lambda: train_examples)
61 |     for i in range(2):
62 |         losses = {}
63 |         nlp.update(train_examples, sgd=optimizer, losses=losses)
64 | 
65 |     doc = nlp("We're interested at underwater basket weaving.")
66 |     doc_tensor = tagger_trf.predict([doc])
67 | 
68 |     # ensure IO goes OK
69 |     with make_tempdir() as d:
70 |         file_path = d / "trained_nlp"
71 |         nlp.to_disk(file_path)
72 | 
73 |         # results are not the same if we don't call from_disk
74 |         nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
75 |         nlp2.initialize(lambda: train_examples)
76 |         doc2 = nlp2("We're interested at underwater basket weaving.")
77 |         tagger2 = nlp2.get_pipe("tagger")
78 |         tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0]
79 |         doc_tensor2 = tagger_trf2.predict([doc2])
80 |         with pytest.raises(AssertionError):
81 |             _assert_equal_tensors(
82 |                 doc_tensor2.doc_data[0].tensors, doc_tensor.doc_data[0].tensors
83 |             )
84 | 
85 |         # results ARE the same if we call from_disk
86 |         nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
87 |         nlp3.from_disk(file_path)
88 |         doc3 = nlp3("We're interested at underwater basket weaving.")
89 |         tagger3 = nlp3.get_pipe("tagger")
90 |         tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0]
91 |         doc_tensor3 = tagger_trf3.predict([doc3])
92 |         _assert_equal_tensors(
93 |             doc_tensor3.doc_data[0].tensors, doc_tensor.doc_data[0].tensors
94 |         )
95 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/test_truncation.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import numpy
  3 | from thinc.types import Ragged
  4 | from thinc.api import NumpyOps
  5 | from ..data_classes import WordpieceBatch
  6 | from ..truncate import _truncate_tokens, _truncate_alignment
  7 | 
  8 | 
  9 | @pytest.fixture
 10 | def sequences():
 11 |     # Each sequence is a list of tokens, and each token is a number of wordpieces
 12 |     return [
 13 |         [1, 3, 1],  # So 5 wordpieces this sequence
 14 |         [3, 7, 1, 1],  # 12
 15 |         [1],  # 1
 16 |         [20, 1],  # 21
 17 |     ]
 18 | 
 19 | 
 20 | @pytest.fixture
 21 | def shape(sequences):
 22 |     # Get the shape of the input_ids, which includes the padding.
 23 |     maximum = max(sum(lengths) for lengths in sequences)
 24 |     return (len(sequences), maximum)
 25 | 
 26 | 
 27 | @pytest.fixture
 28 | def seq_lengths(sequences):
 29 |     return numpy.array([sum(seq) for seq in sequences], dtype="i")
 30 | 
 31 | 
 32 | @pytest.fixture
 33 | def wordpieces(sequences):
 34 |     strings = []
 35 |     for token_lengths in sequences:
 36 |         strings.append([])
 37 |         for length in token_lengths:
 38 |             strings[-1].extend(str(i) for i in range(length))
 39 |     shape = (len(strings), max(len(seq) for seq in strings))
 40 |     wordpieces = WordpieceBatch(
 41 |         strings=strings,
 42 |         input_ids=numpy.zeros(shape, dtype="i"),
 43 |         token_type_ids=numpy.zeros(shape, dtype="i"),
 44 |         attention_mask=numpy.zeros((shape[0], shape[1]), dtype="bool"),
 45 |         lengths=[len(seq) for seq in strings],
 46 |     )
 47 |     return wordpieces
 48 | 
 49 | 
 50 | @pytest.fixture
 51 | def align(sequences):
 52 |     lengths = []
 53 |     indices = []
 54 |     offset = 0
 55 |     for seq in sequences:
 56 |         for token_length in seq:
 57 |             lengths.append(token_length)
 58 |             indices.extend(i + offset for i in range(token_length))
 59 |             offset += token_length
 60 |     return Ragged(numpy.array(indices, dtype="i"), numpy.array(lengths, dtype="i"))
 61 | 
 62 | 
 63 | @pytest.fixture
 64 | def max_length():
 65 |     return 6
 66 | 
 67 | 
 68 | @pytest.fixture
 69 | def mask_from_end(shape, max_length):
 70 |     n_seq, length = shape
 71 |     bools = [
 72 |         numpy.array([i < max_length for i in range(length)], dtype="bool")
 73 |         for _ in range(n_seq)
 74 |     ]
 75 |     return numpy.concatenate(bools)
 76 | 
 77 | 
 78 | def test_truncate_wordpieces(wordpieces, max_length, mask_from_end):
 79 |     truncated = _truncate_tokens(wordpieces, mask_from_end)
 80 |     for i, seq in enumerate(truncated.strings):
 81 |         assert len(seq) <= max_length
 82 |         assert seq == wordpieces.strings[i][:max_length]
 83 |         assert truncated.input_ids[i].shape[0] <= max_length
 84 |         assert truncated.token_type_ids[i].shape[0] <= max_length
 85 |         assert truncated.attention_mask[i].shape[0] <= max_length
 86 | 
 87 | 
 88 | def test_truncate_alignment_from_end(sequences, max_length, align, mask_from_end):
 89 |     # print("Max length", max_length)
 90 |     # print("Sequences", sequences)
 91 |     # print("Mask", mask_from_end)
 92 |     ops = NumpyOps()
 93 |     truncated = _truncate_alignment(align, mask_from_end)
 94 |     # print(truncated.dataXd.shape, truncated.lengths.sum())
 95 |     # print("Before", list(map(list, ops.unflatten(align.dataXd, align.lengths))))
 96 |     # print("After", list(map(list, ops.unflatten(truncated.dataXd, truncated.lengths))))
 97 |     # Check that the number of tokens hasn't changed. We still need to have
 98 |     # alignment for every token.
 99 |     assert truncated.lengths.shape[0] == align.lengths.shape[0]
100 |     start = 0
101 |     for i, seq in enumerate(sequences):
102 |         end = start + len(seq)
103 |         # Get the alignment for this sequence of tokens. Each length in the
104 |         # alignment indicates the number of wordpiece tokens, so we need to
105 |         # check that the sum of the lengths doesn't exceed the maximum.
106 |         wp_indices = truncated[start:end]
107 |         assert wp_indices.lengths.sum() <= max_length
108 |         # We're truncating from the end, so we shouldn't see different values
109 |         # except at the end of the sequence.
110 |         seen_zero = False
111 |         before = align[start:end]
112 |         for length_now, length_before in zip(wp_indices.lengths, before.lengths):
113 |             if seen_zero:
114 |                 assert length_now == 0, wp_indices.lengths
115 |             elif length_now == 0:
116 |                 seen_zero = True
117 |             else:
118 |                 length_now == length_before
119 | 


--------------------------------------------------------------------------------
/spacy_transformers/tests/util.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List, Union
  2 | import numpy
  3 | import torch
  4 | import copy
  5 | from transformers.file_utils import ModelOutput
  6 | from numpy.testing import assert_array_equal
  7 | 
  8 | from spacy.tokens import Doc
  9 | from thinc.api import Model, get_current_ops
 10 | 
 11 | from ..data_classes import FullTransformerBatch, HFObjects
 12 | from ..span_getters import get_doc_spans
 13 | from ..layers.transformer_model import forward as transformer_forward
 14 | 
 15 | 
 16 | def _assert_equal_tensors(tensors1, tensors2):
 17 |     ops = get_current_ops()
 18 |     for i in range(len(tensors1)):
 19 |         t1 = ops.asarray(tensors1[i])
 20 |         t2 = ops.asarray(tensors2[i])
 21 |         assert_array_equal(ops.to_numpy(t1), ops.to_numpy(t2))
 22 | 
 23 | 
 24 | class DummyTokenizer:
 25 |     def __init__(self):
 26 |         self.str2int = {}
 27 |         self.int2str = {}
 28 |         self.start_symbol = "<s>"
 29 |         self.end_symbol = "</s>"
 30 |         self.model_max_length = 512
 31 |         self.pad_token = "[PAD]"
 32 | 
 33 |     @property
 34 |     def all_special_tokens(self):
 35 |         return [self.start_symbol, self.end_symbol]
 36 | 
 37 |     def __call__(
 38 |         self,
 39 |         texts,
 40 |         add_special_tokens=True,
 41 |         max_length=None,
 42 |         stride: int = 0,
 43 |         truncation_strategy="longest_first",
 44 |         padding=False,
 45 |         truncation=False,
 46 |         is_pretokenized=False,
 47 |         return_tensors=None,
 48 |         return_token_type_ids=None,
 49 |         return_attention_mask=None,
 50 |         return_overflowing_tokens=False,
 51 |         return_special_tokens_masks=False,
 52 |         return_offsets_mapping=False,
 53 |         return_length=False,
 54 |     ):
 55 |         output: Dict = {
 56 |             "input_ids": [],
 57 |             "attention_mask": [],
 58 |             "token_type_ids": [],
 59 |         }  # type: ignore
 60 | 
 61 |         for text in texts:
 62 |             words, offsets, mask, type_ids = self._tokenize(text)
 63 |             ids = self._encode_words(words)
 64 |             output["input_ids"].append(ids)
 65 |             output["attention_mask"].append(mask)
 66 |             output["token_type_ids"].append(type_ids)
 67 |         if padding:
 68 |             output = self._pad(output)
 69 |         if return_tensors == "pt":
 70 |             output["input_ids"] = torch.tensor(output["input_ids"])  # type: ignore
 71 |             output["attention_mask"] = torch.tensor(output["attention_mask"])  # type: ignore
 72 |             output["token_type_ids"] = torch.tensor(output["token_type_ids"])  # type: ignore
 73 |         elif return_tensors == "np":
 74 |             output["input_ids"] = numpy.asarray(output["input_ids"])  # type: ignore
 75 |             output["attention_mask"] = numpy.asarray(output["attention_mask"])  # type: ignore
 76 |             output["token_type_ids"] = numpy.asarray(output["token_type_ids"])  # type: ignore
 77 |         if return_length:
 78 |             output["length"] = torch.tensor([len(x) for x in output["input_ids"]])  # type: ignore
 79 |         return output
 80 | 
 81 |     def convert_ids_to_tokens(self, ids: Union[List[int], torch.Tensor]) -> List[str]:
 82 |         return [self.int2str[int(id_)] for id_ in ids]  # type: ignore
 83 | 
 84 |     def _pad(self, batch):
 85 |         batch = copy.deepcopy(batch)
 86 |         longest = max(len(ids) for ids in batch["input_ids"])
 87 |         for i in range(len(batch["input_ids"])):
 88 |             length = len(batch["input_ids"][i])
 89 |             difference = longest - length
 90 |             batch["attention_mask"][i] = [1] * length + [0] * difference
 91 |             batch["input_ids"][i].extend([0] * difference)
 92 |             batch["token_type_ids"][i].extend([2] * difference)
 93 |         return batch
 94 | 
 95 |     def _tokenize(self, text):
 96 |         offsets = []
 97 |         start = 0
 98 |         for i, char in enumerate(text):
 99 |             if char == " ":
100 |                 offsets.append((start, i))
101 |                 start = i + 1
102 |         if start < len(text):
103 |             offsets.append((start, len(text)))
104 |         words = [text[start:end] for start, end in offsets]
105 |         type_ids = [0] + [1] * len(words) + [0]
106 |         words = [self.start_symbol] + words + [self.end_symbol]
107 |         offsets = [None] + offsets + [None]
108 |         mask = [1] * len(words)
109 |         return words, offsets, mask, type_ids
110 | 
111 |     def _encode_words(self, words):
112 |         ids = []
113 |         for word in words:
114 |             if word not in self.str2int:
115 |                 self.int2str[len(self.str2int)] = word
116 |                 self.str2int[word] = len(self.str2int)
117 |             ids.append(self.str2int[word])
118 |         return ids
119 | 
120 | 
121 | def DummyTransformerModel(width: int, depth: int):
122 |     def _forward(model, tokens, is_train):
123 |         width = model.attrs["width"]
124 |         depth = model.attrs["depth"]
125 |         shape = (depth, tokens.input_ids.shape[0], tokens.input_ids.shape[1], width)
126 |         tensors = torch.zeros(*shape)
127 |         return ModelOutput(last_hidden_state=tensors), lambda d_tensors: tokens
128 | 
129 |     return Model(
130 |         "dummy-transformer",
131 |         _forward,
132 |         attrs={"width": width, "depth": depth},
133 |     )
134 | 
135 | 
136 | def DummyTransformer(
137 |     depth: int = 2, width: int = 4, get_spans=get_doc_spans
138 | ) -> Model[List[Doc], FullTransformerBatch]:
139 |     """Create a test model that produces a FullTransformerBatch object."""
140 |     hf_model = HFObjects(DummyTokenizer(), None, None)
141 | 
142 |     return DummyModel(
143 |         "dummy-transformer",
144 |         transformer_forward,
145 |         layers=[DummyTransformerModel(width=width, depth=depth)],
146 |         attrs={
147 |             "get_spans": get_spans,
148 |             "hf_model": hf_model,
149 |             "grad_factor": 1.0,
150 |             "flush_cache_chance": 0.0,
151 |             "transformer_config": {},
152 |         },
153 |         dims={"nO": width},
154 |     )
155 | 
156 | 
157 | class DummyModel(Model):
158 |     @property
159 |     def tokenizer(self):
160 |         return DummyTokenizer()
161 | 
162 |     @property
163 |     def transformer(self):
164 |         return None
165 | 
166 |     @property
167 |     def tokenizer_config(self):
168 |         return {}
169 | 
170 |     @property
171 |     def transformer_config(self):
172 |         return {}
173 | 


--------------------------------------------------------------------------------
/spacy_transformers/truncate.py:
--------------------------------------------------------------------------------
  1 | from typing import Tuple, List, Union, TypeVar
  2 | import numpy
  3 | from thinc.types import Ragged, Ints2d, Floats2d
  4 | from .data_classes import WordpieceBatch
  5 | 
  6 | ArrayT = TypeVar("ArrayT", bound=Union[Ints2d, Floats2d])
  7 | 
  8 | 
  9 | def truncate_oversize_splits(
 10 |     wordpieces: WordpieceBatch, align: Ragged, max_length: int
 11 | ) -> Tuple[WordpieceBatch, Ragged]:
 12 |     """Drop wordpieces from inputs that are too long. This can happen because
 13 |     the splitter is based on linguistic tokens, and the number of wordpieces
 14 |     that each token is split into is unpredictable, so we can end up with splits
 15 |     that have more wordpieces than the model's maximum.
 16 | 
 17 |     To solve this, we calculate a score for each wordpiece in the split,
 18 |     and drop the wordpieces with the highest scores. I can think of a few
 19 |     scoring schemes we could use:
 20 | 
 21 |     a) Drop the ends of longest wordpieces. This scoring would be just:
 22 |         position_in_token
 23 |     b) Drop the middles of longest wordpieces. The score would be:
 24 |         min(length - position_in_token, position_in_token)
 25 |     c) Drop all wordpieces from longest tokens. This would be:
 26 |         length
 27 |     d) Drop wordpieces from the end of the split. This would be:
 28 |         position_in_split
 29 | 
 30 |     The advantage of a) and b) is that they give some representation to each
 31 |     token. The advantage of c) is that it leaves a higher % of tokens with intact
 32 |     representations. The advantage of d) is that it leaves contiguous chunks of
 33 |     wordpieces intact, and drops from the end.
 34 | 
 35 |     I find b) most appealing, but it's also the most complicated. Let's just do
 36 |     d) for now.
 37 |     """
 38 |     if wordpieces.input_ids.shape[1] < max_length:
 39 |         return wordpieces, align
 40 |     mask = _get_truncation_mask_drop_from_end(
 41 |         wordpieces.input_ids.shape, wordpieces.lengths, align, max_length
 42 |     )
 43 |     return _truncate_tokens(wordpieces, mask), _truncate_alignment(align, mask)
 44 | 
 45 | 
 46 | def _get_truncation_mask_drop_from_end(
 47 |     shape: Tuple[int, int], split_lengths: List[int], align: Ragged, max_length: int
 48 | ) -> numpy.ndarray:
 49 |     """Return a two-dimensional boolean mask, indicating whether wordpieces
 50 |     are dropped from their sequences.
 51 | 
 52 |     Drop wordpieces from the end of the sequence.
 53 |     """
 54 |     mask = numpy.ones(shape, dtype="bool")
 55 |     mask[:, max_length:] = 0
 56 |     return mask
 57 | 
 58 | 
 59 | def _truncate_tokens(wordpieces: WordpieceBatch, mask: numpy.ndarray) -> WordpieceBatch:
 60 |     n_seq = len(wordpieces)
 61 |     mask1d = mask.ravel()
 62 |     mask = mask.reshape((n_seq, -1))
 63 | 
 64 |     strings: List[List[str]] = []
 65 |     for i, seq in enumerate(wordpieces.strings):
 66 |         strings.append([])
 67 |         for j, token in enumerate(seq):
 68 |             if mask[i, j]:
 69 |                 strings[-1].append(token)
 70 | 
 71 |     def filter_array(data: ArrayT) -> ArrayT:
 72 |         data1d = data.reshape((-1,))
 73 |         return data1d[mask1d].reshape((n_seq, -1))  # type: ignore
 74 | 
 75 |     filtered_token_type_ids = None
 76 |     if wordpieces.token_type_ids is not None:
 77 |         filtered_token_type_ids = filter_array(wordpieces.token_type_ids)
 78 | 
 79 |     return WordpieceBatch(
 80 |         strings=strings,
 81 |         input_ids=filter_array(wordpieces.input_ids),
 82 |         attention_mask=filter_array(wordpieces.attention_mask),
 83 |         lengths=[len(seq) for seq in strings],
 84 |         token_type_ids=filtered_token_type_ids,
 85 |     )
 86 | 
 87 | 
 88 | def _truncate_alignment(align: Ragged, mask: numpy.ndarray) -> Ragged:
 89 |     # We're going to have fewer wordpieces in the new array, so all of our
 90 |     # wordpiece indices in the alignment table will be off --- they'll point
 91 |     # to the wrong row. So we need to do three things here:
 92 |     #
 93 |     # 1) Adjust all the indices in align.dataXd to account for the dropped data
 94 |     # 2) Remove the dropped indices from the align.dataXd
 95 |     # 3) Calculate new align.lengths
 96 |     #
 97 |     # The wordpiece mapping is easily calculated by the cumulative sum of the
 98 |     # mask table.
 99 |     # Let's say we have [True, False, False, True]. The mapping of the dropped
100 |     # wordpieces doesn't matter, because we can filter it with the mask. So we
101 |     # have [0, 0, 0, 1], i.e the wordpiece that was
102 |     # at 0 is still at 0, and the wordpiece that was at 3 is now at 1.
103 |     mask = mask.ravel()
104 |     idx_map = mask.cumsum() - 1
105 |     idx_map[~mask] = -1
106 |     # Step 1: Adjust all the indices in align.dataXd.
107 |     new_align = idx_map[align.data.ravel()]
108 |     # Step 2: Remove the dropped indices
109 |     new_align = new_align[new_align >= 0]
110 |     # Step 3: Calculate new align.lengths
111 |     new_lengths = align.lengths.copy()
112 |     for i in range(len(align.lengths)):
113 |         drops = ~mask[align[i].data.ravel()]
114 |         new_lengths[i] -= drops.sum()
115 |     return Ragged(new_align, new_lengths)
116 | 


--------------------------------------------------------------------------------
/spacy_transformers/util.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Union, Set
  2 | from pathlib import Path
  3 | import random
  4 | from transformers import AutoModel, AutoTokenizer
  5 | from transformers.tokenization_utils import BatchEncoding
  6 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
  7 | import catalogue
  8 | from spacy.util import registry
  9 | from thinc.api import get_torch_default_device
 10 | import torch.cuda
 11 | import tempfile
 12 | import shutil
 13 | import contextlib
 14 | import warnings
 15 | 
 16 | 
 17 | # fmt: off
 18 | registry.span_getters = catalogue.create("spacy", "span_getters", entry_points=True)  # type: ignore
 19 | registry.annotation_setters = catalogue.create("spacy", "annotation_setters", entry_points=True)  # type: ignore
 20 | # fmt: on
 21 | 
 22 | 
 23 | def huggingface_from_pretrained(source: Union[Path, str], config: Dict):
 24 |     """Create a Huggingface transformer model from pretrained weights. Will
 25 |     download the model if it is not already downloaded.
 26 | 
 27 |     source (Union[str, Path]): The name of the model or a path to it, such as
 28 |         'bert-base-cased'.
 29 |     config (dict): Settings to pass to the tokenizer.
 30 |     """
 31 |     warnings.warn(
 32 |         "spacy_transformers.util.huggingface_from_pretrained has been moved to "
 33 |         "spacy_transformers.layers.transformer_model.huggingface_from_pretrained "
 34 |         "with an updated API:\n"
 35 |         "huggingface_from_pretrained(source, tok_config, trf_config) -> HFObjects",
 36 |         DeprecationWarning,
 37 |     )
 38 |     if isinstance(source, Path):
 39 |         str_path = str(source.absolute())
 40 |     else:
 41 |         str_path = source
 42 |     tokenizer = AutoTokenizer.from_pretrained(str_path, **config)
 43 |     transformer = AutoModel.from_pretrained(str_path)
 44 |     torch_device = get_torch_default_device()
 45 |     transformer.to(torch_device)
 46 |     return tokenizer, transformer
 47 | 
 48 | 
 49 | def huggingface_tokenize(tokenizer, texts: List[str]) -> BatchEncoding:
 50 |     """Apply a Huggingface tokenizer to a batch of texts."""
 51 | 
 52 |     # Use NumPy arrays rather than PyTorch tensors to avoid a lot of
 53 |     # host <-> device transfers during tokenization and post-processing
 54 |     # when a GPU is used.
 55 |     warnings.warn(
 56 |         "spacy_transformers.util.huggingface_tokenize has been moved to "
 57 |         "spacy_transformers.layers.transformer_model.huggingface_tokenize.",
 58 |         DeprecationWarning,
 59 |     )
 60 |     token_data = tokenizer(
 61 |         texts,
 62 |         add_special_tokens=True,
 63 |         return_attention_mask=True,
 64 |         return_offsets_mapping=isinstance(tokenizer, PreTrainedTokenizerFast),
 65 |         return_tensors="np",
 66 |         return_token_type_ids=None,  # Sets to model default
 67 |         padding="longest",
 68 |     )
 69 |     token_data["input_texts"] = []
 70 |     for i in range(len(token_data["input_ids"])):
 71 |         wp_texts = tokenizer.convert_ids_to_tokens(token_data["input_ids"][i])
 72 |         token_data["input_texts"].append(wp_texts)
 73 |     token_data["pad_token"] = tokenizer.pad_token
 74 |     return token_data
 75 | 
 76 | 
 77 | def maybe_flush_pytorch_cache(chance: float = 1.0):
 78 |     """Flip a coin and decide whether to flush PyTorch's cache. This allows the
 79 |     cache to be flushed periodically without maintaining a counter.
 80 | 
 81 |     I'm not sure why this is necessary, it shouldn't be. But it definitely does
 82 |     help...
 83 |     """
 84 |     if random.random() < chance and torch.cuda.is_available():
 85 |         torch.cuda.empty_cache()
 86 | 
 87 | 
 88 | def transpose_list(nested_list):
 89 |     output = []
 90 |     for i, entry in enumerate(nested_list):
 91 |         while len(output) < len(entry):
 92 |             output.append([None] * len(nested_list))
 93 |         for j, x in enumerate(entry):
 94 |             output[j][i] = x
 95 |     return output
 96 | 
 97 | 
 98 | def batch_by_length(seqs, max_words: int) -> List[List[int]]:
 99 |     """Given a list of sequences, return a batched list of indices into the
100 |     list, where the batches are grouped by length, in descending order.
101 | 
102 |     Batches may be at most max_words in size, defined as max sequence length * size.
103 |     """
104 |     # Use negative index so we can get sort by position ascending.
105 |     lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)]
106 |     lengths_indices.sort()
107 |     batches: List[List[int]] = []
108 |     batch: List[int] = []
109 |     for length, i in lengths_indices:
110 |         if not batch:
111 |             batch.append(i)
112 |         elif length * (len(batch) + 1) <= max_words:
113 |             batch.append(i)
114 |         else:
115 |             batches.append(batch)
116 |             batch = [i]
117 |     if batch:
118 |         batches.append(batch)
119 |     # Check lengths match
120 |     assert sum(len(b) for b in batches) == len(seqs)
121 |     # Check no duplicates
122 |     seen: Set[int] = set()
123 |     for b in batches:
124 |         seen.update(id(item) for item in b)
125 |     assert len(seen) == len(seqs)
126 |     batches = [list(sorted(batch)) for batch in batches]
127 |     batches.reverse()
128 |     return batches
129 | 
130 | 
131 | def log_gpu_memory(logger, context):
132 |     mem = torch.cuda.memory_allocated() // 1024**2
133 |     logger.info(f"{mem:.1f}: {context}")
134 | 
135 | 
136 | def log_batch_size(logger, token_data, is_train):
137 |     batch_size = token_data["input_ids"].shape[0]
138 |     seq_len = token_data["input_ids"].shape[1]
139 |     squared = seq_len**2 * batch_size
140 | 
141 |     if is_train:
142 |         logger.info(f"{batch_size} x {seq_len} ({squared}) update")
143 |     else:
144 |         logger.info(f"{batch_size} x {seq_len} ({squared}) predict")
145 | 
146 | 
147 | @contextlib.contextmanager
148 | def make_tempdir():
149 |     """Execute a block in a temporary directory and remove the directory and
150 |     its contents at the end of the with block.
151 | 
152 |     YIELDS (Path): The path of the temp directory.
153 |     """
154 |     d = Path(tempfile.mkdtemp())
155 |     yield d
156 |     shutil.rmtree(str(d))
157 | 


--------------------------------------------------------------------------------