├── .github ├── PULL_REQUEST_TEMPLATE.md ├── no-response.yml └── workflows │ ├── cibuildwheel.yml │ ├── explosionbot.yml │ ├── issue-manager.yml │ ├── publish_pypi.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── bin ├── get-version.sh └── push-tag.sh ├── build-constraints.txt ├── examples └── configs │ ├── joint-core-bert.cfg │ └── ner-albert.cfg ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── spacy_transformers ├── __init__.py ├── align.pyi ├── align.pyx ├── annotation_setters.py ├── architectures.py ├── data_classes.py ├── layers ├── __init__.py ├── _util.py ├── hf_shim.py ├── hf_wrapper.py ├── listener.py ├── split_trf.py ├── transformer_model.py └── trfs2arrays.py ├── pipeline_component.py ├── py.typed ├── span_getters.py ├── tests ├── __init__.py ├── enable_gpu.py ├── regression │ ├── __init__.py │ ├── test_spacy_issue6401.py │ └── test_spacy_issue7029.py ├── test_alignment.py ├── test_configs.py ├── test_data_classes.py ├── test_deprecations.py ├── test_model_sequence_classification.py ├── test_model_wrapper.py ├── test_pipeline_component.py ├── test_serialize.py ├── test_spanners.py ├── test_textcatcnn.py ├── test_tok2vectransformer.py ├── test_truncation.py └── util.py ├── truncate.py └── util.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 10 | 11 | ### Types of change 12 | 13 | 15 | 16 | ## Checklist 17 | 18 | 20 | 21 | - [ ] I confirm that I have the right to submit this contribution under the project's MIT license. 22 | - [ ] I ran the tests, and all new and existing tests passed. 23 | - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 24 | -------------------------------------------------------------------------------- /.github/no-response.yml: -------------------------------------------------------------------------------- 1 | # Configuration for probot-no-response - https://github.com/probot/no-response 2 | 3 | # Number of days of inactivity before an Issue is closed for lack of response 4 | daysUntilClose: 14 5 | # Label requiring a response 6 | responseRequiredLabel: more-info-needed 7 | # Comment to post when closing an Issue for lack of response. Set to `false` to disable 8 | closeComment: > 9 | This issue has been automatically closed because there has been no response 10 | to a request for more information from the original author. With only the 11 | information that is currently in the issue, there's not enough information 12 | to take action. If you're the original author, feel free to reopen the issue 13 | if you have or find the answers needed to investigate further. 14 | -------------------------------------------------------------------------------- /.github/workflows/cibuildwheel.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | tags: 6 | # ytf did they invent their own syntax that's almost regex? 
7 | # ** matches 'zero or more of any character' 8 | - 'release-v[0-9]+.[0-9]+.[0-9]+**' 9 | - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**' 10 | jobs: 11 | build_wheels: 12 | name: Build wheels on ${{ matrix.os }} 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | # macos-13 is an intel runner, macos-14 is apple silicon 17 | os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm] 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Build wheels 22 | uses: pypa/cibuildwheel@v2.21.3 23 | env: 24 | CIBW_SOME_OPTION: value 25 | with: 26 | package-dir: . 27 | output-dir: wheelhouse 28 | config-file: "{package}/pyproject.toml" 29 | - uses: actions/upload-artifact@v4 30 | with: 31 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 32 | path: ./wheelhouse/*.whl 33 | 34 | build_sdist: 35 | name: Build source distribution 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - name: Build sdist 41 | run: pipx run build --sdist 42 | - uses: actions/upload-artifact@v4 43 | with: 44 | name: cibw-sdist 45 | path: dist/*.tar.gz 46 | create_release: 47 | needs: [build_wheels, build_sdist] 48 | runs-on: ubuntu-latest 49 | permissions: 50 | contents: write 51 | checks: write 52 | actions: read 53 | issues: read 54 | packages: write 55 | pull-requests: read 56 | repository-projects: read 57 | statuses: read 58 | steps: 59 | - name: Get the tag name and determine if it's a prerelease 60 | id: get_tag_info 61 | run: | 62 | FULL_TAG=${GITHUB_REF#refs/tags/} 63 | if [[ $FULL_TAG == release-* ]]; then 64 | TAG_NAME=${FULL_TAG#release-} 65 | IS_PRERELEASE=false 66 | elif [[ $FULL_TAG == prerelease-* ]]; then 67 | TAG_NAME=${FULL_TAG#prerelease-} 68 | IS_PRERELEASE=true 69 | else 70 | echo "Tag does not match expected patterns" >&2 71 | exit 1 72 | fi 73 | echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV 74 | echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV 75 | echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV 76 | - uses: actions/download-artifact@v4 77 | with: 78 | # unpacks all CIBW artifacts into dist/ 79 | pattern: cibw-* 80 | path: dist 81 | merge-multiple: true 82 | - name: Create Draft Release 83 | id: create_release 84 | uses: softprops/action-gh-release@v2 85 | if: startsWith(github.ref, 'refs/tags/') 86 | env: 87 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 88 | with: 89 | name: ${{ env.TAG_NAME }} 90 | draft: true 91 | prerelease: ${{ env.IS_PRERELEASE }} 92 | files: "./dist/*" 93 | -------------------------------------------------------------------------------- /.github/workflows/explosionbot.yml: -------------------------------------------------------------------------------- 1 | name: Explosion Bot 2 | 3 | on: 4 | issue_comment: 5 | types: 6 | - created 7 | - edited 8 | 9 | jobs: 10 | explosion-bot: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Dump GitHub context 14 | env: 15 | GITHUB_CONTEXT: ${{ toJson(github) }} 16 | run: echo "$GITHUB_CONTEXT" 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | - name: Install and run explosion-bot 20 | run: | 21 | pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot 22 | python -m explosionbot 23 | env: 24 | INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} 25 | INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} 26 | ENABLED_COMMANDS: "test_gpu" 27 | ALLOWED_TEAMS: "spacy-maintainers" 28 | -------------------------------------------------------------------------------- /.github/workflows/issue-manager.yml: 
-------------------------------------------------------------------------------- 1 | name: Issue Manager 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" 6 | issue_comment: 7 | types: 8 | - created 9 | - edited 10 | issues: 11 | types: 12 | - labeled 13 | 14 | jobs: 15 | issue-manager: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: tiangolo/issue-manager@0.2.1 19 | with: 20 | token: ${{ secrets.GITHUB_TOKEN }} 21 | config: > 22 | { 23 | "resolved": { 24 | "delay": "P7D", 25 | "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", 26 | "remove_label_on_comment": true, 27 | "remove_label_on_close": true 28 | } 29 | } -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yml: -------------------------------------------------------------------------------- 1 | # The cibuildwheel action triggers on creation of a release, this 2 | # triggers on publication. 3 | # The expected workflow is to create a draft release and let the wheels 4 | # upload, and then hit 'publish', which uploads to PyPi. 5 | 6 | on: 7 | release: 8 | types: 9 | - published 10 | 11 | jobs: 12 | upload_pypi: 13 | runs-on: ubuntu-latest 14 | environment: 15 | name: pypi 16 | url: https://pypi.org/p/spacy-transformers 17 | permissions: 18 | id-token: write 19 | contents: read 20 | if: github.event_name == 'release' && github.event.action == 'published' 21 | # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) 22 | # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 23 | steps: 24 | - uses: robinraju/release-downloader@v1 25 | with: 26 | tag: ${{ github.event.release.tag_name }} 27 | fileName: '*' 28 | out-file-path: 'dist' 29 | - uses: pypa/gh-action-pypi-publish@release/v1 30 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - "*.md" 7 | pull_request: 8 | types: [opened, synchronize, reopened, edited] 9 | paths-ignore: 10 | - "*.md" 11 | 12 | env: 13 | MODULE_NAME: "spacy_transformers" 14 | RUN_MYPY: "true" 15 | 16 | jobs: 17 | tests: 18 | name: Test 19 | if: github.repository_owner == 'explosion' 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | os: [ubuntu-latest, windows-latest, macos-latest] 24 | python_version: ["3.12"] 25 | include: 26 | - os: macos-13 27 | python_version: "3.10" 28 | - os: windows-latest 29 | python_version: "3.11" 30 | - os: ubuntu-latest 31 | python_version: "3.12" 32 | - os: macos-13 33 | python_version: "3.12" 34 | - os: windows-latest 35 | python_version: "3.12" 36 | 37 | runs-on: ${{ matrix.os }} 38 | 39 | steps: 40 | - name: Check out repo 41 | uses: actions/checkout@v3 42 | 43 | - name: Configure Python version 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: ${{ matrix.python_version }} 47 | 48 | - name: Install dependencies 49 | run: | 50 | python -m pip install -U build pip setuptools wheel 51 | python -m pip install -r requirements.txt --force-reinstall 52 | 53 | - name: Build sdist 54 | run: | 55 | python -m build --sdist 56 | 57 | - name: Run mypy 58 | if: env.RUN_MYPY == 'true' && matrix.python_version != '3.6' 59 | shell: bash 60 | run: | 61 | python -m mypy $MODULE_NAME 62 | 63 | - name: Delete source directory 64 | shell: bash 65 | run: | 66 | rm -rf $MODULE_NAME 
67 | 68 | - name: Uninstall all packages 69 | run: | 70 | python -m pip freeze --exclude pywin32 --exclude torch 71 | python -m pip freeze --exclude pywin32 --exclude torch > installed.txt 72 | python -m pip uninstall -y -r installed.txt 73 | 74 | - name: Install newest torch for python 3.7+ 75 | if: matrix.python_version != '3.6' 76 | run: | 77 | python -m pip install torch --index-url https://download.pytorch.org/whl/cpu --force-reinstall 78 | 79 | - name: Install from sdist 80 | shell: bash 81 | run: | 82 | SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) 83 | python -m pip install dist/$SDIST 84 | 85 | - name: Run tests 86 | shell: bash 87 | run: | 88 | python -m pip install -r requirements.txt --force-reinstall 89 | # The version of pytorch being used here requires numpy v2, but because of the way we're doing the 90 | # requirements installation here it's not being resolved that way. So just install numpy 1 here. 91 | python -m pip install "numpy<2" 92 | python -m pytest --pyargs $MODULE_NAME --cov=$MODULE_NAME 93 | 94 | - name: Test backwards compatibility for v1.0 models 95 | if: matrix.python_version == '3.9' 96 | run: | 97 | python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.1.0/en_core_web_trf-3.1.0-py3-none-any.whl --no-deps 98 | python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')" 99 | 100 | - name: Test backwards compatibility for v1.1 models 101 | if: matrix.python_version == '3.9' 102 | run: | 103 | python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl --no-deps 104 | python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')" 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | tmp/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # vim 13 | .*.sw* 14 | 15 | # Cython / C extensions 16 | cythonize.json 17 | spacy_transformers/*.html 18 | *.cpp 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | pip-wheel-metadata/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # Pycharm project files 139 | *.idea 140 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 ExplosionAI GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include spacy_transformers *.pyi *.pyx *.pxd 2 | recursive-exclude spacy_transformers *.cpp 3 | include LICENSE 4 | include README.md 5 | include pyproject.toml 6 | include spacy_transformers/py.typed 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | PYVER := 3.6 3 | VENV := ./env$(PYVER) 4 | 5 | version := $(shell "bin/get-version.sh") 6 | 7 | dist/spacy-trf-$(version).pex : wheelhouse/spacy-trf-$(version).stamp 8 | $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -o $@ spacy_transformers==$(version) jsonschema 9 | chmod a+rx $@ 10 | 11 | wheelhouse/spacy-trf-$(version).stamp : $(VENV)/bin/pex setup.py spacy_transformers/*.py* spacy_transformers/*/*.py* 12 | $(VENV)/bin/pip wheel . -w ./wheelhouse 13 | $(VENV)/bin/pip wheel jsonschema -w ./wheelhouse 14 | touch $@ 15 | 16 | $(VENV)/bin/pex : 17 | python$(PYVER) -m venv $(VENV) 18 | $(VENV)/bin/pip install -U pip setuptools pex wheel 19 | 20 | .PHONY : clean 21 | 22 | clean : setup.py 23 | rm -rf dist/* 24 | rm -rf ./wheelhouse 25 | rm -rf $(VENV) 26 | python setup.py clean --all 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # spacy-transformers: Use pretrained transformers like BERT, XLNet and GPT-2 in spaCy 4 | 5 | This package provides [spaCy](https://github.com/explosion/spaCy) components and 6 | architectures to use transformer models via 7 | [Hugging Face's `transformers`](https://github.com/huggingface/transformers) in 8 | spaCy. The result is convenient access to state-of-the-art transformer 9 | architectures, such as BERT, GPT-2, XLNet, etc. 10 | 11 | > **This release requires [spaCy v3](https://spacy.io/usage/v3).** For the 12 | > previous version of this library, see the 13 | > [`v0.6.x` branch](https://github.com/explosion/spacy-transformers/tree/v0.6.x). 14 | 15 | [![tests](https://github.com/explosion/spacy-transformers/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/spacy-transformers/actions/workflows/tests.yml) 16 | [![PyPi](https://img.shields.io/pypi/v/spacy-transformers.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/spacy-transformers) 17 | [![GitHub](https://img.shields.io/github/release/explosion/spacy-transformers/all.svg?style=flat-square&logo=github)](https://github.com/explosion/spacy-transformers/releases) 18 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) 19 | 20 | ## Features 21 | 22 | - Use pretrained transformer models like **BERT**, **RoBERTa** and **XLNet** to 23 | power your spaCy pipeline. 24 | - Easy **multi-task learning**: backprop to one transformer model from several 25 | pipeline components. 26 | - Train using spaCy v3's powerful and extensible config system. 27 | - Automatic alignment of transformer output to spaCy's tokenization. 28 | - Easily customize what transformer data is saved in the `Doc` object. 29 | - Easily customize how long documents are processed. 30 | - Out-of-the-box serialization and model packaging. 
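For a quick picture of what this looks like in practice, here is a minimal usage sketch. It assumes a trained transformer-based pipeline such as `en_core_web_trf` is already installed (the same package this repo's test suite downloads); the `transformer` component runs as part of the pipeline and stores its aligned output on each `Doc`:

```python
import spacy

# Assumes en_core_web_trf (or another transformer-based pipeline) is installed.
nlp = spacy.load("en_core_web_trf")
doc = nlp("spacy-transformers aligns wordpieces with spaCy tokens.")

# The transformer component saves its output on the Doc via the Doc._.trf_data
# extension, where downstream components (and your own code) can reuse it.
trf_data = doc._.trf_data
print([(ent.text, ent.label_) for ent in doc.ents])
```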
31 | 32 | ## 🚀 Installation 33 | 34 | Installing the package from pip will automatically install all dependencies, 35 | including PyTorch and spaCy. Make sure you install this package **before** you 36 | install the models. Also note that this package requires **Python 3.6+**, 37 | **PyTorch v1.5+** and **spaCy v3.0+**. 38 | 39 | ```bash 40 | pip install 'spacy[transformers]' 41 | ``` 42 | 43 | For GPU installation, find your CUDA version using `nvcc --version` and add the 44 | [version in brackets](https://spacy.io/usage/#gpu), e.g. 45 | `spacy[transformers,cuda92]` for CUDA9.2 or `spacy[transformers,cuda100]` for 46 | CUDA10.0. 47 | 48 | If you are having trouble installing PyTorch, follow the 49 | [instructions](https://pytorch.org/get-started/locally/) on the official website 50 | for your specific operating system and requirements. 51 | 52 | ## 📖 Documentation 53 | 54 | > ⚠️ **Important note:** This package has been extensively refactored to take 55 | > advantage of [spaCy v3.0](https://spacy.io). Previous versions that were built 56 | > for [spaCy v2.x](https://v2.spacy.io) worked considerably differently. Please 57 | > see previous tagged versions of this README for documentation on prior 58 | > versions. 59 | 60 | - 📘 61 | [Embeddings, Transformers and Transfer Learning](https://spacy.io/usage/embeddings-transformers): 62 | How to use transformers in spaCy 63 | - 📘 [Training Pipelines and Models](https://spacy.io/usage/training): Train and 64 | update components on your own data and integrate custom models 65 | - 📘 66 | [Layers and Model Architectures](https://spacy.io/usage/layers-architectures): 67 | Power spaCy components with custom neural networks 68 | - 📗 [`Transformer`](https://spacy.io/api/transformer): Pipeline component API 69 | reference 70 | - 📗 71 | [Transformer architectures](https://spacy.io/api/architectures#transformers): 72 | Architectures and registered functions 73 | 74 | ## Applying pretrained text and token classification models 75 | 76 | Note that the `transformer` component from `spacy-transformers` does not support 77 | task-specific heads like token or text classification. A task-specific 78 | transformer model can be used as a source of features to train spaCy components 79 | like `ner` or `textcat`, but the `transformer` component does not provide access 80 | to task-specific heads for training or inference. 81 | 82 | Alternatively, if you only want use to the **predictions** from an existing 83 | Hugging Face text or token classification model, you can use the wrappers from 84 | [`spacy-huggingface-pipelines`](https://github.com/explosion/spacy-huggingface-pipelines) 85 | to incorporate task-specific transformer models into your spaCy pipelines. 86 | 87 | ## Bug reports and other issues 88 | 89 | Please use [spaCy's issue tracker](https://github.com/explosion/spaCy/issues) to 90 | report a bug, or open a new thread on the 91 | [discussion board](https://github.com/explosion/spaCy/discussions) for any other 92 | issue. 
93 | -------------------------------------------------------------------------------- /bin/get-version.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | version=$(grep "version = " setup.cfg) 6 | version=${version/version = } 7 | version=${version/\'/} 8 | version=${version/\'/} 9 | version=${version/\"/} 10 | version=${version/\"/} 11 | 12 | echo $version 13 | -------------------------------------------------------------------------------- /bin/push-tag.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | # Insist repository is clean 6 | git diff-index --quiet HEAD 7 | 8 | git checkout $1 9 | git pull origin $1 10 | git push origin $1 11 | 12 | version=$(grep "version = " setup.cfg) 13 | version=${version/version = } 14 | version=${version/\'/} 15 | version=${version/\'/} 16 | version=${version/\"/} 17 | version=${version/\"/} 18 | git tag "v$version" 19 | git push origin "v$version" 20 | -------------------------------------------------------------------------------- /build-constraints.txt: -------------------------------------------------------------------------------- 1 | # build version constraints for use with wheelwright + multibuild 2 | numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64' 3 | numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' 4 | numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' 5 | numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' 6 | numpy>=1.25.0; python_version>='3.9' 7 | -------------------------------------------------------------------------------- /examples/configs/joint-core-bert.cfg: -------------------------------------------------------------------------------- 1 | [training] 2 | seed = 0 3 | gold_preproc = false 4 | # Limitations on training document length or number of examples. 
5 | max_length = 500 6 | limit = 0 7 | patience = 10000 8 | eval_frequency = 400 9 | dropout = 0.1 10 | init_tok2vec = null 11 | max_epochs = 0 12 | max_steps = 0 13 | orth_variant_level = 0.0 14 | 15 | scores = ["speed", "tags_acc", "uas", "las", "ents_f"] 16 | score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} 17 | 18 | base_model = null 19 | use_pytorch_for_gpu_memory = true 20 | omit_extra_lookups = false 21 | raw_text = null 22 | tag_map = null 23 | vectors = null 24 | morph_rules = null 25 | 26 | batch_by = "padded" 27 | batch_size = 2000 28 | accumulate_gradient = 3 29 | discard_oversize = true 30 | eval_batch_size = 256 31 | 32 | [training.optimizer] 33 | @optimizers = "Adam.v1" 34 | beta1 = 0.9 35 | beta2 = 0.999 36 | eps = 1e-8 37 | L2_is_weight_decay = true 38 | L2 = 0.01 39 | grad_clip = 1.0 40 | use_averages = false 41 | 42 | [training.optimizer.learn_rate] 43 | @schedules = "warmup_linear.v1" 44 | warmup_steps = 250 45 | total_steps = 20000 46 | initial_rate = 5e-5 47 | 48 | 49 | [nlp] 50 | lang = "en" 51 | stop_words = [] 52 | lex_attr_getters = {} 53 | pipeline = ["transformer", "tagger", "parser", "ner"] 54 | 55 | [nlp.tokenizer] 56 | @tokenizers = "spacy.Tokenizer.v1" 57 | 58 | [nlp.lemmatizer] 59 | @lemmatizers = "spacy.Lemmatizer.v1" 60 | 61 | [nlp.writing_system] 62 | direction = "ltr" 63 | has_case = true 64 | has_letters = true 65 | 66 | [components] 67 | 68 | [components.transformer] 69 | factory = "transformer" 70 | max_batch_items = 4096 71 | 72 | [components.tagger] 73 | factory = "tagger" 74 | 75 | [components.parser] 76 | factory = "parser" 77 | learn_tokens = false 78 | min_action_freq = 1 79 | 80 | [components.ner] 81 | factory = "ner" 82 | learn_tokens = false 83 | min_action_freq = 1 84 | 85 | # This loads the Huggingface Transformers model. The transformer is applied 86 | # to a batch of Doc objects, which are preprocessed into Span objects to support 87 | # longer documents. 88 | [components.transformer.model] 89 | @architectures = "spacy-transformers.TransformerModel.v3" 90 | name = "roberta-base" 91 | tokenizer_config = {"use_fast": true} 92 | transformer_config = {"output_attentions": false} 93 | 94 | [components.transformer.model.get_spans] 95 | # You can set a custom strategy for preparing spans from the batch, e.g. you 96 | # can predict over sentences. Here we predict over the whole document. 97 | @span_getters = "strided_spans.v1" 98 | window = 128 99 | stride = 96 100 | 101 | [components.tagger.model] 102 | @architectures = "spacy.Tagger.v1" 103 | 104 | [components.parser.model] 105 | @architectures = "spacy.TransitionBasedParser.v1" 106 | nr_feature_tokens = 8 107 | hidden_width = 64 108 | maxout_pieces = 2 109 | use_upper = false 110 | 111 | [components.ner.model] 112 | @architectures = "spacy.TransitionBasedParser.v1" 113 | nr_feature_tokens = 3 114 | hidden_width = 64 115 | maxout_pieces = 2 116 | use_upper = false 117 | 118 | # These "listener" layers are connected to the transformer pipeline component 119 | # in order to achieve multi-task learning across the pipeline. 120 | # They rely on the transformer to predict over the batch and cache the result 121 | # and callback. The gradient for the transformers will be accumulated by 122 | # the listeners, and then the last listener will call the backprop callback. 
123 | [components.tagger.model.tok2vec] 124 | @architectures = "spacy-transformers.TransformerListener.v1" 125 | grad_factor = 1.0 126 | 127 | [components.parser.model.tok2vec] 128 | @architectures = "spacy-transformers.TransformerListener.v1" 129 | grad_factor = 1.0 130 | 131 | [components.ner.model.tok2vec] 132 | @architectures = "spacy-transformers.TransformerListener.v1" 133 | grad_factor = 1.0 134 | 135 | # These pooling layers control how the token vectors are calculated from 136 | # the word pieces. The reduce_mean layer averages the wordpieces, so if you 137 | # have one token aligned to multiple wordpieces (as is expected), the token's 138 | # vector will be the average of the wordpieces. The most obvious alternative 139 | # is reduce_last.v1, which would just use the last wordpiece. You could also 140 | # try reduce_first, reduce_sum or even reduce_max. 141 | 142 | [components.tagger.model.tok2vec.pooling] 143 | @layers = "reduce_mean.v1" 144 | 145 | [components.parser.model.tok2vec.pooling] 146 | @layers = "reduce_mean.v1" 147 | 148 | [components.ner.model.tok2vec.pooling] 149 | @layers = "reduce_mean.v1" 150 | -------------------------------------------------------------------------------- /examples/configs/ner-albert.cfg: -------------------------------------------------------------------------------- 1 | [training] 2 | patience = 10000 3 | eval_frequency = 200 4 | dropout = 0.1 5 | init_tok2vec = null 6 | vectors = null 7 | max_epochs = 10000 8 | orth_variant_level = 0.3 9 | gold_preproc = true 10 | max_length = 0 11 | scores = ["speed", "ents_p", "ents_r", "ents_f"] 12 | score_weights = {"ents_f": 1.0} 13 | limit = 0 14 | width = 768 15 | accumulate_gradient = 2 16 | seed = 0 17 | use_pytorch_for_gpu_memory = true 18 | 19 | 20 | [training.batch_size] 21 | @schedules = "compounding.v1" 22 | start = 500 23 | stop = 500 24 | compound = 1.001 25 | 26 | [optimizer] 27 | @optimizers = "Adam.v1" 28 | beta1 = 0.9 29 | beta2 = 0.999 30 | L2_is_weight_decay = true 31 | L2 = 0.01 32 | grad_clip = 1.0 33 | use_averages = false 34 | eps = 1e-8 35 | 36 | [optimizer.learn_rate] 37 | @schedules = "warmup_linear.v1" 38 | initial_rate = 5e-5 39 | warmup_steps = 250 40 | total_steps = 5000 41 | 42 | [nlp] 43 | lang = "en" 44 | vectors = ${training:vectors} 45 | 46 | [nlp.pipeline.ner] 47 | factory = "ner" 48 | 49 | [nlp.pipeline.ner.model] 50 | @architectures = "spacy.TransitionBasedParser.v1" 51 | nr_feature_tokens = 3 52 | hidden_width = 128 53 | maxout_pieces = 3 54 | use_upper = false 55 | 56 | [nlp.pipeline.ner.model.tok2vec] 57 | @architectures = "spacy.Tok2VecTransformer.v3" 58 | name = "albert-base-v2" 59 | tokenizer_config = {"use_fast": false} 60 | transformer_config = {"output_attentions": false} 61 | grad_factor = 1.0 62 | 63 | [nlp.pipeline.ner.model.tok2vec.get_spans] 64 | @span_getters = "spacy-transformers.strided_spans.v1" 65 | window = 256 66 | stride = 256 67 | 68 | [nlp.pipeline.ner.model.tok2vec.pooling] 69 | @layers = "reduce_mean.v1" 70 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "cython>=0.25", 5 | "numpy>=2.0.0,<3.0.0" 6 | ] 7 | build-backend = "setuptools.build_meta" 8 | 9 | [tool.cibuildwheel] 10 | build = "*" 11 | skip = "pp* cp36* cp37* cp38*" 12 | test-skip = "" 13 | free-threaded-support = false 14 | 15 | archs = ["native"] 16 | 17 | build-frontend = "default" 18 | 
config-settings = {} 19 | dependency-versions = "pinned" 20 | environment = {} 21 | environment-pass = [] 22 | build-verbosity = 0 23 | 24 | before-all = "" 25 | before-build = "" 26 | repair-wheel-command = "" 27 | 28 | test-command = "" 29 | before-test = "" 30 | test-requires = [] 31 | test-extras = [] 32 | 33 | container-engine = "docker" 34 | 35 | manylinux-x86_64-image = "manylinux2014" 36 | manylinux-i686-image = "manylinux2014" 37 | manylinux-aarch64-image = "manylinux2014" 38 | manylinux-ppc64le-image = "manylinux2014" 39 | manylinux-s390x-image = "manylinux2014" 40 | manylinux-pypy_x86_64-image = "manylinux2014" 41 | manylinux-pypy_i686-image = "manylinux2014" 42 | manylinux-pypy_aarch64-image = "manylinux2014" 43 | 44 | musllinux-x86_64-image = "musllinux_1_2" 45 | musllinux-i686-image = "musllinux_1_2" 46 | musllinux-aarch64-image = "musllinux_1_2" 47 | musllinux-ppc64le-image = "musllinux_1_2" 48 | musllinux-s390x-image = "musllinux_1_2" 49 | 50 | 51 | [tool.cibuildwheel.linux] 52 | repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}" 53 | 54 | [tool.cibuildwheel.macos] 55 | repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}" 56 | 57 | [tool.cibuildwheel.windows] 58 | 59 | [tool.cibuildwheel.pyodide] 60 | 61 | [tool.isort] 62 | profile = "black" 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy>=3.5.0,<4.1.0 2 | numpy>=1.15.0 3 | transformers[sentencepiece]>=3.4.0,<4.42.0 4 | torch>=1.8.0 5 | srsly>=2.4.0,<3.0.0 6 | dataclasses>=0.6,<1.0; python_version < "3.7" 7 | spacy-alignments>=0.7.2,<1.0.0 8 | # Development dependencies 9 | cython>=0.25 10 | pytest>=5.2.0 11 | pytest-cov>=2.7.0,<5.0.0 12 | mypy>=1.0.0,<1.6.0; platform_machine!='aarch64' and python_version >= "3.7" 13 | types-contextvars>=0.1.2; python_version < "3.7" 14 | types-dataclasses>=0.1.3; python_version < "3.7" 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 1.3.9 3 | description = spaCy pipelines for pre-trained BERT and other transformers 4 | url = https://spacy.io 5 | author = Explosion 6 | author_email = contact@explosion.ai 7 | license = MIT 8 | long_description = file: README.md 9 | long_description_content_type = text/markdown 10 | classifiers = 11 | Development Status :: 5 - Production/Stable 12 | Environment :: Console 13 | Intended Audience :: Developers 14 | Intended Audience :: Science/Research 15 | Topic :: Scientific/Engineering 16 | Topic :: Scientific/Engineering :: Artificial Intelligence 17 | License :: OSI Approved :: MIT License 18 | Operating System :: POSIX :: Linux 19 | Operating System :: MacOS :: MacOS X 20 | Operating System :: Microsoft :: Windows 21 | Programming Language :: Python :: 3 22 | Programming Language :: Python :: 3.7 23 | Programming Language :: Python :: 3.8 24 | Programming Language :: Python :: 3.9 25 | Programming Language :: Python :: 3.10 26 | Programming Language :: Python :: 3.11 27 | 28 | [options] 29 | zip_safe = false 30 | include_package_data = true 31 | python_requires = >=3.9,<3.14 32 | install_requires = 33 | spacy>=3.5.0,<4.1.0 34 | numpy>=1.15.0; python_version < "3.9" 35 | numpy>=1.19.0; python_version >= "3.9" 36 | transformers>=3.4.0,<4.50.0 37 | torch>=1.8.0 38 | srsly>=2.4.0,<3.0.0 39 | 
dataclasses>=0.6,<1.0; python_version < "3.7" 40 | spacy-alignments>=0.7.2,<1.0.0 41 | 42 | [options.extras_require] 43 | cuda = 44 | cupy>=5.0.0b4 45 | cuda80 = 46 | cupy-cuda80>=5.0.0b4 47 | cuda90 = 48 | cupy-cuda90>=5.0.0b4 49 | cuda91 = 50 | cupy-cuda91>=5.0.0b4 51 | cuda92 = 52 | cupy-cuda92>=5.0.0b4 53 | cuda100 = 54 | cupy-cuda100>=5.0.0b4 55 | cuda101 = 56 | cupy-cuda101>=5.0.0b4 57 | cuda102 = 58 | cupy-cuda102>=5.0.0b4 59 | cuda110 = 60 | cupy-cuda110>=5.0.0b4 61 | cuda111 = 62 | cupy-cuda111>=5.0.0b4 63 | cuda112 = 64 | cupy-cuda112>=5.0.0b4 65 | 66 | [options.entry_points] 67 | spacy_factories = 68 | transformer = spacy_transformers.pipeline_component:make_transformer 69 | 70 | spacy_architectures = 71 | spacy-transformers.TransformerListener.v1 = spacy_transformers:architectures.transformer_listener_tok2vec_v1 72 | spacy-transformers.Tok2VecTransformer.v1 = spacy_transformers:architectures.transformer_tok2vec_v1 73 | spacy-transformers.Tok2VecTransformer.v2 = spacy_transformers:architectures.transformer_tok2vec_v2 74 | spacy-transformers.Tok2VecTransformer.v3 = spacy_transformers:architectures.transformer_tok2vec_v3 75 | spacy-transformers.TransformerModel.v1 = spacy_transformers:architectures.create_TransformerModel_v1 76 | spacy-transformers.TransformerModel.v2 = spacy_transformers:architectures.create_TransformerModel_v2 77 | spacy-transformers.TransformerModel.v3 = spacy_transformers:architectures.create_TransformerModel_v3 78 | 79 | [bdist_wheel] 80 | universal = true 81 | 82 | [sdist] 83 | formats = gztar 84 | 85 | [flake8] 86 | ignore = E203, E266, E501, E731, W503 87 | max-line-length = 80 88 | select = B,C,E,F,W,T4,B9 89 | exclude = 90 | .env, 91 | .git, 92 | __pycache__, 93 | 94 | [mypy] 95 | ignore_missing_imports = True 96 | no_implicit_optional = True 97 | plugins = pydantic.mypy, thinc.mypy 98 | 99 | [coverage:run] 100 | 101 | [coverage:report] 102 | omit = 103 | **/tests/* 104 | **/_vendorized/* 105 | **/about.py 106 | exclude_lines = 107 | pragma: no cover 108 | # Don't complain about missing debug-only code: 109 | def __unicode__ 110 | def __repr__ 111 | if self\.debug 112 | # Don't complain if tests don't hit defensive assertion code: 113 | raise AssertionError 114 | raise NotImplementedError 115 | # Don't complain if non-runnable code isn't run: 116 | if 0: 117 | if __name__ == .__main__.: 118 | show_missing = True 119 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | from setuptools.command.build_ext import build_ext 3 | from Cython.Build import cythonize 4 | from Cython.Compiler import Options 5 | import numpy 6 | 7 | 8 | # Preserve `__doc__` on functions and classes 9 | # http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options 10 | Options.docstrings = True 11 | 12 | COMPILE_OPTIONS = { 13 | "msvc": ["/Ox", "/EHsc"], 14 | "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"], 15 | "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"], 16 | } 17 | LINK_OPTIONS = {"msvc": ["-std=c++11"], "mingw32": ["-std=c++11"], "other": []} 18 | COMPILER_DIRECTIVES = { 19 | "language_level": -3, 20 | "embedsignature": True, 21 | "annotation_typing": False, 22 | } 23 | 24 | 25 | # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options 26 | # 
http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used 27 | class build_ext_options: 28 | def build_options(self): 29 | for e in self.extensions: 30 | e.extra_compile_args += COMPILE_OPTIONS.get( 31 | self.compiler.compiler_type, COMPILE_OPTIONS["other"] 32 | ) 33 | for e in self.extensions: 34 | e.extra_link_args += LINK_OPTIONS.get( 35 | self.compiler.compiler_type, LINK_OPTIONS["other"] 36 | ) 37 | 38 | 39 | class build_ext_subclass(build_ext, build_ext_options): 40 | def build_extensions(self): 41 | build_ext_options.build_options(self) 42 | build_ext.build_extensions(self) 43 | 44 | 45 | def setup_package(): 46 | ext_modules = [ 47 | Extension( 48 | "spacy_transformers.align", 49 | ["spacy_transformers/align.pyx"], 50 | language="c++", 51 | include_dirs=[numpy.get_include()], 52 | extra_compile_args=["-std=c++11"], 53 | ), 54 | ] 55 | 56 | ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES) 57 | 58 | setup( 59 | name="spacy-transformers", 60 | packages=find_packages(), 61 | ext_modules=ext_modules, 62 | cmdclass={"build_ext": build_ext_subclass}, 63 | package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, 64 | ) 65 | 66 | 67 | if __name__ == "__main__": 68 | setup_package() 69 | -------------------------------------------------------------------------------- /spacy_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from . import architectures 2 | from . import annotation_setters 3 | from . import span_getters 4 | from .layers import TransformerModel 5 | from .pipeline_component import Transformer, install_extensions 6 | from .data_classes import TransformerData, FullTransformerBatch 7 | from .util import registry 8 | 9 | 10 | __all__ = [ 11 | "install_extensions", 12 | "Transformer", 13 | "TransformerModel", 14 | "TransformerData", 15 | "FullTransformerBatch", 16 | "architectures", 17 | "annotation_setters", 18 | "span_getters", 19 | "registry", 20 | ] 21 | -------------------------------------------------------------------------------- /spacy_transformers/align.pyi: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Callable, Optional 2 | from spacy.tokens import Span, Token 3 | from thinc.api import Ops 4 | from thinc.types import Ragged, Floats2d, Ints2d 5 | 6 | def apply_alignment( 7 | ops: Ops, align: Ragged, X: Floats2d 8 | ) -> Tuple[Ragged, Callable]: ... 9 | def get_token_positions(spans: List[Span]) -> Dict[Token, int]: ... 10 | def get_alignment_via_offset_mapping( 11 | spans: List[Span], 12 | offset_mapping: Ints2d, 13 | ) -> Ragged: ... 14 | def get_alignment( 15 | spans: List[Span], 16 | wordpieces: List[List[str]], 17 | special_tokens: Optional[List[str]] = None, 18 | ) -> Ragged: ... 19 | def get_span2wp_from_offset_mapping( 20 | span: Span, 21 | wp_char_offsets: Tuple[int], 22 | ) -> List[List[int]]: ... 
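As an illustration of how the two main helpers declared above combine (this is a sketch with toy, hand-written wordpieces rather than output from a real transformers tokenizer, not part of the stub file itself):

import numpy
import spacy
from thinc.api import NumpyOps
from spacy_transformers.align import apply_alignment, get_alignment

nlp = spacy.blank("en")
doc = nlp("I like walking")
spans = [doc[:]]
wordpieces = [["I", "like", "walk", "ing"]]  # toy wordpieces for illustration

# One alignment entry per token: "walking" maps to wordpieces 2 and 3.
align = get_alignment(spans, wordpieces)
X = numpy.zeros((4, 8), dtype="float32")  # one feature row per wordpiece
Y, backprop = apply_alignment(NumpyOps(), align, X)
print(list(align.lengths))  # [1, 1, 2]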
23 | -------------------------------------------------------------------------------- /spacy_transformers/align.pyx: -------------------------------------------------------------------------------- 1 | # cython: infer_types=True, boundscheck=False 2 | from typing import cast, Dict, List, Tuple, Callable, Set, Optional 3 | import numpy 4 | from spacy_alignments.tokenizations import get_alignments 5 | from spacy.tokens import Span, Token 6 | from thinc.api import Ops 7 | from thinc.types import Ragged, Floats2d, Ints1d, Ints2d 8 | 9 | from cython.operator cimport dereference as deref 10 | from cython.operator cimport preincrement as preinc 11 | from libc.stdint cimport uint32_t, int32_t, int64_t 12 | from libc.stdlib cimport free 13 | from libcpp.unordered_set cimport unordered_set 14 | from libcpp.vector cimport vector 15 | 16 | ctypedef unordered_set[uint32_t]* unordered_set_uint32_t_ptr 17 | 18 | 19 | def apply_alignment(ops: Ops, align: Ragged, X: Floats2d) -> Tuple[Ragged, Callable]: 20 | """Align wordpiece data (X) to match tokens, and provide a callback to 21 | reverse it. 22 | 23 | This function returns a Ragged array, which represents the fact that one 24 | token may be aligned against multiple wordpieces. It's a nested list, 25 | concatenated with a lengths array to indicate the nested structure. 26 | 27 | The alignment is also a Ragged array, where the lengths indicate how many 28 | wordpieces each token is aligned against. The output ragged therefore has 29 | the same lengths as the alignment ragged, which means the output data 30 | also has the same number of data rows as the alignment. The size of the 31 | lengths array indicates the number of tokens in the batch. 32 | 33 | The actual alignment is a simple indexing operation: 34 | 35 | for i, index in enumerate(align.data): 36 | Y[i] = X[index] 37 | 38 | Which is vectorized via numpy advanced indexing: 39 | 40 | Y = X[align.data] 41 | 42 | The inverse operation, for the backward pass, uses the 'scatter_add' op 43 | because one wordpiece may be aligned against multiple tokens. So we need: 44 | 45 | for i, index in enumerate(align.data): 46 | X[index] += Y[i] 47 | 48 | The addition wouldn't occur if we simply did `X[index] = Y`, so we use 49 | the scatter_add op. 
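For example, with align.data = [0, 1, 1, 2] and align.lengths = [2, 2] (two tokens, each aligned to two wordpieces, with wordpiece 1 shared between them), the forward pass produces Y.data = [X[0], X[1], X[1], X[2]], and in the backward pass dX[1] accumulates both of the corresponding rows of dY through scatter_add.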
50 | """ 51 | if not align.lengths.sum(): 52 | return _apply_empty_alignment(ops, align, X) 53 | shape = X.shape 54 | indices = cast(Ints1d, align.dataXd) 55 | Y = Ragged(X[indices], cast(Ints1d, ops.asarray(align.lengths))) 56 | 57 | def backprop_apply_alignment(dY: Ragged) -> Floats2d: 58 | assert dY.data.shape[0] == indices.shape[0] 59 | dX = ops.alloc2f(*shape) 60 | ops.scatter_add(dX, indices, cast(Floats2d, dY.dataXd)) 61 | return dX 62 | 63 | return Y, backprop_apply_alignment 64 | 65 | 66 | def _apply_empty_alignment(ops, align, X): 67 | shape = X.shape 68 | Y = Ragged( 69 | ops.alloc2f(align.lengths.shape[0], X.shape[1]), 70 | ops.alloc1i(align.lengths.shape[0]) + 1, 71 | ) 72 | 73 | def backprop_null_alignment(dY: Ragged) -> Floats2d: 74 | return ops.alloc2f(*shape) 75 | 76 | return Y, backprop_null_alignment 77 | 78 | 79 | def get_token_positions(spans: List[Span]) -> Dict[Token, int]: 80 | token_positions: Dict[Token, int] = {} 81 | seen_docs = set() 82 | for span in spans: 83 | if span.doc in seen_docs: 84 | continue 85 | seen_docs.add(span.doc) 86 | for token in span.doc: 87 | if token not in token_positions: 88 | token_positions[token] = len(token_positions) 89 | return token_positions 90 | 91 | 92 | def get_alignment_via_offset_mapping( 93 | spans: List[Span], 94 | offset_mapping: Ints2d, 95 | ) -> Ragged: 96 | if len(spans) != len(offset_mapping): 97 | raise ValueError("Cannot align batches of different sizes.") 98 | # Tokens can occur more than once, and we need the alignment of each token 99 | # to its place in the concatenated wordpieces array. 100 | token_positions = get_token_positions(spans) 101 | alignment: List[Set[int]] = [set() for _ in range(len(token_positions))] 102 | wp_start = 0 103 | for i, span in enumerate(spans): 104 | span_offset_mapping = offset_mapping[i] 105 | span2wp = get_span2wp_from_offset_mapping(span, span_offset_mapping) 106 | for token, wp_js in zip(span, span2wp): 107 | position = token_positions[token] 108 | alignment[position].update(wp_start + j for j in wp_js) 109 | wp_start += span_offset_mapping.shape[0] 110 | lengths: List[int] = [] 111 | flat: List[int] = [] 112 | for a in alignment: 113 | lengths.append(len(a)) 114 | flat.extend(sorted(a)) 115 | align = Ragged( 116 | cast(Ints1d, numpy.array(flat, dtype="i")), 117 | cast(Ints1d, numpy.array(lengths, dtype="i")), 118 | ) 119 | return align 120 | 121 | 122 | def get_alignment( 123 | spans: List[Span], 124 | wordpieces: List[List[str]], 125 | special_tokens: Optional[List[str]] = None, 126 | ) -> Ragged: 127 | """Compute a ragged alignment array that records, for each unique token in 128 | `spans`, the corresponding indices in the flattened `wordpieces` array. 129 | For instance, imagine you have two overlapping spans: 130 | 131 | [[I, like, walking], [walking, outdoors]] 132 | 133 | And their wordpieces are: 134 | 135 | [[I, like, walk, ing], [walk, ing, out, doors]] 136 | 137 | We want to align "walking" against [walk, ing, walk, ing], which have 138 | indices [2, 3, 4, 5] once the nested wordpieces list is flattened. 
139 | 140 | The nested alignment list would be: 141 | 142 | [[0], [1], [2, 3, 4, 5], [6, 7]] 143 | I like walking outdoors 144 | 145 | Which gets flattened into the ragged array: 146 | 147 | [0, 1, 2, 3, 4, 5, 6, 7] 148 | [1, 1, 4, 2] 149 | 150 | The ragged format allows the aligned data to be computed via: 151 | 152 | tokens = Ragged(wp_tensor[align.data], align.lengths) 153 | 154 | This produces a ragged format, indicating which tokens need to be collapsed 155 | to make the aligned array. The reduction is deferred for a later step, so 156 | the user can configure it. The indexing is especially efficient in trivial 157 | cases like this where the indexing array is completely continuous. 158 | """ 159 | if len(spans) != len(wordpieces): 160 | raise ValueError("Cannot align batches of different sizes.") 161 | if special_tokens is None: 162 | special_tokens = [] 163 | # Tokens can occur more than once, and we need the alignment of each token 164 | # to its place in the concatenated wordpieces array. 165 | token_positions = get_token_positions(spans) 166 | alignment: List[Set[int]] = [set() for _ in range(len(token_positions))] 167 | wp_start = 0 168 | for i, (span, wp_toks) in enumerate(zip(spans, wordpieces)): 169 | sp_toks = [token.text for token in span] 170 | wp_toks_filtered = wp_toks 171 | # In the case that the special tokens do not appear in the text, filter 172 | # them out for alignment purposes so that special tokens like "" are 173 | # not aligned to the character "s" in the text. (If the special tokens 174 | # appear in the text, it's not possible to distinguish them from the 175 | # added special tokens, so they may be aligned incorrectly.) 176 | if not any([special in span.text for special in special_tokens]): 177 | wp_toks_filtered = [ 178 | tok if tok not in special_tokens else "" for tok in wp_toks 179 | ] 180 | span2wp, wp2span = get_alignments(sp_toks, wp_toks_filtered) 181 | for token, wp_js in zip(span, span2wp): 182 | position = token_positions[token] 183 | alignment[position].update(wp_start + j for j in wp_js) 184 | wp_start += len(wp_toks) 185 | lengths: List[int] = [] 186 | flat: List[int] = [] 187 | for a in alignment: 188 | lengths.append(len(a)) 189 | flat.extend(sorted(a)) 190 | align = Ragged( 191 | cast(Ints1d, numpy.array(flat, dtype="i")), 192 | cast(Ints1d, numpy.array(lengths, dtype="i")), 193 | ) 194 | return align 195 | 196 | 197 | def get_span2wp_from_offset_mapping(span, wp_char_offsets): 198 | # create a mapping of char indices to spacy token indices 199 | cdef int span_idx = span[0].idx 200 | cdef int span_i = span[0].i 201 | cdef int char_idx, rel_token_i 202 | # size is +1 so we don't have to check whether the text has a trailing space 203 | char_to_sp_token = numpy.full((len(span.text) + 1,), -1, dtype="int32") 204 | for token in span: 205 | rel_token_i = token.i - span_i 206 | for char_idx in range( 207 | token.idx - span_idx, 208 | token.idx - span_idx + len(token) + 1, 209 | ): 210 | char_to_sp_token[char_idx] = rel_token_i 211 | 212 | # align all wordpiece tokens to one or more spacy token indices 213 | cdef vector[unordered_set_uint32_t_ptr] alignment 214 | for _ in range(len(span)): 215 | alignment.push_back(new unordered_set[uint32_t]()) 216 | _get_span2wp_alignment( 217 | &alignment, 218 | numpy.ascontiguousarray(char_to_sp_token), 219 | char_to_sp_token.size, 220 | numpy.ascontiguousarray(wp_char_offsets, dtype="int64"), 221 | wp_char_offsets.shape[0], 222 | ) 223 | 224 | # convert the alignment into a list of aligned wordpiece indices 
per spacy 225 | # token index (unsorted at this point) 226 | cdef unordered_set_uint32_t_ptr s 227 | cdef vector[unordered_set_uint32_t_ptr].iterator it_v = alignment.begin() 228 | cdef unordered_set[uint32_t].iterator it_s 229 | result: List[List[int]] = [] 230 | while it_v != alignment.end(): 231 | result.append([]) 232 | s = deref(it_v) 233 | it_s = s.begin() 234 | while it_s != s.end(): 235 | result[-1].append(deref(it_s)) 236 | preinc(it_s) 237 | del s 238 | preinc(it_v) 239 | return result 240 | 241 | 242 | cdef int _get_span2wp_alignment( 243 | vector[unordered_set_uint32_t_ptr]* alignment, 244 | int32_t[::1] char_to_sp_token, 245 | int char_to_sp_token_length, 246 | int64_t[:, ::1] wp_char_offsets, 247 | int wp_char_offsets_length, 248 | ) nogil: 249 | cdef int char_idx, start_idx, end_idx, token_i 250 | cdef int wp_j = 0 251 | cdef int alignment_size = alignment.size() 252 | while wp_j < wp_char_offsets_length: 253 | start_idx = wp_char_offsets[wp_j][0] 254 | end_idx = wp_char_offsets[wp_j][1] 255 | char_idx = start_idx 256 | while char_idx < end_idx: 257 | if 0 <= char_idx < char_to_sp_token_length: 258 | token_i = char_to_sp_token[char_idx] 259 | else: 260 | token_i = -1 261 | if 0 <= token_i < alignment_size: 262 | deref(alignment.at(token_i)).insert(wp_j) 263 | char_idx += 1 264 | wp_j += 1 265 | return 0 266 | -------------------------------------------------------------------------------- /spacy_transformers/annotation_setters.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List 2 | from spacy.tokens import Doc 3 | 4 | from .util import registry 5 | from .data_classes import FullTransformerBatch 6 | 7 | 8 | def null_annotation_setter(docs: List[Doc], trf_data: FullTransformerBatch) -> None: 9 | """Set no additional annotations on the Doc objects.""" 10 | pass 11 | 12 | 13 | @registry.annotation_setters("spacy-transformers.null_annotation_setter.v1") # type: ignore 14 | def configure_null_annotation_setter() -> Callable[ 15 | [List[Doc], FullTransformerBatch], None 16 | ]: 17 | return null_annotation_setter 18 | 19 | 20 | __all__ = ["null_annotation_setter", "configure_null_annotation_setter"] 21 | -------------------------------------------------------------------------------- /spacy_transformers/architectures.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable 2 | from thinc.api import Model, chain 3 | from thinc.types import Ragged, Floats2d 4 | from spacy.tokens import Doc 5 | 6 | from .layers import TransformerModel, TransformerListener 7 | from .layers import trfs2arrays, split_trf_batch 8 | from .util import registry 9 | from .data_classes import FullTransformerBatch 10 | 11 | 12 | @registry.architectures.register("spacy-transformers.TransformerListener.v1") 13 | def transformer_listener_tok2vec_v1( 14 | pooling: Model[Ragged, Floats2d], grad_factor: float = 1.0, upstream: str = "*" 15 | ) -> Model[List[Doc], List[Floats2d]]: 16 | """Create a 'TransformerListener' layer, which will connect to a Transformer 17 | component earlier in the pipeline. 18 | 19 | The layer takes a list of Doc objects as input, and produces a list of 20 | 2d arrays as output, with each array having one row per token. Most spaCy 21 | models expect a sublayer with this signature, making it easy to connect them 22 | to a transformer model via this sublayer. 
23 | Transformer models usually operate over wordpieces, which usually don't align 24 | one-to-one against spaCy tokens. The layer therefore requires a reduction 25 | operation in order to calculate a single token vector given zero or more 26 | wordpiece vectors. 27 | 28 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate 29 | the token vectors based on zero or more wordpiece vectors. If in doubt, 30 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice. 31 | grad_factor (float): Reweight gradients from the component before passing 32 | them upstream. You can set this to 0 to "freeze" the transformer weights 33 | with respect to the component, or use it to make some components more 34 | significant than others. Leaving it at 1.0 is usually fine. 35 | upstream (str): A string to identify the 'upstream' Transformer 36 | to communicate with. The upstream name should either be the wildcard 37 | string '*', or the name of the `Transformer` component. You'll almost 38 | never have multiple upstream Transformer components, so the wildcard 39 | string will almost always be fine. 40 | """ 41 | listener = TransformerListener(upstream_name=upstream) 42 | model: Model = chain(listener, trfs2arrays(pooling, grad_factor)) 43 | model.set_ref("listener", listener) 44 | return model 45 | 46 | 47 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v1") 48 | def transformer_tok2vec_v1( 49 | name: str, 50 | get_spans, 51 | tokenizer_config: dict, 52 | pooling: Model[Ragged, Floats2d], 53 | grad_factor: float = 1.0, 54 | ) -> Model[List[Doc], List[Floats2d]]: 55 | """Use a transformer as a "Tok2Vec" layer directly. This does not allow 56 | multiple components to share the transformer weights, and does not allow 57 | the transformer to set annotations into the `Doc` object, but it's a 58 | simpler solution if you only need the transformer within one component. 59 | 60 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract 61 | spans from the batch of Doc objects. See the "TransformerModel" layer 62 | for details. 63 | tokenizer_config (dict): Settings to pass to the transformers tokenizer. 64 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate 65 | the token vectors based on zero or more wordpiece vectors. If in doubt, 66 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice. 67 | grad_factor (float): Reweight gradients from the component before passing 68 | them to the transformer. You can set this to 0 to "freeze" the transformer 69 | weights with respect to the component, or to make it learn more slowly. 70 | Leaving it at 1.0 is usually fine. 71 | """ 72 | return chain( 73 | TransformerModel(name, get_spans, tokenizer_config), 74 | split_trf_batch(), 75 | trfs2arrays(pooling, grad_factor), 76 | ) 77 | 78 | 79 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v2") 80 | def transformer_tok2vec_v2( 81 | name: str, 82 | get_spans, 83 | tokenizer_config: dict, 84 | pooling: Model[Ragged, Floats2d], 85 | grad_factor: float = 1.0, 86 | transformer_config: dict = {}, 87 | ) -> Model[List[Doc], List[Floats2d]]: 88 | """Use a transformer as a "Tok2Vec" layer directly. This does not allow 89 | multiple components to share the transformer weights, and does not allow 90 | the transformer to set annotations into the `Doc` object, but it's a 91 | simpler solution if you only need the transformer within one component. 
92 | 93 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract 94 | spans from the batch of Doc objects. See the "TransformerModel" layer 95 | for details. 96 | tokenizer_config (dict): Settings to pass to the transformers tokenizer. 97 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate 98 | the token vectors based on zero or more wordpiece vectors. If in doubt, 99 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice. 100 | grad_factor (float): Reweight gradients from the component before passing 101 | them to the transformer. You can set this to 0 to "freeze" the transformer 102 | weights with respect to the component, or to make it learn more slowly. 103 | Leaving it at 1.0 is usually fine. 104 | transformer_config (dict): Settings to pass to the forward pass 105 | of the transformer. 106 | """ 107 | return chain( 108 | TransformerModel(name, get_spans, tokenizer_config, transformer_config), 109 | split_trf_batch(), 110 | trfs2arrays(pooling, grad_factor), 111 | ) 112 | 113 | 114 | # Note: when updating, also make sure to update 'replace_listener_cfg' in _util.py 115 | @registry.architectures.register("spacy-transformers.Tok2VecTransformer.v3") 116 | def transformer_tok2vec_v3( 117 | name: str, 118 | get_spans, 119 | tokenizer_config: dict, 120 | pooling: Model[Ragged, Floats2d], 121 | grad_factor: float = 1.0, 122 | transformer_config: dict = {}, 123 | mixed_precision: bool = False, 124 | grad_scaler_config: dict = {}, 125 | ) -> Model[List[Doc], List[Floats2d]]: 126 | """Use a transformer as a "Tok2Vec" layer directly. This does not allow 127 | multiple components to share the transformer weights, and does not allow 128 | the transformer to set annotations into the `Doc` object, but it's a 129 | simpler solution if you only need the transformer within one component. 130 | 131 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract 132 | spans from the batch of Doc objects. See the "TransformerModel" layer 133 | for details. 134 | tokenizer_config (dict): Settings to pass to the transformers tokenizer. 135 | pooling (Model[Ragged, Floats2d]): A reduction layer used to calculate 136 | the token vectors based on zero or more wordpiece vectors. If in doubt, 137 | mean pooling (see `thinc.layers.reduce_mean`) is usually a good choice. 138 | grad_factor (float): Reweight gradients from the component before passing 139 | them to the transformer. You can set this to 0 to "freeze" the transformer 140 | weights with respect to the component, or to make it learn more slowly. 141 | Leaving it at 1.0 is usually fine. 142 | transformer_config (dict): Settings to pass to the forward pass 143 | of the transformer. 144 | mixed_precision (bool): Enable mixed-precision. Mixed-precision replaces 145 | whitelisted ops with half-precision counterparts. This speeds up training 146 | and prediction on modern GPUs and reduces GPU memory use. 147 | grad_scaler_config (dict): Configuration for gradient scaling in mixed-precision 148 | training. Gradient scaling is enabled automatically when mixed-precision 149 | training is used. 150 | 151 | Setting `enabled` to `False` in the gradient scaling configuration disables 152 | gradient scaling. The `init_scale` (default: `2 ** 16`) determines the 153 | initial scale. `backoff_factor` (default: `0.5`) specifies the factor 154 | by which the scale should be reduced when gradients overflow.
155 | `growth_interval` (default: `2000`) configures the number of steps 156 | without gradient overflows after which the scale should be increased. 157 | Finally, `growth_factor` (default: `2.0`) determines the factor by which 158 | the scale should be increased when no overflows were found for 159 | `growth_interval` steps. 160 | """ 161 | # Note that this is a chain of chain on purpose, to match the structure of 162 | # TransformerListener.v1 after it is run through replace_listener (cf PR #310) 163 | return chain( # type: ignore 164 | chain( 165 | TransformerModel( 166 | name, 167 | get_spans, 168 | tokenizer_config, 169 | transformer_config, 170 | mixed_precision, 171 | grad_scaler_config, 172 | ), 173 | split_trf_batch(), 174 | ), 175 | trfs2arrays(pooling, grad_factor), 176 | ) 177 | 178 | 179 | @registry.architectures.register("spacy-transformers.TransformerModel.v1") 180 | def create_TransformerModel_v1( 181 | name: str, 182 | get_spans: Callable, 183 | tokenizer_config: dict = {}, 184 | ) -> Model[List[Doc], FullTransformerBatch]: 185 | model = TransformerModel(name, get_spans, tokenizer_config) 186 | return model 187 | 188 | 189 | @registry.architectures.register("spacy-transformers.TransformerModel.v2") 190 | def create_TransformerModel_v2( 191 | name: str, 192 | get_spans: Callable, 193 | tokenizer_config: dict = {}, 194 | transformer_config: dict = {}, 195 | ) -> Model[List[Doc], FullTransformerBatch]: 196 | model = TransformerModel(name, get_spans, tokenizer_config, transformer_config) 197 | return model 198 | 199 | 200 | @registry.architectures.register("spacy-transformers.TransformerModel.v3") 201 | def create_TransformerModel_v3( 202 | name: str, 203 | get_spans: Callable, 204 | tokenizer_config: dict = {}, 205 | transformer_config: dict = {}, 206 | mixed_precision: bool = False, 207 | grad_scaler_config: dict = {}, 208 | ) -> Model[List[Doc], FullTransformerBatch]: 209 | """Pretrained transformer model that can be finetuned for downstream tasks. 210 | 211 | name (str): Name of the pretrained Huggingface model to use. 212 | get_spans (Callable[[List[Doc]], List[List[Span]]]): A function to extract 213 | spans from the batch of Doc objects. See the "TransformerModel" layer 214 | for details. 215 | tokenizer_config (dict): Settings to pass to the transformers tokenizer. 216 | transformer_config (dict): Settings to pass to the forward pass 217 | of the transformer. 218 | mixed_precision (bool): Enable mixed-precision. Mixed-precision replaces 219 | whitelisted ops with half-precision counterparts. This speeds up training 220 | and prediction on modern GPUs and reduces GPU memory use. 221 | grad_scaler_config (dict): Configuration for gradient scaling in mixed-precision 222 | training. Gradient scaling is enabled automatically when mixed-precision 223 | training is used. 224 | 225 | Setting `enabled` to `False` in the gradient scaling configuration disables 226 | gradient scaling. The `init_scale` (default: `2 ** 16`) determines the 227 | initial scale. `backoff_factor` (default: `0.5`) specifies the factor 228 | by which the scale should be reduced when gradients overflow. 229 | `growth_interval` (default: `2000`) configures the number of steps 230 | without gradient overflows after which the scale should be increased. 231 | Finally, `growth_factor` (default: `2.0`) determines the factor by which 232 | the scale should be increased when no overflows were found for 233 | `growth_interval` steps.
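
    As a rough sketch only (illustrative values; the section path assumes the
    component is named "transformer" in a full training config, mirroring the
    component defaults shown in pipeline_component.py):

        [components.transformer.model]
        @architectures = "spacy-transformers.TransformerModel.v3"
        name = "roberta-base"
        tokenizer_config = {"use_fast": true}
        transformer_config = {}
        mixed_precision = true
        grad_scaler_config = {"enabled": true, "init_scale": 32768}

        [components.transformer.model.get_spans]
        @span_getters = "spacy-transformers.strided_spans.v1"
        window = 128
        stride = 96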
234 | """ 235 | model = TransformerModel( 236 | name, 237 | get_spans, 238 | tokenizer_config, 239 | transformer_config, 240 | mixed_precision, 241 | grad_scaler_config, 242 | ) 243 | return model 244 | -------------------------------------------------------------------------------- /spacy_transformers/data_classes.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Dict, Any, Union, Tuple, cast 2 | from dataclasses import dataclass, field 3 | import torch 4 | import numpy 5 | from transformers.tokenization_utils import BatchEncoding 6 | from transformers.file_utils import ModelOutput 7 | from transformers.modeling_outputs import BaseModelOutput 8 | from thinc.types import Ragged, Floats2d, Floats3d, FloatsXd, Ints1d, Ints2d 9 | from thinc.api import NumpyOps, get_array_module, xp2torch, torch2xp 10 | from spacy.tokens import Span 11 | import srsly 12 | 13 | from .util import transpose_list 14 | from .align import get_token_positions 15 | 16 | 17 | @dataclass 18 | class WordpieceBatch: 19 | """Holds data from the transformers BatchEncoding class. 20 | 21 | We would have preferred to use the BatchEncoding class directly, but 22 | there's a few problems with that. 23 | 24 | 1. Some BatchEncoding functionality requires the tokenizers.Encoding object, 25 | and it's impossible for us to create or manipulate that object. This means 26 | we can't really create BatchEncoding objects, which limits what we can do. 27 | 2. We want some semantic differences, for instance the "lengths" data in the 28 | BatchEncoding is useless when the inputs are padded. We want it to tell 29 | us the *unpadded* lengths. 30 | 3. We want typed attributes, so that we can type-check properly. 31 | 4. We prefer to have numpy/cupy arrays rather than torch arrays. 32 | 5. The API around the BatchEncoding object has been changing a lot, so we 33 | want to minimize the places where we touch it. 34 | """ 35 | 36 | strings: List[List[str]] 37 | input_ids: Ints2d 38 | attention_mask: Floats2d 39 | lengths: List[int] 40 | token_type_ids: Optional[Ints2d] 41 | 42 | def __len__(self) -> int: 43 | return len(self.strings) 44 | 45 | def __getitem__(self, index) -> "WordpieceBatch": 46 | if isinstance(index, int): 47 | slice_ = slice(index, index + 1) 48 | else: 49 | slice_ = index 50 | return WordpieceBatch( 51 | strings=self.strings[slice_], 52 | input_ids=self.input_ids[slice_], 53 | attention_mask=self.attention_mask[slice_], 54 | lengths=self.lengths[slice_], 55 | token_type_ids=( 56 | self.token_type_ids[slice_] if self.token_type_ids is not None else None 57 | ), 58 | ) 59 | 60 | def to_hf_dict(self) -> Dict: 61 | """Return a dict similar to the format produced by the Huggingface 62 | tokenizer, converting arrays to pytorch tensors as well. 
63 | """ 64 | output = { 65 | "input_ids": xp2torch(self.input_ids), 66 | "attention_mask": xp2torch(self.attention_mask), 67 | "input_texts": self.strings, 68 | } 69 | if self.token_type_ids is not None: 70 | output["token_type_ids"] = xp2torch(self.token_type_ids) 71 | return output 72 | 73 | @classmethod 74 | def empty(cls, *, xp=numpy) -> "WordpieceBatch": 75 | return cls( 76 | strings=[], 77 | input_ids=xp.zeros((0, 0), dtype="i"), 78 | attention_mask=xp.ones((0, 0), dtype="bool"), 79 | lengths=[], 80 | token_type_ids=None, 81 | ) 82 | 83 | @classmethod 84 | def zeros(cls, lengths: List[int], xp=numpy) -> "WordpieceBatch": 85 | return cls( 86 | strings=[[""] * length for length in lengths], 87 | input_ids=xp.array([[0] * length for length in lengths], dtype="i"), 88 | attention_mask=xp.ones((len(lengths), max(lengths)), dtype="bool"), 89 | lengths=lengths, 90 | token_type_ids=None, 91 | ) 92 | 93 | @classmethod 94 | def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch": 95 | assert isinstance(token_data, BatchEncoding) or isinstance(token_data, dict) 96 | pad_token = token_data.get("pad_token", "[PAD]") 97 | lengths = [ 98 | len([tok for tok in tokens if tok != pad_token]) 99 | for tokens in token_data["input_texts"] 100 | ] 101 | 102 | # The following tensors are intentionally allocated on the CPU to reduce 103 | # host-to-device copies. 104 | numpy_ops = NumpyOps() 105 | input_ids = token_data["input_ids"] 106 | token_type_ids = token_data.get("token_type_ids") 107 | 108 | return cls( 109 | strings=token_data["input_texts"], 110 | input_ids=numpy_ops.asarray(input_ids, dtype=input_ids.dtype), 111 | attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]), 112 | lengths=lengths, 113 | token_type_ids=( 114 | numpy_ops.asarray(token_type_ids, dtype=token_type_ids.dtype) 115 | if token_type_ids is not None 116 | else None 117 | ), 118 | ) 119 | 120 | def to_dict(self) -> Dict[str, Any]: 121 | return { 122 | "strings": self.strings, 123 | "input_ids": self.input_ids, 124 | "attention_mask": self.attention_mask, 125 | "lengths": self.lengths, 126 | "token_type_ids": self.token_type_ids, 127 | } 128 | 129 | def from_dict(self, msg: Dict[str, Any]) -> "WordpieceBatch": 130 | self.strings = msg["strings"] 131 | self.input_ids = msg["input_ids"] 132 | self.attention_mask = msg["attention_mask"] 133 | self.lengths = msg["lengths"] 134 | self.token_type_ids = msg["token_type_ids"] 135 | return self 136 | 137 | 138 | @dataclass 139 | class TransformerData: 140 | """Transformer tokens and outputs for one Doc object. 141 | 142 | The transformer models return tensors that refer to a whole padded batch 143 | of documents. These tensors are wrapped into the FullTransformerBatch object. 144 | The FullTransformerBatch then splits out the per-document data, which is 145 | handled by this class. Instances of this class are typically assigned to 146 | the doc._.trf_data extension attribute. 147 | 148 | Attributes 149 | ---------- 150 | wordpieces (WordpieceBatch): A slice of the wordpiece token data produced 151 | by the Huggingface tokenizer. 152 | model_output (ModelOutput): The model output from the transformer model, 153 | determined by the model and transformer config. 154 | align (Ragged): Alignment from the Doc's tokenization to the wordpieces. 155 | This is a ragged array, where align.lengths[i] indicates the number of 156 | wordpiece tokens that token i aligns against. The actual indices are 157 | provided at align[i].dataXd. 
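
    A minimal inspection sketch for a processed Doc (assumes a pipeline with a
    "transformer" component has already run and that the model returned a
    `last_hidden_state`; shapes are indicative, not guaranteed):

        trf_data = doc._.trf_data
        # activations for this Doc's spans, roughly
        # (n_spans, n_wordpieces, hidden_width)
        print(trf_data.model_output.last_hidden_state.shape)
        print(trf_data.width)
        # wordpiece indices aligned to the first spaCy token
        print(trf_data.align[0].dataXd)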
158 | """ 159 | 160 | wordpieces: WordpieceBatch 161 | model_output: ModelOutput 162 | align: Ragged 163 | 164 | @classmethod 165 | def empty(cls) -> "TransformerData": 166 | align = Ragged( 167 | cast(Ints1d, numpy.zeros((0,), dtype="i")), 168 | cast(Ints1d, numpy.zeros((0,), dtype="i")), 169 | ) 170 | return cls( 171 | wordpieces=WordpieceBatch.empty(), model_output=ModelOutput(), align=align 172 | ) 173 | 174 | @classmethod 175 | def zeros(cls, length: int, width: int, *, xp=numpy) -> "TransformerData": 176 | """Create a valid TransformerData container for a given shape, filled 177 | with zeros.""" 178 | return cls( 179 | wordpieces=WordpieceBatch.zeros([length], xp=xp), 180 | model_output=ModelOutput( 181 | last_hidden_state=xp.zeros((1, length, width), dtype="f") 182 | ), 183 | align=Ragged( 184 | cast(Ints1d, numpy.arange(length)), 185 | cast(Ints1d, numpy.ones((length,), dtype="i")), 186 | ), 187 | ) 188 | 189 | @property 190 | def tensors(self) -> Tuple[Union[FloatsXd, List[FloatsXd]]]: 191 | return self.model_output.to_tuple() 192 | 193 | @property 194 | def tokens(self) -> Dict[str, Any]: 195 | """Deprecated. A dict with the wordpiece token data.""" 196 | return self.wordpieces.to_hf_dict() 197 | 198 | @property 199 | def width(self) -> int: 200 | if "last_hidden_state" in self.model_output: 201 | return cast(BaseModelOutput, self.model_output).last_hidden_state.shape[-1] 202 | else: 203 | raise ValueError("Cannot find last hidden state") 204 | 205 | def to_dict(self) -> Dict[str, Any]: 206 | return { 207 | "wordpieces": self.wordpieces.to_dict(), 208 | "model_output": self.model_output, 209 | "align": [self.align.dataXd, self.align.lengths], 210 | } 211 | 212 | def from_dict(self, msg: Dict[str, Any]) -> "TransformerData": 213 | self.wordpieces = WordpieceBatch.empty().from_dict(msg["wordpieces"]) 214 | self.model_output = ModelOutput(msg["model_output"]) 215 | self.align = Ragged(*msg["align"]) 216 | return self 217 | 218 | def to_bytes(self) -> bytes: 219 | return srsly.msgpack_dumps(self.to_dict()) 220 | 221 | def from_bytes(self, byte_string: bytes) -> "TransformerData": 222 | msg = srsly.msgpack_loads(byte_string) 223 | self.from_dict(msg) 224 | return self 225 | 226 | 227 | @srsly.msgpack_encoders("transformerdata") 228 | def serialize_transformer_data(obj, chain=None): 229 | if isinstance(obj, TransformerData): 230 | return {"__transformerdata__": obj.to_dict()} 231 | return obj if chain is None else chain(obj) 232 | 233 | 234 | @srsly.msgpack_decoders("transformerdata") 235 | def deserialize_transformer_data(obj, chain=None): 236 | if "__transformerdata__" in obj: 237 | return TransformerData.empty().from_dict(obj["__transformerdata__"]) 238 | return obj if chain is None else chain(obj) 239 | 240 | 241 | @dataclass 242 | class FullTransformerBatch: 243 | """Holds a batch of input and output objects for a transformer model. The 244 | data can then be split to a list of `TransformerData` objects to associate 245 | the outputs to each `Doc` in the batch. 246 | 247 | Attributes 248 | ---------- 249 | spans (List[List[Span]]): The batch of input spans. The outer list refers 250 | to the Doc objects in the batch, and the inner list are the spans for 251 | that `Doc`. Note that spans are allowed to overlap or exclude tokens, 252 | but each Span can only refer to one Doc (by definition). 
This means that 253 | within a Doc, the regions of the output tensors that correspond to each 254 | Span may overlap or have gaps, but for each Doc, there is a non-overlapping 255 | contiguous slice of the outputs. 256 | wordpieces (WordpieceBatch): Token data from the Huggingface tokenizer. 257 | model_output (ModelOutput): The output of the transformer model. 258 | align (Ragged): Alignment from the spaCy tokenization to the wordpieces. 259 | This is a ragged array, where align.lengths[i] indicates the number of 260 | wordpiece tokens that token i aligns against. The actual indices are 261 | provided at align[i].dataXd. 262 | """ 263 | 264 | spans: List[List[Span]] 265 | wordpieces: WordpieceBatch 266 | model_output: ModelOutput 267 | align: Ragged 268 | cached_doc_data: Optional[List[TransformerData]] = None 269 | 270 | @classmethod 271 | def empty(cls, nr_docs) -> "FullTransformerBatch": 272 | spans: List[List[Span]] = [[] for _ in range(nr_docs)] 273 | doc_data = [TransformerData.empty() for _ in range(nr_docs)] 274 | align = Ragged( 275 | cast(Ints1d, numpy.zeros((0,), dtype="i")), 276 | cast(Ints1d, numpy.zeros((0,), dtype="i")), 277 | ) 278 | return cls( 279 | spans=spans, 280 | wordpieces=WordpieceBatch.empty(), 281 | model_output=ModelOutput(), 282 | align=align, 283 | cached_doc_data=doc_data, 284 | ) 285 | 286 | @property 287 | def tensors(self) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: 288 | return self.model_output.to_tuple() 289 | 290 | @property 291 | def tokens(self) -> Dict[str, Any]: 292 | """Deprecated. Dict formatted version of the self.wordpieces data, 293 | with values converted to PyTorch tensors. 294 | """ 295 | return self.wordpieces.to_hf_dict() 296 | 297 | @property 298 | def doc_data(self) -> List[TransformerData]: 299 | """The outputs, split per spaCy Doc object.""" 300 | if self.cached_doc_data is None: 301 | self.cached_doc_data = self.split_by_doc() 302 | return self.cached_doc_data 303 | 304 | def unsplit_by_doc(self, arrays: List[List[Floats3d]]) -> "FullTransformerBatch": 305 | """Return a new FullTransformerBatch from a split batch of activations, 306 | using the current object's spans, wordpieces and alignment. 307 | 308 | This is used during the backward pass, in order to construct the gradients 309 | to pass back into the transformer model. 310 | """ 311 | xp = get_array_module(arrays[0][0]) 312 | # construct a dummy ModelOutput with the tensor values 313 | model_output = ModelOutput() 314 | for i, x in enumerate(transpose_list(arrays)): 315 | model_output[f"output_{i}"] = xp2torch(xp.vstack(x)) 316 | return FullTransformerBatch( 317 | spans=self.spans, 318 | wordpieces=self.wordpieces, 319 | model_output=model_output, 320 | align=self.align, 321 | ) 322 | 323 | def split_by_doc(self) -> List[TransformerData]: 324 | """Split a TransformerData that represents a batch into a list with 325 | one TransformerData per Doc. 326 | """ 327 | flat_spans = [] 328 | for doc_spans in self.spans: 329 | flat_spans.extend(doc_spans) 330 | token_positions = get_token_positions(flat_spans) 331 | 332 | # Convert all outputs to XP arrays. 
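        # Only outputs that can be split per Doc are carried over: plain
        # tensors, and tuples of tensors (e.g. per-layer hidden states) whose
        # first dimension matches the batch of spans. Anything else is dropped.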
333 | xp_model_output = ModelOutput() 334 | last_hidden_state = cast(BaseModelOutput, self.model_output).last_hidden_state 335 | for key, output in self.model_output.items(): 336 | if isinstance(output, torch.Tensor): 337 | xp_model_output[key] = torch2xp(output) 338 | elif ( 339 | isinstance(output, tuple) 340 | and all(isinstance(t, torch.Tensor) for t in output) 341 | and all(t.shape[0] == last_hidden_state.shape[0] for t in output) 342 | ): 343 | xp_model_output[key] = [torch2xp(t) for t in output] 344 | 345 | # Split outputs per Doc. 346 | outputs = [] 347 | start = 0 348 | prev_tokens = 0 349 | for doc_spans in self.spans: 350 | if len(doc_spans) == 0 or len(doc_spans[0]) == 0: 351 | outputs.append(TransformerData.empty()) 352 | continue 353 | start_i = token_positions[doc_spans[0][0]] 354 | end_i = token_positions[doc_spans[-1][-1]] + 1 355 | end = start + len(doc_spans) 356 | doc_tokens = self.wordpieces[start:end] 357 | doc_align = self.align[start_i:end_i] 358 | doc_align.data = doc_align.data - prev_tokens 359 | model_output = ModelOutput() 360 | for key, output in xp_model_output.items(): 361 | # After the torch2xp conversion above, we only have XP arrays 362 | # and lists of XP arrays. 363 | if not isinstance(output, list): 364 | model_output[key] = output[start:end] 365 | else: 366 | model_output[key] = [t[start:end] for t in output] 367 | outputs.append( 368 | TransformerData( 369 | wordpieces=doc_tokens, 370 | model_output=model_output, 371 | align=doc_align, 372 | ) 373 | ) 374 | prev_tokens += doc_tokens.input_ids.size 375 | start += len(doc_spans) 376 | return outputs 377 | 378 | 379 | @dataclass 380 | class HFObjects: 381 | 382 | tokenizer: Any 383 | transformer: Any 384 | vocab_file_contents: Any 385 | _init_tokenizer_config: Dict[str, Any] = field(default_factory=dict) 386 | _init_transformer_config: Dict[str, Any] = field(default_factory=dict) 387 | -------------------------------------------------------------------------------- /spacy_transformers/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .listener import TransformerListener 2 | from .transformer_model import TransformerModel 3 | from .split_trf import split_trf_batch 4 | from .trfs2arrays import trfs2arrays 5 | 6 | 7 | __all__ = ["TransformerListener", "TransformerModel", "split_trf_batch", "trfs2arrays"] 8 | -------------------------------------------------------------------------------- /spacy_transformers/layers/_util.py: -------------------------------------------------------------------------------- 1 | from thinc.api import chain 2 | from .split_trf import split_trf_batch 3 | 4 | 5 | def replace_listener(model): 6 | return chain(model, split_trf_batch()) 7 | 8 | 9 | def replace_listener_cfg(tok2vec_model_cfg, listener_model_cfg): 10 | result = tok2vec_model_cfg.copy() 11 | if ( 12 | "TransformerModel" in tok2vec_model_cfg["@architectures"] 13 | and "TransformerListener" in listener_model_cfg["@architectures"] 14 | ): 15 | result["@architectures"] = "spacy-transformers.Tok2VecTransformer.v3" 16 | for key in ["pooling", "grad_factor"]: 17 | if key in listener_model_cfg and key not in result: 18 | result[key] = listener_model_cfg[key] 19 | return result 20 | -------------------------------------------------------------------------------- /spacy_transformers/layers/hf_shim.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | from io import BytesIO 3 | from pathlib import Path 4 
| import srsly 5 | import torch 6 | import warnings 7 | from thinc.api import get_torch_default_device 8 | from spacy.util import SimpleFrozenDict 9 | 10 | from ..data_classes import HFObjects 11 | from ..util import make_tempdir 12 | 13 | from thinc.api import PyTorchGradScaler, PyTorchShim 14 | 15 | from transformers import AutoModel, AutoConfig, AutoTokenizer 16 | 17 | 18 | class HFShim(PyTorchShim): 19 | """Interface between a HF Pytorch model and a Thinc Model.""" 20 | 21 | def __init__( 22 | self, 23 | model: HFObjects, 24 | config=None, 25 | optimizer: Any = None, 26 | mixed_precision: bool = False, 27 | grad_scaler_config: dict = {}, 28 | config_cls=AutoConfig, 29 | model_cls=AutoModel, 30 | tokenizer_cls=AutoTokenizer, 31 | ): 32 | self._hfmodel = model 33 | self.config_cls = config_cls 34 | self.model_cls = model_cls 35 | self.tokenizer_cls = tokenizer_cls 36 | 37 | # Enable gradient scaling when mixed precision is enabled and gradient 38 | # scaling is not explicitly disabled in the configuration. 39 | if "enabled" not in grad_scaler_config: 40 | grad_scaler_config["enabled"] = mixed_precision 41 | 42 | super().__init__( 43 | model.transformer, 44 | config, 45 | optimizer, 46 | mixed_precision, 47 | grad_scaler=PyTorchGradScaler(**grad_scaler_config), 48 | ) 49 | 50 | def to_bytes(self): 51 | config = {} 52 | tok_dict = {} 53 | weights_bytes = {} 54 | tok_cfg = {} 55 | trf_cfg = {} 56 | hf_model = self._hfmodel 57 | if hf_model.transformer is not None: 58 | tok_dict = {} 59 | config = hf_model.transformer.config.to_dict() 60 | tokenizer = hf_model.tokenizer 61 | with make_tempdir() as temp_dir: 62 | if hasattr(tokenizer, "vocab_file"): 63 | vocab_file_name = tokenizer.vocab_files_names["vocab_file"] 64 | vocab_file_path = str((temp_dir / vocab_file_name).absolute()) 65 | with open(vocab_file_path, "wb") as fileh: 66 | fileh.write(hf_model.vocab_file_contents) 67 | tokenizer.vocab_file = vocab_file_path 68 | tok_dict["kwargs"] = {"use_fast": tokenizer.is_fast} 69 | tokenizer.save_pretrained(str(temp_dir.absolute())) 70 | for x in temp_dir.glob("**/*"): 71 | if x.is_file(): 72 | tok_dict[x.name] = x.read_bytes() 73 | filelike = BytesIO() 74 | torch.save(self._model.state_dict(), filelike) 75 | filelike.seek(0) 76 | weights_bytes = filelike.getvalue() 77 | else: 78 | tok_cfg = hf_model._init_tokenizer_config 79 | trf_cfg = hf_model._init_transformer_config 80 | msg = { 81 | "config": config, 82 | "state": weights_bytes, 83 | "tokenizer": tok_dict, 84 | "_init_tokenizer_config": tok_cfg, 85 | "_init_transformer_config": trf_cfg, 86 | } 87 | return srsly.msgpack_dumps(msg) 88 | 89 | def from_bytes(self, bytes_data): 90 | msg = srsly.msgpack_loads(bytes_data) 91 | config_dict = msg["config"] 92 | tok_dict = msg["tokenizer"] 93 | if config_dict: 94 | with make_tempdir() as temp_dir: 95 | config_file = temp_dir / "config.json" 96 | srsly.write_json(config_file, config_dict) 97 | config = self.config_cls.from_pretrained(config_file) 98 | tok_kwargs = tok_dict.pop("kwargs", {}) 99 | for x, x_bytes in tok_dict.items(): 100 | Path(temp_dir / x).write_bytes(x_bytes) 101 | tokenizer = self.tokenizer_cls.from_pretrained( 102 | str(temp_dir.absolute()), **tok_kwargs 103 | ) 104 | vocab_file_contents = None 105 | if hasattr(tokenizer, "vocab_file"): 106 | vocab_file_name = tokenizer.vocab_files_names["vocab_file"] 107 | vocab_file_path = str((temp_dir / vocab_file_name).absolute()) 108 | with open(vocab_file_path, "rb") as fileh: 109 | vocab_file_contents = fileh.read() 110 | 111 | transformer = 
self.model_cls.from_config(config) 112 | self._hfmodel = HFObjects( 113 | tokenizer, 114 | transformer, 115 | vocab_file_contents, 116 | SimpleFrozenDict(), 117 | SimpleFrozenDict(), 118 | ) 119 | self._model = transformer 120 | filelike = BytesIO(msg["state"]) 121 | filelike.seek(0) 122 | device = get_torch_default_device() 123 | try: 124 | self._model.load_state_dict(torch.load(filelike, map_location=device)) 125 | except RuntimeError: 126 | warn_msg = ( 127 | "Error loading saved torch state_dict with strict=True, " 128 | "likely due to differences between 'transformers' " 129 | "versions. Attempting to load with strict=False as a " 130 | "fallback...\n\n" 131 | "If you see errors or degraded performance, download a " 132 | "newer compatible model or retrain your custom model with " 133 | "the current 'transformers' and 'spacy-transformers' " 134 | "versions. For more details and available updates, run: " 135 | "python -m spacy validate" 136 | ) 137 | warnings.warn(warn_msg) 138 | filelike.seek(0) 139 | b = torch.load(filelike, map_location=device) 140 | self._model.load_state_dict(b, strict=False) 141 | self._model.to(device) 142 | else: 143 | self._hfmodel = HFObjects( 144 | None, 145 | None, 146 | None, 147 | msg["_init_tokenizer_config"], 148 | msg["_init_transformer_config"], 149 | ) 150 | return self 151 | -------------------------------------------------------------------------------- /spacy_transformers/layers/hf_wrapper.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional, Any 2 | from thinc.layers.pytorchwrapper import forward as pt_forward 3 | from thinc.layers.pytorchwrapper import convert_pytorch_default_inputs 4 | from thinc.layers.pytorchwrapper import convert_pytorch_default_outputs 5 | from thinc.api import registry, Model 6 | 7 | from transformers import AutoConfig, AutoModel, AutoTokenizer 8 | 9 | from ..data_classes import HFObjects 10 | from .hf_shim import HFShim 11 | 12 | 13 | @registry.layers("HFWrapper.v1") 14 | def HFWrapper( 15 | hf_model: HFObjects, 16 | convert_inputs: Optional[Callable] = None, 17 | convert_outputs: Optional[Callable] = None, 18 | mixed_precision: bool = False, 19 | grad_scaler_config: dict = {}, 20 | config_cls=AutoConfig, 21 | model_cls=AutoModel, 22 | tokenizer_cls=AutoTokenizer, 23 | ) -> Model[Any, Any]: 24 | """Wrap a PyTorch HF model, so that it has the same API as Thinc models. 25 | To optimize the model, you'll need to create a PyTorch optimizer and call 26 | optimizer.step() after each batch. See examples/wrap_pytorch.py 27 | 28 | Your PyTorch model's forward method can take arbitrary args and kwargs, 29 | but must return either a single tensor as output or a tuple. You may find the 30 | PyTorch register_forward_hook helpful if you need to adapt the output. 31 | 32 | The convert functions are used to map inputs and outputs to and from your 33 | PyTorch model. Each function should return the converted output, and a callback 34 | to use during the backward pass. So: 35 | 36 | Xtorch, get_dX = convert_inputs(X) 37 | Ytorch, torch_backprop = model.shims[0](Xtorch, is_train) 38 | Y, get_dYtorch = convert_outputs(Ytorch) 39 | 40 | To allow maximum flexibility, the PyTorchShim expects ArgsKwargs objects 41 | on the way into the forward and backward passed. The ArgsKwargs objects 42 | will be passed straight into the model in the forward pass, and straight 43 | into `torch.autograd.backward` during the backward pass. 
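
    As a rough sketch of what a convert_inputs callback can look like (the
    callback actually used by the transformer component is
    _convert_transformer_inputs in layers/transformer_model.py; it relies on
    thinc's xp2torch and ArgsKwargs):

        def convert_inputs(model, wordpiece_batch, is_train):
            kwargs = {"input_ids": xp2torch(wordpiece_batch.input_ids)}
            # nothing to backprop into the tokenized inputs
            return ArgsKwargs(args=(), kwargs=kwargs), lambda d_inputs: []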
44 | """ 45 | if convert_inputs is None: 46 | convert_inputs = convert_pytorch_default_inputs 47 | if convert_outputs is None: 48 | convert_outputs = convert_pytorch_default_outputs 49 | 50 | return Model( 51 | "hf-pytorch", 52 | pt_forward, 53 | attrs={"convert_inputs": convert_inputs, "convert_outputs": convert_outputs}, 54 | shims=[ 55 | HFShim( 56 | hf_model, 57 | mixed_precision=mixed_precision, 58 | grad_scaler_config=grad_scaler_config, 59 | config_cls=config_cls, 60 | model_cls=model_cls, 61 | tokenizer_cls=tokenizer_cls, 62 | ) 63 | ], 64 | dims={"nI": None, "nO": None}, 65 | ) 66 | -------------------------------------------------------------------------------- /spacy_transformers/layers/listener.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, List 2 | from thinc.api import Model 3 | from spacy.errors import Errors 4 | from spacy.tokens import Doc 5 | from ..data_classes import TransformerData 6 | 7 | 8 | class TransformerListener(Model): 9 | """A layer that gets fed its answers from an upstream connection, 10 | for instance from a component earlier in the pipeline. 11 | """ 12 | 13 | name = "transformer-listener" 14 | 15 | _batch_id: Optional[int] 16 | _outputs: Optional[List[TransformerData]] 17 | _backprop: Optional[Callable[[List[TransformerData]], List[Doc]]] 18 | 19 | def __init__(self, upstream_name: str): 20 | Model.__init__(self, name=self.name, forward=forward, dims={"nO": None}) 21 | self.upstream_name = upstream_name 22 | self._batch_id = None 23 | self._outputs = None 24 | self._backprop = None 25 | 26 | @classmethod 27 | def get_batch_id(cls, inputs: List[Doc]): 28 | return sum(sum(token.orth for token in doc) for doc in inputs) 29 | 30 | def receive(self, batch_id, outputs, backprop): 31 | self._batch_id = batch_id 32 | self._outputs = outputs 33 | self._backprop = backprop 34 | 35 | def backprop_and_clear(self, *args, **kwargs): 36 | """Call the stored _backprop callback, and then 37 | clears it. This saves memory, as otherwise we hold onto that callback 38 | until the next batch. 39 | """ 40 | if self._backprop is not None: 41 | result = self._backprop(*args, **kwargs) 42 | else: 43 | result = None 44 | self._batch_id = None 45 | self._outputs = None 46 | self._backprop = None 47 | return result 48 | 49 | def verify_inputs(self, inputs): 50 | if self._batch_id is None and self._outputs is None: 51 | raise ValueError 52 | else: 53 | batch_id = self.get_batch_id(inputs) 54 | if batch_id != self._batch_id: 55 | raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}") 56 | else: 57 | return True 58 | 59 | 60 | def forward(model: TransformerListener, docs, is_train): 61 | if is_train: 62 | # This might occur during training when the transformer layer is frozen / hasn't been updated. 63 | # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc. 
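        # Two cases follow: either no outputs were pushed to this listener for
        # the current batch (frozen or annotating transformer), in which case
        # we fall back to the TransformerData already stored on each doc, or
        # the upstream Transformer called receive() and we return those
        # outputs together with the stored backprop callback.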
64 | if model._batch_id is None: 65 | outputs = [] 66 | for doc in docs: 67 | if doc._.trf_data is None: 68 | raise ValueError(Errors.E203.format(name="transformer")) 69 | else: 70 | outputs.append(doc._.trf_data) 71 | return outputs, _empty_backprop 72 | else: 73 | model.verify_inputs(docs) 74 | return model._outputs, model.backprop_and_clear 75 | else: 76 | width = model.get_dim("nO") 77 | outputs = [] 78 | for doc in docs: 79 | if doc._.trf_data is None: 80 | outputs.append(TransformerData.zeros(len(doc), width, xp=model.ops.xp)) 81 | else: 82 | outputs.append(doc._.trf_data) 83 | return outputs, _empty_backprop 84 | 85 | 86 | def _empty_backprop(dX): 87 | return [] 88 | -------------------------------------------------------------------------------- /spacy_transformers/layers/split_trf.py: -------------------------------------------------------------------------------- 1 | from thinc.api import Model 2 | from typing import List 3 | from ..data_classes import FullTransformerBatch, TransformerData 4 | 5 | 6 | def split_trf_batch() -> Model[FullTransformerBatch, List[TransformerData]]: 7 | return Model("split-trf-batch", forward) 8 | 9 | 10 | def forward(model, trf_full, is_train): 11 | def backprop(d_trf_datas): 12 | return trf_full.unsplit_by_doc([x.tensors for x in d_trf_datas]) 13 | 14 | return trf_full.doc_data, backprop 15 | -------------------------------------------------------------------------------- /spacy_transformers/layers/transformer_model.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Callable, Union, Dict 2 | import copy 3 | from pathlib import Path 4 | from transformers.file_utils import ModelOutput 5 | from transformers import AutoConfig, AutoModel, AutoTokenizer 6 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 7 | from transformers.tokenization_utils import BatchEncoding 8 | 9 | from spacy.tokens import Doc 10 | from thinc.api import Model, get_torch_default_device, xp2torch 11 | from thinc.types import ArgsKwargs 12 | 13 | import logging 14 | 15 | from ..data_classes import FullTransformerBatch, WordpieceBatch, HFObjects 16 | from ..util import maybe_flush_pytorch_cache 17 | from ..util import log_gpu_memory, log_batch_size 18 | from ..layers._util import replace_listener, replace_listener_cfg 19 | from ..truncate import truncate_oversize_splits 20 | from ..align import get_alignment, get_alignment_via_offset_mapping 21 | from .hf_wrapper import HFWrapper 22 | 23 | 24 | class TransformerModel(Model): 25 | def __init__( 26 | self, 27 | name: str, 28 | get_spans: Callable, 29 | tokenizer_config: dict = {}, 30 | transformer_config: dict = {}, 31 | mixed_precision: bool = False, 32 | grad_scaler_config: dict = {}, 33 | ): 34 | """ 35 | get_spans (Callable[[List[Doc]], List[Span]]): 36 | A function to extract spans from the batch of Doc objects. 37 | This is used to manage long documents, by cutting them into smaller 38 | sequences before running the transformer. The spans are allowed to 39 | overlap, and you can also omit sections of the Doc if they are not 40 | relevant. 41 | tokenizer_config (dict): Settings to pass to the transformers tokenizer. 42 | transformer_config (dict): Settings to pass to the transformers forward pass. 
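
        A minimal construction sketch (normally this layer is built from one of
        the registered architectures; the whole-doc span getter below is only a
        stand-in for the registered span getters in span_getters.py):

            model = TransformerModel(
                "roberta-base",
                get_spans=lambda docs: [[doc[:]] for doc in docs],
                tokenizer_config={"use_fast": True},
            )

        The Huggingface weights are only downloaded and loaded once the model
        is initialized (e.g. during nlp.initialize()).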
43 | """ 44 | hf_model = HFObjects(None, None, None, tokenizer_config, transformer_config) 45 | wrapper = HFWrapper( 46 | hf_model, 47 | convert_inputs=_convert_transformer_inputs, 48 | convert_outputs=_convert_transformer_outputs, 49 | mixed_precision=mixed_precision, 50 | grad_scaler_config=grad_scaler_config, 51 | ) 52 | super().__init__( 53 | "transformer", 54 | forward, 55 | init=init, 56 | layers=[wrapper], 57 | dims={"nO": None}, 58 | attrs={ 59 | "get_spans": get_spans, 60 | "name": name, 61 | "set_transformer": set_pytorch_transformer, 62 | "has_transformer": False, 63 | "flush_cache_chance": 0.0, 64 | "replace_listener": replace_listener, 65 | "replace_listener_cfg": replace_listener_cfg, 66 | }, 67 | ) 68 | 69 | @property 70 | def tokenizer(self): 71 | return self.layers[0].shims[0]._hfmodel.tokenizer 72 | 73 | @property 74 | def transformer(self): 75 | return self.layers[0].shims[0]._hfmodel.transformer 76 | 77 | @property 78 | def _init_tokenizer_config(self): 79 | return self.layers[0].shims[0]._hfmodel._init_tokenizer_config 80 | 81 | @property 82 | def _init_transformer_config(self): 83 | return self.layers[0].shims[0]._hfmodel._init_transformer_config 84 | 85 | def copy(self): 86 | """ 87 | Create a copy of the model, its attributes, and its parameters. Any child 88 | layers will also be deep-copied. The copy will receive a distinct `model.id` 89 | value. 90 | """ 91 | copied = TransformerModel(self.name, self.attrs["get_spans"]) 92 | params = {} 93 | for name in self.param_names: 94 | params[name] = self.get_param(name) if self.has_param(name) else None 95 | copied.params = copy.deepcopy(params) 96 | copied.dims = copy.deepcopy(self._dims) 97 | copied.layers[0] = copy.deepcopy(self.layers[0]) 98 | for name in self.grad_names: 99 | copied.set_grad(name, self.get_grad(name).copy()) 100 | return copied 101 | 102 | 103 | def set_logger(model, out_file): 104 | """Add a logger that will log memory usage to the given file. 105 | 106 | Used to debug OOM errors. 
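
    For example (the model variable name is illustrative):

        import sys

        set_logger(transformer_model, sys.stderr)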
107 | """ 108 | logging.basicConfig( 109 | level="INFO", format="%(asctime)s:%(levelname)s: %(message)s", stream=out_file 110 | ) 111 | model.attrs["logger"] = logging.getLogger(__name__) 112 | 113 | 114 | def set_pytorch_transformer(model, hf_model: HFObjects): 115 | if model.attrs["has_transformer"]: 116 | raise ValueError("Cannot set second transformer.") 117 | model.layers[0].shims[0]._model = hf_model.transformer 118 | model.layers[0].shims[0]._hfmodel.tokenizer = hf_model.tokenizer 119 | model.layers[0].shims[0]._hfmodel.transformer = hf_model.transformer 120 | model.layers[0].shims[0]._hfmodel.vocab_file_contents = hf_model.vocab_file_contents 121 | model.attrs["has_transformer"] = True 122 | model.set_dim("nO", hf_model.transformer.config.hidden_size) 123 | 124 | 125 | def init(model: TransformerModel, X=None, Y=None): 126 | if model.attrs["has_transformer"]: 127 | return 128 | name = model.attrs["name"] 129 | tok_cfg = model._init_tokenizer_config 130 | trf_cfg = model._init_transformer_config 131 | hf_model = huggingface_from_pretrained(name, tok_cfg, trf_cfg) 132 | model.attrs["set_transformer"](model, hf_model) 133 | tokenizer = model.tokenizer 134 | # Call the model with a batch of inputs to infer the width 135 | if X: 136 | # If we're dealing with actual texts, do the work to setup the wordpieces 137 | # batch properly 138 | docs = X 139 | get_spans = model.attrs["get_spans"] 140 | nested_spans = get_spans(docs) 141 | flat_spans = [] 142 | for doc_spans in nested_spans: 143 | flat_spans.extend(doc_spans) 144 | token_data = huggingface_tokenize(tokenizer, [span.text for span in flat_spans]) 145 | wordpieces = WordpieceBatch.from_batch_encoding(token_data) 146 | if "offset_mapping" in token_data: 147 | align = get_alignment_via_offset_mapping( 148 | flat_spans, 149 | token_data["offset_mapping"], 150 | ) 151 | else: 152 | align = get_alignment( 153 | flat_spans, wordpieces.strings, tokenizer.all_special_tokens 154 | ) 155 | wordpieces, align = truncate_oversize_splits( 156 | wordpieces, align, tokenizer.model_max_length 157 | ) 158 | else: 159 | texts = ["hello world", "foo bar"] 160 | token_data = huggingface_tokenize(tokenizer, texts) 161 | wordpieces = WordpieceBatch.from_batch_encoding(token_data) 162 | model.layers[0].initialize(X=wordpieces) 163 | model_output = model.layers[0].predict(wordpieces) 164 | model.set_dim("nO", model_output.last_hidden_state.shape[-1]) 165 | 166 | 167 | def forward( 168 | model: TransformerModel, docs: List[Doc], is_train: bool 169 | ) -> Tuple[FullTransformerBatch, Callable]: 170 | tokenizer = model.tokenizer 171 | get_spans = model.attrs["get_spans"] 172 | transformer = model.layers[0] 173 | 174 | nested_spans = get_spans(docs) 175 | flat_spans = [] 176 | for doc_spans in nested_spans: 177 | flat_spans.extend(doc_spans) 178 | # Flush the PyTorch cache every so often. It seems to help with memory :( 179 | # This shouldn't be necessary, I'm not sure what I'm doing wrong? 
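    # The flush probability is read from the model's "flush_cache_chance"
    # attribute, which defaults to 0.0, i.e. the cache is never flushed unless
    # this is explicitly raised.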
180 | maybe_flush_pytorch_cache(chance=model.attrs.get("flush_cache_chance", 0)) 181 | if "logger" in model.attrs: 182 | log_gpu_memory(model.attrs["logger"], "begin forward") 183 | batch_encoding = huggingface_tokenize(tokenizer, [span.text for span in flat_spans]) 184 | wordpieces = WordpieceBatch.from_batch_encoding(batch_encoding) 185 | if "logger" in model.attrs: 186 | log_batch_size(model.attrs["logger"], wordpieces, is_train) 187 | if "offset_mapping" in batch_encoding: 188 | align = get_alignment_via_offset_mapping( 189 | flat_spans, 190 | batch_encoding["offset_mapping"], 191 | ) 192 | else: 193 | align = get_alignment( 194 | flat_spans, wordpieces.strings, tokenizer.all_special_tokens 195 | ) 196 | wordpieces, align = truncate_oversize_splits( 197 | wordpieces, align, tokenizer.model_max_length 198 | ) 199 | model_output, bp_tensors = transformer(wordpieces, is_train) 200 | if "logger" in model.attrs: 201 | log_gpu_memory(model.attrs["logger"], "after forward") 202 | output = FullTransformerBatch( 203 | spans=nested_spans, 204 | wordpieces=wordpieces, 205 | model_output=model_output, 206 | align=align, 207 | ) 208 | if "logger" in model.attrs: 209 | log_gpu_memory(model.attrs["logger"], "return from forward") 210 | 211 | def backprop_transformer(d_output: FullTransformerBatch) -> List[Doc]: 212 | if "logger" in model.attrs: 213 | log_gpu_memory(model.attrs["logger"], "Begin backprop") 214 | _ = bp_tensors(d_output.model_output) 215 | if "logger" in model.attrs: 216 | log_gpu_memory(model.attrs["logger"], "After backprop") 217 | return docs 218 | 219 | return output, backprop_transformer 220 | 221 | 222 | def _convert_transformer_inputs(model, wps: WordpieceBatch, is_train): 223 | # Adapter for the HFWrapper. See https://thinc.ai/docs/usage-frameworks 224 | 225 | hf_device = model.shims[0]._hfmodel.transformer.device 226 | kwargs = { 227 | "input_ids": xp2torch(wps.input_ids, device=hf_device), 228 | "attention_mask": xp2torch(wps.attention_mask, device=hf_device), 229 | } 230 | if wps.token_type_ids is not None: 231 | kwargs["token_type_ids"] = xp2torch(wps.token_type_ids, device=hf_device) 232 | return ArgsKwargs(args=(), kwargs=kwargs), lambda dX: [] 233 | 234 | 235 | def _convert_transformer_outputs(model, inputs_outputs, is_train): 236 | _, model_output = inputs_outputs 237 | 238 | def backprop(d_model_output: ModelOutput) -> ArgsKwargs: 239 | return ArgsKwargs( 240 | args=(model_output.last_hidden_state,), 241 | kwargs={"grad_tensors": d_model_output.values()}, 242 | ) 243 | 244 | return model_output, backprop 245 | 246 | 247 | def huggingface_from_pretrained( 248 | source: Union[Path, str], 249 | tok_config: Dict, 250 | trf_config: Dict, 251 | config_cls=AutoConfig, 252 | model_cls=AutoModel, 253 | tokenizer_cls=AutoTokenizer, 254 | ) -> HFObjects: 255 | """Create a Huggingface transformer model from pretrained weights. Will 256 | download the model if it is not already downloaded. 257 | 258 | source (Union[str, Path]): The name of the model or a path to it, such as 259 | 'bert-base-cased'. 260 | tok_config (dict): Settings to pass to the tokenizer. 261 | trf_config (dict): Settings to pass to the transformer. 
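
    A rough usage sketch (model name and settings are illustrative):

        hf_objects = huggingface_from_pretrained(
            "bert-base-cased", {"use_fast": True}, {}
        )
        tokenizer = hf_objects.tokenizer
        transformer = hf_objects.transformer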
262 | """ 263 | if isinstance(source, Path): 264 | str_path = str(source.absolute()) 265 | else: 266 | str_path = source 267 | tokenizer = tokenizer_cls.from_pretrained(str_path, **tok_config) 268 | vocab_file_contents = None 269 | if hasattr(tokenizer, "vocab_file"): 270 | with open(tokenizer.vocab_file, "rb") as fileh: 271 | vocab_file_contents = fileh.read() 272 | trf_config["return_dict"] = True 273 | config = config_cls.from_pretrained(str_path, **trf_config) 274 | transformer = model_cls.from_pretrained(str_path, config=config) 275 | torch_device = get_torch_default_device() 276 | transformer.to(torch_device) 277 | return HFObjects(tokenizer, transformer, vocab_file_contents) 278 | 279 | 280 | def huggingface_tokenize(tokenizer, texts: List[str]) -> BatchEncoding: 281 | """Apply a Huggingface tokenizer to a batch of texts.""" 282 | 283 | # Use NumPy arrays rather than PyTorch tensors to avoid a lot of 284 | # host <-> device transfers during tokenization and post-processing 285 | # when a GPU is used. 286 | token_data = tokenizer( 287 | texts, 288 | add_special_tokens=True, 289 | return_attention_mask=True, 290 | return_offsets_mapping=isinstance(tokenizer, PreTrainedTokenizerFast), 291 | return_tensors="np", 292 | return_token_type_ids=None, # Sets to model default 293 | padding="longest", 294 | ) 295 | token_data["input_texts"] = [] 296 | for i in range(len(token_data["input_ids"])): 297 | wp_texts = tokenizer.convert_ids_to_tokens(token_data["input_ids"][i]) 298 | token_data["input_texts"].append(wp_texts) 299 | token_data["pad_token"] = tokenizer.pad_token 300 | return token_data 301 | -------------------------------------------------------------------------------- /spacy_transformers/layers/trfs2arrays.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Optional, Tuple, cast 2 | import numpy 3 | from spacy.util import all_equal 4 | from transformers.file_utils import ModelOutput 5 | from transformers.modeling_outputs import BaseModelOutput 6 | from thinc.api import Model 7 | from thinc.types import Ragged, Floats2d 8 | from ..data_classes import TransformerData 9 | from ..align import apply_alignment 10 | 11 | 12 | def trfs2arrays( 13 | pooling: Model[Ragged, Floats2d], grad_factor: float 14 | ) -> Model[List[TransformerData], List[Floats2d]]: 15 | """Pool transformer data into token-aligned tensors.""" 16 | return Model( 17 | "trfs2arrays", 18 | forward, 19 | layers=[pooling], 20 | attrs={"grad_factor": grad_factor}, 21 | ) 22 | 23 | 24 | def forward(model: Model, trf_datas: List[TransformerData], is_train: bool): 25 | pooling: Model[Ragged, Floats2d] = model.layers[0] 26 | grad_factor = model.attrs["grad_factor"] 27 | zero_outputs: List[Tuple[int, Floats2d]] = [] 28 | backprops_alignment: List[Optional[Callable]] = [] 29 | aligned_outputs: List[Tuple[int, Ragged]] = [] 30 | 31 | # For zero-length documents, we could cache the output width by iterating 32 | # through the batch outputs and retrieving the shape of a non-zero length 33 | # Doc. This, however, is not fool-proof as one can pass an entire batch of 34 | # zero-length Docs to the transformer model (at least during prediction). 35 | # Instead of being conditionally correct, we'll explicitly leave the width as 36 | # zero in these cases as the effective length of the resultant tensor is zero anyway. 
37 | output_width = 0 38 | 39 | for i, trf_data in enumerate(trf_datas): 40 | if not isinstance(trf_data, TransformerData): 41 | raise ValueError( 42 | "Expected spacy_transformers.data_classes.TransformerData " 43 | f"in trf_data, got: {type(trf_data)}\n" 44 | "Check that your pipeline contains a transformer component " 45 | "with a spacy-transformers TransformerModel architecture." 46 | ) 47 | if "last_hidden_state" in trf_data.model_output: 48 | tensor_t_i = cast(BaseModelOutput, trf_data.model_output).last_hidden_state 49 | if tensor_t_i.size == 0: 50 | # This can happen during prediction/initialization if the transformer pipe was disabled/not executed and one of the inputs 51 | # was of length zero. This causes the listenener to generate a zero-sized (in the sequence length dim) TransformerData 52 | # output and pass it downstream. 53 | zero_outputs.append((i, model.ops.alloc2f(0, output_width))) 54 | backprops_alignment.append(None) 55 | else: 56 | # This is the general case for non-zero length documents. 57 | src = model.ops.reshape2f(tensor_t_i, -1, trf_data.width) # type: ignore 58 | dst, get_d_src = apply_alignment(model.ops, trf_data.align, src) 59 | aligned_outputs.append((i, dst)) 60 | backprops_alignment.append(get_d_src) 61 | else: 62 | # This can happen during prediction/training for zero-length documents. Since zero-length docs 63 | # are implicitly ignored in the span generation stage, the transformer model does not return any 64 | # predictions for them and subsequently, FullTransformerBatch.split_by_doc() generates an empty 65 | # TransformerData. 66 | zero_outputs.append((i, model.ops.alloc2f(0, output_width))) 67 | backprops_alignment.append(None) 68 | 69 | pooling_outputs, backprop_pooling = concat_pooling_forward( 70 | pooling, [dst for _, dst in aligned_outputs], is_train 71 | ) 72 | 73 | # Interleave the zero and non-zero outputs into the final result. 74 | outputs: List[Optional[Floats2d]] = [None] * ( 75 | len(zero_outputs) + len(aligned_outputs) 76 | ) 77 | for i, zero_output in zero_outputs: 78 | outputs[i] = zero_output 79 | for (i, _), pooling_output in zip(aligned_outputs, pooling_outputs): 80 | outputs[i] = pooling_output 81 | 82 | def backprop_trf_to_tensor(d_outputs: List[Floats2d]) -> List[TransformerData]: 83 | d_trf_datas: List[TransformerData] = [] 84 | 85 | # Only update the gradients that are relevant for pooling. 86 | d_pooling = backprop_pooling([d_outputs[i] for i, _ in aligned_outputs]) 87 | for (i, _), d_pooling_i in zip(aligned_outputs, d_pooling): 88 | d_outputs[i] = d_pooling_i 89 | 90 | to_zip = (trf_datas, d_outputs, backprops_alignment) 91 | assert all_equal(len(x) for x in to_zip) # type: ignore 92 | zipped = zip(*to_zip) 93 | for trf_data, d_output, get_d_src in zipped: 94 | if "last_hidden_state" not in trf_data.model_output: 95 | # This gradient belongs to a zero-length doc and must be ignored as it doesn't have a corresponding 96 | # output from the transformer model (due to empty documents being skipped during the span generation 97 | # stage in the forward pass). 
98 | assert len(d_output) == 0 99 | assert get_d_src is None 100 | continue 101 | 102 | assert get_d_src is not None 103 | d_model_output = ModelOutput( 104 | last_hidden_state=model.ops.alloc( 105 | trf_data.model_output.last_hidden_state.shape, # type: ignore 106 | dtype=trf_data.model_output.last_hidden_state.dtype, # type: ignore 107 | ) 108 | ) 109 | d_src = get_d_src(d_output) 110 | d_src *= grad_factor 111 | d_model_output["last_hidden_state"] = d_src.reshape( 112 | cast(BaseModelOutput, trf_data.model_output).last_hidden_state.shape 113 | ) 114 | d_trf_datas.append( 115 | TransformerData( 116 | model_output=d_model_output, 117 | wordpieces=trf_data.wordpieces, 118 | align=trf_data.align, 119 | ) 120 | ) 121 | return d_trf_datas 122 | 123 | assert len(outputs) == len(trf_datas) 124 | return outputs, backprop_trf_to_tensor 125 | 126 | 127 | def concat_pooling_forward( 128 | pooling: Model[Ragged, Floats2d], X: List[Ragged], is_train: bool 129 | ): 130 | xp = pooling.ops.xp 131 | 132 | datas = [] 133 | lens = [] 134 | doc_lens = [] 135 | for X_doc_data in X: 136 | datas.append(X_doc_data.dataXd) 137 | lens.append(X_doc_data.lengths) 138 | doc_lens.append(len(X_doc_data.lengths)) 139 | 140 | X_flat = Ragged(xp.concatenate(datas, axis=0), xp.concatenate(lens, axis=0)) 141 | Y_pooled, pooling_backprop = pooling(X_flat, is_train) 142 | Y = xp.split(Y_pooled, numpy.cumsum(doc_lens)[:-1]) 143 | 144 | def backprop(dY): 145 | dY_pooled_flat = xp.concatenate(dY) 146 | dY_flat = pooling_backprop(dY_pooled_flat).dataXd 147 | 148 | dY = [] 149 | for X_doc_data in X: 150 | doc_unpooled_len = X_doc_data.dataXd.shape[0] 151 | dY.append(Ragged(dY_flat[:doc_unpooled_len], X_doc_data.lengths)) 152 | dY_flat = dY_flat[doc_unpooled_len:] 153 | 154 | return dY 155 | 156 | return Y, backprop 157 | -------------------------------------------------------------------------------- /spacy_transformers/pipeline_component.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable, Iterable, Iterator, Optional, Dict, Union 2 | import warnings 3 | from spacy.language import Language 4 | from spacy.pipeline.trainable_pipe import TrainablePipe 5 | from spacy.pipeline.pipe import deserialize_config 6 | from spacy.tokens import Doc 7 | from spacy.vocab import Vocab 8 | from spacy.training import Example, validate_examples 9 | from spacy import util, Errors 10 | from spacy.util import minibatch 11 | from thinc.api import Model, Config, set_dropout_rate, Optimizer 12 | import srsly 13 | from pathlib import Path 14 | 15 | from .layers.transformer_model import huggingface_from_pretrained 16 | from .util import batch_by_length 17 | from .annotation_setters import null_annotation_setter 18 | from .data_classes import FullTransformerBatch, TransformerData 19 | from .layers import TransformerListener 20 | 21 | 22 | DEFAULT_CONFIG_STR = """ 23 | [transformer] 24 | max_batch_items = 4096 25 | 26 | [transformer.set_extra_annotations] 27 | @annotation_setters = "spacy-transformers.null_annotation_setter.v1" 28 | 29 | [transformer.model] 30 | @architectures = "spacy-transformers.TransformerModel.v3" 31 | name = "roberta-base" 32 | tokenizer_config = {"use_fast": true} 33 | transformer_config = {} 34 | mixed_precision = false 35 | grad_scaler_config = {} 36 | 37 | [transformer.model.get_spans] 38 | @span_getters = "spacy-transformers.strided_spans.v1" 39 | window = 128 40 | stride = 96 41 | """ 42 | 43 | DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_STR) 44 | 
DOC_EXT_ATTR = "trf_data" 45 | 46 | 47 | @Language.factory( 48 | "transformer", 49 | assigns=[f"doc._.{DOC_EXT_ATTR}"], 50 | default_config=DEFAULT_CONFIG["transformer"], 51 | ) 52 | def make_transformer( 53 | nlp: Language, 54 | name: str, 55 | model: Model[List[Doc], FullTransformerBatch], 56 | set_extra_annotations: Callable[[List[Doc], FullTransformerBatch], None], 57 | max_batch_items: int, 58 | ): 59 | """Construct a Transformer component, which lets you plug a model from the 60 | Huggingface transformers library into spaCy so you can use it in your 61 | pipeline. One or more subsequent spaCy components can use the transformer 62 | outputs as features in its model, with gradients backpropagated to the single 63 | shared weights. 64 | 65 | model (Model[List[Doc], FullTransformerBatch]): A thinc Model object wrapping 66 | the transformer. Usually you will want to use the TransformerModel 67 | layer for this. 68 | set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]): A 69 | callback to set additional information onto the batch of `Doc` objects. 70 | The doc._.trf_data attribute is set prior to calling the callback. 71 | By default, no additional annotations are set. 72 | """ 73 | return Transformer( 74 | nlp.vocab, 75 | model, 76 | set_extra_annotations, 77 | max_batch_items=max_batch_items, 78 | name=name, 79 | ) 80 | 81 | 82 | def install_extensions() -> None: 83 | if not Doc.has_extension(DOC_EXT_ATTR): 84 | Doc.set_extension(DOC_EXT_ATTR, default=None) 85 | 86 | 87 | class Transformer(TrainablePipe): 88 | """spaCy pipeline component that provides access to a transformer model from 89 | the Huggingface transformers library. Usually you will connect subsequent 90 | components to the shared transformer using the TransformerListener layer. 91 | This works similarly to spaCy's Tok2Vec component and Tok2VecListener 92 | sublayer. 93 | 94 | The activations from the transformer are saved in the doc._.trf_data extension 95 | attribute. You can also provide a callback to set additional annotations. 96 | 97 | vocab (Vocab): The Vocab object for the pipeline. 98 | model (Model[List[Doc], FullTransformerBatch]): A thinc Model object wrapping 99 | the transformer. Usually you will want to use the TransformerModel 100 | layer for this. 101 | set_extra_annotations (Callable[[List[Doc], FullTransformerBatch], None]): A 102 | callback to set additional information onto the batch of `Doc` objects. 103 | The doc._.trf_data attribute is set prior to calling the callback. 104 | By default, no additional annotations are set. 105 | """ 106 | 107 | def __init__( 108 | self, 109 | vocab: Vocab, 110 | model: Model[List[Doc], FullTransformerBatch], 111 | set_extra_annotations: Callable = null_annotation_setter, 112 | *, 113 | name: str = "transformer", 114 | max_batch_items: int = 128 * 32, # Max size of padded batch 115 | ): 116 | """Initialize the transformer component.""" 117 | self.name = name 118 | self.vocab = vocab 119 | self.model = model 120 | if not isinstance(self.model, Model): 121 | raise ValueError(f"Expected Thinc Model, got: {type(self.model)}") 122 | self.set_extra_annotations = set_extra_annotations 123 | self.cfg = {"max_batch_items": max_batch_items} 124 | self.listener_map: Dict[str, List[TransformerListener]] = {} 125 | install_extensions() 126 | 127 | @property 128 | def listeners(self) -> List[TransformerListener]: 129 | """RETURNS (List[TransformerListener]): The listener models listening 130 | to this component. Usually internals. 
131 | """ 132 | return [m for c in self.listening_components for m in self.listener_map[c]] 133 | 134 | @property 135 | def listening_components(self) -> List[str]: 136 | """RETURNS (List[str]): The downstream components listening to this 137 | component. Usually internals. 138 | """ 139 | return list(self.listener_map.keys()) 140 | 141 | def add_listener(self, listener: TransformerListener, component_name: str) -> None: 142 | """Add a listener for a downstream component. Usually internals.""" 143 | self.listener_map.setdefault(component_name, []) 144 | if listener not in self.listener_map[component_name]: 145 | self.listener_map[component_name].append(listener) 146 | if self.model.has_dim("nO") and listener.has_dim("nO") is None: 147 | listener.set_dim("nO", self.model.get_dim("nO")) 148 | 149 | def remove_listener( 150 | self, listener: TransformerListener, component_name: str 151 | ) -> bool: 152 | """Remove a listener for a downstream component. Usually internals.""" 153 | if component_name in self.listener_map: 154 | if listener in self.listener_map[component_name]: 155 | self.listener_map[component_name].remove(listener) 156 | # If no listeners are left, remove entry 157 | if not self.listener_map[component_name]: 158 | del self.listener_map[component_name] 159 | return True 160 | return False 161 | 162 | def find_listeners(self, component) -> None: 163 | """Walk over a model of a processing component, looking for layers that 164 | are TransformerListener subclasses that have an upstream_name that 165 | matches this component. 166 | Listeners can also set their upstream_name attribute to the wildcard 167 | string '*' to match any `Transformer`. 168 | 169 | You're unlikely to ever need multiple `Transformer` components, so it's 170 | fine to leave your listeners upstream_name on '*'. 171 | """ 172 | names = ("*", self.name) 173 | if isinstance(getattr(component, "model", None), Model): 174 | for node in component.model.walk(): 175 | if ( 176 | isinstance(node, TransformerListener) 177 | and node.upstream_name in names 178 | ): 179 | self.add_listener(node, component.name) 180 | 181 | def __call__(self, doc: Doc) -> Doc: 182 | """Apply the pipe to one document. The document is modified in place, 183 | and returned. This usually happens under the hood when the nlp object 184 | is called on a text and all components are applied to the Doc. 185 | 186 | docs (Doc): The Doc to process. 187 | RETURNS (Doc): The processed Doc. 188 | 189 | DOCS: https://spacy.io/api/transformer#call 190 | """ 191 | install_extensions() 192 | outputs = self.predict([doc]) 193 | self.set_annotations([doc], outputs) 194 | return doc 195 | 196 | def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: 197 | """Apply the pipe to a stream of documents. This usually happens under 198 | the hood when the nlp object is called on a text and all components are 199 | applied to the Doc. 200 | 201 | stream (Iterable[Doc]): A stream of documents. 202 | batch_size (int): The number of documents to buffer. 203 | YIELDS (Doc): Processed documents in order. 
204 | 205 | DOCS: https://spacy.io/api/transformer#pipe 206 | """ 207 | install_extensions() 208 | for outer_batch in minibatch(stream, batch_size): 209 | outer_batch = list(outer_batch) 210 | for indices in batch_by_length(outer_batch, self.cfg["max_batch_items"]): 211 | subbatch = [outer_batch[i] for i in indices] 212 | self.set_annotations(subbatch, self.predict(subbatch)) 213 | yield from outer_batch 214 | 215 | def predict(self, docs: Iterable[Doc]) -> FullTransformerBatch: 216 | """Apply the pipeline's model to a batch of docs, without modifying them. 217 | Returns the extracted features as the FullTransformerBatch dataclass. 218 | 219 | docs (Iterable[Doc]): The documents to predict. 220 | RETURNS (FullTransformerBatch): The extracted features. 221 | 222 | DOCS: https://spacy.io/api/transformer#predict 223 | """ 224 | docs = list(docs) 225 | if not any(len(doc) for doc in docs): 226 | # Handle cases where there are no tokens in any docs. 227 | activations = FullTransformerBatch.empty(len(docs)) 228 | else: 229 | activations = self.model.predict(docs) 230 | return activations 231 | 232 | def set_annotations( 233 | self, docs: Iterable[Doc], predictions: FullTransformerBatch 234 | ) -> None: 235 | """Assign the extracted features to the Doc objects. By default, the 236 | TransformerData object is written to the doc._.trf_data attribute. Your 237 | set_extra_annotations callback is then called, if provided. 238 | 239 | docs (Iterable[Doc]): The documents to modify. 240 | predictions: (FullTransformerBatch): A batch of activations. 241 | 242 | DOCS: https://spacy.io/api/pipe#set_annotations 243 | """ 244 | doc_data = list(predictions.doc_data) 245 | for doc, data in zip(docs, doc_data): 246 | doc._.trf_data = data 247 | self.set_extra_annotations(list(docs), predictions) 248 | 249 | def update( 250 | self, 251 | examples: Iterable[Example], 252 | *, 253 | drop: float = 0.0, 254 | sgd: Optional[Optimizer] = None, 255 | losses: Optional[Dict[str, float]] = None, 256 | ) -> Dict[str, float]: 257 | """Prepare for an update to the transformer. 258 | 259 | Like the `Tok2Vec` component, the `Transformer` component is unusual 260 | in that it does not receive "gold standard" annotations to calculate 261 | a weight update. The optimal output of the transformer data is unknown; 262 | it's a hidden layer inside the network that is updated by backpropagating 263 | from output layers. 264 | 265 | The `Transformer` component therefore does not perform a weight update 266 | during its own `update` method. Instead, it runs its transformer model 267 | and communicates the output and the backpropagation callback to any 268 | downstream components that have been connected to it via the 269 | TransformerListener sublayer. If there are multiple listeners, the last 270 | layer will actually backprop to the transformer and call the optimizer, 271 | while the others simply increment the gradients. 272 | 273 | examples (Iterable[Example]): 274 | A batch of Example objects. Only the `predicted` doc object is used, 275 | the reference doc is ignored. 276 | drop (float): The dropout rate. 277 | sgd (thinc.api.Optimizer): The optimizer. 278 | losses (Dict[str, float]): Optional record of the loss during training. 279 | Updated using the component name as the key. 280 | RETURNS (Dict[str, float]): The updated losses dictionary. 
281 | 282 | DOCS: https://spacy.io/api/transformer#update 283 | """ 284 | validate_examples(examples, "Transformer.update") 285 | if losses is None: 286 | losses = {} 287 | docs = [eg.predicted for eg in examples] 288 | if isinstance(docs, Doc): 289 | docs = [docs] 290 | if not any(len(doc) for doc in docs): 291 | # Handle cases where there are no tokens in any docs. 292 | return losses 293 | set_dropout_rate(self.model, drop) 294 | trf_full, bp_trf_full = self.model.begin_update(docs) 295 | d_tensors: List = [] 296 | losses.setdefault(self.name, 0.0) 297 | 298 | def accumulate_gradient(d_trf_datas: List[TransformerData]): 299 | """Accumulate tok2vec loss and gradient. This is passed as a callback 300 | to all but the last listener. Only the last one does the backprop. 301 | """ 302 | nonlocal d_tensors 303 | for i, d_trf_data in enumerate(d_trf_datas): 304 | for d_tensor in d_trf_data.tensors: 305 | losses[self.name] += float((d_tensor**2).sum()) # type:ignore 306 | if i >= len(d_tensors): 307 | d_tensors.append(list(d_trf_data.tensors)) 308 | else: 309 | for j, d_tensor in enumerate(d_trf_data.tensors): 310 | d_tensors[i][j] += d_tensor 311 | 312 | def backprop(d_trf_datas: List[TransformerData]): 313 | """Callback to actually do the backprop. Passed to last listener.""" 314 | nonlocal d_tensors 315 | accumulate_gradient(d_trf_datas) 316 | d_trf_full = trf_full.unsplit_by_doc(d_tensors) 317 | d_docs = bp_trf_full(d_trf_full) # type: ignore 318 | if sgd is not None: 319 | self.model.finish_update(sgd) 320 | d_tensors = [] 321 | return d_docs 322 | 323 | batch_id = TransformerListener.get_batch_id(docs) 324 | for listener in self.listeners[:-1]: 325 | listener.receive(batch_id, trf_full.doc_data, accumulate_gradient) 326 | if self.listeners: 327 | self.listeners[-1].receive(batch_id, trf_full.doc_data, backprop) 328 | return losses 329 | 330 | def get_loss(self, docs, golds, scores): 331 | """A noop function, for compatibility with the Pipe API. See the `update` 332 | method for an explanation of the loss mechanics of the component. 333 | """ 334 | pass 335 | 336 | def initialize( 337 | self, 338 | get_examples: Callable[[], Iterable[Example]], 339 | *, 340 | nlp: Optional[Language] = None, 341 | ): 342 | """Initialize the pipe for training, using data examples if available. 343 | 344 | get_examples (Callable[[], Iterable[Example]]): Optional function that 345 | returns gold-standard Example objects. 346 | nlp (Language): The current nlp object. 347 | 348 | DOCS: https://spacy.io/api/transformer#initialize 349 | """ 350 | docs = [Doc(Vocab(), words=["hello"])] 351 | self.model.initialize(X=docs) 352 | if nlp is not None: 353 | for i, (name1, proc1) in enumerate(nlp.pipeline): 354 | if proc1 is self: 355 | for name2, proc2 in nlp.pipeline[i:]: 356 | self.find_listeners(proc2) 357 | break 358 | 359 | def to_disk( 360 | self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() 361 | ) -> None: 362 | """Serialize the pipe to disk. 363 | 364 | path (str / Path): Path to a directory. 365 | exclude (Iterable[str]): String names of serialization fields to exclude. 
366 | 367 | DOCS: https://spacy.io/api/transformer#to_disk 368 | """ 369 | serialize = {} 370 | serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) 371 | serialize["vocab"] = lambda p: self.vocab.to_disk(p) 372 | serialize["model"] = lambda p: self.model.to_disk(p) 373 | util.to_disk(path, serialize, exclude) 374 | 375 | def from_disk( 376 | self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() 377 | ) -> "Transformer": 378 | """Load the pipe from disk. 379 | 380 | path (str / Path): Path to a directory. 381 | exclude (Iterable[str]): String names of serialization fields to exclude. 382 | RETURNS (Transformer): The loaded object. 383 | 384 | DOCS: https://spacy.io/api/transformer#from_disk 385 | """ 386 | 387 | def load_model(p): 388 | try: 389 | with open(p, "rb") as mfile: 390 | self.model.from_bytes(mfile.read()) 391 | except AttributeError: 392 | raise ValueError(Errors.E149) from None 393 | except (IsADirectoryError, PermissionError): 394 | warn_msg = ( 395 | "Automatically converting a transformer component " 396 | "from spacy-transformers v1.0 to v1.1+. If you see errors " 397 | "or degraded performance, download a newer compatible " 398 | "model or retrain your custom model with the current " 399 | "spacy-transformers version. For more details and " 400 | "available updates, run: python -m spacy validate" 401 | ) 402 | warnings.warn(warn_msg) 403 | p = Path(p).absolute() 404 | hf_model = huggingface_from_pretrained( 405 | p, 406 | self.model._init_tokenizer_config, 407 | self.model._init_transformer_config, 408 | ) 409 | self.model.attrs["set_transformer"](self.model, hf_model) 410 | 411 | deserialize = { 412 | "vocab": self.vocab.from_disk, 413 | "cfg": lambda p: self.cfg.update(deserialize_config(p)), 414 | "model": load_model, 415 | } 416 | util.from_disk(path, deserialize, exclude) # type: ignore 417 | return self 418 | -------------------------------------------------------------------------------- /spacy_transformers/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/py.typed -------------------------------------------------------------------------------- /spacy_transformers/span_getters.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, List 2 | from functools import partial 3 | from spacy.tokens import Doc, Span 4 | 5 | from .util import registry 6 | 7 | SpannerT = Callable[[List[Doc]], List[List[Span]]] 8 | 9 | 10 | def get_strided_spans( 11 | docs: Iterable[Doc], window: int, stride: int 12 | ) -> List[List[Span]]: 13 | spans: List[List[Span]] = [] 14 | for doc in docs: 15 | start = 0 16 | spans.append([]) 17 | for i in range(len(doc) // stride): 18 | spans[-1].append(doc[start : start + window]) 19 | if (start + window) >= len(doc): 20 | break 21 | start += stride 22 | else: 23 | if start < len(doc): 24 | spans[-1].append(doc[start:]) 25 | return spans 26 | 27 | 28 | @registry.span_getters("spacy-transformers.strided_spans.v1") # type: ignore 29 | def configure_strided_spans(window: int, stride: int) -> SpannerT: 30 | """ 31 | Set the 'window' and 'stride' options for getting strided spans. 32 | 33 | If you set the window and stride to the same value, the spans will cover 34 | each token once. Setting 'stride' lower than 'window' will allow for an 35 | overlap, so that some tokens are counted twice. 
This can be desirable, 36 | because it allows all tokens to have both a left and right context. 37 | """ 38 | return partial(get_strided_spans, window=window, stride=stride) 39 | 40 | 41 | def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]: 42 | return [list(doc.sents) for doc in docs] 43 | 44 | 45 | @registry.span_getters("spacy-transformers.sent_spans.v1") # type: ignore 46 | def configure_get_sent_spans() -> Callable: 47 | """ 48 | Create a `span_getter` that uses sentence boundary markers to extract 49 | the spans. This requires sentence boundaries to be set, and may result 50 | in somewhat uneven batches, depending on the sentence lengths. However, 51 | it does provide the transformer with more meaningful windows to attend over. 52 | """ 53 | return get_sent_spans 54 | 55 | 56 | def get_doc_spans(docs: Iterable[Doc]) -> List[List[Span]]: 57 | return [[doc[:]] for doc in docs] 58 | 59 | 60 | @registry.span_getters("spacy-transformers.doc_spans.v1") # type: ignore 61 | def configure_get_doc_spans() -> Callable: 62 | """ 63 | Create a `span_getter` that uses the whole document as its spans. This is 64 | the best approach if your `Doc` objects already refer to relatively short 65 | texts. 66 | """ 67 | return get_doc_spans 68 | 69 | 70 | __all__ = [ 71 | "get_sent_spans", 72 | "get_doc_spans", 73 | "configure_get_doc_spans", 74 | "configure_get_sent_spans", 75 | "configure_strided_spans", 76 | ] 77 | -------------------------------------------------------------------------------- /spacy_transformers/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/tests/__init__.py -------------------------------------------------------------------------------- /spacy_transformers/tests/enable_gpu.py: -------------------------------------------------------------------------------- 1 | from spacy import require_gpu 2 | 3 | require_gpu() 4 | -------------------------------------------------------------------------------- /spacy_transformers/tests/regression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-transformers/aa1bb58f74570035e8a6dc3623292deaf95e03da/spacy_transformers/tests/regression/__init__.py -------------------------------------------------------------------------------- /spacy_transformers/tests/regression/test_spacy_issue6401.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.training.example import Example 3 | from spacy.util import make_tempdir 4 | from spacy import util 5 | from thinc.api import Config 6 | 7 | 8 | TRAIN_DATA = [ 9 | ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), 10 | ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), 11 | ] 12 | 13 | 14 | cfg_string = """ 15 | [nlp] 16 | lang = "en" 17 | pipeline = ["transformer","textcat"] 18 | 19 | [components] 20 | 21 | [components.textcat] 22 | factory = "textcat" 23 | 24 | [components.textcat.model] 25 | @architectures = "spacy.TextCatEnsemble.v2" 26 | 27 | [components.textcat.model.tok2vec] 28 | @architectures = "spacy-transformers.TransformerListener.v1" 29 | grad_factor = 1.0 30 | 31 | [components.textcat.model.tok2vec.pooling] 32 | @layers = "reduce_mean.v1" 33 | 34 | [components.transformer] 35 | factory = "transformer" 36 | 37 | [components.transformer.model] 
38 | name = "distilbert-base-uncased" 39 | """ 40 | 41 | 42 | def test_transformer_pipeline_textcat(): 43 | """Test that a pipeline with just a transformer+textcat runs and trains properly. 44 | This used to throw an error because of shape inference issues - 45 | cf https://github.com/explosion/spaCy/issues/6401""" 46 | orig_config = Config().from_str(cfg_string) 47 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 48 | assert nlp.pipe_names == ["transformer", "textcat"] 49 | train_examples = [] 50 | 51 | for text, annotations in TRAIN_DATA: 52 | train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) 53 | optimizer = nlp.initialize(get_examples=lambda: train_examples) 54 | 55 | for i in range(2): 56 | losses = {} 57 | nlp.update(train_examples, sgd=optimizer, losses=losses) 58 | 59 | doc = nlp("We're interested at underwater basket weaving.") 60 | cats1 = doc.cats 61 | 62 | # ensure IO goes OK 63 | with make_tempdir() as d: 64 | file_path = d / "trained_nlp" 65 | nlp.to_disk(file_path) 66 | nlp2 = spacy.load(file_path) 67 | doc2 = nlp2("We're interested at underwater basket weaving.") 68 | cats2 = doc2.cats 69 | assert cats1 == cats2 70 | -------------------------------------------------------------------------------- /spacy_transformers/tests/regression/test_spacy_issue7029.py: -------------------------------------------------------------------------------- 1 | from spacy.lang.en import English 2 | from spacy.training import Example 3 | from spacy.util import load_config_from_str 4 | 5 | CONFIG = """ 6 | [nlp] 7 | lang = "en" 8 | pipeline = ["transformer", "tagger"] 9 | 10 | [components] 11 | 12 | [components.transformer] 13 | factory = "transformer" 14 | 15 | [components.transformer.model] 16 | name = "distilbert-base-uncased" 17 | 18 | [components.tagger] 19 | factory = "tagger" 20 | 21 | [components.tagger.model] 22 | @architectures = "spacy.Tagger.v1" 23 | nO = null 24 | 25 | [components.tagger.model.tok2vec] 26 | @architectures = "spacy-transformers.TransformerListener.v1" 27 | grad_factor = 1.0 28 | 29 | [components.tagger.model.tok2vec.pooling] 30 | @layers = "reduce_mean.v1" 31 | """ 32 | 33 | 34 | TRAIN_DATA = [ 35 | ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), 36 | ("", {}), 37 | ("Eat blue ham", {"tags": ["V", "J", "N"]}), 38 | ] 39 | 40 | 41 | def test_empty_doc(): 42 | """Test that an empty document gets processed correctly""" 43 | nlp = English.from_config(load_config_from_str(CONFIG)) 44 | train_examples = [] 45 | for t in TRAIN_DATA: 46 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 47 | optimizer = nlp.initialize(get_examples=lambda: train_examples) 48 | for i in range(2): 49 | losses = {} 50 | nlp.update(train_examples, sgd=optimizer, losses=losses) 51 | texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] 52 | 53 | # run as normal 54 | nlp.select_pipes(enable=["transformer", "tagger"]) 55 | docs1 = list(nlp.pipe(texts, batch_size=1)) 56 | docs2 = list(nlp.pipe(texts, batch_size=4)) 57 | assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] 58 | 59 | # disable the transformer (the listener will produce random output) 60 | nlp.select_pipes(enable=["tagger"]) 61 | docs1 = list(nlp.pipe(texts, batch_size=1)) 62 | docs2 = list(nlp.pipe(texts, batch_size=4)) 63 | assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] 64 | -------------------------------------------------------------------------------- 
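# The empty-string examples in the regression test above exercise the empty-batch
# branches of the Transformer component: when no doc in a batch contains any tokens,
# `Transformer.predict` returns `FullTransformerBatch.empty(len(docs))` instead of
# calling the model, and `Transformer.update` returns its losses dict without
# running the transformer at all.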
/spacy_transformers/tests/test_alignment.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import List 3 | import numpy 4 | from spacy.tokens import Doc 5 | from spacy.vocab import Vocab 6 | from thinc.api import NumpyOps 7 | from thinc.types import Ragged 8 | from ..align import get_alignment, apply_alignment 9 | from ..align import get_span2wp_from_offset_mapping 10 | 11 | 12 | def get_ragged(ops, nested: List[List[int]]): 13 | nested = [ops.asarray(x) for x in nested] 14 | return Ragged(ops.flatten(nested), ops.asarray([len(x) for x in nested])) 15 | 16 | 17 | def get_spans(word_seqs): 18 | vocab = Vocab() 19 | docs = [Doc(vocab, words=words) for words in word_seqs] 20 | return [doc[:] for doc in docs] 21 | 22 | 23 | def flatten_strings(words1, words2): 24 | flat1 = [] 25 | flat2 = [] 26 | for seq in words1: 27 | flat1.extend(seq) 28 | stride = max((len(seq) for seq in words2), default=0) 29 | for seq in words2: 30 | flat2.extend(seq) 31 | flat2.extend([""] * (stride - len(seq))) 32 | return flat1, flat2 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "words1,words2", 37 | [ 38 | ([["a", "b"]], [["a", "b"]]), 39 | ([["ab"]], [["a", "b"]]), 40 | ([["a", "b"]], [["ab"]]), 41 | ([["ab", "c"]], [["a", "bc"]]), 42 | ([["ab", "cd"]], [["a", "bc", "d"]]), 43 | ], 44 | ) 45 | def test_alignments_match(words1, words2): 46 | spans = get_spans(words1) 47 | align = get_alignment(spans, words2) 48 | unique_tokens = set() 49 | for span in spans: 50 | for token in span: 51 | unique_tokens.add((id(token.doc), token.idx)) 52 | assert len(unique_tokens) == align.lengths.shape[0] 53 | flat_words1, flat_words2 = flatten_strings(words1, words2) 54 | for i, word in enumerate(flat_words1): 55 | wp_word = "".join([flat_words2[int(j[0])] for j in align[i].data]) 56 | if len(word) < len(wp_word): 57 | assert word in wp_word 58 | elif len(word) > len(wp_word): 59 | assert wp_word in word 60 | else: 61 | assert word == wp_word 62 | 63 | 64 | @pytest.mark.parametrize( 65 | "nested_align,X_cols", 66 | [ 67 | ([[0, 1, 2], [3], [4]], 4), 68 | ([[], [1], [1], [2]], 2), 69 | ([[0, 1], [1, 2], [], [4]], 2), 70 | ], 71 | ) 72 | def test_apply_alignment(nested_align, X_cols): 73 | ops = NumpyOps() 74 | align = get_ragged(ops, nested_align) 75 | X_shape = (align.data.max() + 1, X_cols) 76 | X = ops.alloc2f(*X_shape) 77 | Y, get_dX = apply_alignment(ops, align, X) 78 | assert isinstance(Y, Ragged) 79 | assert Y.data.shape[0] == align.data.shape[0] 80 | assert Y.lengths.shape[0] == len(nested_align) 81 | dX = get_dX(Y) 82 | assert dX.shape == X.shape 83 | 84 | 85 | @pytest.mark.parametrize( 86 | # fmt: off 87 | # roberta-base offset_mapping and expected alignment 88 | "words,offset_mapping,alignment", 89 | [ 90 | ( 91 | ["Áaaa"], 92 | numpy.asarray([(0, 0), (0, 1), (0, 1), (1, 4), (0, 0)], dtype="i"), 93 | [[1, 2, 3]], 94 | ), 95 | ( 96 | ["INGG", "á", "aäa"], 97 | numpy.asarray([(0, 0), (0, 3), (3, 4), (5, 6), (5, 6), (7, 8), (8, 9), (9, 10), (0, 0)], dtype="i"), 98 | [[1, 2], [3, 4], [5, 6, 7]], 99 | ), 100 | ], 101 | # fmt: on 102 | ) 103 | def test_offset_alignment(words, offset_mapping, alignment): 104 | spans = get_spans([words]) 105 | result = get_span2wp_from_offset_mapping(spans[0], offset_mapping) 106 | assert all(sorted(r) == a for r, a in zip(result, alignment)) 107 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_configs.py: 
-------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import pytest 4 | import spacy 5 | from spacy.training import Example 6 | from spacy.training.initialize import init_nlp 7 | from spacy.util import CONFIG_SECTION_ORDER 8 | from spacy.language import DEFAULT_CONFIG 9 | from thinc.config import Config 10 | 11 | 12 | TRAIN_TAGGER_DATA = [ 13 | ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), 14 | ("Eat blue ham", {"tags": ["V", "J", "N"]}), 15 | ] 16 | 17 | 18 | cfg_string = """ 19 | [nlp] 20 | lang = "en" 21 | pipeline = ["custom_transformer","tagger"] 22 | 23 | [components] 24 | 25 | [components.tagger] 26 | factory = "tagger" 27 | 28 | [components.tagger.model] 29 | @architectures = "spacy.Tagger.v1" 30 | nO = null 31 | 32 | [components.tagger.model.tok2vec] 33 | @architectures = "spacy-transformers.TransformerListener.v1" 34 | grad_factor = 1.0 35 | upstream = "custom_transformer" 36 | 37 | [components.tagger.model.tok2vec.pooling] 38 | @layers = "reduce_mean.v1" 39 | 40 | [components.custom_transformer] 41 | factory = "transformer" 42 | 43 | [corpora] 44 | @readers = toy_tagger_data.v1 45 | 46 | [initialize] 47 | 48 | [initialize.components] 49 | 50 | [initialize.components.tagger] 51 | labels = ["LABEL"] 52 | """ 53 | 54 | 55 | @pytest.mark.parametrize("config_string", [cfg_string]) 56 | def test_init_nlp(config_string): 57 | @spacy.registry.readers.register("toy_tagger_data.v1") 58 | def read_tagger_data(): 59 | def parse_data(nlp, index): 60 | ex = TRAIN_TAGGER_DATA[index] 61 | yield Example.from_dict(nlp.make_doc(ex[0]), ex[1]) 62 | 63 | return { 64 | "train": partial(parse_data, index=0), 65 | "dev": partial(parse_data, index=1), 66 | } 67 | 68 | config = spacy.util.load_config_from_str(config_string, interpolate=False) 69 | config = Config(DEFAULT_CONFIG, section_order=CONFIG_SECTION_ORDER).merge(config) 70 | nlp = init_nlp(config, use_gpu=False) 71 | assert nlp is not None 72 | 73 | tagger = nlp.get_pipe("tagger") 74 | transformer = nlp.get_pipe("custom_transformer") 75 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0] 76 | assert tagger_trf.upstream_name == "custom_transformer" 77 | assert transformer.listeners[0] == tagger_trf 78 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_data_classes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy 3 | from numpy.testing import assert_equal 4 | from spacy_transformers.data_classes import WordpieceBatch 5 | 6 | 7 | @pytest.fixture 8 | def wordpieces(): 9 | strings = [["some", "random", "strings"], ["are"], ["added", "here"]] 10 | shape = (len(strings), max(len(seq) for seq in strings)) 11 | wordpieces = WordpieceBatch( 12 | strings=strings, 13 | input_ids=numpy.zeros(shape, dtype="i"), 14 | token_type_ids=numpy.zeros(shape, dtype="i"), 15 | attention_mask=numpy.zeros((shape[0], shape[1]), dtype="bool"), 16 | lengths=[len(seq) for seq in strings], 17 | ) 18 | return wordpieces 19 | 20 | 21 | def test_wordpieces_IO(wordpieces): 22 | wp_dict = wordpieces.to_dict() 23 | wordpieces_2 = WordpieceBatch.empty().from_dict(wp_dict) 24 | for key, value in wordpieces_2.to_dict().items(): 25 | assert_equal(value, wp_dict[key]) 26 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_deprecations.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy_transformers.util import huggingface_from_pretrained 3 | from spacy_transformers.util import huggingface_tokenize 4 | 5 | 6 | def test_deprecation_warnings(): 7 | with pytest.warns(DeprecationWarning): 8 | tokenizer, transformer = huggingface_from_pretrained( 9 | "distilbert-base-uncased", {} 10 | ) 11 | with pytest.warns(DeprecationWarning): 12 | token_data = huggingface_tokenize(tokenizer, ["a", "b", "c"]) 13 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_model_sequence_classification.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from functools import partial 3 | import copy 4 | 5 | import torch 6 | from transformers import AutoModelForSequenceClassification 7 | from transformers.models.distilbert.modeling_distilbert import ( 8 | DistilBertForSequenceClassification, 9 | ) 10 | from transformers.modeling_outputs import SequenceClassifierOutput 11 | 12 | import spacy 13 | from thinc.api import Model 14 | 15 | from spacy_transformers.data_classes import HFObjects, WordpieceBatch 16 | from spacy_transformers.layers.hf_wrapper import HFWrapper 17 | from spacy_transformers.layers.transformer_model import _convert_transformer_inputs 18 | from spacy_transformers.layers.transformer_model import _convert_transformer_outputs 19 | from spacy_transformers.layers.transformer_model import forward 20 | from spacy_transformers.layers.transformer_model import huggingface_from_pretrained 21 | from spacy_transformers.layers.transformer_model import huggingface_tokenize 22 | from spacy_transformers.layers.transformer_model import set_pytorch_transformer 23 | from spacy_transformers.span_getters import get_strided_spans 24 | 25 | 26 | def test_model_for_sequence_classification(): 27 | # adapted from https://github.com/KennethEnevoldsen/spacy-wrap/ 28 | class ClassificationTransformerModel(Model): 29 | def __init__( 30 | self, 31 | name: str, 32 | get_spans: Callable, 33 | tokenizer_config: dict = {}, 34 | transformer_config: dict = {}, 35 | mixed_precision: bool = False, 36 | grad_scaler_config: dict = {}, 37 | ): 38 | hf_model = HFObjects(None, None, None, tokenizer_config, transformer_config) 39 | wrapper = HFWrapper( 40 | hf_model, 41 | convert_inputs=_convert_transformer_inputs, 42 | convert_outputs=_convert_transformer_outputs, 43 | mixed_precision=mixed_precision, 44 | grad_scaler_config=grad_scaler_config, 45 | model_cls=AutoModelForSequenceClassification, 46 | ) 47 | super().__init__( 48 | "clf_transformer", 49 | forward, 50 | init=init, 51 | layers=[wrapper], 52 | dims={"nO": None}, 53 | attrs={ 54 | "get_spans": get_spans, 55 | "name": name, 56 | "set_transformer": set_pytorch_transformer, 57 | "has_transformer": False, 58 | "flush_cache_chance": 0.0, 59 | }, 60 | ) 61 | 62 | @property 63 | def tokenizer(self): 64 | return self.layers[0].shims[0]._hfmodel.tokenizer 65 | 66 | @property 67 | def transformer(self): 68 | return self.layers[0].shims[0]._hfmodel.transformer 69 | 70 | @property 71 | def _init_tokenizer_config(self): 72 | return self.layers[0].shims[0]._hfmodel._init_tokenizer_config 73 | 74 | @property 75 | def _init_transformer_config(self): 76 | return self.layers[0].shims[0]._hfmodel._init_transformer_config 77 | 78 | def copy(self): 79 | """ 80 | Create a copy of the model, its attributes, and its parameters. 
Any child 81 | layers will also be deep-copied. The copy will receive a distinct `model.id` 82 | value. 83 | """ 84 | copied = ClassificationTransformerModel(self.name, self.attrs["get_spans"]) 85 | params = {} 86 | for name in self.param_names: 87 | params[name] = self.get_param(name) if self.has_param(name) else None 88 | copied.params = copy.deepcopy(params) 89 | copied.dims = copy.deepcopy(self._dims) 90 | copied.layers[0] = copy.deepcopy(self.layers[0]) 91 | for name in self.grad_names: 92 | copied.set_grad(name, self.get_grad(name).copy()) 93 | return copied 94 | 95 | def init(model: ClassificationTransformerModel, X=None, Y=None): 96 | if model.attrs["has_transformer"]: 97 | return 98 | name = model.attrs["name"] 99 | tok_cfg = model._init_tokenizer_config 100 | trf_cfg = model._init_transformer_config 101 | hf_model = huggingface_from_pretrained( 102 | name, tok_cfg, trf_cfg, model_cls=AutoModelForSequenceClassification 103 | ) 104 | model.attrs["set_transformer"](model, hf_model) 105 | tokenizer = model.tokenizer 106 | texts = ["hello world", "foo bar"] 107 | token_data = huggingface_tokenize(tokenizer, texts) 108 | wordpieces = WordpieceBatch.from_batch_encoding(token_data) 109 | model.layers[0].initialize(X=wordpieces) 110 | 111 | model = ClassificationTransformerModel( 112 | "sgugger/tiny-distilbert-classification", 113 | get_spans=partial(get_strided_spans, window=128, stride=96), 114 | ) 115 | model.initialize() 116 | 117 | assert isinstance(model.transformer, DistilBertForSequenceClassification) 118 | nlp = spacy.blank("en") 119 | doc = nlp.make_doc("some text") 120 | assert isinstance(model.predict([doc]).model_output, SequenceClassifierOutput) 121 | 122 | b = model.to_bytes() 123 | model_re = ClassificationTransformerModel( 124 | "sgugger/tiny-distilbert-classification", 125 | get_spans=partial(get_strided_spans, window=128, stride=96), 126 | ).from_bytes(b) 127 | assert isinstance(model_re.transformer, DistilBertForSequenceClassification) 128 | assert isinstance(model_re.predict([doc]).model_output, SequenceClassifierOutput) 129 | assert torch.equal( 130 | model.predict([doc]).model_output.logits, 131 | model_re.predict([doc]).model_output.logits, 132 | ) 133 | # Note that model.to_bytes() != model_re.to_bytes(), but this is also not 134 | # true for the default models. 
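# In the test above, the only substantial change relative to the stock
# TransformerModel is that `model_cls=AutoModelForSequenceClassification` is passed
# to both HFWrapper and huggingface_from_pretrained, so the wrapper loads a
# sequence-classification head; tokenization, prediction and the to_bytes/from_bytes
# round-trip are then exercised exactly as in the assertions above.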
135 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_model_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import spacy 3 | from thinc.api import Model 4 | from ..layers import TransformerModel 5 | from ..data_classes import FullTransformerBatch 6 | from ..span_getters import get_doc_spans 7 | 8 | 9 | MODEL_NAMES = [ 10 | "distilbert-base-uncased", 11 | "hf-internal-testing/tiny-random-gpt2", 12 | "hf-internal-testing/tiny-random-xlnet", 13 | ] 14 | 15 | 16 | @pytest.fixture 17 | def nlp(): 18 | return spacy.blank("en") 19 | 20 | 21 | @pytest.fixture 22 | def docs(nlp): 23 | texts = ["the cat sat on the mat.", "hello world."] 24 | return [nlp(text) for text in texts] 25 | 26 | 27 | @pytest.fixture(scope="module", params=MODEL_NAMES) 28 | def name(request): 29 | return request.param 30 | 31 | 32 | @pytest.fixture(scope="module", params=[True, False]) 33 | def output_attentions(request): 34 | return request.param 35 | 36 | 37 | @pytest.fixture(scope="module", params=[True, False]) 38 | def output_hidden_states(request): 39 | return request.param 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def trf_model(name, output_attentions, output_hidden_states): 44 | if "gpt2" in name: 45 | model = TransformerModel( 46 | name, 47 | get_doc_spans, 48 | {"use_fast": True, "pad_token": "<|endoftext|>"}, 49 | { 50 | "output_attentions": output_attentions, 51 | "output_hidden_states": output_hidden_states, 52 | }, 53 | ) 54 | 55 | else: 56 | # test slow tokenizers with distilbert-base-uncased (parameterizing 57 | # for all models blows up the memory usage during the test suite) 58 | if name == "distilbert-base-uncased": 59 | use_fast = False 60 | else: 61 | use_fast = True 62 | model = TransformerModel( 63 | name, 64 | get_doc_spans, 65 | {"use_fast": use_fast}, 66 | { 67 | "output_attentions": output_attentions, 68 | "output_hidden_states": output_hidden_states, 69 | }, 70 | ) 71 | model.initialize() 72 | return model 73 | 74 | 75 | def test_model_init(name, trf_model): 76 | assert isinstance(trf_model, Model) 77 | if name == "distilbert-base-uncased": 78 | assert not trf_model.tokenizer.is_fast 79 | else: 80 | assert trf_model.tokenizer.is_fast 81 | 82 | 83 | def test_model_predict(nlp, docs, trf_model): 84 | outputs = trf_model.predict(docs) 85 | shape = outputs.model_output.last_hidden_state.shape 86 | if trf_model.transformer.config.output_attentions is True: 87 | assert outputs.model_output.attentions is not None 88 | assert all([t.shape[0] == shape[0] for t in outputs.model_output.attentions]) 89 | else: 90 | assert outputs.model_output.attentions is None 91 | if trf_model.transformer.config.output_hidden_states is True: 92 | assert outputs.model_output.hidden_states is not None 93 | assert all([t.shape[0] == shape[0] for t in outputs.model_output.hidden_states]) 94 | else: 95 | assert outputs.model_output.hidden_states is None 96 | assert isinstance(outputs, FullTransformerBatch) 97 | 98 | # for a fast tokenizer check that all non-special wordpieces are aligned 99 | # (which is not necessarily true for the slow tokenizers) 100 | if trf_model.tokenizer.is_fast: 101 | outputs = trf_model.predict([nlp.make_doc("\tÁaaa \n\n")]) 102 | aligned_wps = outputs.align.data.flatten() 103 | for i in range(len(outputs.wordpieces.strings[0])): 104 | if ( 105 | outputs.wordpieces.strings[0][i] 106 | not in trf_model.tokenizer.all_special_tokens 107 | ): 108 | assert i in 
aligned_wps 109 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_pipeline_component.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from packaging.version import Version 3 | import torch 4 | import spacy 5 | from spacy.language import Language 6 | from spacy.training.example import Example 7 | from spacy.util import make_tempdir 8 | from spacy.vocab import Vocab 9 | from spacy.tokens import Doc 10 | from spacy import util 11 | from thinc.api import Model, Config, get_current_ops, NumpyOps 12 | from spacy.tests.util import assert_docs_equal 13 | 14 | from .util import DummyTransformer, _assert_equal_tensors 15 | from .. import TransformerModel 16 | from ..pipeline_component import Transformer 17 | from ..layers import TransformerListener 18 | from ..data_classes import TransformerData, FullTransformerBatch 19 | 20 | 21 | torch.set_num_threads(1) 22 | 23 | 24 | @pytest.fixture 25 | def vocab(): 26 | return Vocab() 27 | 28 | 29 | @pytest.fixture 30 | def docs(vocab): 31 | return [ 32 | Doc(vocab, words=["hello", "world"]), 33 | Doc(vocab, words=["this", "is", "another"]), 34 | ] 35 | 36 | 37 | @pytest.fixture 38 | def component(vocab): 39 | return Transformer(Vocab(), DummyTransformer()) 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def simple_nlp(): 44 | nlp = Language() 45 | nlp.add_pipe("transformer") 46 | train_examples = [] 47 | for t in TRAIN_DATA: 48 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 49 | 50 | optimizer = nlp.initialize() 51 | for i in range(2): 52 | losses = {} 53 | nlp.update(train_examples, sgd=optimizer, losses=losses) 54 | 55 | return nlp 56 | 57 | 58 | def test_init(component): 59 | assert isinstance(component.vocab, Vocab) 60 | assert isinstance(component.model, Model) 61 | assert hasattr(component.set_extra_annotations, "__call__") 62 | assert component.listeners == [] 63 | assert component.cfg == {"max_batch_items": 4096} 64 | 65 | 66 | def test_predict(component, docs): 67 | trf_data = component.predict(docs) 68 | n_tokens = trf_data.wordpieces.input_ids.shape[1] 69 | width = component.model.layers[0].attrs["width"] 70 | assert isinstance(trf_data, FullTransformerBatch) 71 | assert ( 72 | len(trf_data.model_output.last_hidden_state) 73 | == component.model.layers[0].attrs["depth"] 74 | ) 75 | assert trf_data.model_output.last_hidden_state[0].shape == ( 76 | len(docs), 77 | n_tokens, 78 | width, 79 | ) 80 | 81 | 82 | def test_set_annotations(component, docs): 83 | trf_data = component.predict(docs) 84 | component.set_annotations(docs, trf_data) 85 | for doc in docs: 86 | assert isinstance(doc._.trf_data, TransformerData) 87 | 88 | 89 | def test_set_extra_annotations(component, docs): 90 | Doc.set_extension("custom_attr", default="") 91 | 92 | def custom_annotation_setter(docs, trf_data): 93 | doc_data = list(trf_data.doc_data) 94 | for doc, data in zip(docs, doc_data): 95 | doc._.custom_attr = data 96 | 97 | component.set_extra_annotations = custom_annotation_setter 98 | trf_data = component.predict(docs) 99 | component.set_annotations(docs, trf_data) 100 | for doc in docs: 101 | assert isinstance(doc._.custom_attr, TransformerData) 102 | 103 | 104 | def test_listeners(component, docs): 105 | docs = list(component.pipe(docs)) 106 | for listener in component.listeners: 107 | assert listener.verify_inputs(docs) 108 | 109 | 110 | TRAIN_DATA = [ 111 | ( 112 | "I like green eggs", 113 | {"tags": ["N", "V", "J", "N"], 
"sent_starts": [True, False, True, False]}, 114 | ), 115 | ("Eat blue ham", {"tags": ["V", "J", "N"], "sent_starts": [True, False, False]}), 116 | ] 117 | 118 | 119 | def test_transformer_pipeline_simple(simple_nlp): 120 | """Test that a simple pipeline with just a transformer at least runs""" 121 | doc = simple_nlp("We're interested at underwater basket weaving.") 122 | assert doc 123 | 124 | 125 | def test_transformer_pipeline_long_token(simple_nlp): 126 | """Test that a simple pipeline does not raise an error on texts that exceeds 127 | the model max length. We should truncate instead. 128 | """ 129 | doc = simple_nlp("https://example.com/" + "a/" * 1000) 130 | assert len(doc) == 1 131 | 132 | 133 | cfg_string = """ 134 | [nlp] 135 | lang = "en" 136 | pipeline = ["transformer","tagger","senter"] 137 | 138 | [components] 139 | 140 | [components.senter] 141 | factory = "senter" 142 | 143 | [components.senter.model] 144 | @architectures = "spacy.Tagger.v1" 145 | nO = null 146 | 147 | [components.senter.model.tok2vec] 148 | @architectures = "spacy-transformers.TransformerListener.v1" 149 | grad_factor = 1.0 150 | upstream = "transformer" 151 | 152 | [components.senter.model.tok2vec.pooling] 153 | @layers = "reduce_mean.v1" 154 | 155 | [components.tagger] 156 | factory = "tagger" 157 | 158 | [components.tagger.model] 159 | @architectures = "spacy.Tagger.v1" 160 | nO = null 161 | 162 | [components.tagger.model.tok2vec] 163 | @architectures = "spacy-transformers.TransformerListener.v1" 164 | grad_factor = 1.0 165 | upstream = "transformer" 166 | 167 | [components.tagger.model.tok2vec.pooling] 168 | @layers = "reduce_mean.v1" 169 | 170 | [components.transformer] 171 | factory = "transformer" 172 | 173 | [components.transformer.model] 174 | @architectures = "spacy-transformers.TransformerModel.v3" 175 | name = "albert-base-v2" 176 | 177 | [components.transformer.model.transformer_config] 178 | output_attentions = true 179 | """ 180 | 181 | 182 | def test_transformer_pipeline_tagger_senter_listener(): 183 | """Test that a pipeline with just a transformer+tagger+senter runs and 184 | trains properly""" 185 | orig_config = Config().from_str(cfg_string) 186 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 187 | assert nlp.pipe_names == ["transformer", "tagger", "senter"] 188 | tagger = nlp.get_pipe("tagger") 189 | transformer = nlp.get_pipe("transformer") 190 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0] 191 | assert isinstance(transformer, Transformer) 192 | assert isinstance(tagger_trf, TransformerListener) 193 | train_examples = [] 194 | for t in TRAIN_DATA: 195 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 196 | for tag in t[1]["tags"]: 197 | tagger.add_label(tag) 198 | 199 | # Check that the Transformer component finds it listeners 200 | optimizer = nlp.initialize(lambda: train_examples) 201 | assert tagger_trf in transformer.listeners 202 | 203 | for i in range(2): 204 | losses = {} 205 | nlp.update(train_examples, sgd=optimizer, losses=losses) 206 | 207 | text = "We're interested at underwater basket weaving." 
208 | doc = nlp(text) 209 | doc_tensor = tagger_trf.predict([doc]) 210 | _assert_equal_tensors(doc._.trf_data.tensors, doc_tensor[0].tensors) 211 | 212 | # ensure IO goes OK 213 | with make_tempdir() as d: 214 | file_path = d / "trained_nlp" 215 | nlp.to_disk(file_path) 216 | nlp2 = util.load_model_from_path(file_path) 217 | doc2 = nlp2(text) 218 | tagger2 = nlp2.get_pipe("tagger") 219 | tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0] 220 | doc_tensor2 = tagger_trf2.predict([doc2]) 221 | _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors) 222 | 223 | # make sure that this can be saved to directory once more 224 | file_path_2 = d / "trained_nlp_2" 225 | nlp2.to_disk(file_path_2) 226 | 227 | # ensure to_bytes / from_bytes works 228 | nlp_bytes = nlp.to_bytes() 229 | nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 230 | nlp3.from_bytes(nlp_bytes) 231 | doc3 = nlp3(text) 232 | tagger3 = nlp3.get_pipe("tagger") 233 | tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0] 234 | doc_tensor3 = tagger_trf3.predict([doc3]) 235 | _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors) 236 | 237 | 238 | def test_transformer_sentencepiece_IO(): 239 | """Test that a transformer using sentencepiece trains + IO goes OK""" 240 | orig_config = Config().from_str(cfg_string) 241 | orig_config["components"]["transformer"]["model"]["name"] = "hf-internal-testing/tiny-xlm-roberta" 242 | orig_config["components"]["transformer"]["model"]["tokenizer_config"] = {"use_fast": False} 243 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 244 | tagger = nlp.get_pipe("tagger") 245 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0] 246 | train_examples = [] 247 | for t in TRAIN_DATA: 248 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 249 | for tag in t[1]["tags"]: 250 | tagger.add_label(tag) 251 | 252 | optimizer = nlp.initialize(lambda: train_examples) 253 | for i in range(2): 254 | losses = {} 255 | nlp.update(train_examples, sgd=optimizer, losses=losses) 256 | 257 | text = "We're interested at underwater basket weaving." 
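# With the slow sentencepiece tokenizer configured above, run the trained pipeline
# once and keep the listener's tensors, so that the to_disk/from_disk and
# to_bytes/from_bytes round-trips below can be compared against them.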
258 | doc = nlp(text) 259 | doc_tensor = tagger_trf.predict([doc]) 260 | 261 | # ensure IO goes OK 262 | with make_tempdir() as d: 263 | file_path = d / "trained_nlp" 264 | nlp.to_disk(file_path) 265 | nlp2 = util.load_model_from_path(file_path) 266 | doc2 = nlp2(text) 267 | tagger2 = nlp2.get_pipe("tagger") 268 | tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0] 269 | doc_tensor2 = tagger_trf2.predict([doc2]) 270 | _assert_equal_tensors(doc_tensor2[0].tensors, doc_tensor[0].tensors) 271 | 272 | # make sure that this can be saved to directory once more 273 | file_path_2 = d / "trained_nlp_2" 274 | nlp2.to_disk(file_path_2) 275 | 276 | # ensure to_bytes / from_bytes works 277 | nlp_bytes = nlp.to_bytes() 278 | nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 279 | nlp3.from_bytes(nlp_bytes) 280 | doc3 = nlp3(text) 281 | tagger3 = nlp3.get_pipe("tagger") 282 | tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0] 283 | doc_tensor3 = tagger_trf3.predict([doc3]) 284 | _assert_equal_tensors(doc_tensor3[0].tensors, doc_tensor[0].tensors) 285 | 286 | 287 | def test_transformer_pipeline_empty(): 288 | """Test that the pipeline doesn't fail with empty input""" 289 | orig_config = Config().from_str(cfg_string) 290 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 291 | tagger = nlp.get_pipe("tagger") 292 | train_examples = [] 293 | for t in TRAIN_DATA: 294 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 295 | for tag in t[1]["tags"]: 296 | tagger.add_label(tag) 297 | 298 | # train on empty doc 299 | optimizer = nlp.initialize() 300 | losses = {} 301 | empty_train_example = Example.from_dict(nlp.make_doc(""), {}) 302 | nlp.update(train_examples, sgd=optimizer, losses=losses) 303 | nlp.update([empty_train_example], sgd=optimizer, losses=losses) 304 | train_examples.append(empty_train_example) 305 | nlp.update(train_examples, sgd=optimizer, losses=losses) 306 | # Interleave an empty doc between non-empty ones 307 | train_examples.insert(1, Example.from_dict(nlp.make_doc(""), {})) 308 | nlp.update(train_examples, sgd=optimizer, losses=losses) 309 | 310 | # predict empty doc 311 | doc = nlp("") 312 | _assert_empty(doc._.trf_data) 313 | docs = nlp.pipe(["", ""]) 314 | for doc in docs: 315 | _assert_empty(doc._.trf_data) 316 | nlp.pipe([]) 317 | 318 | # predict combination of empty and non-empty 319 | doc = nlp("This is a sentence") 320 | normal_tags = [t.tag_ for t in doc] 321 | 322 | docs = list(nlp.pipe(["", "This is a sentence", "", ""])) 323 | _assert_empty(docs[0]._.trf_data) 324 | assert [t.tag_ for t in docs[0]] == [] 325 | assert [t.tag_ for t in docs[1]] == normal_tags 326 | _assert_empty(docs[2]._.trf_data) 327 | _assert_empty(docs[3]._.trf_data) 328 | 329 | 330 | def _assert_empty(trf_data): 331 | assert trf_data.wordpieces.strings == [] 332 | assert trf_data.wordpieces.input_ids.size == 0 333 | assert trf_data.wordpieces.attention_mask.size == 0 334 | assert trf_data.tensors == () 335 | assert len(trf_data.align.data) == 0 336 | 337 | 338 | def test_replace_listeners(): 339 | orig_config = Config().from_str(cfg_string) 340 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 341 | text = "This is awesome" 342 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})] 343 | optimizer = nlp.initialize(lambda: examples) 344 | # verify correct configuration with transformer listener 345 | transformer = nlp.get_pipe("transformer") 346 | tagger = 
nlp.get_pipe("tagger") 347 | tagger_tok2vec = tagger.model.get_ref("tok2vec") 348 | tagger_listener = tagger_tok2vec.get_ref("listener") 349 | assert isinstance(tagger_listener, TransformerListener) 350 | assert transformer.listener_map["tagger"][0] == tagger_listener 351 | assert isinstance(transformer.model, TransformerModel) 352 | assert ( 353 | nlp.config["components"]["transformer"]["model"]["@architectures"] 354 | == "spacy-transformers.TransformerModel.v3" 355 | ) 356 | assert ( 357 | nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"] 358 | == "spacy-transformers.TransformerListener.v1" 359 | ) 360 | # train pipe before replacing listeners 361 | for i in range(2): 362 | losses = {} 363 | nlp.update(examples, sgd=optimizer, losses=losses) 364 | doc = nlp(text) 365 | 366 | preds = [t.tag_ for t in doc] 367 | doc_tensor = tagger_tok2vec.predict([doc]) 368 | 369 | # replace listener and verify predictions are still the same 370 | nlp.replace_listeners("transformer", "tagger", ["model.tok2vec"]) 371 | tagger = nlp.get_pipe("tagger") 372 | tagger_tok2vec = tagger.model.get_ref("tok2vec") 373 | assert isinstance(tagger_tok2vec, Model) 374 | assert tagger_tok2vec.layers[0].layers[0].name == "transformer" 375 | assert ( 376 | nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"] 377 | == "spacy-transformers.Tok2VecTransformer.v3" 378 | ) 379 | doc2 = nlp(text) 380 | assert preds == [t.tag_ for t in doc2] 381 | pred_tensor = tagger_tok2vec.predict([doc2]) 382 | _assert_equal_tensors(doc_tensor, pred_tensor) 383 | 384 | # attempt training with the new pipeline 385 | optimizer = nlp.resume_training() 386 | for i in range(2): 387 | losses = {} 388 | nlp.update(examples, sgd=optimizer, losses=losses) 389 | assert losses["tagger"] > 0.0 390 | 391 | # check for presence of additional fields in model_output 392 | assert doc2._.trf_data.model_output.pooler_output is not None 393 | assert doc2._.trf_data.model_output.attentions is not None 394 | 395 | # ensure IO goes OK 396 | doc_tensor_trained = tagger_tok2vec.predict([doc]) 397 | with make_tempdir() as d: 398 | file_path = d / "trained_nlp" 399 | nlp.to_disk(file_path) 400 | nlp2 = util.load_model_from_path(file_path) 401 | doc3 = nlp2(text) 402 | tagger2 = nlp2.get_pipe("tagger") 403 | tagger_tok2vec2 = tagger2.model.get_ref("tok2vec") 404 | pred_tensor = tagger_tok2vec2.predict([doc3]) 405 | _assert_equal_tensors(doc_tensor_trained, pred_tensor) 406 | 407 | 408 | def test_replace_listeners_invalid(): 409 | orig_config = Config().from_str(cfg_string) 410 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 411 | text = "This is awesome" 412 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})] 413 | optimizer = nlp.initialize(lambda: examples) 414 | for i in range(2): 415 | losses = {} 416 | nlp.update(examples, sgd=optimizer, losses=losses) 417 | with pytest.raises(ValueError): 418 | nlp.replace_listeners("invalid", "tagger", ["model.tok2vec"]) 419 | with pytest.raises(ValueError): 420 | nlp.replace_listeners("transformer", "parser", ["model.tok2vec"]) 421 | with pytest.raises(ValueError): 422 | nlp.replace_listeners("transformer", "tagger", ["model.yolo"]) 423 | with pytest.raises(ValueError): 424 | nlp.replace_listeners("transformer", "tagger", ["model.tok2vec", "model.yolo"]) 425 | 426 | 427 | @pytest.fixture 428 | def texts(): 429 | data = [ 430 | "Hello world.", 431 | "This is spacy.", 432 | "You can use multiprocessing with pipe method.", 433 | 
"Please try!", 434 | ] 435 | return data 436 | 437 | 438 | def test_multiprocessing(simple_nlp, texts): 439 | ops = get_current_ops() 440 | if isinstance(ops, NumpyOps): 441 | texts = texts * 3 442 | expecteds = [simple_nlp(text) for text in texts] 443 | docs = simple_nlp.pipe(texts, n_process=2, batch_size=2) 444 | 445 | for doc, expected_doc in zip(docs, expecteds): 446 | assert_docs_equal(doc, expected_doc) 447 | 448 | 449 | def test_frozen_listener(): 450 | orig_config = Config().from_str(cfg_string) 451 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 452 | text = "This is awesome" 453 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})] 454 | optimizer = nlp.initialize(lambda: examples) 455 | # train pipe before freezing listener 456 | for i in range(2): 457 | losses = {} 458 | nlp.update(examples, sgd=optimizer, losses=losses) 459 | doc = nlp(text) 460 | 461 | transformer_bytes = nlp.get_pipe("transformer").to_bytes() 462 | tagger_bytes = nlp.get_pipe("tagger").to_bytes() 463 | 464 | # train further with frozen listener 465 | for i in range(2): 466 | losses = {} 467 | nlp.update( 468 | examples, 469 | sgd=optimizer, 470 | losses=losses, 471 | exclude=["transformer"], 472 | annotates=["transformer"], 473 | ) 474 | doc = nlp(text) 475 | 476 | # only tagger was updated 477 | assert nlp.get_pipe("transformer").to_bytes() == transformer_bytes 478 | assert nlp.get_pipe("tagger").to_bytes() != tagger_bytes 479 | 480 | 481 | def test_no_update_listener_in_predict(): 482 | orig_config = Config().from_str(cfg_string) 483 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 484 | listener = nlp.get_pipe("tagger").model.get_ref("tok2vec").get_ref("listener") 485 | transformer = nlp.get_pipe("transformer") 486 | 487 | text = "This is awesome" 488 | examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})] 489 | docs = [eg.predicted for eg in examples] 490 | nlp.initialize(lambda: examples) 491 | 492 | transformer.update(examples) 493 | assert listener._backprop is not None 494 | 495 | transformer.predict(docs) 496 | assert listener._backprop is not None 497 | 498 | 499 | @pytest.mark.skipif( 500 | Version(spacy.__version__) < Version("3.5.4"), reason="Bug fixed in spaCy v3.5.4" 501 | ) 502 | def test_source_replace_listeners(): 503 | """Test that a pipeline with a transformer+tagger+senter and some replaced 504 | listeners runs and trains properly""" 505 | orig_config = """ 506 | [nlp] 507 | lang = "en" 508 | pipeline = ["transformer","tagger","senter"] 509 | 510 | [components] 511 | 512 | [components.senter] 513 | factory = "senter" 514 | 515 | [components.senter.model] 516 | @architectures = "spacy.Tagger.v1" 517 | nO = null 518 | 519 | [components.senter.model.tok2vec] 520 | @architectures = "spacy-transformers.TransformerListener.v1" 521 | grad_factor = 1.0 522 | upstream = "transformer" 523 | 524 | [components.senter.model.tok2vec.pooling] 525 | @layers = "reduce_mean.v1" 526 | 527 | [components.tagger] 528 | factory = "tagger" 529 | 530 | [components.tagger.model] 531 | @architectures = "spacy.Tagger.v1" 532 | nO = null 533 | 534 | [components.tagger.model.tok2vec] 535 | @architectures = "spacy-transformers.TransformerListener.v1" 536 | grad_factor = 1.0 537 | upstream = "transformer" 538 | 539 | [components.tagger.model.tok2vec.pooling] 540 | @layers = "reduce_mean.v1" 541 | 542 | [components.transformer] 543 | factory = "transformer" 544 | 545 | [components.transformer.model] 546 | 
@architectures = "spacy-transformers.TransformerModel.v3" 547 | name = "distilbert-base-uncased" 548 | """ 549 | orig_config = Config().from_str(cfg_string) 550 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 551 | assert nlp.pipe_names == ["transformer", "tagger", "senter"] 552 | tagger = nlp.get_pipe("tagger") 553 | train_examples = [] 554 | for t in TRAIN_DATA: 555 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 556 | for tag in t[1]["tags"]: 557 | tagger.add_label(tag) 558 | optimizer = nlp.initialize(lambda: train_examples) 559 | assert nlp.get_pipe("transformer").listening_components == ["tagger", "senter"] 560 | for i in range(2): 561 | losses = {} 562 | nlp.update(train_examples, sgd=optimizer, losses=losses) 563 | 564 | with make_tempdir() as dir_path: 565 | nlp.to_disk(dir_path) 566 | base_model = str(dir_path) 567 | new_config = { 568 | "nlp": { 569 | "lang": "en", 570 | "pipeline": ["transformer", "tagger", "senter", "ner"], 571 | }, 572 | "components": { 573 | "transformer": {"source": base_model}, 574 | "tagger": { 575 | "source": base_model, 576 | "replace_listeners": ["model.tok2vec"], 577 | }, 578 | "senter": { 579 | "source": base_model, 580 | "replace_listeners": ["model.tok2vec"], 581 | }, 582 | "ner": { 583 | "factory": "ner", 584 | "model": { 585 | "@architectures": "spacy.TransitionBasedParser.v2", 586 | "state_type": "ner", 587 | "tok2vec": { 588 | "@architectures": "spacy-transformers.TransformerListener.v1", 589 | "grad_factor": 1.0, 590 | "upstream": "transformer", 591 | "pooling": {"@layers": "reduce_mean.v1"}, 592 | }, 593 | }, 594 | }, 595 | }, 596 | } 597 | new_nlp = util.load_model_from_config(new_config, auto_fill=True) 598 | for component in ("tagger", "senter"): 599 | assert ( 600 | new_nlp.config["components"][component]["model"]["tok2vec"][ 601 | "@architectures" 602 | ] 603 | == "spacy-transformers.Tok2VecTransformer.v3" 604 | ) 605 | assert new_nlp.get_pipe("transformer").listening_components == ["ner"] 606 | 607 | with make_tempdir() as new_dir_path: 608 | new_nlp.to_disk(new_dir_path) 609 | new_nlp_re = spacy.load(new_dir_path) 610 | for component in ("tagger", "senter"): 611 | assert ( 612 | new_nlp.config["components"][component]["model"]["tok2vec"][ 613 | "@architectures" 614 | ] 615 | == "spacy-transformers.Tok2VecTransformer.v3" 616 | ) 617 | assert new_nlp_re.get_pipe("transformer").listening_components == ["ner"] 618 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_serialize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import copy 3 | import spacy 4 | from spacy import Language 5 | from spacy.lang.en import English 6 | from spacy.tests.util import assert_docs_equal 7 | from spacy.tokens import Doc 8 | from spacy.util import make_tempdir 9 | from spacy import util 10 | import srsly 11 | from thinc.api import Config, get_current_ops 12 | from numpy.testing import assert_array_equal 13 | 14 | from .. 
import TransformerData 15 | 16 | 17 | DEFAULT_CONFIG = { 18 | "model": { 19 | "@architectures": "spacy-transformers.TransformerModel.v3", 20 | "name": "hf-internal-testing/tiny-random-DistilBertModel", 21 | "tokenizer_config": {"use_fast": False}, 22 | } 23 | } 24 | 25 | 26 | def test_serialize_transformer_data(): 27 | data = {"x": TransformerData.empty()} 28 | bytes_data = srsly.msgpack_dumps(data) 29 | new_data = srsly.msgpack_loads(bytes_data) 30 | assert isinstance(new_data["x"], TransformerData) 31 | 32 | nlp = Language() 33 | nlp.add_pipe( 34 | "transformer", 35 | config={ 36 | "model": { 37 | "name": "hf-internal-testing/tiny-random-DistilBertModel", 38 | "transformer_config": {"output_attentions": True}, 39 | } 40 | }, 41 | ) 42 | nlp.initialize() 43 | doc = nlp("This is a test.") 44 | b = doc.to_bytes() 45 | reloaded_doc = Doc(nlp.vocab) 46 | reloaded_doc.from_bytes(b) 47 | assert_docs_equal(doc, reloaded_doc) 48 | ops = get_current_ops() 49 | for key in doc._.trf_data.model_output: 50 | assert_array_equal( 51 | ops.to_numpy(ops.asarray(doc._.trf_data.model_output[key])), 52 | ops.to_numpy(ops.asarray(reloaded_doc._.trf_data.model_output[key])), 53 | ) 54 | 55 | 56 | def test_transformer_tobytes(): 57 | nlp = Language() 58 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 59 | trf_bytes = trf.to_bytes() 60 | 61 | nlp2 = Language() 62 | trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG) 63 | trf2.from_bytes(trf_bytes) 64 | 65 | 66 | def test_initialized_transformer_tobytes(): 67 | nlp = Language() 68 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 69 | nlp.initialize() 70 | trf_bytes = trf.to_bytes() 71 | 72 | nlp2 = Language() 73 | trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG) 74 | trf2.from_bytes(trf_bytes) 75 | 76 | assert trf2.model.tokenizer.is_fast is False 77 | 78 | 79 | def test_initialized_transformer_todisk(): 80 | nlp = Language() 81 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 82 | nlp.initialize() 83 | with make_tempdir() as d: 84 | trf.to_disk(d) 85 | nlp2 = Language() 86 | trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG) 87 | trf2.from_disk(d) 88 | 89 | assert trf2.model.tokenizer.is_fast is False 90 | 91 | fast_config = copy.deepcopy(DEFAULT_CONFIG) 92 | fast_config["model"]["tokenizer_config"]["use_fast"] = True 93 | nlp = Language() 94 | trf = nlp.add_pipe("transformer", config=fast_config) 95 | nlp.initialize() 96 | with make_tempdir() as d: 97 | trf.to_disk(d) 98 | nlp2 = Language() 99 | trf2 = nlp2.add_pipe("transformer", config=fast_config) 100 | trf2.from_disk(d) 101 | 102 | assert trf2.model.tokenizer.is_fast is True 103 | 104 | 105 | def test_transformer_pipeline_tobytes(): 106 | nlp = Language() 107 | nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 108 | nlp.initialize() 109 | assert nlp.pipe_names == ["transformer"] 110 | nlp_bytes = nlp.to_bytes() 111 | 112 | nlp2 = Language() 113 | nlp2.add_pipe("transformer", config=DEFAULT_CONFIG) 114 | nlp2.from_bytes(nlp_bytes) 115 | assert nlp2.pipe_names == ["transformer"] 116 | 117 | 118 | def test_transformer_pipeline_todisk(): 119 | nlp = English() 120 | nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 121 | nlp.initialize() 122 | with make_tempdir() as d: 123 | nlp.to_disk(d) 124 | nlp2 = spacy.load(d) 125 | assert nlp2.pipe_names == ["transformer"] 126 | 127 | 128 | def test_transformer_pipeline_todisk_settings(): 129 | nlp = English() 130 | trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 131 | nlp.initialize() 132 | # initially no 
attentions 133 | assert trf.model.tokenizer.model_max_length == 512 134 | assert trf.model.transformer.config.output_attentions is False 135 | assert "attentions" not in nlp("test")._.trf_data.model_output 136 | # modify model_max_length (note that modifications to 137 | # tokenizer.model_max_length for transformers<4.25 are not serialized by 138 | # save_pretrained, see: https://github.com/explosion/spaCy/discussions/7393) 139 | trf.model.tokenizer.init_kwargs["model_max_length"] = 499 140 | # transformer>=4.25, model_max_length is saved and init_kwargs changes are 141 | # clobbered, so do both for this test 142 | trf.model.tokenizer.model_max_length = 499 143 | # add attentions on-the-fly 144 | trf.model.transformer.config.output_attentions = True 145 | assert nlp("test")._.trf_data.model_output.attentions is not None 146 | with make_tempdir() as d: 147 | nlp.to_disk(d) 148 | nlp2 = spacy.load(d) 149 | assert nlp2.pipe_names == ["transformer"] 150 | trf2 = nlp2.get_pipe("transformer") 151 | # model_max_length is preserved 152 | assert trf2.model.tokenizer.model_max_length == 499 153 | # output_attentions setting is preserved 154 | assert trf2.model.transformer.config.output_attentions is True 155 | assert nlp2("test")._.trf_data.model_output.attentions is not None 156 | # the init configs are empty SimpleFrozenDicts 157 | assert trf2.model._init_tokenizer_config == {} 158 | with pytest.raises(NotImplementedError): 159 | trf2.model._init_tokenizer_config["use_fast"] = False 160 | 161 | 162 | def test_transformer_pipeline_todisk_before_initialize(): 163 | nlp = English() 164 | nlp.add_pipe("transformer", config=DEFAULT_CONFIG) 165 | with make_tempdir() as d: 166 | # serialize before initialization 167 | nlp.to_disk(d) 168 | nlp2 = spacy.load(d) 169 | nlp2.initialize() 170 | assert "last_hidden_state" in nlp2("test")._.trf_data.model_output 171 | 172 | 173 | inline_cfg_string = """ 174 | [nlp] 175 | lang = "en" 176 | pipeline = ["tagger"] 177 | 178 | [components] 179 | 180 | [components.tagger] 181 | factory = "tagger" 182 | 183 | [components.tagger.model] 184 | @architectures = "spacy.Tagger.v1" 185 | nO = null 186 | 187 | [components.tagger.model.tok2vec] 188 | @architectures = "spacy-transformers.Tok2VecTransformer.v3" 189 | name = "hf-internal-testing/tiny-random-DistilBertModel" 190 | tokenizer_config = {"use_fast": true} 191 | transformer_config = {"output_attentions": false} 192 | grad_factor = 1.0 193 | 194 | [components.tagger.model.tok2vec.get_spans] 195 | @span_getters = "spacy-transformers.strided_spans.v1" 196 | window = 256 197 | stride = 256 198 | 199 | [components.tagger.model.tok2vec.pooling] 200 | @layers = "reduce_mean.v1" 201 | """ 202 | 203 | 204 | def test_inline_transformer_tobytes(): 205 | orig_config = Config().from_str(inline_cfg_string) 206 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 207 | tagger = nlp.get_pipe("tagger") 208 | tagger_bytes = tagger.to_bytes() 209 | 210 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 211 | tagger2 = nlp2.get_pipe("tagger") 212 | tagger2.from_bytes(tagger_bytes) 213 | 214 | 215 | def test_initialized_inline_transformer_tobytes(): 216 | orig_config = Config().from_str(inline_cfg_string) 217 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 218 | assert nlp.pipe_names == ["tagger"] 219 | tagger = nlp.get_pipe("tagger") 220 | tagger.add_label("V") 221 | nlp.initialize() 222 | tagger_bytes = tagger.to_bytes() 223 | 224 | nlp2 = 
util.load_model_from_config(orig_config, auto_fill=True, validate=True) 225 | tagger2 = nlp2.get_pipe("tagger") 226 | tagger2.from_bytes(tagger_bytes) 227 | assert list(tagger2.labels) == ["V"] 228 | 229 | 230 | def test_inline_transformer_todisk(): 231 | orig_config = Config().from_str(inline_cfg_string) 232 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 233 | assert nlp.pipe_names == ["tagger"] 234 | tagger = nlp.get_pipe("tagger") 235 | tagger.add_label("V") 236 | with make_tempdir() as d: 237 | tagger.to_disk(d) 238 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 239 | tagger2 = nlp2.get_pipe("tagger") 240 | tagger2.from_disk(d) 241 | assert list(tagger2.labels) == ["V"] 242 | 243 | 244 | def test_initialized_inline_transformer_todisk(): 245 | orig_config = Config().from_str(inline_cfg_string) 246 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 247 | assert nlp.pipe_names == ["tagger"] 248 | tagger = nlp.get_pipe("tagger") 249 | tagger.add_label("V") 250 | nlp.initialize() 251 | with make_tempdir() as d: 252 | tagger.to_disk(d) 253 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 254 | tagger2 = nlp2.get_pipe("tagger") 255 | tagger2.from_disk(d) 256 | assert list(tagger2.labels) == ["V"] 257 | 258 | 259 | def test_inline_transformer_pipeline_tobytes(): 260 | orig_config = Config().from_str(inline_cfg_string) 261 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 262 | assert nlp.pipe_names == ["tagger"] 263 | tagger = nlp.get_pipe("tagger") 264 | tagger.add_label("V") 265 | nlp.initialize() 266 | nlp_bytes = nlp.to_bytes() 267 | 268 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 269 | nlp2.from_bytes(nlp_bytes) 270 | assert nlp2.pipe_names == ["tagger"] 271 | 272 | 273 | def test_inline_transformer_pipeline_todisk(): 274 | orig_config = Config().from_str(inline_cfg_string) 275 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 276 | assert nlp.pipe_names == ["tagger"] 277 | with make_tempdir() as d: 278 | nlp.to_disk(d) 279 | nlp2 = spacy.load(d) 280 | assert nlp2.pipe_names == ["tagger"] 281 | 282 | 283 | def test_initialized_inline_transformer_pipeline_todisk(): 284 | orig_config = Config().from_str(inline_cfg_string) 285 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 286 | assert nlp.pipe_names == ["tagger"] 287 | tagger = nlp.get_pipe("tagger") 288 | tagger.add_label("V") 289 | nlp.initialize() 290 | with make_tempdir() as d: 291 | nlp.to_disk(d) 292 | nlp2 = spacy.load(d) 293 | assert nlp2.pipe_names == ["tagger"] 294 | tagger2 = nlp2.get_pipe("tagger") 295 | assert list(tagger2.labels) == ["V"] 296 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_spanners.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy.lang.en import English 3 | 4 | from ..span_getters import configure_strided_spans, configure_get_sent_spans 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "window,stride,docs,result", 9 | [ 10 | (4, 3, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "89a"]]), 11 | (4, 4, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "9a"]]), 12 | (4, 2, ["0", "1234", "56789a"], [["0"], ["1234"], ["5678", "789a"]]), 13 | ], 14 | ) 15 | def test_get_strided_spans(window, stride, docs, result): 16 | get_strided = 
configure_strided_spans(window, stride) 17 | spans = get_strided(docs) 18 | assert spans == result 19 | 20 | 21 | def test_get_sent_spans(): 22 | nlp = English() 23 | nlp.add_pipe("sentencizer") 24 | doc = nlp("One. One more. Three sentences in total.") 25 | assert len(list(doc.sents)) == 3 26 | get_sent_spans = configure_get_sent_spans() 27 | spans = get_sent_spans([doc])[0] 28 | assert len(spans) == 3 29 | assert spans[0].text == "One." 30 | assert spans[1].text == "One more." 31 | assert spans[2].text == "Three sentences in total." 32 | 33 | 34 | def test_get_custom_spans(): 35 | def configure_custom_sent_spans(max_length: int): 36 | def get_custom_sent_spans(docs): 37 | spans = [] 38 | for doc in docs: 39 | spans.append([]) 40 | for sent in doc.sents: 41 | start = 0 42 | end = max_length 43 | while end <= len(sent): 44 | spans[-1].append(sent[start:end]) 45 | start += max_length 46 | end += max_length 47 | if start < len(sent): 48 | spans[-1].append(sent[start : len(sent)]) 49 | return spans 50 | 51 | return get_custom_sent_spans 52 | 53 | nlp = English() 54 | nlp.add_pipe("sentencizer") 55 | doc = nlp( 56 | "One. And one more. So that makes three sentences and this one is a bit longer." 57 | ) 58 | assert len(list(doc.sents)) == 3 59 | get_sent_spans = configure_custom_sent_spans(max_length=4) 60 | spans = get_sent_spans([doc])[0] 61 | assert len(spans) == 6 62 | assert spans[0].text == "One." 63 | assert spans[1].text == "And one more." 64 | assert spans[2].text == "So that makes three" 65 | assert spans[3].text == "sentences and this one" 66 | assert spans[4].text == "is a bit longer" 67 | assert spans[5].text == "." 68 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_textcatcnn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from packaging.version import Version 3 | 4 | from spacy.training.example import Example 5 | from spacy import util 6 | import thinc 7 | from thinc.api import Model, Config 8 | 9 | # fmt: off 10 | cfg_string = """ 11 | [nlp] 12 | lang = "en" 13 | pipeline = ["textcat"] 14 | 15 | [components] 16 | 17 | [components.textcat] 18 | factory = "textcat" 19 | 20 | [components.textcat.model] 21 | @architectures = "spacy.TextCatCNN.v2" 22 | nO = null 23 | exclusive_classes = false 24 | 25 | [components.textcat.model.tok2vec] 26 | @architectures = "spacy-transformers.Tok2VecTransformer.v1" 27 | name = "roberta-base" 28 | tokenizer_config = {"use_fast": false} 29 | grad_factor = 1.0 30 | 31 | [components.textcat.model.tok2vec.get_spans] 32 | @span_getters = "spacy-transformers.strided_spans.v1" 33 | window = 256 34 | stride = 256 35 | 36 | [components.textcat.model.tok2vec.pooling] 37 | @layers = "reduce_mean.v1" 38 | """ 39 | # fmt: on 40 | 41 | 42 | # TODO: remove skip after requiring spacy>=3.5.1 or at the very latest, after 43 | # dropping python 3.7 switch to importlib.metadata.version("thinc") 44 | @pytest.mark.skipif( 45 | Version(thinc.__version__) < Version("8.1.8"), reason="Requires thinc>=8.1.8" 46 | ) 47 | def test_textcatcnn(): 48 | orig_config = Config().from_str(cfg_string) 49 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 50 | assert nlp.pipe_names == ["textcat"] 51 | 52 | textcat = nlp.get_pipe("textcat") 53 | assert textcat.is_resizable is True 54 | 55 | train_examples = [] 56 | doc = nlp.make_doc("ok") 57 | doc.cats["X"] = 1.0 58 | doc.cats["Y"] = 0.0 59 | train_examples.append(Example(doc, doc)) 60 
| 61 | nlp.initialize(lambda: train_examples) 62 | -------------------------------------------------------------------------------- /spacy_transformers/tests/test_tok2vectransformer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy.training.example import Example 3 | from spacy.util import make_tempdir 4 | from spacy import util 5 | from thinc.api import Model, Config 6 | from .util import _assert_equal_tensors 7 | 8 | # fmt: off 9 | TRAIN_DATA = [ 10 | ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), 11 | ("Eat blue ham", {"tags": ["V", "J", "N"]}), 12 | ] 13 | 14 | 15 | cfg_string = """ 16 | [nlp] 17 | lang = "en" 18 | pipeline = ["tagger"] 19 | 20 | [components] 21 | 22 | [components.tagger] 23 | factory = "tagger" 24 | 25 | [components.tagger.model] 26 | @architectures = "spacy.Tagger.v1" 27 | nO = null 28 | 29 | [components.tagger.model.tok2vec] 30 | @architectures = "spacy-transformers.Tok2VecTransformer.v1" 31 | name = "distilbert-base-uncased" 32 | tokenizer_config = {"use_fast": false} 33 | grad_factor = 1.0 34 | 35 | [components.tagger.model.tok2vec.get_spans] 36 | @span_getters = "spacy-transformers.strided_spans.v1" 37 | window = 256 38 | stride = 256 39 | 40 | [components.tagger.model.tok2vec.pooling] 41 | @layers = "reduce_mean.v1" 42 | """ 43 | # fmt: on 44 | 45 | 46 | def test_transformer_pipeline_tagger_internal(): 47 | """Test that a tagger with internal transformer runs and trains properly""" 48 | orig_config = Config().from_str(cfg_string) 49 | nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 50 | assert nlp.pipe_names == ["tagger"] 51 | tagger = nlp.get_pipe("tagger") 52 | tagger_trf = tagger.model.get_ref("tok2vec").layers[0] 53 | assert isinstance(tagger_trf, Model) 54 | train_examples = [] 55 | for t in TRAIN_DATA: 56 | train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) 57 | for tag in t[1]["tags"]: 58 | tagger.add_label(tag) 59 | 60 | optimizer = nlp.initialize(lambda: train_examples) 61 | for i in range(2): 62 | losses = {} 63 | nlp.update(train_examples, sgd=optimizer, losses=losses) 64 | 65 | doc = nlp("We're interested at underwater basket weaving.") 66 | doc_tensor = tagger_trf.predict([doc]) 67 | 68 | # ensure IO goes OK 69 | with make_tempdir() as d: 70 | file_path = d / "trained_nlp" 71 | nlp.to_disk(file_path) 72 | 73 | # results are not the same if we don't call from_disk 74 | nlp2 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 75 | nlp2.initialize(lambda: train_examples) 76 | doc2 = nlp2("We're interested at underwater basket weaving.") 77 | tagger2 = nlp2.get_pipe("tagger") 78 | tagger_trf2 = tagger2.model.get_ref("tok2vec").layers[0] 79 | doc_tensor2 = tagger_trf2.predict([doc2]) 80 | with pytest.raises(AssertionError): 81 | _assert_equal_tensors( 82 | doc_tensor2.doc_data[0].tensors, doc_tensor.doc_data[0].tensors 83 | ) 84 | 85 | # results ARE the same if we call from_disk 86 | nlp3 = util.load_model_from_config(orig_config, auto_fill=True, validate=True) 87 | nlp3.from_disk(file_path) 88 | doc3 = nlp3("We're interested at underwater basket weaving.") 89 | tagger3 = nlp3.get_pipe("tagger") 90 | tagger_trf3 = tagger3.model.get_ref("tok2vec").layers[0] 91 | doc_tensor3 = tagger_trf3.predict([doc3]) 92 | _assert_equal_tensors( 93 | doc_tensor3.doc_data[0].tensors, doc_tensor.doc_data[0].tensors 94 | ) 95 | -------------------------------------------------------------------------------- 
/spacy_transformers/tests/test_truncation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy 3 | from thinc.types import Ragged 4 | from thinc.api import NumpyOps 5 | from ..data_classes import WordpieceBatch 6 | from ..truncate import _truncate_tokens, _truncate_alignment 7 | 8 | 9 | @pytest.fixture 10 | def sequences(): 11 | # Each sequence is a list of tokens, and each token is a number of wordpieces 12 | return [ 13 | [1, 3, 1], # So 5 wordpieces this sequence 14 | [3, 7, 1, 1], # 12 15 | [1], # 1 16 | [20, 1], # 21 17 | ] 18 | 19 | 20 | @pytest.fixture 21 | def shape(sequences): 22 | # Get the shape of the input_ids, which includes the padding. 23 | maximum = max(sum(lengths) for lengths in sequences) 24 | return (len(sequences), maximum) 25 | 26 | 27 | @pytest.fixture 28 | def seq_lengths(sequences): 29 | return numpy.array([sum(seq) for seq in sequences], dtype="i") 30 | 31 | 32 | @pytest.fixture 33 | def wordpieces(sequences): 34 | strings = [] 35 | for token_lengths in sequences: 36 | strings.append([]) 37 | for length in token_lengths: 38 | strings[-1].extend(str(i) for i in range(length)) 39 | shape = (len(strings), max(len(seq) for seq in strings)) 40 | wordpieces = WordpieceBatch( 41 | strings=strings, 42 | input_ids=numpy.zeros(shape, dtype="i"), 43 | token_type_ids=numpy.zeros(shape, dtype="i"), 44 | attention_mask=numpy.zeros((shape[0], shape[1]), dtype="bool"), 45 | lengths=[len(seq) for seq in strings], 46 | ) 47 | return wordpieces 48 | 49 | 50 | @pytest.fixture 51 | def align(sequences): 52 | lengths = [] 53 | indices = [] 54 | offset = 0 55 | for seq in sequences: 56 | for token_length in seq: 57 | lengths.append(token_length) 58 | indices.extend(i + offset for i in range(token_length)) 59 | offset += token_length 60 | return Ragged(numpy.array(indices, dtype="i"), numpy.array(lengths, dtype="i")) 61 | 62 | 63 | @pytest.fixture 64 | def max_length(): 65 | return 6 66 | 67 | 68 | @pytest.fixture 69 | def mask_from_end(shape, max_length): 70 | n_seq, length = shape 71 | bools = [ 72 | numpy.array([i < max_length for i in range(length)], dtype="bool") 73 | for _ in range(n_seq) 74 | ] 75 | return numpy.concatenate(bools) 76 | 77 | 78 | def test_truncate_wordpieces(wordpieces, max_length, mask_from_end): 79 | truncated = _truncate_tokens(wordpieces, mask_from_end) 80 | for i, seq in enumerate(truncated.strings): 81 | assert len(seq) <= max_length 82 | assert seq == wordpieces.strings[i][:max_length] 83 | assert truncated.input_ids[i].shape[0] <= max_length 84 | assert truncated.token_type_ids[i].shape[0] <= max_length 85 | assert truncated.attention_mask[i].shape[0] <= max_length 86 | 87 | 88 | def test_truncate_alignment_from_end(sequences, max_length, align, mask_from_end): 89 | # print("Max length", max_length) 90 | # print("Sequences", sequences) 91 | # print("Mask", mask_from_end) 92 | ops = NumpyOps() 93 | truncated = _truncate_alignment(align, mask_from_end) 94 | # print(truncated.dataXd.shape, truncated.lengths.sum()) 95 | # print("Before", list(map(list, ops.unflatten(align.dataXd, align.lengths)))) 96 | # print("After", list(map(list, ops.unflatten(truncated.dataXd, truncated.lengths)))) 97 | # Check that the number of tokens hasn't changed. We still need to have 98 | # alignment for every token. 
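    # A sketch of the intended behaviour, restating what the assertions below
    # verify for each original sequence: the truncated alignment should cover
    # at most max_length wordpieces in total, and because wordpieces are
    # dropped from the end, once a token has lost all of its wordpieces every
    # later token in that sequence should also map to zero wordpieces.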
99 | assert truncated.lengths.shape[0] == align.lengths.shape[0] 100 | start = 0 101 | for i, seq in enumerate(sequences): 102 | end = start + len(seq) 103 | # Get the alignment for this sequence of tokens. Each length in the 104 | # alignment indicates the number of wordpiece tokens, so we need to 105 | # check that the sum of the lengths doesn't exceed the maximum. 106 | wp_indices = truncated[start:end] 107 | assert wp_indices.lengths.sum() <= max_length 108 | # We're truncating from the end, so we shouldn't see different values 109 | # except at the end of the sequence. 110 | seen_zero = False 111 | before = align[start:end] 112 | for length_now, length_before in zip(wp_indices.lengths, before.lengths): 113 | if seen_zero: 114 | assert length_now == 0, wp_indices.lengths 115 | elif length_now == 0: 116 | seen_zero = True 117 | else: 118 | length_now == length_before 119 | -------------------------------------------------------------------------------- /spacy_transformers/tests/util.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | import numpy 3 | import torch 4 | import copy 5 | from transformers.file_utils import ModelOutput 6 | from numpy.testing import assert_array_equal 7 | 8 | from spacy.tokens import Doc 9 | from thinc.api import Model, get_current_ops 10 | 11 | from ..data_classes import FullTransformerBatch, HFObjects 12 | from ..span_getters import get_doc_spans 13 | from ..layers.transformer_model import forward as transformer_forward 14 | 15 | 16 | def _assert_equal_tensors(tensors1, tensors2): 17 | ops = get_current_ops() 18 | for i in range(len(tensors1)): 19 | t1 = ops.asarray(tensors1[i]) 20 | t2 = ops.asarray(tensors2[i]) 21 | assert_array_equal(ops.to_numpy(t1), ops.to_numpy(t2)) 22 | 23 | 24 | class DummyTokenizer: 25 | def __init__(self): 26 | self.str2int = {} 27 | self.int2str = {} 28 | self.start_symbol = "" 29 | self.end_symbol = "" 30 | self.model_max_length = 512 31 | self.pad_token = "[PAD]" 32 | 33 | @property 34 | def all_special_tokens(self): 35 | return [self.start_symbol, self.end_symbol] 36 | 37 | def __call__( 38 | self, 39 | texts, 40 | add_special_tokens=True, 41 | max_length=None, 42 | stride: int = 0, 43 | truncation_strategy="longest_first", 44 | padding=False, 45 | truncation=False, 46 | is_pretokenized=False, 47 | return_tensors=None, 48 | return_token_type_ids=None, 49 | return_attention_mask=None, 50 | return_overflowing_tokens=False, 51 | return_special_tokens_masks=False, 52 | return_offsets_mapping=False, 53 | return_length=False, 54 | ): 55 | output: Dict = { 56 | "input_ids": [], 57 | "attention_mask": [], 58 | "token_type_ids": [], 59 | } # type: ignore 60 | 61 | for text in texts: 62 | words, offsets, mask, type_ids = self._tokenize(text) 63 | ids = self._encode_words(words) 64 | output["input_ids"].append(ids) 65 | output["attention_mask"].append(mask) 66 | output["token_type_ids"].append(type_ids) 67 | if padding: 68 | output = self._pad(output) 69 | if return_tensors == "pt": 70 | output["input_ids"] = torch.tensor(output["input_ids"]) # type: ignore 71 | output["attention_mask"] = torch.tensor(output["attention_mask"]) # type: ignore 72 | output["token_type_ids"] = torch.tensor(output["token_type_ids"]) # type: ignore 73 | elif return_tensors == "np": 74 | output["input_ids"] = numpy.asarray(output["input_ids"]) # type: ignore 75 | output["attention_mask"] = numpy.asarray(output["attention_mask"]) # type: ignore 76 | output["token_type_ids"] = 
numpy.asarray(output["token_type_ids"]) # type: ignore 77 | if return_length: 78 | output["length"] = torch.tensor([len(x) for x in output["input_ids"]]) # type: ignore 79 | return output 80 | 81 | def convert_ids_to_tokens(self, ids: Union[List[int], torch.Tensor]) -> List[str]: 82 | return [self.int2str[int(id_)] for id_ in ids] # type: ignore 83 | 84 | def _pad(self, batch): 85 | batch = copy.deepcopy(batch) 86 | longest = max(len(ids) for ids in batch["input_ids"]) 87 | for i in range(len(batch["input_ids"])): 88 | length = len(batch["input_ids"][i]) 89 | difference = longest - length 90 | batch["attention_mask"][i] = [1] * length + [0] * difference 91 | batch["input_ids"][i].extend([0] * difference) 92 | batch["token_type_ids"][i].extend([2] * difference) 93 | return batch 94 | 95 | def _tokenize(self, text): 96 | offsets = [] 97 | start = 0 98 | for i, char in enumerate(text): 99 | if char == " ": 100 | offsets.append((start, i)) 101 | start = i + 1 102 | if start < len(text): 103 | offsets.append((start, len(text))) 104 | words = [text[start:end] for start, end in offsets] 105 | type_ids = [0] + [1] * len(words) + [0] 106 | words = [self.start_symbol] + words + [self.end_symbol] 107 | offsets = [None] + offsets + [None] 108 | mask = [1] * len(words) 109 | return words, offsets, mask, type_ids 110 | 111 | def _encode_words(self, words): 112 | ids = [] 113 | for word in words: 114 | if word not in self.str2int: 115 | self.int2str[len(self.str2int)] = word 116 | self.str2int[word] = len(self.str2int) 117 | ids.append(self.str2int[word]) 118 | return ids 119 | 120 | 121 | def DummyTransformerModel(width: int, depth: int): 122 | def _forward(model, tokens, is_train): 123 | width = model.attrs["width"] 124 | depth = model.attrs["depth"] 125 | shape = (depth, tokens.input_ids.shape[0], tokens.input_ids.shape[1], width) 126 | tensors = torch.zeros(*shape) 127 | return ModelOutput(last_hidden_state=tensors), lambda d_tensors: tokens 128 | 129 | return Model( 130 | "dummy-transformer", 131 | _forward, 132 | attrs={"width": width, "depth": depth}, 133 | ) 134 | 135 | 136 | def DummyTransformer( 137 | depth: int = 2, width: int = 4, get_spans=get_doc_spans 138 | ) -> Model[List[Doc], FullTransformerBatch]: 139 | """Create a test model that produces a FullTransformerBatch object.""" 140 | hf_model = HFObjects(DummyTokenizer(), None, None) 141 | 142 | return DummyModel( 143 | "dummy-transformer", 144 | transformer_forward, 145 | layers=[DummyTransformerModel(width=width, depth=depth)], 146 | attrs={ 147 | "get_spans": get_spans, 148 | "hf_model": hf_model, 149 | "grad_factor": 1.0, 150 | "flush_cache_chance": 0.0, 151 | "transformer_config": {}, 152 | }, 153 | dims={"nO": width}, 154 | ) 155 | 156 | 157 | class DummyModel(Model): 158 | @property 159 | def tokenizer(self): 160 | return DummyTokenizer() 161 | 162 | @property 163 | def transformer(self): 164 | return None 165 | 166 | @property 167 | def tokenizer_config(self): 168 | return {} 169 | 170 | @property 171 | def transformer_config(self): 172 | return {} 173 | -------------------------------------------------------------------------------- /spacy_transformers/truncate.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Union, TypeVar 2 | import numpy 3 | from thinc.types import Ragged, Ints2d, Floats2d 4 | from .data_classes import WordpieceBatch 5 | 6 | ArrayT = TypeVar("ArrayT", bound=Union[Ints2d, Floats2d]) 7 | 8 | 9 | def truncate_oversize_splits( 10 | wordpieces: 
WordpieceBatch, align: Ragged, max_length: int 11 | ) -> Tuple[WordpieceBatch, Ragged]: 12 | """Drop wordpieces from inputs that are too long. This can happen because 13 | the splitter is based on linguistic tokens, and the number of wordpieces 14 | that each token is split into is unpredictable, so we can end up with splits 15 | that have more wordpieces than the model's maximum. 16 | 17 | To solve this, we calculate a score for each wordpiece in the split, 18 | and drop the wordpieces with the highest scores. I can think of a few 19 | scoring schemes we could use: 20 | 21 | a) Drop the ends of longest wordpieces. This scoring would be just: 22 | position_in_token 23 | b) Drop the middles of longest wordpieces. The score would be: 24 | min(length - position_in_token, position_in_token) 25 | c) Drop all wordpieces from longest tokens. This would be: 26 | length 27 | d) Drop wordpieces from the end of the split. This would be: 28 | position_in_split 29 | 30 | The advantage of a) and b) is that they give some representation to each 31 | token. The advantage of c) is that it leaves a higher % of tokens with intact 32 | representations. The advantage of d) is that it leaves contiguous chunks of 33 | wordpieces intact, and drops from the end. 34 | 35 | I find b) most appealing, but it's also the most complicated. Let's just do 36 | d) for now. 37 | """ 38 | if wordpieces.input_ids.shape[1] < max_length: 39 | return wordpieces, align 40 | mask = _get_truncation_mask_drop_from_end( 41 | wordpieces.input_ids.shape, wordpieces.lengths, align, max_length 42 | ) 43 | return _truncate_tokens(wordpieces, mask), _truncate_alignment(align, mask) 44 | 45 | 46 | def _get_truncation_mask_drop_from_end( 47 | shape: Tuple[int, int], split_lengths: List[int], align: Ragged, max_length: int 48 | ) -> numpy.ndarray: 49 | """Return a two-dimensional boolean mask, indicating whether wordpieces 50 | are dropped from their sequences. 51 | 52 | Drop wordpieces from the end of the sequence. 53 | """ 54 | mask = numpy.ones(shape, dtype="bool") 55 | mask[:, max_length:] = 0 56 | return mask 57 | 58 | 59 | def _truncate_tokens(wordpieces: WordpieceBatch, mask: numpy.ndarray) -> WordpieceBatch: 60 | n_seq = len(wordpieces) 61 | mask1d = mask.ravel() 62 | mask = mask.reshape((n_seq, -1)) 63 | 64 | strings: List[List[str]] = [] 65 | for i, seq in enumerate(wordpieces.strings): 66 | strings.append([]) 67 | for j, token in enumerate(seq): 68 | if mask[i, j]: 69 | strings[-1].append(token) 70 | 71 | def filter_array(data: ArrayT) -> ArrayT: 72 | data1d = data.reshape((-1,)) 73 | return data1d[mask1d].reshape((n_seq, -1)) # type: ignore 74 | 75 | filtered_token_type_ids = None 76 | if wordpieces.token_type_ids is not None: 77 | filtered_token_type_ids = filter_array(wordpieces.token_type_ids) 78 | 79 | return WordpieceBatch( 80 | strings=strings, 81 | input_ids=filter_array(wordpieces.input_ids), 82 | attention_mask=filter_array(wordpieces.attention_mask), 83 | lengths=[len(seq) for seq in strings], 84 | token_type_ids=filtered_token_type_ids, 85 | ) 86 | 87 | 88 | def _truncate_alignment(align: Ragged, mask: numpy.ndarray) -> Ragged: 89 | # We're going to have fewer wordpieces in the new array, so all of our 90 | # wordpiece indices in the alignment table will be off --- they'll point 91 | # to the wrong row. 
So we need to do three things here: 92 | # 93 | # 1) Adjust all the indices in align.dataXd to account for the dropped data 94 | # 2) Remove the dropped indices from the align.dataXd 95 | # 3) Calculate new align.lengths 96 | # 97 | # The wordpiece mapping is easily calculated by the cumulative sum of the 98 | # mask table. 99 | # Let's say we have [True, False, False, True]. The mapping of the dropped 100 | # wordpieces doesn't matter, because we can filter it with the mask. So we 101 | # have [0, 0, 0, 1], i.e the wordpiece that was 102 | # at 0 is still at 0, and the wordpiece that was at 3 is now at 1. 103 | mask = mask.ravel() 104 | idx_map = mask.cumsum() - 1 105 | idx_map[~mask] = -1 106 | # Step 1: Adjust all the indices in align.dataXd. 107 | new_align = idx_map[align.data.ravel()] 108 | # Step 2: Remove the dropped indices 109 | new_align = new_align[new_align >= 0] 110 | # Step 3: Calculate new align.lengths 111 | new_lengths = align.lengths.copy() 112 | for i in range(len(align.lengths)): 113 | drops = ~mask[align[i].data.ravel()] 114 | new_lengths[i] -= drops.sum() 115 | return Ragged(new_align, new_lengths) 116 | -------------------------------------------------------------------------------- /spacy_transformers/util.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Union, Set 2 | from pathlib import Path 3 | import random 4 | from transformers import AutoModel, AutoTokenizer 5 | from transformers.tokenization_utils import BatchEncoding 6 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 7 | import catalogue 8 | from spacy.util import registry 9 | from thinc.api import get_torch_default_device 10 | import torch.cuda 11 | import tempfile 12 | import shutil 13 | import contextlib 14 | import warnings 15 | 16 | 17 | # fmt: off 18 | registry.span_getters = catalogue.create("spacy", "span_getters", entry_points=True) # type: ignore 19 | registry.annotation_setters = catalogue.create("spacy", "annotation_setters", entry_points=True) # type: ignore 20 | # fmt: on 21 | 22 | 23 | def huggingface_from_pretrained(source: Union[Path, str], config: Dict): 24 | """Create a Huggingface transformer model from pretrained weights. Will 25 | download the model if it is not already downloaded. 26 | 27 | source (Union[str, Path]): The name of the model or a path to it, such as 28 | 'bert-base-cased'. 29 | config (dict): Settings to pass to the tokenizer. 30 | """ 31 | warnings.warn( 32 | "spacy_transformers.util.huggingface_from_pretrained has been moved to " 33 | "spacy_transformers.layers.transformer_model.huggingface_from_pretrained " 34 | "with an updated API:\n" 35 | "huggingface_from_pretrained(source, tok_config, trf_config) -> HFObjects", 36 | DeprecationWarning, 37 | ) 38 | if isinstance(source, Path): 39 | str_path = str(source.absolute()) 40 | else: 41 | str_path = source 42 | tokenizer = AutoTokenizer.from_pretrained(str_path, **config) 43 | transformer = AutoModel.from_pretrained(str_path) 44 | torch_device = get_torch_default_device() 45 | transformer.to(torch_device) 46 | return tokenizer, transformer 47 | 48 | 49 | def huggingface_tokenize(tokenizer, texts: List[str]) -> BatchEncoding: 50 | """Apply a Huggingface tokenizer to a batch of texts.""" 51 | 52 | # Use NumPy arrays rather than PyTorch tensors to avoid a lot of 53 | # host <-> device transfers during tokenization and post-processing 54 | # when a GPU is used. 
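    # Rough sketch of what comes back for a two-text batch (shapes only; the
    # exact ids depend on the tokenizer in use):
    #   token_data["input_ids"].shape      -> (2, longest_sequence_in_wordpieces)
    #   token_data["attention_mask"].shape -> (2, longest_sequence_in_wordpieces)
    #   token_data["input_texts"]          -> per-text lists of wordpiece strings
    #                                         (added below via convert_ids_to_tokens)
    #   token_data["pad_token"]            -> tokenizer.pad_token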
55 | warnings.warn( 56 | "spacy_transformers.util.huggingface_tokenize has been moved to " 57 | "spacy_transformers.layers.transformer_model.huggingface_tokenize.", 58 | DeprecationWarning, 59 | ) 60 | token_data = tokenizer( 61 | texts, 62 | add_special_tokens=True, 63 | return_attention_mask=True, 64 | return_offsets_mapping=isinstance(tokenizer, PreTrainedTokenizerFast), 65 | return_tensors="np", 66 | return_token_type_ids=None, # Sets to model default 67 | padding="longest", 68 | ) 69 | token_data["input_texts"] = [] 70 | for i in range(len(token_data["input_ids"])): 71 | wp_texts = tokenizer.convert_ids_to_tokens(token_data["input_ids"][i]) 72 | token_data["input_texts"].append(wp_texts) 73 | token_data["pad_token"] = tokenizer.pad_token 74 | return token_data 75 | 76 | 77 | def maybe_flush_pytorch_cache(chance: float = 1.0): 78 | """Flip a coin and decide whether to flush PyTorch's cache. This allows the 79 | cache to be flushed periodically without maintaining a counter. 80 | 81 | I'm not sure why this is necessary, it shouldn't be. But it definitely does 82 | help... 83 | """ 84 | if random.random() < chance and torch.cuda.is_available(): 85 | torch.cuda.empty_cache() 86 | 87 | 88 | def transpose_list(nested_list): 89 | output = [] 90 | for i, entry in enumerate(nested_list): 91 | while len(output) < len(entry): 92 | output.append([None] * len(nested_list)) 93 | for j, x in enumerate(entry): 94 | output[j][i] = x 95 | return output 96 | 97 | 98 | def batch_by_length(seqs, max_words: int) -> List[List[int]]: 99 | """Given a list of sequences, return a batched list of indices into the 100 | list, where the batches are grouped by length, in descending order. 101 | 102 | Batches may be at most max_words in size, defined as max sequence length * size. 103 | """ 104 | # Use negative index so we can get sort by position ascending. 105 | lengths_indices = [(len(seq), i) for i, seq in enumerate(seqs)] 106 | lengths_indices.sort() 107 | batches: List[List[int]] = [] 108 | batch: List[int] = [] 109 | for length, i in lengths_indices: 110 | if not batch: 111 | batch.append(i) 112 | elif length * (len(batch) + 1) <= max_words: 113 | batch.append(i) 114 | else: 115 | batches.append(batch) 116 | batch = [i] 117 | if batch: 118 | batches.append(batch) 119 | # Check lengths match 120 | assert sum(len(b) for b in batches) == len(seqs) 121 | # Check no duplicates 122 | seen: Set[int] = set() 123 | for b in batches: 124 | seen.update(id(item) for item in b) 125 | assert len(seen) == len(seqs) 126 | batches = [list(sorted(batch)) for batch in batches] 127 | batches.reverse() 128 | return batches 129 | 130 | 131 | def log_gpu_memory(logger, context): 132 | mem = torch.cuda.memory_allocated() // 1024**2 133 | logger.info(f"{mem:.1f}: {context}") 134 | 135 | 136 | def log_batch_size(logger, token_data, is_train): 137 | batch_size = token_data["input_ids"].shape[0] 138 | seq_len = token_data["input_ids"].shape[1] 139 | squared = seq_len**2 * batch_size 140 | 141 | if is_train: 142 | logger.info(f"{batch_size} x {seq_len} ({squared}) update") 143 | else: 144 | logger.info(f"{batch_size} x {seq_len} ({squared}) predict") 145 | 146 | 147 | @contextlib.contextmanager 148 | def make_tempdir(): 149 | """Execute a block in a temporary directory and remove the directory and 150 | its contents at the end of the with block. 151 | 152 | YIELDS (Path): The path of the temp directory. 
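    A minimal usage sketch:

        with make_tempdir() as tmp_dir:
            (tmp_dir / "some_file.txt").write_text("temporary")
        # tmp_dir and its contents have been removed at this point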
153 | """ 154 | d = Path(tempfile.mkdtemp()) 155 | yield d 156 | shutil.rmtree(str(d)) 157 | --------------------------------------------------------------------------------