├── .github └── workflows │ ├── pythonpackage.yml │ └── pythonpublish.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── setup.py ├── spacy_udpipe ├── __init__.py ├── resources │ ├── __init__.py │ └── languages.json ├── tokenizer.py ├── udpipe.py └── utils.py └── tests ├── __init__.py ├── conftest.py ├── languages ├── en │ └── test_en_language.py └── fr │ └── test_fr_language.py └── test_spacy_udpipe.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened] 6 | branches: 7 | - master 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | max-parallel: 4 15 | matrix: 16 | os: [ubuntu-latest, windows-latest] 17 | python-version: [3.6, 3.7, 3.8, 3.9] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install package and dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install .[dev] 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 --filename=*.py --count --show-source --max-line-length=119 --statistics 33 | # exit-zero treats all errors as warnings 34 | flake8 --filename=*.py --count --exit-zero --max-line-length=119 --max-complexity=10 --statistics 35 | - name: Test with pytest 36 | run: | 37 | python -m pytest -vvv tests 38 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.6' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore models dir 2 | spacy_udpipe/models 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2021 Text Analysis and Knowledge Engineering Lab (TakeLab) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lint test 2 | 3 | # Lint source code 4 | 5 | lint: 6 | # stop the build if there are Python syntax errors or undefined names 7 | flake8 --filename=*.py --count --show-source --max-line-length=119 --statistics 8 | # exit-zero treats all errors as warnings 9 | flake8 --filename=*.py --count --exit-zero --max-line-length=119 --max-complexity=10 --statistics 10 | 11 | # Run tests 12 | 13 | test: 14 | python -m pytest -vvv tests 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spaCy + UDPipe 2 | 3 | This package wraps the fast and efficient [UDPipe](http://ufal.mff.cuni.cz/udpipe) language-agnostic NLP pipeline 4 | (via its [Python bindings](https://github.com/ufal/udpipe/tree/master/bindings/python)), so you can use 5 | [UDPipe pre-trained models](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131) as a [spaCy](https://spacy.io/) pipeline for 50+ languages out-of-the-box. 6 | Inspired by [spacy-stanza](https://github.com/explosion/spacy-stanza), this package offers slightly less accurate 7 | models that are in turn much faster (see benchmarks for [UDPipe](https://ufal.mff.cuni.cz/udpipe/models#universal_dependencies_25_models_performance) and [Stanza](https://stanfordnlp.github.io/stanza/performance.html)). 8 | 9 | ## Installation 10 | 11 | Use the package manager [pip](https://pip.pypa.io/en/stable/) to install spacy-udpipe. 12 | 13 | ```bash 14 | pip install spacy-udpipe 15 | ``` 16 | 17 | After installation, use `spacy_udpipe.download()` to download the pre-trained model for the desired language. 18 | 19 | A full list of pre-trained UDPipe models for supported languages can be found in [`languages.json`](https://github.com/TakeLab/spacy-udpipe/blob/master/spacy_udpipe/resources/languages.json). 20 | 21 | ## Usage 22 | `spacy_udpipe.load()` returns a standard spaCy [`Language` object](https://spacy.io/api/language), i.e., the object you can use to process text and create a [`Doc` object](https://spacy.io/api/doc). 23 | 24 | ```python 25 | import spacy_udpipe 26 | 27 | spacy_udpipe.download("en") # download English model 28 | 29 | text = "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world." 30 | nlp = spacy_udpipe.load("en") 31 | 32 | doc = nlp(text) 33 | for token in doc: 34 | print(token.text, token.lemma_, token.pos_, token.dep_) 35 | 36 | ``` 37 | As all attributes are computed once and set in the custom [`Tokenizer`](https://spacy.io/api/tokenizer), the `Language.pipeline` is empty. 38 | 39 | The type of `text` can be one of the following: 40 | * unprocessed: `str`, 41 | * presegmented: `List[str]`, 42 | * pretokenized: `List[List[str]]`. 43 | 44 | ### Loading a custom model 45 | The following code snippet demonstrates how to load a custom `UDPipe` model (for the Croatian language): 46 | ```python 47 | import spacy_udpipe 48 | 49 | nlp = spacy_udpipe.load_from_path(lang="hr", 50 | path="./custom_croatian.udpipe", 51 | meta={"description": "Custom 'hr' model"}) 52 | text = "Wikipedija je enciklopedija slobodnog sadržaja." 53 | 54 | doc = nlp(text) 55 | for token in doc: 56 | print(token.text, token.lemma_, token.pos_, token.dep_) 57 | ``` 58 | This can be done for any of the languages supported by spaCy.
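Besides plain ISO 639-1 codes, `spacy_udpipe.download()` and `spacy_udpipe.load()` also accept the treebank-specific shorthand names listed in [`languages.json`](https://github.com/TakeLab/spacy-udpipe/blob/master/spacy_udpipe/resources/languages.json), e.g. `en-lines` or `fr-sequoia`. A minimal sketch (the example sentence is arbitrary):

```python
import spacy_udpipe

# "en-lines" selects the model trained on the English LinES treebank
# instead of the English EWT model that plain "en" maps to.
spacy_udpipe.download("en-lines")
nlp = spacy_udpipe.load("en-lines")

doc = nlp("Wikipedia is a free online encyclopedia.")
print([(token.text, token.pos_, token.dep_) for token in doc])
```
This is handy when a language has several UD treebanks and the default one does not fit your domain.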
For an exhaustive list, see [spaCy languages](https://spacy.io/usage/models#languages). 59 | 60 | ## Contributing 61 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. 62 | 63 | Please make sure to update the tests as appropriate. Tests are run automatically for each pull request on the master branch. 64 | To start the tests locally, first, install the package with `pip install -e .[dev]`, then run [`pytest`](https://docs.pytest.org/en/latest/contents.html) in the root source directory as follows: 65 | ```bash 66 | make test 67 | ``` 68 | Additionally, run [`flake8`](https://flake8.pycqa.org/en/latest) with the following command to check for coding mistakes: 69 | ```bash 70 | make lint 71 | ``` 72 | 73 | ## License 74 | * Source code: [MIT](https://choosealicense.com/licenses/mit/) © Text Analysis and Knowledge Engineering Lab (TakeLab) 75 | * Available pre-trained models: [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) 76 | 77 | ## Project status 78 | Maintained by [Text Analysis and Knowledge Engineering Lab (TakeLab)](http://takelab.fer.hr/). 79 | 80 | ## Notes 81 | 82 | * Known possible issues: 83 | * Tag map 84 | 85 | `Token.tag_` is a [CoNLL](https://universaldependencies.org/format.html) XPOS tag (language-specific part-of-speech tag), defined for each language separately by the corresponding [Universal Dependencies](https://universaldependencies.org/) treebank. Mappings between XPOS and Universal Dependencies POS tags should be defined in a `TAG_MAP` dictionary (located in language-specific `tag_map.py` files), along with optional morphological features. See [spaCy tag map](https://spacy.io/usage/adding-languages#tag-map) for more details. 86 | * Syntax iterators 87 | 88 | In order to extract `Doc.noun_chunks`, a proper syntax iterator implementation for the language of interest is required. For more details, please see [spaCy syntax iterators](https://spacy.io/usage/adding-languages#syntax-iterators). 89 | * Other language-specific issues 90 | 91 | A quick way to check language-specific defaults in [spaCy](https://spacy.io) is to visit [spaCy language support](https://spacy.io/usage/models#languages). Also, please see [spaCy language data](https://spacy.io/usage/adding-languages#language-data) for details regarding other language-specific data. 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | 6 | def get_version(fname: str) -> str: 7 | full_path = os.path.join( 8 | os.path.abspath(os.path.dirname(__file__)), 9 | "spacy_udpipe", 10 | fname 11 | ) 12 | with open(full_path, "r", encoding="utf-8") as fp: 13 | for line in fp: 14 | if line.startswith("__version__"): 15 | delim = '"' if '"' in line else "'" 16 | return line.split(delim)[1] 17 | else: 18 | raise RuntimeError( 19 | "Unable to find version string." 
20 | ) 21 | 22 | 23 | URL = "https://github.com/TakeLab/spacy-udpipe" 24 | 25 | with open("README.md", "r", encoding="utf-8") as f: 26 | readme = f.read() 27 | 28 | setup( 29 | name="spacy_udpipe", 30 | version=get_version("__init__.py"), 31 | description="Use fast UDPipe models directly in spaCy", 32 | long_description=readme, 33 | long_description_content_type="text/markdown", 34 | url=URL, 35 | author="TakeLab", 36 | author_email="takelab@fer.hr", 37 | license="MIT", 38 | keywords="nlp udpipe spacy python", 39 | packages=find_packages(), 40 | install_requires=[ 41 | "spacy>=3.0.0,<4.0.0", 42 | "ufal.udpipe>=1.2.0", 43 | "importlib_resources;python_version<'3.7'", 44 | ], 45 | extras_require={ 46 | "dev": ["flake8", "pytest", "pytest-mock"], 47 | }, 48 | python_requires=">=3.6", 49 | entry_points={ 50 | "spacy_tokenizers": [ 51 | "spacy_udpipe.PipelineAsTokenizer.v1 = spacy_udpipe:tokenizer.create_tokenizer", 52 | ] 53 | }, 54 | tests_require=["pytest>=5.0.0"], 55 | package_data={"spacy_udpipe": ["resources/languages.json"], }, 56 | classifiers=[ 57 | "Development Status :: 4 - Beta", 58 | "Intended Audience :: Developers", 59 | "Intended Audience :: Education", 60 | "Intended Audience :: Science/Research", 61 | "Programming Language :: Python :: 3", 62 | "License :: OSI Approved :: MIT License", 63 | "Operating System :: OS Independent", 64 | "Topic :: Software Development", 65 | "Topic :: Software Development :: Libraries", 66 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 67 | "Topic :: Text Processing", 68 | ], 69 | project_urls={ 70 | "SpaCy": "https://spacy.io/", 71 | "TakeLab": "http://takelab.fer.hr/", 72 | "UDPipe": "http://ufal.mff.cuni.cz/udpipe", 73 | "Source": URL, 74 | "Tracker": URL + "/issues", 75 | }, 76 | zip_safe=True 77 | ) 78 | -------------------------------------------------------------------------------- /spacy_udpipe/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | __all__ = [ 3 | "download", "load", "load_from_path", 4 | "UDPipeTokenizer", "UDPipeModel" 5 | ] 6 | 7 | from .utils import download, load, load_from_path 8 | from .tokenizer import UDPipeTokenizer 9 | from .udpipe import UDPipeModel 10 | -------------------------------------------------------------------------------- /spacy_udpipe/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TakeLab/spacy-udpipe/a38cd1e1c8dc5e18def177cd21d12922da833d6f/spacy_udpipe/resources/__init__.py -------------------------------------------------------------------------------- /spacy_udpipe/resources/languages.json: -------------------------------------------------------------------------------- 1 | { 2 | "af": "afrikaans-afribooms-ud-2.5-191206.udpipe", 3 | "hy": "armenian-armtdp-ud-2.5-191206.udpipe", 4 | "eu": "basque-bdt-ud-2.5-191206.udpipe", 5 | "be": "belarusian-hse-ud-2.5-191206.udpipe", 6 | "grc": "ancient_greek-perseus-ud-2.5-191206.udpipe", 7 | "grc-perseus": "ancient_greek-perseus-ud-2.5-191206.udpipe", 8 | "grc-proiel": "ancient_greek-proiel-ud-2.5-191206.udpipe", 9 | "ca": "catalan-ancora-ud-2.5-191206.udpipe", 10 | "zh": "chinese-gsd-ud-2.5-191206.udpipe", 11 | "zh-gsdsimp": "chinese-gsdsimp-ud-2.5-191206.udpipe", 12 | "lzh": "classical_chinese-kyoto-ud-2.5-191206.udpipe", 13 | "ar": "arabic-padt-ud-2.5-191206.udpipe", 14 | "bg": "bulgarian-btb-ud-2.5-191206.udpipe", 15 | "cs": "czech-pdt-ud-2.5-191206.udpipe", 16 | "cs-pdt":
"czech-pdt-ud-2.5-191206.udpipe", 17 | "cs-cac": "czech-cac-ud-2.5-191206.udpipe", 18 | "cs-fictree": "czech-fictree-ud-2.5-191206.udpipe", 19 | "cs-cltt": "czech-cltt-ud-2.5-191206.udpipe", 20 | "hr": "croatian-set-ud-2.5-191206.udpipe", 21 | "coptic": "coptic-scriptorium-ud-2.5-191206.udpipe", 22 | "nl": "dutch-alpino-ud-2.5-191206.udpipe", 23 | "nl-alpino": "dutch-alpino-ud-2.5-191206.udpipe", 24 | "nl-lassysmall": "dutch-lassysmall-ud-2.5-191206.udpipe", 25 | "da": "danish-ddt-ud-2.5-191206.udpipe", 26 | "et": "estonian-edt-ud-2.5-191206.udpipe", 27 | "et-edt": "estonian-edt-ud-2.5-191206.udpipe", 28 | "et-ewt": "estonian-ewt-ud-2.5-191206.udpipe", 29 | "en": "english-ewt-ud-2.5-191206.udpipe", 30 | "en-ewt": "english-ewt-ud-2.5-191206.udpipe", 31 | "en-gum": "english-gum-ud-2.5-191206.udpipe", 32 | "en-lines": "english-lines-ud-2.5-191206.udpipe", 33 | "en-partut": "english-partut-ud-2.5-191206.udpipe", 34 | "fr": "french-gsd-ud-2.5-191206.udpipe", 35 | "fr-gsd": "french-gsd-ud-2.5-191206.udpipe", 36 | "fr-partut": "french-partut-ud-2.5-191206.udpipe", 37 | "fr-sequoia": "french-sequoia-ud-2.5-191206.udpipe", 38 | "fr-spoken": "french-spoken-ud-2.5-191206.udpipe", 39 | "fro": "old_french-srcmf-ud-2.5-191206.udpipe", 40 | "fi": "finnish-tdt-ud-2.5-191206.udpipe", 41 | "fi-tdt": "finnish-tdt-ud-2.5-191206.udpipe", 42 | "fi-ftb": "finnish-ftb-ud-2.5-191206.udpipe", 43 | "de": "german-gsd-ud-2.5-191206.udpipe", 44 | "de-hdt": "german-hdt-ud-2.5-191206.udpipe", 45 | "gl": "galician-ctg-ud-2.5-191206.udpipe", 46 | "gl-ctg": "galician-ctg-ud-2.5-191206.udpipe", 47 | "gl-treegal": "galician-treegal-ud-2.5-191206.udpipe", 48 | "el": "greek-gdt-ud-2.5-191206.udpipe", 49 | "got": "gothic-proiel-ud-2.5-191206.udpipe", 50 | "hu": "hungarian-szeged-ud-2.5-191206.udpipe", 51 | "hi": "hindi-hdtb-ud-2.5-191206.udpipe", 52 | "he": "hebrew-htb-ud-2.5-191206.udpipe", 53 | "it": "italian-isdt-ud-2.5-191206.udpipe", 54 | "it-isdt": "italian-isdt-ud-2.5-191206.udpipe", 55 | "it-partut": "italian-partut-ud-2.5-191206.udpipe", 56 | "it-postwita": "italian-postwita-ud-2.5-191206.udpipe", 57 | "it-vit": "italian-vit-ud-2.5-191206.udpipe", 58 | "it-twittiro": "italian-twittiro-ud-2.5-191206.udpipe", 59 | "ga": "irish-idt-ud-2.5-191206.udpipe", 60 | "id": "indonesian-gsd-ud-2.5-191206.udpipe", 61 | "la": "latin-ittb-ud-2.5-191206.udpipe", 62 | "la-ittb": "latin-ittb-ud-2.5-191206.udpipe", 63 | "la-proiel": "latin-proiel-ud-2.5-191206.udpipe", 64 | "la-perseus": "latin-perseus-ud-2.5-191206.udpipe", 65 | "ko": "korean-kaist-ud-2.5-191206.udpipe", 66 | "ko-kaist": "korean-kaist-ud-2.5-191206.udpipe", 67 | "ko-gsd": "korean-gsd-ud-2.5-191206.udpipe", 68 | "ja": "japanese-gsd-ud-2.5-191206.udpipe", 69 | "lt": "lithuanian-alksnis-ud-2.5-191206.udpipe", 70 | "lt-alksnis": "lithuanian-alksnis-ud-2.5-191206.udpipe", 71 | "lt-hse": "lithuanian-hse-ud-2.5-191206.udpipe", 72 | "lv": "latvian-lvtb-ud-2.5-191206.udpipe", 73 | "nn": "norwegian-nynorsk-ud-2.5-191206.udpipe", 74 | "nn-nynorsk": "norwegian-nynorsk-ud-2.5-191206.udpipe", 75 | "nn-nynorsklia": "norwegian-nynorsklia-ud-2.5-191206.udpipe", 76 | "nb": "norwegian-bokmaal-ud-2.5-191206.udpipe", 77 | "se": "north_sami-giella-ud-2.5-191206.udpipe", 78 | "mr": "marathi-ufal-ud-2.5-191206.udpipe", 79 | "mt": "maltese-mudt-ud-2.5-191206.udpipe", 80 | "fa": "persian-seraji-ud-2.5-191206.udpipe", 81 | "cu": "old_church_slavonic-proiel-ud-2.5-191206.udpipe", 82 | "ro": "romanian-rrt-ud-2.5-191206.udpipe", 83 | "ro-rrt": "romanian-rrt-ud-2.5-191206.udpipe", 84 | 
"ro-nonstandard": "romanian-nonstandard-ud-2.5-191206.udpipe", 85 | "pt": "portuguese-gsd-ud-2.5-191206.udpipe", 86 | "pt-gsd": "portuguese-gsd-ud-2.5-191206.udpipe", 87 | "pt-bosque": "portuguese-bosque-ud-2.5-191206.udpipe", 88 | "pl": "polish-pdb-ud-2.5-191206.udpipe", 89 | "pl-pdb": "polish-pdb-ud-2.5-191206.udpipe", 90 | "pl-lfg": "polish-lfg-ud-2.5-191206.udpipe", 91 | "sr": "serbian-set-ud-2.5-191206.udpipe", 92 | "ru": "russian-syntagrus-ud-2.5-191206.udpipe", 93 | "ru-syntagrus": "russian-syntagrus-ud-2.5-191206.udpipe", 94 | "ru-gsd": "russian-gsd-ud-2.5-191206.udpipe", 95 | "ru-taiga": "russian-taiga-ud-2.5-191206.udpipe", 96 | "orv": "old_russian-torot-ud-2.5-191206.udpipe", 97 | "gd": "scottish_gaelic-arcosg-ud-2.5-191206.udpipe", 98 | "es": "spanish-ancora-ud-2.5-191206.udpipe", 99 | "es-ancora": "spanish-ancora-ud-2.5-191206.udpipe", 100 | "es-gsd": "spanish-gsd-ud-2.5-191206.udpipe", 101 | "sl": "slovenian-ssj-ud-2.5-191206.udpipe", 102 | "sl-ssj": "slovenian-ssj-ud-2.5-191206.udpipe", 103 | "sl-sst": "slovenian-sst-ud-2.5-191206.udpipe", 104 | "sk": "slovak-snk-ud-2.5-191206.udpipe", 105 | "uk": "ukrainian-iu-ud-2.5-191206.udpipe", 106 | "tr": "turkish-imst-ud-2.5-191206.udpipe", 107 | "te": "telugu-mtg-ud-2.5-191206.udpipe", 108 | "ta": "tamil-ttb-ud-2.5-191206.udpipe", 109 | "sv": "swedish-talbanken-ud-2.5-191206.udpipe", 110 | "sv-talbanken": "swedish-talbanken-ud-2.5-191206.udpipe", 111 | "sv-lines": "swedish-lines-ud-2.5-191206.udpipe", 112 | "wo": "wolof-wtb-ud-2.5-191206.udpipe", 113 | "vi": "vietnamese-vtb-ud-2.5-191206.udpipe", 114 | "ug": "uyghur-udt-ud-2.5-191206.udpipe", 115 | "ur": "urdu-udtb-ud-2.5-191206.udpipe" 116 | } 117 | -------------------------------------------------------------------------------- /spacy_udpipe/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional, Iterable, List, Tuple, Union, Dict 3 | 4 | from spacy.tokens import Doc 5 | from spacy.vocab import Vocab 6 | from spacy.util import registry 7 | from ufal.udpipe import Sentence, Word 8 | 9 | from .udpipe import UDPipeModel 10 | from .utils import get_path 11 | 12 | 13 | @registry.tokenizers("spacy_udpipe.PipelineAsTokenizer.v1") 14 | def create_tokenizer( 15 | lang: str = "", 16 | path: Optional[str] = None, 17 | meta: Optional[Dict] = None 18 | ): 19 | def tokenizer_factory( 20 | nlp, 21 | lang=lang, 22 | path=path, 23 | meta=meta 24 | ) -> UDPipeTokenizer: 25 | model = UDPipeModel( 26 | lang=lang, 27 | path=path or get_path(lang), 28 | meta=meta 29 | ) 30 | return UDPipeTokenizer( 31 | model=model, 32 | vocab=nlp.vocab 33 | ) 34 | 35 | return tokenizer_factory 36 | 37 | 38 | def _spacy_dep(d: str) -> str: 39 | # Ensure labels match with SpaCy 40 | return d.upper() if d == "root" else d 41 | 42 | 43 | class UDPipeTokenizer: 44 | """Custom Tokenizer which sets all the attributes because 45 | the UDPipe pipeline runs only once and does not 46 | contain separate spaCy pipeline components. 47 | """ 48 | 49 | _ws_pattern = re.compile(r"\s+") 50 | 51 | def __init__( 52 | self, 53 | model: UDPipeModel, 54 | vocab: Vocab 55 | ): 56 | """Initialize the tokenizer. 57 | 58 | model: The initialized UDPipe model. 59 | vocab: The vocabulary to use. 60 | """ 61 | self.model = model 62 | self.vocab = vocab 63 | 64 | def __call__( 65 | self, 66 | text: Union[ 67 | str, 68 | List[str], 69 | List[List[str]] 70 | ] 71 | ) -> Doc: 72 | """Convert input text to a spaCy Doc. 73 | 74 | text: The text to process. 
It can be presegmented or pretokenized: 75 | str : raw text, 76 | List[str] : presegmented text, 77 | List[List[str]] : pretokenized text. 78 | RETURNS: The spaCy Doc object. 79 | """ 80 | if not text: 81 | return Doc(vocab=self.vocab) 82 | 83 | udpipe_sents = self.model(text=text) if text else [Sentence()] 84 | text = " ".join(s.getText() for s in udpipe_sents) 85 | tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents) 86 | 87 | words = [] 88 | spaces = [] 89 | pos = [] 90 | tags = [] 91 | morphs = [] 92 | deps = [] 93 | lemmas = [] 94 | offset = 0 95 | is_aligned = self._check_aligned(text=text, tokens=tokens) 96 | if not is_aligned: 97 | text = "" 98 | for token in tokens: 99 | text += token.form 100 | if token.getSpaceAfter(): 101 | text += " " 102 | for i, token in enumerate(tokens): 103 | span = text[offset:] 104 | if not span: 105 | break 106 | while len(span) and span[0].isspace(): 107 | # If we encounter leading whitespace, skip one character ahead 108 | offset += 1 109 | span = text[offset:] 110 | words.append(token.form) 111 | # Make sure all strings are in the vocabulary 112 | pos.append(token.upostag or "") 113 | morphs.append(token.feats or "") 114 | # CoNLL xpostag-s, custom for each UD treebank 115 | tags.append(token.xpostag or "") 116 | deps.append(_spacy_dep(token.deprel) or "") 117 | lemmas.append(token.lemma or "") 118 | offset += len(token.form) 119 | span = text[offset:] 120 | if i == len(tokens) - 1 or not token.getSpaceAfter(): 121 | spaces.append(False) 122 | elif not is_aligned: 123 | spaces.append(True) 124 | else: 125 | next_token = tokens[i + 1] 126 | spaces.append(not span.startswith(next_token.form)) 127 | doc = Doc( 128 | vocab=self.vocab, 129 | words=words, 130 | spaces=spaces, 131 | pos=pos, 132 | tags=tags, 133 | morphs=morphs, 134 | lemmas=lemmas, 135 | deps=deps, 136 | heads=[head + i for i, head in enumerate(heads)], 137 | ) 138 | return doc 139 | 140 | def pipe( 141 | self, 142 | texts: Union[ 143 | Iterable[str], 144 | Iterable[List[str]], 145 | Iterable[List[List[str]]] 146 | ] 147 | ) -> Iterable[Doc]: 148 | """Tokenize a stream of texts. 149 | 150 | texts: A sequence of unicode texts (raw, presegmented or pretokenized). 151 | 152 | YIELDS: A sequence of Doc objects, in order. 153 | """ 154 | for text in texts: 155 | yield self(text) 156 | 157 | def _get_tokens_with_heads( 158 | self, 159 | udpipe_sents: List[Sentence] 160 | ) -> Tuple[List[str], List[int]]: 161 | """Flatten the tokens in the UDPipe sentence representations and compute 162 | the relative index of each token's syntactic head. 163 | 164 | udpipe_sents: The processed sentences. 165 | RETURNS: The tokens (words) and the relative head index for each token. 166 | """ 167 | tokens = [] 168 | heads = [] 169 | offset = 0 170 | for sentence in udpipe_sents: 171 | words = sentence.words[1:] # Ignore the artificial root word 172 | for token in words: 173 | # Calculate the absolute token index in the doc, 174 | # then the *relative* index of the head, -1 for zero-indexed 175 | # and if the governor is 0 (root), we leave it at 0 176 | if token.head: 177 | head = token.head + offset - len(tokens) - 1 178 | else: 179 | head = 0 180 | heads.append(head) 181 | tokens.append(token) 182 | offset += len(words) 183 | return tokens, heads 184 | 185 | def _check_aligned(self, text: str, tokens: List[Word]) -> bool: 186 | """Check if tokens are aligned with text. 187 | 188 | text: Text to check. 189 | tokens: Tokens to check. 190 | RETURNS: True iff text and tokens are aligned.
191 | """ 192 | token_texts = "".join(t.form for t in tokens) 193 | return re.sub(self._ws_pattern, "", text) == token_texts 194 | 195 | def to_disk(self, _path, **kwargs): 196 | return None 197 | 198 | def from_disk(self, _path, **kwargs): 199 | return self 200 | 201 | def to_bytes(self, **kwargs): 202 | return b"" 203 | 204 | def from_bytes(self, _bytes_data, **kwargs): 205 | return self 206 | -------------------------------------------------------------------------------- /spacy_udpipe/udpipe.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List, Optional, Union 3 | 4 | from ufal.udpipe import InputFormat 5 | from ufal.udpipe import Model 6 | from ufal.udpipe import OutputFormat, ProcessingError, Sentence 7 | 8 | from .utils import get_path 9 | 10 | 11 | def _default_model_meta(lang: str, name: str) -> Dict: 12 | return { 13 | "author": "Milan Straka & Jana Straková", 14 | "description": "UDPipe pretrained model.", 15 | "email": "straka@ufal.mff.cuni.cz", 16 | "lang": f"udpipe_{lang}", 17 | "license": "CC BY-NC-SA 4.0", 18 | "name": name, 19 | "parent_package": "spacy_udpipe", 20 | "pipeline": [ 21 | "Tokenizer", "Tagger", "Lemmatizer", "Parser" 22 | ], 23 | "source": "Universal Dependencies 2.5", 24 | "url": "http://ufal.mff.cuni.cz/udpipe", 25 | "version": "1.2.0" 26 | } 27 | 28 | 29 | class PretokenizedInputFormat: 30 | """Dummy tokenizer for pretokenized input. 31 | 32 | Execution speed might be slow compared to other UDPipe tokenizers 33 | due to pure Python implementation. Mocks InputFormat API to enable 34 | plug-and-play behaviour. 35 | """ 36 | 37 | def setText(self, text: str) -> None: 38 | """Store text in iterable lines for tokenization. 39 | 40 | text: string, where each sentence is on a line and tokens 41 | are separated by tabs. 42 | """ 43 | self.lines = iter(text.split("\n")) 44 | 45 | def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool: 46 | """Tokenize each line from stored lines and store tokens in sentence. 47 | 48 | sentence: UDPipe container for storing tokens. 49 | """ 50 | try: 51 | line = next(self.lines) 52 | except StopIteration: 53 | return False 54 | 55 | tokens = line.split("\t") 56 | num_tokens = len(tokens) 57 | for i, token in enumerate(tokens): 58 | word = sentence.addWord(token) 59 | if i < num_tokens - 1 and re.match(r"\W", tokens[i + 1]): 60 | # leave no space after current token iff next token 61 | # is non-alphanumeric (i.e. punctuation) 62 | word.setSpaceAfter(False) 63 | return True 64 | 65 | 66 | class UDPipeModel: 67 | 68 | def __init__( 69 | self, 70 | lang: str, 71 | path: Optional[str] = None, 72 | meta: Optional[Dict] = None 73 | ): 74 | """Load UDPipe model for given language. 75 | 76 | lang: ISO 639-1 language code or shorthand UDPipe model name. 77 | path: Path to UDPipe model. 78 | meta: Meta-information about the UDPipe model. 79 | """ 80 | path = path or get_path(lang=lang) 81 | self.model = Model.load(path) 82 | self._lang = lang.split("-")[0] 83 | self._path = path 84 | self._meta = meta or _default_model_meta( 85 | self._lang, self._path.split("/")[-1] 86 | ) 87 | 88 | def __reduce__(self): 89 | # required for multiprocessing on Windows 90 | return self.__class__, (self._lang, self._path, self._meta) 91 | 92 | def __call__( 93 | self, 94 | text: Union[ 95 | str, 96 | List[str], 97 | List[List[str]] 98 | ] 99 | ) -> List[Sentence]: 100 | """Tokenize, tag and parse the text and return it in an UDPipe 101 | representation. 
102 | 103 | text: Input text, can be presegmented or pretokenized: 104 | str : raw text, 105 | List[str] : presegmented text, 106 | List[List[str]] : pretokenized text. 107 | RETURNS: Processed sentences. 108 | """ 109 | sentences = self.tokenize(text) 110 | for s in sentences: 111 | self.tag(s) 112 | self.parse(s) 113 | return sentences 114 | 115 | def _read(self, text: str, input_format: str) -> List[Sentence]: 116 | """Convert the text to an UDPipe representation. 117 | 118 | text: Input text. 119 | input_format: Desired input format. 120 | RETURNS: Processed sentences. 121 | """ 122 | input_format.setText(text) 123 | error = ProcessingError() 124 | sentences = [] 125 | 126 | sentence = Sentence() 127 | while input_format.nextSentence(sentence, error): 128 | sentences.append(sentence) 129 | sentence = Sentence() 130 | if error.occurred(): 131 | raise Exception(error.message) 132 | 133 | return sentences 134 | 135 | def tokenize( 136 | self, 137 | text: Union[ 138 | str, 139 | List[str], 140 | List[List[str]] 141 | ] 142 | ) -> List[Sentence]: 143 | """Tokenize input text. 144 | 145 | text: Input text, can be presegmented or pretokenized: 146 | str : raw text, 147 | List[str] : presegmented text, 148 | List[List[str]] : pretokenized text. 149 | Note: both presegmented and pretokenized text can not contain 150 | newline or tab characters. 151 | RETURNS: Processed sentences. 152 | """ 153 | if isinstance(text, str): 154 | tokenizer = self.model.newTokenizer(self.model.DEFAULT) 155 | elif isinstance(text, list): 156 | if isinstance(text[0], list): 157 | text = "\n".join("\t".join(sent) for sent in text) 158 | tokenizer = PretokenizedInputFormat() 159 | else: 160 | text = "\n".join(text) 161 | tokenizer = self.model.newTokenizer( 162 | self.model.TOKENIZER_PRESEGMENTED 163 | ) 164 | else: 165 | raise TypeError( 166 | "\n".join( 167 | (f"Input type is {type(text)}, but must be one:", 168 | "str : raw text", 169 | "List[str] : presegmented text", 170 | "List[List[str]] : pretokenized text") 171 | ) 172 | ) 173 | if not tokenizer: 174 | raise Exception( 175 | "The model does not have a tokenizer " 176 | f"so it can not tokenize input: {text}" 177 | ) 178 | return self._read(text=text, input_format=tokenizer) 179 | 180 | def tag(self, sentence: Sentence) -> None: 181 | """Assign part-of-speech tags (inplace). 182 | 183 | sentence: Input sentence. 184 | """ 185 | self.model.tag(sentence, self.model.DEFAULT) 186 | 187 | def parse(self, sentence: Sentence) -> None: 188 | """Assign dependency parse relations (inplace). 189 | 190 | sentence: Input sentence. 191 | """ 192 | self.model.parse(sentence, self.model.DEFAULT) 193 | 194 | def read(self, text: str, in_format: str) -> List[Sentence]: 195 | """Load text in the given format and return it in an UDPipe 196 | representation. 197 | 198 | text: Text to load. 199 | in_format: 'conllu'|'horizontal'|'vertical'. 200 | RETURNS: Processed sentences. 201 | """ 202 | input_format = InputFormat.newInputFormat(in_format) 203 | if not input_format: 204 | raise Exception(f"Cannot create input format '{in_format}'") 205 | return self._read(text=text, input_format=input_format) 206 | 207 | def write(self, sentences: List[Sentence], out_format: str) -> str: 208 | """Write given sentences in the required output format. 209 | 210 | sentences: Input ufal.udpipe.Sentence-s. 211 | out_format: 'conllu'|'horizontal'|'vertical'. 212 | RETURNS: Sentences formatted in the out_format. 
213 | """ 214 | output_format = OutputFormat.newOutputFormat(out_format) 215 | output = "".join([output_format.writeSentence(s) for s in sentences]) 216 | output += output_format.finishDocument() 217 | 218 | return output 219 | -------------------------------------------------------------------------------- /spacy_udpipe/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import urllib.request 4 | from typing import Dict, Optional 5 | 6 | from spacy import blank, Language 7 | from spacy.util import get_lang_class 8 | 9 | from . import resources 10 | 11 | 12 | # Read files from inside a package: https://stackoverflow.com/a/20885799 13 | try: 14 | import importlib.resources as pkg_resources 15 | except ImportError: 16 | # Try backported to Python 3.7 `importlib_resources`. 17 | import importlib_resources as pkg_resources 18 | 19 | 20 | BASE_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131" # noqa: E501 21 | MODELS_DIR = os.getenv( 22 | "SPACY_UDPIPE_MODELS_DIR", 23 | os.path.join(os.path.expanduser("~/.cache"), "spacy_udpipe_models"), 24 | ) 25 | 26 | with pkg_resources.open_text(resources, "languages.json", encoding="utf-8") as f: 27 | LANGUAGES = json.load(f) 28 | 29 | 30 | def _check_language(lang: str) -> None: 31 | assert lang in LANGUAGES, f"'{lang}' language not available" 32 | 33 | 34 | def _check_models_dir(models_dir) -> None: 35 | assert os.path.exists(models_dir), "Download the pretrained model(s) first" 36 | 37 | 38 | def download(lang: str, models_dir: Optional[str] = None, verbose: bool = False) -> None: 39 | """Download the UDPipe pretrained model. 40 | 41 | lang: ISO 639-1 language code or shorthand UDPipe model name. 42 | models_dir: Directory to store a downloaded model. 43 | """ 44 | models_dir = models_dir or MODELS_DIR 45 | _check_language(lang) 46 | try: 47 | _check_models_dir(models_dir) 48 | except AssertionError: 49 | os.makedirs(models_dir) 50 | if LANGUAGES[lang] in os.listdir(models_dir): 51 | if verbose: 52 | print(f"Already downloaded a model for the '{lang}' language") 53 | return 54 | url = f"{BASE_URL}/{LANGUAGES[lang]}" 55 | filename = os.path.join(models_dir, LANGUAGES[lang]) 56 | urllib.request.urlretrieve(url=url, filename=filename) 57 | if verbose: 58 | print(f"Downloaded pre-trained UDPipe model for '{lang}' language") 59 | 60 | 61 | def get_path(lang: str, models_dir: Optional[str] = None) -> str: 62 | """Get the path to the UDPipe pretrained model if it was downloaded. 63 | 64 | lang: ISO 639-1 language code or shorthand UDPipe model name. 65 | models_dir: Directory with the pretrained models. 66 | RETURNS: The path to the UDPipe pretrained model. 67 | """ 68 | models_dir = models_dir or MODELS_DIR 69 | _check_language(lang) 70 | _check_models_dir(models_dir) 71 | if not LANGUAGES[lang] in os.listdir(models_dir): 72 | raise Exception( 73 | "Use spacy_udpipe.download to download the pre-trained" 74 | f" UDPipe model for the '{lang}' language" 75 | ) 76 | path = os.path.join(models_dir, LANGUAGES[lang]) 77 | return path 78 | 79 | 80 | def get_defaults(lang: str) -> Language.Defaults: 81 | """Get the language-specific defaults, if available in spaCy. This allows 82 | using lexical attribute getters that depend on static language data, e.g. 83 | Token.like_num, Token.is_stop, Doc.noun_chunks, etc. 84 | 85 | lang: ISO 639-1 language code or shorthand UDPipe model name. 86 | RETURNS: The language defaults. 
87 | """ 88 | try: 89 | lang_cls = get_lang_class(lang) 90 | return lang_cls.Defaults 91 | except ImportError: 92 | return Language.Defaults 93 | 94 | 95 | def load( 96 | lang: str = "" 97 | ) -> Language: 98 | """Convenience function for initializing the Language class that 99 | mimicks spacy.load. 100 | 101 | lang: ISO 639-1 language code or shorthand UDPipe model name. 102 | RETURNS: SpaCy Language object with UDPipeTokenizer. 103 | """ 104 | config = {"nlp": {"tokenizer": {}}} 105 | name = lang.split("-")[0] 106 | config["nlp"]["tokenizer"]["@tokenizers"] = "spacy_udpipe.PipelineAsTokenizer.v1" # noqa: E501 107 | # Set UDPipe options 108 | config["nlp"]["tokenizer"]["lang"] = lang 109 | config["nlp"]["tokenizer"]["path"] = get_path(lang) 110 | config["nlp"]["tokenizer"]["meta"] = None 111 | return blank(name, config=config) 112 | 113 | 114 | def load_from_path( 115 | lang: str, 116 | path: str, 117 | meta: Optional[Dict] = {"description": "custom model"}, 118 | ) -> Language: 119 | """Convenience function for initializing the Language class 120 | and loading a custom UDPipe model via the path argument. 121 | 122 | lang: ISO 639-1 language code or shorthand UDPipe model name. 123 | path: Path to the UDPipe model. 124 | meta: Optional meta-information about the UDPipe model. 125 | RETURNS: SpaCy Language object with UDPipeTokenizer. 126 | """ 127 | config = {"nlp": {"tokenizer": {}}} 128 | name = lang.split("-")[0] 129 | config["nlp"]["tokenizer"]["@tokenizers"] = "spacy_udpipe.PipelineAsTokenizer.v1" # noqa: E501 130 | # Set UDPipe options 131 | config["nlp"]["tokenizer"]["lang"] = lang 132 | config["nlp"]["tokenizer"]["path"] = path 133 | config["nlp"]["tokenizer"]["meta"] = meta 134 | return blank(name, config=config) 135 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TakeLab/spacy-udpipe/a38cd1e1c8dc5e18def177cd21d12922da833d6f/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session", autouse=True) 5 | def set_test_models_dir(tmp_path_factory, session_mocker): 6 | test_models_dir = tmp_path_factory.mktemp("models") 7 | session_mocker.patch("spacy_udpipe.utils.MODELS_DIR", str(test_models_dir)) 8 | -------------------------------------------------------------------------------- /tests/languages/en/test_en_language.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | from spacy.lang.en import EnglishDefaults 5 | from spacy.language import BaseDefaults 6 | from spacy_udpipe import download 7 | from spacy_udpipe import load 8 | from spacy_udpipe.utils import get_defaults 9 | 10 | 11 | @pytest.fixture 12 | def lang() -> str: 13 | return "en" 14 | 15 | 16 | @pytest.fixture(autouse=True) 17 | def download_lang(lang: str) -> None: 18 | download(lang=lang) 19 | 20 | 21 | def tags_equal(act: List[str], exp: List[str]) -> bool: 22 | """Check if each actual tag is equal to one or more expected tags.""" 23 | return all(a == e if isinstance(e, str) else a in e 24 | for a, e in zip(act, exp)) 25 | 26 | 27 | def test_get_defaults(lang: str) -> None: 28 | assert get_defaults(lang) == EnglishDefaults 29 | assert get_defaults("blabla") == 
BaseDefaults 30 | 31 | 32 | def test_spacy_udpipe_default(lang: str) -> None: 33 | nlp = load(lang=lang) 34 | 35 | text = "Testing one, two, three. This is a test." 36 | doc = nlp(text=text) 37 | 38 | pos_actual = ["PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM", 39 | "PUNCT", 40 | "PRON", "AUX", "DET", "NOUN", 41 | "PUNCT"] 42 | # test token attributes 43 | assert [t.text for t in doc] == ["Testing", "one", ",", "two", ",", "three", # noqa: E501 44 | ".", 45 | "This", "is", "a", "test", 46 | "."] 47 | assert [t.lemma_ for t in doc] == ["test", "one", ",", "two", ",", "three", 48 | ".", 49 | "this", "be", "a", "test", 50 | "."] 51 | assert tags_equal(act=pos_actual, exp=[t.pos_ for t in doc]) 52 | # CoNNL xpostag-s, custom for each UD treebank 53 | assert [t.tag_ for t in doc] == ["NNP", "CD", ",", "CD", ",", "CD", 54 | ".", 55 | "DT", "VBZ", "DT", "NN", 56 | "."] 57 | assert [t.dep_ for t in doc] == ["ROOT", "nummod", "punct", "appos", "punct", "nummod", # noqa: E501 58 | "punct", 59 | "nsubj", "cop", "det", "ROOT", 60 | "punct"] 61 | assert [t.is_sent_start for t in doc] == [True, False, False, False, False, False, False, # noqa: E501 62 | True, False, False, False, False] 63 | assert any([t.is_stop for t in doc]) 64 | assert [str(t.morph) for t in doc] == [ 65 | 'Number=Sing', 66 | 'NumType=Card', 67 | '', 68 | 'NumType=Card', 69 | '', 70 | 'NumType=Card', 71 | '', 72 | 'Number=Sing|PronType=Dem', 73 | 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 74 | 'Definite=Ind|PronType=Art', 75 | 'Number=Sing', 76 | '' 77 | ] 78 | # test doc attributes 79 | assert len(list(doc.sents)) == 2 80 | assert doc.has_annotation("TAG") 81 | assert doc.has_annotation("DEP") 82 | assert doc.has_annotation("SENT_START") 83 | assert doc.has_annotation("MORPH") 84 | # test pipe 85 | docs = list(nlp.pipe(["Testing one, two, three.", "This is a test."])) 86 | assert docs[0].text == "Testing one, two, three." 87 | assert [t.pos_ for t in docs[0]] == ["PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM", "PUNCT"] # noqa: E501 88 | assert docs[1].text == "This is a test." 89 | assert tags_equal(act=pos_actual[-5:], exp=[t.pos_ for t in docs[1]]) 90 | 91 | 92 | def test_spacy_udpipe_presegmented(lang: str) -> None: 93 | nlp = load(lang=lang) 94 | 95 | text = "Testing one, two, three. This is a test." 96 | doc = nlp(text=text) 97 | doc_json = doc.to_json() 98 | 99 | text_pre = ["Testing one, two, three.", "This is a test."] 100 | doc_pre = nlp(text=text_pre) 101 | doc_pre_json = doc_pre.to_json() 102 | 103 | assert doc_json["text"] == doc_pre_json["text"] 104 | assert doc_json["sents"] == doc_pre_json["sents"] 105 | assert doc_json["tokens"] == doc_pre_json["tokens"] 106 | 107 | 108 | def test_spacy_udpipe_pretokenized(lang: str) -> None: 109 | nlp = load(lang=lang) 110 | 111 | text = "Testing one, two, three. This is a test." 
112 | doc = nlp(text=text) 113 | doc_json = doc.to_json() 114 | 115 | text_pre = [ 116 | ["Testing", "one", ",", "two", ",", "three", "."], 117 | ["This", "is", "a", "test", "."] 118 | ] 119 | doc_pre = nlp(text=text_pre) 120 | doc_pre_json = doc_pre.to_json() 121 | 122 | assert doc_json["text"] == doc_pre_json["text"] 123 | assert doc_json["sents"] == doc_pre_json["sents"] 124 | assert doc_json["tokens"] == doc_pre_json["tokens"] 125 | -------------------------------------------------------------------------------- /tests/languages/fr/test_fr_language.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy.lang.fr import FrenchDefaults 3 | from spacy.language import BaseDefaults 4 | from spacy_udpipe import download, load 5 | from spacy_udpipe.utils import get_defaults 6 | 7 | 8 | @pytest.fixture 9 | def lang() -> str: 10 | return "fr" 11 | 12 | 13 | @pytest.fixture(autouse=True) 14 | def download_lang(lang: str) -> None: 15 | download(lang=lang) 16 | 17 | 18 | def test_get_defaults(lang: str) -> None: 19 | assert get_defaults(lang) == FrenchDefaults 20 | assert get_defaults("blabla") == BaseDefaults 21 | 22 | 23 | def test_spacy_udpipe(lang: str) -> None: 24 | nlp = load(lang=lang) 25 | 26 | text = "Attention aux articles contractés!" 27 | doc = nlp(text=text) 28 | 29 | assert [t.orth_ for t in doc] == ["Attention", "à", "les", "articles", "contractés", "!"] 30 | 31 | pos = [{"INTJ", "NOUN"}, {"ADP"}, {"DET"}, {"NOUN"}, {"VERB", "ADJ"}, {"PUNCT"}] 32 | for i, t in enumerate(doc): 33 | assert t.pos_ in pos[i] 34 | 35 | assert [t.head.i for t in doc] == [0, 3, 3, 0, 3, 0] 36 | 37 | dep = [{"ROOT", "root"}, {"case"}, {"det"}, {"nmod", "obl", "obl:arg"}, {"acl", "amod"}, {"punct"}] 38 | for i, t in enumerate(doc): 39 | assert t.dep_ in dep[i] 40 | -------------------------------------------------------------------------------- /tests/test_spacy_udpipe.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | import spacy 5 | 6 | from spacy_udpipe import download, load 7 | 8 | 9 | @pytest.fixture 10 | def lang() -> str: 11 | return "en" 12 | 13 | 14 | @pytest.fixture(autouse=True) 15 | def download_lang(lang: str) -> None: 16 | download(lang=lang) 17 | 18 | 19 | def test_serialization(lang: str) -> None: 20 | with tempfile.TemporaryDirectory() as tdir: 21 | nlp = load(lang=lang) 22 | doc = nlp("A simple sentence.") 23 | nlp.to_disk(tdir) 24 | del nlp 25 | 26 | nlp = spacy.load(tdir) 27 | same_doc = nlp("A simple sentence.") 28 | 29 | assert doc.to_json() == same_doc.to_json() 30 | 31 | 32 | def test_pipe(lang: str) -> None: 33 | nlp = load(lang=lang) 34 | 35 | text = "spacy-udpipe still does not support multiprocess execution." 36 | doc = nlp(text) 37 | del nlp 38 | 39 | nlp = load(lang=lang) 40 | texts = [text for _ in range(2)] 41 | docs = list(nlp.pipe(texts, n_process=-1)) 42 | 43 | assert len(docs) == len(texts) 44 | assert docs[0].to_json() == doc.to_json() 45 | assert docs[-1].to_json() == doc.to_json() 46 | --------------------------------------------------------------------------------