├── .github └── workflows │ ├── pythonpackage.yml │ └── pythonpublish.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── setup.py ├── spacy_udpipe ├── __init__.py ├── resources │ ├── __init__.py │ └── languages.json ├── tokenizer.py ├── udpipe.py └── utils.py └── tests ├── __init__.py ├── conftest.py ├── languages ├── en │ └── test_en_language.py └── fr │ └── test_fr_language.py └── test_spacy_udpipe.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | pull_request: 5 | types: [opened, synchronize, reopened] 6 | branches: 7 | - master 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | max-parallel: 4 15 | matrix: 16 | os: [ubuntu-latest, windows-latest] 17 | python-version: [3.6, 3.7, 3.8, 3.9] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install package and dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install .[dev] 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 --filename=*.py --count --show-source --max-line-length=119 --statistics 33 | # exit-zero treats all errors as warnings 34 | flake8 --filename=*.py --count --exit-zero --max-line-length=119 --max-complexity=10 --statistics 35 | - name: Test with pytest 36 | run: | 37 | python -m pytest -vvv tests 38 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.6' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore models dir 2 | spacy_udpipe/models 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2021 Text Analysis and Knowledge Engineering Lab (TakeLab) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lint test 2 | 3 | # Lint source code 4 | 5 | lint: 6 | # stop the build if there are Python syntax errors or undefined names 7 | flake8 --filename=*.py --count --show-source --max-line-length=119 --statistics 8 | # exit-zero treats all errors as warnings 9 | flake8 --filename=*.py --count --exit-zero --max-line-length=119 --max-complexity=10 --statistics 10 | 11 | # Run tests 12 | 13 | test: 14 | python -m pytest -vvv tests 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spaCy + UDPipe 2 | 3 | This package wraps the fast and efficient [UDPipe](http://ufal.mff.cuni.cz/udpipe) language-agnostic NLP pipeline 4 | (via its [Python bindings](https://github.com/ufal/udpipe/tree/master/bindings/python)), so you can use 5 | [UDPipe pre-trained models](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3131) as a [spaCy](https://spacy.io/) pipeline for 50+ languages out-of-the-box. 6 | Inspired by [spacy-stanza](https://github.com/explosion/spacy-stanza), this package offers slightly less accurate 7 | models that are in turn much faster (see benchmarks for [UDPipe](https://ufal.mff.cuni.cz/udpipe/models#universal_dependencies_25_models_performance) and [Stanza](https://stanfordnlp.github.io/stanza/performance.html)). 8 | 9 | ## Installation 10 | 11 | Use the package manager [pip](https://pip.pypa.io/en/stable/) to install spacy-udpipe. 12 | 13 | ```bash 14 | pip install spacy-udpipe 15 | ``` 16 | 17 | After installation, use `spacy_udpipe.download()` to download the pre-trained model for the desired language. 18 | 19 | A full list of pre-trained UDPipe models for supported languages can be found in [`languages.json`](https://github.com/TakeLab/spacy-udpipe/blob/master/spacy_udpipe/resources/languages.json). 20 | 21 | ## Usage 22 | `spacy_udpipe.load()` returns a standard spaCy [`Language` object](https://spacy.io/api/language), i.e., the object you can use to process text and create a [`Doc` object](https://spacy.io/api/doc). 23 | 24 | ```python 25 | import spacy_udpipe 26 | 27 | spacy_udpipe.download("en") # download English model 28 | 29 | text = "Wikipedia is a free online encyclopedia, created and edited by volunteers around the world." 30 | nlp = spacy_udpipe.load("en") 31 | 32 | doc = nlp(text) 33 | for token in doc: 34 | print(token.text, token.lemma_, token.pos_, token.dep_) 35 | 36 | ``` 37 | As all attributes are computed once and set in the custom [`Tokenizer`](https://spacy.io/api/tokenizer), the `Language.pipeline` is empty. 38 | 39 | The type of `text` can be one of the following: 40 | * unprocessed: `str`, 41 | * presegmented: `List[str]`, 42 | * pretokenized: `List[List[str]]`. 43 | 44 | ### Loading a custom model 45 | The following code snippet demonstrates how to load a custom `UDPipe` model (for the Croatian language): 46 | ```python 47 | import spacy_udpipe 48 | 49 | nlp = spacy_udpipe.load_from_path(lang="hr", 50 | path="./custom_croatian.udpipe", 51 | meta={"description": "Custom 'hr' model"}) 52 | text = "Wikipedija je enciklopedija slobodnog sadržaja." 53 | 54 | doc = nlp(text) 55 | for token in doc: 56 | print(token.text, token.lemma_, token.pos_, token.dep_) 57 | ``` 58 | This can be done for any of the languages supported by spaCy.
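Besides plain ISO 639-1 codes, `spacy_udpipe.download()` and `spacy_udpipe.load()` also accept the treebank-specific shorthand names listed in [`languages.json`](https://github.com/TakeLab/spacy-udpipe/blob/master/spacy_udpipe/resources/languages.json), e.g. `en-lines` or `fr-sequoia`. A minimal sketch (the example sentence is arbitrary):

```python
import spacy_udpipe

# "en-lines" selects the model trained on the English LinES treebank
# instead of the English EWT model that plain "en" maps to.
spacy_udpipe.download("en-lines")
nlp = spacy_udpipe.load("en-lines")

doc = nlp("Wikipedia is a free online encyclopedia.")
print([(token.text, token.pos_, token.dep_) for token in doc])
```
This is handy when a language has several UD treebanks and the default one does not fit your domain.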
For an exhaustive list, see [spaCy languages](https://spacy.io/usage/models#languages). 59 | 60 | ## Contributing 61 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. 62 | 63 | Please make sure to update the tests as appropriate. Tests are run automatically for each pull request on the master branch. 64 | To start the tests locally, first, install the package with `pip install -e .[dev]`, then run [`pytest`](https://docs.pytest.org/en/latest/contents.html) in the root source directory as follows: 65 | ```bash 66 | make test 67 | ``` 68 | Additionally, run [`flake8`](https://flake8.pycqa.org/en/latest) with the following command to check for coding mistakes: 69 | ```bash 70 | make lint 71 | ``` 72 | 73 | ## License 74 | * Source code: [MIT](https://choosealicense.com/licenses/mit/) © Text Analysis and Knowledge Engineering Lab (TakeLab) 75 | * Available pre-trained models: [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) 76 | 77 | ## Project status 78 | Maintained by [Text Analysis and Knowledge Engineering Lab (TakeLab)](http://takelab.fer.hr/). 79 | 80 | ## Notes 81 | 82 | * Known possible issues: 83 | * Tag map 84 | 85 | `Token.tag_` is a [CoNLL](https://universaldependencies.org/format.html) XPOS tag (language-specific part-of-speech tag), defined for each language separately by the corresponding [Universal Dependencies](https://universaldependencies.org/) treebank. Mappings between XPOS and Universal Dependencies POS tags should be defined in a `TAG_MAP` dictionary (located in language-specific `tag_map.py` files), along with optional morphological features. See [spaCy tag map](https://spacy.io/usage/adding-languages#tag-map) for more details. 86 | * Syntax iterators 87 | 88 | In order to extract `Doc.noun_chunks`, a proper syntax iterator implementation for the language of interest is required. For more details, please see [spaCy syntax iterators](https://spacy.io/usage/adding-languages#syntax-iterators). 89 | * Other language-specific issues 90 | 91 | A quick way to check language-specific defaults in [spaCy](https://spacy.io) is to visit [spaCy language support](https://spacy.io/usage/models#languages). Also, please see [spaCy language data](https://spacy.io/usage/adding-languages#language-data) for details regarding other language-specific data. 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | 6 | def get_version(fname: str) -> str: 7 | full_path = os.path.join( 8 | os.path.abspath(os.path.dirname(__file__)), 9 | "spacy_udpipe", 10 | fname 11 | ) 12 | with open(full_path, "r", encoding="utf-8") as fp: 13 | for line in fp: 14 | if line.startswith("__version__"): 15 | delim = '"' if '"' in line else "'" 16 | return line.split(delim)[1] 17 | else: 18 | raise RuntimeError( 19 | "Unable to find version string." 
20 | ) 21 | 22 | 23 | URL = "https://github.com/TakeLab/spacy-udpipe" 24 | 25 | with open("README.md", "r", encoding="utf-8") as f: 26 | readme = f.read() 27 | 28 | setup( 29 | name="spacy_udpipe", 30 | version=get_version("__init__.py"), 31 | description="Use fast UDPipe models directly in spaCy", 32 | long_description=readme, 33 | long_description_content_type="text/markdown", 34 | url=URL, 35 | author="TakeLab", 36 | author_email="takelab@fer.hr", 37 | license="MIT", 38 | keywords="nlp udpipe spacy python", 39 | packages=find_packages(), 40 | install_requires=[ 41 | "spacy>=3.0.0,<4.0.0", 42 | "ufal.udpipe>=1.2.0", 43 | "importlib_resources;python_version<'3.7'", 44 | ], 45 | extras_require={ 46 | "dev": ["flake8", "pytest", "pytest-mock"], 47 | }, 48 | python_requires=">=3.6", 49 | entry_points={ 50 | "spacy_tokenizers": [ 51 | "spacy_udpipe.PipelineAsTokenizer.v1 = spacy_udpipe:tokenizer.create_tokenizer", 52 | ] 53 | }, 54 | tests_require=["pytest>=5.0.0"], 55 | package_data={"spacy_udpipe": ["resources/languages.json"], }, 56 | classifiers=[ 57 | "Development Status :: 4 - Beta", 58 | "Intended Audience :: Developers", 59 | "Intended Audience :: Education", 60 | "Intended Audience :: Science/Research", 61 | "Programming Language :: Python :: 3", 62 | "License :: OSI Approved :: MIT License", 63 | "Operating System :: OS Independent", 64 | "Topic :: Software Development", 65 | "Topic :: Software Development :: Libraries", 66 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 67 | "Topic :: Text Processing", 68 | ], 69 | project_urls={ 70 | "SpaCy": "https://spacy.io/", 71 | "TakeLab": "http://takelab.fer.hr/", 72 | "UDPipe": "http://ufal.mff.cuni.cz/udpipe", 73 | "Source": URL, 74 | "Tracker": URL + "/issues", 75 | }, 76 | zip_safe=True 77 | ) 78 | -------------------------------------------------------------------------------- /spacy_udpipe/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | __all__ = [ 3 | "download", "load", "load_from_path", 4 | "UDPipeTokenizer", "UDPipeModel" 5 | ] 6 | 7 | from .utils import download, load, load_from_path 8 | from .tokenizer import UDPipeTokenizer 9 | from .udpipe import UDPipeModel 10 | -------------------------------------------------------------------------------- /spacy_udpipe/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TakeLab/spacy-udpipe/a38cd1e1c8dc5e18def177cd21d12922da833d6f/spacy_udpipe/resources/__init__.py -------------------------------------------------------------------------------- /spacy_udpipe/resources/languages.json: -------------------------------------------------------------------------------- 1 | { 2 | "af": "afrikaans-afribooms-ud-2.5-191206.udpipe", 3 | "hy": "armenian-armtdp-ud-2.5-191206.udpipe", 4 | "eu": "basque-bdt-ud-2.5-191206.udpipe", 5 | "be": "belarusian-hse-ud-2.5-191206.udpipe", 6 | "grc": "ancient_greek-perseus-ud-2.5-191206.udpipe", 7 | "grc-perseus": "ancient_greek-perseus-ud-2.5-191206.udpipe", 8 | "grc-proiel": "ancient_greek-proiel-ud-2.5-191206.udpipe", 9 | "ca": "catalan-ancora-ud-2.5-191206.udpipe", 10 | "zh": "chinese-gsd-ud-2.5-191206.udpipe", 11 | "zh-gsdsimp": "chinese-gsdsimp-ud-2.5-191206.udpipe", 12 | "lzh": "classical_chinese-kyoto-ud-2.5-191206.udpipe", 13 | "ar": "arabic-padt-ud-2.5-191206.udpipe", 14 | "bg": "bulgarian-btb-ud-2.5-191206.udpipe", 15 | "cs": "czech-pdt-ud-2.5-191206.udpipe", 16 | "cs-pdt":
"czech-pdt-ud-2.5-191206.udpipe", 17 | "cs-cac": "czech-cac-ud-2.5-191206.udpipe", 18 | "cs-fictree": "czech-fictree-ud-2.5-191206.udpipe", 19 | "cs-cltt": "czech-cltt-ud-2.5-191206.udpipe", 20 | "hr": "croatian-set-ud-2.5-191206.udpipe", 21 | "coptic": "coptic-scriptorium-ud-2.5-191206.udpipe", 22 | "nl": "dutch-alpino-ud-2.5-191206.udpipe", 23 | "nl-alpino": "dutch-alpino-ud-2.5-191206.udpipe", 24 | "nl-lassysmall": "dutch-lassysmall-ud-2.5-191206.udpipe", 25 | "da": "danish-ddt-ud-2.5-191206.udpipe", 26 | "et": "estonian-edt-ud-2.5-191206.udpipe", 27 | "et-edt": "estonian-edt-ud-2.5-191206.udpipe", 28 | "et-ewt": "estonian-ewt-ud-2.5-191206.udpipe", 29 | "en": "english-ewt-ud-2.5-191206.udpipe", 30 | "en-ewt": "english-ewt-ud-2.5-191206.udpipe", 31 | "en-gum": "english-gum-ud-2.5-191206.udpipe", 32 | "en-lines": "english-lines-ud-2.5-191206.udpipe", 33 | "en-partut": "english-partut-ud-2.5-191206.udpipe", 34 | "fr": "french-gsd-ud-2.5-191206.udpipe", 35 | "fr-gsd": "french-gsd-ud-2.5-191206.udpipe", 36 | "fr-partut": "french-partut-ud-2.5-191206.udpipe", 37 | "fr-sequoia": "french-sequoia-ud-2.5-191206.udpipe", 38 | "fr-spoken": "french-spoken-ud-2.5-191206.udpipe", 39 | "fro": "old_french-srcmf-ud-2.5-191206.udpipe", 40 | "fi": "finnish-tdt-ud-2.5-191206.udpipe", 41 | "fi-tdt": "finnish-tdt-ud-2.5-191206.udpipe", 42 | "fi-ftb": "finnish-ftb-ud-2.5-191206.udpipe", 43 | "de": "german-gsd-ud-2.5-191206.udpipe", 44 | "de-hdt": "german-hdt-ud-2.5-191206.udpipe", 45 | "gl": "galician-ctg-ud-2.5-191206.udpipe", 46 | "gl-ctg": "galician-ctg-ud-2.5-191206.udpipe", 47 | "gl-treegal": "galician-treegal-ud-2.5-191206.udpipe", 48 | "el": "greek-gdt-ud-2.5-191206.udpipe", 49 | "got": "gothic-proiel-ud-2.5-191206.udpipe", 50 | "hu": "hungarian-szeged-ud-2.5-191206.udpipe", 51 | "hi": "hindi-hdtb-ud-2.5-191206.udpipe", 52 | "he": "hebrew-htb-ud-2.5-191206.udpipe", 53 | "it": "italian-isdt-ud-2.5-191206.udpipe", 54 | "it-isdt": "italian-isdt-ud-2.5-191206.udpipe", 55 | "it-partut": "italian-partut-ud-2.5-191206.udpipe", 56 | "it-postwita": "italian-postwita-ud-2.5-191206.udpipe", 57 | "it-vit": "italian-vit-ud-2.5-191206.udpipe", 58 | "it-twittiro": "italian-twittiro-ud-2.5-191206.udpipe", 59 | "ga": "irish-idt-ud-2.5-191206.udpipe", 60 | "id": "indonesian-gsd-ud-2.5-191206.udpipe", 61 | "la": "latin-ittb-ud-2.5-191206.udpipe", 62 | "la-ittb": "latin-ittb-ud-2.5-191206.udpipe", 63 | "la-proiel": "latin-proiel-ud-2.5-191206.udpipe", 64 | "la-perseus": "latin-perseus-ud-2.5-191206.udpipe", 65 | "ko": "korean-kaist-ud-2.5-191206.udpipe", 66 | "ko-kaist": "korean-kaist-ud-2.5-191206.udpipe", 67 | "ko-gsd": "korean-gsd-ud-2.5-191206.udpipe", 68 | "ja": "japanese-gsd-ud-2.5-191206.udpipe", 69 | "lt": "lithuanian-alksnis-ud-2.5-191206.udpipe", 70 | "lt-alksnis": "lithuanian-alksnis-ud-2.5-191206.udpipe", 71 | "lt-hse": "lithuanian-hse-ud-2.5-191206.udpipe", 72 | "lv": "latvian-lvtb-ud-2.5-191206.udpipe", 73 | "nn": "norwegian-nynorsk-ud-2.5-191206.udpipe", 74 | "nn-nynorsk": "norwegian-nynorsk-ud-2.5-191206.udpipe", 75 | "nn-nynorsklia": "norwegian-nynorsklia-ud-2.5-191206.udpipe", 76 | "nb": "norwegian-bokmaal-ud-2.5-191206.udpipe", 77 | "se": "north_sami-giella-ud-2.5-191206.udpipe", 78 | "mr": "marathi-ufal-ud-2.5-191206.udpipe", 79 | "mt": "maltese-mudt-ud-2.5-191206.udpipe", 80 | "fa": "persian-seraji-ud-2.5-191206.udpipe", 81 | "cu": "old_church_slavonic-proiel-ud-2.5-191206.udpipe", 82 | "ro": "romanian-rrt-ud-2.5-191206.udpipe", 83 | "ro-rrt": "romanian-rrt-ud-2.5-191206.udpipe", 84 | 
"ro-nonstandard": "romanian-nonstandard-ud-2.5-191206.udpipe", 85 | "pt": "portuguese-gsd-ud-2.5-191206.udpipe", 86 | "pt-gsd": "portuguese-gsd-ud-2.5-191206.udpipe", 87 | "pt-bosque": "portuguese-bosque-ud-2.5-191206.udpipe", 88 | "pl": "polish-pdb-ud-2.5-191206.udpipe", 89 | "pl-pdb": "polish-pdb-ud-2.5-191206.udpipe", 90 | "pl-lfg": "polish-lfg-ud-2.5-191206.udpipe", 91 | "sr": "serbian-set-ud-2.5-191206.udpipe", 92 | "ru": "russian-syntagrus-ud-2.5-191206.udpipe", 93 | "ru-syntagrus": "russian-syntagrus-ud-2.5-191206.udpipe", 94 | "ru-gsd": "russian-gsd-ud-2.5-191206.udpipe", 95 | "ru-taiga": "russian-taiga-ud-2.5-191206.udpipe", 96 | "orv": "old_russian-torot-ud-2.5-191206.udpipe", 97 | "gd": "scottish_gaelic-arcosg-ud-2.5-191206.udpipe", 98 | "es": "spanish-ancora-ud-2.5-191206.udpipe", 99 | "es-ancora": "spanish-ancora-ud-2.5-191206.udpipe", 100 | "es-gsd": "spanish-gsd-ud-2.5-191206.udpipe", 101 | "sl": "slovenian-ssj-ud-2.5-191206.udpipe", 102 | "sl-ssj": "slovenian-ssj-ud-2.5-191206.udpipe", 103 | "sl-sst": "slovenian-sst-ud-2.5-191206.udpipe", 104 | "sk": "slovak-snk-ud-2.5-191206.udpipe", 105 | "uk": "ukrainian-iu-ud-2.5-191206.udpipe", 106 | "tr": "turkish-imst-ud-2.5-191206.udpipe", 107 | "te": "telugu-mtg-ud-2.5-191206.udpipe", 108 | "ta": "tamil-ttb-ud-2.5-191206.udpipe", 109 | "sv": "swedish-talbanken-ud-2.5-191206.udpipe", 110 | "sv-talbanken": "swedish-talbanken-ud-2.5-191206.udpipe", 111 | "sv-lines": "swedish-lines-ud-2.5-191206.udpipe", 112 | "wo": "wolof-wtb-ud-2.5-191206.udpipe", 113 | "vi": "vietnamese-vtb-ud-2.5-191206.udpipe", 114 | "ug": "uyghur-udt-ud-2.5-191206.udpipe", 115 | "ur": "urdu-udtb-ud-2.5-191206.udpipe" 116 | } 117 | -------------------------------------------------------------------------------- /spacy_udpipe/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional, Iterable, List, Tuple, Union, Dict 3 | 4 | from spacy.tokens import Doc 5 | from spacy.vocab import Vocab 6 | from spacy.util import registry 7 | from ufal.udpipe import Sentence, Word 8 | 9 | from .udpipe import UDPipeModel 10 | from .utils import get_path 11 | 12 | 13 | @registry.tokenizers("spacy_udpipe.PipelineAsTokenizer.v1") 14 | def create_tokenizer( 15 | lang: str = "", 16 | path: Optional[str] = None, 17 | meta: Optional[Dict] = None 18 | ): 19 | def tokenizer_factory( 20 | nlp, 21 | lang=lang, 22 | path=path, 23 | meta=meta 24 | ) -> UDPipeTokenizer: 25 | model = UDPipeModel( 26 | lang=lang, 27 | path=path or get_path(lang), 28 | meta=meta 29 | ) 30 | return UDPipeTokenizer( 31 | model=model, 32 | vocab=nlp.vocab 33 | ) 34 | 35 | return tokenizer_factory 36 | 37 | 38 | def _spacy_dep(d: str) -> str: 39 | # Ensure labels match with SpaCy 40 | return d.upper() if d == "root" else d 41 | 42 | 43 | class UDPipeTokenizer: 44 | """Custom Tokenizer which sets all the attributes because 45 | the UDPipe pipeline runs only once and does not 46 | contain separate spaCy pipeline components. 47 | """ 48 | 49 | _ws_pattern = re.compile(r"\s+") 50 | 51 | def __init__( 52 | self, 53 | model: UDPipeModel, 54 | vocab: Vocab 55 | ): 56 | """Initialize the tokenizer. 57 | 58 | model: The initialized UDPipe model. 59 | vocab: The vocabulary to use. 60 | """ 61 | self.model = model 62 | self.vocab = vocab 63 | 64 | def __call__( 65 | self, 66 | text: Union[ 67 | str, 68 | List[str], 69 | List[List[str]] 70 | ] 71 | ) -> Doc: 72 | """Convert input text to a spaCy Doc. 73 | 74 | text: The text to process. 
It can be presegmented or pretokenized: 75 | str : raw text, 76 | List[str] : presegmented text, 77 | List[List[str]] : pretokenized text. 78 | RETURNS: The spaCy Doc object. 79 | """ 80 | if not text: 81 | return Doc(vocab=self.vocab) 82 | 83 | udpipe_sents = self.model(text=text) if text else [Sentence()] 84 | text = " ".join(s.getText() for s in udpipe_sents) 85 | tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents) 86 | 87 | words = [] 88 | spaces = [] 89 | pos = [] 90 | tags = [] 91 | morphs = [] 92 | deps = [] 93 | lemmas = [] 94 | offset = 0 95 | is_aligned = self._check_aligned(text=text, tokens=tokens) 96 | if not is_aligned: 97 | text = "" 98 | for token in tokens: 99 | text += token.form 100 | if token.getSpaceAfter(): 101 | text += " " 102 | for i, token in enumerate(tokens): 103 | span = text[offset:] 104 | if not span: 105 | break 106 | while len(span) and span[0].isspace(): 107 | # If we encounter leading whitespace, skip one character ahead 108 | offset += 1 109 | span = text[offset:] 110 | words.append(token.form) 111 | # Make sure all strings are in the vocabulary 112 | pos.append(token.upostag or "") 113 | morphs.append(token.feats or "") 114 | # CoNLL xpostag-s, custom for each UD treebank 115 | tags.append(token.xpostag or "") 116 | deps.append(_spacy_dep(token.deprel) or "") 117 | lemmas.append(token.lemma or "") 118 | offset += len(token.form) 119 | span = text[offset:] 120 | if i == len(tokens) - 1 or not token.getSpaceAfter(): 121 | spaces.append(False) 122 | elif not is_aligned: 123 | spaces.append(True) 124 | else: 125 | next_token = tokens[i + 1] 126 | spaces.append(not span.startswith(next_token.form)) 127 | doc = Doc( 128 | vocab=self.vocab, 129 | words=words, 130 | spaces=spaces, 131 | pos=pos, 132 | tags=tags, 133 | morphs=morphs, 134 | lemmas=lemmas, 135 | deps=deps, 136 | heads=[head + i for i, head in enumerate(heads)], 137 | ) 138 | return doc 139 | 140 | def pipe( 141 | self, 142 | texts: Union[ 143 | Iterable[str], 144 | Iterable[List[str]], 145 | Iterable[List[List[str]]] 146 | ] 147 | ) -> Iterable[Doc]: 148 | """Tokenize a stream of texts. 149 | 150 | texts: A sequence of unicode texts (raw, presegmented or pretokenized). 151 | 152 | YIELDS: A sequence of Doc objects, in order. 153 | """ 154 | for text in texts: 155 | yield self(text) 156 | 157 | def _get_tokens_with_heads( 158 | self, 159 | udpipe_sents: List[Sentence] 160 | ) -> Tuple[List[str], List[int]]: 161 | """Flatten the tokens in the UDPipe sentence representations and compute 162 | the relative index of each token's syntactic head. 163 | 164 | udpipe_sents: The processed sentences. 165 | RETURNS: The tokens (words) and the relative head index for each token. 166 | """ 167 | tokens = [] 168 | heads = [] 169 | offset = 0 170 | for sentence in udpipe_sents: 171 | words = sentence.words[1:] # Ignore the artificial root word 172 | for token in words: 173 | # Calculate the absolute token index in the doc, 174 | # then the *relative* index of the head, -1 for zero-indexed 175 | # and if the governor is 0 (root), we leave it at 0 176 | if token.head: 177 | head = token.head + offset - len(tokens) - 1 178 | else: 179 | head = 0 180 | heads.append(head) 181 | tokens.append(token) 182 | offset += len(words) 183 | return tokens, heads 184 | 185 | def _check_aligned(self, text: str, tokens: List[Word]) -> bool: 186 | """Check if tokens are aligned with text. 187 | 188 | text: Text to check. 189 | tokens: Tokens to check. 190 | RETURNS: True iff text and tokens are aligned.
191 | """ 192 | token_texts = "".join(t.form for t in tokens) 193 | return re.sub(self._ws_pattern, "", text) == token_texts 194 | 195 | def to_disk(self, _path, **kwargs): 196 | return None 197 | 198 | def from_disk(self, _path, **kwargs): 199 | return self 200 | 201 | def to_bytes(self, **kwargs): 202 | return b"" 203 | 204 | def from_bytes(self, _bytes_data, **kwargs): 205 | return self 206 | -------------------------------------------------------------------------------- /spacy_udpipe/udpipe.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List, Optional, Union 3 | 4 | from ufal.udpipe import InputFormat 5 | from ufal.udpipe import Model 6 | from ufal.udpipe import OutputFormat, ProcessingError, Sentence 7 | 8 | from .utils import get_path 9 | 10 | 11 | def _default_model_meta(lang: str, name: str) -> Dict: 12 | return { 13 | "author": "Milan Straka & Jana Straková", 14 | "description": "UDPipe pretrained model.", 15 | "email": "straka@ufal.mff.cuni.cz", 16 | "lang": f"udpipe_{lang}", 17 | "license": "CC BY-NC-SA 4.0", 18 | "name": name, 19 | "parent_package": "spacy_udpipe", 20 | "pipeline": [ 21 | "Tokenizer", "Tagger", "Lemmatizer", "Parser" 22 | ], 23 | "source": "Universal Dependencies 2.5", 24 | "url": "http://ufal.mff.cuni.cz/udpipe", 25 | "version": "1.2.0" 26 | } 27 | 28 | 29 | class PretokenizedInputFormat: 30 | """Dummy tokenizer for pretokenized input. 31 | 32 | Execution speed might be slow compared to other UDPipe tokenizers 33 | due to pure Python implementation. Mocks InputFormat API to enable 34 | plug-and-play behaviour. 35 | """ 36 | 37 | def setText(self, text: str) -> None: 38 | """Store text in iterable lines for tokenization. 39 | 40 | text: string, where each sentence is on a line and tokens 41 | are separated by tabs. 42 | """ 43 | self.lines = iter(text.split("\n")) 44 | 45 | def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool: 46 | """Tokenize each line from stored lines and store tokens in sentence. 47 | 48 | sentence: UDPipe container for storing tokens. 49 | """ 50 | try: 51 | line = next(self.lines) 52 | except StopIteration: 53 | return False 54 | 55 | tokens = line.split("\t") 56 | num_tokens = len(tokens) 57 | for i, token in enumerate(tokens): 58 | word = sentence.addWord(token) 59 | if i < num_tokens - 1 and re.match(r"\W", tokens[i + 1]): 60 | # leave no space after current token iff next token 61 | # is non-alphanumeric (i.e. punctuation) 62 | word.setSpaceAfter(False) 63 | return True 64 | 65 | 66 | class UDPipeModel: 67 | 68 | def __init__( 69 | self, 70 | lang: str, 71 | path: Optional[str] = None, 72 | meta: Optional[Dict] = None 73 | ): 74 | """Load UDPipe model for given language. 75 | 76 | lang: ISO 639-1 language code or shorthand UDPipe model name. 77 | path: Path to UDPipe model. 78 | meta: Meta-information about the UDPipe model. 79 | """ 80 | path = path or get_path(lang=lang) 81 | self.model = Model.load(path) 82 | self._lang = lang.split("-")[0] 83 | self._path = path 84 | self._meta = meta or _default_model_meta( 85 | self._lang, self._path.split("/")[-1] 86 | ) 87 | 88 | def __reduce__(self): 89 | # required for multiprocessing on Windows 90 | return self.__class__, (self._lang, self._path, self._meta) 91 | 92 | def __call__( 93 | self, 94 | text: Union[ 95 | str, 96 | List[str], 97 | List[List[str]] 98 | ] 99 | ) -> List[Sentence]: 100 | """Tokenize, tag and parse the text and return it in an UDPipe 101 | representation. 
102 | 103 | text: Input text, can be presegmented or pretokenized: 104 | str : raw text, 105 | List[str] : presegmented text, 106 | List[List[str]] : pretokenized text. 107 | RETURNS: Processed sentences. 108 | """ 109 | sentences = self.tokenize(text) 110 | for s in sentences: 111 | self.tag(s) 112 | self.parse(s) 113 | return sentences 114 | 115 | def _read(self, text: str, input_format: str) -> List[Sentence]: 116 | """Convert the text to an UDPipe representation. 117 | 118 | text: Input text. 119 | input_format: Desired input format. 120 | RETURNS: Processed sentences. 121 | """ 122 | input_format.setText(text) 123 | error = ProcessingError() 124 | sentences = [] 125 | 126 | sentence = Sentence() 127 | while input_format.nextSentence(sentence, error): 128 | sentences.append(sentence) 129 | sentence = Sentence() 130 | if error.occurred(): 131 | raise Exception(error.message) 132 | 133 | return sentences 134 | 135 | def tokenize( 136 | self, 137 | text: Union[ 138 | str, 139 | List[str], 140 | List[List[str]] 141 | ] 142 | ) -> List[Sentence]: 143 | """Tokenize input text. 144 | 145 | text: Input text, can be presegmented or pretokenized: 146 | str : raw text, 147 | List[str] : presegmented text, 148 | List[List[str]] : pretokenized text. 149 | Note: both presegmented and pretokenized text can not contain 150 | newline or tab characters. 151 | RETURNS: Processed sentences. 152 | """ 153 | if isinstance(text, str): 154 | tokenizer = self.model.newTokenizer(self.model.DEFAULT) 155 | elif isinstance(text, list): 156 | if isinstance(text[0], list): 157 | text = "\n".join("\t".join(sent) for sent in text) 158 | tokenizer = PretokenizedInputFormat() 159 | else: 160 | text = "\n".join(text) 161 | tokenizer = self.model.newTokenizer( 162 | self.model.TOKENIZER_PRESEGMENTED 163 | ) 164 | else: 165 | raise TypeError( 166 | "\n".join( 167 | (f"Input type is {type(text)}, but must be one:", 168 | "str : raw text", 169 | "List[str] : presegmented text", 170 | "List[List[str]] : pretokenized text") 171 | ) 172 | ) 173 | if not tokenizer: 174 | raise Exception( 175 | "The model does not have a tokenizer " 176 | f"so it can not tokenize input: {text}" 177 | ) 178 | return self._read(text=text, input_format=tokenizer) 179 | 180 | def tag(self, sentence: Sentence) -> None: 181 | """Assign part-of-speech tags (inplace). 182 | 183 | sentence: Input sentence. 184 | """ 185 | self.model.tag(sentence, self.model.DEFAULT) 186 | 187 | def parse(self, sentence: Sentence) -> None: 188 | """Assign dependency parse relations (inplace). 189 | 190 | sentence: Input sentence. 191 | """ 192 | self.model.parse(sentence, self.model.DEFAULT) 193 | 194 | def read(self, text: str, in_format: str) -> List[Sentence]: 195 | """Load text in the given format and return it in an UDPipe 196 | representation. 197 | 198 | text: Text to load. 199 | in_format: 'conllu'|'horizontal'|'vertical'. 200 | RETURNS: Processed sentences. 201 | """ 202 | input_format = InputFormat.newInputFormat(in_format) 203 | if not input_format: 204 | raise Exception(f"Cannot create input format '{in_format}'") 205 | return self._read(text=text, input_format=input_format) 206 | 207 | def write(self, sentences: List[Sentence], out_format: str) -> str: 208 | """Write given sentences in the required output format. 209 | 210 | sentences: Input ufal.udpipe.Sentence-s. 211 | out_format: 'conllu'|'horizontal'|'vertical'. 212 | RETURNS: Sentences formatted in the out_format. 
213 | """ 214 | output_format = OutputFormat.newOutputFormat(out_format) 215 | output = "".join([output_format.writeSentence(s) for s in sentences]) 216 | output += output_format.finishDocument() 217 | 218 | return output 219 | -------------------------------------------------------------------------------- /spacy_udpipe/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import urllib.request 4 | from typing import Dict, Optional 5 | 6 | from spacy import blank, Language 7 | from spacy.util import get_lang_class 8 | 9 | from . import resources 10 | 11 | 12 | # Read files from inside a package: https://stackoverflow.com/a/20885799 13 | try: 14 | import importlib.resources as pkg_resources 15 | except ImportError: 16 | # Try backported to Python 3.7 `importlib_resources`. 17 | import importlib_resources as pkg_resources 18 | 19 | 20 | BASE_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3131" # noqa: E501 21 | MODELS_DIR = os.getenv( 22 | "SPACY_UDPIPE_MODELS_DIR", 23 | os.path.join(os.path.expanduser("~/.cache"), "spacy_udpipe_models"), 24 | ) 25 | 26 | with pkg_resources.open_text(resources, "languages.json", encoding="utf-8") as f: 27 | LANGUAGES = json.load(f) 28 | 29 | 30 | def _check_language(lang: str) -> None: 31 | assert lang in LANGUAGES, f"'{lang}' language not available" 32 | 33 | 34 | def _check_models_dir(models_dir) -> None: 35 | assert os.path.exists(models_dir), "Download the pretrained model(s) first" 36 | 37 | 38 | def download(lang: str, models_dir: Optional[str] = None, verbose: bool = False) -> None: 39 | """Download the UDPipe pretrained model. 40 | 41 | lang: ISO 639-1 language code or shorthand UDPipe model name. 42 | models_dir: Directory to store a downloaded model. 43 | """ 44 | models_dir = models_dir or MODELS_DIR 45 | _check_language(lang) 46 | try: 47 | _check_models_dir(models_dir) 48 | except AssertionError: 49 | os.makedirs(models_dir) 50 | if LANGUAGES[lang] in os.listdir(models_dir): 51 | if verbose: 52 | print(f"Already downloaded a model for the '{lang}' language") 53 | return 54 | url = f"{BASE_URL}/{LANGUAGES[lang]}" 55 | filename = os.path.join(models_dir, LANGUAGES[lang]) 56 | urllib.request.urlretrieve(url=url, filename=filename) 57 | if verbose: 58 | print(f"Downloaded pre-trained UDPipe model for '{lang}' language") 59 | 60 | 61 | def get_path(lang: str, models_dir: Optional[str] = None) -> str: 62 | """Get the path to the UDPipe pretrained model if it was downloaded. 63 | 64 | lang: ISO 639-1 language code or shorthand UDPipe model name. 65 | models_dir: Directory with the pretrained models. 66 | RETURNS: The path to the UDPipe pretrained model. 67 | """ 68 | models_dir = models_dir or MODELS_DIR 69 | _check_language(lang) 70 | _check_models_dir(models_dir) 71 | if not LANGUAGES[lang] in os.listdir(models_dir): 72 | raise Exception( 73 | "Use spacy_udpipe.download to download the pre-trained" 74 | f" UDPipe model for the '{lang}' language" 75 | ) 76 | path = os.path.join(models_dir, LANGUAGES[lang]) 77 | return path 78 | 79 | 80 | def get_defaults(lang: str) -> Language.Defaults: 81 | """Get the language-specific defaults, if available in spaCy. This allows 82 | using lexical attribute getters that depend on static language data, e.g. 83 | Token.like_num, Token.is_stop, Doc.noun_chunks, etc. 84 | 85 | lang: ISO 639-1 language code or shorthand UDPipe model name. 86 | RETURNS: The language defaults. 
87 | """ 88 | try: 89 | lang_cls = get_lang_class(lang) 90 | return lang_cls.Defaults 91 | except ImportError: 92 | return Language.Defaults 93 | 94 | 95 | def load( 96 | lang: str = "" 97 | ) -> Language: 98 | """Convenience function for initializing the Language class that 99 | mimicks spacy.load. 100 | 101 | lang: ISO 639-1 language code or shorthand UDPipe model name. 102 | RETURNS: SpaCy Language object with UDPipeTokenizer. 103 | """ 104 | config = {"nlp": {"tokenizer": {}}} 105 | name = lang.split("-")[0] 106 | config["nlp"]["tokenizer"]["@tokenizers"] = "spacy_udpipe.PipelineAsTokenizer.v1" # noqa: E501 107 | # Set UDPipe options 108 | config["nlp"]["tokenizer"]["lang"] = lang 109 | config["nlp"]["tokenizer"]["path"] = get_path(lang) 110 | config["nlp"]["tokenizer"]["meta"] = None 111 | return blank(name, config=config) 112 | 113 | 114 | def load_from_path( 115 | lang: str, 116 | path: str, 117 | meta: Optional[Dict] = {"description": "custom model"}, 118 | ) -> Language: 119 | """Convenience function for initializing the Language class 120 | and loading a custom UDPipe model via the path argument. 121 | 122 | lang: ISO 639-1 language code or shorthand UDPipe model name. 123 | path: Path to the UDPipe model. 124 | meta: Optional meta-information about the UDPipe model. 125 | RETURNS: SpaCy Language object with UDPipeTokenizer. 126 | """ 127 | config = {"nlp": {"tokenizer": {}}} 128 | name = lang.split("-")[0] 129 | config["nlp"]["tokenizer"]["@tokenizers"] = "spacy_udpipe.PipelineAsTokenizer.v1" # noqa: E501 130 | # Set UDPipe options 131 | config["nlp"]["tokenizer"]["lang"] = lang 132 | config["nlp"]["tokenizer"]["path"] = path 133 | config["nlp"]["tokenizer"]["meta"] = meta 134 | return blank(name, config=config) 135 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TakeLab/spacy-udpipe/a38cd1e1c8dc5e18def177cd21d12922da833d6f/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session", autouse=True) 5 | def set_test_models_dir(tmp_path_factory, session_mocker): 6 | test_models_dir = tmp_path_factory.mktemp("models") 7 | session_mocker.patch("spacy_udpipe.utils.MODELS_DIR", str(test_models_dir)) 8 | -------------------------------------------------------------------------------- /tests/languages/en/test_en_language.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | from spacy.lang.en import EnglishDefaults 5 | from spacy.language import BaseDefaults 6 | from spacy_udpipe import download 7 | from spacy_udpipe import load 8 | from spacy_udpipe.utils import get_defaults 9 | 10 | 11 | @pytest.fixture 12 | def lang() -> str: 13 | return "en" 14 | 15 | 16 | @pytest.fixture(autouse=True) 17 | def download_lang(lang: str) -> None: 18 | download(lang=lang) 19 | 20 | 21 | def tags_equal(act: List[str], exp: List[str]) -> bool: 22 | """Check if each actual tag is equal to one or more expected tags.""" 23 | return all(a == e if isinstance(e, str) else a in e 24 | for a, e in zip(act, exp)) 25 | 26 | 27 | def test_get_defaults(lang: str) -> None: 28 | assert get_defaults(lang) == EnglishDefaults 29 | assert get_defaults("blabla") == 
BaseDefaults 30 | 31 | 32 | def test_spacy_udpipe_default(lang: str) -> None: 33 | nlp = load(lang=lang) 34 | 35 | text = "Testing one, two, three. This is a test." 36 | doc = nlp(text=text) 37 | 38 | pos_actual = ["PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM", 39 | "PUNCT", 40 | "PRON", "AUX", "DET", "NOUN", 41 | "PUNCT"] 42 | # test token attributes 43 | assert [t.text for t in doc] == ["Testing", "one", ",", "two", ",", "three", # noqa: E501 44 | ".", 45 | "This", "is", "a", "test", 46 | "."] 47 | assert [t.lemma_ for t in doc] == ["test", "one", ",", "two", ",", "three", 48 | ".", 49 | "this", "be", "a", "test", 50 | "."] 51 | assert tags_equal(act=pos_actual, exp=[t.pos_ for t in doc]) 52 | # CoNNL xpostag-s, custom for each UD treebank 53 | assert [t.tag_ for t in doc] == ["NNP", "CD", ",", "CD", ",", "CD", 54 | ".", 55 | "DT", "VBZ", "DT", "NN", 56 | "."] 57 | assert [t.dep_ for t in doc] == ["ROOT", "nummod", "punct", "appos", "punct", "nummod", # noqa: E501 58 | "punct", 59 | "nsubj", "cop", "det", "ROOT", 60 | "punct"] 61 | assert [t.is_sent_start for t in doc] == [True, False, False, False, False, False, False, # noqa: E501 62 | True, False, False, False, False] 63 | assert any([t.is_stop for t in doc]) 64 | assert [str(t.morph) for t in doc] == [ 65 | 'Number=Sing', 66 | 'NumType=Card', 67 | '', 68 | 'NumType=Card', 69 | '', 70 | 'NumType=Card', 71 | '', 72 | 'Number=Sing|PronType=Dem', 73 | 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin', 74 | 'Definite=Ind|PronType=Art', 75 | 'Number=Sing', 76 | '' 77 | ] 78 | # test doc attributes 79 | assert len(list(doc.sents)) == 2 80 | assert doc.has_annotation("TAG") 81 | assert doc.has_annotation("DEP") 82 | assert doc.has_annotation("SENT_START") 83 | assert doc.has_annotation("MORPH") 84 | # test pipe 85 | docs = list(nlp.pipe(["Testing one, two, three.", "This is a test."])) 86 | assert docs[0].text == "Testing one, two, three." 87 | assert [t.pos_ for t in docs[0]] == ["PROPN", "NUM", "PUNCT", "NUM", "PUNCT", "NUM", "PUNCT"] # noqa: E501 88 | assert docs[1].text == "This is a test." 89 | assert tags_equal(act=pos_actual[-5:], exp=[t.pos_ for t in docs[1]]) 90 | 91 | 92 | def test_spacy_udpipe_presegmented(lang: str) -> None: 93 | nlp = load(lang=lang) 94 | 95 | text = "Testing one, two, three. This is a test." 96 | doc = nlp(text=text) 97 | doc_json = doc.to_json() 98 | 99 | text_pre = ["Testing one, two, three.", "This is a test."] 100 | doc_pre = nlp(text=text_pre) 101 | doc_pre_json = doc_pre.to_json() 102 | 103 | assert doc_json["text"] == doc_pre_json["text"] 104 | assert doc_json["sents"] == doc_pre_json["sents"] 105 | assert doc_json["tokens"] == doc_pre_json["tokens"] 106 | 107 | 108 | def test_spacy_udpipe_pretokenized(lang: str) -> None: 109 | nlp = load(lang=lang) 110 | 111 | text = "Testing one, two, three. This is a test." 
112 | doc = nlp(text=text) 113 | doc_json = doc.to_json() 114 | 115 | text_pre = [ 116 | ["Testing", "one", ",", "two", ",", "three", "."], 117 | ["This", "is", "a", "test", "."] 118 | ] 119 | doc_pre = nlp(text=text_pre) 120 | doc_pre_json = doc_pre.to_json() 121 | 122 | assert doc_json["text"] == doc_pre_json["text"] 123 | assert doc_json["sents"] == doc_pre_json["sents"] 124 | assert doc_json["tokens"] == doc_pre_json["tokens"] 125 | -------------------------------------------------------------------------------- /tests/languages/fr/test_fr_language.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from spacy.lang.fr import FrenchDefaults 3 | from spacy.language import BaseDefaults 4 | from spacy_udpipe import download, load 5 | from spacy_udpipe.utils import get_defaults 6 | 7 | 8 | @pytest.fixture 9 | def lang() -> str: 10 | return "fr" 11 | 12 | 13 | @pytest.fixture(autouse=True) 14 | def download_lang(lang: str) -> None: 15 | download(lang=lang) 16 | 17 | 18 | def test_get_defaults(lang: str) -> None: 19 | assert get_defaults(lang) == FrenchDefaults 20 | assert get_defaults("blabla") == BaseDefaults 21 | 22 | 23 | def test_spacy_udpipe(lang: str) -> None: 24 | nlp = load(lang=lang) 25 | 26 | text = "Attention aux articles contractés!" 27 | doc = nlp(text=text) 28 | 29 | assert [t.orth_ for t in doc] == ["Attention", "à", "les", "articles", "contractés", "!"] 30 | 31 | pos = [{"INTJ", "NOUN"}, {"ADP"}, {"DET"}, {"NOUN"}, {"VERB", "ADJ"}, {"PUNCT"}] 32 | for i, t in enumerate(doc): 33 | assert t.pos_ in pos[i] 34 | 35 | assert [t.head.i for t in doc] == [0, 3, 3, 0, 3, 0] 36 | 37 | dep = [{"ROOT", "root"}, {"case"}, {"det"}, {"nmod", "obl", "obl:arg"}, {"acl", "amod"}, {"punct"}] 38 | for i, t in enumerate(doc): 39 | assert t.dep_ in dep[i] 40 | -------------------------------------------------------------------------------- /tests/test_spacy_udpipe.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import pytest 4 | import spacy 5 | 6 | from spacy_udpipe import download, load 7 | 8 | 9 | @pytest.fixture 10 | def lang() -> str: 11 | return "en" 12 | 13 | 14 | @pytest.fixture(autouse=True) 15 | def download_lang(lang: str) -> None: 16 | download(lang=lang) 17 | 18 | 19 | def test_serialization(lang: str) -> None: 20 | with tempfile.TemporaryDirectory() as tdir: 21 | nlp = load(lang=lang) 22 | doc = nlp("A simple sentence.") 23 | nlp.to_disk(tdir) 24 | del nlp 25 | 26 | nlp = spacy.load(tdir) 27 | same_doc = nlp("A simple sentence.") 28 | 29 | assert doc.to_json() == same_doc.to_json() 30 | 31 | 32 | def test_pipe(lang: str) -> None: 33 | nlp = load(lang=lang) 34 | 35 | text = "spacy-udpipe still does not support multiprocess execution." 36 | doc = nlp(text) 37 | del nlp 38 | 39 | nlp = load(lang=lang) 40 | texts = [text for _ in range(2)] 41 | docs = list(nlp.pipe(texts, n_process=-1)) 42 | 43 | assert len(docs) == len(texts) 44 | assert docs[0].to_json() == doc.to_json() 45 | assert docs[-1].to_json() == doc.to_json() 46 | --------------------------------------------------------------------------------