├── wn ├── py.typed ├── metrics.py ├── _exceptions.py ├── _types.py ├── _ili.py ├── __init__.py ├── _util.py ├── _db.py ├── _download.py ├── morphy.py ├── __main__.py ├── _module_functions.py ├── util.py └── ic.py ├── tests ├── data │ ├── test-package │ │ ├── LICENSE │ │ ├── README.md │ │ ├── citation.bib │ │ └── test-wn.xml │ ├── README.md │ ├── E101-3.xml │ ├── E101-2.xml │ ├── W306-0.xml │ ├── W305-0.xml │ ├── E101-1.xml │ ├── E101-0.xml │ ├── W307-0.xml │ ├── sense-member-order.xml │ ├── sense-key-variations.xml │ ├── mini-lmf-1.3.xml │ ├── mini-lmf-1.4.xml │ └── mini-lmf-1.1.xml ├── util_test.py ├── validate_test.py ├── export_test.py ├── _util_test.py ├── project_test.py ├── morphy_test.py ├── db_test.py ├── conftest.py ├── compat_sensekey_test.py ├── wordnet_test.py ├── ic_test.py ├── web_test.py ├── taxonomy_test.py ├── lmf_test.py ├── secondary_query_test.py ├── relations_test.py └── similarity_test.py ├── docs ├── docutils.conf ├── requirements.txt ├── api │ ├── wn.validate.rst │ ├── wn.lmf.rst │ ├── wn.compat.sensekey.rst │ ├── wn.compat.rst │ ├── wn.util.rst │ ├── wn.project.rst │ ├── wn.taxonomy.rst │ ├── wn.morphy.rst │ ├── wn.similarity.rst │ └── wn.ic.rst ├── _static │ ├── css │ │ └── svg.css │ ├── wn-logo.svg │ └── wn-logo-rotate.svg ├── Makefile ├── .readthedocs.yaml ├── make.bat ├── index.rst ├── cli.rst ├── setup.rst ├── conf.py ├── guides │ ├── nltk-migration.rst │ └── wordnet.rst └── faq.rst ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ ├── data-issue.md │ └── bug_report.md └── workflows │ ├── checks.yml │ ├── publish.yml │ └── publish-docker.yaml ├── Dockerfile ├── .gitignore ├── LICENSE ├── bench ├── README.md ├── test_bench.py └── conftest.py ├── CITATION.cff ├── pyproject.toml └── CONTRIBUTING.md /wn/py.typed: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/data/test-package/LICENSE: -------------------------------------------------------------------------------- 1 | Test License 2 | -------------------------------------------------------------------------------- /tests/data/test-package/README.md: -------------------------------------------------------------------------------- 1 | # Test README 2 | -------------------------------------------------------------------------------- /tests/data/test-package/citation.bib: -------------------------------------------------------------------------------- 1 | % test bib 2 | -------------------------------------------------------------------------------- /docs/docutils.conf: -------------------------------------------------------------------------------- 1 | [restructuredtext parser] 2 | syntax_highlight = short 3 | 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx ~= 8.1 2 | furo == 2024.8.6 3 | sphinx-copybutton == 0.5.2 4 | . 5 | 6 | -------------------------------------------------------------------------------- /docs/api/wn.validate.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.validate 3 | =========== 4 | 5 | .. automodule:: wn.validate 6 | 7 | .. 
autofunction:: validate 8 | -------------------------------------------------------------------------------- /tests/data/README.md: -------------------------------------------------------------------------------- 1 | # Testing Data Directory 2 | 3 | This directory is used to store data files used by the testing system. 4 | 5 | -------------------------------------------------------------------------------- /docs/api/wn.lmf.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.lmf 3 | ====== 4 | 5 | .. automodule:: wn.lmf 6 | 7 | .. autofunction:: load 8 | .. autofunction:: scan_lexicons 9 | .. autofunction:: is_lmf 10 | 11 | -------------------------------------------------------------------------------- /tests/data/test-package/test-wn.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docs/api/wn.compat.sensekey.rst: -------------------------------------------------------------------------------- 1 | wn.compat.sensekey 2 | ================== 3 | 4 | .. automodule:: wn.compat.sensekey 5 | 6 | .. autofunction:: escape 7 | .. autofunction:: unescape 8 | .. autofunction:: sense_key_getter 9 | .. autofunction:: sense_getter 10 | -------------------------------------------------------------------------------- /docs/_static/css/svg.css: -------------------------------------------------------------------------------- 1 | svg { 2 | width: 500px; 3 | height: 300px; 4 | 5 | position: relative; 6 | left: 20%; 7 | -webkit-transform: translateX(-20%); 8 | -ms-transform: translateX(-20%); 9 | transform: translateX(-20%); 10 | 11 | } 12 | 13 | -------------------------------------------------------------------------------- /wn/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | from wn._core import Word, Synset 3 | 4 | 5 | # Word-based Metrics 6 | 7 | def ambiguity(word: Word) -> int: 8 | return len(word.synsets()) 9 | 10 | 11 | def average_ambiguity(synset: Synset) -> float: 12 | words = synset.words() 13 | return sum(len(word.synsets()) for word in words) / len(words) 14 | -------------------------------------------------------------------------------- /tests/util_test.py: -------------------------------------------------------------------------------- 1 | 2 | from wn import util 3 | 4 | 5 | def test_synset_id_formatter(): 6 | f = util.synset_id_formatter 7 | assert f()(prefix='xyz', offset=123, pos='n') == 'xyz-00000123-n' 8 | assert f(prefix='xyz')(offset=123, pos='n') == 'xyz-00000123-n' 9 | assert f(prefix='xyz', pos='n')(offset=123) == 'xyz-00000123-n' 10 | assert f('abc-{offset}-{pos}')(offset=1, pos='v') == 'abc-1-v' 11 | -------------------------------------------------------------------------------- /docs/api/wn.compat.rst: -------------------------------------------------------------------------------- 1 | wn.compat 2 | ========= 3 | 4 | Compatibility modules for Wn. 5 | 6 | This subpackage is a namespace for compatibility modules when working 7 | with particular lexicons. Wn is designed to be agnostic to the 8 | language or lexicon and not favor one over the other (with the 9 | exception of :mod:`wn.morphy`, which is English-specific). However, 10 | there are some kinds of functionality that would be useful to 11 | include in Wn, even if they don't generalize to all lexicons. 12 | 13 | Included modules 14 | ---------------- 15 | 16 | .. 
toctree:: 17 | :maxdepth: 1 18 | 19 | wn.compat.sensekey.rst 20 | 21 | -------------------------------------------------------------------------------- /tests/validate_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from wn import lmf 4 | from wn.validate import validate 5 | 6 | tests = [ 7 | ("E101", 0), 8 | ("E101", 1), 9 | ("E101", 2), 10 | ("E101", 3), 11 | ("W305", 0), 12 | ("W306", 0), 13 | ("W307", 0), 14 | ] 15 | test_ids = [f"{code}-{i}" for code, i in tests] 16 | 17 | 18 | @pytest.mark.parametrize("code,i", tests, ids=test_ids) 19 | def test_validate(datadir, code: str, i: int) -> None: 20 | path = datadir / f"{code}-{i}.xml" 21 | lex = lmf.load(path, progress_handler=None)["lexicons"][0] 22 | report = validate(lex, select=[code], progress_handler=None) 23 | print(report) 24 | assert len(report[code]["items"]) > 0 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.12" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt 23 | 24 | formats: 25 | - pdf 26 | - epub 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install system dependencies 6 | RUN apt-get update && apt-get install -y \ 7 | python3-pip \ 8 | python3-dev \ 9 | build-essential \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Install web server 13 | RUN pip install uvicorn 14 | 15 | COPY . . 16 | RUN pip install --no-cache-dir ".[web]" 17 | 18 | # Download the wordnet data and initialize the database 19 | # TODO: this should be done in a separate volume 20 | RUN python -m wn download omw:1.4 cili 21 | 22 | # Clean up the downloads directory 23 | RUN rm -r ~/.wn_data/downloads 24 | 25 | # Expose the port 26 | EXPOSE 8080 27 | 28 | CMD ["uvicorn", "wn.web:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /docs/api/wn.util.rst: -------------------------------------------------------------------------------- 1 | wn.util 2 | ======= 3 | 4 | .. automodule:: wn.util 5 | 6 | .. autofunction:: synset_id_formatter 7 | 8 | .. autoclass:: ProgressHandler 9 | :members: 10 | 11 | .. attribute:: kwargs 12 | 13 | A dictionary storing the updateable parameters for the progress 14 | handler. The keys are: 15 | 16 | - ``message`` (:class:`str`) -- a generic message or name 17 | - ``count`` (:class:`int`) -- the current progress counter 18 | - ``total`` (:class:`int`) -- the expected final value of the counter 19 | - ``unit`` (:class:`str`) -- the unit of measurement 20 | - ``status`` (:class:`str`) -- the current status of the process 21 | 22 | .. autoclass:: ProgressBar 23 | :members: 24 | -------------------------------------------------------------------------------- /tests/export_test.py: -------------------------------------------------------------------------------- 1 | 2 | from xml.etree import ElementTree as ET 3 | 4 | import pytest 5 | 6 | import wn 7 | 8 | 9 | @pytest.mark.usefixtures('mini_db') 10 | def test_export(datadir, tmp_path): 11 | tmpdir = tmp_path / 'test_export' 12 | tmpdir.mkdir() 13 | tmppath = tmpdir / 'mini_lmf_export.xml' 14 | lexicons = wn.lexicons(lexicon='test-en test-es') 15 | wn.export(lexicons, tmppath) 16 | 17 | # remove comments, indentation, etc. 
18 | orig = ET.canonicalize(from_file=datadir / 'mini-lmf-1.0.xml', strip_text=True) 19 | temp = ET.canonicalize(from_file=tmppath, strip_text=True) 20 | # additional transformation to help with debugging 21 | orig = orig.replace('<', '\n<') 22 | temp = temp.replace('<', '\n<') 23 | assert orig == temp 24 | -------------------------------------------------------------------------------- /wn/_exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | class Error(Exception): 3 | """Generic error class for invalid wordnet operations.""" 4 | 5 | # reset the module so the user sees the public name 6 | __module__ = 'wn' 7 | 8 | 9 | class DatabaseError(Error): 10 | """Error class for issues with the database.""" 11 | 12 | __module__ = 'wn' 13 | 14 | 15 | class ConfigurationError(Error): 16 | """Raised on invalid configurations.""" 17 | __module__ = 'wn' 18 | 19 | 20 | class ProjectError(Error): 21 | """Raised when a project is not found or on errors defined in the index.""" 22 | __module__ = 'wn' 23 | 24 | 25 | class WnWarning(Warning): 26 | """Generic warning class for dubious wordnet operations.""" 27 | 28 | # reset the module so the user sees the public name 29 | __module__ = 'wn' 30 | -------------------------------------------------------------------------------- /tests/data/E101-3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/data/E101-2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /tests/data/W306-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/_util_test.py: -------------------------------------------------------------------------------- 1 | 2 | from wn._util import flatten, unique_list 3 | 4 | 5 | def test_flatten(): 6 | assert flatten([]) == [] 7 | assert flatten([[]]) == [] 8 | assert flatten([[], []]) == [] 9 | assert flatten([[[], []], [[], []]]) == [[], [], [], []] 10 | assert flatten([[1]]) == [1] 11 | assert flatten([[1, 2], [3, 4]]) == [1, 2, 3, 4] 12 | assert flatten(["AB", "CD"]) == ["A", "B", "C", "D"] 13 | 14 | 15 | def test_unique_list(): 16 | assert unique_list([]) == [] 17 | assert unique_list([1]) == [1] 18 | assert unique_list([1, 1, 1, 1, 1]) == [1] 19 | assert unique_list([1, 1, 2, 2, 1]) == [1, 2] 20 | assert unique_list([2, 1, 2, 2, 1]) == [2, 1] 21 | assert unique_list("A") == ["A"] 22 | assert unique_list("AAA") == ["A"] 23 | assert unique_list("ABABA") == ["A", "B"] 24 | assert unique_list([(1, 2), (1, 2), (2, 3)]) == [(1, 2), (2, 3)] 25 | -------------------------------------------------------------------------------- /tests/data/W305-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/data/E101-1.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/_static/wn-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/data/E101-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /wn/_types.py: -------------------------------------------------------------------------------- 1 | 2 | from collections.abc import Callable, Mapping, Sequence 3 | from typing import Any, Optional, Union 4 | from pathlib import Path 5 | 6 | # For functions taking a filesystem path as a str or a pathlib.Path 7 | AnyPath = Union[str, Path] 8 | 9 | # LMF versions for comparison 10 | VersionInfo = tuple[int, ...] 
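# Illustrative note (not part of the original module): wn._util.version_info("1.3")
# produces the VersionInfo tuple (1, 3), and such tuples compare in version order.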
11 | 12 | # Synset and Sense relations map a relation type to one or more ids 13 | RelationMap = Mapping[str, Sequence[str]] 14 | 15 | # User-facing metadata representation 16 | Metadata = dict[str, Any] 17 | 18 | # A callable that returns a normalized word form for a given word form 19 | NormalizeFunction = Callable[[str], str] 20 | 21 | # Lemmatization returns a mapping of parts of speech (or None) to 22 | # lists of wordforms that are potential lemmas for some query word 23 | LemmatizeResult = dict[Optional[str], set[str]] 24 | 25 | # A callable that returns a LemmatizationResult for a given word form 26 | # and optional part of speech 27 | LemmatizeFunction = Callable[[str, Optional[str]], LemmatizeResult] 28 | -------------------------------------------------------------------------------- /tests/data/W307-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | foo 22 | 23 | 24 | 25 | foo 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # Unit test / coverage reports 31 | htmlcov/ 32 | .tox/ 33 | .nox/ 34 | .coverage 35 | .coverage.* 36 | .cache 37 | nosetests.xml 38 | coverage.xml 39 | *.cover 40 | *.py,cover 41 | .hypothesis/ 42 | .pytest_cache/ 43 | 44 | # Ruff (has its own .gitignore, but in case that ever changes...) 45 | .ruff_cache 46 | 47 | # Sphinx documentation 48 | docs/_build/ 49 | 50 | # Jupyter Notebook 51 | .ipynb_checkpoints 52 | 53 | # Environments 54 | .env 55 | .venv 56 | env/ 57 | venv/ 58 | ENV/ 59 | env.bak/ 60 | venv.bak/ 61 | 62 | # mypy 63 | .mypy_cache/ 64 | .dmypy.json 65 | dmypy.json 66 | 67 | # PyCharm 68 | .idea/ 69 | 70 | # VS Code 71 | .vscode/ 72 | 73 | # benchmarking results 74 | .benchmarks/ -------------------------------------------------------------------------------- /docs/api/wn.project.rst: -------------------------------------------------------------------------------- 1 | wn.project 2 | ========== 3 | 4 | .. automodule:: wn.project 5 | 6 | .. autofunction:: get_project 7 | .. autofunction:: iterpackages 8 | .. autofunction:: is_package_directory 9 | .. autofunction:: is_collection_directory 10 | 11 | Project Classes 12 | --------------- 13 | 14 | Projects can be simple resource files, :class:`Package` directories, 15 | or :class:`Collection` directories. For API consistency, resource 16 | files are modeled as a virtual package (:class:`ResourceOnlyPackage`). 17 | 18 | .. class:: Project 19 | 20 | The base class for packages and collections. 21 | 22 | This class is not used directly, but all subclasses will implement 23 | the methods listed here. 24 | 25 | .. autoproperty:: path 26 | .. automethod:: readme 27 | .. automethod:: license 28 | .. automethod:: citation 29 | 30 | .. autoclass:: Package 31 | :show-inheritance: 32 | 33 | .. autoproperty:: type 34 | .. automethod:: resource_file 35 | 36 | .. 
autoclass:: ResourceOnlyPackage 37 | :show-inheritance: 38 | 39 | .. autoclass:: Collection 40 | :show-inheritance: 41 | 42 | .. automethod:: packages 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Michael Wayne Goodman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: "3.9" 18 | - name: Install Hatch 19 | run: pipx install hatch 20 | - name: Lint 21 | run: hatch fmt --linter --check 22 | - name: Type Check 23 | run: hatch run mypy:check 24 | - name: Check Buildable 25 | run: hatch build 26 | 27 | tests: 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | matrix: 31 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 32 | os: [ubuntu-latest, windows-latest] 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Set up Python ${{ matrix.python-version }} 36 | uses: actions/setup-python@v4 37 | with: 38 | python-version: ${{ matrix.python-version }} 39 | - name: Install Hatch 40 | run: pipx install hatch 41 | - name: Test 42 | run: hatch test 43 | -------------------------------------------------------------------------------- /wn/_ili.py: -------------------------------------------------------------------------------- 1 | 2 | from collections.abc import Iterator 3 | from pathlib import Path 4 | 5 | from wn._types import AnyPath 6 | 7 | 8 | def is_ili(source: AnyPath) -> bool: 9 | """Return True if *source* is an ILI tab-separated-value file. 10 | 11 | This only checks that the first column, split by tabs, of the 12 | first line is 'ili' or 'ILI'. It does not check if each line has 13 | the correct number of columns. 
14 | 15 | """ 16 | source = Path(source).expanduser() 17 | if source.is_file(): 18 | try: 19 | with source.open('rb') as fh: 20 | return next(fh).split(b'\t')[0] in (b'ili', b'ILI') 21 | except (StopIteration, IndexError): 22 | pass 23 | return False 24 | 25 | 26 | def load(source: AnyPath) -> Iterator[dict[str, str]]: 27 | """Load an interlingual index file. 28 | 29 | Args: 30 | source: path to an ILI file 31 | """ 32 | source = Path(source).expanduser() 33 | with source.open(encoding='utf-8') as fh: 34 | header = next(fh).rstrip('\r\n') 35 | fields = tuple(map(str.lower, header.split('\t'))) 36 | for line in fh: 37 | yield dict(zip(fields, line.rstrip('\r\n').split('\t'))) 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/data-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Data issue 3 | about: Report an issue Wn's data index 4 | title: '' 5 | labels: data 6 | assignees: '' 7 | 8 | --- 9 | 10 | **If your issue is regarding the contents of the data** (e.g., a lexicon is missing a word, synset, relation, etc.), then please find the upstream project and file the issue there. You can find links to the projects on Wn's [README](https://github.com/goodmami/wn/). Projects without links are probably managed by the [Open Multilingual Wordnet](https://github.com/omwn/omw-data). 11 | 12 | **Use this issue template for the following kinds of issues:** 13 | 1. Request a wordnet lexicon (including new versions of existing lexicons) to be indexed by Wn 14 | 15 | Please provide: 16 | - the project name 17 | - the name and contact info of the current maintainer 18 | - the language of the lexicon (BCP-47 code preferred) 19 | - a URL to the project (e.g., on GitHub or other homepage) 20 | - a URL to the [WN-LMF](https://github.com/globalwordnet/schemas/) resource 21 | 22 | 2. Report an issue with an indexed lexicon (e.g., the source URL has changed) 23 | 24 | Please indicate the lexicon id and version and the correct project information, if available. 25 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | # Wn Benchmarking 2 | 3 | This directory contains code and data for running benchmarks for 4 | Wn. The benchmarks are implemented using 5 | [pytest-benchmarks](https://github.com/ionelmc/pytest-benchmark/), so 6 | they are run using pytest as follows (from the top-level project 7 | directory): 8 | 9 | ```console 10 | $ hatch test bench/ # run the benchmarks 11 | $ hatch test bench/ --benchmark-autosave # run benchmarks and store results 12 | $ hatch test bench/ --benchmark-compare # run benchmarks and compare to stored result 13 | $ hatch test -- --help # get help on options (look for those prefixed `--benchmark-`) 14 | ``` 15 | 16 | Notes: 17 | 18 | * The tests are not exhaustive; when making a change that may affect 19 | performance, consider making a new test if one doesn't exist 20 | already. It would be helpful to check in the test to Git, but not 21 | the benchmark results since those are dependent on the machine. 22 | * Benchmark the code before and after the changes. Store the results 23 | locally for comparison. 24 | * Ensure the testing environment has a steady load (wait for 25 | long-running processes to finish, close any active web browser tabs, 26 | etc.) prior to and while running the test. 27 | * Expect high variance for IO-bound tasks. 
28 | 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | :warning: If this is a question about Wn or how to use it, please create a [discussion](https://github.com/goodmami/wn/discussions) instead of an issue. 14 | 15 | **To Reproduce** 16 | Please enter a minimal working example of the command or Python code that illustrates the problem. To avoid formatting issues, enter the code in a Markdown code block: 17 | 18 | ```console 19 | $ python -m wn ... 20 | output... 21 | ``` 22 | 23 | or 24 | 25 | ```pycon 26 | >>> import wn 27 | >>> ... 28 | output 29 | ``` 30 | 31 | **Expected behavior** 32 | A clear and concise description of what you expected to happen. 33 | 34 | **Environment** 35 | Please enter the versions of Python and Wn you are using as well as the installed lexicons. You can find these by executing the following commands (adjust your platform-specific Python command as necessary, e.g., `python3` or `py -3`): 36 | 37 | ```console 38 | python --version 39 | python -m wn --version 40 | python -m wn lexicons 41 | ``` 42 | 43 | **Additional context** 44 | Add any other context about the problem here. 45 | -------------------------------------------------------------------------------- /tests/data/sense-member-order.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: Wn 3 | message: >- 4 | Please cite this software using the metadata from 5 | 'preferred-citation'. 6 | type: software 7 | authors: 8 | - given-names: Michael Wayne 9 | family-names: Goodman 10 | email: goodman.m.w@gmail.com 11 | orcid: 'https://orcid.org/0000-0002-2896-5141' 12 | - given-names: Francis 13 | family-names: Bond 14 | email: bond@ieee.org 15 | orcid: 'https://orcid.org/0000-0003-4973-8068' 16 | repository-code: 'https://github.com/goodmami/wn/' 17 | preferred-citation: 18 | type: conference-paper 19 | authors: 20 | - given-names: Michael Wayne 21 | family-names: Goodman 22 | email: goodmami@uw.edu 23 | orcid: 'https://orcid.org/0000-0002-2896-5141' 24 | affiliation: Nanyang Technological University 25 | - given-names: Francis 26 | family-names: Bond 27 | email: bond@ieee.org 28 | orcid: 'https://orcid.org/0000-0003-4973-8068' 29 | affiliation: Nanyang Technological University 30 | start: 100 # First page number 31 | end: 107 # Last page number 32 | conference: 33 | name: "Proceedings of the 11th Global Wordnet Conference" 34 | title: "Intrinsically Interlingual: The Wn Python Library for Wordnets" 35 | year: 2021 36 | month: 1 37 | url: 'https://aclanthology.org/2021.gwc-1.12/' 38 | publisher: "Global Wordnet Association" 39 | -------------------------------------------------------------------------------- /wn/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Wordnet Interface. 
4 | """ 5 | 6 | __all__ = ( 7 | '__version__', 8 | 'Wordnet', 9 | 'download', 10 | 'add', 11 | 'add_lexical_resource', 12 | 'remove', 13 | 'export', 14 | 'projects', 15 | 'lexicons', 16 | 'Lexicon', 17 | 'word', 18 | 'words', 19 | 'Word', 20 | 'Form', 21 | 'Pronunciation', 22 | 'Tag', 23 | 'sense', 24 | 'senses', 25 | 'Sense', 26 | 'Example', 27 | 'Count', 28 | 'synset', 29 | 'synsets', 30 | 'Synset', 31 | 'Definition', 32 | 'Relation', 33 | 'ili', 34 | 'ilis', 35 | 'ILI', 36 | 'Error', 37 | 'DatabaseError', 38 | 'ConfigurationError', 39 | 'ProjectError', 40 | 'WnWarning', 41 | ) 42 | 43 | from wn._exceptions import ( 44 | Error, 45 | DatabaseError, 46 | ConfigurationError, 47 | ProjectError, 48 | WnWarning, 49 | ) 50 | from wn._config import config # noqa: F401 51 | from wn._add import add, add_lexical_resource, remove 52 | from wn._export import export 53 | from wn._download import download 54 | from wn._core import ( 55 | Lexicon, 56 | Word, Form, Pronunciation, Tag, 57 | Sense, Example, Count, 58 | Synset, Definition, 59 | Relation, 60 | ILI, 61 | Wordnet 62 | ) 63 | from wn._module_functions import ( 64 | projects, 65 | lexicons, 66 | word, words, 67 | sense, senses, 68 | synset, synsets, 69 | ili, ilis, 70 | ) 71 | 72 | __version__ = '0.13.0' 73 | -------------------------------------------------------------------------------- /tests/data/sense-key-variations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Wn Documentation 3 | ================ 4 | 5 | Overview 6 | -------- 7 | 8 | This package provides an interface to wordnet data, from simple lookup 9 | queries, to graph traversals, to more sophisticated algorithms and 10 | metrics. Features include: 11 | 12 | - Support for wordnets in the 13 | `WN-LMF `_ format 14 | - A `SQLite `_ database backend for data 15 | consistency and efficient queries 16 | - Accurate modeling of Words, Senses, and Synsets 17 | 18 | Quick Start 19 | ----------- 20 | 21 | .. code-block:: console 22 | 23 | $ pip install wn 24 | 25 | .. code-block:: python 26 | 27 | >>> import wn 28 | >>> wn.download('ewn:2020') 29 | >>> wn.synsets('coffee') 30 | [Synset('ewn-04979718-n'), Synset('ewn-07945591-n'), Synset('ewn-07945759-n'), Synset('ewn-12683533-n')] 31 | 32 | 33 | Contents 34 | -------- 35 | 36 | .. toctree:: 37 | :maxdepth: 2 38 | 39 | setup.rst 40 | cli.rst 41 | faq.rst 42 | 43 | .. toctree:: 44 | :caption: Guides 45 | :maxdepth: 2 46 | 47 | guides/lexicons.rst 48 | guides/basic.rst 49 | guides/interlingual.rst 50 | guides/wordnet.rst 51 | guides/lemmatization.rst 52 | guides/nltk-migration.rst 53 | 54 | .. 
toctree:: 55 | :caption: API Reference 56 | :maxdepth: 1 57 | :hidden: 58 | 59 | api/wn.rst 60 | api/wn.compat.rst 61 | api/wn.compat.sensekey.rst 62 | api/wn.constants.rst 63 | api/wn.ic.rst 64 | api/wn.lmf.rst 65 | api/wn.morphy.rst 66 | api/wn.project.rst 67 | api/wn.similarity.rst 68 | api/wn.taxonomy.rst 69 | api/wn.util.rst 70 | api/wn.validate.rst 71 | api/wn.web.rst 72 | -------------------------------------------------------------------------------- /docs/_static/wn-logo-rotate.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/data/mini-lmf-1.3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 | 9 | 10 | 11 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | one 38 | two 39 | three 40 | 41 | 42 | 43 | 44 | 45 | one 46 | two 47 | three 48 | 49 | 50 | 51 | 52 | 53 | one 54 | two 55 | three 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /bench/test_bench.py: -------------------------------------------------------------------------------- 1 | import wn 2 | from wn import lmf 3 | 4 | import pytest 5 | 6 | 7 | @pytest.mark.benchmark(group="lmf.load", warmup=True) 8 | def test_load(datadir, benchmark): 9 | benchmark(lmf.load, datadir / 'mini-lmf-1.0.xml') 10 | 11 | 12 | @pytest.mark.benchmark(group="wn.add_lexical_resource") 13 | @pytest.mark.usefixtures('empty_db') 14 | def test_add_lexical_resource(mock_lmf, benchmark): 15 | # TODO: when pytest-benchmark's teardown option is released, use 16 | # that here with more rounds 17 | benchmark.pedantic( 18 | wn.add_lexical_resource, 19 | args=(mock_lmf,), 20 | # teardown=clean_db, 21 | iterations=1, 22 | rounds=1, 23 | ) 24 | 25 | 26 | @pytest.mark.benchmark(group="wn.add_lexical_resource") 27 | @pytest.mark.usefixtures('empty_db') 28 | def test_add_lexical_resource_no_progress(mock_lmf, benchmark): 29 | # TODO: when pytest-benchmark's teardown option is released, use 30 | # that here with more rounds 31 | benchmark.pedantic( 32 | wn.add_lexical_resource, 33 | args=(mock_lmf,), 34 | kwargs={"progress_handler": None}, 35 | # teardown=clean_db, 36 | iterations=1, 37 | rounds=1, 38 | ) 39 | 40 | 41 | @pytest.mark.benchmark(group="primary queries") 42 | @pytest.mark.usefixtures('mock_db') 43 | def test_synsets(benchmark): 44 | benchmark(wn.synsets) 45 | 46 | 47 | @pytest.mark.benchmark(group="primary queries") 48 | @pytest.mark.usefixtures('mock_db') 49 | def test_words(benchmark): 50 | benchmark(wn.words) 51 | 52 | 53 | @pytest.mark.benchmark(group="secondary queries") 54 | @pytest.mark.usefixtures('mock_db') 55 | def test_word_senses_no_wordnet(benchmark): 56 | word = wn.words()[0] 57 | benchmark(word.senses) 58 | 59 | 60 | @pytest.mark.benchmark(group="secondary queries") 61 | @pytest.mark.usefixtures('mock_db') 62 | def test_word_senses_with_wordnet(benchmark): 63 | w = wn.Wordnet("mock:1") 64 | word = w.words()[0] 65 | benchmark(word.senses) 66 | 67 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish to PyPI or TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build: 7 | name: Build distribution 8 | runs-on: 
ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: "3.x" 15 | - name: Install Hatch 16 | run: pipx install hatch 17 | - name: Build 18 | run: hatch build 19 | - name: Store the distribution packages 20 | uses: actions/upload-artifact@v4 21 | with: 22 | name: python-package-distributions 23 | path: dist/ 24 | 25 | publish-to-pypi: 26 | name: Publish distributions to PyPI 27 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 28 | needs: 29 | - build 30 | runs-on: ubuntu-latest 31 | environment: 32 | name: pypi 33 | url: https://pypi.org/p/wn 34 | permissions: 35 | id-token: write # IMPORTANT: mandatory for trusted publishing 36 | steps: 37 | - name: Download the dists 38 | uses: actions/download-artifact@v4.1.8 39 | with: 40 | name: python-package-distributions 41 | path: dist/ 42 | - name: Publish to PyPI 43 | uses: pypa/gh-action-pypi-publish@release/v1 44 | 45 | publish-to-testpypi: 46 | name: Publish distributions to TestPyPI 47 | needs: 48 | - build 49 | runs-on: ubuntu-latest 50 | environment: 51 | name: testpypi 52 | url: https://test.pypi.org/p/wn 53 | permissions: 54 | id-token: write # IMPORTANT: mandatory for trusted publishing 55 | steps: 56 | - name: Download the dists 57 | uses: actions/download-artifact@v4.1.8 58 | with: 59 | name: python-package-distributions 60 | path: dist/ 61 | - name: Publish to TestPyPI 62 | uses: pypa/gh-action-pypi-publish@release/v1 63 | with: 64 | repository-url: https://test.pypi.org/legacy/ 65 | skip-existing: true 66 | -------------------------------------------------------------------------------- /tests/project_test.py: -------------------------------------------------------------------------------- 1 | from wn import project 2 | 3 | def test_is_package_directory(datadir): 4 | assert project.is_package_directory(datadir / "test-package") 5 | assert not project.is_package_directory(datadir) 6 | 7 | 8 | def test_is_collection_directory(datadir): 9 | # not really, but it is a directory containing a package 10 | assert project.is_collection_directory(datadir) 11 | assert not project.is_collection_directory(datadir / "test-package") 12 | 13 | 14 | def test_get_project(datadir): 15 | proj = project.get_project(path=datadir / "test-package") 16 | assert proj.type == "wordnet" 17 | assert proj.resource_file() == datadir / "test-package" / "test-wn.xml" 18 | assert proj.readme() == datadir / "test-package" / "README.md" 19 | assert proj.license() == datadir / "test-package" / "LICENSE" 20 | assert proj.citation() == datadir / "test-package" / "citation.bib" 21 | 22 | proj = project.get_project(path=datadir / "mini-lmf-1.0.xml") 23 | assert proj.type == "wordnet" 24 | assert proj.resource_file() == datadir / "mini-lmf-1.0.xml" 25 | assert proj.readme() is None 26 | assert proj.license() is None 27 | assert proj.citation() is None 28 | 29 | 30 | def test_iterpackages(datadir): 31 | # for now, collection.packages() does not return contained resource files 32 | pkg_names = { 33 | pkg.resource_file().name 34 | for pkg in project.iterpackages(datadir) 35 | } 36 | assert "mini-lmf-1.0.xml" not in pkg_names 37 | assert "test-wn.xml" in pkg_names 38 | 39 | # explicitly giving a resource file path works, though 40 | pkg_names = { 41 | pkg.resource_file().name 42 | for pkg in project.iterpackages(datadir / "mini-lmf-1.0.xml") 43 | } 44 | assert "mini-lmf-1.0.xml" in pkg_names 45 | assert "test-wn.xml" not in pkg_names 46 | 47 | 48 | def 
test_compressed_iterpackages(mini_lmf_compressed): 49 | for pkg in project.iterpackages(mini_lmf_compressed): 50 | assert pkg.type == "wordnet" 51 | assert pkg.resource_file().exists() 52 | # ensure cleanup of temporary data 53 | assert not pkg.resource_file().exists() 54 | # ensure original file not deleted 55 | assert mini_lmf_compressed.exists() 56 | -------------------------------------------------------------------------------- /tests/morphy_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | from wn import morphy 6 | 7 | 8 | def test_morphy_uninitialized(): 9 | # An unintialized Morphy isn't very bright, but it starts up 10 | # fast. It relies on the database to filter bad items. 11 | m = morphy.Morphy() 12 | assert m('example', 'n') == {'n': {'example'}} 13 | assert m('examples', 'n') == {'n': {'examples', 'example'}} 14 | assert m('examples', 'v') == {'v': {'examples', 'example', 'exampl'}} 15 | assert m('exemplifying', 'n') == {'n': {'exemplifying'}} 16 | assert m('exemplifying', 'v') == {'v': {'exemplifying', 'exemplify', 'exemplifye'}} 17 | assert m('data', 'n') == {'n': {'data'}} 18 | assert m('datums', 'n') == {'n': {'datums', 'datum'}} # expected false positive 19 | assert m('examples', None) == {None: {'examples'}, 20 | 'n': {'example'}, 21 | 'v': {'example', 'exampl'}} 22 | assert m('exemplifying', None) == {None: {'exemplifying'}, 23 | 'v': {'exemplify', 'exemplifye'}} 24 | assert m('data', None) == {None: {'data'}} 25 | 26 | 27 | @pytest.mark.usefixtures('mini_db') 28 | def test_morphy_initialized(): 29 | w = wn.Wordnet('test-en:1') 30 | m = morphy.Morphy(wordnet=w) 31 | assert m('example', 'n') == {'n': {'example'}} 32 | assert m('examples', 'n') == {'n': {'example'}} 33 | assert m('examples', 'v') == {} 34 | assert m('exemplifying', 'n') == {} 35 | assert m('exemplifying', 'v') == {'v': {'exemplify'}} 36 | assert m('data', 'n') == {'n': {'datum'}} 37 | assert m('datums', 'n') == {'n': {'datum'}} # expected false positive 38 | assert m('examples', None) == {'n': {'example'}} 39 | assert m('exemplifying', None) == {'v': {'exemplify'}} 40 | assert m('data', None) == {'n': {'datum'}} 41 | 42 | 43 | @pytest.mark.usefixtures('mini_db') 44 | def test_issue_154(): 45 | # https://github.com/goodmami/wn/issues/154 46 | w = wn.Wordnet('test-en:1') 47 | assert w.words('exemplifies') == [w.word('test-en-exemplify-v')] 48 | assert w.words('samples') == [] 49 | w = wn.Wordnet('test-en:1', lemmatizer=morphy.Morphy()) 50 | assert w.words('exemplifies') == [w.word('test-en-exemplify-v')] 51 | assert w.words('samples') == [w.word('test-en-sample-n')] 52 | -------------------------------------------------------------------------------- /wn/_util.py: -------------------------------------------------------------------------------- 1 | """Non-public Wn utilities.""" 2 | 3 | from collections.abc import Iterable, Hashable 4 | from typing import TypeVar 5 | from pathlib import Path 6 | import hashlib 7 | from unicodedata import normalize, combining 8 | 9 | 10 | from wn._types import VersionInfo 11 | 12 | 13 | def version_info(version_string: str) -> VersionInfo: 14 | return tuple(map(int, version_string.split('.'))) 15 | 16 | 17 | def is_url(string: str) -> bool: 18 | """Return True if *string* appears to be a URL.""" 19 | # TODO: ETags? 
20 | return any(string.startswith(scheme) 21 | for scheme in ('http://', 'https://')) 22 | 23 | 24 | def is_gzip(path: Path) -> bool: 25 | """Return True if the file at *path* appears to be gzipped.""" 26 | return _inspect_file_signature(path, b'\x1F\x8B') 27 | 28 | 29 | def is_lzma(path: Path) -> bool: 30 | """Return True if the file at *path* appears to be lzma-compressed.""" 31 | return _inspect_file_signature(path, b'\xFD7zXZ\x00') 32 | 33 | 34 | def is_xml(path: Path) -> bool: 35 | """Return True if the file at *path* appears to be an XML file.""" 36 | return _inspect_file_signature(path, b' bool: 40 | if path.is_file(): 41 | with path.open('rb') as f: 42 | return f.read(len(signature)) == signature 43 | return False 44 | 45 | 46 | def short_hash(string: str) -> str: 47 | """Return a short hash of *string*.""" 48 | b2 = hashlib.blake2b(digest_size=20) 49 | b2.update(string.encode('utf-8')) 50 | return b2.hexdigest() 51 | 52 | 53 | T = TypeVar('T') 54 | 55 | 56 | def flatten(iterable: Iterable[Iterable[T]]) -> list[T]: 57 | return [x for xs in iterable for x in xs] 58 | 59 | 60 | H = TypeVar('H', bound=Hashable) 61 | 62 | 63 | def unique_list(items: Iterable[H]) -> list[H]: 64 | # use a dictionary as an order-preserving set 65 | targets = {item: True for item in items} 66 | return list(targets) 67 | 68 | 69 | def normalize_form(s: str) -> str: 70 | return ''.join(c for c in normalize('NFKD', s.lower()) if not combining(c)) 71 | 72 | 73 | def format_lexicon_specifier(id: str, version: str) -> str: 74 | return f"{id}:{version}" 75 | 76 | 77 | def split_lexicon_specifier(lexicon: str) -> tuple[str, str]: 78 | id, _, ver = lexicon.partition(":") 79 | return id, ver 80 | -------------------------------------------------------------------------------- /docs/api/wn.taxonomy.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.taxonomy 3 | =========== 4 | 5 | .. automodule:: wn.taxonomy 6 | 7 | 8 | Overview 9 | -------- 10 | 11 | Among the valid synset relations for wordnets (see 12 | :data:`wn.constants.SYNSET_RELATIONS`), those used for describing 13 | *is-a* `taxonomies `_ are 14 | given special treatment and they are generally the most 15 | well-developed relations in any wordnet. Typically these are the 16 | ``hypernym`` and ``hyponym`` relations, which encode *is-a-type-of* 17 | relationships (e.g., a *hermit crab* is a type of *decapod*, which is 18 | a type of *crustacean*, etc.). They also include ``instance_hypernym`` 19 | and ``instance_hyponym``, which encode *is-an-instance-of* 20 | relationships (e.g., *Oregon* is an instance of *American state*). 21 | 22 | The taxonomy forms a multiply-inheriting hierarchy with the synsets as 23 | nodes. In the English wordnets, such as the Princeton WordNet and its 24 | derivatives, nearly all nominal synsets form such a hierarchy with 25 | single root node, while verbal synsets form many smaller hierarchies 26 | without a common root. Other wordnets may have different properties, 27 | but as many are based off of the Princeton WordNet, they tend to 28 | follow this structure. 29 | 30 | Functions to find paths within the taxonomies form the basis of all 31 | :mod:`wordnet similarity measures `. For instance, the 32 | :ref:`leacock-chodorow-similarity` measure uses both 33 | :func:`shortest_path` and (indirectly) :func:`taxonomy_depth`. 
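For illustration, a minimal sketch of these synset-level functions (assuming a
lexicon such as ``oewn:2021`` has already been downloaded and added; the exact
synsets returned depend on the installed data):

.. code-block:: python

    import wn
    from wn import taxonomy

    dog = wn.synsets('dog')[0]
    cat = wn.synsets('cat')[0]
    # synsets on the shortest path connecting the two
    path = taxonomy.shortest_path(dog, cat)
    # their most specific shared ancestors in the hierarchy
    common = taxonomy.lowest_common_hypernyms(dog, cat)
    # all hypernym paths from the first synset up to a root
    paths = taxonomy.hypernym_paths(dog)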
34 | 35 | 36 | Wordnet-level Functions 37 | ----------------------- 38 | 39 | Root and leaf synsets in the taxonomy are those with no ancestors 40 | (``hypernym``, ``instance_hypernym``, etc.) or hyponyms (``hyponym``, 41 | ``instance_hyponym``, etc.), respectively. 42 | 43 | Finding root and leaf synsets 44 | ''''''''''''''''''''''''''''' 45 | 46 | .. autofunction:: roots 47 | .. autofunction:: leaves 48 | 49 | Computing the taxonomy depth 50 | '''''''''''''''''''''''''''' 51 | 52 | The taxonomy depth is the maximum depth from a root node to a leaf 53 | node within synsets for a particular part of speech. 54 | 55 | .. autofunction:: taxonomy_depth 56 | 57 | 58 | Synset-level Functions 59 | ---------------------- 60 | 61 | .. autofunction:: hypernym_paths 62 | .. autofunction:: min_depth 63 | .. autofunction:: max_depth 64 | .. autofunction:: shortest_path 65 | .. autofunction:: common_hypernyms 66 | .. autofunction:: lowest_common_hypernyms 67 | -------------------------------------------------------------------------------- /tests/db_test.py: -------------------------------------------------------------------------------- 1 | 2 | import sqlite3 3 | import threading 4 | import tempfile 5 | 6 | import pytest 7 | 8 | import wn 9 | from wn import lmf 10 | 11 | 12 | @pytest.mark.usefixtures('mini_db') 13 | def test_schema_compatibility(): 14 | conn = sqlite3.connect(str(wn.config.database_path)) 15 | schema_hash = wn._db.schema_hash(conn) 16 | assert schema_hash in wn._db.COMPATIBLE_SCHEMA_HASHES 17 | 18 | 19 | @pytest.mark.usefixtures('mini_db') 20 | def test_db_multithreading(): 21 | """ 22 | See https://github.com/goodmami/wn/issues/86 23 | Thanks: @fushinari 24 | """ 25 | 26 | class WNThread: 27 | w = None 28 | 29 | def __init__(self): 30 | w_thread = threading.Thread(target=self.set_w) 31 | w_thread.start() 32 | w_thread.join() 33 | self.w.synsets() 34 | 35 | def set_w(self): 36 | if self.w is None: 37 | self.w = wn.Wordnet() 38 | 39 | # close the connections by resetting the pool 40 | wn._db.pool = {} 41 | with pytest.raises(sqlite3.ProgrammingError): 42 | WNThread() 43 | wn._db.pool = {} 44 | wn.config.allow_multithreading = True 45 | WNThread() # no error 46 | wn.config.allow_multithreading = False 47 | wn._db.pool = {} 48 | 49 | 50 | def test_remove_extension(datadir): 51 | with tempfile.TemporaryDirectory('wn_data_1_1_trigger') as dir: 52 | old_data_dir = wn.config.data_directory 53 | wn.config.data_directory = dir 54 | wn.add(datadir / 'mini-lmf-1.0.xml') 55 | wn.add(datadir / 'mini-lmf-1.1.xml') 56 | assert len(wn.lexicons()) == 4 57 | wn.remove('test-en-ext') 58 | assert len(wn.lexicons()) == 3 59 | wn.remove('test-ja') 60 | assert len(wn.lexicons()) == 2 61 | wn.add(datadir / 'mini-lmf-1.1.xml') 62 | assert len(wn.lexicons()) == 4 63 | wn.remove('test-en') 64 | assert {lex.id for lex in wn.lexicons()} == {'test-es', 'test-ja'} 65 | wn.config.data_directory = old_data_dir 66 | # close any open DB connections before teardown 67 | for conn in wn._db.pool.values(): 68 | conn.close() 69 | 70 | 71 | def test_add_lexical_resource(datadir): 72 | with tempfile.TemporaryDirectory('wn_data_add_lexical_resource') as dir: 73 | old_data_dir = wn.config.data_directory 74 | wn.config.data_directory = dir 75 | wn.add_lexical_resource(lmf.load(datadir / 'mini-lmf-1.0.xml')) 76 | assert len(wn.lexicons()) == 2 77 | wn.add_lexical_resource(lmf.load(datadir / 'mini-lmf-1.1.xml')) 78 | assert len(wn.lexicons()) == 4 79 | wn.config.data_directory = old_data_dir 80 | # close any open DB connections 
before teardown 81 | for conn in wn._db.pool.values(): 82 | conn.close() 83 | 84 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | dynamic = ['version'] 7 | 8 | name = "wn" 9 | description = "Wordnet interface library" 10 | readme = "README.md" 11 | requires-python = ">=3.9" 12 | license = {file = "LICENSE"} 13 | keywords = ["wordnet", "interlingual", "linguistics", "language", "library"] 14 | authors = [ 15 | {name = "Michael Wayne Goodman", email = "goodman.m.w@gmail.com"} 16 | ] 17 | classifiers = [ 18 | "Development Status :: 4 - Beta", 19 | "Environment :: Console", 20 | "Intended Audience :: Developers", 21 | "Intended Audience :: Information Technology", 22 | "Intended Audience :: Science/Research", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12", 29 | "Programming Language :: Python :: 3.13", 30 | "Topic :: Scientific/Engineering :: Information Analysis", 31 | "Topic :: Software Development :: Libraries :: Python Modules", 32 | "Topic :: Text Processing :: Linguistic", 33 | ] 34 | 35 | dependencies = [ 36 | "httpx", 37 | "tomli", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | web = [ 42 | "starlette", 43 | ] 44 | editor = [ 45 | "wn-editor" 46 | ] 47 | 48 | [project.urls] 49 | homepage = "https://github.com/goodmami/wn" 50 | documentation = "https://wn.readthedocs.io" 51 | changelog = "https://github.com/goodmami/wn/blob/main/CHANGELOG.md" 52 | 53 | [tool.hatch.version] 54 | path = "wn/__init__.py" 55 | 56 | [tool.hatch.build.targets.sdist] 57 | exclude = [ 58 | "/.github", 59 | ] 60 | 61 | [tool.hatch.envs.hatch-test] 62 | extra-dependencies = [ 63 | "pytest-benchmark", 64 | ] 65 | features = ["web"] 66 | 67 | [tool.hatch.envs.mypy] 68 | dependencies = [ 69 | "mypy", 70 | ] 71 | 72 | [tool.hatch.envs.mypy.scripts] 73 | check = "mypy wn/" 74 | 75 | [tool.hatch.envs.docs] 76 | dependencies = [ 77 | "wn[web]", 78 | "furo", 79 | "sphinx", 80 | "sphinx-copybutton", 81 | "sphinx-autobuild", 82 | ] 83 | 84 | [tool.hatch.envs.docs.scripts] 85 | build = "sphinx-build -M html docs docs/_build" 86 | clean = "sphinx-build -M clean docs docs/_build" 87 | watch = "sphinx-autobuild docs docs/_build/html" 88 | 89 | [tool.ruff] 90 | target-version = "py39" 91 | line-length = 88 92 | 93 | [tool.ruff.lint] 94 | select = [ 95 | "B", # flake8-bugbear 96 | "C90", # McCabe cyclomatic complexity 97 | "E", # pycodestyle 98 | "F", # Pyflakes 99 | "W", # pycodestyle 100 | ] 101 | 102 | [tool.ruff.lint.per-file-ignores] 103 | "docs/conf.py" = ["E402"] 104 | 105 | [tool.ruff.format] 106 | quote-style = "single" 107 | -------------------------------------------------------------------------------- /tests/data/mini-lmf-1.4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 
-------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | 2 | import lzma 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | import wn 9 | 10 | 11 | @pytest.fixture(scope='session') 12 | def datadir(): 13 | return Path(__file__).parent / 'data' 14 | 15 | 16 | @pytest.fixture 17 | def uninitialized_datadir(monkeypatch, tmp_path: Path): 18 | with monkeypatch.context() as m: 19 | m.setattr(wn.config, 'data_directory', tmp_path / 'uninitialized_datadir') 20 | yield 21 | 22 | 23 | @pytest.fixture(scope='session') 24 | def empty_db(): 25 | with tempfile.TemporaryDirectory('wn_data_empty') as dir: 26 | with pytest.MonkeyPatch.context() as m: 27 | m.setattr(wn.config, 'data_directory', dir) 28 | yield 29 | 30 | 31 | # We want to build these DBs once per session, but connections 32 | # are created once for every test. 33 | 34 | @pytest.fixture(scope='session') 35 | def mini_db_dir(datadir): 36 | with tempfile.TemporaryDirectory('wn_data_mini') as dir: 37 | with pytest.MonkeyPatch.context() as m: 38 | m.setattr(wn.config, 'data_directory', dir) 39 | wn.add(datadir / 'mini-lmf-1.0.xml') 40 | wn._db.clear_connections() 41 | 42 | yield Path(dir) 43 | 44 | 45 | @pytest.fixture 46 | def mini_lmf_compressed(datadir): 47 | data = (datadir / 'mini-lmf-1.0.xml').read_bytes() 48 | with tempfile.NamedTemporaryFile(suffix='.xml.xz', delete=False) as file: 49 | path = Path(file.name) 50 | # Windows cannot reliably reopen file until it's closed 51 | with lzma.open(path, "w") as f: 52 | f.write(data) 53 | try: 54 | yield Path(file.name) 55 | finally: 56 | Path(file.name).unlink() 57 | 58 | 59 | @pytest.fixture(scope='session') 60 | def mini_db_1_1_dir(datadir): 61 | with tempfile.TemporaryDirectory('wn_data_mini_1_1') as dir: 62 | with pytest.MonkeyPatch.context() as m: 63 | m.setattr(wn.config, 'data_directory', dir) 64 | wn.add(datadir / 'mini-lmf-1.0.xml') 65 | wn.add(datadir / 'mini-lmf-1.1.xml') 66 | wn._db.clear_connections() 67 | 68 | yield Path(dir) 69 | 70 | 71 | @pytest.fixture(scope='session') 72 | def mini_db_1_4_dir(datadir): 73 | with tempfile.TemporaryDirectory('wn_data_mini_1_4') as dir: 74 | with pytest.MonkeyPatch.context() as m: 75 | m.setattr(wn.config, 'data_directory', dir) 76 | wn.add(datadir / 'mini-lmf-1.4.xml') 77 | wn._db.clear_connections() 78 | 79 | yield Path(dir) 80 | 81 | 82 | @pytest.fixture 83 | def mini_db(monkeypatch, mini_db_dir): 84 | with monkeypatch.context() as m: 85 | m.setattr(wn.config, 'data_directory', mini_db_dir) 86 | yield 87 | wn._db.clear_connections() 88 | 89 | 90 | @pytest.fixture 91 | def mini_db_1_1(monkeypatch, mini_db_1_1_dir): 92 | with monkeypatch.context() as m: 93 | m.setattr(wn.config, 'data_directory', mini_db_1_1_dir) 94 | yield 95 | wn._db.clear_connections() 96 | 97 | 98 | @pytest.fixture 99 | def mini_db_1_4(monkeypatch, mini_db_1_4_dir): 100 | with monkeypatch.context() as m: 101 | m.setattr(wn.config, 'data_directory', mini_db_1_4_dir) 102 | yield 103 | wn._db.clear_connections() 104 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Wn 2 | 3 | Thanks for helping to make Wn better! 
4 | 5 | **Quick Links:** 6 | 7 | - [Report a bug or request a feature](https://github.com/goodmami/wn/issues/new) 8 | - [Ask a question](https://github.com/goodmami/wn/discussions) 9 | - [View documentation](https://wn.readthedocs.io/) 10 | 11 | **Developer Information:** 12 | 13 | - Versioning scheme: [Semantic Versioning](https://semver.org/) 14 | - Branching scheme: [GitHub Flow](https://guides.github.com/introduction/flow/) 15 | - Changelog: [keep a changelog](https://keepachangelog.com/en/1.0.0/) 16 | - Documentation framework: [Sphinx](https://www.sphinx-doc.org/) 17 | - Docstring style: [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (via [sphinx.ext.napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html)) 18 | - Unit/regression testing: [pytest](https://pytest.org/) 19 | - Benchmarking: [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) 20 | - Packaging framework: [Hatch](https://hatch.pypa.io/) 21 | - Coding style: [PEP-8](https://www.python.org/dev/peps/pep-0008/) (via [Ruff](https://beta.ruff.rs/docs/)) 22 | - Type checking: [Mypy](http://mypy-lang.org/) 23 | 24 | 25 | ## Get Help 26 | 27 | Confused about wordnets in general? See the [Global Wordnet 28 | Association Documentation](https://globalwordnet.github.io/gwadoc/) 29 | 30 | Confused about using Wn or wish to share some tips? [Start a 31 | discussion](https://github.com/goodmami/wn/discussions) 32 | 33 | Encountering a problem with Wn or wish to propose a new feature? [Raise an 34 | issue](https://github.com/goodmami/wn/issues/new) 35 | 36 | 37 | ## Report a Bug 38 | 39 | When reporting a bug, please provide enough information for someone to 40 | reproduce the problem. This might include the version of Python you're 41 | running, the version of Wn you have installed, the wordnet lexicons 42 | you have installed, and possibly the platform (Linux, Windows, macOS) 43 | you're on. Please give a minimal working example that illustrates the 44 | problem. For example: 45 | 46 | > I'm using Wn 0.9.5 with Python 3.11 on Linux and [description of 47 | > problem...]. Here's what I have tried: 48 | > 49 | > ```pycon 50 | > >>> import wn 51 | > >>> # some code 52 | > ... # some result or error 53 | > ``` 54 | 55 | 56 | ## Request a Feature 57 | 58 | If there's a feature that you think would make a good addition to Wn, 59 | raise an issue describing what the feature is and what problems it 60 | would address. 61 | 62 | ## Guidelines for Contributing 63 | 64 | See the "developer information" above for a brief description of 65 | guidelines and conventions used in Wn. If you have a fix, please 66 | submit a pull request to the `main` branch. In general, every pull 67 | request should have an associated issue. 68 | 69 | Developers should run and test Wn locally from source using 70 | [Hatch](https://hatch.pypa.io/). 
Hatch may be installed 71 | system-wide or within a virtual environment: 72 | 73 | ```bash 74 | $ pip install hatch 75 | ``` 76 | 77 | You can then use the `hatch` commands like the following: 78 | 79 | ```console 80 | $ hatch shell # activate a Wn virtual environment 81 | $ hatch fmt --check # lint the code and check code style 82 | $ hatch run mypy:check # type check with mypy 83 | $ hatch test # run unit tests 84 | $ hatch test bench # run benchmarks 85 | $ hatch build # build a source distribution and wheel 86 | $ hatch publish # publish build artifacts to PyPI 87 | ``` 88 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ====================== 3 | 4 | Some of Wn's functionality is exposed via the command line. 5 | 6 | Global Options 7 | -------------- 8 | 9 | .. option:: -d DIR, --dir DIR 10 | 11 | Change to use ``DIR`` as the data directory prior to invoking any 12 | commands. 13 | 14 | 15 | Subcommands 16 | ----------- 17 | 18 | download 19 | -------- 20 | 21 | Download and add projects to the database given one or more project 22 | specifiers or URLs. 23 | 24 | .. code-block:: console 25 | 26 | $ python -m wn download oewn:2021 omw:1.4 cili 27 | $ python -m wn download https://en-word.net/static/english-wordnet-2021.xml.gz 28 | 29 | .. option:: --index FILE 30 | 31 | Use the index at ``FILE`` to resolve project specifiers. 32 | 33 | .. code-block:: console 34 | 35 | $ python -m wn download --index my-index.toml mywn 36 | 37 | .. option:: --no-add 38 | 39 | Download and cache the remote file, but don't add it to the 40 | database. 41 | 42 | 43 | lexicons 44 | -------- 45 | 46 | The ``lexicons`` subcommand lets you quickly see what is installed: 47 | 48 | .. code-block:: console 49 | 50 | $ python -m wn lexicons 51 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 52 | omw-sk 1.4 [sk] Slovak WordNet 53 | omw-pl 1.4 [pl] plWordNet 54 | omw-is 1.4 [is] IceWordNet 55 | omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian) 56 | omw-sl 1.4 [sl] sloWNet 57 | omw-ja 1.4 [ja] Japanese Wordnet 58 | ... 59 | 60 | .. option:: -l LG, --lang LG 61 | .. option:: --lexicon SPEC 62 | 63 | The ``--lang`` or ``--lexicon`` option can help you narrow down 64 | the results: 65 | 66 | .. code-block:: console 67 | 68 | $ python -m wn lexicons --lang en 69 | oewn 2021 [en] Open English WordNet 70 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 71 | $ python -m wn lexicons --lexicon "omw-*" 72 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 73 | omw-sk 1.4 [sk] Slovak WordNet 74 | omw-pl 1.4 [pl] plWordNet 75 | omw-is 1.4 [is] IceWordNet 76 | omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian) 77 | 78 | 79 | projects 80 | -------- 81 | 82 | The ``projects`` subcommand lists all known projects in Wn's 83 | index. This is helpful to see what is available for downloading. 84 | 85 | .. 
code-block:: 86 | 87 | $ python -m wn projects 88 | ic cili 1.0 [---] Collaborative Interlingual Index 89 | ic oewn 2024 [en] Open English WordNet 90 | ic oewn 2023 [en] Open English WordNet 91 | ic oewn 2022 [en] Open English WordNet 92 | ic oewn 2021 [en] Open English WordNet 93 | ic ewn 2020 [en] Open English WordNet 94 | ic ewn 2019 [en] Open English WordNet 95 | i- odenet 1.4 [de] Open German WordNet 96 | ic odenet 1.3 [de] Open German WordNet 97 | ic omw 1.4 [mul] Open Multilingual Wordnet 98 | ic omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 99 | ... 100 | 101 | 102 | validate 103 | -------- 104 | 105 | Given a path to a WN-LMF XML file, check the file for structural 106 | problems and print a report. 107 | 108 | .. code-block:: 109 | 110 | $ python -m wn validate english-wordnet-2021.xml 111 | 112 | .. option:: --select CHECKS 113 | 114 | Run the checks with the given comma-separated list of check codes 115 | or categories. 116 | 117 | .. code-block:: 118 | 119 | $ python -m wn validate --select E W201 W204 deWordNet.xml 120 | 121 | .. option:: --output-file FILE 122 | 123 | Write the report to FILE as a JSON object instead of printing the 124 | report to stdout. 125 | -------------------------------------------------------------------------------- /.github/workflows/publish-docker.yaml: -------------------------------------------------------------------------------- 1 | # Adapted from https://docs.github.com/en/actions/tutorials/publishing-packages/publishing-docker-images 2 | name: Publish a Docker image 3 | 4 | # Configures this workflow to run every time a new release is created in the repository. 5 | on: 6 | release: 7 | types: [ created ] 8 | 9 | # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. 15 | jobs: 16 | build-and-push-image: 17 | runs-on: ubuntu-latest 18 | # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 19 | permissions: 20 | contents: read 21 | packages: write 22 | attestations: write 23 | id-token: write 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v4 28 | # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. 29 | - name: Log in to the Container registry 30 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 31 | with: 32 | registry: ${{ env.REGISTRY }} 33 | username: ${{ github.actor }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. 36 | - name: Extract metadata (tags, labels) for Docker 37 | id: meta 38 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 39 | with: 40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 41 | # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. 
If the build succeeds, it pushes the image to GitHub Packages. 42 |       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see [Usage](https://github.com/docker/build-push-action#usage) in the README of the `docker/build-push-action` repository. 43 |       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 44 |       - name: Build and push Docker image 45 |         id: push 46 |         uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 47 |         with: 48 |           context: . 49 |           push: true 50 |           tags: ${{ steps.meta.outputs.tags }} 51 |           labels: ${{ steps.meta.outputs.labels }} 52 | 53 |       # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [Using artifact attestations to establish provenance for builds](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds). 54 |       - name: Generate artifact attestation 55 |         uses: actions/attest-build-provenance@v2 56 |         with: 57 |           subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} 58 |           subject-digest: ${{ steps.push.outputs.digest }} 59 |           push-to-registry: true 60 | 61 | -------------------------------------------------------------------------------- /docs/api/wn.morphy.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.morphy 3 | ========= 4 | 5 | .. automodule:: wn.morphy 6 | 7 | .. seealso:: 8 | 9 |    The Princeton WordNet `documentation 10 |    `_ describes 11 |    the original implementation of Morphy. 12 | 13 |    The :doc:`../guides/lemmatization` guide describes how Wn handles 14 |    lemmatization in general. 15 | 16 | 17 | Initialized and Uninitialized Morphy 18 | ------------------------------------ 19 | 20 | There are two ways of using Morphy in Wn: initialized and 21 | uninitialized. 22 | 23 | Uninitialized Morphy is a simple callable that returns lemma 24 | *candidates* for some given wordform. That is, the results might not 25 | be valid lemmas, but this is not a problem in practice because 26 | subsequent queries against the database will filter out the invalid 27 | ones. This callable is obtained by creating a :class:`Morphy` object 28 | with no arguments: 29 | 30 | >>> from wn import morphy 31 | >>> m = morphy.Morphy() 32 | 33 | As an uninitialized Morphy cannot predict which lemmas in the result 34 | are valid, it always returns the original form and any transformations 35 | it can find for each part of speech: 36 | 37 | >>> m('lemmata', pos='n')  # exceptional form 38 | {'n': {'lemmata'}} 39 | >>> m('lemmas', pos='n')  # regular morphology with part-of-speech 40 | {'n': {'lemma', 'lemmas'}} 41 | >>> m('lemmas')  # regular morphology for any part-of-speech 42 | {None: {'lemmas'}, 'n': {'lemma'}, 'v': {'lemma'}} 43 | >>> m('wolves')  # invalid forms may be returned 44 | {None: {'wolves'}, 'n': {'wolf', 'wolve'}, 'v': {'wolve', 'wolv'}} 45 | 46 | 47 | This lemmatizer can also be used with a :class:`wn.Wordnet` object to 48 | expand queries: 49 | 50 | >>> import wn 51 | >>> ewn = wn.Wordnet('ewn:2020') 52 | >>> ewn.words('lemmas') 53 | [] 54 | >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=morphy.Morphy()) 55 | >>> ewn.words('lemmas') 56 | [Word('ewn-lemma-n')] 57 | 58 | An initialized Morphy is created with a :class:`wn.Wordnet` object as 59 | its argument. 
It then uses the wordnet to build lists of valid lemmas 60 | and exceptional forms (this takes a few seconds). Once this is done, 61 | it will only return lemmas it knows about: 62 | 63 | >>> ewn = wn.Wordnet('ewn:2020') 64 | >>> m = morphy.Morphy(ewn) 65 | >>> m('lemmata', pos='n') # exceptional form 66 | {'n': {'lemma'}} 67 | >>> m('lemmas', pos='n') # regular morphology with part-of-speech 68 | {'n': {'lemma'}} 69 | >>> m('lemmas') # regular morphology for any part-of-speech 70 | {'n': {'lemma'}} 71 | >>> m('wolves') # invalid forms are pre-filtered 72 | {'n': {'wolf'}} 73 | 74 | In order to use an initialized Morphy lemmatizer with a 75 | :class:`wn.Wordnet` object, it must be assigned to the object after 76 | creation: 77 | 78 | >>> ewn = wn.Wordnet('ewn:2020') # default: lemmatizer=None 79 | >>> ewn.words('lemmas') 80 | [] 81 | >>> ewn.lemmatizer = morphy.Morphy(ewn) 82 | >>> ewn.words('lemmas') 83 | [Word('ewn-lemma-n')] 84 | 85 | There is little to no difference in the results obtained from a 86 | :class:`wn.Wordnet` object using an initialized or uninitialized 87 | :class:`Morphy` object, but there may be slightly different 88 | performance profiles for future queries. 89 | 90 | 91 | Default Morphy Lemmatizer 92 | ------------------------- 93 | 94 | As a convenience, an uninitialized Morphy lemmatizer is provided in 95 | this module via the :data:`morphy` member. 96 | 97 | .. data:: morphy 98 | 99 | A :class:`Morphy` object created without a :class:`wn.Wordnet` 100 | object. 101 | 102 | 103 | The Morphy Class 104 | ---------------- 105 | 106 | .. autoclass:: Morphy 107 | -------------------------------------------------------------------------------- /tests/compat_sensekey_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import wn 4 | from wn.compat import sensekey 5 | 6 | 7 | def test_unescape_oewn_sense_key(): 8 | 9 | def unescape(s: str) -> str: 10 | return sensekey.unescape(s, flavor="oewn") 11 | 12 | assert unescape("") == "" 13 | assert unescape("abc") == "abc" 14 | assert unescape(".") == "." # only becomes : in second part of key 15 | # escape patterns 16 | assert unescape("-ap-") == "'" 17 | assert unescape("-ex-") == "!" 18 | assert unescape("-cm-") == "," 19 | assert unescape("-cn-") == ":" 20 | assert unescape("-pl-") == "+" 21 | assert unescape("-sl-") == "/" 22 | # adjacent escapes need their own dashes 23 | assert unescape("-ap-ex-") == "'ex-" 24 | assert unescape("-ap--ex-") == "'!" 25 | # invalid escapes are unchanged 26 | assert unescape("-foo-") == "-foo-" # not an escape sequence 27 | assert unescape("-sp-") == "-sp-" # not valid in lemma portion 28 | assert unescape("ap-") == "ap-" # no preceding dash 29 | assert unescape("-ap") == "-ap" # no trailing dash 30 | assert unescape("-AP-") == "-AP-" # case sensitivity 31 | # idempotency 32 | assert unescape(unescape("-ap--ex--cm-")) == unescape("-ap--ex--cm-") 33 | # full key, second part escapes differently 34 | assert unescape("abc__1.23.00..") == "abc%1:23:00::" 35 | assert unescape("abc__1.23.00.foo-sp-bar.") == "abc%1:23:00:foo_bar:" 36 | assert unescape("abc__1.23.00.foo-ap-bar.") == "abc%1:23:00:foo-ap-bar:" 37 | 38 | 39 | def test_escape_oewn_sense_key(): 40 | 41 | def escape(s: str) -> str: 42 | return sensekey.escape(s, flavor="oewn") 43 | 44 | assert escape("") == "" 45 | assert escape("abc") == "abc" 46 | assert escape(".") == "." 
# only becomes : in second part of key 47 | # escape patterns 48 | assert escape("'") == "-ap-" 49 | assert escape("!") == "-ex-" 50 | assert escape(",") == "-cm-" 51 | assert escape(":") == "-cn-" 52 | assert escape("+") == "-pl-" 53 | assert escape("/") == "-sl-" 54 | # adjacent escapes need their own dashes 55 | assert escape("'!") == "-ap--ex-" 56 | # idempotency 57 | assert escape(escape("'!,")) == escape("'!,") 58 | # full key, second part escapes differently 59 | assert escape("abc%1:23:00::") == "abc__1.23.00.." 60 | assert escape("abc%1:23:00:foo_bar:") == "abc__1.23.00.foo-sp-bar." 61 | assert escape("abc%1:23:00:foo'bar:") == "abc__1.23.00.foo'bar." 62 | 63 | 64 | @pytest.mark.usefixtures("uninitialized_datadir") 65 | def test_sense_key_getter(datadir): 66 | wn.add(datadir / "sense-key-variations.xml") 67 | 68 | get_omw_sense_key = sensekey.sense_key_getter("omw-en:1.4") 69 | get_oewn_sense_key = sensekey.sense_key_getter("oewn:2024") 70 | 71 | omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4") 72 | oewn_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024") 73 | 74 | assert get_omw_sense_key(omw_sense) == "'s_gravenhage%1:15:00::" 75 | assert get_omw_sense_key(oewn_sense) is None 76 | 77 | assert get_oewn_sense_key(omw_sense) is None 78 | assert get_oewn_sense_key(oewn_sense) == "'s_gravenhage%1:15:00::" 79 | 80 | 81 | @pytest.mark.usefixtures("uninitialized_datadir") 82 | def test_sense_getter(datadir): 83 | wn.add(datadir / "sense-key-variations.xml") 84 | 85 | get_omw_sense = sensekey.sense_getter("omw-en:1.4") 86 | get_oewn_sense = sensekey.sense_getter("oewn:2024") 87 | 88 | omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4") 89 | oewn_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024") 90 | 91 | assert get_omw_sense("'s_gravenhage%1:15:00::") == omw_sense 92 | assert get_oewn_sense("'s_gravenhage%1:15:00::") == oewn_sense 93 | -------------------------------------------------------------------------------- /docs/setup.rst: -------------------------------------------------------------------------------- 1 | Installation and Configuration 2 | ============================== 3 | 4 | .. seealso:: 5 | 6 | This guide is for installing and configuring the Wn software. For 7 | adding lexicons to the database, see :doc:`guides/lexicons`. 8 | 9 | 10 | Installing from PyPI 11 | -------------------- 12 | 13 | Install the latest release from `PyPI `_: 14 | 15 | .. code-block:: bash 16 | 17 | pip install wn 18 | 19 | To get the dependencies for the :mod:`wn.web` module, use the ``web`` 20 | installation extra: 21 | 22 | .. code-block:: bash 23 | 24 | pip install "wn[web]" 25 | 26 | 27 | Installing with Conda 28 | --------------------- 29 | 30 | Alternatively, if you use the `Anaconda ` 31 | distribution of Python, you can install with conda: 32 | 33 | .. code-block:: bash 34 | 35 | conda install -c conda-forge wn 36 | 37 | 38 | The Data Directory 39 | ------------------ 40 | 41 | By default, Wn stores its data (such as downloaded LMF files and the 42 | database file) in a ``.wn_data/`` directory under the user's home 43 | directory. This directory can be changed (see `Configuration`_ 44 | below). Whenever Wn attempts to download a resource or access its 45 | database, it will check for the existence of, and create if necessary, 46 | this directory, the ``.wn_data/downloads/`` subdirectory, and the 47 | ``.wn_data/wn.db`` database file. 
The file system will look like 48 | this:: 49 | 50 | .wn_data/ 51 | ├── downloads 52 | │   ├── ... 53 | │   └── ... 54 | └── wn.db 55 | 56 | The ``...`` entries in the ``downloads/`` subdirectory represent the 57 | files of resources downloaded from the web. Their filename is a hash 58 | of the URL so that Wn can avoid downloading the same file twice. 59 | 60 | 61 | Configuration 62 | ------------- 63 | 64 | The :py:data:`wn.config` object contains the paths Wn uses for local 65 | storage and information about resources available on the web. To 66 | change the directory Wn uses for storing data locally, modify the 67 | :python:`wn.config.data_directory` member: 68 | 69 | .. code-block:: python 70 | 71 | import wn 72 | wn.config.data_directory = '~/Projects/wn_data' 73 | 74 | There are some things to note: 75 | 76 | - The downloads directory and database path are always relative to the 77 | data directory and cannot be changed directly. 78 | - This change only affects subsequent operations, so any data in the 79 | previous location will not be moved nor deleted. 80 | - This change only affects the current session. If you want a script 81 | or application to always use the new location, it must reset the 82 | data directory each time it is initialized. 83 | 84 | You can also add project information for remote resources. First you 85 | add a project, with a project ID, full name, and language code. Then 86 | you create one or more versions for that project with a version ID, 87 | resource URL, and license information. This may be done either through 88 | the :py:data:`wn.config` object's 89 | :py:meth:`~wn._config.WNConfig.add_project` and 90 | :py:meth:`~wn._config.WNConfig.add_project_version` methods, or loaded 91 | from a TOML_ file via the :py:data:`wn.config` object's 92 | :py:meth:`~wn._config.WNConfig.load_index` method. 93 | 94 | .. _TOML: https://toml.io 95 | 96 | .. code-block:: python 97 | 98 | wn.config.add_project('ewn', 'English WordNet', 'en') 99 | wn.config.add_project_version( 100 | 'ewn', '2020', 101 | 'https://en-word.net/static/english-wordnet-2020.xml.gz', 102 | 'https://creativecommons.org/licenses/by/4.0/', 103 | ) 104 | 105 | 106 | Installing From Source 107 | ---------------------- 108 | 109 | If you wish to install the code from the source repository (e.g., to 110 | get an unreleased feature or to contribute toward Wn's development), 111 | clone the repository and use `Hatch `_ to 112 | start a virtual environment with Wn installed: 113 | 114 | .. 
code-block:: console 115 | 116 | $ git clone https://github.com/goodmami/wn.git 117 | $ cd wn 118 | $ hatch shell 119 | -------------------------------------------------------------------------------- /tests/wordnet_test.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import warnings 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | import wn 8 | 9 | 10 | @pytest.mark.usefixtures('mini_db_1_1') 11 | def test_wordnet_lexicons(): 12 | en = wn.Wordnet('test-en') 13 | assert len(en.lexicons()) == 1 14 | assert len(en.expanded_lexicons()) == 0 15 | 16 | en1 = wn.Wordnet('test-en:1') 17 | assert en.lexicons() == en1.lexicons() 18 | assert en.expanded_lexicons() == en1.expanded_lexicons() 19 | 20 | en2 = wn.Wordnet(lang='en') 21 | assert len(en2.lexicons()) == 2 22 | assert len(en2.expanded_lexicons()) == 0 23 | 24 | es = wn.Wordnet('test-es') 25 | assert len(es.lexicons()) == 1 26 | assert len(es.expanded_lexicons()) == 0 27 | 28 | es2 = wn.Wordnet('test-es', expand='test-en') 29 | assert len(es2.lexicons()) == 1 30 | assert len(es2.expanded_lexicons()) == 1 31 | 32 | ja = wn.Wordnet('test-ja') 33 | assert len(ja.lexicons()) == 1 34 | assert len(ja.expanded_lexicons()) == 1 35 | 36 | ja2 = wn.Wordnet('test-ja', expand='') 37 | assert len(ja2.lexicons()) == 1 38 | assert len(ja2.expanded_lexicons()) == 0 39 | 40 | 41 | @pytest.mark.usefixtures('mini_db') 42 | def test_wordnet_normalize(): 43 | es = wn.Wordnet('test-es') 44 | assert es.words('Informacion') == es.words('información') 45 | assert es.words('ínfórmácíón') == es.words('información') 46 | es = wn.Wordnet('test-es', normalizer=None) 47 | assert es.words('informacion') == [] 48 | assert es.words('Información') == [] 49 | 50 | # The following doesn't necessarily work because any non-None 51 | # normalizer causes the normalized form column to be tested with 52 | # the original form 53 | # es = wn.Wordnet('test-es', normalizer=str.lower) 54 | # assert es.words('informacion') == [] 55 | # assert es.words('Información') == es.words('información') 56 | 57 | 58 | @pytest.mark.usefixtures('mini_db') 59 | def test_wordnet_lemmatize(): 60 | # default lemmatizer compares alternative forms 61 | en = wn.Wordnet('test-en') 62 | assert en.words('examples') == [] 63 | assert en.words('exemplifying') == en.words('exemplify') 64 | assert en.words('data') == en.words('datum') 65 | 66 | en = wn.Wordnet('test-en', search_all_forms=False) 67 | assert en.words('examples') == [] 68 | assert en.words('exemplifying') == [] 69 | assert en.words('data') == [] 70 | 71 | def morphy_lite(form, pos): 72 | result = {pos: {form}} 73 | if pos in ('n', None) and form.endswith('s'): 74 | result.setdefault('n', set()).add(form[:-1]) 75 | return result 76 | 77 | en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=False) 78 | assert en.words('examples', pos='n') == en.words('example') 79 | assert en.words('examples') == en.words('example') 80 | assert en.words('exemplifying') == [] 81 | assert en.words('data') == [] 82 | 83 | en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=True) 84 | assert en.words('data') == en.words('datum') 85 | assert en.words('exemplifying') == en.words('exemplify') 86 | 87 | 88 | def test_portable_entities_issue_226(monkeypatch, datadir): 89 | # instead use ignore_cleanup_errors=True from Python 3.10 90 | tempdir = tempfile.TemporaryDirectory('wn_issue_226') 91 | with tempdir as dir: 92 | with monkeypatch.context() as m: 93 | m.setattr(wn.config, 
'data_directory', Path(dir)) 94 | wn.add(datadir / 'mini-lmf-1.0.xml') 95 | en = wn.Wordnet('test-en') 96 | info1 = en.synsets('information')[0] 97 | wn.remove('test-en') 98 | wn.add(datadir / 'mini-lmf-1.0.xml') 99 | info2 = en.synsets('information')[0] # en Wordnet object still works 100 | assert info1 == info2 # synsets are equivalent 101 | wn._db.clear_connections() 102 | # Not needed if ignore_cleanup_errors=True and delete=True above 103 | try: 104 | tempdir.cleanup() 105 | except PermissionError: 106 | warnings.warn( 107 | f"Failed to clean up temporary directory {dir!s}", 108 | stacklevel=1, 109 | ) 110 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'wn' 21 | copyright = '2020, Michael Wayne Goodman' 22 | author = 'Michael Wayne Goodman' 23 | 24 | import wn 25 | 26 | # The short X.Y version 27 | version = '.'.join(wn.__version__.split('.')[:2]) 28 | # The full version, including alpha/beta/rc tags 29 | release = wn.__version__ 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.coverage', 40 | # 'sphinx.ext.viewcode', 41 | 'sphinx.ext.githubpages', 42 | 'sphinx.ext.napoleon', 43 | "sphinx_copybutton", 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | # This pattern also affects html_static_path and html_extra_path. 52 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 53 | 54 | # Global definitions 55 | rst_prolog = """ 56 | .. role:: python(code) 57 | :language: python 58 | :class: highlight 59 | """ 60 | 61 | # smartquotes = False 62 | smartquotes_action = 'De' # D = en- and em-dash; e = ellipsis 63 | 64 | # -- Options for HTML output ------------------------------------------------- 65 | 66 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 67 | # a list of builtin themes.# 68 | 69 | html_theme = "furo" 70 | html_theme_options = { 71 | "light_css_variables": { 72 | "color-brand-primary": "#006699", 73 | "color-brand-content": "#006699", 74 | # "color-background": "#f0f0f0", 75 | # "color-sidebar-background": "#ddd", 76 | }, 77 | "dark_css_variables": { 78 | "color-brand-primary": "#00CCFF", 79 | "color-brand-content": "#00CCFF", 80 | } 81 | } 82 | 83 | html_logo = "_static/wn-logo.svg" 84 | 85 | pygments_style = 'manni' 86 | pygments_dark_style = 'monokai' 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 91 | html_static_path = ['_static'] 92 | html_css_files = [ 93 | 'css/svg.css', 94 | ] 95 | 96 | # Don't offer to show the source of the current page 97 | html_show_sourcelink = False 98 | 99 | # -- Options for autodoc extension ------------------------------------------- 100 | 101 | # autodoc_typehints = 'description' 102 | autodoc_typehints = 'signature' 103 | # autodoc_typehints = 'none' 104 | 105 | # -- Options for intersphinx extension --------------------------------------- 106 | 107 | # Example configuration for intersphinx: refer to the Python standard library. 108 | intersphinx_mapping = { 109 | 'python': ('https://docs.python.org/3', None), 110 | 'httpx': ('https://httpx.readthedocs.io/en/latest/', None), 111 | } 112 | 113 | # -- Options for sphinx_copybutton extension --------------------------------- 114 | 115 | copybutton_prompt_text = ( 116 | r">>> " # regular Python prompt 117 | r"|\.\.\. " # Python continuation prompt 118 | r"|\$ " # Basic shell 119 | r"|In \[\d*\]: " # Jupyter notebook 120 | ) 121 | copybutton_prompt_is_regexp = True 122 | -------------------------------------------------------------------------------- /tests/ic_test.py: -------------------------------------------------------------------------------- 1 | 2 | from math import log 3 | 4 | import pytest 5 | 6 | import wn 7 | from wn.constants import (NOUN, VERB, ADJ, ADV) 8 | from wn.util import synset_id_formatter 9 | import wn.ic 10 | 11 | 12 | synset_id = { 13 | 'information': 'test-en-0001-n', 14 | 'illustration_example': 'test-en-0002-n', 15 | 'sample': 'test-en-0004-n', 16 | 'random_sample': 'test-en-0005-n', 17 | 'random_sample2': 'test-en-0008-n', # no hypernyms 18 | 'datum': 'test-en-0006-n', 19 | 'illustrate_exemplify': 'test-en-0003-v', 20 | 'resignate': 'test-en-0007-v', 21 | } 22 | 23 | 24 | words = [ 25 | 'For', 'example', ':', 'random sample', '.', 26 | 'This', 'will', 'illustrate', 'and', 'exemplify', '.', 27 | 'A', 'sample', 'of', 'data', '.', 28 | ] 29 | 30 | 31 | @pytest.mark.usefixtures('mini_db') 32 | def test_compute_nodistribute_nosmoothing(): 33 | w = wn.Wordnet('test-en:1') 34 | assert wn.ic.compute(words, w, distribute_weight=False, smoothing=0) == { 35 | NOUN: { 36 | synset_id['information']: 4.0, 37 | synset_id['illustration_example']: 3.0, 38 | synset_id['sample']: 2.0, 39 | synset_id['random_sample']: 1.0, 40 | synset_id['random_sample2']: 1.0, 41 | synset_id['datum']: 1.0, 42 | None: 5.0, 43 | }, 44 | VERB: { 45 | synset_id['illustrate_exemplify']: 2.0, 46 | synset_id['resignate']: 0.0, 47 | None: 2.0, 48 | }, 49 | ADJ: {None: 0.0}, 50 | ADV: {None: 0.0}, 51 | } 52 | 53 | 54 | @pytest.mark.usefixtures('mini_db') 55 | def test_compute_nodistribute_smoothing(): 56 | w = wn.Wordnet('test-en:1') 
57 | assert wn.ic.compute(words, w, distribute_weight=False, smoothing=1.0) == { 58 | NOUN: { 59 | synset_id['information']: 5.0, 60 | synset_id['illustration_example']: 4.0, 61 | synset_id['sample']: 3.0, 62 | synset_id['random_sample']: 2.0, 63 | synset_id['random_sample2']: 2.0, 64 | synset_id['datum']: 2.0, 65 | None: 6.0, 66 | }, 67 | VERB: { 68 | synset_id['illustrate_exemplify']: 3.0, 69 | synset_id['resignate']: 1.0, 70 | None: 3.0, 71 | }, 72 | ADJ: {None: 1.0}, 73 | ADV: {None: 1.0}, 74 | } 75 | 76 | 77 | @pytest.mark.usefixtures('mini_db') 78 | def test_compute_distribute_smoothing(): 79 | w = wn.Wordnet('test-en:1') 80 | assert wn.ic.compute(words, w, distribute_weight=True, smoothing=1.0) == { 81 | NOUN: { 82 | synset_id['information']: 4.5, 83 | synset_id['illustration_example']: 3.5, 84 | synset_id['sample']: 2.5, 85 | synset_id['random_sample']: 1.5, 86 | synset_id['random_sample2']: 1.5, 87 | synset_id['datum']: 2.0, 88 | None: 5.0, 89 | }, 90 | VERB: { 91 | synset_id['illustrate_exemplify']: 3.0, 92 | synset_id['resignate']: 1.0, 93 | None: 3.0, 94 | }, 95 | ADJ: {None: 1.0}, 96 | ADV: {None: 1.0}, 97 | } 98 | 99 | 100 | @pytest.mark.usefixtures('mini_db') 101 | def test_load(tmp_path): 102 | w = wn.Wordnet('test-en:1') 103 | icpath = tmp_path / 'foo.dat' 104 | icpath.write_text( 105 | 'wnver:1234567890AbCdEf\n' 106 | '1n 4.0 ROOT\n' 107 | '2n 3.0\n' 108 | '4n 2.0\n' 109 | '5n 1.0\n' 110 | '8n 1.0 ROOT\n' 111 | '6n 1.0\n' 112 | '3v 2.0 ROOT\n' 113 | '7v 0.0 ROOT\n' 114 | ) 115 | 116 | get_synset_id = synset_id_formatter('test-en-{offset:04}-{pos}') 117 | assert (wn.ic.load(icpath, w, get_synset_id=get_synset_id) 118 | == wn.ic.compute(words, w, distribute_weight=False, smoothing=0.0)) 119 | 120 | 121 | @pytest.mark.usefixtures('mini_db') 122 | def test_information_content(): 123 | w = wn.Wordnet('test-en:1') 124 | ic = wn.ic.compute(words, w) 125 | info = w.synsets('information')[0] 126 | samp = w.synsets('sample')[0] 127 | # info is a root but not the only one, so its IC is not 0.0 128 | assert wn.ic.information_content(info, ic) == -log( 129 | ic['n'][info.id] 130 | / ic['n'][None] 131 | ) 132 | assert wn.ic.information_content(samp, ic) == -log( 133 | ic['n'][samp.id] 134 | / ic['n'][None] 135 | ) 136 | -------------------------------------------------------------------------------- /tests/web_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from starlette.testclient import TestClient 3 | 4 | import wn 5 | import wn._db 6 | from wn import web 7 | 8 | 9 | # clearing connections on teardown (see conftest.py) isn't enough. For 10 | # this we apparently need to monkeypatch the wn._db.pool as well. 
11 | 12 | @pytest.fixture 13 | def mini_db_web(monkeypatch, mini_db_dir): 14 | with monkeypatch.context() as m: 15 | m.setattr(wn._db, 'pool', {}) 16 | m.setattr(wn.config, 'data_directory', mini_db_dir) 17 | m.setattr(wn.config, 'allow_multithreading', True) 18 | yield 19 | wn._db.clear_connections() 20 | 21 | 22 | client = TestClient(web.app) 23 | 24 | 25 | @pytest.mark.usefixtures('mini_db_web') 26 | def test_root(): 27 | response = client.get('/') 28 | assert response.status_code == 404 29 | 30 | 31 | @pytest.mark.usefixtures('mini_db_web') 32 | def test_lexicons(): 33 | response = client.get("/lexicons") 34 | assert response.status_code == 200 35 | data = response.json()["data"] 36 | assert [lex["id"] for lex in data] == ["test-en:1", "test-es:1"] 37 | 38 | 39 | @pytest.mark.usefixtures('mini_db_web') 40 | def test_words(): 41 | response = client.get("/words") 42 | assert response.status_code == 200 43 | data = response.json()["data"] 44 | word_ids = {word["id"] for word in data} 45 | assert "test-en-information-n" in word_ids 46 | assert "test-es-información-n" in word_ids 47 | 48 | response = client.get("/words", params={"lexicon": "test-en:1"}) 49 | assert response.status_code == 200 50 | data = response.json()["data"] 51 | word_ids = {word["id"] for word in data} 52 | assert "test-en-information-n" in word_ids 53 | assert "test-es-información-n" not in word_ids 54 | 55 | 56 | @pytest.mark.usefixtures('mini_db_web') 57 | def test_senses(): 58 | response = client.get("/senses") 59 | assert response.status_code == 200 60 | data = response.json()["data"] 61 | sense_ids = {sense["id"] for sense in data} 62 | assert "test-en-information-n-0001-01" in sense_ids 63 | assert "test-es-información-n-0001-01" in sense_ids 64 | 65 | response = client.get("/senses", params={"lexicon": "test-en:1"}) 66 | assert response.status_code == 200 67 | data = response.json()["data"] 68 | sense_ids = {sense["id"] for sense in data} 69 | assert "test-en-information-n-0001-01" in sense_ids 70 | assert "test-es-información-n-0001-01" not in sense_ids 71 | 72 | 73 | @pytest.mark.usefixtures('mini_db_web') 74 | def test_synsets(): 75 | response = client.get("/synsets") 76 | assert response.status_code == 200 77 | data = response.json()["data"] 78 | synset_ids = {synset["id"] for synset in data} 79 | assert "test-en-0001-n" in synset_ids 80 | assert "test-es-0001-n" in synset_ids 81 | 82 | response = client.get("/synsets", params={"lexicon": "test-en:1"}) 83 | assert response.status_code == 200 84 | data = response.json()["data"] 85 | synset_ids = {synset["id"] for synset in data} 86 | assert "test-en-0001-n" in synset_ids 87 | assert "test-es-0001-n" not in synset_ids 88 | 89 | 90 | @pytest.mark.usefixtures('mini_db_web') 91 | def test_lexicon_words(): 92 | response1 = client.get("/lexicons/test-en:1/words") 93 | response2 = client.get("/words", params={"lexicon": "test-en:1"}) 94 | assert response1.status_code == 200 95 | assert response2.status_code == 200 96 | data1 = response1.json()["data"] 97 | data2 = response2.json()["data"] 98 | assert {word["id"] for word in data1} == {word["id"] for word in data2} 99 | 100 | 101 | @pytest.mark.usefixtures('mini_db_web') 102 | def test_lexicon_senses(): 103 | response1 = client.get("/lexicons/test-en:1/senses") 104 | response2 = client.get("/senses", params={"lexicon": "test-en:1"}) 105 | assert response1.status_code == 200 106 | assert response2.status_code == 200 107 | data1 = response1.json()["data"] 108 | data2 = response2.json()["data"] 109 | assert {sense["id"] 
for sense in data1} == {sense["id"] for sense in data2} 110 | 111 | 112 | @pytest.mark.usefixtures('mini_db_web') 113 | def test_lexicon_synsets(): 114 | response1 = client.get("/lexicons/test-en:1/synsets") 115 | response2 = client.get("/synsets", params={"lexicon": "test-en:1"}) 116 | assert response1.status_code == 200 117 | assert response2.status_code == 200 118 | data1 = response1.json()["data"] 119 | data2 = response2.json()["data"] 120 | assert {synset["id"] for synset in data1} == {synset["id"] for synset in data2} 121 | -------------------------------------------------------------------------------- /tests/taxonomy_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | from wn.taxonomy import ( 6 | roots, 7 | leaves, 8 | taxonomy_depth, 9 | hypernym_paths, 10 | min_depth, 11 | max_depth, 12 | shortest_path, 13 | # common_hypernyms, 14 | # lowest_common_hypernyms, 15 | ) 16 | 17 | 18 | @pytest.mark.usefixtures('mini_db') 19 | def test_roots(): 20 | en = wn.Wordnet('test-en') 21 | assert set(roots(en, pos='n')) == {en.synset('test-en-0001-n'), 22 | en.synset('test-en-0008-n')} 23 | assert set(roots(en, pos='v')) == {en.synset('test-en-0003-v'), 24 | en.synset('test-en-0007-v')} 25 | assert roots(en, pos='a') == [] 26 | assert set(roots(en)) == set(roots(en, pos='n') + roots(en, pos='v')) 27 | 28 | # with no expand relations and no relation of its own, every 29 | # synset looks like a root 30 | es = wn.Wordnet('test-es') 31 | assert set(roots(es, pos='n')) == {es.synset('test-es-0001-n'), 32 | es.synset('test-es-0002-n'), 33 | es.synset('test-es-0005-n')} 34 | 35 | es = wn.Wordnet('test-es', expand='test-en') 36 | assert roots(es, pos='n') == [es.synset('test-es-0001-n')] 37 | 38 | 39 | @pytest.mark.usefixtures('mini_db') 40 | def test_leaves(): 41 | en = wn.Wordnet('test-en') 42 | assert set(leaves(en, pos='n')) == {en.synset('test-en-0005-n'), 43 | en.synset('test-en-0006-n'), 44 | en.synset('test-en-0008-n')} 45 | assert set(leaves(en, pos='v')) == {en.synset('test-en-0003-v'), 46 | en.synset('test-en-0007-v')} 47 | 48 | 49 | @pytest.mark.usefixtures('mini_db') 50 | def test_taxonomy_depth(): 51 | en = wn.Wordnet('test-en') 52 | assert taxonomy_depth(en, pos='n') == 3 53 | assert taxonomy_depth(en, pos='v') == 0 54 | 55 | 56 | @pytest.mark.usefixtures('mini_db') 57 | def test_hypernym_paths(): 58 | information = wn.synsets('information')[0] 59 | example = wn.synsets('example')[0] 60 | sample = wn.synsets('sample')[0] 61 | random_sample = wn.synsets('random sample')[0] 62 | assert hypernym_paths(information) == [] 63 | assert hypernym_paths(example) == [[information]] 64 | assert hypernym_paths(sample) == [[example, information]] 65 | assert hypernym_paths(random_sample) == [[sample, example, information]] 66 | 67 | 68 | @pytest.mark.usefixtures('mini_db') 69 | def test_interlingual_hypernym_paths(): 70 | información = wn.synsets('información')[0] 71 | ejemplo = wn.synsets('ejemplo')[0] 72 | sample = wn.synsets('sample', lexicon='test-en:1')[0] 73 | inferred = wn.Synset.empty('*INFERRED*', ili=sample.ili.id, _lexicon='test-es:1') 74 | muestra_aleatoria = wn.synsets('muestra aleatoria')[0] 75 | assert hypernym_paths(información) == [] 76 | assert hypernym_paths(ejemplo) == [[información]] 77 | assert hypernym_paths(muestra_aleatoria) == [[inferred, ejemplo, información]] 78 | 79 | 80 | @pytest.mark.usefixtures('mini_db') 81 | def test_shortest_path(): 82 | information = wn.synsets('information')[0] 83 | 
example = wn.synsets('example')[0] 84 | sample = wn.synsets('sample')[0] 85 | random_sample = wn.synsets('random sample')[0] 86 | datum = wn.synsets('datum')[0] 87 | exemplify = wn.synsets('exemplify')[0] 88 | inferred_root = wn.Synset.empty('*ROOT*', _lexicon='test-en:1') 89 | assert shortest_path(information, information) == [] 90 | assert shortest_path(information, datum) == [datum] 91 | assert shortest_path(information, sample) == [example, sample] 92 | assert shortest_path(sample, information) == [example, information] 93 | assert shortest_path(random_sample, datum) == [sample, example, information, datum] 94 | with pytest.raises(wn.Error): 95 | shortest_path(example, exemplify) 96 | assert shortest_path(example, exemplify, simulate_root=True) == [ 97 | information, inferred_root, exemplify 98 | ] 99 | 100 | 101 | @pytest.mark.usefixtures('mini_db') 102 | def test_min_depth(): 103 | assert min_depth(wn.synsets('information')[0]) == 0 104 | assert min_depth(wn.synsets('example')[0]) == 1 105 | assert min_depth(wn.synsets('sample')[0]) == 2 106 | assert min_depth(wn.synsets('random sample')[0]) == 3 107 | 108 | 109 | @pytest.mark.usefixtures('mini_db') 110 | def test_max_depth(): 111 | assert max_depth(wn.synsets('information')[0]) == 0 112 | assert max_depth(wn.synsets('example')[0]) == 1 113 | assert max_depth(wn.synsets('sample')[0]) == 2 114 | assert max_depth(wn.synsets('random sample')[0]) == 3 115 | -------------------------------------------------------------------------------- /docs/guides/nltk-migration.rst: -------------------------------------------------------------------------------- 1 | Migrating from the NLTK 2 | ======================= 3 | 4 | This guide is for users of the `NLTK `_\ 's 5 | ``nltk.corpus.wordnet`` module who are migrating to Wn. It is not 6 | guaranteed that Wn will produce the same results as the NLTK's module, 7 | but with some care its behavior can be very similar. 8 | 9 | Overview 10 | -------- 11 | 12 | One important thing to note is that Wn will search all wordnets in the 13 | database by default where the NLTK would only search the English. 14 | 15 | >>> from nltk.corpus import wordnet as nltk_wn 16 | >>> nltk_wn.synsets('chat') # only English 17 | >>> nltk_wn.synsets('chat', lang='fra') # only French 18 | >>> import wn 19 | >>> wn.synsets('chat') # all wordnets 20 | >>> wn.synsets('chat', lang='fr') # only French 21 | 22 | With Wn it helps to create a :class:`wn.Wordnet` object to pre-filter 23 | the results by language or lexicon. 24 | 25 | >>> en = wn.Wordnet('omw-en:1.4') 26 | >>> en.synsets('chat') # only the OMW English Wordnet 27 | 28 | Equivalent Operations 29 | --------------------- 30 | 31 | The following table lists equivalent API calls for the NLTK's wordnet 32 | module and Wn assuming the respective modules have been instantiated 33 | (in separate Python sessions) as follows: 34 | 35 | NLTK: 36 | 37 | >>> from nltk.corpus import wordnet as wn 38 | >>> ss = wn.synsets("chat", pos="v")[0] 39 | 40 | Wn: 41 | 42 | >>> import wn 43 | >>> en = wn.Wordnet('omw-en:1.4') 44 | >>> ss = en.synsets("chat", pos="v")[0] 45 | 46 | .. 
default-role:: python 47 | 48 | Primary Queries 49 | ''''''''''''''' 50 | 51 | ========================================= =============================================== 52 | NLTK Wn 53 | ========================================= =============================================== 54 | `wn.langs()` `[lex.language for lex in wn.lexicons()]` 55 | `wn.lemmas("chat")` -- 56 | -- `en.words("chat")` 57 | -- `en.senses("chat")` 58 | `wn.synsets("chat")` `en.synsets("chat")` 59 | `wn.synsets("chat", pos="v")` `en.synsets("chat", pos="v")` 60 | `wn.all_synsets()` `en.synsets()` 61 | `wn.all_synsets(pos="v")` `en.synsets(pos="v")` 62 | ========================================= =============================================== 63 | 64 | Synsets -- Basic 65 | '''''''''''''''' 66 | 67 | =================== ================= 68 | NLTK Wn 69 | =================== ================= 70 | `ss.lemmas()` -- 71 | -- `ss.senses()` 72 | -- `ss.words()` 73 | `ss.lemmas_names()` `ss.lemmas()` 74 | `ss.definition()` `ss.definition()` 75 | `ss.examples()` `ss.examples()` 76 | `ss.pos()` `ss.pos` 77 | =================== ================= 78 | 79 | Synsets -- Relations 80 | '''''''''''''''''''' 81 | 82 | ========================================== ===================================== 83 | NLTK Wn 84 | ========================================== ===================================== 85 | `ss.hypernyms()` `ss.get_related("hypernym")` 86 | `ss.instance_hypernyms()` `ss.get_related("instance_hypernym")` 87 | `ss.hypernyms() + ss.instance_hypernyms()` `ss.hypernyms()` 88 | `ss.hyponyms()` `ss.get_related("hyponym")` 89 | `ss.member_holonyms()` `ss.get_related("holo_member")` 90 | `ss.member_meronyms()` `ss.get_related("mero_member")` 91 | `ss.closure(lambda x: x.hypernyms())` `ss.closure("hypernym")` 92 | ========================================== ===================================== 93 | 94 | Synsets -- Taxonomic Structure 95 | '''''''''''''''''''''''''''''' 96 | 97 | ================================ ========================================================= 98 | NLTK Wn 99 | ================================ ========================================================= 100 | `ss.min_depth()` `ss.min_depth()` 101 | `ss.max_depth()` `ss.max_depth()` 102 | `ss.hypernym_paths()` `[list(reversed([ss] + p)) for p in ss.hypernym_paths()]` 103 | `ss.common_hypernyms(ss)` `ss.common_hypernyms(ss)` 104 | `ss.lowest_common_hypernyms(ss)` `ss.lowest_common_hypernyms(ss)` 105 | `ss.shortest_path_distance(ss)` `len(ss.shortest_path(ss))` 106 | ================================ ========================================================= 107 | 108 | .. reset default role 109 | .. default-role:: 110 | 111 | (these tables are incomplete) 112 | -------------------------------------------------------------------------------- /wn/_db.py: -------------------------------------------------------------------------------- 1 | """ 2 | Storage back-end interface. 3 | """ 4 | 5 | from importlib import resources 6 | from pathlib import Path 7 | import json 8 | import sqlite3 9 | import logging 10 | 11 | import wn 12 | from wn._types import AnyPath 13 | from wn._util import short_hash, format_lexicon_specifier 14 | 15 | 16 | logger = logging.getLogger('wn') 17 | 18 | 19 | # Module Constants 20 | 21 | DEBUG = False 22 | 23 | # This stores hashes of the schema to check for version differences. 24 | # When the schema changes, the hash will change. If the new hash is 25 | # not added here, the 'test_schema_compatibility' test will fail. 
It 26 | # is the developer's responsibility to only add compatible schema 27 | # hashes here. If the schema change is not backwards-compatible, then 28 | # clear all old hashes and only put the latest hash here. A hash can 29 | # be generated like this: 30 | # 31 | # >>> import sqlite3 32 | # >>> import wn 33 | # >>> conn = sqlite3.connect(wn.config.database_path) 34 | # >>> wn._db.schema_hash(conn) 35 | # 36 | COMPATIBLE_SCHEMA_HASHES = { 37 | '4c8ad03af5422d6979039ee2b80838d07c12d2c8', # Original schema 38 | '01909cb2d0cdee19ed687dbd95c5983d7b68f807', # Added form_lexicon_index 39 | '4c2728bb7999685d9748ad6245638a210d0f099d', # Added form_lexicon_form_covering_index 40 | 'c1ef1e74d47810fd313383cdb8ecb9a2d9aef7db', # Migrated database with covering index 41 | } 42 | 43 | 44 | # Optional metadata is stored as a JSON string 45 | 46 | def _adapt_dict(d: dict) -> bytes: 47 | return json.dumps(d).encode('utf-8') 48 | 49 | 50 | def _convert_dict(s: bytes) -> dict: 51 | return json.loads(s) 52 | 53 | 54 | def _convert_boolean(s: bytes) -> bool: 55 | return bool(int(s)) 56 | 57 | 58 | sqlite3.register_adapter(dict, _adapt_dict) 59 | sqlite3.register_converter('meta', _convert_dict) 60 | sqlite3.register_converter('boolean', _convert_boolean) 61 | 62 | 63 | # The pool is a cache of open connections. Unless the database path is 64 | # changed, there should only be zero or one. 65 | pool: dict[AnyPath, sqlite3.Connection] = {} 66 | 67 | 68 | # The connect() function should be used for all connections 69 | 70 | def connect() -> sqlite3.Connection: 71 | dbpath = wn.config.database_path 72 | if dbpath not in pool: 73 | if not wn.config.data_directory.exists(): 74 | wn.config.data_directory.mkdir(parents=True, exist_ok=True) 75 | initialized = dbpath.is_file() 76 | conn = sqlite3.connect( 77 | str(dbpath), 78 | detect_types=sqlite3.PARSE_DECLTYPES, 79 | check_same_thread=not wn.config.allow_multithreading, 80 | ) 81 | # foreign key support needs to be enabled for each connection 82 | conn.execute('PRAGMA foreign_keys = ON') 83 | if DEBUG: 84 | conn.set_trace_callback(print) 85 | if not initialized: 86 | logger.info('initializing database: %s', dbpath) 87 | _init_db(conn) 88 | _check_schema_compatibility(conn, dbpath) 89 | 90 | pool[dbpath] = conn 91 | return pool[dbpath] 92 | 93 | 94 | def _init_db(conn: sqlite3.Connection) -> None: 95 | schema = (resources.files('wn') / 'schema.sql').read_text() 96 | conn.executescript(schema) 97 | with conn: 98 | conn.executemany('INSERT INTO ili_statuses VALUES (null,?)', 99 | [('presupposed',), ('proposed',)]) 100 | 101 | 102 | def _check_schema_compatibility(conn: sqlite3.Connection, dbpath: Path) -> None: 103 | hash = schema_hash(conn) 104 | 105 | # if the hash is known, then we're all good here 106 | if hash in COMPATIBLE_SCHEMA_HASHES: 107 | return 108 | 109 | logger.debug('current schema hash:\n %s', hash) 110 | logger.debug('compatible schema hashes:\n %s', 111 | '\n '.join(COMPATIBLE_SCHEMA_HASHES)) 112 | # otherwise, try to raise a helpful error message 113 | msg = ("Wn's schema has changed and is no longer compatible with the " 114 | f"database. 
Please move or delete {dbpath} and rebuild it.") 115 | try: 116 | specs = conn.execute('SELECT id, version FROM lexicons').fetchall() 117 | except sqlite3.OperationalError as exc: 118 | raise wn.DatabaseError(msg) from exc 119 | else: 120 | if specs: 121 | installed = '\n '.join( 122 | format_lexicon_specifier(id, ver) 123 | for id, ver in specs 124 | ) 125 | msg += f" Lexicons currently installed:\n {installed}" 126 | else: 127 | msg += ' No lexicons are currently installed.' 128 | raise wn.DatabaseError(msg) 129 | 130 | 131 | def schema_hash(conn: sqlite3.Connection) -> str: 132 | query = 'SELECT sql FROM sqlite_master WHERE NOT sql ISNULL' 133 | schema = '\n\n'.join(row[0] for row in conn.execute(query)) 134 | return short_hash(schema) 135 | 136 | 137 | def clear_connections() -> None: 138 | """Close and delete any open database connections.""" 139 | for path in list(pool): 140 | pool[path].close() 141 | del pool[path] 142 | -------------------------------------------------------------------------------- /bench/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from collections.abc import Iterator 3 | from itertools import product, cycle 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | import wn 9 | from wn import lmf 10 | 11 | 12 | @pytest.fixture 13 | def clean_db(): 14 | 15 | def clean_db(): 16 | wn.remove("*") 17 | dummy_lex = lmf.Lexicon( 18 | id="dummy", 19 | version="1", 20 | label="placeholder to initialize the db", 21 | language="zxx", 22 | email="", 23 | license="", 24 | ) 25 | wn.add_lexical_resource( 26 | lmf.LexicalResource(lmf_version="1.3", lexicons=[dummy_lex]) 27 | ) 28 | 29 | return clean_db 30 | 31 | 32 | @pytest.fixture(scope="session") 33 | def datadir(): 34 | return Path(__file__).parent.parent / "tests" / "data" 35 | 36 | 37 | @pytest.fixture 38 | def empty_db(clean_db): 39 | with tempfile.TemporaryDirectory('wn_data_empty') as dir: 40 | with pytest.MonkeyPatch.context() as m: 41 | m.setattr(wn.config, 'data_directory', dir) 42 | clean_db() 43 | yield 44 | 45 | 46 | @pytest.fixture(scope="session") 47 | def mock_lmf(): 48 | synsets: list[lmf.Synset] = [ 49 | * _make_synsets("n", 20000), 50 | * _make_synsets("v", 10000), 51 | * _make_synsets("a", 2000), 52 | * _make_synsets("r", 1000), 53 | ] 54 | entries = _make_entries(synsets) 55 | lexicon = lmf.Lexicon( 56 | id="mock", 57 | version="1", 58 | label="", 59 | language="zxx", 60 | email="", 61 | license="", 62 | entries=entries, 63 | synsets=synsets, 64 | ) 65 | return lmf.LexicalResource(lmf_version="1.3", lexicons=[lexicon]) 66 | 67 | 68 | @pytest.fixture(scope="session") 69 | def mock_db_dir(mock_lmf): 70 | with tempfile.TemporaryDirectory("wn_data_empty") as dir: 71 | with pytest.MonkeyPatch.context() as m: 72 | m.setattr(wn.config, 'data_directory', dir) 73 | wn.add_lexical_resource(mock_lmf, progress_handler=None) 74 | wn._db.clear_connections() 75 | 76 | yield Path(dir) 77 | 78 | 79 | @pytest.fixture 80 | def mock_db(monkeypatch, mock_db_dir): 81 | with monkeypatch.context() as m: 82 | m.setattr(wn.config, "data_directory", mock_db_dir) 83 | yield 84 | wn._db.clear_connections() 85 | 86 | 87 | def _make_synsets(pos: str, n: int) -> list[lmf.Synset]: 88 | synsets: list[lmf.Synset] = [ 89 | lmf.Synset( 90 | id=f"{i}-{pos}", 91 | ili="", 92 | partOfSpeech=pos, 93 | relations=[], 94 | meta={}, 95 | ) 96 | for i in range(1, n+1) 97 | ] 98 | # add relations for nouns and verbs 99 | if pos in "nv": 100 | total = len(synsets) 101 | tgt_i = 
1 # index of next target synset 102 | n = cycle([2]) # how many targets to relate 103 | for cur_i in range(total): 104 | if tgt_i <= cur_i: 105 | tgt_i = cur_i + 1 106 | source = synsets[cur_i] 107 | for cur_k in range(tgt_i, tgt_i + next(n)): 108 | if cur_k >= total: 109 | break 110 | target = synsets[cur_k] 111 | source["relations"].append( 112 | lmf.Relation(target=target["id"], relType="hyponym", meta={}) 113 | ) 114 | target["relations"].append( 115 | lmf.Relation(target=source["id"], relType="hypernym", meta={}) 116 | ) 117 | tgt_i = cur_k + 1 118 | 119 | return synsets 120 | 121 | 122 | def _words() -> Iterator[str]: 123 | consonants = "kgtdpbfvszrlmnhw" 124 | vowels = "aeiou" 125 | while True: 126 | yield from map("".join, product(consonants, vowels, consonants, vowels)) 127 | 128 | 129 | def _make_entries(synsets: list[lmf.Synset]) -> list[lmf.LexicalEntry]: 130 | words = _words() 131 | member_count = cycle(range(1, 4)) # 1, 2, or 3 synset members 132 | entries: dict[str, lmf.LexicalEntry] = {} 133 | prev_synsets: list[lmf.Synset] = [] 134 | for synset in synsets: 135 | ssid = synset["id"] 136 | pos = synset["partOfSpeech"] 137 | 138 | for _ in range(next(member_count)): 139 | word = next(words) 140 | senses = [lmf.Sense(id=f"{word}-{ssid}", synset=ssid, meta={})] 141 | # add some polysemy 142 | if prev_synsets: 143 | ssid2 = prev_synsets.pop()["id"] 144 | senses.append(lmf.Sense(id=f"{word}-{ssid2}", synset=ssid2, meta={})) 145 | eid = f"{word}-{pos}" 146 | if eid not in entries: 147 | entries[eid] = lmf.LexicalEntry( 148 | id=eid, 149 | lemma=lmf.Lemma( 150 | writtenForm=word, 151 | partOfSpeech=pos, 152 | ), 153 | senses=[], 154 | meta={}, 155 | ) 156 | entries[eid]["senses"].extend(senses) 157 | 158 | prev_synsets.append(synset) 159 | 160 | return list(entries.values()) 161 | -------------------------------------------------------------------------------- /wn/_download.py: -------------------------------------------------------------------------------- 1 | 2 | from collections.abc import Sequence 3 | from typing import Optional 4 | from pathlib import Path 5 | import logging 6 | 7 | import httpx 8 | 9 | import wn 10 | from wn._util import is_url 11 | from wn.util import ProgressHandler, ProgressBar 12 | from wn._add import add as add_to_db 13 | from wn import config 14 | 15 | 16 | CHUNK_SIZE = 8 * 1024 # how many KB to read at a time 17 | TIMEOUT = 10 # number of seconds to wait for a server response 18 | 19 | 20 | logger = logging.getLogger('wn') 21 | 22 | 23 | def download( 24 | project_or_url: str, 25 | add: bool = True, 26 | progress_handler: Optional[type[ProgressHandler]] = ProgressBar, 27 | ) -> Path: 28 | """Download the resource specified by *project_or_url*. 29 | 30 | First the URL of the resource is determined and then, depending on 31 | the parameters, the resource is downloaded and added to the 32 | database. The function then returns the path of the cached file. 33 | 34 | If *project_or_url* starts with `'http://'` or `'https://'`, then 35 | it is taken to be the URL for the resource. Otherwise, 36 | *project_or_url* is taken as a :ref:`project specifier 37 | ` and the URL is taken from a matching entry 38 | in Wn's project index. If no project matches the specifier, 39 | :exc:`wn.Error` is raised. 40 | 41 | If the URL has been downloaded and cached before, the cached file 42 | is used. Otherwise the URL is retrieved and stored in the cache. 43 | 44 | If the *add* paramter is ``True`` (default), the downloaded 45 | resource is added to the database. 
46 | 47 | >>> wn.download('ewn:2020') 48 | Added ewn:2020 (English WordNet) 49 | 50 | The *progress_handler* parameter takes a subclass of 51 | :class:`wn.util.ProgressHandler`. An instance of the class will be 52 | created, used, and closed by this function. 53 | 54 | """ 55 | if progress_handler is None: 56 | progress_handler = ProgressHandler 57 | progress = progress_handler(message='Download', unit=' bytes') 58 | 59 | cache_path, urls = _get_cache_path_and_urls(project_or_url) 60 | 61 | try: 62 | if cache_path and cache_path.exists(): 63 | progress.flash(f'Cached file found: {cache_path!s}') 64 | path = cache_path 65 | elif urls: 66 | path = _download(urls, progress) 67 | else: 68 | raise wn.Error('no urls to download') 69 | finally: 70 | progress.close() 71 | 72 | if add: 73 | try: 74 | add_to_db(path, progress_handler=progress_handler) 75 | except wn.Error as exc: 76 | raise wn.Error( 77 | f'could not add downloaded file: {path}\n You might try ' 78 | 'deleting the cached file and trying the download again.' 79 | ) from exc 80 | 81 | return path 82 | 83 | 84 | def _get_cache_path_and_urls(project_or_url: str) -> tuple[Optional[Path], list[str]]: 85 | if is_url(project_or_url): 86 | return config.get_cache_path(project_or_url), [project_or_url] 87 | else: 88 | info = config.get_project_info(project_or_url) 89 | return info.get('cache'), info['resource_urls'] 90 | 91 | 92 | def _download(urls: Sequence[str], progress: ProgressHandler) -> Path: 93 | client = httpx.Client(timeout=TIMEOUT, follow_redirects=True) 94 | try: 95 | for i, url in enumerate(urls, 1): 96 | path = config.get_cache_path(url) 97 | logger.info('download url: %s', url) 98 | logger.info('download cache path: %s', path) 99 | try: 100 | with open(path, 'wb') as f: 101 | progress.set(status='Requesting', count=0) 102 | with client.stream("GET", url) as response: 103 | response.raise_for_status() 104 | total = int(response.headers.get('Content-Length', 0)) 105 | count = response.num_bytes_downloaded 106 | progress.set(count=count, total=total, status='Receiving') 107 | for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE): 108 | if chunk: 109 | f.write(chunk) 110 | progress.update(response.num_bytes_downloaded - count) 111 | count = response.num_bytes_downloaded 112 | progress.set(status='Complete') 113 | except httpx.RequestError as exc: 114 | path.unlink(missing_ok=True) 115 | last_count = progress.kwargs['count'] 116 | if i == len(urls): 117 | raise wn.Error(f'download failed at {last_count} bytes') from exc 118 | else: 119 | logger.info( 120 | 'download failed at %d bytes; trying next url', last_count 121 | ) 122 | else: 123 | break # success 124 | 125 | except KeyboardInterrupt as exc: 126 | path.unlink(missing_ok=True) 127 | last_count = progress.kwargs['count'] 128 | raise wn.Error(f'download cancelled at {last_count} bytes') from exc 129 | except Exception: 130 | path.unlink(missing_ok=True) 131 | raise 132 | finally: 133 | client.close() 134 | 135 | return path 136 | -------------------------------------------------------------------------------- /wn/morphy.py: -------------------------------------------------------------------------------- 1 | 2 | """A simple English lemmatizer that finds and removes known suffixes. 
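A module-level instance, ``morphy``, is created without a :class:`wn.Wordnet` object, so its results are not checked against any lexicon; see the :class:`Morphy` class below for details and examples.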
3 | 4 | """ 5 | 6 | from typing import Optional 7 | from enum import Flag, auto 8 | 9 | import wn 10 | from wn._types import LemmatizeResult 11 | from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV, PARTS_OF_SPEECH 12 | 13 | POSExceptionMap = dict[str, set[str]] 14 | ExceptionMap = dict[str, POSExceptionMap] 15 | 16 | 17 | class _System(Flag): 18 | """Flags to track suffix rules in various implementations of Morphy.""" 19 | PWN = auto() 20 | NLTK = auto() 21 | WN = auto() 22 | ALL = PWN | NLTK | WN 23 | 24 | 25 | _PWN = _System.PWN 26 | _NLTK = _System.NLTK 27 | _WN = _System.WN 28 | _ALL = _System.ALL 29 | 30 | 31 | Rule = tuple[str, str, _System] 32 | 33 | DETACHMENT_RULES: dict[str, list[Rule]] = { 34 | NOUN: [ 35 | ("s", "", _ALL), 36 | ("ces", "x", _WN), 37 | ("ses", "s", _ALL), 38 | ("ves", "f", _NLTK | _WN), 39 | ("ives", "ife", _WN), 40 | ("xes", "x", _ALL), 41 | ("xes", "xis", _WN), 42 | ("zes", "z", _ALL), 43 | ("ches", "ch", _ALL), 44 | ("shes", "sh", _ALL), 45 | ("men", "man", _ALL), 46 | ("ies", "y", _ALL), 47 | ], 48 | VERB: [ 49 | ("s", "", _ALL), 50 | ("ies", "y", _ALL), 51 | ("es", "e", _ALL), 52 | ("es", "", _ALL), 53 | ("ed", "e", _ALL), 54 | ("ed", "", _ALL), 55 | ("ing", "e", _ALL), 56 | ("ing", "", _ALL), 57 | ], 58 | ADJ: [ 59 | ("er", "", _ALL), 60 | ("est", "", _ALL), 61 | ("er", "e", _ALL), 62 | ("est", "e", _ALL), 63 | ], 64 | ADV: [], 65 | } 66 | DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ] 67 | 68 | 69 | class Morphy: 70 | """The Morphy lemmatizer class. 71 | 72 | Objects of this class are callables that take a wordform and an 73 | optional part of speech and return a dictionary mapping parts of 74 | speech to lemmas. If objects of this class are not created with a 75 | :class:`wn.Wordnet` object, the returned lemmas may be invalid. 
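This is because the detachment rules are then applied blindly, without checking the resulting forms against the lemmas in a lexicon.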
76 | 77 | Arguments: 78 | wordnet: optional :class:`wn.Wordnet` instance 79 | 80 | Example: 81 | 82 | >>> import wn 83 | >>> from wn.morphy import Morphy 84 | >>> ewn = wn.Wordnet('ewn:2020') 85 | >>> m = Morphy(ewn) 86 | >>> m('axes', pos='n') 87 | {'n': {'axe', 'ax', 'axis'}} 88 | >>> m('geese', pos='n') 89 | {'n': {'goose'}} 90 | >>> m('gooses') 91 | {'n': {'goose'}, 'v': {'goose'}} 92 | >>> m('goosing') 93 | {'v': {'goose'}} 94 | 95 | """ 96 | 97 | def __init__(self, wordnet: Optional[wn.Wordnet] = None): 98 | self._rules = { 99 | pos: [rule for rule in rules if rule[2] & _System.WN] 100 | for pos, rules in DETACHMENT_RULES.items() 101 | } 102 | exceptions: ExceptionMap = {pos: {} for pos in PARTS_OF_SPEECH} 103 | all_lemmas: dict[str, set[str]] = {pos: set() for pos in PARTS_OF_SPEECH} 104 | if wordnet: 105 | for word in wordnet.words(): 106 | pos = word.pos 107 | pos_exc = exceptions[pos] 108 | lemma, *others = word.forms() 109 | # store every lemma whether it has other forms or not 110 | all_lemmas[pos].add(lemma) 111 | # those with other forms map to the original lemmas 112 | for other in others: 113 | if other in pos_exc: 114 | pos_exc[other].add(lemma) 115 | else: 116 | pos_exc[other] = {lemma} 117 | self._initialized = True 118 | else: 119 | self._initialized = False 120 | self._exceptions = exceptions 121 | self._all_lemmas = all_lemmas 122 | 123 | def __call__(self, form: str, pos: Optional[str] = None) -> LemmatizeResult: 124 | result = {} 125 | if not self._initialized: 126 | result[pos] = {form} # always include original when not initialized 127 | 128 | if pos is None: 129 | pos_list = list(DETACHMENT_RULES) 130 | elif pos in DETACHMENT_RULES: 131 | pos_list = [pos] 132 | else: 133 | pos_list = [] # not handled by morphy 134 | 135 | no_pos_forms = result.get(None, set()) # avoid unnecessary duplicates 136 | for _pos in pos_list: 137 | candidates = self._morphstr(form, _pos) - no_pos_forms 138 | if candidates: 139 | result.setdefault(_pos, set()).update(candidates) 140 | 141 | return result 142 | 143 | def _morphstr(self, form: str, pos: str) -> set[str]: 144 | candidates: set[str] = set() 145 | 146 | initialized = self._initialized 147 | if initialized: 148 | all_lemmas = self._all_lemmas[pos] 149 | if form in all_lemmas: 150 | candidates.add(form) 151 | candidates.update(self._exceptions[pos].get(form, set())) 152 | else: 153 | all_lemmas = set() 154 | 155 | for suffix, repl, _ in self._rules[pos]: 156 | # avoid applying rules that perform full suppletion 157 | if form.endswith(suffix) and len(suffix) < len(form): 158 | candidate = f'{form[:-len(suffix)]}{repl}' 159 | if not initialized or candidate in all_lemmas: 160 | candidates.add(candidate) 161 | 162 | return candidates 163 | 164 | 165 | morphy = Morphy() 166 | -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | FAQ 2 | === 3 | 4 | Is Wn related to the NLTK's `nltk.corpus.wordnet` module? 5 | --------------------------------------------------------- 6 | 7 | Only in spirit. There was an effort to develop the `NLTK`_\ 's module as a 8 | standalone package (see https://github.com/nltk/wordnet/), but 9 | development had slowed. Wn has the same broad goals and a similar API 10 | as that standalone package, but fundamental architectural differences 11 | demanded a complete rewrite, so Wn was created as a separate 12 | project. 
With approval from the other package's maintainer, Wn 13 | acquired the `wn `_ project on PyPI and 14 | can be seen as its successor. 15 | 16 | Is Wn compatible with the NLTK's module? 17 | ---------------------------------------- 18 | 19 | The API is intentionally similar, but not exactly the same (for 20 | instance see the next question), and there are differences in the ways 21 | that results are retrieved, particularly for non-English wordnets. See 22 | :doc:`guides/nltk-migration` for more information. Also see 23 | :ref:`princeton-wordnet`. 24 | 25 | Where are the ``Lemma`` objects? What are ``Word`` and ``Sense`` objects? 26 | ------------------------------------------------------------------------- 27 | 28 | Unlike the original `WNDB`_ data format of the original WordNet, the 29 | `WN-LMF`_ XML format grants words (called *lexical entries* in WN-LMF 30 | and a :class:`~wn.Word` object in Wn) and word senses 31 | (:class:`~wn.Sense` in Wn) explicit, first-class status alongside 32 | synsets. While senses are essentially links between words and 33 | synsets, they may contain metadata and be the source or target of 34 | sense relations, so in some ways they are more like nodes than edges 35 | when the wordnet is viewed as a graph. The `NLTK`_\ 's module, using 36 | the WNDB format, combines the information of a word and a sense into a 37 | single object called a ``Lemmas``. Wn also has an unrelated concept 38 | called a :meth:`~wn.Word.lemma`, but it is merely the canonical form 39 | of a word. 40 | 41 | .. _princeton-wordnet: 42 | 43 | Where is the Princeton WordNet data? 44 | ------------------------------------ 45 | 46 | The original English wordnet, named simply *WordNet* but often 47 | referred to as the *Princeton WordNet* to better distinguish it from 48 | other projects, is specifically the data distributed by Princeton in 49 | the `WNDB`_ format. The `Open Multilingual Wordnet `_ (OMW) 50 | packages an export of the WordNet data as the *OMW English Wordnet 51 | based on WordNet 3.0* which is used by Wn (with the lexicon ID 52 | ``omw-en``). It also has a similar export for WordNet 3.1 data 53 | (``omw-en31``). Both of these are highly compatible with the original 54 | data and can be used as drop-in replacements. 55 | 56 | Prior to Wn version 0.9 (and, correspondingly, prior to the `OMW 57 | data`_ version 1.4), the ``pwn:3.0`` and ``pwn:3.1`` English wordnets 58 | distributed by OMW were incorrectly called the *Princeton WordNet* 59 | (for WordNet 3.0 and 3.1, respectively). From Wn version 0.9 (and from 60 | version 1.4 of the OMW data), these are called the *OMW English 61 | Wordnet based on WordNet 3.0/3.1* (``omw-en:1.4`` and 62 | ``omw-en31:1.4``, respectively). These lexicons are intentionally 63 | compatible with the original WordNet data, and the 1.4 versions are 64 | even more compatible than the previous ``pwn:3.0`` and ``pwn:3.1`` 65 | lexicons, so it is strongly recommended to use them over the previous 66 | versions. 67 | 68 | .. _OMW data: https://github.com/omwn/omw-data 69 | 70 | Why don't all wordnets share the same synsets? 71 | ---------------------------------------------- 72 | 73 | The `Open Multilingual Wordnet `_ (OMW) contains wordnets for 74 | many languages created using the *expand* methodology [VOSSEN1998]_, 75 | where non-English wordnets provide words on top of the English 76 | wordnet's synset structure. 
This allows new wordnets to be built in 77 | much less time than starting from scratch, but with a few drawbacks, 78 | such as that words cannot be added if they do not have a synset in the 79 | English wordnet, and that it is difficult to version the wordnets 80 | independently (e.g., for reproducibility of experiments involving 81 | wordnet data) as all are interconnected. Wn, therefore, creates new 82 | synsets for each wordnet added to its database, and synsets then 83 | specify which resource they belong to. Queries can specify which 84 | resources may be examined. Also see :doc:`guides/interlingual`. 85 | 86 | Why does Wn's database get so big? 87 | ---------------------------------- 88 | 89 | The *OMW English Wordnet based on WordNet 3.0* takes about 114 MiB of 90 | disk space in Wn's database, which is only about 8 MiB more than it 91 | takes as a `WN-LMF`_ XML file. The `NLTK`_, however, uses the obsolete 92 | `WNDB`_ format which is more compact, requiring only 35 MiB of disk 93 | space. The difference with the Open Multilingual Wordnet 1.4 is more 94 | striking: it takes about 659 MiB of disk space in the database, but 95 | only 49 MiB in the NLTK. Part of the difference here is that the OMW 96 | files in the NLTK are simple tab-separated-value files listing only 97 | the words added to each synset for each language. In addition, Wn 98 | creates new synsets for each wordnet added (see the previous 99 | question). One more reason is that Wn creates various indexes in the 100 | database for efficient lookup. 101 | 102 | .. _NLTK: https://www.nltk.org/ 103 | .. _OMW: http://github.com/omwn 104 | .. [VOSSEN1998] Piek Vossen. 1998. *Introduction to EuroWordNet.* Computers and the Humanities, 32(2): 73--89. 105 | .. _Open English Wordnet 2021: https://en-word.net/ 106 | .. _WNDB: https://wordnet.princeton.edu/documentation/wndb5wn 107 | .. 
_WN-LMF: https://globalwordnet.github.io/schemas/ 108 | -------------------------------------------------------------------------------- /wn/__main__.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import argparse 4 | from pathlib import Path 5 | import json 6 | import logging 7 | 8 | import wn 9 | from wn.project import iterpackages 10 | from wn import lmf 11 | from wn.validate import validate 12 | from wn._util import format_lexicon_specifier 13 | 14 | 15 | def _download(args): 16 | if args.index: 17 | wn.config.load_index(args.index) 18 | for target in args.target: 19 | wn.download(target, add=args.add) 20 | 21 | 22 | def _lexicons(args): 23 | for lex in wn.lexicons(lang=args.lang, lexicon=args.lexicon): 24 | print('\t'.join((lex.id, lex.version, f'[{lex.language}]', lex.label))) 25 | 26 | 27 | def _projects(args): 28 | for info in wn.projects(): 29 | key = 'i' 30 | key += 'c' if info['cache'] else '-' 31 | # key += 'a' if False else '-' # TODO: check if project is added to db 32 | print( 33 | '\t'.join(( 34 | key, 35 | info['id'], 36 | info['version'], 37 | f"[{info['language'] or '---'}]", 38 | info['label'] or '---', 39 | )) 40 | ) 41 | 42 | 43 | def _validate(args): 44 | all_valid = True 45 | selectseq = [check.strip() for check in args.select.split(',')] 46 | for package in iterpackages(args.FILE): 47 | resource = lmf.load(package.resource_file()) 48 | for lexicon in resource['lexicons']: 49 | spec = format_lexicon_specifier(lexicon["id"], lexicon["version"]) 50 | print(f'{spec:<20}', end='') 51 | report = validate(lexicon, select=selectseq) 52 | if not any(check.get('items', []) for check in report.values()): 53 | print('passed') 54 | else: 55 | print('failed') 56 | all_valid = False 57 | # clean up report 58 | for code in list(report): 59 | if not report[code].get('items'): 60 | del report[code] 61 | if args.output_file: 62 | with open(args.output_file, 'w') as outfile: 63 | json.dump(report, outfile, indent=2) 64 | else: 65 | for _code, check in report.items(): 66 | if not check['items']: 67 | continue 68 | print(f' {check["message"]}') 69 | for id, context in check['items'].items(): 70 | print(f' {id}: {context}' if context else f' {id}') 71 | 72 | sys.exit(0 if all_valid else 1) 73 | 74 | 75 | def _path_type(arg): 76 | return Path(arg) 77 | 78 | 79 | def _file_path_type(arg): 80 | path = Path(arg) 81 | if not path.is_file(): 82 | raise argparse.ArgumentTypeError(f'cannot file file: {arg}') 83 | return path 84 | 85 | 86 | parser = argparse.ArgumentParser( 87 | prog='python3 -m wn', 88 | description="Manage Wn's wordnet data from the command line.", 89 | ) 90 | parser.add_argument( 91 | '-V', '--version', action='version', version=f'Wn {wn.__version__}' 92 | ) 93 | parser.add_argument( 94 | '-v', '--verbose', action='count', dest='verbosity', default=0, 95 | help='increase verbosity (can repeat: -vv, -vvv)' 96 | ) 97 | parser.add_argument( 98 | '-d', '--dir', 99 | type=_path_type, 100 | help="data directory for Wn's database and cache", 101 | ) 102 | parser.set_defaults(func=lambda _: parser.print_help()) 103 | sub_parsers = parser.add_subparsers(title='subcommands') 104 | 105 | 106 | parser_download = sub_parsers.add_parser( 107 | 'download', 108 | description="Download wordnets and add them to Wn's database.", 109 | help='download wordnets', 110 | ) 111 | parser_download.add_argument( 112 | 'target', nargs='+', help='project specifiers or URLs' 113 | ) 114 | parser_download.add_argument( 115 | '--index', 
type=_file_path_type, help='project index to use for downloading' 116 | ) 117 | parser_download.add_argument( 118 | '--no-add', action='store_false', dest='add', 119 | help='download and cache without adding to the database' 120 | ) 121 | parser_download.set_defaults(func=_download) 122 | 123 | 124 | parser_lexicons = sub_parsers.add_parser( 125 | 'lexicons', 126 | description="Display a list of installed lexicons.", 127 | help='list installed lexicons', 128 | ) 129 | parser_lexicons.add_argument( 130 | '-l', '--lang', help='BCP 47 language code' 131 | ) 132 | parser_lexicons.add_argument( 133 | '--lexicon', help='lexicon specifiers' 134 | ) 135 | parser_lexicons.set_defaults(func=_lexicons) 136 | 137 | 138 | parser_projects = sub_parsers.add_parser( 139 | 'projects', 140 | description=( 141 | "Display a list of known projects. The first column shows the " 142 | "status for a project (i=indexed, c=cached)." 143 | ), 144 | help='list known projects', 145 | ) 146 | parser_projects.set_defaults(func=_projects) 147 | 148 | 149 | parser_validate = sub_parsers.add_parser( 150 | 'validate', 151 | description=( 152 | "Validate a WN-LMF lexicon" 153 | ), 154 | help='validate a lexicon', 155 | ) 156 | parser_validate.add_argument( 157 | 'FILE', type=_file_path_type, help='WN-LMF (XML) lexicon file to validate' 158 | ) 159 | parser_validate.add_argument( 160 | '--select', metavar='CHECKS', default='E,W', 161 | help='comma-separated list of checks to run (default: E,W)' 162 | ) 163 | parser_validate.add_argument( 164 | '--output-file', metavar='FILE', 165 | help='write report to a JSON file' 166 | ) 167 | parser_validate.set_defaults(func=_validate) 168 | 169 | 170 | args = parser.parse_args() 171 | 172 | logging.basicConfig(level=logging.ERROR - (min(args.verbosity, 3) * 10)) 173 | 174 | if args.dir: 175 | wn.config.data_directory = args.dir 176 | 177 | args.func(args) 178 | -------------------------------------------------------------------------------- /tests/data/mini-lmf-1.1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 14 | 15 | 16 | 17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | tatoe 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 53 | 54 | 56 | 57 | 59 | 60 | 61 | 62 | 63 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | "the artist illustrated the story beautifully" 92 | 93 | 94 | 95 | 96 | 97 | 98 | INF 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 118 | depict something in a visual medium 119 | 120 | 121 | 123 | terminate employment 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /tests/lmf_test.py: -------------------------------------------------------------------------------- 1 | 2 | from xml.etree import ElementTree as ET 3 | 4 | from wn import lmf 5 | 6 | 7 | def test_is_lmf(datadir): 8 | assert lmf.is_lmf(datadir / 'mini-lmf-1.0.xml') 9 | assert lmf.is_lmf(str(datadir / 'mini-lmf-1.0.xml')) 10 | assert not lmf.is_lmf(datadir / 'README.md') 11 | assert not lmf.is_lmf(datadir / 'missing.xml') 12 | assert lmf.is_lmf(datadir / 'mini-lmf-1.1.xml') 13 | 14 | 15 | def test_scan_lexicons(datadir): 16 | assert lmf.scan_lexicons(datadir / 'mini-lmf-1.0.xml') == [ 17 | { 18 | 'id': 'test-en', 19 | 'version': '1', 20 | 'label': 'Testing English WordNet', 21 | 'extends': None, 22 | }, 23 | { 24 | 'id': 'test-es', 25 | 'version': '1', 26 | 'label': 'Testing Spanish WordNet', 27 | 'extends': None, 28 | }, 29 | ] 30 | 31 | assert lmf.scan_lexicons(datadir / 'mini-lmf-1.1.xml') == [ 32 | { 33 | 'id': 'test-ja', 34 | 'version': '1', 35 | 'label': 'Testing Japanese WordNet', 36 | 'extends': None, 37 | }, 38 | { 39 | 'id': 'test-en-ext', 40 | 'version': '1', 41 | 'label': 'Testing English Extension', 42 | 'extends': { 43 | 'id': 'test-en', 44 | 'version': '1', 45 | }, 46 | }, 47 | ] 48 | 49 | 50 | def test_load_1_0(datadir): 51 | resource = lmf.load(datadir / 'mini-lmf-1.0.xml') 52 | lexicons = resource['lexicons'] 53 | assert len(lexicons) == 2 54 | lexicon = lexicons[0] 55 | 56 | assert lexicon['id'] == 'test-en' 57 | assert lexicon['label'] == 'Testing English WordNet' 58 | assert lexicon['language'] == 'en' 59 | assert lexicon['email'] == 'maintainer@example.com' 60 | assert lexicon['license'] == 'https://creativecommons.org/licenses/by/4.0/' 61 | assert lexicon['version'] == '1' 62 | assert lexicon['url'] == 'https://example.com/test-en' 63 | 64 | assert len(lexicon['entries']) == 9 65 | le = lexicon['entries'][0] 66 | assert le['id'] == 'test-en-information-n' 67 | 68 | assert le['lemma']['writtenForm'] == 'information' 69 | assert le['lemma']['partOfSpeech'] == 'n' 70 | assert le['lemma']['script'] == 'Latn' 71 | assert len(le['lemma']['tags']) == 1 72 | 73 | assert len(le.get('forms', [])) == 0 74 | 75 | assert len(le['senses']) == 1 76 | sense = le['senses'][0] 77 | assert sense['id'] == 'test-en-information-n-0001-01' 78 | assert sense['synset'] == 'test-en-0001-n' 79 | assert len(sense.get('relations', [])) == 0 80 | # assert sense['relations'][0]['target'] == 'test-en-exemplify-v-01023137-01' 81 | # assert sense['relations'][0]['type'] == 'derivation' 82 | 83 | assert len(lexicon.get('frames', [])) == 0 # frames are on lexical entry 84 | assert len(lexicon['entries'][6]['frames']) == 2 85 | frames = lexicon['entries'][6]['frames'] 86 | assert frames[0]['subcategorizationFrame'] == 'Somebody ----s something' 87 | assert frames[0]['senses'] == 
['test-en-illustrate-v-0003-01'] 88 | 89 | assert len(lexicon['synsets']) == 8 90 | 91 | assert lexicons[1]['id'] == 'test-es' 92 | 93 | 94 | def test_load_1_1(datadir): 95 | resource = lmf.load(datadir / 'mini-lmf-1.1.xml') 96 | lexicons = resource['lexicons'] 97 | assert len(lexicons) == 2 98 | lexicon = lexicons[0] 99 | assert lexicon['id'] == 'test-ja' 100 | assert lexicon['version'] == '1' 101 | # assert lexicon.logo == 'logo.svg' 102 | assert lexicon.get('requires') == [{'id': 'test-en', 'version': '1'}] 103 | 104 | lexicon = lexicons[1] 105 | assert lexicon['id'] == 'test-en-ext' 106 | assert lexicon.get('extends') == {'id': 'test-en', 'version': '1'} 107 | 108 | 109 | def test_load_1_3(datadir): 110 | resource = lmf.load(datadir / 'mini-lmf-1.3.xml') 111 | lexicons = resource['lexicons'] 112 | assert len(lexicons) == 1 113 | lexicon = lexicons[0] 114 | synsets = lexicon['synsets'] 115 | assert synsets[0]['definitions'][0]['text'] == 'one two three' 116 | assert synsets[1]['definitions'][0]['text'] == 'one two three' 117 | assert synsets[2]['definitions'][0]['text'] == ''' 118 | one 119 | two 120 | three 121 | ''' 122 | 123 | 124 | def test_load_1_4(datadir): 125 | resource = lmf.load(datadir / 'mini-lmf-1.4.xml') 126 | lexicons = resource['lexicons'] 127 | assert len(lexicons) == 1 128 | lexicon = lexicons[0] 129 | assert lexicon['entries'][0].get('index') == 'foo_bar' 130 | assert lexicon['entries'][1].get('index') == 'foo_bar' 131 | assert lexicon['entries'][2].get('index') is None 132 | assert lexicon['entries'][3].get('index') == 'baz' 133 | assert lexicon['entries'][4].get('index') is None 134 | assert lexicon['entries'][5].get('index') == 'baz' 135 | 136 | assert lexicon['entries'][0]['senses'][0].get('n') == 3 137 | assert lexicon['entries'][1]['senses'][0].get('n') == 2 138 | assert lexicon['entries'][1]['senses'][1].get('n') == 1 139 | assert lexicon['entries'][2]['senses'][0].get('n') is None 140 | assert lexicon['entries'][3]['senses'][0].get('n') == 2 141 | assert lexicon['entries'][4]['senses'][0].get('n') == 2 142 | assert lexicon['entries'][4]['senses'][1].get('n') is None 143 | assert lexicon['entries'][5]['senses'][0].get('n') == 1 144 | 145 | 146 | def test_dump(datadir, tmp_path): 147 | tmpdir = tmp_path / 'test_dump' 148 | tmpdir.mkdir() 149 | tmppath = tmpdir / 'mini_lmf_dump.xml' 150 | 151 | def assert_xml_equal(mini_lmf, dump_lmf): 152 | orig = ET.canonicalize(from_file=mini_lmf, strip_text=True) 153 | temp = ET.canonicalize(from_file=dump_lmf, strip_text=True) 154 | # additional transformation to help with debugging 155 | orig = orig.replace('<', '\n<') 156 | temp = temp.replace('<', '\n<') 157 | assert orig == temp 158 | 159 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.0.xml'), tmppath) 160 | assert_xml_equal(datadir / 'mini-lmf-1.0.xml', tmppath) 161 | 162 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.1.xml'), tmppath) 163 | assert_xml_equal(datadir / 'mini-lmf-1.1.xml', tmppath) 164 | 165 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.4.xml'), tmppath) 166 | assert_xml_equal(datadir / 'mini-lmf-1.4.xml', tmppath) 167 | -------------------------------------------------------------------------------- /docs/guides/wordnet.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 | 5 | 6 | The Structure of a Wordnet 7 | ========================== 8 | A **wordnet** is an online lexicon which is organized by concepts. 
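Wn exposes these building blocks (words, senses, and synsets, described below) directly in its API. The short sketch below shows them in code; it assumes an English lexicon such as ``ewn:2020`` (used elsewhere in this documentation) has already been downloaded and added, and that the word *bank* is present in it.

.. code-block:: python

    import wn

    en = wn.Wordnet('ewn:2020')                 # an installed lexicon (assumed)
    word = en.words(form='bank', pos='n')[0]    # a word (lexical entry)
    for sense in word.senses():                 # each sense links the word...
        synset = sense.synset()                 # ...to one synset
        print(synset.lemmas(), synset.definition())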
9 | 10 | The basic unit of a wordnet is the synonym set (**synset**), a group of words that all refer to the 11 | same concept. Words and synsets are linked by means of conceptual-semantic relations to form the 12 | structure of the wordnet. 13 | 14 | Words, Senses, and Synsets 15 | -------------------------- 16 | **Words** are the basic building blocks of a language. A word has two parts: its form and its meaning.
17 | In natural languages, however, word forms and word meanings do not line up in a neat one-to-one match:
18 | a single word form may be connected to several different meanings. **Senses** are therefore needed
19 | as the unit of word meaning. For example, the word *bank* has at least two senses: 20 |
21 | 1. bank\ :sup:`1`\: financial institution, like *City Bank*; 22 | 2. bank\ :sup:`2`\: sloping land, like *river bank*; 23 |
24 | Since a **synset** is a group of words sharing the same concept, bank\ :sup:`1`\ and bank\ :sup:`2`\ are members of 25 | two different synsets, even though they have the same word form. 26 |
27 | On the other hand, different word forms may also convey the same concept, such as *cab* and *taxi*;
28 | word forms that share a concept are grouped together into one synset. 29 |
30 | .. raw:: html 31 | :file: images/word-sense-synset.svg 32 | 33 | 34 | .. role:: center 35 | :class: center 36 | 37 | :center:`Figure: relations between words, senses and synsets` 38 | 39 |
40 | Synset Relations 41 | ---------------- 42 | In a wordnet, synsets are linked to each other by various kinds of relations. For example, if
43 | the concept expressed by one synset is more general than that of a given synset, then it is in a
44 | *hypernym* relation with the given synset. As shown in the figure below, the synset with *car*, *auto* and *automobile* as its
45 | members is the *hypernym* of the other synset with *cab*, *taxi* and *hack*. Relations built on
46 | the synset level like this are categorized as synset relations. 47 |
48 | .. raw:: html 49 | :file: images/synset-synset.svg 50 | 51 | :center:`Figure: example of synset relations` 52 |
53 | Sense Relations 54 | --------------- 55 |
56 | Some relations in a wordnet are built on the sense level. These can be further divided into two types:
57 | relations that link a sense with another sense, and relations that link a sense with a synset. 58 |
59 | .. note:: In a wordnet, synset relations and sense relations can both employ a particular
60 | relation type, such as `domain topic `_. 61 |
62 | **Sense-Sense** 63 |
64 | Sense-to-sense relations capture the connections between individual senses, especially when dealing
65 | with morphologically related words. For example, *behavioral* is the adjective derived from the noun *behavior*,
66 | and is said to be in the *pertainym* relation with *behavior*. However, no such relation holds between
67 | *behavioral* and *conduct*, even though *conduct* is a synonym of *behavior* and is in the same synset. *Pertainym*
68 | is therefore a sense-sense relation. 69 |
70 | .. raw:: html 71 | :file: images/sense-sense.svg 72 | 73 | :center:`Figure: example of sense-sense relations` 74 |
75 | **Sense-Synset** 76 |
77 | Sense-synset relations connect a particular sense with a synset. For example, *cursor* is a term in the
78 | *computer science* discipline, so in a wordnet it is in the *has domain topic* relation with the
79 | *computer science* synset. *Pointer*, which is in the same synset as *cursor*, is not such a term and thus
80 | has no such relation with the *computer science* synset. 81 | 82 | ..
raw:: html 83 | :file: images/sense-synset.svg 84 | 85 | :center:`Figure: example of sense-synset relations` 86 |
87 | Other Information 88 | ----------------- 89 | A wordnet should be encoded in an appropriate form; two schemas are accepted: 90 |
91 | * XML schema based on the Lexical Markup Framework (LMF) 92 | * JSON-LD using the Lexicon Model for Ontologies 93 |
94 | The structure of a wordnet should contain the following information: 95 |
96 | **Definition** 97 |
98 | A definition describes a sense or synset in a wordnet, and it is given in the language
99 | of the wordnet it comes from. 100 |
101 | **Example** 102 |
103 | An example clarifies a sense or synset in a wordnet; with a given example, users can understand
104 | the definition more clearly. 105 |
106 | **Metadata** 107 |
108 | A wordnet has its own metadata, based on the `Dublin Core `_, which states its
109 | basic information. The table below lists all of the items in the metadata of a wordnet: 110 |
111 | +------------------+-----------+-----------+ | Field | Required | Type | +==================+===========+===========+
112 | | contributor | Optional | str | 113 | +------------------+-----------+-----------+
114 | | coverage | Optional | str | 115 | +------------------+-----------+-----------+
116 | | creator | Optional | str | 117 | +------------------+-----------+-----------+
118 | | date | Optional | str | 119 | +------------------+-----------+-----------+
120 | | description | Optional | str | 121 | +------------------+-----------+-----------+
122 | | format | Optional | str | 123 | +------------------+-----------+-----------+
124 | | identifier | Optional | str | 125 | +------------------+-----------+-----------+
126 | | publisher | Optional | str | 127 | +------------------+-----------+-----------+
128 | | relation | Optional | str | 129 | +------------------+-----------+-----------+
130 | | rights | Optional | str | 131 | +------------------+-----------+-----------+
132 | | source | Optional | str | 133 | +------------------+-----------+-----------+
134 | | subject | Optional | str | 135 | +------------------+-----------+-----------+
136 | | title | Optional | str | 137 | +------------------+-----------+-----------+
138 | | type | Optional | str | 139 | +------------------+-----------+-----------+
140 | | status | Optional | str | 141 | +------------------+-----------+-----------+
142 | | note | Optional | str | 143 | +------------------+-----------+-----------+
144 | | confidence | Optional | float | 145 | +------------------+-----------+-----------+ -------------------------------------------------------------------------------- /wn/_module_functions.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import wn 4 | from wn._util import format_lexicon_specifier 5 | 6 | 7 | def projects() -> list[dict]: 8 | """Return the list of indexed projects. 9 | 10 | This returns the same dictionaries of information as 11 | :meth:`wn.config.get_project_info 12 | `, but for all indexed 13 | projects.
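Each dictionary includes keys such as ``id``, ``version``, ``label``, ``language``, ``cache``, and ``resource_urls``.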
14 | 15 | Example: 16 | 17 | >>> infos = wn.projects() 18 | >>> len(infos) 19 | 36 20 | >>> infos[0]['label'] 21 | 'Open English WordNet' 22 | 23 | """ 24 | index = wn.config.index 25 | return [ 26 | wn.config.get_project_info(format_lexicon_specifier(project_id, version)) 27 | for project_id, project_info in index.items() 28 | for version in project_info.get('versions', []) 29 | if 'resource_urls' in project_info['versions'][version] 30 | ] 31 | 32 | 33 | def lexicons( 34 | *, 35 | lexicon: Optional[str] = "*", 36 | lang: Optional[str] = None 37 | ) -> list[wn.Lexicon]: 38 | """Return the lexicons matching a language or lexicon specifier. 39 | 40 | Example: 41 | 42 | >>> wn.lexicons(lang='en') 43 | [, ] 44 | 45 | """ 46 | try: 47 | w = wn.Wordnet(lang=lang, lexicon=lexicon or '*') 48 | except wn.Error: 49 | return [] 50 | else: 51 | return w.lexicons() 52 | 53 | 54 | def word( 55 | id: str, 56 | *, 57 | lexicon: Optional[str] = None, 58 | lang: Optional[str] = None 59 | ) -> wn.Word: 60 | """Return the word with *id* in *lexicon*. 61 | 62 | This will create a :class:`Wordnet` object using the *lang* and 63 | *lexicon* arguments. The *id* argument is then passed to the 64 | :meth:`Wordnet.word` method. 65 | 66 | >>> wn.word('ewn-cell-n') 67 | Word('ewn-cell-n') 68 | 69 | """ 70 | return wn.Wordnet(lang=lang, lexicon=lexicon).word(id) 71 | 72 | 73 | def words( 74 | form: Optional[str] = None, 75 | pos: Optional[str] = None, 76 | *, 77 | lexicon: Optional[str] = None, 78 | lang: Optional[str] = None, 79 | ) -> list[wn.Word]: 80 | """Return the list of matching words. 81 | 82 | This will create a :class:`Wordnet` object using the *lang* and 83 | *lexicon* arguments. The remaining arguments are passed to the 84 | :meth:`Wordnet.words` method. 85 | 86 | >>> len(wn.words()) 87 | 282902 88 | >>> len(wn.words(pos='v')) 89 | 34592 90 | >>> wn.words(form="scurry") 91 | [Word('ewn-scurry-n'), Word('ewn-scurry-v')] 92 | 93 | """ 94 | return wn.Wordnet(lang=lang, lexicon=lexicon).words(form=form, pos=pos) 95 | 96 | 97 | def synset( 98 | id: str, 99 | *, 100 | lexicon: Optional[str] = None, 101 | lang: Optional[str] = None 102 | ) -> wn.Synset: 103 | """Return the synset with *id* in *lexicon*. 104 | 105 | This will create a :class:`Wordnet` object using the *lang* and 106 | *lexicon* arguments. The *id* argument is then passed to the 107 | :meth:`Wordnet.synset` method. 108 | 109 | >>> wn.synset('ewn-03311152-n') 110 | Synset('ewn-03311152-n') 111 | 112 | """ 113 | return wn.Wordnet(lang=lang, lexicon=lexicon).synset(id=id) 114 | 115 | 116 | def synsets( 117 | form: Optional[str] = None, 118 | pos: Optional[str] = None, 119 | ili: Optional[Union[str, wn.ILI]] = None, 120 | *, 121 | lexicon: Optional[str] = None, 122 | lang: Optional[str] = None, 123 | ) -> list[wn.Synset]: 124 | """Return the list of matching synsets. 125 | 126 | This will create a :class:`Wordnet` object using the *lang* and 127 | *lexicon* arguments. The remaining arguments are passed to the 128 | :meth:`Wordnet.synsets` method. 129 | 130 | >>> len(wn.synsets('couch')) 131 | 4 132 | >>> wn.synsets('couch', pos='v') 133 | [Synset('ewn-00983308-v')] 134 | 135 | """ 136 | return wn.Wordnet(lang=lang, lexicon=lexicon).synsets(form=form, pos=pos, ili=ili) 137 | 138 | 139 | def senses( 140 | form: Optional[str] = None, 141 | pos: Optional[str] = None, 142 | *, 143 | lexicon: Optional[str] = None, 144 | lang: Optional[str] = None, 145 | ) -> list[wn.Sense]: 146 | """Return the list of matching senses. 
147 | 148 | This will create a :class:`Wordnet` object using the *lang* and 149 | *lexicon* arguments. The remaining arguments are passed to the 150 | :meth:`Wordnet.senses` method. 151 | 152 | >>> len(wn.senses('twig')) 153 | 3 154 | >>> wn.senses('twig', pos='n') 155 | [Sense('ewn-twig-n-13184889-02')] 156 | 157 | """ 158 | return wn.Wordnet(lang=lang, lexicon=lexicon).senses(form=form, pos=pos) 159 | 160 | 161 | def sense( 162 | id: str, 163 | *, 164 | lexicon: Optional[str] = None, 165 | lang: Optional[str] = None 166 | ) -> wn.Sense: 167 | """Return the sense with *id* in *lexicon*. 168 | 169 | This will create a :class:`Wordnet` object using the *lang* and 170 | *lexicon* arguments. The *id* argument is then passed to the 171 | :meth:`Wordnet.sense` method. 172 | 173 | >>> wn.sense('ewn-flutter-v-01903884-02') 174 | Sense('ewn-flutter-v-01903884-02') 175 | 176 | """ 177 | return wn.Wordnet(lang=lang, lexicon=lexicon).sense(id=id) 178 | 179 | 180 | def ili( 181 | id: str, 182 | *, 183 | lexicon: Optional[str] = None, 184 | lang: Optional[str] = None 185 | ) -> wn.ILI: 186 | """Return the interlingual index with *id*. 187 | 188 | This will create a :class:`Wordnet` object using the *lang* and 189 | *lexicon* arguments. The *id* argument is then passed to the 190 | :meth:`Wordnet.ili` method. 191 | 192 | """ 193 | return wn.Wordnet(lang=lang, lexicon=lexicon).ili(id=id) 194 | 195 | 196 | def ilis( 197 | status: Optional[str] = None, 198 | *, 199 | lexicon: Optional[str] = None, 200 | lang: Optional[str] = None, 201 | ) -> list[wn.ILI]: 202 | """Return the list of matching interlingual indices. 203 | 204 | This will create a :class:`Wordnet` object using the *lang* and 205 | *lexicon* arguments. The remaining arguments are passed to the 206 | :meth:`Wordnet.ilis` method. 207 | 208 | >>> len(wn.ilis()) 209 | 120071 210 | >>> len(wn.ilis(status='proposed')) 211 | 2573 212 | >>> wn.ilis(status='proposed')[-1].definition() 213 | 'the neutrino associated with the tau lepton.' 
214 | 215 | """ 216 | return wn.Wordnet(lang=lang, lexicon=lexicon).ilis(status=status) 217 | -------------------------------------------------------------------------------- /tests/secondary_query_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | 6 | 7 | @pytest.mark.usefixtures('mini_db') 8 | def test_word_senses(): 9 | assert len(wn.word('test-en-information-n').senses()) == 1 10 | assert len(wn.word('test-es-información-n').senses()) == 1 11 | 12 | 13 | @pytest.mark.usefixtures('mini_db') 14 | def test_word_synsets(): 15 | assert len(wn.word('test-en-information-n').synsets()) == 1 16 | assert len(wn.word('test-es-información-n').synsets()) == 1 17 | 18 | 19 | @pytest.mark.usefixtures('mini_db') 20 | def test_word_translate(): 21 | assert len(wn.word('test-en-example-n').translate(lang='es')) == 1 22 | assert len(wn.word('test-es-ejemplo-n').translate(lang='en')) == 1 23 | 24 | 25 | @pytest.mark.usefixtures('mini_db') 26 | def test_sense_word(): 27 | assert (wn.sense('test-en-information-n-0001-01').word() 28 | == wn.word('test-en-information-n')) 29 | assert (wn.sense('test-es-información-n-0001-01').word() 30 | == wn.word('test-es-información-n')) 31 | 32 | 33 | @pytest.mark.usefixtures('mini_db') 34 | def test_sense_synset(): 35 | assert (wn.sense('test-en-information-n-0001-01').synset() 36 | == wn.synset('test-en-0001-n')) 37 | assert (wn.sense('test-es-información-n-0001-01').synset() 38 | == wn.synset('test-es-0001-n')) 39 | 40 | 41 | @pytest.mark.usefixtures('mini_db') 42 | def test_sense_issue_157(): 43 | # https://github.com/goodmami/wn/issues/157 44 | sense = wn.sense('test-en-information-n-0001-01') 45 | # This test uses non-public members, which is not ideal, but there 46 | # is currently no better alternative. 
47 | assert sense._lexconf is sense.word()._lexconf 48 | assert sense._lexconf is sense.synset()._lexconf 49 | 50 | 51 | @pytest.mark.usefixtures('mini_db') 52 | def test_sense_examples(): 53 | assert wn.sense('test-en-information-n-0001-01').examples() == [] 54 | assert wn.sense('test-es-información-n-0001-01').examples() == [] 55 | 56 | 57 | @pytest.mark.usefixtures('mini_db') 58 | def test_sense_lexicalized(): 59 | assert wn.sense('test-en-information-n-0001-01').lexicalized() 60 | assert wn.sense('test-es-información-n-0001-01').lexicalized() 61 | 62 | 63 | @pytest.mark.usefixtures('mini_db') 64 | def test_sense_frames(): 65 | assert wn.sense('test-en-illustrate-v-0003-01').frames() == [ 66 | 'Somebody ----s something', 67 | 'Something ----s something', 68 | ] 69 | assert wn.sense('test-es-ilustrar-v-0003-01').frames() == [] 70 | 71 | 72 | @pytest.mark.usefixtures('mini_db_1_1') 73 | def test_sense_frames_issue_156(): 74 | # https://github.com/goodmami/wn/issues/156 75 | assert wn.sense('test-ja-示す-v-0003-01').frames() == [ 76 | 'ある人が何かを----', 77 | ] 78 | assert wn.sense('test-ja-事例-n-0002-01').frames() == [] 79 | 80 | 81 | @pytest.mark.usefixtures('mini_db') 82 | def test_sense_translate(): 83 | assert len(wn.sense('test-en-information-n-0001-01').translate(lang='es')) == 1 84 | assert len(wn.sense('test-es-información-n-0001-01').translate(lang='en')) == 1 85 | 86 | 87 | @pytest.mark.usefixtures('mini_db') 88 | def test_synset_senses(): 89 | assert len(wn.synset('test-en-0003-v').senses()) == 2 90 | assert len(wn.synset('test-es-0003-v').senses()) == 2 91 | 92 | 93 | @pytest.mark.usefixtures('mini_db') 94 | def test_synset_words(): 95 | assert len(wn.synset('test-en-0003-v').words()) == 2 96 | assert len(wn.synset('test-es-0003-v').words()) == 2 97 | 98 | 99 | @pytest.mark.usefixtures('mini_db') 100 | def test_synset_lemmas(): 101 | assert wn.synset('test-en-0003-v').lemmas() == ['exemplify', 'illustrate'] 102 | assert wn.synset('test-es-0003-v').lemmas() == ['ejemplificar', 'ilustrar'] 103 | 104 | 105 | @pytest.mark.usefixtures('mini_db') 106 | def test_synset_ili(): 107 | assert isinstance(wn.synset('test-en-0001-n').ili, wn.ILI) 108 | assert wn.synset('test-en-0001-n').ili.id == 'i67447' 109 | assert wn.synset('test-en-0001-n').ili.status == 'presupposed' 110 | assert wn.synset('test-en-0008-n').ili is None 111 | assert wn.synset('test-en-0007-v').ili.id is None 112 | assert wn.synset('test-en-0007-v').ili.status == 'proposed' 113 | 114 | 115 | @pytest.mark.usefixtures('mini_db') 116 | def test_synset_definition(): 117 | assert wn.synset('test-en-0001-n').definition() == 'something that informs' 118 | defn = wn.synset('test-en-0001-n').definition(data=True) 119 | assert defn.source_sense_id == 'test-en-information-n-0001-01' 120 | assert wn.synset('test-es-0001-n').definition() == 'algo que informa' 121 | 122 | 123 | @pytest.mark.usefixtures('mini_db') 124 | def test_synset_definitions(): 125 | assert wn.synset('test-en-0001-n').definitions() == ['something that informs'] 126 | defns = wn.synset('test-en-0001-n').definitions(data=True) 127 | assert defns[0].source_sense_id == 'test-en-information-n-0001-01' 128 | assert wn.synset('test-es-0001-n').definitions() == ['algo que informa'] 129 | 130 | 131 | @pytest.mark.usefixtures('mini_db') 132 | def test_synset_examples(): 133 | assert wn.synset('test-en-0001-n').examples() == ['"this is information"'] 134 | ex = wn.synset('test-en-0001-n').examples(data=True)[0] 135 | assert ex.text == '"this is information"' 136 | assert 
wn.synset('test-es-0001-n').examples() == ['"este es la información"'] 137 | 138 | 139 | @pytest.mark.usefixtures('mini_db') 140 | def test_synset_lexicalized(): 141 | assert wn.synset('test-en-0001-n').lexicalized() 142 | assert wn.synset('test-es-0001-n').lexicalized() 143 | 144 | 145 | @pytest.mark.usefixtures('mini_db') 146 | def test_synset_translate(): 147 | assert len(wn.synset('test-en-0001-n').translate(lang='es')) == 1 148 | assert len(wn.synset('test-es-0001-n').translate(lang='en')) == 1 149 | 150 | 151 | @pytest.mark.usefixtures('uninitialized_datadir') 152 | def test_word_sense_order(datadir): 153 | wn.add(datadir / 'sense-member-order.xml') 154 | assert [s.id for s in wn.word('test-foo-n').senses()] == [ 155 | "test-01-foo-n", "test-02-foo-n", 156 | ] 157 | assert [s.id for s in wn.word('test-bar-n').senses()] == [ 158 | "test-02-bar-n", "test-01-bar-n", 159 | ] 160 | 161 | 162 | @pytest.mark.usefixtures('uninitialized_datadir') 163 | def test_synset_member_order(datadir): 164 | wn.add(datadir / 'sense-member-order.xml') 165 | assert [s.id for s in wn.synset('test-01-n').senses()] == [ 166 | "test-01-bar-n", "test-01-foo-n", 167 | ] 168 | assert [s.id for s in wn.synset('test-02-n').senses()] == [ 169 | "test-02-bar-n", "test-02-foo-n", 170 | ] 171 | -------------------------------------------------------------------------------- /tests/relations_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | 6 | 7 | @pytest.mark.usefixtures('mini_db') 8 | def test_word_derived_words(): 9 | assert len(wn.word('test-en-example-n').derived_words()) == 1 10 | assert len(wn.word('test-es-ejemplo-n').derived_words()) == 1 11 | 12 | 13 | @pytest.mark.usefixtures('mini_db') 14 | def test_synset_hypernyms(): 15 | assert wn.synset('test-en-0002-n').hypernyms() == [ 16 | wn.synset('test-en-0001-n') 17 | ] 18 | assert wn.synset('test-en-0001-n').hypernyms() == [] 19 | 20 | 21 | @pytest.mark.usefixtures('mini_db') 22 | def test_synset_hypernyms_expand_default(): 23 | assert wn.synset('test-es-0002-n').hypernyms() == [ 24 | wn.synset('test-es-0001-n') 25 | ] 26 | assert wn.synset('test-es-0001-n').hypernyms() == [] 27 | 28 | 29 | @pytest.mark.usefixtures('mini_db') 30 | def test_synset_hypernyms_expand_empty(): 31 | w = wn.Wordnet(lang='es', expand='') 32 | assert w.synset('test-es-0002-n').hypernyms() == [] 33 | 34 | 35 | @pytest.mark.usefixtures('mini_db') 36 | def test_synset_hypernyms_expand_specified(): 37 | w = wn.Wordnet(lang='es', expand='test-en') 38 | assert w.synset('test-es-0002-n').hypernyms() == [ 39 | w.synset('test-es-0001-n') 40 | ] 41 | 42 | 43 | @pytest.mark.usefixtures('mini_db') 44 | def test_synset_relations(): 45 | w = wn.Wordnet(lang='en') 46 | assert w.synset('test-en-0002-n').relations() == { 47 | 'hypernym': [w.synset('test-en-0001-n')], 48 | 'hyponym': [w.synset('test-en-0004-n')] 49 | } 50 | 51 | 52 | @pytest.mark.usefixtures('mini_db') 53 | def test_sense_get_related(): 54 | w = wn.Wordnet('test-en') 55 | assert w.sense('test-en-example-n-0002-01').get_related() == [ 56 | w.sense('test-en-exemplify-v-0003-01') 57 | ] 58 | 59 | 60 | @pytest.mark.usefixtures('mini_db') 61 | def test_sense_relations(): 62 | w = wn.Wordnet('test-en') 63 | assert w.sense('test-en-example-n-0002-01').relations() == { 64 | 'derivation': [w.sense('test-en-exemplify-v-0003-01')] 65 | } 66 | 67 | 68 | @pytest.mark.usefixtures('mini_db_1_1') 69 | def test_extension_relations(): 70 | # default mode 71 | assert 
wn.synset('test-en-0007-v').hypernyms() == [ 72 | wn.synset('test-en-ext-0009-v') 73 | ] 74 | assert wn.synset('test-en-ext-0009-v').hyponyms() == [ 75 | wn.synset('test-en-0007-v') 76 | ] 77 | assert wn.sense('test-en-information-n-0001-01').get_related('pertainym') == [ 78 | wn.sense('test-en-ext-info-n-0001-01') 79 | ] 80 | assert wn.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [ 81 | wn.sense('test-en-information-n-0001-01') 82 | ] 83 | 84 | # restricted to base 85 | w = wn.Wordnet(lexicon='test-en') 86 | assert w.synset('test-en-0007-v').hypernyms() == [] 87 | assert w.sense('test-en-information-n-0001-01').get_related('pertainym') == [] 88 | 89 | # base and extension 90 | w = wn.Wordnet(lexicon='test-en test-en-ext') 91 | assert w.synset('test-en-0007-v').hypernyms() == [ 92 | w.synset('test-en-ext-0009-v') 93 | ] 94 | assert w.synset('test-en-ext-0009-v').hyponyms() == [ 95 | w.synset('test-en-0007-v') 96 | ] 97 | assert w.sense('test-en-information-n-0001-01').get_related('pertainym') == [ 98 | w.sense('test-en-ext-info-n-0001-01') 99 | ] 100 | assert w.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [ 101 | w.sense('test-en-information-n-0001-01') 102 | ] 103 | 104 | # restricted to extension 105 | w = wn.Wordnet(lexicon='test-en-ext') 106 | assert w.synset('test-en-ext-0009-v').hyponyms() == [] 107 | assert w.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [] 108 | 109 | 110 | @pytest.mark.usefixtures('mini_db_1_1') 111 | def test_sense_synset_issue_168(): 112 | # https://github.com/goodmami/wn/issues/168 113 | ja = wn.Wordnet(lexicon='test-ja', expand='') 114 | assert ja.synset('test-ja-0001-n').get_related() == [] 115 | assert ja.sense('test-ja-情報-n-0001-01').synset().get_related() == [] 116 | 117 | 118 | @pytest.mark.usefixtures('mini_db') 119 | def test_synset_relations_issue_169(): 120 | # https://github.com/goodmami/wn/issues/169 121 | en = wn.Wordnet('test-en') 122 | assert list(en.synset("test-en-0001-n").relations('hyponym')) == ['hyponym'] 123 | es = wn.Wordnet('test-es', expand='test-en') 124 | assert list(es.synset("test-es-0001-n").relations('hyponym')) == ['hyponym'] 125 | 126 | 127 | @pytest.mark.usefixtures('mini_db') 128 | def test_synset_relations_issue_177(): 129 | # https://github.com/goodmami/wn/issues/177 130 | assert 'hyponym' in wn.synset('test-es-0001-n').relations() 131 | 132 | 133 | @pytest.mark.usefixtures('mini_db') 134 | def test_sense_relation_map(): 135 | en = wn.Wordnet('test-en') 136 | assert en.sense('test-en-information-n-0001-01').relation_map() == {} 137 | relmap = en.sense('test-en-illustrate-v-0003-01').relation_map() 138 | # only sense-sense relations by default 139 | assert len(relmap) == 3 140 | assert all(isinstance(tgt, wn.Sense) for tgt in relmap.values()) 141 | assert {rel.name for rel in relmap} == {'derivation', 'other'} 142 | assert {rel.target_id for rel in relmap} == {'test-en-illustration-n-0002-01'} 143 | # sense relations targets should always have same ids as resolved targets 144 | assert all(rel.target_id == tgt.id for rel, tgt in relmap.items()) 145 | 146 | 147 | @pytest.mark.usefixtures('mini_db') 148 | def test_synset_relation_map(): 149 | en = wn.Wordnet('test-en') 150 | assert en.synset('test-en-0003-v').relation_map() == {} 151 | relmap = en.synset('test-en-0002-n').relation_map() 152 | assert len(relmap) == 2 153 | assert {rel.name for rel in relmap} == {'hypernym', 'hyponym'} 154 | assert {rel.target_id for rel in relmap} == {'test-en-0001-n', 'test-en-0004-n'} 155 
| # synset relation targets have same ids as resolved targets in same lexicon 156 | assert all(rel.target_id == tgt.id for rel, tgt in relmap.items()) 157 | assert all(rel.lexicon().id == 'test-en' for rel in relmap) 158 | 159 | # interlingual synset relation targets show original target ids 160 | es = wn.Wordnet('test-es', expand='test-en') 161 | relmap = es.synset('test-es-0002-n').relation_map() 162 | assert len(relmap) == 2 163 | assert {rel.name for rel in relmap} == {'hypernym', 'hyponym'} 164 | assert {rel.target_id for rel in relmap} == {'test-en-0001-n', 'test-en-0004-n'} 165 | assert all(rel.target_id != tgt.id for rel, tgt in relmap.items()) 166 | assert all(rel.lexicon().id == 'test-en' for rel in relmap) 167 | -------------------------------------------------------------------------------- /docs/api/wn.similarity.rst: -------------------------------------------------------------------------------- 1 | wn.similarity 2 | ============= 3 | 4 | .. automodule:: wn.similarity 5 | 6 | Taxonomy-based Metrics 7 | ---------------------- 8 | 9 | The `Path `_, `Leacock-Chodorow `_, and `Wu-Palmer `_ similarity 11 | metrics work by finding path distances in the hypernym/hyponym 12 | taxonomy. As such, they are most useful when the synsets are, in fact, 13 | arranged in a taxonomy. For the Princeton WordNet and derivative 14 | wordnets, such as the `Open English Wordnet`_ and `OMW English Wordnet 15 | based on WordNet 3.0`_ available to Wn, synsets for nouns and verbs 16 | are arranged taxonomically: the nouns mostly form a single structure 17 | with a single root while verbs form many smaller structures with many 18 | roots. Synsets for the other parts of speech do not use 19 | hypernym/hyponym relations at all. This situation may be different for 20 | other wordnet projects or future versions of the English wordnets. 21 | 22 | .. _Open English Wordnet: https://en-word.net 23 | .. _OMW English Wordnet based on WordNet 3.0: https://github.com/omwn/omw-data 24 | 25 | The similarity metrics tend to fail when the synsets are not connected 26 | by some path. When the synsets are in different parts of speech, or 27 | even in separate lexicons, this failure is acceptable and 28 | expected. But for cases like the verbs in the Princeton WordNet, it 29 | might be more useful to pretend that there is some unique root for all 30 | verbs so as to create a path connecting any two of them. For this 31 | purpose, the *simulate_root* parameter is available on the 32 | :func:`path`, :func:`lch`, and :func:`wup` functions, where it is 33 | passed on to calls to :meth:`wn.Synset.shortest_path` and 34 | :meth:`wn.Synset.lowest_common_hypernyms`. Setting *simulate_root* to 35 | :python:`True` can, however, give surprising results if the words are 36 | from a different lexicon. Currently, computing similarity for synsets 37 | from a different part of speech raises an error. 38 | 39 | 40 | Path Similarity 41 | ''''''''''''''' 42 | 43 | When :math:`p` is the length of the shortest path between two synsets, 44 | the path similarity is: 45 | 46 | .. math:: 47 | 48 | \frac{1}{p + 1} 49 | 50 | The similarity score ranges between 0.0 and 1.0, where the higher the 51 | score is, the more similar the synsets are. The score is 1.0 when a 52 | synset is compared to itself, and 0.0 when there is no path between 53 | the two synsets (i.e., the path distance is infinite). 54 | 55 | .. autofunction:: path 56 | 57 | 58 | .. 
_leacock-chodorow-similarity: 59 | 60 | Leacock-Chodorow Similarity 61 | ''''''''''''''''''''''''''' 62 | 63 | When :math:`p` is the length of the shortest path between two synsets 64 | and :math:`d` is the maximum taxonomy depth, the Leacock-Chodorow 65 | similarity is: 66 | 67 | .. math:: 68 | 69 | -\text{log}\left(\frac{p + 1}{2d}\right) 70 | 71 | .. autofunction:: lch 72 | 73 | 74 | Wu-Palmer Similarity 75 | '''''''''''''''''''' 76 | 77 | When *LCS* is the lowest common hypernym (also called "least common 78 | subsumer") between two synsets, :math:`i` is the shortest path 79 | distance from the first synset to *LCS*, :math:`j` is the shortest 80 | path distance from the second synset to *LCS*, and :math:`k` is the 81 | number of nodes (distance + 1) from *LCS* to the root node, then the 82 | Wu-Palmer similarity is: 83 | 84 | .. math:: 85 | 86 | \frac{2k}{i + j + 2k} 87 | 88 | .. autofunction:: wup 89 | 90 | 91 | Information Content-based Metrics 92 | --------------------------------- 93 | 94 | The `Resnik `_, `Jiang-Conrath `_, and `Lin `_ similarity metrics work 96 | by computing the information content of the synsets and/or that of 97 | their lowest common hypernyms. They therefore require information 98 | content weights (see :mod:`wn.ic`), and the values returned 99 | necessarily depend on the weights used. 100 | 101 | 102 | Resnik Similarity 103 | ''''''''''''''''' 104 | 105 | The Resnik similarity (`Resnik 1995 106 | `_) is the maximum 107 | information content value of the common subsumers (hypernym ancestors) 108 | of the two synsets. Formally it is defined as follows, where 109 | :math:`c_1` and :math:`c_2` are the two synsets being compared. 110 | 111 | .. math:: 112 | 113 | \text{max}_{c \in \text{S}(c_1, c_2)} \text{IC}(c) 114 | 115 | Since a synset's information content is always equal or greater than 116 | the information content of its hypernyms, :math:`S(c_1, c_2)` above is 117 | more efficiently computed using the lowest common hypernyms instead of 118 | all common hypernyms. 119 | 120 | .. autofunction:: res 121 | 122 | 123 | Jiang-Conrath Similarity 124 | '''''''''''''''''''''''' 125 | 126 | The Jiang-Conrath similarity metric (`Jiang and Conrath, 1997 127 | `_) combines the ideas 128 | of the taxonomy-based and information content-based metrics. It is 129 | defined as follows, where :math:`c_1` and :math:`c_2` are the two 130 | synsets being compared and :math:`c_0` is the lowest common hypernym 131 | of the two with the highest information content weight: 132 | 133 | .. math:: 134 | 135 | \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) - 2(\text{IC}(c_0))} 136 | 137 | This equation is the simplified form given in the paper were several 138 | parameterized terms are cancelled out because the full form is not 139 | often used in practice. 140 | 141 | There are two special cases: 142 | 143 | 1. If the information content of :math:`c_0`, :math:`c_1`, and 144 | :math:`c_2` are all zero, the metric returns zero. This occurs when 145 | both :math:`c_1` and :math:`c_2` are the root node, but it can also 146 | occur if the synsets did not occur in the corpus and the smoothing 147 | value was set to zero. 148 | 149 | 2. Otherwise if :math:`c_1 + c_2 = 2c_0`, the metric returns 150 | infinity. This occurs when the two synsets are the same, one is a 151 | descendant of the other, etc., such that they have the same 152 | frequency as each other and as their lowest common hypernym. 153 | 154 | .. 
.. autofunction:: jcn 155 | 156 | 157 | Lin Similarity 158 | '''''''''''''' 159 | 160 | Another formulation of information content-based similarity is the Lin 161 | metric (`Lin 1997 `_), 162 | which is defined as follows, where :math:`c_1` and :math:`c_2` are the 163 | two synsets being compared and :math:`c_0` is the lowest common 164 | hypernym with the highest information content weight: 165 | 166 | .. math:: 167 | 168 | \frac{2(\text{IC}(c_0))}{\text{IC}(c_1) + \text{IC}(c_2)} 169 | 170 | One special case is if either synset has an information content value 171 | of zero, in which case the metric returns zero. 172 | 173 | .. autofunction:: lin 174 | -------------------------------------------------------------------------------- /wn/util.py: -------------------------------------------------------------------------------- 1 | """Wn utility classes.""" 2 | from collections.abc import Callable 3 | from typing import TextIO 4 | import sys 5 | 6 | 7 | def synset_id_formatter( 8 | fmt: str = '{prefix}-{offset:08}-{pos}', 9 | **kwargs 10 | ) -> Callable: 11 | """Return a function for formatting synset ids. 12 | 13 | The *fmt* argument can be customized. It will be formatted using 14 | any other keyword arguments given to this function and any given 15 | to the resulting function. By default, the format string expects a 16 | ``prefix`` string argument for the namespace (such as a lexicon 17 | id), an ``offset`` integer argument (such as a WNDB offset), and a 18 | ``pos`` string argument. 19 | 20 | Arguments: 21 | fmt: A Python format string 22 | **kwargs: Keyword arguments for the format string. 23 | 24 | Example: 25 | 26 | >>> pwn_synset_id = synset_id_formatter(prefix='pwn') 27 | >>> pwn_synset_id(offset=1174, pos='n') 28 | 'pwn-00001174-n' 29 | 30 | """ 31 | 32 | def format_synset_id(**_kwargs) -> str: 33 | return fmt.format(**kwargs, **_kwargs) 34 | 35 | return format_synset_id 36 | 37 | 38 | class ProgressHandler: 39 | """An interface for updating progress in long-running processes. 40 | 41 | Long-running processes in Wn, such as :func:`wn.download` and 42 | :func:`wn.add`, call to a progress handler object as they go. The 43 | default progress handler used by Wn is :class:`ProgressBar`, which 44 | updates progress by formatting and printing a textual bar to 45 | stderr. The :class:`ProgressHandler` class may be used directly, 46 | which does nothing, or users may create their own subclasses for, 47 | e.g., updating a GUI or some other handler. 48 | 49 | The initialization parameters, except for ``file``, are stored in 50 | a :attr:`kwargs` member and may be updated after the handler is 51 | created through the :meth:`set` method. The :meth:`update` method 52 | is the primary way a counter is updated. The :meth:`flash` method 53 | is sometimes called for simple messages. When the process is 54 | complete, the :meth:`close` method is called. 55 | 56 | 57 | """ 58 | 59 | def __init__( 60 | self, 61 | *, 62 | message: str = '', 63 | count: int = 0, 64 | total: int = 0, 65 | refresh_interval: int = 0, 66 | unit: str = '', 67 | status: str = '', 68 | file: TextIO = sys.stderr, 69 | ): 70 | self.file = file 71 | self.kwargs = { 72 | 'count': count, 73 | 'total': total, 74 | 'refresh_interval': refresh_interval, 75 | 'message': message, 76 | 'unit': unit, 77 | 'status': status, 78 | } 79 | self._refresh_quota: int = refresh_interval 80 | 81 | def update(self, n: int = 1, force: bool = False) -> None: 82 | """Update the counter with the increment value *n*.
83 | 84 | This method should update the ``count`` key of :attr:`kwargs` 85 | with the increment value *n*. After this, it is expected to 86 | update some user-facing progress indicator. 87 | 88 | If *force* is :python:`True`, any indicator will be refreshed 89 | regardless of the value of the refresh interval. 90 | 91 | """ 92 | self.kwargs['count'] += n  # type: ignore 93 | 94 | def set(self, **kwargs) -> None: 95 | """Update progress handler parameters. 96 | 97 | Calling this method also runs :meth:`update` with an increment 98 | of 0, which causes a refresh of any indicator without changing 99 | the counter. 100 | 101 | """ 102 | self.kwargs.update(**kwargs) 103 | self.update(0, force=True) 104 | 105 | def flash(self, message: str) -> None: 106 | """Issue a message unrelated to the current counter. 107 | 108 | This may be useful for multi-stage processes to indicate the 109 | move to a new stage, or to log unexpected situations. 110 | 111 | """ 112 | pass 113 | 114 | def close(self) -> None: 115 | """Close the progress handler. 116 | 117 | This might be useful for closing file handles or cleaning up 118 | resources. 119 | 120 | """ 121 | pass 122 | 123 | 124 | class ProgressBar(ProgressHandler): 125 | """A :class:`ProgressHandler` subclass for printing a progress bar. 126 | 127 | Example: 128 | >>> p = ProgressBar(message='Progress: ', total=10, unit=' units') 129 | >>> p.update(3) 130 | Progress: [######### ] (3/10 units) 131 | 132 | See :meth:`format` for a description of how the progress bar is 133 | formatted. 134 | 135 | """ 136 | 137 | #: The default formatting template. 138 | FMT = '\r{message}{bar}{counter}{status}' 139 | 140 | def update(self, n: int = 1, force: bool = False) -> None: 141 | """Increment the count by *n* and print the reformatted bar.""" 142 | self.kwargs['count'] += n  # type: ignore 143 | self._refresh_quota -= n 144 | if force or self._refresh_quota <= 0: 145 | self._refresh_quota = self.kwargs['refresh_interval']  # type: ignore 146 | s = self.format() 147 | if self.file: 148 | print('\r\033[K', end='', file=self.file) 149 | print(s, end='', file=self.file) 150 | 151 | def format(self) -> str: 152 | """Format and return the progress bar. 153 | 154 | The bar is formatted according to :attr:`FMT`, using 155 | variables from :attr:`kwargs` and two computed variables: 156 | 157 | - ``bar``: visualization of the progress bar, empty when 158 | ``total`` is 0 159 | 160 | - ``counter``: display of ``count``, ``total``, and ``unit`` 161 | 162 | >>> p = ProgressBar(message='Progress', count=2, total=10, unit='K') 163 | >>> p.format() 164 | '\\rProgress [###### ] (2/10K) ' 165 | >>> p = ProgressBar(count=2, status='Counting...') 166 | >>> p.format() 167 | '\\r (2) Counting...'
168 | 169 | """ 170 | _kw = self.kwargs 171 | width = 30 172 | total: int = _kw['total'] # type: ignore 173 | count: int = _kw['count'] # type: ignore 174 | 175 | if total > 0: 176 | num = min(count, total) * width 177 | fill = (num // total) * '#' 178 | part = ((num % total) * 3) // total 179 | if part: 180 | fill += '-='[part-1] 181 | bar = f' [{fill:<{width}}]' 182 | counter = f' ({count}/{total}{_kw["unit"]}) ' 183 | else: 184 | bar = '' 185 | counter = f' ({count}{_kw["unit"]}) ' 186 | 187 | return self.FMT.format(bar=bar, counter=counter, **_kw) 188 | 189 | def flash(self, message: str) -> None: 190 | """Overwrite the progress bar with *message*.""" 191 | print(f'\r\033[K{message}', end='', file=self.file) 192 | 193 | def close(self) -> None: 194 | """Print a newline so the last printed bar remains on screen.""" 195 | print(file=self.file) 196 | -------------------------------------------------------------------------------- /tests/similarity_test.py: -------------------------------------------------------------------------------- 1 | 2 | from math import log 3 | 4 | import pytest 5 | 6 | import wn 7 | from wn import similarity as sim 8 | from wn.taxonomy import taxonomy_depth 9 | from wn.ic import information_content as infocont 10 | 11 | 12 | def get_synsets(w): 13 | return { 14 | 'information': w.synset('test-en-0001-n'), 15 | 'example': w.synset('test-en-0002-n'), 16 | 'sample': w.synset('test-en-0004-n'), 17 | 'random sample': w.synset('test-en-0005-n'), 18 | 'random sample2': w.synset('test-en-0008-n'), 19 | 'datum': w.synset('test-en-0006-n'), 20 | 'exemplify': w.synset('test-en-0003-v'), 21 | } 22 | 23 | 24 | # some fake information content; computed using: 25 | # words = ['example', 'example', 'sample', 'random sample', 'illustrate'] 26 | # ic = compute(words, wn.Wordnet('test-en'), distribute_weight=False) 27 | 28 | ic = { 29 | 'n': {'test-en-0001-n': 5.0, # information 30 | 'test-en-0002-n': 5.0, # example, illustration 31 | 'test-en-0004-n': 3.0, # sample 32 | 'test-en-0005-n': 2.0, # random sample 33 | 'test-en-0008-n': 2.0, # random sample 2 34 | 'test-en-0006-n': 1.0, # datum 35 | None: 6.0}, 36 | 'v': {'test-en-0003-v': 2.0, # exemplify, illustrate 37 | 'test-en-0007-v': 1.0, # resignate 38 | None: 2.0}, 39 | 'a': {None: 1.0}, 40 | 'r': {None: 1.0} 41 | } 42 | 43 | 44 | @pytest.mark.usefixtures('mini_db') 45 | def test_path(): 46 | ss = get_synsets(wn.Wordnet('test-en')) 47 | assert sim.path(ss['information'], ss['information']) == 1/1 48 | assert sim.path(ss['information'], ss['example']) == 1/2 49 | assert sim.path(ss['information'], ss['sample']) == 1/3 50 | assert sim.path(ss['information'], ss['random sample']) == 1/4 51 | assert sim.path(ss['random sample'], ss['datum']) == 1/5 52 | assert sim.path(ss['random sample2'], ss['datum']) == 0 53 | assert sim.path(ss['random sample2'], ss['datum'], simulate_root=True) == 1/4 54 | assert sim.path( 55 | ss['random sample'], ss['random sample2'], simulate_root=True 56 | ) == 1/6 57 | with pytest.raises(wn.Error): 58 | sim.path(ss['example'], ss['exemplify']) 59 | with pytest.raises(wn.Error): 60 | sim.wup(ss['example'], ss['exemplify'], simulate_root=True) 61 | 62 | 63 | @pytest.mark.usefixtures('mini_db') 64 | def test_wup(): 65 | ss = get_synsets(wn.Wordnet('test-en')) 66 | assert sim.wup(ss['information'], ss['information']) == (2*1) / (0+0+2*1) 67 | assert sim.wup(ss['information'], ss['example']) == (2*1) / (0+1+2*1) 68 | assert sim.wup(ss['information'], ss['sample']) == (2*1) / (0+2+2*1) 69 | assert 
sim.wup(ss['information'], ss['random sample']) == (2*1) / (0+3+2*1) 70 | assert sim.wup(ss['random sample'], ss['datum']) == (2*1) / (3+1+2*1) 71 | with pytest.raises(wn.Error): 72 | assert sim.wup(ss['random sample2'], ss['datum']) 73 | assert (sim.wup(ss['random sample2'], ss['datum'], simulate_root=True) 74 | == (2*1) / (1+2+2*1)) 75 | assert (sim.wup(ss['random sample'], ss['random sample2'], simulate_root=True) 76 | == (2*1) / (4+1+2*1)) 77 | with pytest.raises(wn.Error): 78 | sim.wup(ss['example'], ss['exemplify']) 79 | with pytest.raises(wn.Error): 80 | sim.wup(ss['example'], ss['exemplify'], simulate_root=True) 81 | 82 | 83 | @pytest.mark.usefixtures('mini_db') 84 | def test_lch(): 85 | w = wn.Wordnet('test-en') 86 | ss = get_synsets(w) 87 | d_n = taxonomy_depth(w, 'n') 88 | assert sim.lch(ss['information'], ss['information'], d_n) == -log((0+1) / (2*d_n)) 89 | assert sim.lch(ss['information'], ss['example'], d_n) == -log((1+1) / (2*d_n)) 90 | assert sim.lch(ss['information'], ss['sample'], d_n) == -log((2+1) / (2*d_n)) 91 | assert sim.lch(ss['information'], ss['random sample'], d_n) == -log((3+1) / (2*d_n)) 92 | assert sim.lch(ss['random sample'], ss['datum'], d_n) == -log((4+1) / (2*d_n)) 93 | with pytest.raises(wn.Error): 94 | assert sim.lch(ss['random sample2'], ss['datum'], d_n) 95 | assert (sim.lch(ss['random sample2'], ss['datum'], d_n, simulate_root=True) 96 | == -log((3+1) / (2*d_n))) 97 | assert (sim.lch(ss['random sample'], ss['random sample2'], d_n, simulate_root=True) 98 | == -log((5+1) / (2*d_n))) 99 | with pytest.raises(wn.Error): 100 | sim.lch(ss['example'], ss['exemplify'], d_n) 101 | with pytest.raises(wn.Error): 102 | sim.lch(ss['example'], ss['exemplify'], d_n, simulate_root=True) 103 | 104 | 105 | @pytest.mark.usefixtures('mini_db') 106 | def test_res(): 107 | w = wn.Wordnet('test-en') 108 | ss = get_synsets(w) 109 | assert (sim.res(ss['information'], ss['information'], ic) 110 | == infocont(ss['information'], ic)) 111 | assert (sim.res(ss['information'], ss['example'], ic) 112 | == infocont(ss['information'], ic)) 113 | assert (sim.res(ss['information'], ss['sample'], ic) 114 | == infocont(ss['information'], ic)) 115 | assert (sim.res(ss['information'], ss['random sample'], ic) 116 | == infocont(ss['information'], ic)) 117 | assert (sim.res(ss['random sample'], ss['datum'], ic) 118 | == infocont(ss['information'], ic)) 119 | with pytest.raises(wn.Error): 120 | sim.res(ss['random sample2'], ss['datum'], ic) 121 | with pytest.raises(wn.Error): 122 | sim.res(ss['example'], ss['exemplify'], ic) 123 | 124 | 125 | @pytest.mark.usefixtures('mini_db') 126 | def test_jcn(): 127 | w = wn.Wordnet('test-en') 128 | ss = get_synsets(w) 129 | info_ic = infocont(ss['information'], ic) 130 | assert (sim.jcn(ss['information'], ss['information'], ic) 131 | == float('inf')) 132 | assert (sim.jcn(ss['information'], ss['example'], ic) 133 | == float('inf')) 134 | assert (sim.jcn(ss['information'], ss['sample'], ic) 135 | == 1 / ((info_ic + infocont(ss['sample'], ic)) - 2 * info_ic)) 136 | assert (sim.jcn(ss['information'], ss['random sample'], ic) 137 | == 1 / ((info_ic + infocont(ss['random sample'], ic)) - 2 * info_ic)) 138 | assert (sim.jcn(ss['random sample'], ss['datum'], ic) 139 | == 1 / ( 140 | (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic)) 141 | - 2 * info_ic)) 142 | with pytest.raises(wn.Error): 143 | sim.jcn(ss['random sample2'], ss['datum'], ic) 144 | with pytest.raises(wn.Error): 145 | sim.jcn(ss['example'], ss['exemplify'], ic) 146 | 147 | 148 | 
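# The assertions below exercise the Lin formula, 2*IC(c0) / (IC(c1) + IC(c2)),
# where c0 is the lowest common hypernym with the highest IC weight; a synset
# compared with itself, or with an ancestor sharing the same weight, scores 1.0.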
@pytest.mark.usefixtures('mini_db') 149 | def test_lin(): 150 | w = wn.Wordnet('test-en') 151 | ss = get_synsets(w) 152 | info_ic = infocont(ss['information'], ic) 153 | assert (sim.lin(ss['information'], ss['information'], ic) 154 | == 1.0) 155 | assert (sim.lin(ss['information'], ss['example'], ic) 156 | == 1.0) 157 | assert (sim.lin(ss['information'], ss['sample'], ic) 158 | == (2 * info_ic) / (info_ic + infocont(ss['sample'], ic))) 159 | assert (sim.lin(ss['information'], ss['random sample'], ic) 160 | == (2 * info_ic) / (info_ic + infocont(ss['random sample'], ic))) 161 | assert (sim.lin(ss['random sample'], ss['datum'], ic) 162 | == ((2 * info_ic) 163 | / (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic)))) 164 | with pytest.raises(wn.Error): 165 | sim.lin(ss['random sample2'], ss['datum'], ic) 166 | with pytest.raises(wn.Error): 167 | sim.lin(ss['example'], ss['exemplify'], ic) 168 | -------------------------------------------------------------------------------- /wn/ic.py: -------------------------------------------------------------------------------- 1 | 2 | """Information Content is a corpus-based metric of synset or sense 3 | specificity. 4 | 5 | """ 6 | 7 | from typing import Optional, TextIO 8 | from pathlib import Path 9 | from collections import Counter 10 | from collections.abc import Callable, Iterable, Iterator 11 | from math import log 12 | 13 | from wn._types import AnyPath 14 | from wn._core import Synset, Wordnet 15 | from wn.constants import NOUN, VERB, ADJ, ADV, ADJ_SAT 16 | from wn.util import synset_id_formatter 17 | 18 | 19 | # Just use a subset of all available parts of speech 20 | IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV)) 21 | Freq = dict[str, dict[Optional[str], float]] 22 | 23 | 24 | def information_content(synset: Synset, freq: Freq) -> float: 25 | """Calculate the Information Content value for a synset. 26 | 27 | The information content of a synset is the negative log of the 28 | synset probability (see :func:`synset_probability`). 29 | 30 | """ 31 | return -log(synset_probability(synset, freq)) 32 | 33 | 34 | def synset_probability(synset: Synset, freq: Freq) -> float: 35 | """Calculate the synset probability. 36 | 37 | The synset probability is defined as freq(ss)/N where freq(ss) is 38 | the IC weight for the synset and N is the total IC weight for all 39 | synsets with the same part of speech. 40 | 41 | Note: this function is not generally used directly, but indirectly 42 | through :func:`information_content`. 43 | 44 | """ 45 | pos_freq = freq[synset.pos] 46 | return pos_freq[synset.id] / pos_freq[None] 47 | 48 | 49 | def _initialize( 50 | wordnet: Wordnet, 51 | smoothing: float, 52 | ) -> Freq: 53 | """Populate an Information Content weight mapping to a smoothing value. 54 | 55 | All synsets in *wordnet* are inserted into the dictionary and 56 | mapped to *smoothing*. 57 | 58 | """ 59 | freq: Freq = { 60 | pos: {synset.id: smoothing for synset in wordnet.synsets(pos=pos)} 61 | for pos in IC_PARTS_OF_SPEECH 62 | } 63 | # pretend ADJ_SAT is just ADJ 64 | for synset in wordnet.synsets(pos=ADJ_SAT): 65 | freq[ADJ][synset.id] = smoothing 66 | # also initialize totals (when synset is None) for each part-of-speech 67 | for pos in IC_PARTS_OF_SPEECH: 68 | freq[pos][None] = smoothing 69 | return freq 70 | 71 | 72 | def compute( 73 | corpus: Iterable[str], 74 | wordnet: Wordnet, 75 | distribute_weight: bool = True, 76 | smoothing: float = 1.0 77 | ) -> Freq: 78 | """Compute Information Content weights from a corpus.
79 | 80 | Arguments: 81 | corpus: An iterable of string tokens. This is a flat list of 82 | words and the order does not matter. Tokens may be single 83 | words or multiple words separated by a space. 84 | 85 | wordnet: An instantiated :class:`wn.Wordnet` object, used to 86 | look up synsets from words. 87 | 88 | distribute_weight: If :python:`True`, the counts for a word 89 | are divided evenly among all synsets for the word. 90 | 91 | smoothing: The initial value given to each synset. 92 | 93 | Example: 94 | >>> import wn, wn.ic, wn.morphy 95 | >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=wn.morphy.morphy) 96 | >>> freq = wn.ic.compute(["Dogs", "run", ".", "Cats", "sleep", "."], ewn) 97 | >>> dog = ewn.synsets('dog', pos='n')[0] 98 | >>> cat = ewn.synsets('cat', pos='n')[0] 99 | >>> frog = ewn.synsets('frog', pos='n')[0] 100 | >>> freq['n'][dog.id] 101 | 1.125 102 | >>> freq['n'][cat.id] 103 | 1.1 104 | >>> freq['n'][frog.id] # no occurrence; smoothing value only 105 | 1.0 106 | >>> carnivore = dog.lowest_common_hypernyms(cat)[0] 107 | >>> freq['n'][carnivore.id] 108 | 1.3250000000000002 109 | """ 110 | freq = _initialize(wordnet, smoothing) 111 | counts = Counter(corpus) 112 | 113 | hypernym_cache: dict[Synset, list[Synset]] = {} 114 | for word, count in counts.items(): 115 | synsets = wordnet.synsets(word) 116 | num = len(synsets) 117 | if num == 0: 118 | continue 119 | 120 | weight = float(count / num if distribute_weight else count) 121 | 122 | for synset in synsets: 123 | pos = synset.pos 124 | if pos == ADJ_SAT: 125 | pos = ADJ 126 | if pos not in IC_PARTS_OF_SPEECH: 127 | continue 128 | 129 | freq[pos][None] += weight 130 | 131 | # The following while-loop is equivalent to: 132 | # 133 | # freq[pos][synset.id] += weight 134 | # for path in synset.hypernym_paths(): 135 | # for ss in path: 136 | # freq[pos][ss.id] += weight 137 | # 138 | # ...but it caches hypernym lookups for speed 139 | 140 | agenda: list[tuple[Synset, set[Synset]]] = [(synset, set())] 141 | while agenda: 142 | ss, seen = agenda.pop() 143 | 144 | # avoid cycles 145 | if ss in seen: 146 | continue 147 | 148 | freq[pos][ss.id] += weight 149 | 150 | if ss not in hypernym_cache: 151 | hypernym_cache[ss] = ss.hypernyms() 152 | agenda.extend((hyp, seen | {ss}) for hyp in hypernym_cache[ss]) 153 | 154 | return freq 155 | 156 | 157 | def load( 158 | source: AnyPath, 159 | wordnet: Wordnet, 160 | get_synset_id: Optional[Callable] = None, 161 | ) -> Freq: 162 | """Load an Information Content mapping from a file. 163 | 164 | Arguments: 165 | 166 | source: A path to an information content weights file. 167 | 168 | wordnet: A :class:`wn.Wordnet` instance with synset 169 | identifiers matching the offsets in the weights file. 170 | 171 | get_synset_id: A callable that takes a synset offset and part 172 | of speech and returns a synset ID valid in *wordnet*. 173 | 174 | Raises: 175 | 176 | :class:`wn.Error`: If *wordnet* does not have exactly one 177 | lexicon. 
178 | 179 | Example: 180 | 181 | >>> import wn, wn.ic 182 | >>> pwn = wn.Wordnet('pwn:3.0') 183 | >>> path = '~/nltk_data/corpora/wordnet_ic/ic-brown-resnik-add1.dat' 184 | >>> freq = wn.ic.load(path, pwn) 185 | 186 | """ 187 | source = Path(source).expanduser().resolve(strict=True) 188 | assert len(wordnet.lexicons()) == 1 189 | lexid = wordnet.lexicons()[0].id 190 | if get_synset_id is None: 191 | get_synset_id = synset_id_formatter(prefix=lexid) 192 | 193 | freq = _initialize(wordnet, 0.0) 194 | 195 | with source.open() as icfile: 196 | for offset, pos, weight, is_root in _parse_ic_file(icfile): 197 | ssid = get_synset_id(offset=offset, pos=pos) 198 | # synset = wordnet.synset(ssid) 199 | freq[pos][ssid] = weight 200 | if is_root: 201 | freq[pos][None] += weight 202 | return freq 203 | 204 | 205 | def _parse_ic_file(icfile: TextIO) -> Iterator[tuple[int, str, float, bool]]: 206 | """Parse the Information Content file. 207 | 208 | A sample of the format is:: 209 | 210 | wnver::eOS9lXC6GvMWznF1wkZofDdtbBU 211 | 1740n 1915712 ROOT 212 | 1930n 859272 213 | 2137n 1055337 214 | 215 | """ 216 | next(icfile)  # skip header 217 | for line in icfile: 218 | ssinfo, value, *isroot = line.split() 219 | yield (int(ssinfo[:-1]), 220 | ssinfo[-1], 221 | float(value), 222 | bool(isroot)) 223 | -------------------------------------------------------------------------------- /docs/api/wn.ic.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.ic 3 | ===== 4 | 5 | .. automodule:: wn.ic 6 | 7 | The mathematical formulae for information content are defined in 8 | `Formal Description`_, and the corresponding Python API functions are 9 | described in `Calculating Information Content`_. These functions 10 | require information content weights obtained either by `computing them 11 | from a corpus `_, or by `loading 12 | pre-computed weights from a file `_. 14 | 15 | .. note:: 16 | 17 | The term *information content* can be ambiguous. It often, and most 18 | accurately, refers to the result of the :func:`information_content` 19 | function (:math:`\text{IC}(c)` in the mathematical notation), but 20 | is also sometimes used to refer to the corpus frequencies/weights 21 | (:math:`\text{freq}(c)` in the mathematical notation) returned by 22 | :func:`load` or :func:`compute`, as these weights are the basis of 23 | the value computed by :func:`information_content`. The Wn 24 | documentation tries to consistently refer to the former as the 25 | *information content value*, or just *information content*, and the 26 | latter as *information content weights*, or *weights*. 27 | 28 | 29 | Formal Description 30 | ------------------ 31 | 32 | The Information Content (IC) of a concept (synset) is a measure of its 33 | specificity computed from the wordnet's taxonomy structure and corpus 34 | frequencies. It is defined by Resnik 1995 ([RES95]_), following 35 | information theory, as the negative log-probability of a concept: 36 | 37 | .. math:: 38 | 39 | \text{IC}(c) = -\log{p(c)} 40 | 41 | A concept's probability is the empirical probability over a corpus: 42 | 43 | .. math:: 44 | 45 | p(c) = \frac{\text{freq}(c)}{N} 46 | 47 | Here, :math:`N` is the total count of words of the same category as 48 | concept :math:`c` ([RES95]_ only considered nouns) where each word has 49 | some representation in the wordnet, and :math:`\text{freq}` is defined 50 | as the sum of corpus counts of words in :math:`\text{words}(c)`, which 51 | is the set of words subsumed by concept :math:`c`: 52 | 53 | ..
math:: 54 | 55 | \text{freq}(c) = \sum_{w \in \text{words}(c)}{\text{count}(w)} 56 | 57 | It is common for :math:`\text{freq}` to not contain actual frequencies 58 | but instead weights distributed evenly among the synsets for a 59 | word. These weights are calculated as the word frequency divided by 60 | the number of synsets for the word: 61 | 62 | .. math:: 63 | 64 | \text{freq}_{\text{distributed}}(c) 65 | = \sum_{w \in \text{words}(c)}{\frac{\text{count}(w)}{|\text{synsets}(w)|}} 66 | 67 | .. [RES95] Resnik, Philip. "Using information content to evaluate 68 | semantic similarity." In Proceedings of the 14th International 69 | Joint Conference on Artificial Intelligence (IJCAI-95), Montreal, 70 | Canada, pp. 448-453. 1995. 71 | 72 | 73 | Example 74 | ------- 75 | 76 | In the Princeton WordNet 3.0 (hereafter *WordNet*, but note that the 77 | equivalent lexicon in Wn is the *OMW English Wordnet based on WordNet 78 | 3.0* with specifier ``omw-en:1.4``), the frequency of a concept like 79 | **stone fruit** is not just the number of occurrences of *stone 80 | fruit*, but also includes the counts of the words for its hyponyms 81 | (*almond*, *olive*, etc.) and other taxonomic descendants (*Jordan 82 | almond*, *green olive*, etc.). The word *almond* has two synsets: one 83 | for the fruit or nut, another for the plant. Thus, if the word 84 | *almond* is encountered :math:`n` times in a corpus, then the weight 85 | (either the frequency :math:`n` or distributed weight 86 | :math:`\frac{n}{2}`) is added to the total weights for both synsets 87 | and to those of their ancestors, but not for descendant synsets, such 88 | as for **Jordan almond**. The fruit/nut synset of almond has two 89 | hypernym paths which converge on **fruit**: 90 | 91 | 1. **almond** ⊃ **stone fruit** ⊃ **fruit** 92 | 2. **almond** ⊃ **nut** ⊃ **seed** ⊃ **fruit** 93 | 94 | The weight is added to each ancestor (**stone fruit**, **nut**, 95 | **seed**, **fruit**, ...) once. That is, the weight is not added to 96 | the convergent ancestor for **fruit** twice, but only once. 97 | 98 | 99 | Calculating Information Content 100 | ------------------------------- 101 | 102 | .. autofunction:: information_content 103 | .. autofunction:: synset_probability 104 | 105 | 106 | Computing Corpus Weights 107 | ------------------------ 108 | 109 | If pre-computed weights are not available for a wordnet or for some 110 | domain, they can be computed given a corpus and a wordnet. 111 | 112 | The corpus is an iterable of words. For large corpora it may help to 113 | use a generator for this iterable, but the entire vocabulary (i.e., 114 | unique words and counts) will be held at once in memory. Multi-word 115 | expressions are also possible if they exist in the wordnet. For 116 | instance, WordNet has *stone fruit*, with a single space delimiting 117 | the words, as an entry. 118 | 119 | The :class:`wn.Wordnet` object must be instantiated with a single 120 | lexicon, although it may have expand-lexicons for relation 121 | traversal. For best results, the wordnet should use a lemmatizer to 122 | help it deal with inflected wordforms from running text. 123 | 124 | .. autofunction:: compute 125 | 126 | 127 | Reading Pre-computed Information Content Files 128 | ---------------------------------------------- 129 | 130 | The :func:`load` function reads pre-computed information content 131 | weights files as used by the `WordNet::Similarity 132 | `_ Perl module or the `NLTK 133 | `_ Python package. 
These files are computed for 134 | a specific version of a wordnet using the synset offsets from the 135 | `WNDB `_ format, 136 | which Wn does not use. These offsets therefore must be converted into 137 | an identifier that matches those used by the wordnet. By default, 138 | :func:`load` uses the lexicon identifier from its *wordnet* argument 139 | with synset offsets (padded with 0s to make 8 digits) and 140 | parts-of-speech from the weights file to format an identifier, such as 141 | ``omw-en-00001174-n``. For wordnets that use a different identifier 142 | scheme, the *get_synset_id* parameter of :func:`load` can be given a 143 | callable created with :func:`wn.util.synset_id_formatter`. It can also 144 | be given another callable with the same signature as shown below: 145 | 146 | .. code-block:: python 147 | 148 | get_synset_id(*, offset: int, pos: str) -> str 149 | 150 | 151 | When loading pre-computed information content files, it is recommended 152 | to use the ones with smoothing (i.e., ``*-add1.dat`` or 153 | ``*-resnik-add1.dat``) to avoid math domain errors when computing the 154 | information content value. 155 | 156 | .. warning:: 157 | 158 | The weights files are only valid for the version of wordnet for 159 | which they were created. Files created for WordNet 3.0 do not work 160 | for WordNet 3.1 because the offsets used in its identifiers are 161 | different, although the *get_synset_id* parameter of :func:`load` 162 | could be given a function that performs a suitable mapping. Some 163 | `Open Multilingual Wordnet `_ 164 | wordnets use the WordNet 3.0 offsets in their identifiers and can 165 | therefore technically use the weights, but this usage is 166 | discouraged because the distributional properties of text in 167 | another language and the structure of the other wordnet will not be 168 | compatible with that of the English WordNet. For these cases, it is 169 | recommended to compute new weights using :func:`compute`. 170 | 171 | .. autofunction:: load 172 | --------------------------------------------------------------------------------