├── wn
│   ├── py.typed
│   ├── metrics.py
│   ├── _exceptions.py
│   ├── _types.py
│   ├── _ili.py
│   ├── __init__.py
│   ├── _util.py
│   ├── _db.py
│   ├── _download.py
│   ├── morphy.py
│   ├── __main__.py
│   ├── _module_functions.py
│   ├── util.py
│   └── ic.py
├── tests
│   ├── data
│   │   ├── test-package
│   │   │   ├── LICENSE
│   │   │   ├── README.md
│   │   │   ├── citation.bib
│   │   │   └── test-wn.xml
│   │   ├── README.md
│   │   ├── E101-3.xml
│   │   ├── E101-2.xml
│   │   ├── W306-0.xml
│   │   ├── W305-0.xml
│   │   ├── E101-1.xml
│   │   ├── E101-0.xml
│   │   ├── W307-0.xml
│   │   ├── sense-member-order.xml
│   │   ├── sense-key-variations.xml
│   │   ├── mini-lmf-1.3.xml
│   │   ├── mini-lmf-1.4.xml
│   │   └── mini-lmf-1.1.xml
│   ├── util_test.py
│   ├── validate_test.py
│   ├── export_test.py
│   ├── _util_test.py
│   ├── project_test.py
│   ├── morphy_test.py
│   ├── db_test.py
│   ├── conftest.py
│   ├── compat_sensekey_test.py
│   ├── wordnet_test.py
│   ├── ic_test.py
│   ├── web_test.py
│   ├── taxonomy_test.py
│   ├── lmf_test.py
│   ├── secondary_query_test.py
│   ├── relations_test.py
│   └── similarity_test.py
├── docs
│   ├── docutils.conf
│   ├── requirements.txt
│   ├── api
│   │   ├── wn.validate.rst
│   │   ├── wn.lmf.rst
│   │   ├── wn.compat.sensekey.rst
│   │   ├── wn.compat.rst
│   │   ├── wn.util.rst
│   │   ├── wn.project.rst
│   │   ├── wn.taxonomy.rst
│   │   ├── wn.morphy.rst
│   │   ├── wn.similarity.rst
│   │   └── wn.ic.rst
│   ├── _static
│   │   ├── css
│   │   │   └── svg.css
│   │   ├── wn-logo.svg
│   │   └── wn-logo-rotate.svg
│   ├── Makefile
│   ├── .readthedocs.yaml
│   ├── make.bat
│   ├── index.rst
│   ├── cli.rst
│   ├── setup.rst
│   ├── conf.py
│   ├── guides
│   │   ├── nltk-migration.rst
│   │   └── wordnet.rst
│   └── faq.rst
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   ├── data-issue.md
│   │   └── bug_report.md
│   └── workflows
│       ├── checks.yml
│       ├── publish.yml
│       └── publish-docker.yaml
├── Dockerfile
├── .gitignore
├── LICENSE
├── bench
│   ├── README.md
│   ├── test_bench.py
│   └── conftest.py
├── CITATION.cff
├── pyproject.toml
└── CONTRIBUTING.md
/wn/py.typed:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tests/data/test-package/LICENSE:
--------------------------------------------------------------------------------
1 | Test License
2 |
--------------------------------------------------------------------------------
/tests/data/test-package/README.md:
--------------------------------------------------------------------------------
1 | # Test README
2 |
--------------------------------------------------------------------------------
/tests/data/test-package/citation.bib:
--------------------------------------------------------------------------------
1 | % test bib
2 |
--------------------------------------------------------------------------------
/docs/docutils.conf:
--------------------------------------------------------------------------------
1 | [restructuredtext parser]
2 | syntax_highlight = short
3 |
4 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx ~= 8.1
2 | furo == 2024.8.6
3 | sphinx-copybutton == 0.5.2
4 | .
5 |
6 |
--------------------------------------------------------------------------------
/docs/api/wn.validate.rst:
--------------------------------------------------------------------------------
1 |
2 | wn.validate
3 | ===========
4 |
5 | .. automodule:: wn.validate
6 |
7 | .. autofunction:: validate
8 |
--------------------------------------------------------------------------------
/tests/data/README.md:
--------------------------------------------------------------------------------
1 | # Testing Data Directory
2 |
3 | This directory is used to store data files used by the testing system.
4 |
5 |
--------------------------------------------------------------------------------
/docs/api/wn.lmf.rst:
--------------------------------------------------------------------------------
1 |
2 | wn.lmf
3 | ======
4 |
5 | .. automodule:: wn.lmf
6 |
7 | .. autofunction:: load
8 | .. autofunction:: scan_lexicons
9 | .. autofunction:: is_lmf
10 |
11 |
--------------------------------------------------------------------------------
/tests/data/test-package/test-wn.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/docs/api/wn.compat.sensekey.rst:
--------------------------------------------------------------------------------
1 | wn.compat.sensekey
2 | ==================
3 |
4 | .. automodule:: wn.compat.sensekey
5 |
6 | .. autofunction:: escape
7 | .. autofunction:: unescape
8 | .. autofunction:: sense_key_getter
9 | .. autofunction:: sense_getter
10 |
--------------------------------------------------------------------------------
/docs/_static/css/svg.css:
--------------------------------------------------------------------------------
1 | svg {
2 | width: 500px;
3 | height: 300px;
4 |
5 | position: relative;
6 | left: 20%;
7 | -webkit-transform: translateX(-20%);
8 | -ms-transform: translateX(-20%);
9 | transform: translateX(-20%);
10 |
11 | }
12 |
13 |
--------------------------------------------------------------------------------
/wn/metrics.py:
--------------------------------------------------------------------------------
1 |
2 | from wn._core import Word, Synset
3 |
4 |
5 | # Word-based Metrics
6 |
7 | def ambiguity(word: Word) -> int:
8 |     """Return the number of synsets that *word* is a member of."""
9 |     return len(word.synsets())
10 | 
11 | 
12 | def average_ambiguity(synset: Synset) -> float:
13 |     """Return the mean ambiguity of the words in *synset*."""
14 |     words = synset.words()
15 |     return sum(len(word.synsets()) for word in words) / len(words)
16 | 
--------------------------------------------------------------------------------
/tests/util_test.py:
--------------------------------------------------------------------------------
1 |
2 | from wn import util
3 |
4 |
5 | def test_synset_id_formatter():
6 | f = util.synset_id_formatter
7 | assert f()(prefix='xyz', offset=123, pos='n') == 'xyz-00000123-n'
8 | assert f(prefix='xyz')(offset=123, pos='n') == 'xyz-00000123-n'
9 | assert f(prefix='xyz', pos='n')(offset=123) == 'xyz-00000123-n'
10 | assert f('abc-{offset}-{pos}')(offset=1, pos='v') == 'abc-1-v'
11 |
--------------------------------------------------------------------------------
/docs/api/wn.compat.rst:
--------------------------------------------------------------------------------
1 | wn.compat
2 | =========
3 |
4 | Compatibility modules for Wn.
5 |
6 | This subpackage is a namespace for compatibility modules when working
7 | with particular lexicons. Wn is designed to be agnostic to the
8 | language or lexicon and not favor one over the other (with the
9 | exception of :mod:`wn.morphy`, which is English-specific). However,
10 | there are some kinds of functionality that would be useful to
11 | include in Wn, even if they don't generalize to all lexicons.
12 |
13 | Included modules
14 | ----------------
15 |
16 | .. toctree::
17 | :maxdepth: 1
18 |
19 | wn.compat.sensekey.rst
20 |
21 |
--------------------------------------------------------------------------------
/tests/validate_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from wn import lmf
4 | from wn.validate import validate
5 |
6 | tests = [
7 | ("E101", 0),
8 | ("E101", 1),
9 | ("E101", 2),
10 | ("E101", 3),
11 | ("W305", 0),
12 | ("W306", 0),
13 | ("W307", 0),
14 | ]
15 | test_ids = [f"{code}-{i}" for code, i in tests]
16 |
17 |
18 | @pytest.mark.parametrize("code,i", tests, ids=test_ids)
19 | def test_validate(datadir, code: str, i: int) -> None:
20 | path = datadir / f"{code}-{i}.xml"
21 | lex = lmf.load(path, progress_handler=None)["lexicons"][0]
22 | report = validate(lex, select=[code], progress_handler=None)
23 | print(report)
24 | assert len(report[code]["items"]) > 0
25 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the version of Python and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.12"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 | configuration: docs/conf.py
17 |
18 | # We recommend specifying your dependencies to enable reproducible builds:
19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
20 | python:
21 | install:
22 | - requirements: docs/requirements.txt
23 |
24 | formats:
25 | - pdf
26 | - epub
27 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 |
3 | WORKDIR /app
4 |
5 | # Install system dependencies
6 | RUN apt-get update && apt-get install -y \
7 | python3-pip \
8 | python3-dev \
9 | build-essential \
10 | && rm -rf /var/lib/apt/lists/*
11 |
12 | # Install web server
13 | RUN pip install uvicorn
14 |
15 | COPY . .
16 | RUN pip install --no-cache-dir ".[web]"
17 |
18 | # Download the wordnet data and initialize the database
19 | # TODO: this should be done in a separate volume
20 | RUN python -m wn download omw:1.4 cili
21 |
22 | # Clean up the downloads directory
23 | RUN rm -r ~/.wn_data/downloads
24 |
25 | # Expose the port
26 | EXPOSE 8080
27 |
28 | CMD ["uvicorn", "wn.web:app", "--host", "0.0.0.0", "--port", "8080"]
--------------------------------------------------------------------------------
/docs/api/wn.util.rst:
--------------------------------------------------------------------------------
1 | wn.util
2 | =======
3 |
4 | .. automodule:: wn.util
5 |
6 | .. autofunction:: synset_id_formatter
7 |
8 | .. autoclass:: ProgressHandler
9 | :members:
10 |
11 | .. attribute:: kwargs
12 |
13 | A dictionary storing the updateable parameters for the progress
14 | handler. The keys are:
15 |
16 | - ``message`` (:class:`str`) -- a generic message or name
17 | - ``count`` (:class:`int`) -- the current progress counter
18 | - ``total`` (:class:`int`) -- the expected final value of the counter
19 | - ``unit`` (:class:`str`) -- the unit of measurement
20 | - ``status`` (:class:`str`) -- the current status of the process
21 |
22 | .. autoclass:: ProgressBar
23 | :members:
24 |
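25 | Example
26 | -------
27 | 
28 | A minimal sketch of a custom handler built around the ``kwargs``
29 | attribute described above. It assumes, as with :class:`ProgressBar`,
30 | that ``update()`` is the hook called as the counter advances; check
31 | the generated class documentation above for the exact signatures.
32 | 
33 | .. code-block:: python
34 | 
35 |    from wn.util import ProgressHandler
36 | 
37 |    class PlainProgress(ProgressHandler):
38 |        """Report progress as plain lines instead of a bar."""
39 | 
40 |        def update(self, n=1, force=False):
41 |            self.kwargs['count'] += n
42 |            k = self.kwargs
43 |            print(f"{k['message']}: {k['count']}/{k['total']} {k['unit']}")
44 | 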
--------------------------------------------------------------------------------
/tests/export_test.py:
--------------------------------------------------------------------------------
1 |
2 | from xml.etree import ElementTree as ET
3 |
4 | import pytest
5 |
6 | import wn
7 |
8 |
9 | @pytest.mark.usefixtures('mini_db')
10 | def test_export(datadir, tmp_path):
11 | tmpdir = tmp_path / 'test_export'
12 | tmpdir.mkdir()
13 | tmppath = tmpdir / 'mini_lmf_export.xml'
14 | lexicons = wn.lexicons(lexicon='test-en test-es')
15 | wn.export(lexicons, tmppath)
16 |
17 | # remove comments, indentation, etc.
18 | orig = ET.canonicalize(from_file=datadir / 'mini-lmf-1.0.xml', strip_text=True)
19 | temp = ET.canonicalize(from_file=tmppath, strip_text=True)
20 | # additional transformation to help with debugging
21 | orig = orig.replace('<', '\n<')
22 | temp = temp.replace('<', '\n<')
23 | assert orig == temp
24 |
--------------------------------------------------------------------------------
/wn/_exceptions.py:
--------------------------------------------------------------------------------
1 |
2 | class Error(Exception):
3 | """Generic error class for invalid wordnet operations."""
4 |
5 | # reset the module so the user sees the public name
6 | __module__ = 'wn'
7 |
8 |
9 | class DatabaseError(Error):
10 | """Error class for issues with the database."""
11 |
12 | __module__ = 'wn'
13 |
14 |
15 | class ConfigurationError(Error):
16 | """Raised on invalid configurations."""
17 | __module__ = 'wn'
18 |
19 |
20 | class ProjectError(Error):
21 | """Raised when a project is not found or on errors defined in the index."""
22 | __module__ = 'wn'
23 |
24 |
25 | class WnWarning(Warning):
26 | """Generic warning class for dubious wordnet operations."""
27 |
28 | # reset the module so the user sees the public name
29 | __module__ = 'wn'
30 |
--------------------------------------------------------------------------------
/tests/data/E101-3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/tests/data/E101-2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/tests/data/W306-0.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/tests/_util_test.py:
--------------------------------------------------------------------------------
1 |
2 | from wn._util import flatten, unique_list
3 |
4 |
5 | def test_flatten():
6 | assert flatten([]) == []
7 | assert flatten([[]]) == []
8 | assert flatten([[], []]) == []
9 | assert flatten([[[], []], [[], []]]) == [[], [], [], []]
10 | assert flatten([[1]]) == [1]
11 | assert flatten([[1, 2], [3, 4]]) == [1, 2, 3, 4]
12 | assert flatten(["AB", "CD"]) == ["A", "B", "C", "D"]
13 |
14 |
15 | def test_unique_list():
16 | assert unique_list([]) == []
17 | assert unique_list([1]) == [1]
18 | assert unique_list([1, 1, 1, 1, 1]) == [1]
19 | assert unique_list([1, 1, 2, 2, 1]) == [1, 2]
20 | assert unique_list([2, 1, 2, 2, 1]) == [2, 1]
21 | assert unique_list("A") == ["A"]
22 | assert unique_list("AAA") == ["A"]
23 | assert unique_list("ABABA") == ["A", "B"]
24 | assert unique_list([(1, 2), (1, 2), (2, 3)]) == [(1, 2), (2, 3)]
25 |
--------------------------------------------------------------------------------
/tests/data/W305-0.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/tests/data/E101-1.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/_static/wn-logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
24 |
--------------------------------------------------------------------------------
/tests/data/E101-0.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/wn/_types.py:
--------------------------------------------------------------------------------
1 |
2 | from collections.abc import Callable, Mapping, Sequence
3 | from typing import Any, Optional, Union
4 | from pathlib import Path
5 |
6 | # For functions taking a filesystem path as a str or a pathlib.Path
7 | AnyPath = Union[str, Path]
8 |
9 | # LMF versions for comparison
10 | VersionInfo = tuple[int, ...]
11 |
12 | # Synset and Sense relations map a relation type to one or more ids
13 | RelationMap = Mapping[str, Sequence[str]]
14 |
15 | # User-facing metadata representation
16 | Metadata = dict[str, Any]
17 |
18 | # A callable that returns a normalized word form for a given word form
19 | NormalizeFunction = Callable[[str], str]
20 |
21 | # Lemmatization returns a mapping of parts of speech (or None) to
22 | # sets of wordforms that are potential lemmas for some query word
23 | LemmatizeResult = dict[Optional[str], set[str]]
24 |
25 | # A callable that returns a LemmatizeResult for a given word form
26 | # and optional part of speech
27 | LemmatizeFunction = Callable[[str, Optional[str]], LemmatizeResult]
28 |
--------------------------------------------------------------------------------
/tests/data/W307-0.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | foo
22 |
23 |
24 |
25 | foo
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # Unit test / coverage reports
31 | htmlcov/
32 | .tox/
33 | .nox/
34 | .coverage
35 | .coverage.*
36 | .cache
37 | nosetests.xml
38 | coverage.xml
39 | *.cover
40 | *.py,cover
41 | .hypothesis/
42 | .pytest_cache/
43 |
44 | # Ruff (has its own .gitignore, but in case that ever changes...)
45 | .ruff_cache
46 |
47 | # Sphinx documentation
48 | docs/_build/
49 |
50 | # Jupyter Notebook
51 | .ipynb_checkpoints
52 |
53 | # Environments
54 | .env
55 | .venv
56 | env/
57 | venv/
58 | ENV/
59 | env.bak/
60 | venv.bak/
61 |
62 | # mypy
63 | .mypy_cache/
64 | .dmypy.json
65 | dmypy.json
66 |
67 | # PyCharm
68 | .idea/
69 |
70 | # VS Code
71 | .vscode/
72 |
73 | # benchmarking results
74 | .benchmarks/
--------------------------------------------------------------------------------
/docs/api/wn.project.rst:
--------------------------------------------------------------------------------
1 | wn.project
2 | ==========
3 |
4 | .. automodule:: wn.project
5 |
6 | .. autofunction:: get_project
7 | .. autofunction:: iterpackages
8 | .. autofunction:: is_package_directory
9 | .. autofunction:: is_collection_directory
10 |
11 | Project Classes
12 | ---------------
13 |
14 | Projects can be simple resource files, :class:`Package` directories,
15 | or :class:`Collection` directories. For API consistency, resource
16 | files are modeled as a virtual package (:class:`ResourceOnlyPackage`).
17 |
18 | .. class:: Project
19 |
20 | The base class for packages and collections.
21 |
22 | This class is not used directly, but all subclasses will implement
23 | the methods listed here.
24 |
25 | .. autoproperty:: path
26 | .. automethod:: readme
27 | .. automethod:: license
28 | .. automethod:: citation
29 |
30 | .. autoclass:: Package
31 | :show-inheritance:
32 |
33 | .. autoproperty:: type
34 | .. automethod:: resource_file
35 |
36 | .. autoclass:: ResourceOnlyPackage
37 | :show-inheritance:
38 |
39 | .. autoclass:: Collection
40 | :show-inheritance:
41 |
42 | .. automethod:: packages
43 |
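44 | Example
45 | -------
46 | 
47 | A minimal usage sketch (the paths are hypothetical; the calls mirror
48 | those documented above):
49 | 
50 | .. code-block:: python
51 | 
52 |    from wn import project
53 | 
54 |    # inspect a single package directory
55 |    pkg = project.get_project(path='path/to/test-package')
56 |    print(pkg.type)             # e.g., 'wordnet'
57 |    print(pkg.resource_file())  # path to the packaged WN-LMF file
58 |    print(pkg.readme())         # path to the README, or None if absent
59 | 
60 |    # iterate over the packages contained in a collection directory
61 |    for pkg in project.iterpackages('path/to/collection'):
62 |        print(pkg.resource_file().name)
63 | 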
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Michael Wayne Goodman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/checks.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | jobs:
10 | lint:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v4
14 | - name: Set up Python
15 | uses: actions/setup-python@v4
16 | with:
17 | python-version: "3.9"
18 | - name: Install Hatch
19 | run: pipx install hatch
20 | - name: Lint
21 | run: hatch fmt --linter --check
22 | - name: Type Check
23 | run: hatch run mypy:check
24 | - name: Check Buildable
25 | run: hatch build
26 |
27 | tests:
28 | runs-on: ${{ matrix.os }}
29 | strategy:
30 | matrix:
31 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
32 | os: [ubuntu-latest, windows-latest]
33 | steps:
34 | - uses: actions/checkout@v4
35 | - name: Set up Python ${{ matrix.python-version }}
36 | uses: actions/setup-python@v4
37 | with:
38 | python-version: ${{ matrix.python-version }}
39 | - name: Install Hatch
40 | run: pipx install hatch
41 | - name: Test
42 | run: hatch test
43 |
--------------------------------------------------------------------------------
/wn/_ili.py:
--------------------------------------------------------------------------------
1 |
2 | from collections.abc import Iterator
3 | from pathlib import Path
4 |
5 | from wn._types import AnyPath
6 |
7 |
8 | def is_ili(source: AnyPath) -> bool:
9 | """Return True if *source* is an ILI tab-separated-value file.
10 |
11 | This only checks that the first column, split by tabs, of the
12 | first line is 'ili' or 'ILI'. It does not check if each line has
13 | the correct number of columns.
14 |
15 | """
16 | source = Path(source).expanduser()
17 | if source.is_file():
18 | try:
19 | with source.open('rb') as fh:
20 | return next(fh).split(b'\t')[0] in (b'ili', b'ILI')
21 | except (StopIteration, IndexError):
22 | pass
23 | return False
24 |
25 |
26 | def load(source: AnyPath) -> Iterator[dict[str, str]]:
27 | """Load an interlingual index file.
28 |
29 | Args:
30 | source: path to an ILI file
31 | """
32 | source = Path(source).expanduser()
33 | with source.open(encoding='utf-8') as fh:
34 | header = next(fh).rstrip('\r\n')
35 | fields = tuple(map(str.lower, header.split('\t')))
36 | for line in fh:
37 | yield dict(zip(fields, line.rstrip('\r\n').split('\t')))
38 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/data-issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Data issue
3 | about: Report an issue with Wn's data index
4 | title: ''
5 | labels: data
6 | assignees: ''
7 |
8 | ---
9 |
10 | **If your issue is regarding the contents of the data** (e.g., a lexicon is missing a word, synset, relation, etc.), then please find the upstream project and file the issue there. You can find links to the projects on Wn's [README](https://github.com/goodmami/wn/). Projects without links are probably managed by the [Open Multilingual Wordnet](https://github.com/omwn/omw-data).
11 |
12 | **Use this issue template for the following kinds of issues:**
13 | 1. Request a wordnet lexicon (including new versions of existing lexicons) to be indexed by Wn
14 |
15 | Please provide:
16 | - the project name
17 | - the name and contact info of the current maintainer
18 | - the language of the lexicon (BCP-47 code preferred)
19 | - a URL to the project (e.g., on GitHub or other homepage)
20 | - a URL to the [WN-LMF](https://github.com/globalwordnet/schemas/) resource
21 |
22 | 2. Report an issue with an indexed lexicon (e.g., the source URL has changed)
23 |
24 | Please indicate the lexicon id and version and the correct project information, if available.
25 |
--------------------------------------------------------------------------------
/bench/README.md:
--------------------------------------------------------------------------------
1 | # Wn Benchmarking
2 |
3 | This directory contains code and data for running benchmarks for
4 | Wn. The benchmarks are implemented using
5 | [pytest-benchmark](https://github.com/ionelmc/pytest-benchmark/), so
6 | they are run using pytest as follows (from the top-level project
7 | directory):
8 |
9 | ```console
10 | $ hatch test bench/ # run the benchmarks
11 | $ hatch test bench/ --benchmark-autosave # run benchmarks and store results
12 | $ hatch test bench/ --benchmark-compare # run benchmarks and compare to stored result
13 | $ hatch test -- --help # get help on options (look for those prefixed `--benchmark-`)
14 | ```
15 |
16 | Notes:
17 |
18 | * The tests are not exhaustive; when making a change that may affect
19 |   performance, consider adding a new test if one doesn't exist already
20 |   (see the sketch below). Check the test in to Git, but not the
21 |   benchmark results, since those are dependent on the machine.
22 | * Benchmark the code before and after the changes. Store the results
23 | locally for comparison.
24 | * Ensure the testing environment has a steady load (wait for
25 | long-running processes to finish, close any active web browser tabs,
26 | etc.) prior to and while running the test.
27 | * Expect high variance for IO-bound tasks.
28 |
29 |
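30 | For example, a new benchmark might look like the following sketch
31 | (the query being measured is illustrative; the fixture and markers
32 | mirror those in `test_bench.py`):
33 | 
34 | ```python
35 | import pytest
36 | 
37 | import wn
38 | 
39 | 
40 | @pytest.mark.benchmark(group="primary queries")
41 | @pytest.mark.usefixtures("mock_db")
42 | def test_senses(benchmark):
43 |     # the `benchmark` fixture from pytest-benchmark times the callable
44 |     benchmark(wn.senses)
45 | ```
46 | 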
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | :warning: If this is a question about Wn or how to use it, please create a [discussion](https://github.com/goodmami/wn/discussions) instead of an issue.
14 |
15 | **To Reproduce**
16 | Please enter a minimal working example of the command or Python code that illustrates the problem. To avoid formatting issues, enter the code in a Markdown code block:
17 |
18 | ```console
19 | $ python -m wn ...
20 | output...
21 | ```
22 |
23 | or
24 |
25 | ```pycon
26 | >>> import wn
27 | >>> ...
28 | output
29 | ```
30 |
31 | **Expected behavior**
32 | A clear and concise description of what you expected to happen.
33 |
34 | **Environment**
35 | Please enter the versions of Python and Wn you are using as well as the installed lexicons. You can find these by executing the following commands (adjust your platform-specific Python command as necessary, e.g., `python3` or `py -3`):
36 |
37 | ```console
38 | python --version
39 | python -m wn --version
40 | python -m wn lexicons
41 | ```
42 |
43 | **Additional context**
44 | Add any other context about the problem here.
45 |
--------------------------------------------------------------------------------
/tests/data/sense-member-order.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | title: Wn
3 | message: >-
4 | Please cite this software using the metadata from
5 | 'preferred-citation'.
6 | type: software
7 | authors:
8 | - given-names: Michael Wayne
9 | family-names: Goodman
10 | email: goodman.m.w@gmail.com
11 | orcid: 'https://orcid.org/0000-0002-2896-5141'
12 | - given-names: Francis
13 | family-names: Bond
14 | email: bond@ieee.org
15 | orcid: 'https://orcid.org/0000-0003-4973-8068'
16 | repository-code: 'https://github.com/goodmami/wn/'
17 | preferred-citation:
18 | type: conference-paper
19 | authors:
20 | - given-names: Michael Wayne
21 | family-names: Goodman
22 | email: goodmami@uw.edu
23 | orcid: 'https://orcid.org/0000-0002-2896-5141'
24 | affiliation: Nanyang Technological University
25 | - given-names: Francis
26 | family-names: Bond
27 | email: bond@ieee.org
28 | orcid: 'https://orcid.org/0000-0003-4973-8068'
29 | affiliation: Nanyang Technological University
30 | start: 100 # First page number
31 | end: 107 # Last page number
32 | conference:
33 | name: "Proceedings of the 11th Global Wordnet Conference"
34 | title: "Intrinsically Interlingual: The Wn Python Library for Wordnets"
35 | year: 2021
36 | month: 1
37 | url: 'https://aclanthology.org/2021.gwc-1.12/'
38 | publisher: "Global Wordnet Association"
39 |
--------------------------------------------------------------------------------
/wn/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | Wordnet Interface.
4 | """
5 |
6 | __all__ = (
7 | '__version__',
8 | 'Wordnet',
9 | 'download',
10 | 'add',
11 | 'add_lexical_resource',
12 | 'remove',
13 | 'export',
14 | 'projects',
15 | 'lexicons',
16 | 'Lexicon',
17 | 'word',
18 | 'words',
19 | 'Word',
20 | 'Form',
21 | 'Pronunciation',
22 | 'Tag',
23 | 'sense',
24 | 'senses',
25 | 'Sense',
26 | 'Example',
27 | 'Count',
28 | 'synset',
29 | 'synsets',
30 | 'Synset',
31 | 'Definition',
32 | 'Relation',
33 | 'ili',
34 | 'ilis',
35 | 'ILI',
36 | 'Error',
37 | 'DatabaseError',
38 | 'ConfigurationError',
39 | 'ProjectError',
40 | 'WnWarning',
41 | )
42 |
43 | from wn._exceptions import (
44 | Error,
45 | DatabaseError,
46 | ConfigurationError,
47 | ProjectError,
48 | WnWarning,
49 | )
50 | from wn._config import config # noqa: F401
51 | from wn._add import add, add_lexical_resource, remove
52 | from wn._export import export
53 | from wn._download import download
54 | from wn._core import (
55 | Lexicon,
56 | Word, Form, Pronunciation, Tag,
57 | Sense, Example, Count,
58 | Synset, Definition,
59 | Relation,
60 | ILI,
61 | Wordnet
62 | )
63 | from wn._module_functions import (
64 | projects,
65 | lexicons,
66 | word, words,
67 | sense, senses,
68 | synset, synsets,
69 | ili, ilis,
70 | )
71 |
72 | __version__ = '0.13.0'
73 |
--------------------------------------------------------------------------------
/tests/data/sense-key-variations.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 |
2 | Wn Documentation
3 | ================
4 |
5 | Overview
6 | --------
7 |
8 | This package provides an interface to wordnet data, from simple lookup
9 | queries, to graph traversals, to more sophisticated algorithms and
10 | metrics. Features include:
11 |
12 | - Support for wordnets in the
13 |   `WN-LMF <https://github.com/globalwordnet/schemas/>`_ format
14 | - A `SQLite <https://www.sqlite.org/>`_ database backend for data
15 | consistency and efficient queries
16 | - Accurate modeling of Words, Senses, and Synsets
17 |
18 | Quick Start
19 | -----------
20 |
21 | .. code-block:: console
22 |
23 | $ pip install wn
24 |
25 | .. code-block:: python
26 |
27 | >>> import wn
28 | >>> wn.download('ewn:2020')
29 | >>> wn.synsets('coffee')
30 | [Synset('ewn-04979718-n'), Synset('ewn-07945591-n'), Synset('ewn-07945759-n'), Synset('ewn-12683533-n')]
31 |
32 |
33 | Contents
34 | --------
35 |
36 | .. toctree::
37 | :maxdepth: 2
38 |
39 | setup.rst
40 | cli.rst
41 | faq.rst
42 |
43 | .. toctree::
44 | :caption: Guides
45 | :maxdepth: 2
46 |
47 | guides/lexicons.rst
48 | guides/basic.rst
49 | guides/interlingual.rst
50 | guides/wordnet.rst
51 | guides/lemmatization.rst
52 | guides/nltk-migration.rst
53 |
54 | .. toctree::
55 | :caption: API Reference
56 | :maxdepth: 1
57 | :hidden:
58 |
59 | api/wn.rst
60 | api/wn.compat.rst
61 | api/wn.compat.sensekey.rst
62 | api/wn.constants.rst
63 | api/wn.ic.rst
64 | api/wn.lmf.rst
65 | api/wn.morphy.rst
66 | api/wn.project.rst
67 | api/wn.similarity.rst
68 | api/wn.taxonomy.rst
69 | api/wn.util.rst
70 | api/wn.validate.rst
71 | api/wn.web.rst
72 |
--------------------------------------------------------------------------------
/docs/_static/wn-logo-rotate.svg:
--------------------------------------------------------------------------------
1 |
2 |
41 |
--------------------------------------------------------------------------------
/tests/data/mini-lmf-1.3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
8 |
9 |
10 |
11 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | one
38 | two
39 | three
40 |
41 |
42 |
43 |
44 |
45 | one
46 | two
47 | three
48 |
49 |
50 |
51 |
52 |
53 | one
54 | two
55 | three
56 |
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/bench/test_bench.py:
--------------------------------------------------------------------------------
1 | import wn
2 | from wn import lmf
3 |
4 | import pytest
5 |
6 |
7 | @pytest.mark.benchmark(group="lmf.load", warmup=True)
8 | def test_load(datadir, benchmark):
9 | benchmark(lmf.load, datadir / 'mini-lmf-1.0.xml')
10 |
11 |
12 | @pytest.mark.benchmark(group="wn.add_lexical_resource")
13 | @pytest.mark.usefixtures('empty_db')
14 | def test_add_lexical_resource(mock_lmf, benchmark):
15 | # TODO: when pytest-benchmark's teardown option is released, use
16 | # that here with more rounds
17 | benchmark.pedantic(
18 | wn.add_lexical_resource,
19 | args=(mock_lmf,),
20 | # teardown=clean_db,
21 | iterations=1,
22 | rounds=1,
23 | )
24 |
25 |
26 | @pytest.mark.benchmark(group="wn.add_lexical_resource")
27 | @pytest.mark.usefixtures('empty_db')
28 | def test_add_lexical_resource_no_progress(mock_lmf, benchmark):
29 | # TODO: when pytest-benchmark's teardown option is released, use
30 | # that here with more rounds
31 | benchmark.pedantic(
32 | wn.add_lexical_resource,
33 | args=(mock_lmf,),
34 | kwargs={"progress_handler": None},
35 | # teardown=clean_db,
36 | iterations=1,
37 | rounds=1,
38 | )
39 |
40 |
41 | @pytest.mark.benchmark(group="primary queries")
42 | @pytest.mark.usefixtures('mock_db')
43 | def test_synsets(benchmark):
44 | benchmark(wn.synsets)
45 |
46 |
47 | @pytest.mark.benchmark(group="primary queries")
48 | @pytest.mark.usefixtures('mock_db')
49 | def test_words(benchmark):
50 | benchmark(wn.words)
51 |
52 |
53 | @pytest.mark.benchmark(group="secondary queries")
54 | @pytest.mark.usefixtures('mock_db')
55 | def test_word_senses_no_wordnet(benchmark):
56 | word = wn.words()[0]
57 | benchmark(word.senses)
58 |
59 |
60 | @pytest.mark.benchmark(group="secondary queries")
61 | @pytest.mark.usefixtures('mock_db')
62 | def test_word_senses_with_wordnet(benchmark):
63 | w = wn.Wordnet("mock:1")
64 | word = w.words()[0]
65 | benchmark(word.senses)
66 |
67 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Build and Publish to PyPI or TestPyPI
2 |
3 | on: push
4 |
5 | jobs:
6 | build:
7 | name: Build distribution
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/checkout@v4
11 | - name: Set up Python
12 | uses: actions/setup-python@v4
13 | with:
14 | python-version: "3.x"
15 | - name: Install Hatch
16 | run: pipx install hatch
17 | - name: Build
18 | run: hatch build
19 | - name: Store the distribution packages
20 | uses: actions/upload-artifact@v4
21 | with:
22 | name: python-package-distributions
23 | path: dist/
24 |
25 | publish-to-pypi:
26 | name: Publish distributions to PyPI
27 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
28 | needs:
29 | - build
30 | runs-on: ubuntu-latest
31 | environment:
32 | name: pypi
33 | url: https://pypi.org/p/wn
34 | permissions:
35 | id-token: write # IMPORTANT: mandatory for trusted publishing
36 | steps:
37 | - name: Download the dists
38 | uses: actions/download-artifact@v4.1.8
39 | with:
40 | name: python-package-distributions
41 | path: dist/
42 | - name: Publish to PyPI
43 | uses: pypa/gh-action-pypi-publish@release/v1
44 |
45 | publish-to-testpypi:
46 | name: Publish distributions to TestPyPI
47 | needs:
48 | - build
49 | runs-on: ubuntu-latest
50 | environment:
51 | name: testpypi
52 | url: https://test.pypi.org/p/wn
53 | permissions:
54 | id-token: write # IMPORTANT: mandatory for trusted publishing
55 | steps:
56 | - name: Download the dists
57 | uses: actions/download-artifact@v4.1.8
58 | with:
59 | name: python-package-distributions
60 | path: dist/
61 | - name: Publish to TestPyPI
62 | uses: pypa/gh-action-pypi-publish@release/v1
63 | with:
64 | repository-url: https://test.pypi.org/legacy/
65 | skip-existing: true
66 |
--------------------------------------------------------------------------------
/tests/project_test.py:
--------------------------------------------------------------------------------
1 | from wn import project
2 |
3 | def test_is_package_directory(datadir):
4 | assert project.is_package_directory(datadir / "test-package")
5 | assert not project.is_package_directory(datadir)
6 |
7 |
8 | def test_is_collection_directory(datadir):
9 | # not really, but it is a directory containing a package
10 | assert project.is_collection_directory(datadir)
11 | assert not project.is_collection_directory(datadir / "test-package")
12 |
13 |
14 | def test_get_project(datadir):
15 | proj = project.get_project(path=datadir / "test-package")
16 | assert proj.type == "wordnet"
17 | assert proj.resource_file() == datadir / "test-package" / "test-wn.xml"
18 | assert proj.readme() == datadir / "test-package" / "README.md"
19 | assert proj.license() == datadir / "test-package" / "LICENSE"
20 | assert proj.citation() == datadir / "test-package" / "citation.bib"
21 |
22 | proj = project.get_project(path=datadir / "mini-lmf-1.0.xml")
23 | assert proj.type == "wordnet"
24 | assert proj.resource_file() == datadir / "mini-lmf-1.0.xml"
25 | assert proj.readme() is None
26 | assert proj.license() is None
27 | assert proj.citation() is None
28 |
29 |
30 | def test_iterpackages(datadir):
31 | # for now, collection.packages() does not return contained resource files
32 | pkg_names = {
33 | pkg.resource_file().name
34 | for pkg in project.iterpackages(datadir)
35 | }
36 | assert "mini-lmf-1.0.xml" not in pkg_names
37 | assert "test-wn.xml" in pkg_names
38 |
39 | # explicitly giving a resource file path works, though
40 | pkg_names = {
41 | pkg.resource_file().name
42 | for pkg in project.iterpackages(datadir / "mini-lmf-1.0.xml")
43 | }
44 | assert "mini-lmf-1.0.xml" in pkg_names
45 | assert "test-wn.xml" not in pkg_names
46 |
47 |
48 | def test_compressed_iterpackages(mini_lmf_compressed):
49 | for pkg in project.iterpackages(mini_lmf_compressed):
50 | assert pkg.type == "wordnet"
51 | assert pkg.resource_file().exists()
52 | # ensure cleanup of temporary data
53 | assert not pkg.resource_file().exists()
54 | # ensure original file not deleted
55 | assert mini_lmf_compressed.exists()
56 |
--------------------------------------------------------------------------------
/tests/morphy_test.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 |
4 | import wn
5 | from wn import morphy
6 |
7 |
8 | def test_morphy_uninitialized():
9 | # An uninitialized Morphy isn't very bright, but it starts up
10 | # fast. It relies on the database to filter bad items.
11 | m = morphy.Morphy()
12 | assert m('example', 'n') == {'n': {'example'}}
13 | assert m('examples', 'n') == {'n': {'examples', 'example'}}
14 | assert m('examples', 'v') == {'v': {'examples', 'example', 'exampl'}}
15 | assert m('exemplifying', 'n') == {'n': {'exemplifying'}}
16 | assert m('exemplifying', 'v') == {'v': {'exemplifying', 'exemplify', 'exemplifye'}}
17 | assert m('data', 'n') == {'n': {'data'}}
18 | assert m('datums', 'n') == {'n': {'datums', 'datum'}} # expected false positive
19 | assert m('examples', None) == {None: {'examples'},
20 | 'n': {'example'},
21 | 'v': {'example', 'exampl'}}
22 | assert m('exemplifying', None) == {None: {'exemplifying'},
23 | 'v': {'exemplify', 'exemplifye'}}
24 | assert m('data', None) == {None: {'data'}}
25 |
26 |
27 | @pytest.mark.usefixtures('mini_db')
28 | def test_morphy_initialized():
29 | w = wn.Wordnet('test-en:1')
30 | m = morphy.Morphy(wordnet=w)
31 | assert m('example', 'n') == {'n': {'example'}}
32 | assert m('examples', 'n') == {'n': {'example'}}
33 | assert m('examples', 'v') == {}
34 | assert m('exemplifying', 'n') == {}
35 | assert m('exemplifying', 'v') == {'v': {'exemplify'}}
36 | assert m('data', 'n') == {'n': {'datum'}}
37 | assert m('datums', 'n') == {'n': {'datum'}} # expected false positive
38 | assert m('examples', None) == {'n': {'example'}}
39 | assert m('exemplifying', None) == {'v': {'exemplify'}}
40 | assert m('data', None) == {'n': {'datum'}}
41 |
42 |
43 | @pytest.mark.usefixtures('mini_db')
44 | def test_issue_154():
45 | # https://github.com/goodmami/wn/issues/154
46 | w = wn.Wordnet('test-en:1')
47 | assert w.words('exemplifies') == [w.word('test-en-exemplify-v')]
48 | assert w.words('samples') == []
49 | w = wn.Wordnet('test-en:1', lemmatizer=morphy.Morphy())
50 | assert w.words('exemplifies') == [w.word('test-en-exemplify-v')]
51 | assert w.words('samples') == [w.word('test-en-sample-n')]
52 |
--------------------------------------------------------------------------------
/wn/_util.py:
--------------------------------------------------------------------------------
1 | """Non-public Wn utilities."""
2 |
3 | from collections.abc import Iterable, Hashable
4 | from typing import TypeVar
5 | from pathlib import Path
6 | import hashlib
7 | from unicodedata import normalize, combining
8 |
9 |
10 | from wn._types import VersionInfo
11 |
12 |
13 | def version_info(version_string: str) -> VersionInfo:
14 | return tuple(map(int, version_string.split('.')))
15 |
16 |
17 | def is_url(string: str) -> bool:
18 | """Return True if *string* appears to be a URL."""
19 | # TODO: ETags?
20 | return any(string.startswith(scheme)
21 | for scheme in ('http://', 'https://'))
22 |
23 |
24 | def is_gzip(path: Path) -> bool:
25 | """Return True if the file at *path* appears to be gzipped."""
26 | return _inspect_file_signature(path, b'\x1F\x8B')
27 |
28 |
29 | def is_lzma(path: Path) -> bool:
30 | """Return True if the file at *path* appears to be lzma-compressed."""
31 | return _inspect_file_signature(path, b'\xFD7zXZ\x00')
32 |
33 |
34 | def is_xml(path: Path) -> bool:
35 | """Return True if the file at *path* appears to be an XML file."""
36 |     return _inspect_file_signature(path, b'<?xml')
37 | 
38 | 
39 | def _inspect_file_signature(path: Path, signature: bytes) -> bool:
40 | if path.is_file():
41 | with path.open('rb') as f:
42 | return f.read(len(signature)) == signature
43 | return False
44 |
45 |
46 | def short_hash(string: str) -> str:
47 | """Return a short hash of *string*."""
48 | b2 = hashlib.blake2b(digest_size=20)
49 | b2.update(string.encode('utf-8'))
50 | return b2.hexdigest()
51 |
52 |
53 | T = TypeVar('T')
54 |
55 |
56 | def flatten(iterable: Iterable[Iterable[T]]) -> list[T]:
57 | return [x for xs in iterable for x in xs]
58 |
59 |
60 | H = TypeVar('H', bound=Hashable)
61 |
62 |
63 | def unique_list(items: Iterable[H]) -> list[H]:
64 | # use a dictionary as an order-preserving set
65 | targets = {item: True for item in items}
66 | return list(targets)
67 |
68 |
69 | def normalize_form(s: str) -> str:
70 | return ''.join(c for c in normalize('NFKD', s.lower()) if not combining(c))
71 |
72 |
73 | def format_lexicon_specifier(id: str, version: str) -> str:
74 | return f"{id}:{version}"
75 |
76 |
77 | def split_lexicon_specifier(lexicon: str) -> tuple[str, str]:
78 | id, _, ver = lexicon.partition(":")
79 | return id, ver
80 |
--------------------------------------------------------------------------------
/docs/api/wn.taxonomy.rst:
--------------------------------------------------------------------------------
1 |
2 | wn.taxonomy
3 | ===========
4 |
5 | .. automodule:: wn.taxonomy
6 |
7 |
8 | Overview
9 | --------
10 |
11 | Among the valid synset relations for wordnets (see
12 | :data:`wn.constants.SYNSET_RELATIONS`), those used for describing
13 | *is-a* `taxonomies `_ are
14 | given special treatment and they are generally the most
15 | well-developed relations in any wordnet. Typically these are the
16 | ``hypernym`` and ``hyponym`` relations, which encode *is-a-type-of*
17 | relationships (e.g., a *hermit crab* is a type of *decapod*, which is
18 | a type of *crustacean*, etc.). They also include ``instance_hypernym``
19 | and ``instance_hyponym``, which encode *is-an-instance-of*
20 | relationships (e.g., *Oregon* is an instance of *American state*).
21 |
22 | The taxonomy forms a multiply-inheriting hierarchy with the synsets as
23 | nodes. In the English wordnets, such as the Princeton WordNet and its
24 | derivatives, nearly all nominal synsets form such a hierarchy with a
25 | single root node, while verbal synsets form many smaller hierarchies
26 | without a common root. Other wordnets may have different properties,
27 | but as many are based on the Princeton WordNet, they tend to
28 | follow this structure.
29 |
30 | Functions to find paths within the taxonomies form the basis of all
31 | :mod:`wordnet similarity measures `. For instance, the
32 | :ref:`leacock-chodorow-similarity` measure uses both
33 | :func:`shortest_path` and (indirectly) :func:`taxonomy_depth`.
34 |
35 |
36 | Wordnet-level Functions
37 | -----------------------
38 |
39 | Root and leaf synsets in the taxonomy are those with no ancestors
40 | (``hypernym``, ``instance_hypernym``, etc.) or hyponyms (``hyponym``,
41 | ``instance_hyponym``, etc.), respectively.
42 |
43 | Finding root and leaf synsets
44 | '''''''''''''''''''''''''''''
45 |
46 | .. autofunction:: roots
47 | .. autofunction:: leaves
48 |
49 | Computing the taxonomy depth
50 | ''''''''''''''''''''''''''''
51 |
52 | The taxonomy depth is the maximum depth from a root node to a leaf
53 | node within synsets for a particular part of speech.
54 |
55 | .. autofunction:: taxonomy_depth
56 |
57 |
58 | Synset-level Functions
59 | ----------------------
60 |
61 | .. autofunction:: hypernym_paths
62 | .. autofunction:: min_depth
63 | .. autofunction:: max_depth
64 | .. autofunction:: shortest_path
65 | .. autofunction:: common_hypernyms
66 | .. autofunction:: lowest_common_hypernyms
67 |
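68 | Example
69 | -------
70 | 
71 | A minimal sketch of tracing hypernym paths (the lexicon specifier and
72 | lemma are illustrative):
73 | 
74 | .. code-block:: python
75 | 
76 |    import wn
77 |    from wn import taxonomy
78 | 
79 |    w = wn.Wordnet('test-en:1')
80 |    ss = w.synsets('example')[0]
81 | 
82 |    # each path is a list of synsets leading to a root
83 |    for path in taxonomy.hypernym_paths(ss):
84 |        print([s.id for s in path])
85 | 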
--------------------------------------------------------------------------------
/tests/db_test.py:
--------------------------------------------------------------------------------
1 |
2 | import sqlite3
3 | import threading
4 | import tempfile
5 |
6 | import pytest
7 |
8 | import wn
9 | from wn import lmf
10 |
11 |
12 | @pytest.mark.usefixtures('mini_db')
13 | def test_schema_compatibility():
14 | conn = sqlite3.connect(str(wn.config.database_path))
15 | schema_hash = wn._db.schema_hash(conn)
16 | assert schema_hash in wn._db.COMPATIBLE_SCHEMA_HASHES
17 |
18 |
19 | @pytest.mark.usefixtures('mini_db')
20 | def test_db_multithreading():
21 | """
22 | See https://github.com/goodmami/wn/issues/86
23 | Thanks: @fushinari
24 | """
25 |
26 | class WNThread:
27 | w = None
28 |
29 | def __init__(self):
30 | w_thread = threading.Thread(target=self.set_w)
31 | w_thread.start()
32 | w_thread.join()
33 | self.w.synsets()
34 |
35 | def set_w(self):
36 | if self.w is None:
37 | self.w = wn.Wordnet()
38 |
39 | # close the connections by resetting the pool
40 | wn._db.pool = {}
41 | with pytest.raises(sqlite3.ProgrammingError):
42 | WNThread()
43 | wn._db.pool = {}
44 | wn.config.allow_multithreading = True
45 | WNThread() # no error
46 | wn.config.allow_multithreading = False
47 | wn._db.pool = {}
48 |
49 |
50 | def test_remove_extension(datadir):
51 | with tempfile.TemporaryDirectory('wn_data_1_1_trigger') as dir:
52 | old_data_dir = wn.config.data_directory
53 | wn.config.data_directory = dir
54 | wn.add(datadir / 'mini-lmf-1.0.xml')
55 | wn.add(datadir / 'mini-lmf-1.1.xml')
56 | assert len(wn.lexicons()) == 4
57 | wn.remove('test-en-ext')
58 | assert len(wn.lexicons()) == 3
59 | wn.remove('test-ja')
60 | assert len(wn.lexicons()) == 2
61 | wn.add(datadir / 'mini-lmf-1.1.xml')
62 | assert len(wn.lexicons()) == 4
63 | wn.remove('test-en')
64 | assert {lex.id for lex in wn.lexicons()} == {'test-es', 'test-ja'}
65 | wn.config.data_directory = old_data_dir
66 | # close any open DB connections before teardown
67 | for conn in wn._db.pool.values():
68 | conn.close()
69 |
70 |
71 | def test_add_lexical_resource(datadir):
72 | with tempfile.TemporaryDirectory('wn_data_add_lexical_resource') as dir:
73 | old_data_dir = wn.config.data_directory
74 | wn.config.data_directory = dir
75 | wn.add_lexical_resource(lmf.load(datadir / 'mini-lmf-1.0.xml'))
76 | assert len(wn.lexicons()) == 2
77 | wn.add_lexical_resource(lmf.load(datadir / 'mini-lmf-1.1.xml'))
78 | assert len(wn.lexicons()) == 4
79 | wn.config.data_directory = old_data_dir
80 | # close any open DB connections before teardown
81 | for conn in wn._db.pool.values():
82 | conn.close()
83 |
84 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | dynamic = ['version']
7 |
8 | name = "wn"
9 | description = "Wordnet interface library"
10 | readme = "README.md"
11 | requires-python = ">=3.9"
12 | license = {file = "LICENSE"}
13 | keywords = ["wordnet", "interlingual", "linguistics", "language", "library"]
14 | authors = [
15 | {name = "Michael Wayne Goodman", email = "goodman.m.w@gmail.com"}
16 | ]
17 | classifiers = [
18 | "Development Status :: 4 - Beta",
19 | "Environment :: Console",
20 | "Intended Audience :: Developers",
21 | "Intended Audience :: Information Technology",
22 | "Intended Audience :: Science/Research",
23 | "License :: OSI Approved :: MIT License",
24 | "Programming Language :: Python :: 3",
25 | "Programming Language :: Python :: 3.9",
26 | "Programming Language :: Python :: 3.10",
27 | "Programming Language :: Python :: 3.11",
28 | "Programming Language :: Python :: 3.12",
29 | "Programming Language :: Python :: 3.13",
30 | "Topic :: Scientific/Engineering :: Information Analysis",
31 | "Topic :: Software Development :: Libraries :: Python Modules",
32 | "Topic :: Text Processing :: Linguistic",
33 | ]
34 |
35 | dependencies = [
36 | "httpx",
37 | "tomli",
38 | ]
39 |
40 | [project.optional-dependencies]
41 | web = [
42 | "starlette",
43 | ]
44 | editor = [
45 | "wn-editor"
46 | ]
47 |
48 | [project.urls]
49 | homepage = "https://github.com/goodmami/wn"
50 | documentation = "https://wn.readthedocs.io"
51 | changelog = "https://github.com/goodmami/wn/blob/main/CHANGELOG.md"
52 |
53 | [tool.hatch.version]
54 | path = "wn/__init__.py"
55 |
56 | [tool.hatch.build.targets.sdist]
57 | exclude = [
58 | "/.github",
59 | ]
60 |
61 | [tool.hatch.envs.hatch-test]
62 | extra-dependencies = [
63 | "pytest-benchmark",
64 | ]
65 | features = ["web"]
66 |
67 | [tool.hatch.envs.mypy]
68 | dependencies = [
69 | "mypy",
70 | ]
71 |
72 | [tool.hatch.envs.mypy.scripts]
73 | check = "mypy wn/"
74 |
75 | [tool.hatch.envs.docs]
76 | dependencies = [
77 | "wn[web]",
78 | "furo",
79 | "sphinx",
80 | "sphinx-copybutton",
81 | "sphinx-autobuild",
82 | ]
83 |
84 | [tool.hatch.envs.docs.scripts]
85 | build = "sphinx-build -M html docs docs/_build"
86 | clean = "sphinx-build -M clean docs docs/_build"
87 | watch = "sphinx-autobuild docs docs/_build/html"
88 |
89 | [tool.ruff]
90 | target-version = "py39"
91 | line-length = 88
92 |
93 | [tool.ruff.lint]
94 | select = [
95 | "B", # flake8-bugbear
96 | "C90", # McCabe cyclomatic complexity
97 | "E", # pycodestyle
98 | "F", # Pyflakes
99 | "W", # pycodestyle
100 | ]
101 |
102 | [tool.ruff.lint.per-file-ignores]
103 | "docs/conf.py" = ["E402"]
104 |
105 | [tool.ruff.format]
106 | quote-style = "single"
107 |
--------------------------------------------------------------------------------
/tests/data/mini-lmf-1.4.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
10 |
11 |
12 |
13 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 |
2 | import lzma
3 | import tempfile
4 | from pathlib import Path
5 |
6 | import pytest
7 |
8 | import wn
9 |
10 |
11 | @pytest.fixture(scope='session')
12 | def datadir():
13 | return Path(__file__).parent / 'data'
14 |
15 |
16 | @pytest.fixture
17 | def uninitialized_datadir(monkeypatch, tmp_path: Path):
18 | with monkeypatch.context() as m:
19 | m.setattr(wn.config, 'data_directory', tmp_path / 'uninitialized_datadir')
20 | yield
21 |
22 |
23 | @pytest.fixture(scope='session')
24 | def empty_db():
25 | with tempfile.TemporaryDirectory('wn_data_empty') as dir:
26 | with pytest.MonkeyPatch.context() as m:
27 | m.setattr(wn.config, 'data_directory', dir)
28 | yield
29 |
30 |
31 | # We want to build these DBs once per session, but connections
32 | # are created once for every test.
33 |
34 | @pytest.fixture(scope='session')
35 | def mini_db_dir(datadir):
36 | with tempfile.TemporaryDirectory('wn_data_mini') as dir:
37 | with pytest.MonkeyPatch.context() as m:
38 | m.setattr(wn.config, 'data_directory', dir)
39 | wn.add(datadir / 'mini-lmf-1.0.xml')
40 | wn._db.clear_connections()
41 |
42 | yield Path(dir)
43 |
44 |
45 | @pytest.fixture
46 | def mini_lmf_compressed(datadir):
47 | data = (datadir / 'mini-lmf-1.0.xml').read_bytes()
48 | with tempfile.NamedTemporaryFile(suffix='.xml.xz', delete=False) as file:
49 | path = Path(file.name)
50 | # Windows cannot reliably reopen file until it's closed
51 | with lzma.open(path, "w") as f:
52 | f.write(data)
53 | try:
54 | yield Path(file.name)
55 | finally:
56 | Path(file.name).unlink()
57 |
58 |
59 | @pytest.fixture(scope='session')
60 | def mini_db_1_1_dir(datadir):
61 | with tempfile.TemporaryDirectory('wn_data_mini_1_1') as dir:
62 | with pytest.MonkeyPatch.context() as m:
63 | m.setattr(wn.config, 'data_directory', dir)
64 | wn.add(datadir / 'mini-lmf-1.0.xml')
65 | wn.add(datadir / 'mini-lmf-1.1.xml')
66 | wn._db.clear_connections()
67 |
68 | yield Path(dir)
69 |
70 |
71 | @pytest.fixture(scope='session')
72 | def mini_db_1_4_dir(datadir):
73 | with tempfile.TemporaryDirectory('wn_data_mini_1_4') as dir:
74 | with pytest.MonkeyPatch.context() as m:
75 | m.setattr(wn.config, 'data_directory', dir)
76 | wn.add(datadir / 'mini-lmf-1.4.xml')
77 | wn._db.clear_connections()
78 |
79 | yield Path(dir)
80 |
81 |
82 | @pytest.fixture
83 | def mini_db(monkeypatch, mini_db_dir):
84 | with monkeypatch.context() as m:
85 | m.setattr(wn.config, 'data_directory', mini_db_dir)
86 | yield
87 | wn._db.clear_connections()
88 |
89 |
90 | @pytest.fixture
91 | def mini_db_1_1(monkeypatch, mini_db_1_1_dir):
92 | with monkeypatch.context() as m:
93 | m.setattr(wn.config, 'data_directory', mini_db_1_1_dir)
94 | yield
95 | wn._db.clear_connections()
96 |
97 |
98 | @pytest.fixture
99 | def mini_db_1_4(monkeypatch, mini_db_1_4_dir):
100 | with monkeypatch.context() as m:
101 | m.setattr(wn.config, 'data_directory', mini_db_1_4_dir)
102 | yield
103 | wn._db.clear_connections()
104 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Wn
2 |
3 | Thanks for helping to make Wn better!
4 |
5 | **Quick Links:**
6 |
7 | - [Report a bug or request a feature](https://github.com/goodmami/wn/issues/new)
8 | - [Ask a question](https://github.com/goodmami/wn/discussions)
9 | - [View documentation](https://wn.readthedocs.io/)
10 |
11 | **Developer Information:**
12 |
13 | - Versioning scheme: [Semantic Versioning](https://semver.org/)
14 | - Branching scheme: [GitHub Flow](https://guides.github.com/introduction/flow/)
15 | - Changelog: [keep a changelog](https://keepachangelog.com/en/1.0.0/)
16 | - Documentation framework: [Sphinx](https://www.sphinx-doc.org/)
17 | - Docstring style: [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (via [sphinx.ext.napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html))
18 | - Unit/regression testing: [pytest](https://pytest.org/)
19 | - Benchmarking: [pytest-benchmark](https://pytest-benchmark.readthedocs.io/)
20 | - Packaging framework: [Hatch](https://hatch.pypa.io/)
21 | - Coding style: [PEP-8](https://www.python.org/dev/peps/pep-0008/) (via [Ruff](https://beta.ruff.rs/docs/))
22 | - Type checking: [Mypy](http://mypy-lang.org/)
23 |
24 |
25 | ## Get Help
26 |
27 | Confused about wordnets in general? See the [Global Wordnet
28 | Association Documentation](https://globalwordnet.github.io/gwadoc/)
29 |
30 | Confused about using Wn or wish to share some tips? [Start a
31 | discussion](https://github.com/goodmami/wn/discussions)
32 |
33 | Encountering a problem with Wn or wish to propose a new feature? [Raise an
34 | issue](https://github.com/goodmami/wn/issues/new)
35 |
36 |
37 | ## Report a Bug
38 |
39 | When reporting a bug, please provide enough information for someone to
40 | reproduce the problem. This might include the version of Python you're
41 | running, the version of Wn you have installed, the wordnet lexicons
42 | you have installed, and possibly the platform (Linux, Windows, macOS)
43 | you're on. Please give a minimal working example that illustrates the
44 | problem. For example:
45 |
46 | > I'm using Wn 0.9.5 with Python 3.11 on Linux and [description of
47 | > problem...]. Here's what I have tried:
48 | >
49 | > ```pycon
50 | > >>> import wn
51 | > >>> # some code
52 | > ... # some result or error
53 | > ```
54 |
55 |
56 | ## Request a Feature
57 |
58 | If there's a feature that you think would make a good addition to Wn,
59 | raise an issue describing what the feature is and what problems it
60 | would address.
61 |
62 | ## Guidelines for Contributing
63 |
64 | See the "developer information" above for a brief description of
65 | guidelines and conventions used in Wn. If you have a fix, please
66 | submit a pull request to the `main` branch. In general, every pull
67 | request should have an associated issue.
68 |
69 | Developers should run and test Wn locally from source using
70 | [Hatch](https://hatch.pypa.io/). Hatch may be installed
71 | system-wide or within a virtual environment:
72 |
73 | ```bash
74 | $ pip install hatch
75 | ```
76 |
77 | You can then use the `hatch` commands like the following:
78 |
79 | ```console
80 | $ hatch shell # activate a Wn virtual environment
81 | $ hatch fmt --check # lint the code and check code style
82 | $ hatch run mypy:check # type check with mypy
83 | $ hatch test # run unit tests
84 | $ hatch test bench # run benchmarks
85 | $ hatch build # build a source distribution and wheel
86 | $ hatch publish # publish build artifacts to PyPI
87 | ```
88 |
--------------------------------------------------------------------------------
/docs/cli.rst:
--------------------------------------------------------------------------------
1 | Command Line Interface
2 | ======================
3 |
4 | Some of Wn's functionality is exposed via the command line.
5 |
6 | Global Options
7 | --------------
8 |
9 | .. option:: -d DIR, --dir DIR
10 |
11 | Change to use ``DIR`` as the data directory prior to invoking any
12 | commands.
13 |
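   For example, to run a subcommand against a different data
   directory (the path here is only illustrative):

   .. code-block:: console

      $ python -m wn -d ~/other_wn_data lexicons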
14 |
15 | Subcommands
16 | -----------
17 |
18 | download
19 | --------
20 |
21 | Download and add projects to the database given one or more project
22 | specifiers or URLs.
23 |
24 | .. code-block:: console
25 |
26 | $ python -m wn download oewn:2021 omw:1.4 cili
27 | $ python -m wn download https://en-word.net/static/english-wordnet-2021.xml.gz
28 |
29 | .. option:: --index FILE
30 |
31 | Use the index at ``FILE`` to resolve project specifiers.
32 |
33 | .. code-block:: console
34 |
35 | $ python -m wn download --index my-index.toml mywn
36 |
37 | .. option:: --no-add
38 |
39 | Download and cache the remote file, but don't add it to the
40 | database.
41 |
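   For example, to fetch and cache a project without loading it into
   the database:

   .. code-block:: console

      $ python -m wn download --no-add oewn:2021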
42 |
43 | lexicons
44 | --------
45 |
46 | The ``lexicons`` subcommand lets you quickly see what is installed:
47 |
48 | .. code-block:: console
49 |
50 | $ python -m wn lexicons
51 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0
52 | omw-sk 1.4 [sk] Slovak WordNet
53 | omw-pl 1.4 [pl] plWordNet
54 | omw-is 1.4 [is] IceWordNet
55 | omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian)
56 | omw-sl 1.4 [sl] sloWNet
57 | omw-ja 1.4 [ja] Japanese Wordnet
58 | ...
59 |
60 | .. option:: -l LG, --lang LG
61 | .. option:: --lexicon SPEC
62 |
63 | The ``--lang`` or ``--lexicon`` option can help you narrow down
64 | the results:
65 |
66 | .. code-block:: console
67 |
68 | $ python -m wn lexicons --lang en
69 | oewn 2021 [en] Open English WordNet
70 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0
71 | $ python -m wn lexicons --lexicon "omw-*"
72 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0
73 | omw-sk 1.4 [sk] Slovak WordNet
74 | omw-pl 1.4 [pl] plWordNet
75 | omw-is 1.4 [is] IceWordNet
76 | omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian)
77 |
78 |
79 | projects
80 | --------
81 |
82 | The ``projects`` subcommand lists all known projects in Wn's
83 | index. This is helpful to see what is available for downloading.
84 |
85 | .. code-block:: console
86 |
87 | $ python -m wn projects
88 | ic cili 1.0 [---] Collaborative Interlingual Index
89 | ic oewn 2024 [en] Open English WordNet
90 | ic oewn 2023 [en] Open English WordNet
91 | ic oewn 2022 [en] Open English WordNet
92 | ic oewn 2021 [en] Open English WordNet
93 | ic ewn 2020 [en] Open English WordNet
94 | ic ewn 2019 [en] Open English WordNet
95 | i- odenet 1.4 [de] Open German WordNet
96 | ic odenet 1.3 [de] Open German WordNet
97 | ic omw 1.4 [mul] Open Multilingual Wordnet
98 | ic omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0
99 | ...
100 |
101 |
102 | validate
103 | --------
104 |
105 | Given a path to a WN-LMF XML file, check the file for structural
106 | problems and print a report.
107 |
108 | .. code-block:: console
109 |
110 | $ python -m wn validate english-wordnet-2021.xml
111 |
112 | .. option:: --select CHECKS
113 |
114 | Run the checks with the given comma-separated list of check codes
115 | or categories.
116 |
117 | .. code-block:: console
118 |
119 | $ python -m wn validate --select E W201 W204 deWordNet.xml
120 |
121 | .. option:: --output-file FILE
122 |
123 | Write the report to FILE as a JSON object instead of printing the
124 | report to stdout.
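
   For example (``report.json`` is an illustrative output path):

   .. code-block:: console

      $ python -m wn validate --output-file report.json english-wordnet-2021.xml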
125 |
--------------------------------------------------------------------------------
/.github/workflows/publish-docker.yaml:
--------------------------------------------------------------------------------
1 | # Adapted from https://docs.github.com/en/actions/tutorials/publishing-packages/publishing-docker-images
2 | name: Publish a Docker image
3 |
4 | # Configures this workflow to run every time a new release is created in the repository.
5 | on:
6 | release:
7 | types: [ created ]
8 |
9 | # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
10 | env:
11 | REGISTRY: ghcr.io
12 | IMAGE_NAME: ${{ github.repository }}
13 |
14 | # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
15 | jobs:
16 | build-and-push-image:
17 | runs-on: ubuntu-latest
18 | # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
19 | permissions:
20 | contents: read
21 | packages: write
22 | attestations: write
23 | id-token: write
24 |
25 | steps:
26 | - name: Checkout repository
27 | uses: actions/checkout@v4
28 | # Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
29 | - name: Log in to the Container registry
30 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
31 | with:
32 | registry: ${{ env.REGISTRY }}
33 | username: ${{ github.actor }}
34 | password: ${{ secrets.GITHUB_TOKEN }}
35 | # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
36 | - name: Extract metadata (tags, labels) for Docker
37 | id: meta
38 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
39 | with:
40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
41 | # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
42 | # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see [Usage](https://github.com/docker/build-push-action#usage) in the README of the `docker/build-push-action` repository.
43 | # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
44 | - name: Build and push Docker image
45 | id: push
46 | uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
47 | with:
48 | context: .
49 | push: true
50 | tags: ${{ steps.meta.outputs.tags }}
51 | labels: ${{ steps.meta.outputs.labels }}
52 |
53 | # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [Using artifact attestations to establish provenance for builds](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds).
54 | - name: Generate artifact attestation
55 | uses: actions/attest-build-provenance@v2
56 | with:
57 | subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
58 | subject-digest: ${{ steps.push.outputs.digest }}
59 | push-to-registry: true
60 |
61 |
--------------------------------------------------------------------------------
/docs/api/wn.morphy.rst:
--------------------------------------------------------------------------------
1 |
2 | wn.morphy
3 | =========
4 |
5 | .. automodule:: wn.morphy
6 |
7 | .. seealso::
8 |
9 | The Princeton WordNet `documentation
10 |    <https://wordnet.princeton.edu/documentation/morphy7wn>`_ describes
11 | the original implementation of Morphy.
12 |
13 | The :doc:`../guides/lemmatization` guide describes how Wn handles
14 | lemmatization in general.
15 |
16 |
17 | Initialized and Uninitialized Morphy
18 | ------------------------------------
19 |
20 | There are two ways of using Morphy in Wn: initialized and
21 | uninitialized.
22 |
23 | Uninitialized Morphy is a simple callable that returns lemma
24 | *candidates* for some given wordform. That is, the results might not
25 | be valid lemmas, but this is not a problem in practice because
26 | subsequent queries against the database will filter out the invalid
27 | ones. This callable is obtained by creating a :class:`Morphy` object
28 | with no arguments:
29 |
30 | >>> from wn import morphy
31 | >>> m = morphy.Morphy()
32 |
33 | As an uninitialized Morphy cannot predict which lemmas in the result
34 | are valid, it always returns the original form and any transformations
35 | it can find for each part of speech:
36 |
37 | >>> m('lemmata', pos='n') # exceptional form
38 | {'n': {'lemmata'}}
39 | >>> m('lemmas', pos='n') # regular morphology with part-of-speech
40 | {'n': {'lemma', 'lemmas'}}
41 | >>> m('lemmas') # regular morphology for any part-of-speech
42 | {None: {'lemmas'}, 'n': {'lemma'}, 'v': {'lemma'}}
43 | >>> m('wolves') # invalid forms may be returned
44 | {None: {'wolves'}, 'n': {'wolf', 'wolve'}, 'v': {'wolve', 'wolv'}}
45 |
46 |
47 | This lemmatizer can also be used with a :class:`wn.Wordnet` object to
48 | expand queries:
49 |
50 | >>> import wn
51 | >>> ewn = wn.Wordnet('ewn:2020')
52 | >>> ewn.words('lemmas')
53 | []
54 | >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=morphy.Morphy())
55 | >>> ewn.words('lemmas')
56 | [Word('ewn-lemma-n')]
57 |
58 | An initialized Morphy is created with a :class:`wn.Wordnet` object as
59 | its argument. It then uses the wordnet to build lists of valid lemmas
60 | and exceptional forms (this takes a few seconds). Once this is done,
61 | it will only return lemmas it knows about:
62 |
63 | >>> ewn = wn.Wordnet('ewn:2020')
64 | >>> m = morphy.Morphy(ewn)
65 | >>> m('lemmata', pos='n') # exceptional form
66 | {'n': {'lemma'}}
67 | >>> m('lemmas', pos='n') # regular morphology with part-of-speech
68 | {'n': {'lemma'}}
69 | >>> m('lemmas') # regular morphology for any part-of-speech
70 | {'n': {'lemma'}}
71 | >>> m('wolves') # invalid forms are pre-filtered
72 | {'n': {'wolf'}}
73 |
74 | In order to use an initialized Morphy lemmatizer with a
75 | :class:`wn.Wordnet` object, it must be assigned to the object after
76 | creation:
77 |
78 | >>> ewn = wn.Wordnet('ewn:2020') # default: lemmatizer=None
79 | >>> ewn.words('lemmas')
80 | []
81 | >>> ewn.lemmatizer = morphy.Morphy(ewn)
82 | >>> ewn.words('lemmas')
83 | [Word('ewn-lemma-n')]
84 |
85 | There is little to no difference in the results obtained from a
86 | :class:`wn.Wordnet` object using an initialized or uninitialized
87 | :class:`Morphy` object, but there may be slightly different
88 | performance profiles for future queries.
89 |
90 |
91 | Default Morphy Lemmatizer
92 | -------------------------
93 |
94 | As a convenience, an uninitialized Morphy lemmatizer is provided in
95 | this module via the :data:`morphy` member.
96 |
97 | .. data:: morphy
98 |
99 | A :class:`Morphy` object created without a :class:`wn.Wordnet`
100 | object.
101 |
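Because it is an ordinary uninitialized :class:`Morphy` object, it can
be passed directly when creating a :class:`wn.Wordnet` object, just as
in the earlier example:

>>> import wn
>>> from wn import morphy
>>> ewn = wn.Wordnet('ewn:2020', lemmatizer=morphy.morphy)
>>> ewn.words('lemmas')
[Word('ewn-lemma-n')]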
102 |
103 | The Morphy Class
104 | ----------------
105 |
106 | .. autoclass:: Morphy
107 |
--------------------------------------------------------------------------------
/tests/compat_sensekey_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import wn
4 | from wn.compat import sensekey
5 |
6 |
7 | def test_unescape_oewn_sense_key():
8 |
9 | def unescape(s: str) -> str:
10 | return sensekey.unescape(s, flavor="oewn")
11 |
12 | assert unescape("") == ""
13 | assert unescape("abc") == "abc"
14 | assert unescape(".") == "." # only becomes : in second part of key
15 | # escape patterns
16 | assert unescape("-ap-") == "'"
17 | assert unescape("-ex-") == "!"
18 | assert unescape("-cm-") == ","
19 | assert unescape("-cn-") == ":"
20 | assert unescape("-pl-") == "+"
21 | assert unescape("-sl-") == "/"
22 | # adjacent escapes need their own dashes
23 | assert unescape("-ap-ex-") == "'ex-"
24 | assert unescape("-ap--ex-") == "'!"
25 | # invalid escapes are unchanged
26 | assert unescape("-foo-") == "-foo-" # not an escape sequence
27 | assert unescape("-sp-") == "-sp-" # not valid in lemma portion
28 | assert unescape("ap-") == "ap-" # no preceding dash
29 | assert unescape("-ap") == "-ap" # no trailing dash
30 | assert unescape("-AP-") == "-AP-" # case sensitivity
31 | # idempotency
32 | assert unescape(unescape("-ap--ex--cm-")) == unescape("-ap--ex--cm-")
33 | # full key, second part escapes differently
34 | assert unescape("abc__1.23.00..") == "abc%1:23:00::"
35 | assert unescape("abc__1.23.00.foo-sp-bar.") == "abc%1:23:00:foo_bar:"
36 | assert unescape("abc__1.23.00.foo-ap-bar.") == "abc%1:23:00:foo-ap-bar:"
37 |
38 |
39 | def test_escape_oewn_sense_key():
40 |
41 | def escape(s: str) -> str:
42 | return sensekey.escape(s, flavor="oewn")
43 |
44 | assert escape("") == ""
45 | assert escape("abc") == "abc"
46 | assert escape(".") == "." # only becomes : in second part of key
47 | # escape patterns
48 | assert escape("'") == "-ap-"
49 | assert escape("!") == "-ex-"
50 | assert escape(",") == "-cm-"
51 | assert escape(":") == "-cn-"
52 | assert escape("+") == "-pl-"
53 | assert escape("/") == "-sl-"
54 | # adjacent escapes need their own dashes
55 | assert escape("'!") == "-ap--ex-"
56 | # idempotency
57 | assert escape(escape("'!,")) == escape("'!,")
58 | # full key, second part escapes differently
59 | assert escape("abc%1:23:00::") == "abc__1.23.00.."
60 | assert escape("abc%1:23:00:foo_bar:") == "abc__1.23.00.foo-sp-bar."
61 | assert escape("abc%1:23:00:foo'bar:") == "abc__1.23.00.foo'bar."
62 |
63 |
64 | @pytest.mark.usefixtures("uninitialized_datadir")
65 | def test_sense_key_getter(datadir):
66 | wn.add(datadir / "sense-key-variations.xml")
67 |
68 | get_omw_sense_key = sensekey.sense_key_getter("omw-en:1.4")
69 | get_oewn_sense_key = sensekey.sense_key_getter("oewn:2024")
70 |
71 | omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4")
72 | oewn_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024")
73 |
74 | assert get_omw_sense_key(omw_sense) == "'s_gravenhage%1:15:00::"
75 | assert get_omw_sense_key(oewn_sense) is None
76 |
77 | assert get_oewn_sense_key(omw_sense) is None
78 | assert get_oewn_sense_key(oewn_sense) == "'s_gravenhage%1:15:00::"
79 |
80 |
81 | @pytest.mark.usefixtures("uninitialized_datadir")
82 | def test_sense_getter(datadir):
83 | wn.add(datadir / "sense-key-variations.xml")
84 |
85 | get_omw_sense = sensekey.sense_getter("omw-en:1.4")
86 | get_oewn_sense = sensekey.sense_getter("oewn:2024")
87 |
88 | omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4")
89 | oewn_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024")
90 |
91 | assert get_omw_sense("'s_gravenhage%1:15:00::") == omw_sense
92 | assert get_oewn_sense("'s_gravenhage%1:15:00::") == oewn_sense
93 |
--------------------------------------------------------------------------------
/docs/setup.rst:
--------------------------------------------------------------------------------
1 | Installation and Configuration
2 | ==============================
3 |
4 | .. seealso::
5 |
6 | This guide is for installing and configuring the Wn software. For
7 | adding lexicons to the database, see :doc:`guides/lexicons`.
8 |
9 |
10 | Installing from PyPI
11 | --------------------
12 |
13 | Install the latest release from `PyPI <https://pypi.org/project/wn/>`_:
14 |
15 | .. code-block:: bash
16 |
17 | pip install wn
18 |
19 | To get the dependencies for the :mod:`wn.web` module, use the ``web``
20 | installation extra:
21 |
22 | .. code-block:: bash
23 |
24 | pip install "wn[web]"
25 |
26 |
27 | Installing with Conda
28 | ---------------------
29 |
30 | Alternatively, if you use the `Anaconda <https://www.anaconda.com/>`_
31 | distribution of Python, you can install with conda:
32 |
33 | .. code-block:: bash
34 |
35 | conda install -c conda-forge wn
36 |
37 |
38 | The Data Directory
39 | ------------------
40 |
41 | By default, Wn stores its data (such as downloaded LMF files and the
42 | database file) in a ``.wn_data/`` directory under the user's home
43 | directory. This directory can be changed (see `Configuration`_
44 | below). Whenever Wn attempts to download a resource or access its
45 | database, it will check for the existence of, and create if necessary,
46 | this directory, the ``.wn_data/downloads/`` subdirectory, and the
47 | ``.wn_data/wn.db`` database file. The file system will look like
48 | this::
49 |
50 | .wn_data/
51 | ├── downloads
52 | │ ├── ...
53 | │ └── ...
54 | └── wn.db
55 |
56 | The ``...`` entries in the ``downloads/`` subdirectory represent the
57 | files of resources downloaded from the web. Their filename is a hash
58 | of the URL so that Wn can avoid downloading the same file twice.
59 |
60 |
61 | Configuration
62 | -------------
63 |
64 | The :py:data:`wn.config` object contains the paths Wn uses for local
65 | storage and information about resources available on the web. To
66 | change the directory Wn uses for storing data locally, modify the
67 | :python:`wn.config.data_directory` member:
68 |
69 | .. code-block:: python
70 |
71 | import wn
72 | wn.config.data_directory = '~/Projects/wn_data'
73 |
74 | There are some things to note:
75 |
76 | - The downloads directory and database path are always relative to the
77 | data directory and cannot be changed directly.
78 | - This change only affects subsequent operations, so any data in the
79 | previous location will not be moved nor deleted.
80 | - This change only affects the current session. If you want a script
81 | or application to always use the new location, it must reset the
82 | data directory each time it is initialized.
83 |
84 | You can also add project information for remote resources. First you
85 | add a project, with a project ID, full name, and language code. Then
86 | you create one or more versions for that project with a version ID,
87 | resource URL, and license information. This may be done either through
88 | the :py:data:`wn.config` object's
89 | :py:meth:`~wn._config.WNConfig.add_project` and
90 | :py:meth:`~wn._config.WNConfig.add_project_version` methods, or loaded
91 | from a TOML_ file via the :py:data:`wn.config` object's
92 | :py:meth:`~wn._config.WNConfig.load_index` method.
93 |
94 | .. _TOML: https://toml.io
95 |
96 | .. code-block:: python
97 |
98 | wn.config.add_project('ewn', 'English WordNet', 'en')
99 | wn.config.add_project_version(
100 | 'ewn', '2020',
101 | 'https://en-word.net/static/english-wordnet-2020.xml.gz',
102 | 'https://creativecommons.org/licenses/by/4.0/',
103 | )
104 |
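If the project information is instead stored in a TOML index file
(``my-index.toml`` below is an illustrative path), it can be loaded in
a single call:

.. code-block:: python

   wn.config.load_index('my-index.toml')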
105 |
106 | Installing From Source
107 | ----------------------
108 |
109 | If you wish to install the code from the source repository (e.g., to
110 | get an unreleased feature or to contribute toward Wn's development),
111 | clone the repository and use `Hatch <https://hatch.pypa.io/>`_ to
112 | start a virtual environment with Wn installed:
113 |
114 | .. code-block:: console
115 |
116 | $ git clone https://github.com/goodmami/wn.git
117 | $ cd wn
118 | $ hatch shell
119 |
--------------------------------------------------------------------------------
/tests/wordnet_test.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import warnings
3 | from pathlib import Path
4 |
5 | import pytest
6 |
7 | import wn
8 |
9 |
10 | @pytest.mark.usefixtures('mini_db_1_1')
11 | def test_wordnet_lexicons():
12 | en = wn.Wordnet('test-en')
13 | assert len(en.lexicons()) == 1
14 | assert len(en.expanded_lexicons()) == 0
15 |
16 | en1 = wn.Wordnet('test-en:1')
17 | assert en.lexicons() == en1.lexicons()
18 | assert en.expanded_lexicons() == en1.expanded_lexicons()
19 |
20 | en2 = wn.Wordnet(lang='en')
21 | assert len(en2.lexicons()) == 2
22 | assert len(en2.expanded_lexicons()) == 0
23 |
24 | es = wn.Wordnet('test-es')
25 | assert len(es.lexicons()) == 1
26 | assert len(es.expanded_lexicons()) == 0
27 |
28 | es2 = wn.Wordnet('test-es', expand='test-en')
29 | assert len(es2.lexicons()) == 1
30 | assert len(es2.expanded_lexicons()) == 1
31 |
32 | ja = wn.Wordnet('test-ja')
33 | assert len(ja.lexicons()) == 1
34 | assert len(ja.expanded_lexicons()) == 1
35 |
36 | ja2 = wn.Wordnet('test-ja', expand='')
37 | assert len(ja2.lexicons()) == 1
38 | assert len(ja2.expanded_lexicons()) == 0
39 |
40 |
41 | @pytest.mark.usefixtures('mini_db')
42 | def test_wordnet_normalize():
43 | es = wn.Wordnet('test-es')
44 | assert es.words('Informacion') == es.words('información')
45 | assert es.words('ínfórmácíón') == es.words('información')
46 | es = wn.Wordnet('test-es', normalizer=None)
47 | assert es.words('informacion') == []
48 | assert es.words('Información') == []
49 |
50 | # The following doesn't necessarily work because any non-None
51 | # normalizer causes the normalized form column to be tested with
52 | # the original form
53 | # es = wn.Wordnet('test-es', normalizer=str.lower)
54 | # assert es.words('informacion') == []
55 | # assert es.words('Información') == es.words('información')
56 |
57 |
58 | @pytest.mark.usefixtures('mini_db')
59 | def test_wordnet_lemmatize():
60 | # default lemmatizer compares alternative forms
61 | en = wn.Wordnet('test-en')
62 | assert en.words('examples') == []
63 | assert en.words('exemplifying') == en.words('exemplify')
64 | assert en.words('data') == en.words('datum')
65 |
66 | en = wn.Wordnet('test-en', search_all_forms=False)
67 | assert en.words('examples') == []
68 | assert en.words('exemplifying') == []
69 | assert en.words('data') == []
70 |
71 | def morphy_lite(form, pos):
72 | result = {pos: {form}}
73 | if pos in ('n', None) and form.endswith('s'):
74 | result.setdefault('n', set()).add(form[:-1])
75 | return result
76 |
77 | en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=False)
78 | assert en.words('examples', pos='n') == en.words('example')
79 | assert en.words('examples') == en.words('example')
80 | assert en.words('exemplifying') == []
81 | assert en.words('data') == []
82 |
83 | en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=True)
84 | assert en.words('data') == en.words('datum')
85 | assert en.words('exemplifying') == en.words('exemplify')
86 |
87 |
88 | def test_portable_entities_issue_226(monkeypatch, datadir):
89 | # instead use ignore_cleanup_errors=True from Python 3.10
90 | tempdir = tempfile.TemporaryDirectory('wn_issue_226')
91 | with tempdir as dir:
92 | with monkeypatch.context() as m:
93 | m.setattr(wn.config, 'data_directory', Path(dir))
94 | wn.add(datadir / 'mini-lmf-1.0.xml')
95 | en = wn.Wordnet('test-en')
96 | info1 = en.synsets('information')[0]
97 | wn.remove('test-en')
98 | wn.add(datadir / 'mini-lmf-1.0.xml')
99 | info2 = en.synsets('information')[0] # en Wordnet object still works
100 | assert info1 == info2 # synsets are equivalent
101 | wn._db.clear_connections()
102 | # Not needed if ignore_cleanup_errors=True and delete=True above
103 | try:
104 | tempdir.cleanup()
105 | except PermissionError:
106 | warnings.warn(
107 | f"Failed to clean up temporary directory {dir!s}",
108 | stacklevel=1,
109 | )
110 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'wn'
21 | copyright = '2020, Michael Wayne Goodman'
22 | author = 'Michael Wayne Goodman'
23 |
24 | import wn
25 |
26 | # The short X.Y version
27 | version = '.'.join(wn.__version__.split('.')[:2])
28 | # The full version, including alpha/beta/rc tags
29 | release = wn.__version__
30 |
31 | # -- General configuration ---------------------------------------------------
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 | 'sphinx.ext.autodoc',
38 | 'sphinx.ext.intersphinx',
39 | 'sphinx.ext.coverage',
40 | # 'sphinx.ext.viewcode',
41 | 'sphinx.ext.githubpages',
42 | 'sphinx.ext.napoleon',
43 | "sphinx_copybutton",
44 | ]
45 |
46 | # Add any paths that contain templates here, relative to this directory.
47 | templates_path = ['_templates']
48 |
49 | # List of patterns, relative to source directory, that match files and
50 | # directories to ignore when looking for source files.
51 | # This pattern also affects html_static_path and html_extra_path.
52 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
53 |
54 | # Global definitions
55 | rst_prolog = """
56 | .. role:: python(code)
57 | :language: python
58 | :class: highlight
59 | """
60 |
61 | # smartquotes = False
62 | smartquotes_action = 'De' # D = en- and em-dash; e = ellipsis
63 |
64 | # -- Options for HTML output -------------------------------------------------
65 |
66 | # The theme to use for HTML and HTML Help pages. See the documentation for
67 | # a list of builtin themes.#
68 |
69 | html_theme = "furo"
70 | html_theme_options = {
71 | "light_css_variables": {
72 | "color-brand-primary": "#006699",
73 | "color-brand-content": "#006699",
74 | # "color-background": "#f0f0f0",
75 | # "color-sidebar-background": "#ddd",
76 | },
77 | "dark_css_variables": {
78 | "color-brand-primary": "#00CCFF",
79 | "color-brand-content": "#00CCFF",
80 | }
81 | }
82 |
83 | html_logo = "_static/wn-logo.svg"
84 |
85 | pygments_style = 'manni'
86 | pygments_dark_style = 'monokai'
87 |
88 | # Add any paths that contain custom static files (such as style sheets) here,
89 | # relative to this directory. They are copied after the builtin static files,
90 | # so a file named "default.css" will overwrite the builtin "default.css".
91 | html_static_path = ['_static']
92 | html_css_files = [
93 | 'css/svg.css',
94 | ]
95 |
96 | # Don't offer to show the source of the current page
97 | html_show_sourcelink = False
98 |
99 | # -- Options for autodoc extension -------------------------------------------
100 |
101 | # autodoc_typehints = 'description'
102 | autodoc_typehints = 'signature'
103 | # autodoc_typehints = 'none'
104 |
105 | # -- Options for intersphinx extension ---------------------------------------
106 |
107 | # Example configuration for intersphinx: refer to the Python standard library.
108 | intersphinx_mapping = {
109 | 'python': ('https://docs.python.org/3', None),
110 | 'httpx': ('https://httpx.readthedocs.io/en/latest/', None),
111 | }
112 |
113 | # -- Options for sphinx_copybutton extension ---------------------------------
114 |
115 | copybutton_prompt_text = (
116 | r">>> " # regular Python prompt
117 | r"|\.\.\. " # Python continuation prompt
118 | r"|\$ " # Basic shell
119 | r"|In \[\d*\]: " # Jupyter notebook
120 | )
121 | copybutton_prompt_is_regexp = True
122 |
--------------------------------------------------------------------------------
/tests/ic_test.py:
--------------------------------------------------------------------------------
1 |
2 | from math import log
3 |
4 | import pytest
5 |
6 | import wn
7 | from wn.constants import (NOUN, VERB, ADJ, ADV)
8 | from wn.util import synset_id_formatter
9 | import wn.ic
10 |
11 |
12 | synset_id = {
13 | 'information': 'test-en-0001-n',
14 | 'illustration_example': 'test-en-0002-n',
15 | 'sample': 'test-en-0004-n',
16 | 'random_sample': 'test-en-0005-n',
17 | 'random_sample2': 'test-en-0008-n', # no hypernyms
18 | 'datum': 'test-en-0006-n',
19 | 'illustrate_exemplify': 'test-en-0003-v',
20 | 'resignate': 'test-en-0007-v',
21 | }
22 |
23 |
24 | words = [
25 | 'For', 'example', ':', 'random sample', '.',
26 | 'This', 'will', 'illustrate', 'and', 'exemplify', '.',
27 | 'A', 'sample', 'of', 'data', '.',
28 | ]
29 |
30 |
31 | @pytest.mark.usefixtures('mini_db')
32 | def test_compute_nodistribute_nosmoothing():
33 | w = wn.Wordnet('test-en:1')
34 | assert wn.ic.compute(words, w, distribute_weight=False, smoothing=0) == {
35 | NOUN: {
36 | synset_id['information']: 4.0,
37 | synset_id['illustration_example']: 3.0,
38 | synset_id['sample']: 2.0,
39 | synset_id['random_sample']: 1.0,
40 | synset_id['random_sample2']: 1.0,
41 | synset_id['datum']: 1.0,
42 | None: 5.0,
43 | },
44 | VERB: {
45 | synset_id['illustrate_exemplify']: 2.0,
46 | synset_id['resignate']: 0.0,
47 | None: 2.0,
48 | },
49 | ADJ: {None: 0.0},
50 | ADV: {None: 0.0},
51 | }
52 |
53 |
54 | @pytest.mark.usefixtures('mini_db')
55 | def test_compute_nodistribute_smoothing():
56 | w = wn.Wordnet('test-en:1')
57 | assert wn.ic.compute(words, w, distribute_weight=False, smoothing=1.0) == {
58 | NOUN: {
59 | synset_id['information']: 5.0,
60 | synset_id['illustration_example']: 4.0,
61 | synset_id['sample']: 3.0,
62 | synset_id['random_sample']: 2.0,
63 | synset_id['random_sample2']: 2.0,
64 | synset_id['datum']: 2.0,
65 | None: 6.0,
66 | },
67 | VERB: {
68 | synset_id['illustrate_exemplify']: 3.0,
69 | synset_id['resignate']: 1.0,
70 | None: 3.0,
71 | },
72 | ADJ: {None: 1.0},
73 | ADV: {None: 1.0},
74 | }
75 |
76 |
77 | @pytest.mark.usefixtures('mini_db')
78 | def test_compute_distribute_smoothing():
79 | w = wn.Wordnet('test-en:1')
80 | assert wn.ic.compute(words, w, distribute_weight=True, smoothing=1.0) == {
81 | NOUN: {
82 | synset_id['information']: 4.5,
83 | synset_id['illustration_example']: 3.5,
84 | synset_id['sample']: 2.5,
85 | synset_id['random_sample']: 1.5,
86 | synset_id['random_sample2']: 1.5,
87 | synset_id['datum']: 2.0,
88 | None: 5.0,
89 | },
90 | VERB: {
91 | synset_id['illustrate_exemplify']: 3.0,
92 | synset_id['resignate']: 1.0,
93 | None: 3.0,
94 | },
95 | ADJ: {None: 1.0},
96 | ADV: {None: 1.0},
97 | }
98 |
99 |
100 | @pytest.mark.usefixtures('mini_db')
101 | def test_load(tmp_path):
102 | w = wn.Wordnet('test-en:1')
103 | icpath = tmp_path / 'foo.dat'
104 | icpath.write_text(
105 | 'wnver:1234567890AbCdEf\n'
106 | '1n 4.0 ROOT\n'
107 | '2n 3.0\n'
108 | '4n 2.0\n'
109 | '5n 1.0\n'
110 | '8n 1.0 ROOT\n'
111 | '6n 1.0\n'
112 | '3v 2.0 ROOT\n'
113 | '7v 0.0 ROOT\n'
114 | )
115 |
116 | get_synset_id = synset_id_formatter('test-en-{offset:04}-{pos}')
117 | assert (wn.ic.load(icpath, w, get_synset_id=get_synset_id)
118 | == wn.ic.compute(words, w, distribute_weight=False, smoothing=0.0))
119 |
120 |
121 | @pytest.mark.usefixtures('mini_db')
122 | def test_information_content():
123 | w = wn.Wordnet('test-en:1')
124 | ic = wn.ic.compute(words, w)
125 | info = w.synsets('information')[0]
126 | samp = w.synsets('sample')[0]
127 | # info is a root but not the only one, so its IC is not 0.0
128 | assert wn.ic.information_content(info, ic) == -log(
129 | ic['n'][info.id]
130 | / ic['n'][None]
131 | )
132 | assert wn.ic.information_content(samp, ic) == -log(
133 | ic['n'][samp.id]
134 | / ic['n'][None]
135 | )
136 |
--------------------------------------------------------------------------------
/tests/web_test.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from starlette.testclient import TestClient
3 |
4 | import wn
5 | import wn._db
6 | from wn import web
7 |
8 |
9 | # clearing connections on teardown (see conftest.py) isn't enough. For
10 | # this we apparently need to monkeypatch the wn._db.pool as well.
11 |
12 | @pytest.fixture
13 | def mini_db_web(monkeypatch, mini_db_dir):
14 | with monkeypatch.context() as m:
15 | m.setattr(wn._db, 'pool', {})
16 | m.setattr(wn.config, 'data_directory', mini_db_dir)
17 | m.setattr(wn.config, 'allow_multithreading', True)
18 | yield
19 | wn._db.clear_connections()
20 |
21 |
22 | client = TestClient(web.app)
23 |
24 |
25 | @pytest.mark.usefixtures('mini_db_web')
26 | def test_root():
27 | response = client.get('/')
28 | assert response.status_code == 404
29 |
30 |
31 | @pytest.mark.usefixtures('mini_db_web')
32 | def test_lexicons():
33 | response = client.get("/lexicons")
34 | assert response.status_code == 200
35 | data = response.json()["data"]
36 | assert [lex["id"] for lex in data] == ["test-en:1", "test-es:1"]
37 |
38 |
39 | @pytest.mark.usefixtures('mini_db_web')
40 | def test_words():
41 | response = client.get("/words")
42 | assert response.status_code == 200
43 | data = response.json()["data"]
44 | word_ids = {word["id"] for word in data}
45 | assert "test-en-information-n" in word_ids
46 | assert "test-es-información-n" in word_ids
47 |
48 | response = client.get("/words", params={"lexicon": "test-en:1"})
49 | assert response.status_code == 200
50 | data = response.json()["data"]
51 | word_ids = {word["id"] for word in data}
52 | assert "test-en-information-n" in word_ids
53 | assert "test-es-información-n" not in word_ids
54 |
55 |
56 | @pytest.mark.usefixtures('mini_db_web')
57 | def test_senses():
58 | response = client.get("/senses")
59 | assert response.status_code == 200
60 | data = response.json()["data"]
61 | sense_ids = {sense["id"] for sense in data}
62 | assert "test-en-information-n-0001-01" in sense_ids
63 | assert "test-es-información-n-0001-01" in sense_ids
64 |
65 | response = client.get("/senses", params={"lexicon": "test-en:1"})
66 | assert response.status_code == 200
67 | data = response.json()["data"]
68 | sense_ids = {sense["id"] for sense in data}
69 | assert "test-en-information-n-0001-01" in sense_ids
70 | assert "test-es-información-n-0001-01" not in sense_ids
71 |
72 |
73 | @pytest.mark.usefixtures('mini_db_web')
74 | def test_synsets():
75 | response = client.get("/synsets")
76 | assert response.status_code == 200
77 | data = response.json()["data"]
78 | synset_ids = {synset["id"] for synset in data}
79 | assert "test-en-0001-n" in synset_ids
80 | assert "test-es-0001-n" in synset_ids
81 |
82 | response = client.get("/synsets", params={"lexicon": "test-en:1"})
83 | assert response.status_code == 200
84 | data = response.json()["data"]
85 | synset_ids = {synset["id"] for synset in data}
86 | assert "test-en-0001-n" in synset_ids
87 | assert "test-es-0001-n" not in synset_ids
88 |
89 |
90 | @pytest.mark.usefixtures('mini_db_web')
91 | def test_lexicon_words():
92 | response1 = client.get("/lexicons/test-en:1/words")
93 | response2 = client.get("/words", params={"lexicon": "test-en:1"})
94 | assert response1.status_code == 200
95 | assert response2.status_code == 200
96 | data1 = response1.json()["data"]
97 | data2 = response2.json()["data"]
98 | assert {word["id"] for word in data1} == {word["id"] for word in data2}
99 |
100 |
101 | @pytest.mark.usefixtures('mini_db_web')
102 | def test_lexicon_senses():
103 | response1 = client.get("/lexicons/test-en:1/senses")
104 | response2 = client.get("/senses", params={"lexicon": "test-en:1"})
105 | assert response1.status_code == 200
106 | assert response2.status_code == 200
107 | data1 = response1.json()["data"]
108 | data2 = response2.json()["data"]
109 | assert {sense["id"] for sense in data1} == {sense["id"] for sense in data2}
110 |
111 |
112 | @pytest.mark.usefixtures('mini_db_web')
113 | def test_lexicon_synsets():
114 | response1 = client.get("/lexicons/test-en:1/synsets")
115 | response2 = client.get("/synsets", params={"lexicon": "test-en:1"})
116 | assert response1.status_code == 200
117 | assert response2.status_code == 200
118 | data1 = response1.json()["data"]
119 | data2 = response2.json()["data"]
120 | assert {synset["id"] for synset in data1} == {synset["id"] for synset in data2}
121 |
--------------------------------------------------------------------------------
/tests/taxonomy_test.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 |
4 | import wn
5 | from wn.taxonomy import (
6 | roots,
7 | leaves,
8 | taxonomy_depth,
9 | hypernym_paths,
10 | min_depth,
11 | max_depth,
12 | shortest_path,
13 | # common_hypernyms,
14 | # lowest_common_hypernyms,
15 | )
16 |
17 |
18 | @pytest.mark.usefixtures('mini_db')
19 | def test_roots():
20 | en = wn.Wordnet('test-en')
21 | assert set(roots(en, pos='n')) == {en.synset('test-en-0001-n'),
22 | en.synset('test-en-0008-n')}
23 | assert set(roots(en, pos='v')) == {en.synset('test-en-0003-v'),
24 | en.synset('test-en-0007-v')}
25 | assert roots(en, pos='a') == []
26 | assert set(roots(en)) == set(roots(en, pos='n') + roots(en, pos='v'))
27 |
28 | # with no expand relations and no relation of its own, every
29 | # synset looks like a root
30 | es = wn.Wordnet('test-es')
31 | assert set(roots(es, pos='n')) == {es.synset('test-es-0001-n'),
32 | es.synset('test-es-0002-n'),
33 | es.synset('test-es-0005-n')}
34 |
35 | es = wn.Wordnet('test-es', expand='test-en')
36 | assert roots(es, pos='n') == [es.synset('test-es-0001-n')]
37 |
38 |
39 | @pytest.mark.usefixtures('mini_db')
40 | def test_leaves():
41 | en = wn.Wordnet('test-en')
42 | assert set(leaves(en, pos='n')) == {en.synset('test-en-0005-n'),
43 | en.synset('test-en-0006-n'),
44 | en.synset('test-en-0008-n')}
45 | assert set(leaves(en, pos='v')) == {en.synset('test-en-0003-v'),
46 | en.synset('test-en-0007-v')}
47 |
48 |
49 | @pytest.mark.usefixtures('mini_db')
50 | def test_taxonomy_depth():
51 | en = wn.Wordnet('test-en')
52 | assert taxonomy_depth(en, pos='n') == 3
53 | assert taxonomy_depth(en, pos='v') == 0
54 |
55 |
56 | @pytest.mark.usefixtures('mini_db')
57 | def test_hypernym_paths():
58 | information = wn.synsets('information')[0]
59 | example = wn.synsets('example')[0]
60 | sample = wn.synsets('sample')[0]
61 | random_sample = wn.synsets('random sample')[0]
62 | assert hypernym_paths(information) == []
63 | assert hypernym_paths(example) == [[information]]
64 | assert hypernym_paths(sample) == [[example, information]]
65 | assert hypernym_paths(random_sample) == [[sample, example, information]]
66 |
67 |
68 | @pytest.mark.usefixtures('mini_db')
69 | def test_interlingual_hypernym_paths():
70 | información = wn.synsets('información')[0]
71 | ejemplo = wn.synsets('ejemplo')[0]
72 | sample = wn.synsets('sample', lexicon='test-en:1')[0]
73 | inferred = wn.Synset.empty('*INFERRED*', ili=sample.ili.id, _lexicon='test-es:1')
74 | muestra_aleatoria = wn.synsets('muestra aleatoria')[0]
75 | assert hypernym_paths(información) == []
76 | assert hypernym_paths(ejemplo) == [[información]]
77 | assert hypernym_paths(muestra_aleatoria) == [[inferred, ejemplo, información]]
78 |
79 |
80 | @pytest.mark.usefixtures('mini_db')
81 | def test_shortest_path():
82 | information = wn.synsets('information')[0]
83 | example = wn.synsets('example')[0]
84 | sample = wn.synsets('sample')[0]
85 | random_sample = wn.synsets('random sample')[0]
86 | datum = wn.synsets('datum')[0]
87 | exemplify = wn.synsets('exemplify')[0]
88 | inferred_root = wn.Synset.empty('*ROOT*', _lexicon='test-en:1')
89 | assert shortest_path(information, information) == []
90 | assert shortest_path(information, datum) == [datum]
91 | assert shortest_path(information, sample) == [example, sample]
92 | assert shortest_path(sample, information) == [example, information]
93 | assert shortest_path(random_sample, datum) == [sample, example, information, datum]
94 | with pytest.raises(wn.Error):
95 | shortest_path(example, exemplify)
96 | assert shortest_path(example, exemplify, simulate_root=True) == [
97 | information, inferred_root, exemplify
98 | ]
99 |
100 |
101 | @pytest.mark.usefixtures('mini_db')
102 | def test_min_depth():
103 | assert min_depth(wn.synsets('information')[0]) == 0
104 | assert min_depth(wn.synsets('example')[0]) == 1
105 | assert min_depth(wn.synsets('sample')[0]) == 2
106 | assert min_depth(wn.synsets('random sample')[0]) == 3
107 |
108 |
109 | @pytest.mark.usefixtures('mini_db')
110 | def test_max_depth():
111 | assert max_depth(wn.synsets('information')[0]) == 0
112 | assert max_depth(wn.synsets('example')[0]) == 1
113 | assert max_depth(wn.synsets('sample')[0]) == 2
114 | assert max_depth(wn.synsets('random sample')[0]) == 3
115 |
--------------------------------------------------------------------------------
/docs/guides/nltk-migration.rst:
--------------------------------------------------------------------------------
1 | Migrating from the NLTK
2 | =======================
3 |
4 | This guide is for users of the `NLTK <https://www.nltk.org/>`_\ 's
5 | ``nltk.corpus.wordnet`` module who are migrating to Wn. It is not
6 | guaranteed that Wn will produce the same results as the NLTK's module,
7 | but with some care its behavior can be very similar.
8 |
9 | Overview
10 | --------
11 |
12 | One important thing to note is that Wn will search all wordnets in the
13 | database by default where the NLTK would only search the English.
14 |
15 | >>> from nltk.corpus import wordnet as nltk_wn
16 | >>> nltk_wn.synsets('chat') # only English
17 | >>> nltk_wn.synsets('chat', lang='fra') # only French
18 | >>> import wn
19 | >>> wn.synsets('chat') # all wordnets
20 | >>> wn.synsets('chat', lang='fr') # only French
21 |
22 | With Wn it helps to create a :class:`wn.Wordnet` object to pre-filter
23 | the results by language or lexicon.
24 |
25 | >>> en = wn.Wordnet('omw-en:1.4')
26 | >>> en.synsets('chat') # only the OMW English Wordnet
27 |
28 | Equivalent Operations
29 | ---------------------
30 |
31 | The following table lists equivalent API calls for the NLTK's wordnet
32 | module and Wn assuming the respective modules have been instantiated
33 | (in separate Python sessions) as follows:
34 |
35 | NLTK:
36 |
37 | >>> from nltk.corpus import wordnet as wn
38 | >>> ss = wn.synsets("chat", pos="v")[0]
39 |
40 | Wn:
41 |
42 | >>> import wn
43 | >>> en = wn.Wordnet('omw-en:1.4')
44 | >>> ss = en.synsets("chat", pos="v")[0]
45 |
46 | .. default-role:: python
47 |
48 | Primary Queries
49 | '''''''''''''''
50 |
51 | ========================================= ===============================================
52 | NLTK Wn
53 | ========================================= ===============================================
54 | `wn.langs()` `[lex.language for lex in wn.lexicons()]`
55 | `wn.lemmas("chat")` --
56 | -- `en.words("chat")`
57 | -- `en.senses("chat")`
58 | `wn.synsets("chat")` `en.synsets("chat")`
59 | `wn.synsets("chat", pos="v")` `en.synsets("chat", pos="v")`
60 | `wn.all_synsets()` `en.synsets()`
61 | `wn.all_synsets(pos="v")` `en.synsets(pos="v")`
62 | ========================================= ===============================================
63 |
64 | Synsets -- Basic
65 | ''''''''''''''''
66 |
67 | =================== =================
68 | NLTK Wn
69 | =================== =================
70 | `ss.lemmas()` --
71 | -- `ss.senses()`
72 | -- `ss.words()`
73 | `ss.lemmas_names()` `ss.lemmas()`
74 | `ss.definition()` `ss.definition()`
75 | `ss.examples()` `ss.examples()`
76 | `ss.pos()` `ss.pos`
77 | =================== =================
78 |
79 | Synsets -- Relations
80 | ''''''''''''''''''''
81 |
82 | ========================================== =====================================
83 | NLTK Wn
84 | ========================================== =====================================
85 | `ss.hypernyms()` `ss.get_related("hypernym")`
86 | `ss.instance_hypernyms()` `ss.get_related("instance_hypernym")`
87 | `ss.hypernyms() + ss.instance_hypernyms()` `ss.hypernyms()`
88 | `ss.hyponyms()` `ss.get_related("hyponym")`
89 | `ss.member_holonyms()` `ss.get_related("holo_member")`
90 | `ss.member_meronyms()` `ss.get_related("mero_member")`
91 | `ss.closure(lambda x: x.hypernyms())` `ss.closure("hypernym")`
92 | ========================================== =====================================
93 |
94 | Synsets -- Taxonomic Structure
95 | ''''''''''''''''''''''''''''''
96 |
97 | ================================ =========================================================
98 | NLTK Wn
99 | ================================ =========================================================
100 | `ss.min_depth()` `ss.min_depth()`
101 | `ss.max_depth()` `ss.max_depth()`
102 | `ss.hypernym_paths()` `[list(reversed([ss] + p)) for p in ss.hypernym_paths()]`
103 | `ss.common_hypernyms(ss)` `ss.common_hypernyms(ss)`
104 | `ss.lowest_common_hypernyms(ss)` `ss.lowest_common_hypernyms(ss)`
105 | `ss.shortest_path_distance(ss)` `len(ss.shortest_path(ss))`
106 | ================================ =========================================================
107 |
108 | .. reset default role
109 | .. default-role::
110 |
111 | (these tables are incomplete)
112 |
--------------------------------------------------------------------------------
/wn/_db.py:
--------------------------------------------------------------------------------
1 | """
2 | Storage back-end interface.
3 | """
4 |
5 | from importlib import resources
6 | from pathlib import Path
7 | import json
8 | import sqlite3
9 | import logging
10 |
11 | import wn
12 | from wn._types import AnyPath
13 | from wn._util import short_hash, format_lexicon_specifier
14 |
15 |
16 | logger = logging.getLogger('wn')
17 |
18 |
19 | # Module Constants
20 |
21 | DEBUG = False
22 |
23 | # This stores hashes of the schema to check for version differences.
24 | # When the schema changes, the hash will change. If the new hash is
25 | # not added here, the 'test_schema_compatibility' test will fail. It
26 | # is the developer's responsibility to only add compatible schema
27 | # hashes here. If the schema change is not backwards-compatible, then
28 | # clear all old hashes and only put the latest hash here. A hash can
29 | # be generated like this:
30 | #
31 | # >>> import sqlite3
32 | # >>> import wn
33 | # >>> conn = sqlite3.connect(wn.config.database_path)
34 | # >>> wn._db.schema_hash(conn)
35 | #
36 | COMPATIBLE_SCHEMA_HASHES = {
37 | '4c8ad03af5422d6979039ee2b80838d07c12d2c8', # Original schema
38 | '01909cb2d0cdee19ed687dbd95c5983d7b68f807', # Added form_lexicon_index
39 | '4c2728bb7999685d9748ad6245638a210d0f099d', # Added form_lexicon_form_covering_index
40 | 'c1ef1e74d47810fd313383cdb8ecb9a2d9aef7db', # Migrated database with covering index
41 | }
42 |
43 |
44 | # Optional metadata is stored as a JSON string
45 |
46 | def _adapt_dict(d: dict) -> bytes:
47 | return json.dumps(d).encode('utf-8')
48 |
49 |
50 | def _convert_dict(s: bytes) -> dict:
51 | return json.loads(s)
52 |
53 |
54 | def _convert_boolean(s: bytes) -> bool:
55 | return bool(int(s))
56 |
57 |
58 | sqlite3.register_adapter(dict, _adapt_dict)
59 | sqlite3.register_converter('meta', _convert_dict)
60 | sqlite3.register_converter('boolean', _convert_boolean)
61 |
62 |
63 | # The pool is a cache of open connections. Unless the database path is
64 | # changed, there should only be zero or one.
65 | pool: dict[AnyPath, sqlite3.Connection] = {}
66 |
67 |
68 | # The connect() function should be used for all connections
69 |
70 | def connect() -> sqlite3.Connection:
71 | dbpath = wn.config.database_path
72 | if dbpath not in pool:
73 | if not wn.config.data_directory.exists():
74 | wn.config.data_directory.mkdir(parents=True, exist_ok=True)
75 | initialized = dbpath.is_file()
76 | conn = sqlite3.connect(
77 | str(dbpath),
78 | detect_types=sqlite3.PARSE_DECLTYPES,
79 | check_same_thread=not wn.config.allow_multithreading,
80 | )
81 | # foreign key support needs to be enabled for each connection
82 | conn.execute('PRAGMA foreign_keys = ON')
83 | if DEBUG:
84 | conn.set_trace_callback(print)
85 | if not initialized:
86 | logger.info('initializing database: %s', dbpath)
87 | _init_db(conn)
88 | _check_schema_compatibility(conn, dbpath)
89 |
90 | pool[dbpath] = conn
91 | return pool[dbpath]
92 |
93 |
94 | def _init_db(conn: sqlite3.Connection) -> None:
95 | schema = (resources.files('wn') / 'schema.sql').read_text()
96 | conn.executescript(schema)
97 | with conn:
98 | conn.executemany('INSERT INTO ili_statuses VALUES (null,?)',
99 | [('presupposed',), ('proposed',)])
100 |
101 |
102 | def _check_schema_compatibility(conn: sqlite3.Connection, dbpath: Path) -> None:
103 | hash = schema_hash(conn)
104 |
105 | # if the hash is known, then we're all good here
106 | if hash in COMPATIBLE_SCHEMA_HASHES:
107 | return
108 |
109 | logger.debug('current schema hash:\n %s', hash)
110 | logger.debug('compatible schema hashes:\n %s',
111 | '\n '.join(COMPATIBLE_SCHEMA_HASHES))
112 | # otherwise, try to raise a helpful error message
113 | msg = ("Wn's schema has changed and is no longer compatible with the "
114 | f"database. Please move or delete {dbpath} and rebuild it.")
115 | try:
116 | specs = conn.execute('SELECT id, version FROM lexicons').fetchall()
117 | except sqlite3.OperationalError as exc:
118 | raise wn.DatabaseError(msg) from exc
119 | else:
120 | if specs:
121 | installed = '\n '.join(
122 | format_lexicon_specifier(id, ver)
123 | for id, ver in specs
124 | )
125 | msg += f" Lexicons currently installed:\n {installed}"
126 | else:
127 | msg += ' No lexicons are currently installed.'
128 | raise wn.DatabaseError(msg)
129 |
130 |
131 | def schema_hash(conn: sqlite3.Connection) -> str:
132 | query = 'SELECT sql FROM sqlite_master WHERE NOT sql ISNULL'
133 | schema = '\n\n'.join(row[0] for row in conn.execute(query))
134 | return short_hash(schema)
135 |
136 |
137 | def clear_connections() -> None:
138 | """Close and delete any open database connections."""
139 | for path in list(pool):
140 | pool[path].close()
141 | del pool[path]
142 |
--------------------------------------------------------------------------------
/bench/conftest.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from collections.abc import Iterator
3 | from itertools import product, cycle
4 | from pathlib import Path
5 |
6 | import pytest
7 |
8 | import wn
9 | from wn import lmf
10 |
11 |
12 | @pytest.fixture
13 | def clean_db():
14 |
15 | def clean_db():
16 | wn.remove("*")
17 | dummy_lex = lmf.Lexicon(
18 | id="dummy",
19 | version="1",
20 | label="placeholder to initialize the db",
21 | language="zxx",
22 | email="",
23 | license="",
24 | )
25 | wn.add_lexical_resource(
26 | lmf.LexicalResource(lmf_version="1.3", lexicons=[dummy_lex])
27 | )
28 |
29 | return clean_db
30 |
31 |
32 | @pytest.fixture(scope="session")
33 | def datadir():
34 | return Path(__file__).parent.parent / "tests" / "data"
35 |
36 |
37 | @pytest.fixture
38 | def empty_db(clean_db):
39 | with tempfile.TemporaryDirectory('wn_data_empty') as dir:
40 | with pytest.MonkeyPatch.context() as m:
41 | m.setattr(wn.config, 'data_directory', dir)
42 | clean_db()
43 | yield
44 |
45 |
46 | @pytest.fixture(scope="session")
47 | def mock_lmf():
48 | synsets: list[lmf.Synset] = [
49 | * _make_synsets("n", 20000),
50 | * _make_synsets("v", 10000),
51 | * _make_synsets("a", 2000),
52 | * _make_synsets("r", 1000),
53 | ]
54 | entries = _make_entries(synsets)
55 | lexicon = lmf.Lexicon(
56 | id="mock",
57 | version="1",
58 | label="",
59 | language="zxx",
60 | email="",
61 | license="",
62 | entries=entries,
63 | synsets=synsets,
64 | )
65 | return lmf.LexicalResource(lmf_version="1.3", lexicons=[lexicon])
66 |
67 |
68 | @pytest.fixture(scope="session")
69 | def mock_db_dir(mock_lmf):
70 | with tempfile.TemporaryDirectory("wn_data_empty") as dir:
71 | with pytest.MonkeyPatch.context() as m:
72 | m.setattr(wn.config, 'data_directory', dir)
73 | wn.add_lexical_resource(mock_lmf, progress_handler=None)
74 | wn._db.clear_connections()
75 |
76 | yield Path(dir)
77 |
78 |
79 | @pytest.fixture
80 | def mock_db(monkeypatch, mock_db_dir):
81 | with monkeypatch.context() as m:
82 | m.setattr(wn.config, "data_directory", mock_db_dir)
83 | yield
84 | wn._db.clear_connections()
85 |
86 |
87 | def _make_synsets(pos: str, n: int) -> list[lmf.Synset]:
88 | synsets: list[lmf.Synset] = [
89 | lmf.Synset(
90 | id=f"{i}-{pos}",
91 | ili="",
92 | partOfSpeech=pos,
93 | relations=[],
94 | meta={},
95 | )
96 | for i in range(1, n+1)
97 | ]
98 | # add relations for nouns and verbs
99 | if pos in "nv":
100 | total = len(synsets)
101 | tgt_i = 1 # index of next target synset
102 | n = cycle([2]) # how many targets to relate
103 | for cur_i in range(total):
104 | if tgt_i <= cur_i:
105 | tgt_i = cur_i + 1
106 | source = synsets[cur_i]
107 | for cur_k in range(tgt_i, tgt_i + next(n)):
108 | if cur_k >= total:
109 | break
110 | target = synsets[cur_k]
111 | source["relations"].append(
112 | lmf.Relation(target=target["id"], relType="hyponym", meta={})
113 | )
114 | target["relations"].append(
115 | lmf.Relation(target=source["id"], relType="hypernym", meta={})
116 | )
117 | tgt_i = cur_k + 1
118 |
119 | return synsets
120 |
121 |
122 | def _words() -> Iterator[str]:
123 | consonants = "kgtdpbfvszrlmnhw"
124 | vowels = "aeiou"
125 | while True:
126 | yield from map("".join, product(consonants, vowels, consonants, vowels))
127 |
128 |
129 | def _make_entries(synsets: list[lmf.Synset]) -> list[lmf.LexicalEntry]:
130 | words = _words()
131 | member_count = cycle(range(1, 4)) # 1, 2, or 3 synset members
132 | entries: dict[str, lmf.LexicalEntry] = {}
133 | prev_synsets: list[lmf.Synset] = []
134 | for synset in synsets:
135 | ssid = synset["id"]
136 | pos = synset["partOfSpeech"]
137 |
138 | for _ in range(next(member_count)):
139 | word = next(words)
140 | senses = [lmf.Sense(id=f"{word}-{ssid}", synset=ssid, meta={})]
141 | # add some polysemy
142 | if prev_synsets:
143 | ssid2 = prev_synsets.pop()["id"]
144 | senses.append(lmf.Sense(id=f"{word}-{ssid2}", synset=ssid2, meta={}))
145 | eid = f"{word}-{pos}"
146 | if eid not in entries:
147 | entries[eid] = lmf.LexicalEntry(
148 | id=eid,
149 | lemma=lmf.Lemma(
150 | writtenForm=word,
151 | partOfSpeech=pos,
152 | ),
153 | senses=[],
154 | meta={},
155 | )
156 | entries[eid]["senses"].extend(senses)
157 |
158 | prev_synsets.append(synset)
159 |
160 | return list(entries.values())
161 |
--------------------------------------------------------------------------------
/wn/_download.py:
--------------------------------------------------------------------------------
1 |
2 | from collections.abc import Sequence
3 | from typing import Optional
4 | from pathlib import Path
5 | import logging
6 |
7 | import httpx
8 |
9 | import wn
10 | from wn._util import is_url
11 | from wn.util import ProgressHandler, ProgressBar
12 | from wn._add import add as add_to_db
13 | from wn import config
14 |
15 |
16 | CHUNK_SIZE = 8 * 1024  # number of bytes to read at a time (8 KiB)
17 | TIMEOUT = 10 # number of seconds to wait for a server response
18 |
19 |
20 | logger = logging.getLogger('wn')
21 |
22 |
23 | def download(
24 | project_or_url: str,
25 | add: bool = True,
26 | progress_handler: Optional[type[ProgressHandler]] = ProgressBar,
27 | ) -> Path:
28 | """Download the resource specified by *project_or_url*.
29 |
30 | First the URL of the resource is determined and then, depending on
31 | the parameters, the resource is downloaded and added to the
32 | database. The function then returns the path of the cached file.
33 |
34 | If *project_or_url* starts with `'http://'` or `'https://'`, then
35 | it is taken to be the URL for the resource. Otherwise,
36 | *project_or_url* is taken as a :ref:`project specifier
37 | ` and the URL is taken from a matching entry
38 | in Wn's project index. If no project matches the specifier,
39 | :exc:`wn.Error` is raised.
40 |
41 | If the URL has been downloaded and cached before, the cached file
42 | is used. Otherwise the URL is retrieved and stored in the cache.
43 |
44 | If the *add* parameter is ``True`` (default), the downloaded
45 | resource is added to the database.
46 |
47 | >>> wn.download('ewn:2020')
48 | Added ewn:2020 (English WordNet)
49 |
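Downloading directly from a URL and skipping the database step by
setting *add* to ``False`` (the URL below is only illustrative):

>>> path = wn.download('https://example.com/mywn.xml.gz', add=False)
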
50 | The *progress_handler* parameter takes a subclass of
51 | :class:`wn.util.ProgressHandler`. An instance of the class will be
52 | created, used, and closed by this function.
53 |
54 | """
55 | if progress_handler is None:
56 | progress_handler = ProgressHandler
57 | progress = progress_handler(message='Download', unit=' bytes')
58 |
59 | cache_path, urls = _get_cache_path_and_urls(project_or_url)
60 |
61 | try:
62 | if cache_path and cache_path.exists():
63 | progress.flash(f'Cached file found: {cache_path!s}')
64 | path = cache_path
65 | elif urls:
66 | path = _download(urls, progress)
67 | else:
68 | raise wn.Error('no urls to download')
69 | finally:
70 | progress.close()
71 |
72 | if add:
73 | try:
74 | add_to_db(path, progress_handler=progress_handler)
75 | except wn.Error as exc:
76 | raise wn.Error(
77 | f'could not add downloaded file: {path}\n You might try '
78 | 'deleting the cached file and trying the download again.'
79 | ) from exc
80 |
81 | return path
82 |
83 |
84 | def _get_cache_path_and_urls(project_or_url: str) -> tuple[Optional[Path], list[str]]:
85 | if is_url(project_or_url):
86 | return config.get_cache_path(project_or_url), [project_or_url]
87 | else:
88 | info = config.get_project_info(project_or_url)
89 | return info.get('cache'), info['resource_urls']
90 |
91 |
92 | def _download(urls: Sequence[str], progress: ProgressHandler) -> Path:
93 | client = httpx.Client(timeout=TIMEOUT, follow_redirects=True)
94 | try:
95 | for i, url in enumerate(urls, 1):
96 | path = config.get_cache_path(url)
97 | logger.info('download url: %s', url)
98 | logger.info('download cache path: %s', path)
99 | try:
100 | with open(path, 'wb') as f:
101 | progress.set(status='Requesting', count=0)
102 | with client.stream("GET", url) as response:
103 | response.raise_for_status()
104 | total = int(response.headers.get('Content-Length', 0))
105 | count = response.num_bytes_downloaded
106 | progress.set(count=count, total=total, status='Receiving')
107 | for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
108 | if chunk:
109 | f.write(chunk)
110 | progress.update(response.num_bytes_downloaded - count)
111 | count = response.num_bytes_downloaded
112 | progress.set(status='Complete')
113 | except httpx.RequestError as exc:
114 | path.unlink(missing_ok=True)
115 | last_count = progress.kwargs['count']
116 | if i == len(urls):
117 | raise wn.Error(f'download failed at {last_count} bytes') from exc
118 | else:
119 | logger.info(
120 | 'download failed at %d bytes; trying next url', last_count
121 | )
122 | else:
123 | break # success
124 |
125 | except KeyboardInterrupt as exc:
126 | path.unlink(missing_ok=True)
127 | last_count = progress.kwargs['count']
128 | raise wn.Error(f'download cancelled at {last_count} bytes') from exc
129 | except Exception:
130 | path.unlink(missing_ok=True)
131 | raise
132 | finally:
133 | client.close()
134 |
135 | return path
136 |
--------------------------------------------------------------------------------
/wn/morphy.py:
--------------------------------------------------------------------------------
1 |
2 | """A simple English lemmatizer that finds and removes known suffixes.
3 |
4 | """
5 |
6 | from typing import Optional
7 | from enum import Flag, auto
8 |
9 | import wn
10 | from wn._types import LemmatizeResult
11 | from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV, PARTS_OF_SPEECH
12 |
13 | POSExceptionMap = dict[str, set[str]]
14 | ExceptionMap = dict[str, POSExceptionMap]
15 |
16 |
17 | class _System(Flag):
18 | """Flags to track suffix rules in various implementations of Morphy."""
19 | PWN = auto()
20 | NLTK = auto()
21 | WN = auto()
22 | ALL = PWN | NLTK | WN
23 |
24 |
25 | _PWN = _System.PWN
26 | _NLTK = _System.NLTK
27 | _WN = _System.WN
28 | _ALL = _System.ALL
29 |
30 |
31 | Rule = tuple[str, str, _System]
32 |
33 | DETACHMENT_RULES: dict[str, list[Rule]] = {
34 | NOUN: [
35 | ("s", "", _ALL),
36 | ("ces", "x", _WN),
37 | ("ses", "s", _ALL),
38 | ("ves", "f", _NLTK | _WN),
39 | ("ives", "ife", _WN),
40 | ("xes", "x", _ALL),
41 | ("xes", "xis", _WN),
42 | ("zes", "z", _ALL),
43 | ("ches", "ch", _ALL),
44 | ("shes", "sh", _ALL),
45 | ("men", "man", _ALL),
46 | ("ies", "y", _ALL),
47 | ],
48 | VERB: [
49 | ("s", "", _ALL),
50 | ("ies", "y", _ALL),
51 | ("es", "e", _ALL),
52 | ("es", "", _ALL),
53 | ("ed", "e", _ALL),
54 | ("ed", "", _ALL),
55 | ("ing", "e", _ALL),
56 | ("ing", "", _ALL),
57 | ],
58 | ADJ: [
59 | ("er", "", _ALL),
60 | ("est", "", _ALL),
61 | ("er", "e", _ALL),
62 | ("est", "e", _ALL),
63 | ],
64 | ADV: [],
65 | }
66 | DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ]
67 |
68 |
69 | class Morphy:
70 | """The Morphy lemmatizer class.
71 |
72 | Objects of this class are callables that take a wordform and an
73 | optional part of speech and return a dictionary mapping parts of
74 | speech to lemmas. If objects of this class are not created with a
75 | :class:`wn.Wordnet` object, the returned lemmas may be invalid.
76 |
77 | Arguments:
78 | wordnet: optional :class:`wn.Wordnet` instance
79 |
80 | Example:
81 |
82 | >>> import wn
83 | >>> from wn.morphy import Morphy
84 | >>> ewn = wn.Wordnet('ewn:2020')
85 | >>> m = Morphy(ewn)
86 | >>> m('axes', pos='n')
87 | {'n': {'axe', 'ax', 'axis'}}
88 | >>> m('geese', pos='n')
89 | {'n': {'goose'}}
90 | >>> m('gooses')
91 | {'n': {'goose'}, 'v': {'goose'}}
92 | >>> m('goosing')
93 | {'v': {'goose'}}
94 |
95 | """
96 |
97 | def __init__(self, wordnet: Optional[wn.Wordnet] = None):
98 | self._rules = {
99 | pos: [rule for rule in rules if rule[2] & _System.WN]
100 | for pos, rules in DETACHMENT_RULES.items()
101 | }
102 | exceptions: ExceptionMap = {pos: {} for pos in PARTS_OF_SPEECH}
103 | all_lemmas: dict[str, set[str]] = {pos: set() for pos in PARTS_OF_SPEECH}
104 | if wordnet:
105 | for word in wordnet.words():
106 | pos = word.pos
107 | pos_exc = exceptions[pos]
108 | lemma, *others = word.forms()
109 | # store every lemma whether it has other forms or not
110 | all_lemmas[pos].add(lemma)
111 | # those with other forms map to the original lemmas
112 | for other in others:
113 | if other in pos_exc:
114 | pos_exc[other].add(lemma)
115 | else:
116 | pos_exc[other] = {lemma}
117 | self._initialized = True
118 | else:
119 | self._initialized = False
120 | self._exceptions = exceptions
121 | self._all_lemmas = all_lemmas
122 |
123 | def __call__(self, form: str, pos: Optional[str] = None) -> LemmatizeResult:
124 | result = {}
125 | if not self._initialized:
126 | result[pos] = {form} # always include original when not initialized
127 |
128 | if pos is None:
129 | pos_list = list(DETACHMENT_RULES)
130 | elif pos in DETACHMENT_RULES:
131 | pos_list = [pos]
132 | else:
133 | pos_list = [] # not handled by morphy
134 |
135 | no_pos_forms = result.get(None, set()) # avoid unnecessary duplicates
136 | for _pos in pos_list:
137 | candidates = self._morphstr(form, _pos) - no_pos_forms
138 | if candidates:
139 | result.setdefault(_pos, set()).update(candidates)
140 |
141 | return result
142 |
143 | def _morphstr(self, form: str, pos: str) -> set[str]:
144 | candidates: set[str] = set()
145 |
146 | initialized = self._initialized
147 | if initialized:
148 | all_lemmas = self._all_lemmas[pos]
149 | if form in all_lemmas:
150 | candidates.add(form)
151 | candidates.update(self._exceptions[pos].get(form, set()))
152 | else:
153 | all_lemmas = set()
154 |
155 | for suffix, repl, _ in self._rules[pos]:
156 | # avoid applying rules that perform full suppletion
157 | if form.endswith(suffix) and len(suffix) < len(form):
158 | candidate = f'{form[:-len(suffix)]}{repl}'
159 | if not initialized or candidate in all_lemmas:
160 | candidates.add(candidate)
161 |
162 | return candidates
163 |
164 |
165 | morphy = Morphy()
166 |
--------------------------------------------------------------------------------
/docs/faq.rst:
--------------------------------------------------------------------------------
1 | FAQ
2 | ===
3 |
4 | Is Wn related to the NLTK's `nltk.corpus.wordnet` module?
5 | ---------------------------------------------------------
6 |
7 | Only in spirit. There was an effort to develop the `NLTK`_\ 's module as a
8 | standalone package (see https://github.com/nltk/wordnet/), but
9 | development had slowed. Wn has the same broad goals and a similar API
10 | as that standalone package, but fundamental architectural differences
11 | demanded a complete rewrite, so Wn was created as a separate
12 | project. With approval from the other package's maintainer, Wn
13 | acquired the `wn `_ project on PyPI and
14 | can be seen as its successor.
15 |
16 | Is Wn compatible with the NLTK's module?
17 | ----------------------------------------
18 |
19 | The API is intentionally similar, but not exactly the same (for
20 | instance see the next question), and there are differences in the ways
21 | that results are retrieved, particularly for non-English wordnets. See
22 | :doc:`guides/nltk-migration` for more information. Also see
23 | :ref:`princeton-wordnet`.
24 |
25 | Where are the ``Lemma`` objects? What are ``Word`` and ``Sense`` objects?
26 | -------------------------------------------------------------------------
27 |
28 | Unlike the `WNDB`_ data format of the original WordNet, the
29 | `WN-LMF`_ XML format grants words (called *lexical entries* in WN-LMF
30 | and a :class:`~wn.Word` object in Wn) and word senses
31 | (:class:`~wn.Sense` in Wn) explicit, first-class status alongside
32 | synsets. While senses are essentially links between words and
33 | synsets, they may contain metadata and be the source or target of
34 | sense relations, so in some ways they are more like nodes than edges
35 | when the wordnet is viewed as a graph. The `NLTK`_\ 's module, using
36 | the WNDB format, combines the information of a word and a sense into a
37 | single object called a ``Lemma``. Wn also has an unrelated concept
38 | called a :meth:`~wn.Word.lemma`, but it is merely the canonical form
39 | of a word.
40 |
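For a concrete picture (a minimal sketch, assuming the ``ewn:2020``
lexicon has been downloaded), a word, its senses, and their synsets
are distinct objects in Wn:

.. code-block:: python

   >>> import wn
   >>> word = wn.words('illustrate', pos='v')[0]      # a Word object
   >>> senses = word.senses()                         # its Sense objects
   >>> synsets = [s.synset() for s in senses]         # the linked Synsets
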
41 | .. _princeton-wordnet:
42 |
43 | Where is the Princeton WordNet data?
44 | ------------------------------------
45 |
46 | The original English wordnet, named simply *WordNet* but often
47 | referred to as the *Princeton WordNet* to better distinguish it from
48 | other projects, is specifically the data distributed by Princeton in
49 | the `WNDB`_ format. The `Open Multilingual Wordnet `_ (OMW)
50 | packages an export of the WordNet data as the *OMW English Wordnet
51 | based on WordNet 3.0* which is used by Wn (with the lexicon ID
52 | ``omw-en``). It also has a similar export for WordNet 3.1 data
53 | (``omw-en31``). Both of these are highly compatible with the original
54 | data and can be used as drop-in replacements.
55 |
56 | Prior to Wn version 0.9 (and, correspondingly, prior to the `OMW
57 | data`_ version 1.4), the ``pwn:3.0`` and ``pwn:3.1`` English wordnets
58 | distributed by OMW were incorrectly called the *Princeton WordNet*
59 | (for WordNet 3.0 and 3.1, respectively). From Wn version 0.9 (and from
60 | version 1.4 of the OMW data), these are called the *OMW English
61 | Wordnet based on WordNet 3.0/3.1* (``omw-en:1.4`` and
62 | ``omw-en31:1.4``, respectively). These lexicons are intentionally
63 | compatible with the original WordNet data, and the 1.4 versions are
64 | even more compatible than the previous ``pwn:3.0`` and ``pwn:3.1``
65 | lexicons, so it is strongly recommended to use them over the previous
66 | versions.
67 |
68 | .. _OMW data: https://github.com/omwn/omw-data
69 |
70 | Why don't all wordnets share the same synsets?
71 | ----------------------------------------------
72 |
73 | The `Open Multilingual Wordnet `_ (OMW) contains wordnets for
74 | many languages created using the *expand* methodology [VOSSEN1998]_,
75 | where non-English wordnets provide words on top of the English
76 | wordnet's synset structure. This allows new wordnets to be built in
77 | much less time than starting from scratch, but with a few drawbacks,
78 | such as that words cannot be added if they do not have a synset in the
79 | English wordnet, and that it is difficult to version the wordnets
80 | independently (e.g., for reproducibility of experiments involving
81 | wordnet data) as all are interconnected. Wn, therefore, creates new
82 | synsets for each wordnet added to its database, and synsets then
83 | specify which resource they belong to. Queries can specify which
84 | resources may be examined. Also see :doc:`guides/interlingual`.
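
In practice this means that queries are scoped by lexicon, and an
interlingual query names its *expand* lexicons explicitly. A minimal
sketch (the word and the lexicon identifiers below are illustrative,
and the lexicons must be installed first):

.. code-block:: python

   >>> import wn
   >>> es = wn.Wordnet(lang='es', expand='omw-en:1.4')
   >>> es.synsets('perro', pos='n')[0].hypernyms()   # resolved via the expand lexicons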
85 |
86 | Why does Wn's database get so big?
87 | ----------------------------------
88 |
89 | The *OMW English Wordnet based on WordNet 3.0* takes about 114 MiB of
90 | disk space in Wn's database, which is only about 8 MiB more than it
91 | takes as a `WN-LMF`_ XML file. The `NLTK`_, however, uses the obsolete
92 | `WNDB`_ format which is more compact, requiring only 35 MiB of disk
93 | space. The difference with the Open Multilingual Wordnet 1.4 is more
94 | striking: it takes about 659 MiB of disk space in the database, but
95 | only 49 MiB in the NLTK. Part of the difference here is that the OMW
96 | files in the NLTK are simple tab-separated-value files listing only
97 | the words added to each synset for each language. In addition, Wn
98 | creates new synsets for each wordnet added (see the previous
99 | question). One more reason is that Wn creates various indexes in the
100 | database for efficient lookup.
101 |
102 | .. _NLTK: https://www.nltk.org/
103 | .. _OMW: http://github.com/omwn
104 | .. [VOSSEN1998] Piek Vossen. 1998. *Introduction to EuroWordNet.* Computers and the Humanities, 32(2): 73--89.
105 | .. _Open English Wordnet 2021: https://en-word.net/
106 | .. _WNDB: https://wordnet.princeton.edu/documentation/wndb5wn
107 | .. _WN-LMF: https://globalwordnet.github.io/schemas/
108 |
--------------------------------------------------------------------------------
/wn/__main__.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import argparse
4 | from pathlib import Path
5 | import json
6 | import logging
7 |
8 | import wn
9 | from wn.project import iterpackages
10 | from wn import lmf
11 | from wn.validate import validate
12 | from wn._util import format_lexicon_specifier
13 |
14 |
15 | def _download(args):
16 | if args.index:
17 | wn.config.load_index(args.index)
18 | for target in args.target:
19 | wn.download(target, add=args.add)
20 |
21 |
22 | def _lexicons(args):
23 | for lex in wn.lexicons(lang=args.lang, lexicon=args.lexicon):
24 | print('\t'.join((lex.id, lex.version, f'[{lex.language}]', lex.label)))
25 |
26 |
27 | def _projects(args):
28 | for info in wn.projects():
29 | key = 'i'
30 | key += 'c' if info['cache'] else '-'
31 | # key += 'a' if False else '-' # TODO: check if project is added to db
32 | print(
33 | '\t'.join((
34 | key,
35 | info['id'],
36 | info['version'],
37 | f"[{info['language'] or '---'}]",
38 | info['label'] or '---',
39 | ))
40 | )
41 |
42 |
43 | def _validate(args):
44 | all_valid = True
45 | selectseq = [check.strip() for check in args.select.split(',')]
46 | for package in iterpackages(args.FILE):
47 | resource = lmf.load(package.resource_file())
48 | for lexicon in resource['lexicons']:
49 | spec = format_lexicon_specifier(lexicon["id"], lexicon["version"])
50 | print(f'{spec:<20}', end='')
51 | report = validate(lexicon, select=selectseq)
52 | if not any(check.get('items', []) for check in report.values()):
53 | print('passed')
54 | else:
55 | print('failed')
56 | all_valid = False
57 | # clean up report
58 | for code in list(report):
59 | if not report[code].get('items'):
60 | del report[code]
61 | if args.output_file:
62 | with open(args.output_file, 'w') as outfile:
63 | json.dump(report, outfile, indent=2)
64 | else:
65 | for _code, check in report.items():
66 | if not check['items']:
67 | continue
68 | print(f' {check["message"]}')
69 | for id, context in check['items'].items():
70 | print(f' {id}: {context}' if context else f' {id}')
71 |
72 | sys.exit(0 if all_valid else 1)
73 |
74 |
75 | def _path_type(arg):
76 | return Path(arg)
77 |
78 |
79 | def _file_path_type(arg):
80 | path = Path(arg)
81 | if not path.is_file():
82 | raise argparse.ArgumentTypeError(f'cannot find file: {arg}')
83 | return path
84 |
85 |
86 | parser = argparse.ArgumentParser(
87 | prog='python3 -m wn',
88 | description="Manage Wn's wordnet data from the command line.",
89 | )
90 | parser.add_argument(
91 | '-V', '--version', action='version', version=f'Wn {wn.__version__}'
92 | )
93 | parser.add_argument(
94 | '-v', '--verbose', action='count', dest='verbosity', default=0,
95 | help='increase verbosity (can repeat: -vv, -vvv)'
96 | )
97 | parser.add_argument(
98 | '-d', '--dir',
99 | type=_path_type,
100 | help="data directory for Wn's database and cache",
101 | )
102 | parser.set_defaults(func=lambda _: parser.print_help())
103 | sub_parsers = parser.add_subparsers(title='subcommands')
104 |
105 |
106 | parser_download = sub_parsers.add_parser(
107 | 'download',
108 | description="Download wordnets and add them to Wn's database.",
109 | help='download wordnets',
110 | )
111 | parser_download.add_argument(
112 | 'target', nargs='+', help='project specifiers or URLs'
113 | )
114 | parser_download.add_argument(
115 | '--index', type=_file_path_type, help='project index to use for downloading'
116 | )
117 | parser_download.add_argument(
118 | '--no-add', action='store_false', dest='add',
119 | help='download and cache without adding to the database'
120 | )
121 | parser_download.set_defaults(func=_download)
122 |
123 |
124 | parser_lexicons = sub_parsers.add_parser(
125 | 'lexicons',
126 | description="Display a list of installed lexicons.",
127 | help='list installed lexicons',
128 | )
129 | parser_lexicons.add_argument(
130 | '-l', '--lang', help='BCP 47 language code'
131 | )
132 | parser_lexicons.add_argument(
133 | '--lexicon', help='lexicon specifiers'
134 | )
135 | parser_lexicons.set_defaults(func=_lexicons)
136 |
137 |
138 | parser_projects = sub_parsers.add_parser(
139 | 'projects',
140 | description=(
141 | "Display a list of known projects. The first column shows the "
142 | "status for a project (i=indexed, c=cached)."
143 | ),
144 | help='list known projects',
145 | )
146 | parser_projects.set_defaults(func=_projects)
147 |
148 |
149 | parser_validate = sub_parsers.add_parser(
150 | 'validate',
151 | description=(
152 | "Validate a WN-LMF lexicon"
153 | ),
154 | help='validate a lexicon',
155 | )
156 | parser_validate.add_argument(
157 | 'FILE', type=_file_path_type, help='WN-LMF (XML) lexicon file to validate'
158 | )
159 | parser_validate.add_argument(
160 | '--select', metavar='CHECKS', default='E,W',
161 | help='comma-separated list of checks to run (default: E,W)'
162 | )
163 | parser_validate.add_argument(
164 | '--output-file', metavar='FILE',
165 | help='write report to a JSON file'
166 | )
167 | parser_validate.set_defaults(func=_validate)
168 |
169 |
170 | args = parser.parse_args()
171 |
172 | logging.basicConfig(level=logging.ERROR - (min(args.verbosity, 3) * 10))
173 |
174 | if args.dir:
175 | wn.config.data_directory = args.dir
176 |
177 | args.func(args)
178 |
--------------------------------------------------------------------------------
/tests/data/mini-lmf-1.1.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 | tatoe
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
53 |
54 |
56 |
57 |
59 |
60 |
61 |
62 |
63 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 | "the artist illustrated the story beautifully"
92 |
93 |
94 |
95 |
96 |
97 |
98 | INF
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
118 | depict something in a visual medium
119 |
120 |
121 |
123 | terminate employment
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/tests/lmf_test.py:
--------------------------------------------------------------------------------
1 |
2 | from xml.etree import ElementTree as ET
3 |
4 | from wn import lmf
5 |
6 |
7 | def test_is_lmf(datadir):
8 | assert lmf.is_lmf(datadir / 'mini-lmf-1.0.xml')
9 | assert lmf.is_lmf(str(datadir / 'mini-lmf-1.0.xml'))
10 | assert not lmf.is_lmf(datadir / 'README.md')
11 | assert not lmf.is_lmf(datadir / 'missing.xml')
12 | assert lmf.is_lmf(datadir / 'mini-lmf-1.1.xml')
13 |
14 |
15 | def test_scan_lexicons(datadir):
16 | assert lmf.scan_lexicons(datadir / 'mini-lmf-1.0.xml') == [
17 | {
18 | 'id': 'test-en',
19 | 'version': '1',
20 | 'label': 'Testing English WordNet',
21 | 'extends': None,
22 | },
23 | {
24 | 'id': 'test-es',
25 | 'version': '1',
26 | 'label': 'Testing Spanish WordNet',
27 | 'extends': None,
28 | },
29 | ]
30 |
31 | assert lmf.scan_lexicons(datadir / 'mini-lmf-1.1.xml') == [
32 | {
33 | 'id': 'test-ja',
34 | 'version': '1',
35 | 'label': 'Testing Japanese WordNet',
36 | 'extends': None,
37 | },
38 | {
39 | 'id': 'test-en-ext',
40 | 'version': '1',
41 | 'label': 'Testing English Extension',
42 | 'extends': {
43 | 'id': 'test-en',
44 | 'version': '1',
45 | },
46 | },
47 | ]
48 |
49 |
50 | def test_load_1_0(datadir):
51 | resource = lmf.load(datadir / 'mini-lmf-1.0.xml')
52 | lexicons = resource['lexicons']
53 | assert len(lexicons) == 2
54 | lexicon = lexicons[0]
55 |
56 | assert lexicon['id'] == 'test-en'
57 | assert lexicon['label'] == 'Testing English WordNet'
58 | assert lexicon['language'] == 'en'
59 | assert lexicon['email'] == 'maintainer@example.com'
60 | assert lexicon['license'] == 'https://creativecommons.org/licenses/by/4.0/'
61 | assert lexicon['version'] == '1'
62 | assert lexicon['url'] == 'https://example.com/test-en'
63 |
64 | assert len(lexicon['entries']) == 9
65 | le = lexicon['entries'][0]
66 | assert le['id'] == 'test-en-information-n'
67 |
68 | assert le['lemma']['writtenForm'] == 'information'
69 | assert le['lemma']['partOfSpeech'] == 'n'
70 | assert le['lemma']['script'] == 'Latn'
71 | assert len(le['lemma']['tags']) == 1
72 |
73 | assert len(le.get('forms', [])) == 0
74 |
75 | assert len(le['senses']) == 1
76 | sense = le['senses'][0]
77 | assert sense['id'] == 'test-en-information-n-0001-01'
78 | assert sense['synset'] == 'test-en-0001-n'
79 | assert len(sense.get('relations', [])) == 0
80 | # assert sense['relations'][0]['target'] == 'test-en-exemplify-v-01023137-01'
81 | # assert sense['relations'][0]['type'] == 'derivation'
82 |
83 | assert len(lexicon.get('frames', [])) == 0 # frames are on lexical entry
84 | assert len(lexicon['entries'][6]['frames']) == 2
85 | frames = lexicon['entries'][6]['frames']
86 | assert frames[0]['subcategorizationFrame'] == 'Somebody ----s something'
87 | assert frames[0]['senses'] == ['test-en-illustrate-v-0003-01']
88 |
89 | assert len(lexicon['synsets']) == 8
90 |
91 | assert lexicons[1]['id'] == 'test-es'
92 |
93 |
94 | def test_load_1_1(datadir):
95 | resource = lmf.load(datadir / 'mini-lmf-1.1.xml')
96 | lexicons = resource['lexicons']
97 | assert len(lexicons) == 2
98 | lexicon = lexicons[0]
99 | assert lexicon['id'] == 'test-ja'
100 | assert lexicon['version'] == '1'
101 | # assert lexicon.logo == 'logo.svg'
102 | assert lexicon.get('requires') == [{'id': 'test-en', 'version': '1'}]
103 |
104 | lexicon = lexicons[1]
105 | assert lexicon['id'] == 'test-en-ext'
106 | assert lexicon.get('extends') == {'id': 'test-en', 'version': '1'}
107 |
108 |
109 | def test_load_1_3(datadir):
110 | resource = lmf.load(datadir / 'mini-lmf-1.3.xml')
111 | lexicons = resource['lexicons']
112 | assert len(lexicons) == 1
113 | lexicon = lexicons[0]
114 | synsets = lexicon['synsets']
115 | assert synsets[0]['definitions'][0]['text'] == 'one two three'
116 | assert synsets[1]['definitions'][0]['text'] == 'one two three'
117 | assert synsets[2]['definitions'][0]['text'] == '''
118 | one
119 | two
120 | three
121 | '''
122 |
123 |
124 | def test_load_1_4(datadir):
125 | resource = lmf.load(datadir / 'mini-lmf-1.4.xml')
126 | lexicons = resource['lexicons']
127 | assert len(lexicons) == 1
128 | lexicon = lexicons[0]
129 | assert lexicon['entries'][0].get('index') == 'foo_bar'
130 | assert lexicon['entries'][1].get('index') == 'foo_bar'
131 | assert lexicon['entries'][2].get('index') is None
132 | assert lexicon['entries'][3].get('index') == 'baz'
133 | assert lexicon['entries'][4].get('index') is None
134 | assert lexicon['entries'][5].get('index') == 'baz'
135 |
136 | assert lexicon['entries'][0]['senses'][0].get('n') == 3
137 | assert lexicon['entries'][1]['senses'][0].get('n') == 2
138 | assert lexicon['entries'][1]['senses'][1].get('n') == 1
139 | assert lexicon['entries'][2]['senses'][0].get('n') is None
140 | assert lexicon['entries'][3]['senses'][0].get('n') == 2
141 | assert lexicon['entries'][4]['senses'][0].get('n') == 2
142 | assert lexicon['entries'][4]['senses'][1].get('n') is None
143 | assert lexicon['entries'][5]['senses'][0].get('n') == 1
144 |
145 |
146 | def test_dump(datadir, tmp_path):
147 | tmpdir = tmp_path / 'test_dump'
148 | tmpdir.mkdir()
149 | tmppath = tmpdir / 'mini_lmf_dump.xml'
150 |
151 | def assert_xml_equal(mini_lmf, dump_lmf):
152 | orig = ET.canonicalize(from_file=mini_lmf, strip_text=True)
153 | temp = ET.canonicalize(from_file=dump_lmf, strip_text=True)
154 | # additional transformation to help with debugging
155 | orig = orig.replace('<', '\n<')
156 | temp = temp.replace('<', '\n<')
157 | assert orig == temp
158 |
159 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.0.xml'), tmppath)
160 | assert_xml_equal(datadir / 'mini-lmf-1.0.xml', tmppath)
161 |
162 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.1.xml'), tmppath)
163 | assert_xml_equal(datadir / 'mini-lmf-1.1.xml', tmppath)
164 |
165 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.4.xml'), tmppath)
166 | assert_xml_equal(datadir / 'mini-lmf-1.4.xml', tmppath)
167 |
--------------------------------------------------------------------------------
/docs/guides/wordnet.rst:
--------------------------------------------------------------------------------
1 | .. raw:: html
2 |
3 |
4 |
5 |
6 | The Structure of a Wordnet
7 | ==========================
8 | A **wordnet** is an online lexicon which is organized by concepts.
9 |
10 | The basic unit of a wordnet is the synonym set (**synset**), a group of words that all refer to the
11 | same concept. Words and synsets are linked by means of conceptual-semantic relations to form the
12 | structure of a wordnet.
13 |
14 | Words, Senses, and Synsets
15 | --------------------------
16 | **Words** are the basic building blocks of a language. A word has two parts: its form and its
17 | meaning. In natural languages, however, word forms and word meanings do not line up one-to-one:
18 | a single word form may be connected to many different meanings. We therefore need **senses** to
19 | serve as the unit of word meaning. For example, the word *bank* has at least two senses:
20 |
21 | 1. bank\ :sup:`1`\: financial institution, like *City Bank*;
22 | 2. bank\ :sup:`2`\: sloping land, like *river bank*;
23 |
24 | Since **synsets** are groups of words sharing the same concept, bank\ :sup:`1`\ and bank\ :sup:`2`\ are members of
25 | two different synsets, although they have the same word form.
26 |
27 | On the other hand, different word forms may also convey the same concept, such as *cab* and *taxi*;
28 | word forms that share a concept are grouped together into one synset.
29 |
30 | .. raw:: html
31 | :file: images/word-sense-synset.svg
32 |
33 |
34 | .. role:: center
35 | :class: center
36 |
37 | :center:`Figure: relations between words, senses and synsets`
38 |
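In Wn, this three-way distinction is reflected directly in the API. A
minimal sketch (assuming an English lexicon such as ``ewn:2020`` is
installed) that looks up the synsets containing the word form *bank*:

.. code-block:: python

   >>> import wn
   >>> for synset in wn.synsets('bank', pos='n'):
   ...     print(synset.lemmas())   # the word forms sharing that concept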
39 |
40 | Synset Relations
41 | ----------------
42 | In a wordnet, synsets are linked to each other by various kinds of relations. For example, if
43 | the concept expressed by one synset is more general than that of a given synset, then it is in a
44 | *hypernym* relation with the given synset. As shown in the figure below, the synset with *car*, *auto* and *automobile* as its
45 | members is the *hypernym* of the synset with *cab*, *taxi* and *hack*. Relations like this, which hold at
46 | the synset level, are categorized as synset relations.
47 |
48 | .. raw:: html
49 | :file: images/synset-synset.svg
50 |
51 | :center:`Figure: example of synset relations`
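
These links are available as methods on synset objects. A minimal
sketch (assuming an English lexicon such as ``ewn:2020`` is installed):

.. code-block:: python

   >>> import wn
   >>> taxi = wn.synsets('taxi', pos='n')[0]
   >>> taxi.hypernyms()    # more general synsets (e.g. the car/auto synset)
   >>> taxi.relations()    # all synset relations, grouped by relation type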
52 |
53 | Sense Relations
54 | ---------------
55 |
56 | Some relations in a wordnet are defined at the sense level. These can be further divided into two types:
57 | relations that link a sense to another sense, and relations that link a sense to a synset.
58 |
59 | .. note:: In a wordnet, a synset relation and a sense relation can both use the same
60 | relation type, such as `domain topic `_.
61 |
62 | **Sense-Sense**
63 |
64 | Sense-to-sense relations capture connections between particular senses, especially for
65 | morphologically related words. For example, *behavioral* is the adjective derived from the noun *behavior*,
66 | so it is in a *pertainym* relation with *behavior*; however, no such relation exists between
67 | *behavioral* and *conduct*, even though *conduct* is a synonym of *behavior* and is in the same synset.
68 | Here *pertainym* is a sense-sense relation.
69 |
70 | .. raw:: html
71 | :file: images/sense-sense.svg
72 |
73 | :center:`Figure: example of sense-sense relations`
74 |
75 | **Sense-Synset**
76 |
77 | Sense-synset relations connect a particular sense with a synset. For example, *cursor* is a term in the
78 | *computer science* discipline, so in a wordnet it is in the *has domain topic* relation with the
79 | *computer science* synset; but *pointer*, which is in the same synset as *cursor*, is not such a term and
80 | thus has no such relation with the *computer science* synset.
81 |
82 | .. raw:: html
83 | :file: images/sense-synset.svg
84 |
85 | :center:`Figure: example of sense-synset relations`
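
Sense relations are queried on sense objects rather than synsets. A
minimal sketch (assuming an English lexicon such as ``ewn:2020`` is
installed):

.. code-block:: python

   >>> import wn
   >>> sense = wn.senses('behavioral', pos='a')[0]
   >>> sense.get_related('pertainym')   # sense-sense relations of one type
   >>> sense.relations()                # all sense relations, grouped by type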
86 |
87 | Other Information
88 | -----------------
89 | A wordnet should be encoded in one of two accepted schemas:
90 |
91 | * XML schema based on the Lexical Markup Framework (LMF)
92 | * JSON-LD using the Lexicon Model for Ontologies
93 |
94 | The structure of a wordnet should contain the following information:
95 |
96 | **Definition**
97 |
98 | A definition describes the meaning of a sense or synset; it is given in the language
99 | of the wordnet it comes from.
100 |
101 | **Example**
102 |
103 | An example illustrates how a sense or synset is used, helping users understand the definition
104 | more clearly.
105 |
106 | **Metadata**
107 |
108 | A wordnet has its own metadata, based on the `Dublin Core `_, which states its
109 | basic information. The table below lists all of the items in a wordnet's metadata:
110 |
111 | +------------------+-----------+-----------+
| Field            | Required? | Type      |
+==================+===========+===========+
112 | | contributor | Optional | str |
113 | +------------------+-----------+-----------+
114 | | coverage | Optional | str |
115 | +------------------+-----------+-----------+
116 | | creator | Optional | str |
117 | +------------------+-----------+-----------+
118 | | date | Optional | str |
119 | +------------------+-----------+-----------+
120 | | description | Optional | str |
121 | +------------------+-----------+-----------+
122 | | format | Optional | str |
123 | +------------------+-----------+-----------+
124 | | identifier | Optional | str |
125 | +------------------+-----------+-----------+
126 | | publisher | Optional | str |
127 | +------------------+-----------+-----------+
128 | | relation | Optional | str |
129 | +------------------+-----------+-----------+
130 | | rights | Optional | str |
131 | +------------------+-----------+-----------+
132 | | source | Optional | str |
133 | +------------------+-----------+-----------+
134 | | subject | Optional | str |
135 | +------------------+-----------+-----------+
136 | | title | Optional | str |
137 | +------------------+-----------+-----------+
138 | | type | Optional | str |
139 | +------------------+-----------+-----------+
140 | | status | Optional | str |
141 | +------------------+-----------+-----------+
142 | | note | Optional | str |
143 | +------------------+-----------+-----------+
144 | | confidence | Optional | float |
145 | +------------------+-----------+-----------+
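
Definitions and examples are available directly from the Python API. A
minimal sketch (assuming an English lexicon such as ``ewn:2020`` is
installed):

.. code-block:: python

   >>> import wn
   >>> synset = wn.synsets('information', pos='n')[0]
   >>> synset.definition()   # definition text in the lexicon's language
   >>> synset.examples()     # example sentences, if any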
--------------------------------------------------------------------------------
/wn/_module_functions.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Union
2 |
3 | import wn
4 | from wn._util import format_lexicon_specifier
5 |
6 |
7 | def projects() -> list[dict]:
8 | """Return the list of indexed projects.
9 |
10 | This returns the same dictionaries of information as
11 | :meth:`wn.config.get_project_info
12 | `, but for all indexed
13 | projects.
14 |
15 | Example:
16 |
17 | >>> infos = wn.projects()
18 | >>> len(infos)
19 | 36
20 | >>> infos[0]['label']
21 | 'Open English WordNet'
22 |
23 | """
24 | index = wn.config.index
25 | return [
26 | wn.config.get_project_info(format_lexicon_specifier(project_id, version))
27 | for project_id, project_info in index.items()
28 | for version in project_info.get('versions', [])
29 | if 'resource_urls' in project_info['versions'][version]
30 | ]
31 |
32 |
33 | def lexicons(
34 | *,
35 | lexicon: Optional[str] = "*",
36 | lang: Optional[str] = None
37 | ) -> list[wn.Lexicon]:
38 | """Return the lexicons matching a language or lexicon specifier.
39 |
40 | Example:
41 |
42 | >>> wn.lexicons(lang='en')
43 | [, ]
44 |
45 | """
46 | try:
47 | w = wn.Wordnet(lang=lang, lexicon=lexicon or '*')
48 | except wn.Error:
49 | return []
50 | else:
51 | return w.lexicons()
52 |
53 |
54 | def word(
55 | id: str,
56 | *,
57 | lexicon: Optional[str] = None,
58 | lang: Optional[str] = None
59 | ) -> wn.Word:
60 | """Return the word with *id* in *lexicon*.
61 |
62 | This will create a :class:`Wordnet` object using the *lang* and
63 | *lexicon* arguments. The *id* argument is then passed to the
64 | :meth:`Wordnet.word` method.
65 |
66 | >>> wn.word('ewn-cell-n')
67 | Word('ewn-cell-n')
68 |
69 | """
70 | return wn.Wordnet(lang=lang, lexicon=lexicon).word(id)
71 |
72 |
73 | def words(
74 | form: Optional[str] = None,
75 | pos: Optional[str] = None,
76 | *,
77 | lexicon: Optional[str] = None,
78 | lang: Optional[str] = None,
79 | ) -> list[wn.Word]:
80 | """Return the list of matching words.
81 |
82 | This will create a :class:`Wordnet` object using the *lang* and
83 | *lexicon* arguments. The remaining arguments are passed to the
84 | :meth:`Wordnet.words` method.
85 |
86 | >>> len(wn.words())
87 | 282902
88 | >>> len(wn.words(pos='v'))
89 | 34592
90 | >>> wn.words(form="scurry")
91 | [Word('ewn-scurry-n'), Word('ewn-scurry-v')]
92 |
93 | """
94 | return wn.Wordnet(lang=lang, lexicon=lexicon).words(form=form, pos=pos)
95 |
96 |
97 | def synset(
98 | id: str,
99 | *,
100 | lexicon: Optional[str] = None,
101 | lang: Optional[str] = None
102 | ) -> wn.Synset:
103 | """Return the synset with *id* in *lexicon*.
104 |
105 | This will create a :class:`Wordnet` object using the *lang* and
106 | *lexicon* arguments. The *id* argument is then passed to the
107 | :meth:`Wordnet.synset` method.
108 |
109 | >>> wn.synset('ewn-03311152-n')
110 | Synset('ewn-03311152-n')
111 |
112 | """
113 | return wn.Wordnet(lang=lang, lexicon=lexicon).synset(id=id)
114 |
115 |
116 | def synsets(
117 | form: Optional[str] = None,
118 | pos: Optional[str] = None,
119 | ili: Optional[Union[str, wn.ILI]] = None,
120 | *,
121 | lexicon: Optional[str] = None,
122 | lang: Optional[str] = None,
123 | ) -> list[wn.Synset]:
124 | """Return the list of matching synsets.
125 |
126 | This will create a :class:`Wordnet` object using the *lang* and
127 | *lexicon* arguments. The remaining arguments are passed to the
128 | :meth:`Wordnet.synsets` method.
129 |
130 | >>> len(wn.synsets('couch'))
131 | 4
132 | >>> wn.synsets('couch', pos='v')
133 | [Synset('ewn-00983308-v')]
134 |
135 | """
136 | return wn.Wordnet(lang=lang, lexicon=lexicon).synsets(form=form, pos=pos, ili=ili)
137 |
138 |
139 | def senses(
140 | form: Optional[str] = None,
141 | pos: Optional[str] = None,
142 | *,
143 | lexicon: Optional[str] = None,
144 | lang: Optional[str] = None,
145 | ) -> list[wn.Sense]:
146 | """Return the list of matching senses.
147 |
148 | This will create a :class:`Wordnet` object using the *lang* and
149 | *lexicon* arguments. The remaining arguments are passed to the
150 | :meth:`Wordnet.senses` method.
151 |
152 | >>> len(wn.senses('twig'))
153 | 3
154 | >>> wn.senses('twig', pos='n')
155 | [Sense('ewn-twig-n-13184889-02')]
156 |
157 | """
158 | return wn.Wordnet(lang=lang, lexicon=lexicon).senses(form=form, pos=pos)
159 |
160 |
161 | def sense(
162 | id: str,
163 | *,
164 | lexicon: Optional[str] = None,
165 | lang: Optional[str] = None
166 | ) -> wn.Sense:
167 | """Return the sense with *id* in *lexicon*.
168 |
169 | This will create a :class:`Wordnet` object using the *lang* and
170 | *lexicon* arguments. The *id* argument is then passed to the
171 | :meth:`Wordnet.sense` method.
172 |
173 | >>> wn.sense('ewn-flutter-v-01903884-02')
174 | Sense('ewn-flutter-v-01903884-02')
175 |
176 | """
177 | return wn.Wordnet(lang=lang, lexicon=lexicon).sense(id=id)
178 |
179 |
180 | def ili(
181 | id: str,
182 | *,
183 | lexicon: Optional[str] = None,
184 | lang: Optional[str] = None
185 | ) -> wn.ILI:
186 | """Return the interlingual index with *id*.
187 |
188 | This will create a :class:`Wordnet` object using the *lang* and
189 | *lexicon* arguments. The *id* argument is then passed to the
190 | :meth:`Wordnet.ili` method.
191 |
192 | """
193 | return wn.Wordnet(lang=lang, lexicon=lexicon).ili(id=id)
194 |
195 |
196 | def ilis(
197 | status: Optional[str] = None,
198 | *,
199 | lexicon: Optional[str] = None,
200 | lang: Optional[str] = None,
201 | ) -> list[wn.ILI]:
202 | """Return the list of matching interlingual indices.
203 |
204 | This will create a :class:`Wordnet` object using the *lang* and
205 | *lexicon* arguments. The remaining arguments are passed to the
206 | :meth:`Wordnet.ilis` method.
207 |
208 | >>> len(wn.ilis())
209 | 120071
210 | >>> len(wn.ilis(status='proposed'))
211 | 2573
212 | >>> wn.ilis(status='proposed')[-1].definition()
213 | 'the neutrino associated with the tau lepton.'
214 |
215 | """
216 | return wn.Wordnet(lang=lang, lexicon=lexicon).ilis(status=status)
217 |
--------------------------------------------------------------------------------
/tests/secondary_query_test.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 |
4 | import wn
5 |
6 |
7 | @pytest.mark.usefixtures('mini_db')
8 | def test_word_senses():
9 | assert len(wn.word('test-en-information-n').senses()) == 1
10 | assert len(wn.word('test-es-información-n').senses()) == 1
11 |
12 |
13 | @pytest.mark.usefixtures('mini_db')
14 | def test_word_synsets():
15 | assert len(wn.word('test-en-information-n').synsets()) == 1
16 | assert len(wn.word('test-es-información-n').synsets()) == 1
17 |
18 |
19 | @pytest.mark.usefixtures('mini_db')
20 | def test_word_translate():
21 | assert len(wn.word('test-en-example-n').translate(lang='es')) == 1
22 | assert len(wn.word('test-es-ejemplo-n').translate(lang='en')) == 1
23 |
24 |
25 | @pytest.mark.usefixtures('mini_db')
26 | def test_sense_word():
27 | assert (wn.sense('test-en-information-n-0001-01').word()
28 | == wn.word('test-en-information-n'))
29 | assert (wn.sense('test-es-información-n-0001-01').word()
30 | == wn.word('test-es-información-n'))
31 |
32 |
33 | @pytest.mark.usefixtures('mini_db')
34 | def test_sense_synset():
35 | assert (wn.sense('test-en-information-n-0001-01').synset()
36 | == wn.synset('test-en-0001-n'))
37 | assert (wn.sense('test-es-información-n-0001-01').synset()
38 | == wn.synset('test-es-0001-n'))
39 |
40 |
41 | @pytest.mark.usefixtures('mini_db')
42 | def test_sense_issue_157():
43 | # https://github.com/goodmami/wn/issues/157
44 | sense = wn.sense('test-en-information-n-0001-01')
45 | # This test uses non-public members, which is not ideal, but there
46 | # is currently no better alternative.
47 | assert sense._lexconf is sense.word()._lexconf
48 | assert sense._lexconf is sense.synset()._lexconf
49 |
50 |
51 | @pytest.mark.usefixtures('mini_db')
52 | def test_sense_examples():
53 | assert wn.sense('test-en-information-n-0001-01').examples() == []
54 | assert wn.sense('test-es-información-n-0001-01').examples() == []
55 |
56 |
57 | @pytest.mark.usefixtures('mini_db')
58 | def test_sense_lexicalized():
59 | assert wn.sense('test-en-information-n-0001-01').lexicalized()
60 | assert wn.sense('test-es-información-n-0001-01').lexicalized()
61 |
62 |
63 | @pytest.mark.usefixtures('mini_db')
64 | def test_sense_frames():
65 | assert wn.sense('test-en-illustrate-v-0003-01').frames() == [
66 | 'Somebody ----s something',
67 | 'Something ----s something',
68 | ]
69 | assert wn.sense('test-es-ilustrar-v-0003-01').frames() == []
70 |
71 |
72 | @pytest.mark.usefixtures('mini_db_1_1')
73 | def test_sense_frames_issue_156():
74 | # https://github.com/goodmami/wn/issues/156
75 | assert wn.sense('test-ja-示す-v-0003-01').frames() == [
76 | 'ある人が何かを----',
77 | ]
78 | assert wn.sense('test-ja-事例-n-0002-01').frames() == []
79 |
80 |
81 | @pytest.mark.usefixtures('mini_db')
82 | def test_sense_translate():
83 | assert len(wn.sense('test-en-information-n-0001-01').translate(lang='es')) == 1
84 | assert len(wn.sense('test-es-información-n-0001-01').translate(lang='en')) == 1
85 |
86 |
87 | @pytest.mark.usefixtures('mini_db')
88 | def test_synset_senses():
89 | assert len(wn.synset('test-en-0003-v').senses()) == 2
90 | assert len(wn.synset('test-es-0003-v').senses()) == 2
91 |
92 |
93 | @pytest.mark.usefixtures('mini_db')
94 | def test_synset_words():
95 | assert len(wn.synset('test-en-0003-v').words()) == 2
96 | assert len(wn.synset('test-es-0003-v').words()) == 2
97 |
98 |
99 | @pytest.mark.usefixtures('mini_db')
100 | def test_synset_lemmas():
101 | assert wn.synset('test-en-0003-v').lemmas() == ['exemplify', 'illustrate']
102 | assert wn.synset('test-es-0003-v').lemmas() == ['ejemplificar', 'ilustrar']
103 |
104 |
105 | @pytest.mark.usefixtures('mini_db')
106 | def test_synset_ili():
107 | assert isinstance(wn.synset('test-en-0001-n').ili, wn.ILI)
108 | assert wn.synset('test-en-0001-n').ili.id == 'i67447'
109 | assert wn.synset('test-en-0001-n').ili.status == 'presupposed'
110 | assert wn.synset('test-en-0008-n').ili is None
111 | assert wn.synset('test-en-0007-v').ili.id is None
112 | assert wn.synset('test-en-0007-v').ili.status == 'proposed'
113 |
114 |
115 | @pytest.mark.usefixtures('mini_db')
116 | def test_synset_definition():
117 | assert wn.synset('test-en-0001-n').definition() == 'something that informs'
118 | defn = wn.synset('test-en-0001-n').definition(data=True)
119 | assert defn.source_sense_id == 'test-en-information-n-0001-01'
120 | assert wn.synset('test-es-0001-n').definition() == 'algo que informa'
121 |
122 |
123 | @pytest.mark.usefixtures('mini_db')
124 | def test_synset_definitions():
125 | assert wn.synset('test-en-0001-n').definitions() == ['something that informs']
126 | defns = wn.synset('test-en-0001-n').definitions(data=True)
127 | assert defns[0].source_sense_id == 'test-en-information-n-0001-01'
128 | assert wn.synset('test-es-0001-n').definitions() == ['algo que informa']
129 |
130 |
131 | @pytest.mark.usefixtures('mini_db')
132 | def test_synset_examples():
133 | assert wn.synset('test-en-0001-n').examples() == ['"this is information"']
134 | ex = wn.synset('test-en-0001-n').examples(data=True)[0]
135 | assert ex.text == '"this is information"'
136 | assert wn.synset('test-es-0001-n').examples() == ['"este es la información"']
137 |
138 |
139 | @pytest.mark.usefixtures('mini_db')
140 | def test_synset_lexicalized():
141 | assert wn.synset('test-en-0001-n').lexicalized()
142 | assert wn.synset('test-es-0001-n').lexicalized()
143 |
144 |
145 | @pytest.mark.usefixtures('mini_db')
146 | def test_synset_translate():
147 | assert len(wn.synset('test-en-0001-n').translate(lang='es')) == 1
148 | assert len(wn.synset('test-es-0001-n').translate(lang='en')) == 1
149 |
150 |
151 | @pytest.mark.usefixtures('uninitialized_datadir')
152 | def test_word_sense_order(datadir):
153 | wn.add(datadir / 'sense-member-order.xml')
154 | assert [s.id for s in wn.word('test-foo-n').senses()] == [
155 | "test-01-foo-n", "test-02-foo-n",
156 | ]
157 | assert [s.id for s in wn.word('test-bar-n').senses()] == [
158 | "test-02-bar-n", "test-01-bar-n",
159 | ]
160 |
161 |
162 | @pytest.mark.usefixtures('uninitialized_datadir')
163 | def test_synset_member_order(datadir):
164 | wn.add(datadir / 'sense-member-order.xml')
165 | assert [s.id for s in wn.synset('test-01-n').senses()] == [
166 | "test-01-bar-n", "test-01-foo-n",
167 | ]
168 | assert [s.id for s in wn.synset('test-02-n').senses()] == [
169 | "test-02-bar-n", "test-02-foo-n",
170 | ]
171 |
--------------------------------------------------------------------------------
/tests/relations_test.py:
--------------------------------------------------------------------------------
1 |
2 | import pytest
3 |
4 | import wn
5 |
6 |
7 | @pytest.mark.usefixtures('mini_db')
8 | def test_word_derived_words():
9 | assert len(wn.word('test-en-example-n').derived_words()) == 1
10 | assert len(wn.word('test-es-ejemplo-n').derived_words()) == 1
11 |
12 |
13 | @pytest.mark.usefixtures('mini_db')
14 | def test_synset_hypernyms():
15 | assert wn.synset('test-en-0002-n').hypernyms() == [
16 | wn.synset('test-en-0001-n')
17 | ]
18 | assert wn.synset('test-en-0001-n').hypernyms() == []
19 |
20 |
21 | @pytest.mark.usefixtures('mini_db')
22 | def test_synset_hypernyms_expand_default():
23 | assert wn.synset('test-es-0002-n').hypernyms() == [
24 | wn.synset('test-es-0001-n')
25 | ]
26 | assert wn.synset('test-es-0001-n').hypernyms() == []
27 |
28 |
29 | @pytest.mark.usefixtures('mini_db')
30 | def test_synset_hypernyms_expand_empty():
31 | w = wn.Wordnet(lang='es', expand='')
32 | assert w.synset('test-es-0002-n').hypernyms() == []
33 |
34 |
35 | @pytest.mark.usefixtures('mini_db')
36 | def test_synset_hypernyms_expand_specified():
37 | w = wn.Wordnet(lang='es', expand='test-en')
38 | assert w.synset('test-es-0002-n').hypernyms() == [
39 | w.synset('test-es-0001-n')
40 | ]
41 |
42 |
43 | @pytest.mark.usefixtures('mini_db')
44 | def test_synset_relations():
45 | w = wn.Wordnet(lang='en')
46 | assert w.synset('test-en-0002-n').relations() == {
47 | 'hypernym': [w.synset('test-en-0001-n')],
48 | 'hyponym': [w.synset('test-en-0004-n')]
49 | }
50 |
51 |
52 | @pytest.mark.usefixtures('mini_db')
53 | def test_sense_get_related():
54 | w = wn.Wordnet('test-en')
55 | assert w.sense('test-en-example-n-0002-01').get_related() == [
56 | w.sense('test-en-exemplify-v-0003-01')
57 | ]
58 |
59 |
60 | @pytest.mark.usefixtures('mini_db')
61 | def test_sense_relations():
62 | w = wn.Wordnet('test-en')
63 | assert w.sense('test-en-example-n-0002-01').relations() == {
64 | 'derivation': [w.sense('test-en-exemplify-v-0003-01')]
65 | }
66 |
67 |
68 | @pytest.mark.usefixtures('mini_db_1_1')
69 | def test_extension_relations():
70 | # default mode
71 | assert wn.synset('test-en-0007-v').hypernyms() == [
72 | wn.synset('test-en-ext-0009-v')
73 | ]
74 | assert wn.synset('test-en-ext-0009-v').hyponyms() == [
75 | wn.synset('test-en-0007-v')
76 | ]
77 | assert wn.sense('test-en-information-n-0001-01').get_related('pertainym') == [
78 | wn.sense('test-en-ext-info-n-0001-01')
79 | ]
80 | assert wn.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [
81 | wn.sense('test-en-information-n-0001-01')
82 | ]
83 |
84 | # restricted to base
85 | w = wn.Wordnet(lexicon='test-en')
86 | assert w.synset('test-en-0007-v').hypernyms() == []
87 | assert w.sense('test-en-information-n-0001-01').get_related('pertainym') == []
88 |
89 | # base and extension
90 | w = wn.Wordnet(lexicon='test-en test-en-ext')
91 | assert w.synset('test-en-0007-v').hypernyms() == [
92 | w.synset('test-en-ext-0009-v')
93 | ]
94 | assert w.synset('test-en-ext-0009-v').hyponyms() == [
95 | w.synset('test-en-0007-v')
96 | ]
97 | assert w.sense('test-en-information-n-0001-01').get_related('pertainym') == [
98 | w.sense('test-en-ext-info-n-0001-01')
99 | ]
100 | assert w.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [
101 | w.sense('test-en-information-n-0001-01')
102 | ]
103 |
104 | # restricted to extension
105 | w = wn.Wordnet(lexicon='test-en-ext')
106 | assert w.synset('test-en-ext-0009-v').hyponyms() == []
107 | assert w.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == []
108 |
109 |
110 | @pytest.mark.usefixtures('mini_db_1_1')
111 | def test_sense_synset_issue_168():
112 | # https://github.com/goodmami/wn/issues/168
113 | ja = wn.Wordnet(lexicon='test-ja', expand='')
114 | assert ja.synset('test-ja-0001-n').get_related() == []
115 | assert ja.sense('test-ja-情報-n-0001-01').synset().get_related() == []
116 |
117 |
118 | @pytest.mark.usefixtures('mini_db')
119 | def test_synset_relations_issue_169():
120 | # https://github.com/goodmami/wn/issues/169
121 | en = wn.Wordnet('test-en')
122 | assert list(en.synset("test-en-0001-n").relations('hyponym')) == ['hyponym']
123 | es = wn.Wordnet('test-es', expand='test-en')
124 | assert list(es.synset("test-es-0001-n").relations('hyponym')) == ['hyponym']
125 |
126 |
127 | @pytest.mark.usefixtures('mini_db')
128 | def test_synset_relations_issue_177():
129 | # https://github.com/goodmami/wn/issues/177
130 | assert 'hyponym' in wn.synset('test-es-0001-n').relations()
131 |
132 |
133 | @pytest.mark.usefixtures('mini_db')
134 | def test_sense_relation_map():
135 | en = wn.Wordnet('test-en')
136 | assert en.sense('test-en-information-n-0001-01').relation_map() == {}
137 | relmap = en.sense('test-en-illustrate-v-0003-01').relation_map()
138 | # only sense-sense relations by default
139 | assert len(relmap) == 3
140 | assert all(isinstance(tgt, wn.Sense) for tgt in relmap.values())
141 | assert {rel.name for rel in relmap} == {'derivation', 'other'}
142 | assert {rel.target_id for rel in relmap} == {'test-en-illustration-n-0002-01'}
143 | # sense relations targets should always have same ids as resolved targets
144 | assert all(rel.target_id == tgt.id for rel, tgt in relmap.items())
145 |
146 |
147 | @pytest.mark.usefixtures('mini_db')
148 | def test_synset_relation_map():
149 | en = wn.Wordnet('test-en')
150 | assert en.synset('test-en-0003-v').relation_map() == {}
151 | relmap = en.synset('test-en-0002-n').relation_map()
152 | assert len(relmap) == 2
153 | assert {rel.name for rel in relmap} == {'hypernym', 'hyponym'}
154 | assert {rel.target_id for rel in relmap} == {'test-en-0001-n', 'test-en-0004-n'}
155 | # synset relation targets have same ids as resolved targets in same lexicon
156 | assert all(rel.target_id == tgt.id for rel, tgt in relmap.items())
157 | assert all(rel.lexicon().id == 'test-en' for rel in relmap)
158 |
159 | # interlingual synset relation targets show original target ids
160 | es = wn.Wordnet('test-es', expand='test-en')
161 | relmap = es.synset('test-es-0002-n').relation_map()
162 | assert len(relmap) == 2
163 | assert {rel.name for rel in relmap} == {'hypernym', 'hyponym'}
164 | assert {rel.target_id for rel in relmap} == {'test-en-0001-n', 'test-en-0004-n'}
165 | assert all(rel.target_id != tgt.id for rel, tgt in relmap.items())
166 | assert all(rel.lexicon().id == 'test-en' for rel in relmap)
167 |
--------------------------------------------------------------------------------
/docs/api/wn.similarity.rst:
--------------------------------------------------------------------------------
1 | wn.similarity
2 | =============
3 |
4 | .. automodule:: wn.similarity
5 |
6 | Taxonomy-based Metrics
7 | ----------------------
8 |
9 | The Path, Leacock-Chodorow, and Wu-Palmer similarity
11 | metrics work by finding path distances in the hypernym/hyponym
12 | taxonomy. As such, they are most useful when the synsets are, in fact,
13 | arranged in a taxonomy. For the Princeton WordNet and derivative
14 | wordnets, such as the `Open English Wordnet`_ and `OMW English Wordnet
15 | based on WordNet 3.0`_ available to Wn, synsets for nouns and verbs
16 | are arranged taxonomically: the nouns mostly form a single structure
17 | with a single root while verbs form many smaller structures with many
18 | roots. Synsets for the other parts of speech do not use
19 | hypernym/hyponym relations at all. This situation may be different for
20 | other wordnet projects or future versions of the English wordnets.
21 |
22 | .. _Open English Wordnet: https://en-word.net
23 | .. _OMW English Wordnet based on WordNet 3.0: https://github.com/omwn/omw-data
24 |
25 | The similarity metrics tend to fail when the synsets are not connected
26 | by some path. When the synsets are in different parts of speech, or
27 | even in separate lexicons, this failure is acceptable and
28 | expected. But for cases like the verbs in the Princeton WordNet, it
29 | might be more useful to pretend that there is some unique root for all
30 | verbs so as to create a path connecting any two of them. For this
31 | purpose, the *simulate_root* parameter is available on the
32 | :func:`path`, :func:`lch`, and :func:`wup` functions, where it is
33 | passed on to calls to :meth:`wn.Synset.shortest_path` and
34 | :meth:`wn.Synset.lowest_common_hypernyms`. Setting *simulate_root* to
35 | :python:`True` can, however, give surprising results if the synsets
36 | are from different lexicons. Currently, computing similarity for
37 | synsets with different parts of speech raises an error.
38 |
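   | For illustration, a minimal sketch of using *simulate_root* (the
   | lexicon specifier and lemmas below are assumptions, not fixed
   | values):
   |
   | .. code-block:: python
   |
   |    import wn
   |    from wn import similarity
   |
   |    ewn = wn.Wordnet('oewn:2024')  # any installed lexicon
   |    buy = ewn.synsets('buy', pos='v')[0]
   |    sleep = ewn.synsets('sleep', pos='v')[0]
   |    # verbs form many small taxonomies; pretend they share a root
   |    similarity.path(buy, sleep, simulate_root=True)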
39 |
40 | Path Similarity
41 | '''''''''''''''
42 |
43 | When :math:`p` is the length of the shortest path between two synsets,
44 | the path similarity is:
45 |
46 | .. math::
47 |
48 | \frac{1}{p + 1}
49 |
50 | The similarity score ranges between 0.0 and 1.0, where the higher the
51 | score is, the more similar the synsets are. The score is 1.0 when a
52 | synset is compared to itself, and 0.0 when there is no path between
53 | the two synsets (i.e., the path distance is infinite).
54 |
55 | .. autofunction:: path
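   |
   | A quick usage sketch (the lexicon specifier and words are
   | illustrative assumptions):
   |
   | .. code-block:: python
   |
   |    import wn
   |    from wn import similarity
   |
   |    ewn = wn.Wordnet('oewn:2024')      # any installed lexicon
   |    dog = ewn.synsets('dog', pos='n')[0]
   |    cat = ewn.synsets('cat', pos='n')[0]
   |    score = similarity.path(dog, cat)  # 1 / (shortest path length + 1)
   |    assert 0.0 <= score <= 1.0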
56 |
57 |
58 | .. _leacock-chodorow-similarity:
59 |
60 | Leacock-Chodorow Similarity
61 | '''''''''''''''''''''''''''
62 |
63 | When :math:`p` is the length of the shortest path between two synsets
64 | and :math:`d` is the maximum taxonomy depth, the Leacock-Chodorow
65 | similarity is:
66 |
67 | .. math::
68 |
69 | -\text{log}\left(\frac{p + 1}{2d}\right)
70 |
71 | .. autofunction:: lch
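   |
   | Because :func:`lch` needs the maximum taxonomy depth, it is
   | typically paired with :func:`wn.taxonomy.taxonomy_depth`; a sketch
   | (the lexicon specifier and words are illustrative assumptions):
   |
   | .. code-block:: python
   |
   |    import wn
   |    from wn import similarity
   |    from wn.taxonomy import taxonomy_depth
   |
   |    ewn = wn.Wordnet('oewn:2024')  # any installed lexicon
   |    max_depth = taxonomy_depth(ewn, 'n')
   |    dog = ewn.synsets('dog', pos='n')[0]
   |    cat = ewn.synsets('cat', pos='n')[0]
   |    similarity.lch(dog, cat, max_depth)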
72 |
73 |
74 | Wu-Palmer Similarity
75 | ''''''''''''''''''''
76 |
77 | When *LCS* is the lowest common hypernym (also called "least common
78 | subsumer") between two synsets, :math:`i` is the shortest path
79 | distance from the first synset to *LCS*, :math:`j` is the shortest
80 | path distance from the second synset to *LCS*, and :math:`k` is the
81 | number of nodes (distance + 1) from *LCS* to the root node, then the
82 | Wu-Palmer similarity is:
83 |
84 | .. math::
85 |
86 | \frac{2k}{i + j + 2k}
87 |
88 | .. autofunction:: wup
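   |
   | A sketch relating :func:`wup` to the lowest common hypernym (the
   | lexicon specifier and words are illustrative assumptions):
   |
   | .. code-block:: python
   |
   |    import wn
   |    from wn import similarity
   |
   |    ewn = wn.Wordnet('oewn:2024')              # any installed lexicon
   |    dog = ewn.synsets('dog', pos='n')[0]
   |    cat = ewn.synsets('cat', pos='n')[0]
   |    lcs = dog.lowest_common_hypernyms(cat)[0]  # shared ancestor used above
   |    similarity.wup(dog, cat)                   # 2k / (i + j + 2k)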
89 |
90 |
91 | Information Content-based Metrics
92 | ---------------------------------
93 |
94 | The Resnik, Jiang-Conrath, and Lin similarity metrics work
96 | by computing the information content of the synsets and/or that of
97 | their lowest common hypernyms. They therefore require information
98 | content weights (see :mod:`wn.ic`), and the values returned
99 | necessarily depend on the weights used.
100 |
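   | As a sketch of the shared calling pattern (the lexicon specifier and
   | toy corpus below are assumptions for illustration):
   |
   | .. code-block:: python
   |
   |    import wn
   |    from wn import ic, similarity
   |
   |    ewn = wn.Wordnet('oewn:2024')  # any installed lexicon
   |    freq = ic.compute(['example', 'sample', 'data'], ewn)  # toy corpus
   |    s1 = ewn.synsets('example', pos='n')[0]
   |    s2 = ewn.synsets('sample', pos='n')[0]
   |    similarity.res(s1, s2, freq)
   |    similarity.jcn(s1, s2, freq)
   |    similarity.lin(s1, s2, freq)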
101 |
102 | Resnik Similarity
103 | '''''''''''''''''
104 |
105 | The Resnik similarity (Resnik 1995) is the maximum
107 | information content value of the common subsumers (hypernym ancestors)
108 | of the two synsets. Formally it is defined as follows, where :math:`c_1` and
109 | :math:`c_2` are the two synsets compared and :math:`\text{S}(c_1, c_2)` is the set of their common subsumers.
110 |
111 | .. math::
112 |
113 | \text{max}_{c \in \text{S}(c_1, c_2)} \text{IC}(c)
114 |
115 | Since a synset's information content is always equal to or greater
116 | than the information content of its hypernyms, the maximum over
117 | :math:`S(c_1, c_2)` above is more efficiently computed using the
118 | lowest common hypernyms instead of all common hypernyms.
119 |
120 | .. autofunction:: res
121 |
122 |
123 | Jiang-Conrath Similarity
124 | ''''''''''''''''''''''''
125 |
126 | The Jiang-Conrath similarity metric (Jiang and Conrath, 1997)
127 | combines the ideas
128 | of the taxonomy-based and information content-based metrics. It is
129 | defined as follows, where :math:`c_1` and :math:`c_2` are the two
130 | synsets being compared and :math:`c_0` is the lowest common hypernym
131 | of the two with the highest information content weight:
132 |
133 | .. math::
134 |
135 | \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) - 2(\text{IC}(c_0))}
136 |
137 | This equation is the simplified form given in the paper, where
138 | several parameterized terms are cancelled out; the full form is not
139 | often used in practice.
140 |
141 | There are two special cases:
142 |
143 | 1. If the information content of :math:`c_0`, :math:`c_1`, and
144 | :math:`c_2` are all zero, the metric returns zero. This occurs when
145 | both :math:`c_1` and :math:`c_2` are the root node, but it can also
146 | occur if the synsets did not occur in the corpus and the smoothing
147 | value was set to zero.
148 |
149 | 2. Otherwise, if :math:`\text{IC}(c_1) + \text{IC}(c_2) = 2\,\text{IC}(c_0)`, the metric returns
150 | infinity. This occurs when the two synsets are the same, one is a
151 | descendant of the other, etc., such that they have the same
152 | frequency as each other and as their lowest common hypernym.
153 |
154 | .. autofunction:: jcn
155 |
156 |
157 | Lin Similarity
158 | ''''''''''''''
159 |
160 | Another formulation of information content-based similarity is the Lin
161 | metric (Lin 1997),
162 | which is defined as follows, where :math:`c_1` and :math:`c_2` are the
163 | two synsets being compared and :math:`c_0` is the lowest common
164 | hypernym with the highest information content weight:
165 |
166 | .. math::
167 |
168 |    \frac{2(\text{IC}(c_0))}{\text{IC}(c_1) + \text{IC}(c_2)}
169 |
170 | One special case is if either synset has an information content value
171 | of zero, in which case the metric returns zero.
172 |
173 | .. autofunction:: lin
174 |
--------------------------------------------------------------------------------
/wn/util.py:
--------------------------------------------------------------------------------
1 | """Wn utility classes."""
2 | from collections.abc import Callable
3 | from typing import TextIO
4 | import sys
5 |
6 |
7 | def synset_id_formatter(
8 | fmt: str = '{prefix}-{offset:08}-{pos}',
9 | **kwargs
10 | ) -> Callable:
11 | """Return a function for formatting synset ids.
12 |
13 | The *fmt* argument can be customized. It will be formatted using
14 | any other keyword arguments given to this function and any given
15 | to the resulting function. By default, the format string expects a
16 | ``prefix`` string argument for the namespace (such as a lexicon
17 | id), an ``offset`` integer argument (such as a WNDB offset), and a
18 | ``pos`` string argument.
19 |
20 | Arguments:
21 | fmt: A Python format string
22 | **kwargs: Keyword arguments for the format string.
23 |
24 | Example:
25 |
26 | >>> pwn_synset_id = synset_id_formatter(prefix='pwn')
27 | >>> pwn_synset_id(offset=1174, pos='n')
28 | 'pwn-00001174-n'
29 |
30 | """
31 |
32 | def format_synset_id(**_kwargs) -> str:
33 | return fmt.format(**kwargs, **_kwargs)
34 |
35 | return format_synset_id
36 |
37 |
38 | class ProgressHandler:
39 | """An interface for updating progress in long-running processes.
40 |
41 | Long-running processes in Wn, such as :func:`wn.download` and
42 | :func:`wn.add`, call to a progress handler object as they go. The
43 | default progress handler used by Wn is :class:`ProgressBar`, which
44 | updates progress by formatting and printing a textual bar to
45 |     stderr. The :class:`ProgressHandler` class may be used directly, in
46 |     which case it does nothing, or users may create their own subclasses
47 |     for, e.g., updating a GUI or some other progress indicator.
48 |
49 | The initialization parameters, except for ``file``, are stored in
50 | a :attr:`kwargs` member and may be updated after the handler is
51 | created through the :meth:`set` method. The :meth:`update` method
52 | is the primary way a counter is updated. The :meth:`flash` method
53 | is sometimes called for simple messages. When the process is
54 | complete, the :meth:`close` method is called, optionally with a
55 | message.
56 |
57 | """
58 |
59 | def __init__(
60 | self,
61 | *,
62 | message: str = '',
63 | count: int = 0,
64 | total: int = 0,
65 | refresh_interval: int = 0,
66 | unit: str = '',
67 | status: str = '',
68 | file: TextIO = sys.stderr,
69 | ):
70 | self.file = file
71 | self.kwargs = {
72 | 'count': count,
73 | 'total': total,
74 | 'refresh_interval': refresh_interval,
75 | 'message': message,
76 | 'unit': unit,
77 | 'status': status,
78 | }
79 | self._refresh_quota: int = refresh_interval
80 |
81 | def update(self, n: int = 1, force: bool = False) -> None:
82 | """Update the counter with the increment value *n*.
83 |
84 | This method should update the ``count`` key of :attr:`kwargs`
85 | with the increment value *n*. After this, it is expected to
86 | update some user-facing progress indicator.
87 |
88 | If *force* is :python:`True`, any indicator will be refreshed
89 | regardless of the value of the refresh interval.
90 |
91 | """
92 | self.kwargs['count'] += n # type: ignore
93 |
94 | def set(self, **kwargs) -> None:
95 | """Update progress handler parameters.
96 |
97 | Calling this method also runs :meth:`update` with an increment
98 | of 0, which causes a refresh of any indicator without changing
99 | the counter.
100 |
101 | """
102 | self.kwargs.update(**kwargs)
103 | self.update(0, force=True)
104 |
105 | def flash(self, message: str) -> None:
106 | """Issue a message unrelated to the current counter.
107 |
108 | This may be useful for multi-stage processes to indicate the
109 | move to a new stage, or to log unexpected situations.
110 |
111 | """
112 | pass
113 |
114 | def close(self) -> None:
115 | """Close the progress handler.
116 |
117 | This might be useful for closing file handles or cleaning up
118 | resources.
119 |
120 | """
121 | pass
122 |
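   | # A minimal sketch of a custom handler (illustrative only; not part
   | # of the library API): it forwards a completion fraction to a
   | # user-supplied callback, e.g. to drive a GUI progress widget.
   | #
   | #     class CallbackHandler(ProgressHandler):
   | #         def __init__(self, callback: Callable[[float], None], **kwargs):
   | #             super().__init__(**kwargs)
   | #             self._callback = callback
   | #
   | #         def update(self, n: int = 1, force: bool = False) -> None:
   | #             super().update(n, force=force)  # updates kwargs['count']
   | #             total = self.kwargs['total']
   | #             if total:
   | #                 self._callback(self.kwargs['count'] / total)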
123 |
124 | class ProgressBar(ProgressHandler):
125 | """A :class:`ProgressHandler` subclass for printing a progress bar.
126 |
127 | Example:
128 | >>> p = ProgressBar(message='Progress: ', total=10, unit=' units')
129 | >>> p.update(3)
130 | Progress: [######### ] (3/10 units)
131 |
132 | See :meth:`format` for a description of how the progress bar is
133 | formatted.
134 |
135 | """
136 |
137 | #: The default formatting template.
138 | FMT = '\r{message}{bar}{counter}{status}'
139 |
140 | def update(self, n: int = 1, force: bool = False) -> None:
141 | """Increment the count by *n* and print the reformatted bar."""
142 | self.kwargs['count'] += n # type: ignore
143 | self._refresh_quota -= n
144 | if force or self._refresh_quota <= 0:
145 | self._refresh_quota = self.kwargs['refresh_interval'] # type: ignore
146 | s = self.format()
147 | if self.file:
148 | print('\r\033[K', end='', file=self.file)
149 | print(s, end='', file=self.file)
150 |
151 | def format(self) -> str:
152 | """Format and return the progress bar.
153 |
154 |         The bar is formatted according to :attr:`FMT`, using
155 | variables from :attr:`kwargs` and two computed variables:
156 |
157 | - ``bar``: visualization of the progress bar, empty when
158 | ``total`` is 0
159 |
160 |         - ``counter``: display of ``count``, ``total``, and ``unit``
161 |
162 | >>> p = ProgressBar(message='Progress', count=2, total=10, unit='K')
163 | >>> p.format()
164 | '\\rProgress [###### ] (2/10K) '
165 | >>> p = ProgressBar(count=2, status='Counting...')
166 | >>> p.format()
167 | '\\r (2) Counting...'
168 |
169 | """
170 | _kw = self.kwargs
171 | width = 30
172 | total: int = _kw['total'] # type: ignore
173 | count: int = _kw['count'] # type: ignore
174 |
175 | if total > 0:
176 | num = min(count, total) * width
177 | fill = (num // total) * '#'
178 | part = ((num % total) * 3) // total
179 | if part:
180 | fill += '-='[part-1]
181 | bar = f' [{fill:<{width}}]'
182 | counter = f' ({count}/{total}{_kw["unit"]}) '
183 | else:
184 | bar = ''
185 | counter = f' ({count}{_kw["unit"]}) '
186 |
187 | return self.FMT.format(bar=bar, counter=counter, **_kw)
188 |
189 | def flash(self, message: str) -> None:
190 | """Overwrite the progress bar with *message*."""
191 | print(f'\r\033[K{message}', end='', file=self.file)
192 |
193 | def close(self) -> None:
194 | """Print a newline so the last printed bar remains on screen."""
195 | print(file=self.file)
196 |
--------------------------------------------------------------------------------
/tests/similarity_test.py:
--------------------------------------------------------------------------------
1 |
2 | from math import log
3 |
4 | import pytest
5 |
6 | import wn
7 | from wn import similarity as sim
8 | from wn.taxonomy import taxonomy_depth
9 | from wn.ic import information_content as infocont
10 |
11 |
12 | def get_synsets(w):
13 | return {
14 | 'information': w.synset('test-en-0001-n'),
15 | 'example': w.synset('test-en-0002-n'),
16 | 'sample': w.synset('test-en-0004-n'),
17 | 'random sample': w.synset('test-en-0005-n'),
18 | 'random sample2': w.synset('test-en-0008-n'),
19 | 'datum': w.synset('test-en-0006-n'),
20 | 'exemplify': w.synset('test-en-0003-v'),
21 | }
22 |
23 |
24 | # some fake information content; computed using:
25 | # words = ['example', 'example', 'sample', 'random sample', 'illustrate']
26 | # ic = compute(words, wn.Wordnet('test-en'), distribute_weight=False)
27 |
28 | ic = {
29 | 'n': {'test-en-0001-n': 5.0, # information
30 | 'test-en-0002-n': 5.0, # example, illustration
31 | 'test-en-0004-n': 3.0, # sample
32 | 'test-en-0005-n': 2.0, # random sample
33 | 'test-en-0008-n': 2.0, # random sample 2
34 | 'test-en-0006-n': 1.0, # datum
35 | None: 6.0},
36 | 'v': {'test-en-0003-v': 2.0, # exemplify, illustrate
37 | 'test-en-0007-v': 1.0, # resignate
38 | None: 2.0},
39 | 'a': {None: 1.0},
40 | 'r': {None: 1.0}
41 | }
42 |
43 |
44 | @pytest.mark.usefixtures('mini_db')
45 | def test_path():
46 | ss = get_synsets(wn.Wordnet('test-en'))
47 | assert sim.path(ss['information'], ss['information']) == 1/1
48 | assert sim.path(ss['information'], ss['example']) == 1/2
49 | assert sim.path(ss['information'], ss['sample']) == 1/3
50 | assert sim.path(ss['information'], ss['random sample']) == 1/4
51 | assert sim.path(ss['random sample'], ss['datum']) == 1/5
52 | assert sim.path(ss['random sample2'], ss['datum']) == 0
53 | assert sim.path(ss['random sample2'], ss['datum'], simulate_root=True) == 1/4
54 | assert sim.path(
55 | ss['random sample'], ss['random sample2'], simulate_root=True
56 | ) == 1/6
57 | with pytest.raises(wn.Error):
58 | sim.path(ss['example'], ss['exemplify'])
59 | with pytest.raises(wn.Error):
60 | sim.wup(ss['example'], ss['exemplify'], simulate_root=True)
61 |
62 |
63 | @pytest.mark.usefixtures('mini_db')
64 | def test_wup():
65 | ss = get_synsets(wn.Wordnet('test-en'))
66 | assert sim.wup(ss['information'], ss['information']) == (2*1) / (0+0+2*1)
67 | assert sim.wup(ss['information'], ss['example']) == (2*1) / (0+1+2*1)
68 | assert sim.wup(ss['information'], ss['sample']) == (2*1) / (0+2+2*1)
69 | assert sim.wup(ss['information'], ss['random sample']) == (2*1) / (0+3+2*1)
70 | assert sim.wup(ss['random sample'], ss['datum']) == (2*1) / (3+1+2*1)
71 | with pytest.raises(wn.Error):
72 | assert sim.wup(ss['random sample2'], ss['datum'])
73 | assert (sim.wup(ss['random sample2'], ss['datum'], simulate_root=True)
74 | == (2*1) / (1+2+2*1))
75 | assert (sim.wup(ss['random sample'], ss['random sample2'], simulate_root=True)
76 | == (2*1) / (4+1+2*1))
77 | with pytest.raises(wn.Error):
78 | sim.wup(ss['example'], ss['exemplify'])
79 | with pytest.raises(wn.Error):
80 | sim.wup(ss['example'], ss['exemplify'], simulate_root=True)
81 |
82 |
83 | @pytest.mark.usefixtures('mini_db')
84 | def test_lch():
85 | w = wn.Wordnet('test-en')
86 | ss = get_synsets(w)
87 | d_n = taxonomy_depth(w, 'n')
88 | assert sim.lch(ss['information'], ss['information'], d_n) == -log((0+1) / (2*d_n))
89 | assert sim.lch(ss['information'], ss['example'], d_n) == -log((1+1) / (2*d_n))
90 | assert sim.lch(ss['information'], ss['sample'], d_n) == -log((2+1) / (2*d_n))
91 | assert sim.lch(ss['information'], ss['random sample'], d_n) == -log((3+1) / (2*d_n))
92 | assert sim.lch(ss['random sample'], ss['datum'], d_n) == -log((4+1) / (2*d_n))
93 | with pytest.raises(wn.Error):
94 | assert sim.lch(ss['random sample2'], ss['datum'], d_n)
95 | assert (sim.lch(ss['random sample2'], ss['datum'], d_n, simulate_root=True)
96 | == -log((3+1) / (2*d_n)))
97 | assert (sim.lch(ss['random sample'], ss['random sample2'], d_n, simulate_root=True)
98 | == -log((5+1) / (2*d_n)))
99 | with pytest.raises(wn.Error):
100 | sim.lch(ss['example'], ss['exemplify'], d_n)
101 | with pytest.raises(wn.Error):
102 | sim.lch(ss['example'], ss['exemplify'], d_n, simulate_root=True)
103 |
104 |
105 | @pytest.mark.usefixtures('mini_db')
106 | def test_res():
107 | w = wn.Wordnet('test-en')
108 | ss = get_synsets(w)
109 | assert (sim.res(ss['information'], ss['information'], ic)
110 | == infocont(ss['information'], ic))
111 | assert (sim.res(ss['information'], ss['example'], ic)
112 | == infocont(ss['information'], ic))
113 | assert (sim.res(ss['information'], ss['sample'], ic)
114 | == infocont(ss['information'], ic))
115 | assert (sim.res(ss['information'], ss['random sample'], ic)
116 | == infocont(ss['information'], ic))
117 | assert (sim.res(ss['random sample'], ss['datum'], ic)
118 | == infocont(ss['information'], ic))
119 | with pytest.raises(wn.Error):
120 | sim.res(ss['random sample2'], ss['datum'], ic)
121 | with pytest.raises(wn.Error):
122 | sim.res(ss['example'], ss['exemplify'], ic)
123 |
124 |
125 | @pytest.mark.usefixtures('mini_db')
126 | def test_jcn():
127 | w = wn.Wordnet('test-en')
128 | ss = get_synsets(w)
129 | info_ic = infocont(ss['information'], ic)
130 | assert (sim.jcn(ss['information'], ss['information'], ic)
131 | == float('inf'))
132 | assert (sim.jcn(ss['information'], ss['example'], ic)
133 | == float('inf'))
134 | assert (sim.jcn(ss['information'], ss['sample'], ic)
135 | == 1 / ((info_ic + infocont(ss['sample'], ic)) - 2 * info_ic))
136 | assert (sim.jcn(ss['information'], ss['random sample'], ic)
137 | == 1 / ((info_ic + infocont(ss['random sample'], ic)) - 2 * info_ic))
138 | assert (sim.jcn(ss['random sample'], ss['datum'], ic)
139 | == 1 / (
140 | (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic))
141 | - 2 * info_ic))
142 | with pytest.raises(wn.Error):
143 | sim.jcn(ss['random sample2'], ss['datum'], ic)
144 | with pytest.raises(wn.Error):
145 | sim.jcn(ss['example'], ss['exemplify'], ic)
146 |
147 |
148 | @pytest.mark.usefixtures('mini_db')
149 | def test_lin():
150 | w = wn.Wordnet('test-en')
151 | ss = get_synsets(w)
152 | info_ic = infocont(ss['information'], ic)
153 | assert (sim.lin(ss['information'], ss['information'], ic)
154 | == 1.0)
155 | assert (sim.lin(ss['information'], ss['example'], ic)
156 | == 1.0)
157 | assert (sim.lin(ss['information'], ss['sample'], ic)
158 | == (2 * info_ic) / (info_ic + infocont(ss['sample'], ic)))
159 | assert (sim.lin(ss['information'], ss['random sample'], ic)
160 | == (2 * info_ic) / (info_ic + infocont(ss['random sample'], ic)))
161 | assert (sim.lin(ss['random sample'], ss['datum'], ic)
162 | == ((2 * info_ic)
163 | / (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic))))
164 | with pytest.raises(wn.Error):
165 | sim.lin(ss['random sample2'], ss['datum'], ic)
166 | with pytest.raises(wn.Error):
167 | sim.lin(ss['example'], ss['exemplify'], ic)
168 |
--------------------------------------------------------------------------------
/wn/ic.py:
--------------------------------------------------------------------------------
1 |
2 | """Information Content is a corpus-based metrics of synset or sense
3 | specificity.
4 |
5 | """
6 |
7 | from typing import Optional, TextIO
8 | from pathlib import Path
9 | from collections import Counter
10 | from collections.abc import Callable, Iterable, Iterator
11 | from math import log
12 |
13 | from wn._types import AnyPath
14 | from wn._core import Synset, Wordnet
15 | from wn.constants import NOUN, VERB, ADJ, ADV, ADJ_SAT
16 | from wn.util import synset_id_formatter
17 |
18 |
19 | # Just use a subset of all available parts of speech
20 | IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV))
21 | Freq = dict[str, dict[Optional[str], float]]
22 |
23 |
24 | def information_content(synset: Synset, freq: Freq) -> float:
25 | """Calculate the Information Content value for a synset.
26 |
27 | The information content of a synset is the negative log of the
28 | synset probability (see :func:`synset_probability`).
29 |
30 | """
31 | return -log(synset_probability(synset, freq))
32 |
33 |
34 | def synset_probability(synset: Synset, freq: Freq) -> float:
35 | """Calculate the synset probability.
36 |
37 | The synset probability is defined as freq(ss)/N where freq(ss) is
38 | the IC weight for the synset and N is the total IC weight for all
39 | synsets with the same part of speech.
40 |
41 | Note: this function is not generally used directly, but indirectly
42 | through :func:`information_content`.
43 |
44 | """
45 | pos_freq = freq[synset.pos]
46 | return pos_freq[synset.id] / pos_freq[None]
47 |
48 |
49 | def _initialize(
50 | wordnet: Wordnet,
51 | smoothing: float,
52 | ) -> Freq:
53 | """Populate an Information Content weight mapping to a smoothing value.
54 |
55 | All synsets in *wordnet* are inserted into the dictionary and
56 | mapped to *smoothing*.
57 |
58 | """
59 | freq: Freq = {
60 | pos: {synset.id: smoothing for synset in wordnet.synsets(pos=pos)}
61 | for pos in IC_PARTS_OF_SPEECH
62 | }
63 | # pretend ADJ_SAT is just ADJ
64 | for synset in wordnet.synsets(pos=ADJ_SAT):
65 | freq[ADJ][synset.id] = smoothing
66 | # also initialize totals (when synset is None) for each part-of-speech
67 | for pos in IC_PARTS_OF_SPEECH:
68 | freq[pos][None] = smoothing
69 | return freq
70 |
71 |
72 | def compute(
73 | corpus: Iterable[str],
74 | wordnet: Wordnet,
75 | distribute_weight: bool = True,
76 | smoothing: float = 1.0
77 | ) -> Freq:
78 | """Compute Information Content weights from a corpus.
79 |
80 | Arguments:
81 | corpus: An iterable of string tokens. This is a flat list of
82 | words and the order does not matter. Tokens may be single
83 | words or multiple words separated by a space.
84 |
85 | wordnet: An instantiated :class:`wn.Wordnet` object, used to
86 | look up synsets from words.
87 |
88 | distribute_weight: If :python:`True`, the counts for a word
89 | are divided evenly among all synsets for the word.
90 |
91 | smoothing: The initial value given to each synset.
92 |
93 | Example:
94 | >>> import wn, wn.ic, wn.morphy
95 | >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=wn.morphy.morphy)
96 | >>> freq = wn.ic.compute(["Dogs", "run", ".", "Cats", "sleep", "."], ewn)
97 | >>> dog = ewn.synsets('dog', pos='n')[0]
98 | >>> cat = ewn.synsets('cat', pos='n')[0]
99 | >>> frog = ewn.synsets('frog', pos='n')[0]
100 | >>> freq['n'][dog.id]
101 | 1.125
102 | >>> freq['n'][cat.id]
103 | 1.1
104 | >>> freq['n'][frog.id] # no occurrence; smoothing value only
105 | 1.0
106 | >>> carnivore = dog.lowest_common_hypernyms(cat)[0]
107 | >>> freq['n'][carnivore.id]
108 | 1.3250000000000002
109 | """
110 | freq = _initialize(wordnet, smoothing)
111 | counts = Counter(corpus)
112 |
113 | hypernym_cache: dict[Synset, list[Synset]] = {}
114 | for word, count in counts.items():
115 | synsets = wordnet.synsets(word)
116 | num = len(synsets)
117 | if num == 0:
118 | continue
119 |
120 | weight = float(count / num if distribute_weight else count)
121 |
122 | for synset in synsets:
123 | pos = synset.pos
124 | if pos == ADJ_SAT:
125 | pos = ADJ
126 | if pos not in IC_PARTS_OF_SPEECH:
127 | continue
128 |
129 | freq[pos][None] += weight
130 |
131 | # The following while-loop is equivalent to:
132 | #
133 | # freq[pos][synset.id] += weight
134 | # for path in synset.hypernym_paths():
135 | # for ss in path:
136 | # freq[pos][ss.id] += weight
137 | #
138 | # ...but it caches hypernym lookups for speed
139 |
140 | agenda: list[tuple[Synset, set[Synset]]] = [(synset, set())]
141 | while agenda:
142 | ss, seen = agenda.pop()
143 |
144 | # avoid cycles
145 | if ss in seen:
146 | continue
147 |
148 | freq[pos][ss.id] += weight
149 |
150 | if ss not in hypernym_cache:
151 | hypernym_cache[ss] = ss.hypernyms()
152 | agenda.extend((hyp, seen | {ss}) for hyp in hypernym_cache[ss])
153 |
154 | return freq
155 |
156 |
157 | def load(
158 | source: AnyPath,
159 | wordnet: Wordnet,
160 | get_synset_id: Optional[Callable] = None,
161 | ) -> Freq:
162 | """Load an Information Content mapping from a file.
163 |
164 | Arguments:
165 |
166 | source: A path to an information content weights file.
167 |
168 | wordnet: A :class:`wn.Wordnet` instance with synset
169 | identifiers matching the offsets in the weights file.
170 |
171 | get_synset_id: A callable that takes a synset offset and part
172 | of speech and returns a synset ID valid in *wordnet*.
173 |
174 | Raises:
175 |
176 | :class:`wn.Error`: If *wordnet* does not have exactly one
177 | lexicon.
178 |
179 | Example:
180 |
181 | >>> import wn, wn.ic
182 | >>> pwn = wn.Wordnet('pwn:3.0')
183 | >>> path = '~/nltk_data/corpora/wordnet_ic/ic-brown-resnik-add1.dat'
184 | >>> freq = wn.ic.load(path, pwn)
185 |
186 | """
187 | source = Path(source).expanduser().resolve(strict=True)
188 | assert len(wordnet.lexicons()) == 1
189 | lexid = wordnet.lexicons()[0].id
190 | if get_synset_id is None:
191 | get_synset_id = synset_id_formatter(prefix=lexid)
192 |
193 | freq = _initialize(wordnet, 0.0)
194 |
195 | with source.open() as icfile:
196 | for offset, pos, weight, is_root in _parse_ic_file(icfile):
197 | ssid = get_synset_id(offset=offset, pos=pos)
198 | # synset = wordnet.synset(ssid)
199 | freq[pos][ssid] = weight
200 | if is_root:
201 | freq[pos][None] += weight
202 | return freq
203 |
204 |
205 | def _parse_ic_file(icfile: TextIO) -> Iterator[tuple[int, str, float, bool]]:
206 | """Parse the Information Content file.
207 |
208 | A sample of the format is::
209 |
210 | wnver::eOS9lXC6GvMWznF1wkZofDdtbBU
211 | 1740n 1915712 ROOT
212 | 1930n 859272
213 | 2137n 1055337
214 |
215 | """
216 | next(icfile) # skip header
217 | for line in icfile:
218 | ssinfo, value, *isroot = line.split()
219 | yield (int(ssinfo[:-1]),
220 | ssinfo[-1],
221 | float(value),
222 | bool(isroot))
223 |
--------------------------------------------------------------------------------
/docs/api/wn.ic.rst:
--------------------------------------------------------------------------------
1 |
2 | wn.ic
3 | =====
4 |
5 | .. automodule:: wn.ic
6 |
7 | The mathematical formulae for information content are defined in
8 | `Formal Description`_, and the corresponding Python API functions are
9 | described in `Calculating Information Content`_. These functions
10 | require information content weights obtained either by computing them
11 | from a corpus (see `Computing Corpus Weights`_) or by loading
12 | pre-computed weights from a file (see `Reading Pre-computed
13 | Information Content Files`_).
14 |
15 | .. note::
16 |
17 | The term *information content* can be ambiguous. It often, and most
18 | accurately, refers to the result of the :func:`information_content`
19 | function (:math:`\text{IC}(c)` in the mathematical notation), but
20 | is also sometimes used to refer to the corpus frequencies/weights
21 | (:math:`\text{freq}(c)` in the mathematical notation) returned by
22 | :func:`load` or :func:`compute`, as these weights are the basis of
23 | the value computed by :func:`information_content`. The Wn
24 |    documentation tries to consistently refer to the former as the
25 | *information content value*, or just *information content*, and the
26 | latter as *information content weights*, or *weights*.
27 |
28 |
29 | Formal Description
30 | ------------------
31 |
32 | The Information Content (IC) of a concept (synset) is a measure of its
33 | specificity computed from the wordnet's taxonomy structure and corpus
34 | frequencies. It is defined by Resnik 1995 ([RES95]_), following
35 | information theory, as the negative log-probability of a concept:
36 |
37 | .. math::
38 |
39 | \text{IC}(c) = -\log{p(c)}
40 |
41 | A concept's probability is the empirical probability over a corpus:
42 |
43 | .. math::
44 |
45 | p(c) = \frac{\text{freq}(c)}{N}
46 |
47 | Here, :math:`N` is the total count of words of the same category as
48 | concept :math:`c` ([RES95]_ only considered nouns) where each word has
49 | some representation in the wordnet, and :math:`\text{freq}` is defined
50 | as the sum of corpus counts of words in :math:`\text{words}(c)`, which
51 | is the set of words subsumed by concept :math:`c`:
52 |
53 | .. math::
54 |
55 | \text{freq}(c) = \sum_{w \in \text{words}(c)}{\text{count}(w)}
56 |
57 | It is common for :math:`\text{freq}` to not contain actual frequencies
58 | but instead weights distributed evenly among the synsets for a
59 | word. These weights are calculated as the word frequency divided by
60 | the number of synsets for the word:
61 |
62 | .. math::
63 |
64 | \text{freq}_{\text{distributed}}(c)
65 | = \sum_{w \in \text{words}(c)}{\frac{\text{count}(w)}{|\text{synsets}(w)|}}
66 |
67 | .. [RES95] Resnik, Philip. "Using information content to evaluate
68 | semantic similarity." In Proceedings of the 14th International
69 | Joint Conference on Artificial Intelligence (IJCAI-95), Montreal,
70 | Canada, pp. 448-453. 1995.
71 |
72 |
73 | Example
74 | -------
75 |
76 | In the Princeton WordNet 3.0 (hereafter *WordNet*, but note that the
77 | equivalent lexicon in Wn is the *OMW English Wordnet based on WordNet
78 | 3.0* with specifier ``omw-en:1.4``), the frequency of a concept like
79 | **stone fruit** is not just the number of occurrences of *stone
80 | fruit*, but also includes the counts of the words for its hyponyms
81 | (*almond*, *olive*, etc.) and other taxonomic descendants (*Jordan
82 | almond*, *green olive*, etc.). The word *almond* has two synsets: one
83 | for the fruit or nut, another for the plant. Thus, if the word
84 | *almond* is encountered :math:`n` times in a corpus, then the weight
85 | (either the frequency :math:`n` or distributed weight
86 | :math:`\frac{n}{2}`) is added to the total weights for both synsets
87 | and to those of their ancestors, but not for descendant synsets, such
88 | as for **Jordan almond**. The fruit/nut synset of almond has two
89 | hypernym paths which converge on **fruit**:
90 |
91 | 1. **almond** ⊃ **stone fruit** ⊃ **fruit**
92 | 2. **almond** ⊃ **nut** ⊃ **seed** ⊃ **fruit**
93 |
94 | The weight is added to each ancestor (**stone fruit**, **nut**,
95 | **seed**, **fruit**, ...) once. That is, the weight is not added to
96 | the convergent ancestor for **fruit** twice, but only once.
97 |
98 |
99 | Calculating Information Content
100 | -------------------------------
101 |
102 | .. autofunction:: information_content
103 | .. autofunction:: synset_probability
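   |
   | A small sketch tying these together (the lexicon specifier and toy
   | corpus are assumptions for illustration):
   |
   | .. code-block:: python
   |
   |    import wn
   |    from wn import ic
   |
   |    ewn = wn.Wordnet('oewn:2024')  # any installed lexicon
   |    freq = ic.compute(['example', 'sample', 'data'], ewn)
   |    ss = ewn.synsets('example', pos='n')[0]
   |    # equals -log(ic.synset_probability(ss, freq))
   |    ic.information_content(ss, freq)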
104 |
105 |
106 | Computing Corpus Weights
107 | ------------------------
108 |
109 | If pre-computed weights are not available for a wordnet or for some
110 | domain, they can be computed given a corpus and a wordnet.
111 |
112 | The corpus is an iterable of words. For large corpora it may help to
113 | use a generator for this iterable, but the entire vocabulary (i.e.,
114 | unique words and counts) will be held at once in memory. Multi-word
115 | expressions are also possible if they exist in the wordnet. For
116 | instance, WordNet has *stone fruit*, with a single space delimiting
117 | the words, as an entry.
118 |
119 | The :class:`wn.Wordnet` object must be instantiated with a single
120 | lexicon, although it may have expand-lexicons for relation
121 | traversal. For best results, the wordnet should use a lemmatizer to
122 | help it deal with inflected wordforms from running text.
123 |
124 | .. autofunction:: compute
125 |
126 |
127 | Reading Pre-computed Information Content Files
128 | ----------------------------------------------
129 |
130 | The :func:`load` function reads pre-computed information content
131 | weights files as used by the WordNet::Similarity Perl module or the
132 | NLTK Python package. These files are computed for
134 | a specific version of a wordnet using the synset offsets from the
135 | WNDB format,
136 | which Wn does not use. These offsets therefore must be converted into
137 | an identifier that matches those used by the wordnet. By default,
138 | :func:`load` uses the lexicon identifier from its *wordnet* argument
139 | with synset offsets (padded with 0s to make 8 digits) and
140 | parts-of-speech from the weights file to format an identifier, such as
141 | ``omw-en-00001174-n``. For wordnets that use a different identifier
142 | scheme, the *get_synset_id* parameter of :func:`load` can be given a
143 | callable created with :func:`wn.util.synset_id_formatter`. It can also
144 | be given another callable with the same signature as shown below:
145 |
146 | .. code-block:: python
147 |
148 | get_synset_id(*, offset: int, pos: str) -> str
149 |
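   | For example, a sketch of supplying this hook explicitly (here it
   | simply reproduces the default scheme; the lexicon specifier and file
   | path are assumptions for illustration):
   |
   | .. code-block:: python
   |
   |    import wn, wn.ic
   |    from wn.util import synset_id_formatter
   |
   |    w = wn.Wordnet('omw-en:1.4')  # assumed installed lexicon
   |    path = '~/nltk_data/corpora/wordnet_ic/ic-brown-resnik-add1.dat'
   |    get_id = synset_id_formatter(prefix='omw-en')
   |    freq = wn.ic.load(path, w, get_synset_id=get_id)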
150 |
151 | When loading pre-computed information content files, it is recommended
152 | to use the ones with smoothing (i.e., ``*-add1.dat`` or
153 | ``*-resnik-add1.dat``) to avoid math domain errors when computing the
154 | information content value.
155 |
156 | .. warning::
157 |
158 | The weights files are only valid for the version of wordnet for
159 | which they were created. Files created for WordNet 3.0 do not work
160 | for WordNet 3.1 because the offsets used in its identifiers are
161 | different, although the *get_synset_id* parameter of :func:`load`
162 | could be given a function that performs a suitable mapping. Some
163 |    Open Multilingual Wordnet
164 | wordnets use the WordNet 3.0 offsets in their identifiers and can
165 | therefore technically use the weights, but this usage is
166 | discouraged because the distributional properties of text in
167 | another language and the structure of the other wordnet will not be
168 |    compatible with those of the English WordNet. For these cases, it is
169 | recommended to compute new weights using :func:`compute`.
170 |
171 | .. autofunction:: load
172 |
--------------------------------------------------------------------------------