├── wn ├── py.typed ├── metrics.py ├── _exceptions.py ├── _types.py ├── _ili.py ├── __init__.py ├── _util.py ├── _db.py ├── _download.py ├── morphy.py ├── __main__.py ├── _module_functions.py ├── util.py └── ic.py ├── tests ├── data │ ├── test-package │ │ ├── LICENSE │ │ ├── README.md │ │ ├── citation.bib │ │ └── test-wn.xml │ ├── README.md │ ├── E101-3.xml │ ├── E101-2.xml │ ├── W306-0.xml │ ├── W305-0.xml │ ├── E101-1.xml │ ├── E101-0.xml │ ├── W307-0.xml │ ├── sense-member-order.xml │ ├── sense-key-variations.xml │ ├── mini-lmf-1.3.xml │ ├── mini-lmf-1.4.xml │ └── mini-lmf-1.1.xml ├── util_test.py ├── validate_test.py ├── export_test.py ├── _util_test.py ├── project_test.py ├── morphy_test.py ├── db_test.py ├── conftest.py ├── compat_sensekey_test.py ├── wordnet_test.py ├── ic_test.py ├── web_test.py ├── taxonomy_test.py ├── lmf_test.py ├── secondary_query_test.py ├── relations_test.py └── similarity_test.py ├── docs ├── docutils.conf ├── requirements.txt ├── api │ ├── wn.validate.rst │ ├── wn.lmf.rst │ ├── wn.compat.sensekey.rst │ ├── wn.compat.rst │ ├── wn.util.rst │ ├── wn.project.rst │ ├── wn.taxonomy.rst │ ├── wn.morphy.rst │ ├── wn.similarity.rst │ └── wn.ic.rst ├── _static │ ├── css │ │ └── svg.css │ ├── wn-logo.svg │ └── wn-logo-rotate.svg ├── Makefile ├── .readthedocs.yaml ├── make.bat ├── index.rst ├── cli.rst ├── setup.rst ├── conf.py ├── guides │ ├── nltk-migration.rst │ └── wordnet.rst └── faq.rst ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ ├── data-issue.md │ └── bug_report.md └── workflows │ ├── checks.yml │ ├── publish.yml │ └── publish-docker.yaml ├── Dockerfile ├── .gitignore ├── LICENSE ├── bench ├── README.md ├── test_bench.py └── conftest.py ├── CITATION.cff ├── pyproject.toml └── CONTRIBUTING.md /wn/py.typed: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/data/test-package/LICENSE: -------------------------------------------------------------------------------- 1 | Test License 2 | -------------------------------------------------------------------------------- /tests/data/test-package/README.md: -------------------------------------------------------------------------------- 1 | # Test README 2 | -------------------------------------------------------------------------------- /tests/data/test-package/citation.bib: -------------------------------------------------------------------------------- 1 | % test bib 2 | -------------------------------------------------------------------------------- /docs/docutils.conf: -------------------------------------------------------------------------------- 1 | [restructuredtext parser] 2 | syntax_highlight = short 3 | 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx ~= 8.1 2 | furo == 2024.8.6 3 | sphinx-copybutton == 0.5.2 4 | . 5 | 6 | -------------------------------------------------------------------------------- /docs/api/wn.validate.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.validate 3 | =========== 4 | 5 | .. automodule:: wn.validate 6 | 7 | .. 
autofunction:: validate 8 | -------------------------------------------------------------------------------- /tests/data/README.md: -------------------------------------------------------------------------------- 1 | # Testing Data Directory 2 | 3 | This directory is used to store data files used by the testing system. 4 | 5 | -------------------------------------------------------------------------------- /docs/api/wn.lmf.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.lmf 3 | ====== 4 | 5 | .. automodule:: wn.lmf 6 | 7 | .. autofunction:: load 8 | .. autofunction:: scan_lexicons 9 | .. autofunction:: is_lmf 10 | 11 | -------------------------------------------------------------------------------- /tests/data/test-package/test-wn.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /docs/api/wn.compat.sensekey.rst: -------------------------------------------------------------------------------- 1 | wn.compat.sensekey 2 | ================== 3 | 4 | .. automodule:: wn.compat.sensekey 5 | 6 | .. autofunction:: escape 7 | .. autofunction:: unescape 8 | .. autofunction:: sense_key_getter 9 | .. autofunction:: sense_getter 10 | -------------------------------------------------------------------------------- /docs/_static/css/svg.css: -------------------------------------------------------------------------------- 1 | svg { 2 | width: 500px; 3 | height: 300px; 4 | 5 | position: relative; 6 | left: 20%; 7 | -webkit-transform: translateX(-20%); 8 | -ms-transform: translateX(-20%); 9 | transform: translateX(-20%); 10 | 11 | } 12 | 13 | -------------------------------------------------------------------------------- /wn/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | from wn._core import Word, Synset 3 | 4 | 5 | # Word-based Metrics 6 | 7 | def ambiguity(word: Word) -> int: 8 | return len(word.synsets()) 9 | 10 | 11 | def average_ambiguity(synset: Synset) -> float: 12 | words = synset.words() 13 | return sum(len(word.synsets()) for word in words) / len(words) 14 | -------------------------------------------------------------------------------- /tests/util_test.py: -------------------------------------------------------------------------------- 1 | 2 | from wn import util 3 | 4 | 5 | def test_synset_id_formatter(): 6 | f = util.synset_id_formatter 7 | assert f()(prefix='xyz', offset=123, pos='n') == 'xyz-00000123-n' 8 | assert f(prefix='xyz')(offset=123, pos='n') == 'xyz-00000123-n' 9 | assert f(prefix='xyz', pos='n')(offset=123) == 'xyz-00000123-n' 10 | assert f('abc-{offset}-{pos}')(offset=1, pos='v') == 'abc-1-v' 11 | -------------------------------------------------------------------------------- /docs/api/wn.compat.rst: -------------------------------------------------------------------------------- 1 | wn.compat 2 | ========= 3 | 4 | Compatibility modules for Wn. 5 | 6 | This subpackage is a namespace for compatibility modules when working 7 | with particular lexicons. Wn is designed to be agnostic to the 8 | language or lexicon and not favor one over the other (with the 9 | exception of :mod:`wn.morphy`, which is English-specific). However, 10 | there are some kinds of functionality that would be useful to 11 | include in Wn, even if they don't generalize to all lexicons. 12 | 13 | Included modules 14 | ---------------- 15 | 16 | .. 
toctree:: 17 | :maxdepth: 1 18 | 19 | wn.compat.sensekey.rst 20 | 21 | -------------------------------------------------------------------------------- /tests/validate_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from wn import lmf 4 | from wn.validate import validate 5 | 6 | tests = [ 7 | ("E101", 0), 8 | ("E101", 1), 9 | ("E101", 2), 10 | ("E101", 3), 11 | ("W305", 0), 12 | ("W306", 0), 13 | ("W307", 0), 14 | ] 15 | test_ids = [f"{code}-{i}" for code, i in tests] 16 | 17 | 18 | @pytest.mark.parametrize("code,i", tests, ids=test_ids) 19 | def test_validate(datadir, code: str, i: int) -> None: 20 | path = datadir / f"{code}-{i}.xml" 21 | lex = lmf.load(path, progress_handler=None)["lexicons"][0] 22 | report = validate(lex, select=[code], progress_handler=None) 23 | print(report) 24 | assert len(report[code]["items"]) > 0 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.12" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # We recommend specifying your dependencies to enable reproducible builds: 19 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 20 | python: 21 | install: 22 | - requirements: docs/requirements.txt 23 | 24 | formats: 25 | - pdf 26 | - epub 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install system dependencies 6 | RUN apt-get update && apt-get install -y \ 7 | python3-pip \ 8 | python3-dev \ 9 | build-essential \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | # Install web server 13 | RUN pip install uvicorn 14 | 15 | COPY . . 16 | RUN pip install --no-cache-dir ".[web]" 17 | 18 | # Download the wordnet data and initialize the database 19 | # TODO: this should be done in a separate volume 20 | RUN python -m wn download omw:1.4 cili 21 | 22 | # Clean up the downloads directory 23 | RUN rm -r ~/.wn_data/downloads 24 | 25 | # Expose the port 26 | EXPOSE 8080 27 | 28 | CMD ["uvicorn", "wn.web:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /docs/api/wn.util.rst: -------------------------------------------------------------------------------- 1 | wn.util 2 | ======= 3 | 4 | .. automodule:: wn.util 5 | 6 | .. autofunction:: synset_id_formatter 7 | 8 | .. autoclass:: ProgressHandler 9 | :members: 10 | 11 | .. attribute:: kwargs 12 | 13 | A dictionary storing the updateable parameters for the progress 14 | handler. The keys are: 15 | 16 | - ``message`` (:class:`str`) -- a generic message or name 17 | - ``count`` (:class:`int`) -- the current progress counter 18 | - ``total`` (:class:`int`) -- the expected final value of the counter 19 | - ``unit`` (:class:`str`) -- the unit of measurement 20 | - ``status`` (:class:`str`) -- the current status of the process 21 | 22 | .. autoclass:: ProgressBar 23 | :members: 24 | -------------------------------------------------------------------------------- /tests/export_test.py: -------------------------------------------------------------------------------- 1 | 2 | from xml.etree import ElementTree as ET 3 | 4 | import pytest 5 | 6 | import wn 7 | 8 | 9 | @pytest.mark.usefixtures('mini_db') 10 | def test_export(datadir, tmp_path): 11 | tmpdir = tmp_path / 'test_export' 12 | tmpdir.mkdir() 13 | tmppath = tmpdir / 'mini_lmf_export.xml' 14 | lexicons = wn.lexicons(lexicon='test-en test-es') 15 | wn.export(lexicons, tmppath) 16 | 17 | # remove comments, indentation, etc. 
18 | orig = ET.canonicalize(from_file=datadir / 'mini-lmf-1.0.xml', strip_text=True) 19 | temp = ET.canonicalize(from_file=tmppath, strip_text=True) 20 | # additional transformation to help with debugging 21 | orig = orig.replace('<', '\n<') 22 | temp = temp.replace('<', '\n<') 23 | assert orig == temp 24 | -------------------------------------------------------------------------------- /wn/_exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | class Error(Exception): 3 | """Generic error class for invalid wordnet operations.""" 4 | 5 | # reset the module so the user sees the public name 6 | __module__ = 'wn' 7 | 8 | 9 | class DatabaseError(Error): 10 | """Error class for issues with the database.""" 11 | 12 | __module__ = 'wn' 13 | 14 | 15 | class ConfigurationError(Error): 16 | """Raised on invalid configurations.""" 17 | __module__ = 'wn' 18 | 19 | 20 | class ProjectError(Error): 21 | """Raised when a project is not found or on errors defined in the index.""" 22 | __module__ = 'wn' 23 | 24 | 25 | class WnWarning(Warning): 26 | """Generic warning class for dubious wordnet operations.""" 27 | 28 | # reset the module so the user sees the public name 29 | __module__ = 'wn' 30 | -------------------------------------------------------------------------------- /tests/data/E101-3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/data/E101-2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /tests/data/W306-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/_util_test.py: -------------------------------------------------------------------------------- 1 | 2 | from wn._util import flatten, unique_list 3 | 4 | 5 | def test_flatten(): 6 | assert flatten([]) == [] 7 | assert flatten([[]]) == [] 8 | assert flatten([[], []]) == [] 9 | assert flatten([[[], []], [[], []]]) == [[], [], [], []] 10 | assert flatten([[1]]) == [1] 11 | assert flatten([[1, 2], [3, 4]]) == [1, 2, 3, 4] 12 | assert flatten(["AB", "CD"]) == ["A", "B", "C", "D"] 13 | 14 | 15 | def test_unique_list(): 16 | assert unique_list([]) == [] 17 | assert unique_list([1]) == [1] 18 | assert unique_list([1, 1, 1, 1, 1]) == [1] 19 | assert unique_list([1, 1, 2, 2, 1]) == [1, 2] 20 | assert unique_list([2, 1, 2, 2, 1]) == [2, 1] 21 | assert unique_list("A") == ["A"] 22 | assert unique_list("AAA") == ["A"] 23 | assert unique_list("ABABA") == ["A", "B"] 24 | assert unique_list([(1, 2), (1, 2), (2, 3)]) == [(1, 2), (2, 3)] 25 | -------------------------------------------------------------------------------- /tests/data/W305-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/data/E101-1.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/_static/wn-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/data/E101-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /wn/_types.py: -------------------------------------------------------------------------------- 1 | 2 | from collections.abc import Callable, Mapping, Sequence 3 | from typing import Any, Optional, Union 4 | from pathlib import Path 5 | 6 | # For functions taking a filesystem path as a str or a pathlib.Path 7 | AnyPath = Union[str, Path] 8 | 9 | # LMF versions for comparison 10 | VersionInfo = tuple[int, ...] 
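# Illustrative note (not part of the original module): wn._util.version_info("1.3")
# produces the VersionInfo tuple (1, 3), and such tuples compare in version order.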
11 | 12 | # Synset and Sense relations map a relation type to one or more ids 13 | RelationMap = Mapping[str, Sequence[str]] 14 | 15 | # User-facing metadata representation 16 | Metadata = dict[str, Any] 17 | 18 | # A callable that returns a normalized word form for a given word form 19 | NormalizeFunction = Callable[[str], str] 20 | 21 | # Lemmatization returns a mapping of parts of speech (or None) to 22 | # lists of wordforms that are potential lemmas for some query word 23 | LemmatizeResult = dict[Optional[str], set[str]] 24 | 25 | # A callable that returns a LemmatizationResult for a given word form 26 | # and optional part of speech 27 | LemmatizeFunction = Callable[[str, Optional[str]], LemmatizeResult] 28 | -------------------------------------------------------------------------------- /tests/data/W307-0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | foo 22 | 23 | 24 | 25 | foo 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # Unit test / coverage reports 31 | htmlcov/ 32 | .tox/ 33 | .nox/ 34 | .coverage 35 | .coverage.* 36 | .cache 37 | nosetests.xml 38 | coverage.xml 39 | *.cover 40 | *.py,cover 41 | .hypothesis/ 42 | .pytest_cache/ 43 | 44 | # Ruff (has its own .gitignore, but in case that ever changes...) 45 | .ruff_cache 46 | 47 | # Sphinx documentation 48 | docs/_build/ 49 | 50 | # Jupyter Notebook 51 | .ipynb_checkpoints 52 | 53 | # Environments 54 | .env 55 | .venv 56 | env/ 57 | venv/ 58 | ENV/ 59 | env.bak/ 60 | venv.bak/ 61 | 62 | # mypy 63 | .mypy_cache/ 64 | .dmypy.json 65 | dmypy.json 66 | 67 | # PyCharm 68 | .idea/ 69 | 70 | # VS Code 71 | .vscode/ 72 | 73 | # benchmarking results 74 | .benchmarks/ -------------------------------------------------------------------------------- /docs/api/wn.project.rst: -------------------------------------------------------------------------------- 1 | wn.project 2 | ========== 3 | 4 | .. automodule:: wn.project 5 | 6 | .. autofunction:: get_project 7 | .. autofunction:: iterpackages 8 | .. autofunction:: is_package_directory 9 | .. autofunction:: is_collection_directory 10 | 11 | Project Classes 12 | --------------- 13 | 14 | Projects can be simple resource files, :class:`Package` directories, 15 | or :class:`Collection` directories. For API consistency, resource 16 | files are modeled as a virtual package (:class:`ResourceOnlyPackage`). 17 | 18 | .. class:: Project 19 | 20 | The base class for packages and collections. 21 | 22 | This class is not used directly, but all subclasses will implement 23 | the methods listed here. 24 | 25 | .. autoproperty:: path 26 | .. automethod:: readme 27 | .. automethod:: license 28 | .. automethod:: citation 29 | 30 | .. autoclass:: Package 31 | :show-inheritance: 32 | 33 | .. autoproperty:: type 34 | .. automethod:: resource_file 35 | 36 | .. 
autoclass:: ResourceOnlyPackage 37 | :show-inheritance: 38 | 39 | .. autoclass:: Collection 40 | :show-inheritance: 41 | 42 | .. automethod:: packages 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Michael Wayne Goodman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: "3.9" 18 | - name: Install Hatch 19 | run: pipx install hatch 20 | - name: Lint 21 | run: hatch fmt --linter --check 22 | - name: Type Check 23 | run: hatch run mypy:check 24 | - name: Check Buildable 25 | run: hatch build 26 | 27 | tests: 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | matrix: 31 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 32 | os: [ubuntu-latest, windows-latest] 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Set up Python ${{ matrix.python-version }} 36 | uses: actions/setup-python@v4 37 | with: 38 | python-version: ${{ matrix.python-version }} 39 | - name: Install Hatch 40 | run: pipx install hatch 41 | - name: Test 42 | run: hatch test 43 | -------------------------------------------------------------------------------- /wn/_ili.py: -------------------------------------------------------------------------------- 1 | 2 | from collections.abc import Iterator 3 | from pathlib import Path 4 | 5 | from wn._types import AnyPath 6 | 7 | 8 | def is_ili(source: AnyPath) -> bool: 9 | """Return True if *source* is an ILI tab-separated-value file. 10 | 11 | This only checks that the first column, split by tabs, of the 12 | first line is 'ili' or 'ILI'. It does not check if each line has 13 | the correct number of columns. 
14 | 15 | """ 16 | source = Path(source).expanduser() 17 | if source.is_file(): 18 | try: 19 | with source.open('rb') as fh: 20 | return next(fh).split(b'\t')[0] in (b'ili', b'ILI') 21 | except (StopIteration, IndexError): 22 | pass 23 | return False 24 | 25 | 26 | def load(source: AnyPath) -> Iterator[dict[str, str]]: 27 | """Load an interlingual index file. 28 | 29 | Args: 30 | source: path to an ILI file 31 | """ 32 | source = Path(source).expanduser() 33 | with source.open(encoding='utf-8') as fh: 34 | header = next(fh).rstrip('\r\n') 35 | fields = tuple(map(str.lower, header.split('\t'))) 36 | for line in fh: 37 | yield dict(zip(fields, line.rstrip('\r\n').split('\t'))) 38 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/data-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Data issue 3 | about: Report an issue Wn's data index 4 | title: '' 5 | labels: data 6 | assignees: '' 7 | 8 | --- 9 | 10 | **If your issue is regarding the contents of the data** (e.g., a lexicon is missing a word, synset, relation, etc.), then please find the upstream project and file the issue there. You can find links to the projects on Wn's [README](https://github.com/goodmami/wn/). Projects without links are probably managed by the [Open Multilingual Wordnet](https://github.com/omwn/omw-data). 11 | 12 | **Use this issue template for the following kinds of issues:** 13 | 1. Request a wordnet lexicon (including new versions of existing lexicons) to be indexed by Wn 14 | 15 | Please provide: 16 | - the project name 17 | - the name and contact info of the current maintainer 18 | - the language of the lexicon (BCP-47 code preferred) 19 | - a URL to the project (e.g., on GitHub or other homepage) 20 | - a URL to the [WN-LMF](https://github.com/globalwordnet/schemas/) resource 21 | 22 | 2. Report an issue with an indexed lexicon (e.g., the source URL has changed) 23 | 24 | Please indicate the lexicon id and version and the correct project information, if available. 25 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | # Wn Benchmarking 2 | 3 | This directory contains code and data for running benchmarks for 4 | Wn. The benchmarks are implemented using 5 | [pytest-benchmarks](https://github.com/ionelmc/pytest-benchmark/), so 6 | they are run using pytest as follows (from the top-level project 7 | directory): 8 | 9 | ```console 10 | $ hatch test bench/ # run the benchmarks 11 | $ hatch test bench/ --benchmark-autosave # run benchmarks and store results 12 | $ hatch test bench/ --benchmark-compare # run benchmarks and compare to stored result 13 | $ hatch test -- --help # get help on options (look for those prefixed `--benchmark-`) 14 | ``` 15 | 16 | Notes: 17 | 18 | * The tests are not exhaustive; when making a change that may affect 19 | performance, consider making a new test if one doesn't exist 20 | already. It would be helpful to check in the test to Git, but not 21 | the benchmark results since those are dependent on the machine. 22 | * Benchmark the code before and after the changes. Store the results 23 | locally for comparison. 24 | * Ensure the testing environment has a steady load (wait for 25 | long-running processes to finish, close any active web browser tabs, 26 | etc.) prior to and while running the test. 27 | * Expect high variance for IO-bound tasks. 
28 | 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | :warning: If this is a question about Wn or how to use it, please create a [discussion](https://github.com/goodmami/wn/discussions) instead of an issue. 14 | 15 | **To Reproduce** 16 | Please enter a minimal working example of the command or Python code that illustrates the problem. To avoid formatting issues, enter the code in a Markdown code block: 17 | 18 | ```console 19 | $ python -m wn ... 20 | output... 21 | ``` 22 | 23 | or 24 | 25 | ```pycon 26 | >>> import wn 27 | >>> ... 28 | output 29 | ``` 30 | 31 | **Expected behavior** 32 | A clear and concise description of what you expected to happen. 33 | 34 | **Environment** 35 | Please enter the versions of Python and Wn you are using as well as the installed lexicons. You can find these by executing the following commands (adjust your platform-specific Python command as necessary, e.g., `python3` or `py -3`): 36 | 37 | ```console 38 | python --version 39 | python -m wn --version 40 | python -m wn lexicons 41 | ``` 42 | 43 | **Additional context** 44 | Add any other context about the problem here. 45 | -------------------------------------------------------------------------------- /tests/data/sense-member-order.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: Wn 3 | message: >- 4 | Please cite this software using the metadata from 5 | 'preferred-citation'. 6 | type: software 7 | authors: 8 | - given-names: Michael Wayne 9 | family-names: Goodman 10 | email: goodman.m.w@gmail.com 11 | orcid: 'https://orcid.org/0000-0002-2896-5141' 12 | - given-names: Francis 13 | family-names: Bond 14 | email: bond@ieee.org 15 | orcid: 'https://orcid.org/0000-0003-4973-8068' 16 | repository-code: 'https://github.com/goodmami/wn/' 17 | preferred-citation: 18 | type: conference-paper 19 | authors: 20 | - given-names: Michael Wayne 21 | family-names: Goodman 22 | email: goodmami@uw.edu 23 | orcid: 'https://orcid.org/0000-0002-2896-5141' 24 | affiliation: Nanyang Technological University 25 | - given-names: Francis 26 | family-names: Bond 27 | email: bond@ieee.org 28 | orcid: 'https://orcid.org/0000-0003-4973-8068' 29 | affiliation: Nanyang Technological University 30 | start: 100 # First page number 31 | end: 107 # Last page number 32 | conference: 33 | name: "Proceedings of the 11th Global Wordnet Conference" 34 | title: "Intrinsically Interlingual: The Wn Python Library for Wordnets" 35 | year: 2021 36 | month: 1 37 | url: 'https://aclanthology.org/2021.gwc-1.12/' 38 | publisher: "Global Wordnet Association" 39 | -------------------------------------------------------------------------------- /wn/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Wordnet Interface. 
4 | """ 5 | 6 | __all__ = ( 7 | '__version__', 8 | 'Wordnet', 9 | 'download', 10 | 'add', 11 | 'add_lexical_resource', 12 | 'remove', 13 | 'export', 14 | 'projects', 15 | 'lexicons', 16 | 'Lexicon', 17 | 'word', 18 | 'words', 19 | 'Word', 20 | 'Form', 21 | 'Pronunciation', 22 | 'Tag', 23 | 'sense', 24 | 'senses', 25 | 'Sense', 26 | 'Example', 27 | 'Count', 28 | 'synset', 29 | 'synsets', 30 | 'Synset', 31 | 'Definition', 32 | 'Relation', 33 | 'ili', 34 | 'ilis', 35 | 'ILI', 36 | 'Error', 37 | 'DatabaseError', 38 | 'ConfigurationError', 39 | 'ProjectError', 40 | 'WnWarning', 41 | ) 42 | 43 | from wn._exceptions import ( 44 | Error, 45 | DatabaseError, 46 | ConfigurationError, 47 | ProjectError, 48 | WnWarning, 49 | ) 50 | from wn._config import config # noqa: F401 51 | from wn._add import add, add_lexical_resource, remove 52 | from wn._export import export 53 | from wn._download import download 54 | from wn._core import ( 55 | Lexicon, 56 | Word, Form, Pronunciation, Tag, 57 | Sense, Example, Count, 58 | Synset, Definition, 59 | Relation, 60 | ILI, 61 | Wordnet 62 | ) 63 | from wn._module_functions import ( 64 | projects, 65 | lexicons, 66 | word, words, 67 | sense, senses, 68 | synset, synsets, 69 | ili, ilis, 70 | ) 71 | 72 | __version__ = '0.13.0' 73 | -------------------------------------------------------------------------------- /tests/data/sense-key-variations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Wn Documentation 3 | ================ 4 | 5 | Overview 6 | -------- 7 | 8 | This package provides an interface to wordnet data, from simple lookup 9 | queries, to graph traversals, to more sophisticated algorithms and 10 | metrics. Features include: 11 | 12 | - Support for wordnets in the 13 | `WN-LMF `_ format 14 | - A `SQLite `_ database backend for data 15 | consistency and efficient queries 16 | - Accurate modeling of Words, Senses, and Synsets 17 | 18 | Quick Start 19 | ----------- 20 | 21 | .. code-block:: console 22 | 23 | $ pip install wn 24 | 25 | .. code-block:: python 26 | 27 | >>> import wn 28 | >>> wn.download('ewn:2020') 29 | >>> wn.synsets('coffee') 30 | [Synset('ewn-04979718-n'), Synset('ewn-07945591-n'), Synset('ewn-07945759-n'), Synset('ewn-12683533-n')] 31 | 32 | 33 | Contents 34 | -------- 35 | 36 | .. toctree:: 37 | :maxdepth: 2 38 | 39 | setup.rst 40 | cli.rst 41 | faq.rst 42 | 43 | .. toctree:: 44 | :caption: Guides 45 | :maxdepth: 2 46 | 47 | guides/lexicons.rst 48 | guides/basic.rst 49 | guides/interlingual.rst 50 | guides/wordnet.rst 51 | guides/lemmatization.rst 52 | guides/nltk-migration.rst 53 | 54 | .. 
toctree:: 55 | :caption: API Reference 56 | :maxdepth: 1 57 | :hidden: 58 | 59 | api/wn.rst 60 | api/wn.compat.rst 61 | api/wn.compat.sensekey.rst 62 | api/wn.constants.rst 63 | api/wn.ic.rst 64 | api/wn.lmf.rst 65 | api/wn.morphy.rst 66 | api/wn.project.rst 67 | api/wn.similarity.rst 68 | api/wn.taxonomy.rst 69 | api/wn.util.rst 70 | api/wn.validate.rst 71 | api/wn.web.rst 72 | -------------------------------------------------------------------------------- /docs/_static/wn-logo-rotate.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/data/mini-lmf-1.3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 | 9 | 10 | 11 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | one 38 | two 39 | three 40 | 41 | 42 | 43 | 44 | 45 | one 46 | two 47 | three 48 | 49 | 50 | 51 | 52 | 53 | one 54 | two 55 | three 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /bench/test_bench.py: -------------------------------------------------------------------------------- 1 | import wn 2 | from wn import lmf 3 | 4 | import pytest 5 | 6 | 7 | @pytest.mark.benchmark(group="lmf.load", warmup=True) 8 | def test_load(datadir, benchmark): 9 | benchmark(lmf.load, datadir / 'mini-lmf-1.0.xml') 10 | 11 | 12 | @pytest.mark.benchmark(group="wn.add_lexical_resource") 13 | @pytest.mark.usefixtures('empty_db') 14 | def test_add_lexical_resource(mock_lmf, benchmark): 15 | # TODO: when pytest-benchmark's teardown option is released, use 16 | # that here with more rounds 17 | benchmark.pedantic( 18 | wn.add_lexical_resource, 19 | args=(mock_lmf,), 20 | # teardown=clean_db, 21 | iterations=1, 22 | rounds=1, 23 | ) 24 | 25 | 26 | @pytest.mark.benchmark(group="wn.add_lexical_resource") 27 | @pytest.mark.usefixtures('empty_db') 28 | def test_add_lexical_resource_no_progress(mock_lmf, benchmark): 29 | # TODO: when pytest-benchmark's teardown option is released, use 30 | # that here with more rounds 31 | benchmark.pedantic( 32 | wn.add_lexical_resource, 33 | args=(mock_lmf,), 34 | kwargs={"progress_handler": None}, 35 | # teardown=clean_db, 36 | iterations=1, 37 | rounds=1, 38 | ) 39 | 40 | 41 | @pytest.mark.benchmark(group="primary queries") 42 | @pytest.mark.usefixtures('mock_db') 43 | def test_synsets(benchmark): 44 | benchmark(wn.synsets) 45 | 46 | 47 | @pytest.mark.benchmark(group="primary queries") 48 | @pytest.mark.usefixtures('mock_db') 49 | def test_words(benchmark): 50 | benchmark(wn.words) 51 | 52 | 53 | @pytest.mark.benchmark(group="secondary queries") 54 | @pytest.mark.usefixtures('mock_db') 55 | def test_word_senses_no_wordnet(benchmark): 56 | word = wn.words()[0] 57 | benchmark(word.senses) 58 | 59 | 60 | @pytest.mark.benchmark(group="secondary queries") 61 | @pytest.mark.usefixtures('mock_db') 62 | def test_word_senses_with_wordnet(benchmark): 63 | w = wn.Wordnet("mock:1") 64 | word = w.words()[0] 65 | benchmark(word.senses) 66 | 67 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish to PyPI or TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build: 7 | name: Build distribution 8 | runs-on: 
ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up Python 12 | uses: actions/setup-python@v4 13 | with: 14 | python-version: "3.x" 15 | - name: Install Hatch 16 | run: pipx install hatch 17 | - name: Build 18 | run: hatch build 19 | - name: Store the distribution packages 20 | uses: actions/upload-artifact@v4 21 | with: 22 | name: python-package-distributions 23 | path: dist/ 24 | 25 | publish-to-pypi: 26 | name: Publish distributions to PyPI 27 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 28 | needs: 29 | - build 30 | runs-on: ubuntu-latest 31 | environment: 32 | name: pypi 33 | url: https://pypi.org/p/wn 34 | permissions: 35 | id-token: write # IMPORTANT: mandatory for trusted publishing 36 | steps: 37 | - name: Download the dists 38 | uses: actions/download-artifact@v4.1.8 39 | with: 40 | name: python-package-distributions 41 | path: dist/ 42 | - name: Publish to PyPI 43 | uses: pypa/gh-action-pypi-publish@release/v1 44 | 45 | publish-to-testpypi: 46 | name: Publish distributions to TestPyPI 47 | needs: 48 | - build 49 | runs-on: ubuntu-latest 50 | environment: 51 | name: testpypi 52 | url: https://test.pypi.org/p/wn 53 | permissions: 54 | id-token: write # IMPORTANT: mandatory for trusted publishing 55 | steps: 56 | - name: Download the dists 57 | uses: actions/download-artifact@v4.1.8 58 | with: 59 | name: python-package-distributions 60 | path: dist/ 61 | - name: Publish to TestPyPI 62 | uses: pypa/gh-action-pypi-publish@release/v1 63 | with: 64 | repository-url: https://test.pypi.org/legacy/ 65 | skip-existing: true 66 | -------------------------------------------------------------------------------- /tests/project_test.py: -------------------------------------------------------------------------------- 1 | from wn import project 2 | 3 | def test_is_package_directory(datadir): 4 | assert project.is_package_directory(datadir / "test-package") 5 | assert not project.is_package_directory(datadir) 6 | 7 | 8 | def test_is_collection_directory(datadir): 9 | # not really, but it is a directory containing a package 10 | assert project.is_collection_directory(datadir) 11 | assert not project.is_collection_directory(datadir / "test-package") 12 | 13 | 14 | def test_get_project(datadir): 15 | proj = project.get_project(path=datadir / "test-package") 16 | assert proj.type == "wordnet" 17 | assert proj.resource_file() == datadir / "test-package" / "test-wn.xml" 18 | assert proj.readme() == datadir / "test-package" / "README.md" 19 | assert proj.license() == datadir / "test-package" / "LICENSE" 20 | assert proj.citation() == datadir / "test-package" / "citation.bib" 21 | 22 | proj = project.get_project(path=datadir / "mini-lmf-1.0.xml") 23 | assert proj.type == "wordnet" 24 | assert proj.resource_file() == datadir / "mini-lmf-1.0.xml" 25 | assert proj.readme() is None 26 | assert proj.license() is None 27 | assert proj.citation() is None 28 | 29 | 30 | def test_iterpackages(datadir): 31 | # for now, collection.packages() does not return contained resource files 32 | pkg_names = { 33 | pkg.resource_file().name 34 | for pkg in project.iterpackages(datadir) 35 | } 36 | assert "mini-lmf-1.0.xml" not in pkg_names 37 | assert "test-wn.xml" in pkg_names 38 | 39 | # explicitly giving a resource file path works, though 40 | pkg_names = { 41 | pkg.resource_file().name 42 | for pkg in project.iterpackages(datadir / "mini-lmf-1.0.xml") 43 | } 44 | assert "mini-lmf-1.0.xml" in pkg_names 45 | assert "test-wn.xml" not in pkg_names 46 | 47 | 48 | def 
test_compressed_iterpackages(mini_lmf_compressed): 49 | for pkg in project.iterpackages(mini_lmf_compressed): 50 | assert pkg.type == "wordnet" 51 | assert pkg.resource_file().exists() 52 | # ensure cleanup of temporary data 53 | assert not pkg.resource_file().exists() 54 | # ensure original file not deleted 55 | assert mini_lmf_compressed.exists() 56 | -------------------------------------------------------------------------------- /tests/morphy_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | from wn import morphy 6 | 7 | 8 | def test_morphy_uninitialized(): 9 | # An unintialized Morphy isn't very bright, but it starts up 10 | # fast. It relies on the database to filter bad items. 11 | m = morphy.Morphy() 12 | assert m('example', 'n') == {'n': {'example'}} 13 | assert m('examples', 'n') == {'n': {'examples', 'example'}} 14 | assert m('examples', 'v') == {'v': {'examples', 'example', 'exampl'}} 15 | assert m('exemplifying', 'n') == {'n': {'exemplifying'}} 16 | assert m('exemplifying', 'v') == {'v': {'exemplifying', 'exemplify', 'exemplifye'}} 17 | assert m('data', 'n') == {'n': {'data'}} 18 | assert m('datums', 'n') == {'n': {'datums', 'datum'}} # expected false positive 19 | assert m('examples', None) == {None: {'examples'}, 20 | 'n': {'example'}, 21 | 'v': {'example', 'exampl'}} 22 | assert m('exemplifying', None) == {None: {'exemplifying'}, 23 | 'v': {'exemplify', 'exemplifye'}} 24 | assert m('data', None) == {None: {'data'}} 25 | 26 | 27 | @pytest.mark.usefixtures('mini_db') 28 | def test_morphy_initialized(): 29 | w = wn.Wordnet('test-en:1') 30 | m = morphy.Morphy(wordnet=w) 31 | assert m('example', 'n') == {'n': {'example'}} 32 | assert m('examples', 'n') == {'n': {'example'}} 33 | assert m('examples', 'v') == {} 34 | assert m('exemplifying', 'n') == {} 35 | assert m('exemplifying', 'v') == {'v': {'exemplify'}} 36 | assert m('data', 'n') == {'n': {'datum'}} 37 | assert m('datums', 'n') == {'n': {'datum'}} # expected false positive 38 | assert m('examples', None) == {'n': {'example'}} 39 | assert m('exemplifying', None) == {'v': {'exemplify'}} 40 | assert m('data', None) == {'n': {'datum'}} 41 | 42 | 43 | @pytest.mark.usefixtures('mini_db') 44 | def test_issue_154(): 45 | # https://github.com/goodmami/wn/issues/154 46 | w = wn.Wordnet('test-en:1') 47 | assert w.words('exemplifies') == [w.word('test-en-exemplify-v')] 48 | assert w.words('samples') == [] 49 | w = wn.Wordnet('test-en:1', lemmatizer=morphy.Morphy()) 50 | assert w.words('exemplifies') == [w.word('test-en-exemplify-v')] 51 | assert w.words('samples') == [w.word('test-en-sample-n')] 52 | -------------------------------------------------------------------------------- /wn/_util.py: -------------------------------------------------------------------------------- 1 | """Non-public Wn utilities.""" 2 | 3 | from collections.abc import Iterable, Hashable 4 | from typing import TypeVar 5 | from pathlib import Path 6 | import hashlib 7 | from unicodedata import normalize, combining 8 | 9 | 10 | from wn._types import VersionInfo 11 | 12 | 13 | def version_info(version_string: str) -> VersionInfo: 14 | return tuple(map(int, version_string.split('.'))) 15 | 16 | 17 | def is_url(string: str) -> bool: 18 | """Return True if *string* appears to be a URL.""" 19 | # TODO: ETags? 
20 | return any(string.startswith(scheme) 21 | for scheme in ('http://', 'https://')) 22 | 23 | 24 | def is_gzip(path: Path) -> bool: 25 | """Return True if the file at *path* appears to be gzipped.""" 26 | return _inspect_file_signature(path, b'\x1F\x8B') 27 | 28 | 29 | def is_lzma(path: Path) -> bool: 30 | """Return True if the file at *path* appears to be lzma-compressed.""" 31 | return _inspect_file_signature(path, b'\xFD7zXZ\x00') 32 | 33 | 34 | def is_xml(path: Path) -> bool: 35 | """Return True if the file at *path* appears to be an XML file.""" 36 | return _inspect_file_signature(path, b' bool: 40 | if path.is_file(): 41 | with path.open('rb') as f: 42 | return f.read(len(signature)) == signature 43 | return False 44 | 45 | 46 | def short_hash(string: str) -> str: 47 | """Return a short hash of *string*.""" 48 | b2 = hashlib.blake2b(digest_size=20) 49 | b2.update(string.encode('utf-8')) 50 | return b2.hexdigest() 51 | 52 | 53 | T = TypeVar('T') 54 | 55 | 56 | def flatten(iterable: Iterable[Iterable[T]]) -> list[T]: 57 | return [x for xs in iterable for x in xs] 58 | 59 | 60 | H = TypeVar('H', bound=Hashable) 61 | 62 | 63 | def unique_list(items: Iterable[H]) -> list[H]: 64 | # use a dictionary as an order-preserving set 65 | targets = {item: True for item in items} 66 | return list(targets) 67 | 68 | 69 | def normalize_form(s: str) -> str: 70 | return ''.join(c for c in normalize('NFKD', s.lower()) if not combining(c)) 71 | 72 | 73 | def format_lexicon_specifier(id: str, version: str) -> str: 74 | return f"{id}:{version}" 75 | 76 | 77 | def split_lexicon_specifier(lexicon: str) -> tuple[str, str]: 78 | id, _, ver = lexicon.partition(":") 79 | return id, ver 80 | -------------------------------------------------------------------------------- /docs/api/wn.taxonomy.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.taxonomy 3 | =========== 4 | 5 | .. automodule:: wn.taxonomy 6 | 7 | 8 | Overview 9 | -------- 10 | 11 | Among the valid synset relations for wordnets (see 12 | :data:`wn.constants.SYNSET_RELATIONS`), those used for describing 13 | *is-a* `taxonomies `_ are 14 | given special treatment and they are generally the most 15 | well-developed relations in any wordnet. Typically these are the 16 | ``hypernym`` and ``hyponym`` relations, which encode *is-a-type-of* 17 | relationships (e.g., a *hermit crab* is a type of *decapod*, which is 18 | a type of *crustacean*, etc.). They also include ``instance_hypernym`` 19 | and ``instance_hyponym``, which encode *is-an-instance-of* 20 | relationships (e.g., *Oregon* is an instance of *American state*). 21 | 22 | The taxonomy forms a multiply-inheriting hierarchy with the synsets as 23 | nodes. In the English wordnets, such as the Princeton WordNet and its 24 | derivatives, nearly all nominal synsets form such a hierarchy with 25 | single root node, while verbal synsets form many smaller hierarchies 26 | without a common root. Other wordnets may have different properties, 27 | but as many are based off of the Princeton WordNet, they tend to 28 | follow this structure. 29 | 30 | Functions to find paths within the taxonomies form the basis of all 31 | :mod:`wordnet similarity measures `. For instance, the 32 | :ref:`leacock-chodorow-similarity` measure uses both 33 | :func:`shortest_path` and (indirectly) :func:`taxonomy_depth`. 
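For illustration, a minimal sketch of these synset-level functions (assuming a
lexicon such as ``oewn:2021`` has already been downloaded and added; the exact
synsets returned depend on the installed data):

.. code-block:: python

    import wn
    from wn import taxonomy

    dog = wn.synsets('dog')[0]
    cat = wn.synsets('cat')[0]
    # synsets on the shortest path connecting the two
    path = taxonomy.shortest_path(dog, cat)
    # their most specific shared ancestors in the hierarchy
    common = taxonomy.lowest_common_hypernyms(dog, cat)
    # all hypernym paths from the first synset up to a root
    paths = taxonomy.hypernym_paths(dog)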
34 | 35 | 36 | Wordnet-level Functions 37 | ----------------------- 38 | 39 | Root and leaf synsets in the taxonomy are those with no ancestors 40 | (``hypernym``, ``instance_hypernym``, etc.) or hyponyms (``hyponym``, 41 | ``instance_hyponym``, etc.), respectively. 42 | 43 | Finding root and leaf synsets 44 | ''''''''''''''''''''''''''''' 45 | 46 | .. autofunction:: roots 47 | .. autofunction:: leaves 48 | 49 | Computing the taxonomy depth 50 | '''''''''''''''''''''''''''' 51 | 52 | The taxonomy depth is the maximum depth from a root node to a leaf 53 | node within synsets for a particular part of speech. 54 | 55 | .. autofunction:: taxonomy_depth 56 | 57 | 58 | Synset-level Functions 59 | ---------------------- 60 | 61 | .. autofunction:: hypernym_paths 62 | .. autofunction:: min_depth 63 | .. autofunction:: max_depth 64 | .. autofunction:: shortest_path 65 | .. autofunction:: common_hypernyms 66 | .. autofunction:: lowest_common_hypernyms 67 | -------------------------------------------------------------------------------- /tests/db_test.py: -------------------------------------------------------------------------------- 1 | 2 | import sqlite3 3 | import threading 4 | import tempfile 5 | 6 | import pytest 7 | 8 | import wn 9 | from wn import lmf 10 | 11 | 12 | @pytest.mark.usefixtures('mini_db') 13 | def test_schema_compatibility(): 14 | conn = sqlite3.connect(str(wn.config.database_path)) 15 | schema_hash = wn._db.schema_hash(conn) 16 | assert schema_hash in wn._db.COMPATIBLE_SCHEMA_HASHES 17 | 18 | 19 | @pytest.mark.usefixtures('mini_db') 20 | def test_db_multithreading(): 21 | """ 22 | See https://github.com/goodmami/wn/issues/86 23 | Thanks: @fushinari 24 | """ 25 | 26 | class WNThread: 27 | w = None 28 | 29 | def __init__(self): 30 | w_thread = threading.Thread(target=self.set_w) 31 | w_thread.start() 32 | w_thread.join() 33 | self.w.synsets() 34 | 35 | def set_w(self): 36 | if self.w is None: 37 | self.w = wn.Wordnet() 38 | 39 | # close the connections by resetting the pool 40 | wn._db.pool = {} 41 | with pytest.raises(sqlite3.ProgrammingError): 42 | WNThread() 43 | wn._db.pool = {} 44 | wn.config.allow_multithreading = True 45 | WNThread() # no error 46 | wn.config.allow_multithreading = False 47 | wn._db.pool = {} 48 | 49 | 50 | def test_remove_extension(datadir): 51 | with tempfile.TemporaryDirectory('wn_data_1_1_trigger') as dir: 52 | old_data_dir = wn.config.data_directory 53 | wn.config.data_directory = dir 54 | wn.add(datadir / 'mini-lmf-1.0.xml') 55 | wn.add(datadir / 'mini-lmf-1.1.xml') 56 | assert len(wn.lexicons()) == 4 57 | wn.remove('test-en-ext') 58 | assert len(wn.lexicons()) == 3 59 | wn.remove('test-ja') 60 | assert len(wn.lexicons()) == 2 61 | wn.add(datadir / 'mini-lmf-1.1.xml') 62 | assert len(wn.lexicons()) == 4 63 | wn.remove('test-en') 64 | assert {lex.id for lex in wn.lexicons()} == {'test-es', 'test-ja'} 65 | wn.config.data_directory = old_data_dir 66 | # close any open DB connections before teardown 67 | for conn in wn._db.pool.values(): 68 | conn.close() 69 | 70 | 71 | def test_add_lexical_resource(datadir): 72 | with tempfile.TemporaryDirectory('wn_data_add_lexical_resource') as dir: 73 | old_data_dir = wn.config.data_directory 74 | wn.config.data_directory = dir 75 | wn.add_lexical_resource(lmf.load(datadir / 'mini-lmf-1.0.xml')) 76 | assert len(wn.lexicons()) == 2 77 | wn.add_lexical_resource(lmf.load(datadir / 'mini-lmf-1.1.xml')) 78 | assert len(wn.lexicons()) == 4 79 | wn.config.data_directory = old_data_dir 80 | # close any open DB connections 
before teardown 81 | for conn in wn._db.pool.values(): 82 | conn.close() 83 | 84 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | dynamic = ['version'] 7 | 8 | name = "wn" 9 | description = "Wordnet interface library" 10 | readme = "README.md" 11 | requires-python = ">=3.9" 12 | license = {file = "LICENSE"} 13 | keywords = ["wordnet", "interlingual", "linguistics", "language", "library"] 14 | authors = [ 15 | {name = "Michael Wayne Goodman", email = "goodman.m.w@gmail.com"} 16 | ] 17 | classifiers = [ 18 | "Development Status :: 4 - Beta", 19 | "Environment :: Console", 20 | "Intended Audience :: Developers", 21 | "Intended Audience :: Information Technology", 22 | "Intended Audience :: Science/Research", 23 | "License :: OSI Approved :: MIT License", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.9", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Programming Language :: Python :: 3.12", 29 | "Programming Language :: Python :: 3.13", 30 | "Topic :: Scientific/Engineering :: Information Analysis", 31 | "Topic :: Software Development :: Libraries :: Python Modules", 32 | "Topic :: Text Processing :: Linguistic", 33 | ] 34 | 35 | dependencies = [ 36 | "httpx", 37 | "tomli", 38 | ] 39 | 40 | [project.optional-dependencies] 41 | web = [ 42 | "starlette", 43 | ] 44 | editor = [ 45 | "wn-editor" 46 | ] 47 | 48 | [project.urls] 49 | homepage = "https://github.com/goodmami/wn" 50 | documentation = "https://wn.readthedocs.io" 51 | changelog = "https://github.com/goodmami/wn/blob/main/CHANGELOG.md" 52 | 53 | [tool.hatch.version] 54 | path = "wn/__init__.py" 55 | 56 | [tool.hatch.build.targets.sdist] 57 | exclude = [ 58 | "/.github", 59 | ] 60 | 61 | [tool.hatch.envs.hatch-test] 62 | extra-dependencies = [ 63 | "pytest-benchmark", 64 | ] 65 | features = ["web"] 66 | 67 | [tool.hatch.envs.mypy] 68 | dependencies = [ 69 | "mypy", 70 | ] 71 | 72 | [tool.hatch.envs.mypy.scripts] 73 | check = "mypy wn/" 74 | 75 | [tool.hatch.envs.docs] 76 | dependencies = [ 77 | "wn[web]", 78 | "furo", 79 | "sphinx", 80 | "sphinx-copybutton", 81 | "sphinx-autobuild", 82 | ] 83 | 84 | [tool.hatch.envs.docs.scripts] 85 | build = "sphinx-build -M html docs docs/_build" 86 | clean = "sphinx-build -M clean docs docs/_build" 87 | watch = "sphinx-autobuild docs docs/_build/html" 88 | 89 | [tool.ruff] 90 | target-version = "py39" 91 | line-length = 88 92 | 93 | [tool.ruff.lint] 94 | select = [ 95 | "B", # flake8-bugbear 96 | "C90", # McCabe cyclomatic complexity 97 | "E", # pycodestyle 98 | "F", # Pyflakes 99 | "W", # pycodestyle 100 | ] 101 | 102 | [tool.ruff.lint.per-file-ignores] 103 | "docs/conf.py" = ["E402"] 104 | 105 | [tool.ruff.format] 106 | quote-style = "single" 107 | -------------------------------------------------------------------------------- /tests/data/mini-lmf-1.4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 
-------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | 2 | import lzma 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | import wn 9 | 10 | 11 | @pytest.fixture(scope='session') 12 | def datadir(): 13 | return Path(__file__).parent / 'data' 14 | 15 | 16 | @pytest.fixture 17 | def uninitialized_datadir(monkeypatch, tmp_path: Path): 18 | with monkeypatch.context() as m: 19 | m.setattr(wn.config, 'data_directory', tmp_path / 'uninitialized_datadir') 20 | yield 21 | 22 | 23 | @pytest.fixture(scope='session') 24 | def empty_db(): 25 | with tempfile.TemporaryDirectory('wn_data_empty') as dir: 26 | with pytest.MonkeyPatch.context() as m: 27 | m.setattr(wn.config, 'data_directory', dir) 28 | yield 29 | 30 | 31 | # We want to build these DBs once per session, but connections 32 | # are created once for every test. 33 | 34 | @pytest.fixture(scope='session') 35 | def mini_db_dir(datadir): 36 | with tempfile.TemporaryDirectory('wn_data_mini') as dir: 37 | with pytest.MonkeyPatch.context() as m: 38 | m.setattr(wn.config, 'data_directory', dir) 39 | wn.add(datadir / 'mini-lmf-1.0.xml') 40 | wn._db.clear_connections() 41 | 42 | yield Path(dir) 43 | 44 | 45 | @pytest.fixture 46 | def mini_lmf_compressed(datadir): 47 | data = (datadir / 'mini-lmf-1.0.xml').read_bytes() 48 | with tempfile.NamedTemporaryFile(suffix='.xml.xz', delete=False) as file: 49 | path = Path(file.name) 50 | # Windows cannot reliably reopen file until it's closed 51 | with lzma.open(path, "w") as f: 52 | f.write(data) 53 | try: 54 | yield Path(file.name) 55 | finally: 56 | Path(file.name).unlink() 57 | 58 | 59 | @pytest.fixture(scope='session') 60 | def mini_db_1_1_dir(datadir): 61 | with tempfile.TemporaryDirectory('wn_data_mini_1_1') as dir: 62 | with pytest.MonkeyPatch.context() as m: 63 | m.setattr(wn.config, 'data_directory', dir) 64 | wn.add(datadir / 'mini-lmf-1.0.xml') 65 | wn.add(datadir / 'mini-lmf-1.1.xml') 66 | wn._db.clear_connections() 67 | 68 | yield Path(dir) 69 | 70 | 71 | @pytest.fixture(scope='session') 72 | def mini_db_1_4_dir(datadir): 73 | with tempfile.TemporaryDirectory('wn_data_mini_1_4') as dir: 74 | with pytest.MonkeyPatch.context() as m: 75 | m.setattr(wn.config, 'data_directory', dir) 76 | wn.add(datadir / 'mini-lmf-1.4.xml') 77 | wn._db.clear_connections() 78 | 79 | yield Path(dir) 80 | 81 | 82 | @pytest.fixture 83 | def mini_db(monkeypatch, mini_db_dir): 84 | with monkeypatch.context() as m: 85 | m.setattr(wn.config, 'data_directory', mini_db_dir) 86 | yield 87 | wn._db.clear_connections() 88 | 89 | 90 | @pytest.fixture 91 | def mini_db_1_1(monkeypatch, mini_db_1_1_dir): 92 | with monkeypatch.context() as m: 93 | m.setattr(wn.config, 'data_directory', mini_db_1_1_dir) 94 | yield 95 | wn._db.clear_connections() 96 | 97 | 98 | @pytest.fixture 99 | def mini_db_1_4(monkeypatch, mini_db_1_4_dir): 100 | with monkeypatch.context() as m: 101 | m.setattr(wn.config, 'data_directory', mini_db_1_4_dir) 102 | yield 103 | wn._db.clear_connections() 104 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Wn 2 | 3 | Thanks for helping to make Wn better! 
4 | 5 | **Quick Links:** 6 | 7 | - [Report a bug or request a feature](https://github.com/goodmami/wn/issues/new) 8 | - [Ask a question](https://github.com/goodmami/wn/discussions) 9 | - [View documentation](https://wn.readthedocs.io/) 10 | 11 | **Developer Information:** 12 | 13 | - Versioning scheme: [Semantic Versioning](https://semver.org/) 14 | - Branching scheme: [GitHub Flow](https://guides.github.com/introduction/flow/) 15 | - Changelog: [keep a changelog](https://keepachangelog.com/en/1.0.0/) 16 | - Documentation framework: [Sphinx](https://www.sphinx-doc.org/) 17 | - Docstring style: [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) (via [sphinx.ext.napoleon](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html)) 18 | - Unit/regression testing: [pytest](https://pytest.org/) 19 | - Benchmarking: [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) 20 | - Packaging framework: [Hatch](https://hatch.pypa.io/) 21 | - Coding style: [PEP-8](https://www.python.org/dev/peps/pep-0008/) (via [Ruff](https://beta.ruff.rs/docs/)) 22 | - Type checking: [Mypy](http://mypy-lang.org/) 23 | 24 | 25 | ## Get Help 26 | 27 | Confused about wordnets in general? See the [Global Wordnet 28 | Association Documentation](https://globalwordnet.github.io/gwadoc/) 29 | 30 | Confused about using Wn or wish to share some tips? [Start a 31 | discussion](https://github.com/goodmami/wn/discussions) 32 | 33 | Encountering a problem with Wn or wish to propose a new feature? [Raise an 34 | issue](https://github.com/goodmami/wn/issues/new) 35 | 36 | 37 | ## Report a Bug 38 | 39 | When reporting a bug, please provide enough information for someone to 40 | reproduce the problem. This might include the version of Python you're 41 | running, the version of Wn you have installed, the wordnet lexicons 42 | you have installed, and possibly the platform (Linux, Windows, macOS) 43 | you're on. Please give a minimal working example that illustrates the 44 | problem. For example: 45 | 46 | > I'm using Wn 0.9.5 with Python 3.11 on Linux and [description of 47 | > problem...]. Here's what I have tried: 48 | > 49 | > ```pycon 50 | > >>> import wn 51 | > >>> # some code 52 | > ... # some result or error 53 | > ``` 54 | 55 | 56 | ## Request a Feature 57 | 58 | If there's a feature that you think would make a good addition to Wn, 59 | raise an issue describing what the feature is and what problems it 60 | would address. 61 | 62 | ## Guidelines for Contributing 63 | 64 | See the "developer information" above for a brief description of 65 | guidelines and conventions used in Wn. If you have a fix, please 66 | submit a pull request to the `main` branch. In general, every pull 67 | request should have an associated issue. 68 | 69 | Developers should run and test Wn locally from source using 70 | [Hatch](https://hatch.pypa.io/). 
Hatch may be installed 71 | system-wide or within a virtual environment: 72 | 73 | ```bash 74 | $ pip install hatch 75 | ``` 76 | 77 | You can then use the `hatch` commands like the following: 78 | 79 | ```console 80 | $ hatch shell # activate a Wn virtual environment 81 | $ hatch fmt --check # lint the code and check code style 82 | $ hatch run mypy:check # type check with mypy 83 | $ hatch test # run unit tests 84 | $ hatch test bench # run benchmarks 85 | $ hatch build # build a source distribution and wheel 86 | $ hatch publish # publish build artifacts to PyPI 87 | ``` 88 | -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ====================== 3 | 4 | Some of Wn's functionality is exposed via the command line. 5 | 6 | Global Options 7 | -------------- 8 | 9 | .. option:: -d DIR, --dir DIR 10 | 11 | Change to use ``DIR`` as the data directory prior to invoking any 12 | commands. 13 | 14 | 15 | Subcommands 16 | ----------- 17 | 18 | download 19 | -------- 20 | 21 | Download and add projects to the database given one or more project 22 | specifiers or URLs. 23 | 24 | .. code-block:: console 25 | 26 | $ python -m wn download oewn:2021 omw:1.4 cili 27 | $ python -m wn download https://en-word.net/static/english-wordnet-2021.xml.gz 28 | 29 | .. option:: --index FILE 30 | 31 | Use the index at ``FILE`` to resolve project specifiers. 32 | 33 | .. code-block:: console 34 | 35 | $ python -m wn download --index my-index.toml mywn 36 | 37 | .. option:: --no-add 38 | 39 | Download and cache the remote file, but don't add it to the 40 | database. 41 | 42 | 43 | lexicons 44 | -------- 45 | 46 | The ``lexicons`` subcommand lets you quickly see what is installed: 47 | 48 | .. code-block:: console 49 | 50 | $ python -m wn lexicons 51 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 52 | omw-sk 1.4 [sk] Slovak WordNet 53 | omw-pl 1.4 [pl] plWordNet 54 | omw-is 1.4 [is] IceWordNet 55 | omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian) 56 | omw-sl 1.4 [sl] sloWNet 57 | omw-ja 1.4 [ja] Japanese Wordnet 58 | ... 59 | 60 | .. option:: -l LG, --lang LG 61 | .. option:: --lexicon SPEC 62 | 63 | The ``--lang`` or ``--lexicon`` option can help you narrow down 64 | the results: 65 | 66 | .. code-block:: console 67 | 68 | $ python -m wn lexicons --lang en 69 | oewn 2021 [en] Open English WordNet 70 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 71 | $ python -m wn lexicons --lexicon "omw-*" 72 | omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 73 | omw-sk 1.4 [sk] Slovak WordNet 74 | omw-pl 1.4 [pl] plWordNet 75 | omw-is 1.4 [is] IceWordNet 76 | omw-zsm 1.4 [zsm] Wordnet Bahasa (Malaysian) 77 | 78 | 79 | projects 80 | -------- 81 | 82 | The ``projects`` subcommand lists all known projects in Wn's 83 | index. This is helpful to see what is available for downloading. 84 | 85 | .. 
code-block:: 86 | 87 | $ python -m wn projects 88 | ic cili 1.0 [---] Collaborative Interlingual Index 89 | ic oewn 2024 [en] Open English WordNet 90 | ic oewn 2023 [en] Open English WordNet 91 | ic oewn 2022 [en] Open English WordNet 92 | ic oewn 2021 [en] Open English WordNet 93 | ic ewn 2020 [en] Open English WordNet 94 | ic ewn 2019 [en] Open English WordNet 95 | i- odenet 1.4 [de] Open German WordNet 96 | ic odenet 1.3 [de] Open German WordNet 97 | ic omw 1.4 [mul] Open Multilingual Wordnet 98 | ic omw-en 1.4 [en] OMW English Wordnet based on WordNet 3.0 99 | ... 100 | 101 | 102 | validate 103 | -------- 104 | 105 | Given a path to a WN-LMF XML file, check the file for structural 106 | problems and print a report. 107 | 108 | .. code-block:: 109 | 110 | $ python -m wn validate english-wordnet-2021.xml 111 | 112 | .. option:: --select CHECKS 113 | 114 | Run the checks with the given comma-separated list of check codes 115 | or categories. 116 | 117 | .. code-block:: 118 | 119 | $ python -m wn validate --select E W201 W204 deWordNet.xml 120 | 121 | .. option:: --output-file FILE 122 | 123 | Write the report to FILE as a JSON object instead of printing the 124 | report to stdout. 125 | -------------------------------------------------------------------------------- /.github/workflows/publish-docker.yaml: -------------------------------------------------------------------------------- 1 | # Adapted from https://docs.github.com/en/actions/tutorials/publishing-packages/publishing-docker-images 2 | name: Publish a Docker image 3 | 4 | # Configures this workflow to run every time a new release is created in the repository. 5 | on: 6 | release: 7 | types: [ created ] 8 | 9 | # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. 15 | jobs: 16 | build-and-push-image: 17 | runs-on: ubuntu-latest 18 | # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 19 | permissions: 20 | contents: read 21 | packages: write 22 | attestations: write 23 | id-token: write 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v4 28 | # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. 29 | - name: Log in to the Container registry 30 | uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 31 | with: 32 | registry: ${{ env.REGISTRY }} 33 | username: ${{ github.actor }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. 36 | - name: Extract metadata (tags, labels) for Docker 37 | id: meta 38 | uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 39 | with: 40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 41 | # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. 
If the build succeeds, it pushes the image to GitHub Packages. 42 |       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see [Usage](https://github.com/docker/build-push-action#usage) in the README of the `docker/build-push-action` repository. 43 |       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. 44 |       - name: Build and push Docker image 45 |         id: push 46 |         uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 47 |         with: 48 |           context: . 49 |           push: true 50 |           tags: ${{ steps.meta.outputs.tags }} 51 |           labels: ${{ steps.meta.outputs.labels }} 52 | 53 |       # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [Using artifact attestations to establish provenance for builds](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds). 54 |       - name: Generate artifact attestation 55 |         uses: actions/attest-build-provenance@v2 56 |         with: 57 |           subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} 58 |           subject-digest: ${{ steps.push.outputs.digest }} 59 |           push-to-registry: true 60 | 61 | -------------------------------------------------------------------------------- /docs/api/wn.morphy.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.morphy 3 | ========= 4 | 5 | .. automodule:: wn.morphy 6 | 7 | .. seealso:: 8 | 9 |    The Princeton WordNet `documentation 10 |    `_ describes 11 |    the original implementation of Morphy. 12 | 13 |    The :doc:`../guides/lemmatization` guide describes how Wn handles 14 |    lemmatization in general. 15 | 16 | 17 | Initialized and Uninitialized Morphy 18 | ------------------------------------ 19 | 20 | There are two ways of using Morphy in Wn: initialized and 21 | uninitialized. 22 | 23 | Uninitialized Morphy is a simple callable that returns lemma 24 | *candidates* for some given wordform. That is, the results might not 25 | be valid lemmas, but this is not a problem in practice because 26 | subsequent queries against the database will filter out the invalid 27 | ones. This callable is obtained by creating a :class:`Morphy` object 28 | with no arguments: 29 | 30 | >>> from wn import morphy 31 | >>> m = morphy.Morphy() 32 | 33 | As an uninitialized Morphy cannot predict which lemmas in the result 34 | are valid, it always returns the original form and any transformations 35 | it can find for each part of speech: 36 | 37 | >>> m('lemmata', pos='n')  # exceptional form 38 | {'n': {'lemmata'}} 39 | >>> m('lemmas', pos='n')  # regular morphology with part-of-speech 40 | {'n': {'lemma', 'lemmas'}} 41 | >>> m('lemmas')  # regular morphology for any part-of-speech 42 | {None: {'lemmas'}, 'n': {'lemma'}, 'v': {'lemma'}} 43 | >>> m('wolves')  # invalid forms may be returned 44 | {None: {'wolves'}, 'n': {'wolf', 'wolve'}, 'v': {'wolve', 'wolv'}} 45 | 46 | 47 | This lemmatizer can also be used with a :class:`wn.Wordnet` object to 48 | expand queries: 49 | 50 | >>> import wn 51 | >>> ewn = wn.Wordnet('ewn:2020') 52 | >>> ewn.words('lemmas') 53 | [] 54 | >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=morphy.Morphy()) 55 | >>> ewn.words('lemmas') 56 | [Word('ewn-lemma-n')] 57 | 58 | An initialized Morphy is created with a :class:`wn.Wordnet` object as 59 | its argument. 
It then uses the wordnet to build lists of valid lemmas 60 | and exceptional forms (this takes a few seconds). Once this is done, 61 | it will only return lemmas it knows about: 62 | 63 | >>> ewn = wn.Wordnet('ewn:2020') 64 | >>> m = morphy.Morphy(ewn) 65 | >>> m('lemmata', pos='n') # exceptional form 66 | {'n': {'lemma'}} 67 | >>> m('lemmas', pos='n') # regular morphology with part-of-speech 68 | {'n': {'lemma'}} 69 | >>> m('lemmas') # regular morphology for any part-of-speech 70 | {'n': {'lemma'}} 71 | >>> m('wolves') # invalid forms are pre-filtered 72 | {'n': {'wolf'}} 73 | 74 | In order to use an initialized Morphy lemmatizer with a 75 | :class:`wn.Wordnet` object, it must be assigned to the object after 76 | creation: 77 | 78 | >>> ewn = wn.Wordnet('ewn:2020') # default: lemmatizer=None 79 | >>> ewn.words('lemmas') 80 | [] 81 | >>> ewn.lemmatizer = morphy.Morphy(ewn) 82 | >>> ewn.words('lemmas') 83 | [Word('ewn-lemma-n')] 84 | 85 | There is little to no difference in the results obtained from a 86 | :class:`wn.Wordnet` object using an initialized or uninitialized 87 | :class:`Morphy` object, but there may be slightly different 88 | performance profiles for future queries. 89 | 90 | 91 | Default Morphy Lemmatizer 92 | ------------------------- 93 | 94 | As a convenience, an uninitialized Morphy lemmatizer is provided in 95 | this module via the :data:`morphy` member. 96 | 97 | .. data:: morphy 98 | 99 | A :class:`Morphy` object created without a :class:`wn.Wordnet` 100 | object. 101 | 102 | 103 | The Morphy Class 104 | ---------------- 105 | 106 | .. autoclass:: Morphy 107 | -------------------------------------------------------------------------------- /tests/compat_sensekey_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import wn 4 | from wn.compat import sensekey 5 | 6 | 7 | def test_unescape_oewn_sense_key(): 8 | 9 | def unescape(s: str) -> str: 10 | return sensekey.unescape(s, flavor="oewn") 11 | 12 | assert unescape("") == "" 13 | assert unescape("abc") == "abc" 14 | assert unescape(".") == "." # only becomes : in second part of key 15 | # escape patterns 16 | assert unescape("-ap-") == "'" 17 | assert unescape("-ex-") == "!" 18 | assert unescape("-cm-") == "," 19 | assert unescape("-cn-") == ":" 20 | assert unescape("-pl-") == "+" 21 | assert unescape("-sl-") == "/" 22 | # adjacent escapes need their own dashes 23 | assert unescape("-ap-ex-") == "'ex-" 24 | assert unescape("-ap--ex-") == "'!" 25 | # invalid escapes are unchanged 26 | assert unescape("-foo-") == "-foo-" # not an escape sequence 27 | assert unescape("-sp-") == "-sp-" # not valid in lemma portion 28 | assert unescape("ap-") == "ap-" # no preceding dash 29 | assert unescape("-ap") == "-ap" # no trailing dash 30 | assert unescape("-AP-") == "-AP-" # case sensitivity 31 | # idempotency 32 | assert unescape(unescape("-ap--ex--cm-")) == unescape("-ap--ex--cm-") 33 | # full key, second part escapes differently 34 | assert unescape("abc__1.23.00..") == "abc%1:23:00::" 35 | assert unescape("abc__1.23.00.foo-sp-bar.") == "abc%1:23:00:foo_bar:" 36 | assert unescape("abc__1.23.00.foo-ap-bar.") == "abc%1:23:00:foo-ap-bar:" 37 | 38 | 39 | def test_escape_oewn_sense_key(): 40 | 41 | def escape(s: str) -> str: 42 | return sensekey.escape(s, flavor="oewn") 43 | 44 | assert escape("") == "" 45 | assert escape("abc") == "abc" 46 | assert escape(".") == "." 
# only becomes : in second part of key 47 | # escape patterns 48 | assert escape("'") == "-ap-" 49 | assert escape("!") == "-ex-" 50 | assert escape(",") == "-cm-" 51 | assert escape(":") == "-cn-" 52 | assert escape("+") == "-pl-" 53 | assert escape("/") == "-sl-" 54 | # adjacent escapes need their own dashes 55 | assert escape("'!") == "-ap--ex-" 56 | # idempotency 57 | assert escape(escape("'!,")) == escape("'!,") 58 | # full key, second part escapes differently 59 | assert escape("abc%1:23:00::") == "abc__1.23.00.." 60 | assert escape("abc%1:23:00:foo_bar:") == "abc__1.23.00.foo-sp-bar." 61 | assert escape("abc%1:23:00:foo'bar:") == "abc__1.23.00.foo'bar." 62 | 63 | 64 | @pytest.mark.usefixtures("uninitialized_datadir") 65 | def test_sense_key_getter(datadir): 66 | wn.add(datadir / "sense-key-variations.xml") 67 | 68 | get_omw_sense_key = sensekey.sense_key_getter("omw-en:1.4") 69 | get_oewn_sense_key = sensekey.sense_key_getter("oewn:2024") 70 | 71 | omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4") 72 | oewn_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024") 73 | 74 | assert get_omw_sense_key(omw_sense) == "'s_gravenhage%1:15:00::" 75 | assert get_omw_sense_key(oewn_sense) is None 76 | 77 | assert get_oewn_sense_key(omw_sense) is None 78 | assert get_oewn_sense_key(oewn_sense) == "'s_gravenhage%1:15:00::" 79 | 80 | 81 | @pytest.mark.usefixtures("uninitialized_datadir") 82 | def test_sense_getter(datadir): 83 | wn.add(datadir / "sense-key-variations.xml") 84 | 85 | get_omw_sense = sensekey.sense_getter("omw-en:1.4") 86 | get_oewn_sense = sensekey.sense_getter("oewn:2024") 87 | 88 | omw_sense = wn.sense("omw-en--apos-s_Gravenhage-08950407-n", lexicon="omw-en:1.4") 89 | oewn_sense = wn.sense("oewn--ap-s_gravenhage__1.15.00..", lexicon="oewn:2024") 90 | 91 | assert get_omw_sense("'s_gravenhage%1:15:00::") == omw_sense 92 | assert get_oewn_sense("'s_gravenhage%1:15:00::") == oewn_sense 93 | -------------------------------------------------------------------------------- /docs/setup.rst: -------------------------------------------------------------------------------- 1 | Installation and Configuration 2 | ============================== 3 | 4 | .. seealso:: 5 | 6 | This guide is for installing and configuring the Wn software. For 7 | adding lexicons to the database, see :doc:`guides/lexicons`. 8 | 9 | 10 | Installing from PyPI 11 | -------------------- 12 | 13 | Install the latest release from `PyPI `_: 14 | 15 | .. code-block:: bash 16 | 17 | pip install wn 18 | 19 | To get the dependencies for the :mod:`wn.web` module, use the ``web`` 20 | installation extra: 21 | 22 | .. code-block:: bash 23 | 24 | pip install "wn[web]" 25 | 26 | 27 | Installing with Conda 28 | --------------------- 29 | 30 | Alternatively, if you use the `Anaconda ` 31 | distribution of Python, you can install with conda: 32 | 33 | .. code-block:: bash 34 | 35 | conda install -c conda-forge wn 36 | 37 | 38 | The Data Directory 39 | ------------------ 40 | 41 | By default, Wn stores its data (such as downloaded LMF files and the 42 | database file) in a ``.wn_data/`` directory under the user's home 43 | directory. This directory can be changed (see `Configuration`_ 44 | below). Whenever Wn attempts to download a resource or access its 45 | database, it will check for the existence of, and create if necessary, 46 | this directory, the ``.wn_data/downloads/`` subdirectory, and the 47 | ``.wn_data/wn.db`` database file. 
The file system will look like 48 | this:: 49 | 50 | .wn_data/ 51 | ├── downloads 52 | │   ├── ... 53 | │   └── ... 54 | └── wn.db 55 | 56 | The ``...`` entries in the ``downloads/`` subdirectory represent the 57 | files of resources downloaded from the web. Their filename is a hash 58 | of the URL so that Wn can avoid downloading the same file twice. 59 | 60 | 61 | Configuration 62 | ------------- 63 | 64 | The :py:data:`wn.config` object contains the paths Wn uses for local 65 | storage and information about resources available on the web. To 66 | change the directory Wn uses for storing data locally, modify the 67 | :python:`wn.config.data_directory` member: 68 | 69 | .. code-block:: python 70 | 71 | import wn 72 | wn.config.data_directory = '~/Projects/wn_data' 73 | 74 | There are some things to note: 75 | 76 | - The downloads directory and database path are always relative to the 77 | data directory and cannot be changed directly. 78 | - This change only affects subsequent operations, so any data in the 79 | previous location will not be moved nor deleted. 80 | - This change only affects the current session. If you want a script 81 | or application to always use the new location, it must reset the 82 | data directory each time it is initialized. 83 | 84 | You can also add project information for remote resources. First you 85 | add a project, with a project ID, full name, and language code. Then 86 | you create one or more versions for that project with a version ID, 87 | resource URL, and license information. This may be done either through 88 | the :py:data:`wn.config` object's 89 | :py:meth:`~wn._config.WNConfig.add_project` and 90 | :py:meth:`~wn._config.WNConfig.add_project_version` methods, or loaded 91 | from a TOML_ file via the :py:data:`wn.config` object's 92 | :py:meth:`~wn._config.WNConfig.load_index` method. 93 | 94 | .. _TOML: https://toml.io 95 | 96 | .. code-block:: python 97 | 98 | wn.config.add_project('ewn', 'English WordNet', 'en') 99 | wn.config.add_project_version( 100 | 'ewn', '2020', 101 | 'https://en-word.net/static/english-wordnet-2020.xml.gz', 102 | 'https://creativecommons.org/licenses/by/4.0/', 103 | ) 104 | 105 | 106 | Installing From Source 107 | ---------------------- 108 | 109 | If you wish to install the code from the source repository (e.g., to 110 | get an unreleased feature or to contribute toward Wn's development), 111 | clone the repository and use `Hatch `_ to 112 | start a virtual environment with Wn installed: 113 | 114 | .. 
code-block:: console 115 | 116 | $ git clone https://github.com/goodmami/wn.git 117 | $ cd wn 118 | $ hatch shell 119 | -------------------------------------------------------------------------------- /tests/wordnet_test.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import warnings 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | import wn 8 | 9 | 10 | @pytest.mark.usefixtures('mini_db_1_1') 11 | def test_wordnet_lexicons(): 12 | en = wn.Wordnet('test-en') 13 | assert len(en.lexicons()) == 1 14 | assert len(en.expanded_lexicons()) == 0 15 | 16 | en1 = wn.Wordnet('test-en:1') 17 | assert en.lexicons() == en1.lexicons() 18 | assert en.expanded_lexicons() == en1.expanded_lexicons() 19 | 20 | en2 = wn.Wordnet(lang='en') 21 | assert len(en2.lexicons()) == 2 22 | assert len(en2.expanded_lexicons()) == 0 23 | 24 | es = wn.Wordnet('test-es') 25 | assert len(es.lexicons()) == 1 26 | assert len(es.expanded_lexicons()) == 0 27 | 28 | es2 = wn.Wordnet('test-es', expand='test-en') 29 | assert len(es2.lexicons()) == 1 30 | assert len(es2.expanded_lexicons()) == 1 31 | 32 | ja = wn.Wordnet('test-ja') 33 | assert len(ja.lexicons()) == 1 34 | assert len(ja.expanded_lexicons()) == 1 35 | 36 | ja2 = wn.Wordnet('test-ja', expand='') 37 | assert len(ja2.lexicons()) == 1 38 | assert len(ja2.expanded_lexicons()) == 0 39 | 40 | 41 | @pytest.mark.usefixtures('mini_db') 42 | def test_wordnet_normalize(): 43 | es = wn.Wordnet('test-es') 44 | assert es.words('Informacion') == es.words('información') 45 | assert es.words('ínfórmácíón') == es.words('información') 46 | es = wn.Wordnet('test-es', normalizer=None) 47 | assert es.words('informacion') == [] 48 | assert es.words('Información') == [] 49 | 50 | # The following doesn't necessarily work because any non-None 51 | # normalizer causes the normalized form column to be tested with 52 | # the original form 53 | # es = wn.Wordnet('test-es', normalizer=str.lower) 54 | # assert es.words('informacion') == [] 55 | # assert es.words('Información') == es.words('información') 56 | 57 | 58 | @pytest.mark.usefixtures('mini_db') 59 | def test_wordnet_lemmatize(): 60 | # default lemmatizer compares alternative forms 61 | en = wn.Wordnet('test-en') 62 | assert en.words('examples') == [] 63 | assert en.words('exemplifying') == en.words('exemplify') 64 | assert en.words('data') == en.words('datum') 65 | 66 | en = wn.Wordnet('test-en', search_all_forms=False) 67 | assert en.words('examples') == [] 68 | assert en.words('exemplifying') == [] 69 | assert en.words('data') == [] 70 | 71 | def morphy_lite(form, pos): 72 | result = {pos: {form}} 73 | if pos in ('n', None) and form.endswith('s'): 74 | result.setdefault('n', set()).add(form[:-1]) 75 | return result 76 | 77 | en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=False) 78 | assert en.words('examples', pos='n') == en.words('example') 79 | assert en.words('examples') == en.words('example') 80 | assert en.words('exemplifying') == [] 81 | assert en.words('data') == [] 82 | 83 | en = wn.Wordnet('test-en', lemmatizer=morphy_lite, search_all_forms=True) 84 | assert en.words('data') == en.words('datum') 85 | assert en.words('exemplifying') == en.words('exemplify') 86 | 87 | 88 | def test_portable_entities_issue_226(monkeypatch, datadir): 89 | # instead use ignore_cleanup_errors=True from Python 3.10 90 | tempdir = tempfile.TemporaryDirectory('wn_issue_226') 91 | with tempdir as dir: 92 | with monkeypatch.context() as m: 93 | m.setattr(wn.config, 
'data_directory', Path(dir)) 94 | wn.add(datadir / 'mini-lmf-1.0.xml') 95 | en = wn.Wordnet('test-en') 96 | info1 = en.synsets('information')[0] 97 | wn.remove('test-en') 98 | wn.add(datadir / 'mini-lmf-1.0.xml') 99 | info2 = en.synsets('information')[0] # en Wordnet object still works 100 | assert info1 == info2 # synsets are equivalent 101 | wn._db.clear_connections() 102 | # Not needed if ignore_cleanup_errors=True and delete=True above 103 | try: 104 | tempdir.cleanup() 105 | except PermissionError: 106 | warnings.warn( 107 | f"Failed to clean up temporary directory {dir!s}", 108 | stacklevel=1, 109 | ) 110 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'wn' 21 | copyright = '2020, Michael Wayne Goodman' 22 | author = 'Michael Wayne Goodman' 23 | 24 | import wn 25 | 26 | # The short X.Y version 27 | version = '.'.join(wn.__version__.split('.')[:2]) 28 | # The full version, including alpha/beta/rc tags 29 | release = wn.__version__ 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.coverage', 40 | # 'sphinx.ext.viewcode', 41 | 'sphinx.ext.githubpages', 42 | 'sphinx.ext.napoleon', 43 | "sphinx_copybutton", 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 47 | templates_path = ['_templates'] 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | # This pattern also affects html_static_path and html_extra_path. 52 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 53 | 54 | # Global definitions 55 | rst_prolog = """ 56 | .. role:: python(code) 57 | :language: python 58 | :class: highlight 59 | """ 60 | 61 | # smartquotes = False 62 | smartquotes_action = 'De' # D = en- and em-dash; e = ellipsis 63 | 64 | # -- Options for HTML output ------------------------------------------------- 65 | 66 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 67 | # a list of builtin themes.# 68 | 69 | html_theme = "furo" 70 | html_theme_options = { 71 | "light_css_variables": { 72 | "color-brand-primary": "#006699", 73 | "color-brand-content": "#006699", 74 | # "color-background": "#f0f0f0", 75 | # "color-sidebar-background": "#ddd", 76 | }, 77 | "dark_css_variables": { 78 | "color-brand-primary": "#00CCFF", 79 | "color-brand-content": "#00CCFF", 80 | } 81 | } 82 | 83 | html_logo = "_static/wn-logo.svg" 84 | 85 | pygments_style = 'manni' 86 | pygments_dark_style = 'monokai' 87 | 88 | # Add any paths that contain custom static files (such as style sheets) here, 89 | # relative to this directory. They are copied after the builtin static files, 90 | # so a file named "default.css" will overwrite the builtin "default.css". 91 | html_static_path = ['_static'] 92 | html_css_files = [ 93 | 'css/svg.css', 94 | ] 95 | 96 | # Don't offer to show the source of the current page 97 | html_show_sourcelink = False 98 | 99 | # -- Options for autodoc extension ------------------------------------------- 100 | 101 | # autodoc_typehints = 'description' 102 | autodoc_typehints = 'signature' 103 | # autodoc_typehints = 'none' 104 | 105 | # -- Options for intersphinx extension --------------------------------------- 106 | 107 | # Example configuration for intersphinx: refer to the Python standard library. 108 | intersphinx_mapping = { 109 | 'python': ('https://docs.python.org/3', None), 110 | 'httpx': ('https://httpx.readthedocs.io/en/latest/', None), 111 | } 112 | 113 | # -- Options for sphinx_copybutton extension --------------------------------- 114 | 115 | copybutton_prompt_text = ( 116 | r">>> " # regular Python prompt 117 | r"|\.\.\. " # Python continuation prompt 118 | r"|\$ " # Basic shell 119 | r"|In \[\d*\]: " # Jupyter notebook 120 | ) 121 | copybutton_prompt_is_regexp = True 122 | -------------------------------------------------------------------------------- /tests/ic_test.py: -------------------------------------------------------------------------------- 1 | 2 | from math import log 3 | 4 | import pytest 5 | 6 | import wn 7 | from wn.constants import (NOUN, VERB, ADJ, ADV) 8 | from wn.util import synset_id_formatter 9 | import wn.ic 10 | 11 | 12 | synset_id = { 13 | 'information': 'test-en-0001-n', 14 | 'illustration_example': 'test-en-0002-n', 15 | 'sample': 'test-en-0004-n', 16 | 'random_sample': 'test-en-0005-n', 17 | 'random_sample2': 'test-en-0008-n', # no hypernyms 18 | 'datum': 'test-en-0006-n', 19 | 'illustrate_exemplify': 'test-en-0003-v', 20 | 'resignate': 'test-en-0007-v', 21 | } 22 | 23 | 24 | words = [ 25 | 'For', 'example', ':', 'random sample', '.', 26 | 'This', 'will', 'illustrate', 'and', 'exemplify', '.', 27 | 'A', 'sample', 'of', 'data', '.', 28 | ] 29 | 30 | 31 | @pytest.mark.usefixtures('mini_db') 32 | def test_compute_nodistribute_nosmoothing(): 33 | w = wn.Wordnet('test-en:1') 34 | assert wn.ic.compute(words, w, distribute_weight=False, smoothing=0) == { 35 | NOUN: { 36 | synset_id['information']: 4.0, 37 | synset_id['illustration_example']: 3.0, 38 | synset_id['sample']: 2.0, 39 | synset_id['random_sample']: 1.0, 40 | synset_id['random_sample2']: 1.0, 41 | synset_id['datum']: 1.0, 42 | None: 5.0, 43 | }, 44 | VERB: { 45 | synset_id['illustrate_exemplify']: 2.0, 46 | synset_id['resignate']: 0.0, 47 | None: 2.0, 48 | }, 49 | ADJ: {None: 0.0}, 50 | ADV: {None: 0.0}, 51 | } 52 | 53 | 54 | @pytest.mark.usefixtures('mini_db') 55 | def test_compute_nodistribute_smoothing(): 56 | w = wn.Wordnet('test-en:1') 
57 | assert wn.ic.compute(words, w, distribute_weight=False, smoothing=1.0) == { 58 | NOUN: { 59 | synset_id['information']: 5.0, 60 | synset_id['illustration_example']: 4.0, 61 | synset_id['sample']: 3.0, 62 | synset_id['random_sample']: 2.0, 63 | synset_id['random_sample2']: 2.0, 64 | synset_id['datum']: 2.0, 65 | None: 6.0, 66 | }, 67 | VERB: { 68 | synset_id['illustrate_exemplify']: 3.0, 69 | synset_id['resignate']: 1.0, 70 | None: 3.0, 71 | }, 72 | ADJ: {None: 1.0}, 73 | ADV: {None: 1.0}, 74 | } 75 | 76 | 77 | @pytest.mark.usefixtures('mini_db') 78 | def test_compute_distribute_smoothing(): 79 | w = wn.Wordnet('test-en:1') 80 | assert wn.ic.compute(words, w, distribute_weight=True, smoothing=1.0) == { 81 | NOUN: { 82 | synset_id['information']: 4.5, 83 | synset_id['illustration_example']: 3.5, 84 | synset_id['sample']: 2.5, 85 | synset_id['random_sample']: 1.5, 86 | synset_id['random_sample2']: 1.5, 87 | synset_id['datum']: 2.0, 88 | None: 5.0, 89 | }, 90 | VERB: { 91 | synset_id['illustrate_exemplify']: 3.0, 92 | synset_id['resignate']: 1.0, 93 | None: 3.0, 94 | }, 95 | ADJ: {None: 1.0}, 96 | ADV: {None: 1.0}, 97 | } 98 | 99 | 100 | @pytest.mark.usefixtures('mini_db') 101 | def test_load(tmp_path): 102 | w = wn.Wordnet('test-en:1') 103 | icpath = tmp_path / 'foo.dat' 104 | icpath.write_text( 105 | 'wnver:1234567890AbCdEf\n' 106 | '1n 4.0 ROOT\n' 107 | '2n 3.0\n' 108 | '4n 2.0\n' 109 | '5n 1.0\n' 110 | '8n 1.0 ROOT\n' 111 | '6n 1.0\n' 112 | '3v 2.0 ROOT\n' 113 | '7v 0.0 ROOT\n' 114 | ) 115 | 116 | get_synset_id = synset_id_formatter('test-en-{offset:04}-{pos}') 117 | assert (wn.ic.load(icpath, w, get_synset_id=get_synset_id) 118 | == wn.ic.compute(words, w, distribute_weight=False, smoothing=0.0)) 119 | 120 | 121 | @pytest.mark.usefixtures('mini_db') 122 | def test_information_content(): 123 | w = wn.Wordnet('test-en:1') 124 | ic = wn.ic.compute(words, w) 125 | info = w.synsets('information')[0] 126 | samp = w.synsets('sample')[0] 127 | # info is a root but not the only one, so its IC is not 0.0 128 | assert wn.ic.information_content(info, ic) == -log( 129 | ic['n'][info.id] 130 | / ic['n'][None] 131 | ) 132 | assert wn.ic.information_content(samp, ic) == -log( 133 | ic['n'][samp.id] 134 | / ic['n'][None] 135 | ) 136 | -------------------------------------------------------------------------------- /tests/web_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from starlette.testclient import TestClient 3 | 4 | import wn 5 | import wn._db 6 | from wn import web 7 | 8 | 9 | # clearing connections on teardown (see conftest.py) isn't enough. For 10 | # this we apparently need to monkeypatch the wn._db.pool as well. 
11 | 12 | @pytest.fixture 13 | def mini_db_web(monkeypatch, mini_db_dir): 14 | with monkeypatch.context() as m: 15 | m.setattr(wn._db, 'pool', {}) 16 | m.setattr(wn.config, 'data_directory', mini_db_dir) 17 | m.setattr(wn.config, 'allow_multithreading', True) 18 | yield 19 | wn._db.clear_connections() 20 | 21 | 22 | client = TestClient(web.app) 23 | 24 | 25 | @pytest.mark.usefixtures('mini_db_web') 26 | def test_root(): 27 | response = client.get('/') 28 | assert response.status_code == 404 29 | 30 | 31 | @pytest.mark.usefixtures('mini_db_web') 32 | def test_lexicons(): 33 | response = client.get("/lexicons") 34 | assert response.status_code == 200 35 | data = response.json()["data"] 36 | assert [lex["id"] for lex in data] == ["test-en:1", "test-es:1"] 37 | 38 | 39 | @pytest.mark.usefixtures('mini_db_web') 40 | def test_words(): 41 | response = client.get("/words") 42 | assert response.status_code == 200 43 | data = response.json()["data"] 44 | word_ids = {word["id"] for word in data} 45 | assert "test-en-information-n" in word_ids 46 | assert "test-es-información-n" in word_ids 47 | 48 | response = client.get("/words", params={"lexicon": "test-en:1"}) 49 | assert response.status_code == 200 50 | data = response.json()["data"] 51 | word_ids = {word["id"] for word in data} 52 | assert "test-en-information-n" in word_ids 53 | assert "test-es-información-n" not in word_ids 54 | 55 | 56 | @pytest.mark.usefixtures('mini_db_web') 57 | def test_senses(): 58 | response = client.get("/senses") 59 | assert response.status_code == 200 60 | data = response.json()["data"] 61 | sense_ids = {sense["id"] for sense in data} 62 | assert "test-en-information-n-0001-01" in sense_ids 63 | assert "test-es-información-n-0001-01" in sense_ids 64 | 65 | response = client.get("/senses", params={"lexicon": "test-en:1"}) 66 | assert response.status_code == 200 67 | data = response.json()["data"] 68 | sense_ids = {sense["id"] for sense in data} 69 | assert "test-en-information-n-0001-01" in sense_ids 70 | assert "test-es-información-n-0001-01" not in sense_ids 71 | 72 | 73 | @pytest.mark.usefixtures('mini_db_web') 74 | def test_synsets(): 75 | response = client.get("/synsets") 76 | assert response.status_code == 200 77 | data = response.json()["data"] 78 | synset_ids = {synset["id"] for synset in data} 79 | assert "test-en-0001-n" in synset_ids 80 | assert "test-es-0001-n" in synset_ids 81 | 82 | response = client.get("/synsets", params={"lexicon": "test-en:1"}) 83 | assert response.status_code == 200 84 | data = response.json()["data"] 85 | synset_ids = {synset["id"] for synset in data} 86 | assert "test-en-0001-n" in synset_ids 87 | assert "test-es-0001-n" not in synset_ids 88 | 89 | 90 | @pytest.mark.usefixtures('mini_db_web') 91 | def test_lexicon_words(): 92 | response1 = client.get("/lexicons/test-en:1/words") 93 | response2 = client.get("/words", params={"lexicon": "test-en:1"}) 94 | assert response1.status_code == 200 95 | assert response2.status_code == 200 96 | data1 = response1.json()["data"] 97 | data2 = response2.json()["data"] 98 | assert {word["id"] for word in data1} == {word["id"] for word in data2} 99 | 100 | 101 | @pytest.mark.usefixtures('mini_db_web') 102 | def test_lexicon_senses(): 103 | response1 = client.get("/lexicons/test-en:1/senses") 104 | response2 = client.get("/senses", params={"lexicon": "test-en:1"}) 105 | assert response1.status_code == 200 106 | assert response2.status_code == 200 107 | data1 = response1.json()["data"] 108 | data2 = response2.json()["data"] 109 | assert {sense["id"] 
for sense in data1} == {sense["id"] for sense in data2} 110 | 111 | 112 | @pytest.mark.usefixtures('mini_db_web') 113 | def test_lexicon_synsets(): 114 | response1 = client.get("/lexicons/test-en:1/synsets") 115 | response2 = client.get("/synsets", params={"lexicon": "test-en:1"}) 116 | assert response1.status_code == 200 117 | assert response2.status_code == 200 118 | data1 = response1.json()["data"] 119 | data2 = response2.json()["data"] 120 | assert {synset["id"] for synset in data1} == {synset["id"] for synset in data2} 121 | -------------------------------------------------------------------------------- /tests/taxonomy_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | from wn.taxonomy import ( 6 | roots, 7 | leaves, 8 | taxonomy_depth, 9 | hypernym_paths, 10 | min_depth, 11 | max_depth, 12 | shortest_path, 13 | # common_hypernyms, 14 | # lowest_common_hypernyms, 15 | ) 16 | 17 | 18 | @pytest.mark.usefixtures('mini_db') 19 | def test_roots(): 20 | en = wn.Wordnet('test-en') 21 | assert set(roots(en, pos='n')) == {en.synset('test-en-0001-n'), 22 | en.synset('test-en-0008-n')} 23 | assert set(roots(en, pos='v')) == {en.synset('test-en-0003-v'), 24 | en.synset('test-en-0007-v')} 25 | assert roots(en, pos='a') == [] 26 | assert set(roots(en)) == set(roots(en, pos='n') + roots(en, pos='v')) 27 | 28 | # with no expand relations and no relation of its own, every 29 | # synset looks like a root 30 | es = wn.Wordnet('test-es') 31 | assert set(roots(es, pos='n')) == {es.synset('test-es-0001-n'), 32 | es.synset('test-es-0002-n'), 33 | es.synset('test-es-0005-n')} 34 | 35 | es = wn.Wordnet('test-es', expand='test-en') 36 | assert roots(es, pos='n') == [es.synset('test-es-0001-n')] 37 | 38 | 39 | @pytest.mark.usefixtures('mini_db') 40 | def test_leaves(): 41 | en = wn.Wordnet('test-en') 42 | assert set(leaves(en, pos='n')) == {en.synset('test-en-0005-n'), 43 | en.synset('test-en-0006-n'), 44 | en.synset('test-en-0008-n')} 45 | assert set(leaves(en, pos='v')) == {en.synset('test-en-0003-v'), 46 | en.synset('test-en-0007-v')} 47 | 48 | 49 | @pytest.mark.usefixtures('mini_db') 50 | def test_taxonomy_depth(): 51 | en = wn.Wordnet('test-en') 52 | assert taxonomy_depth(en, pos='n') == 3 53 | assert taxonomy_depth(en, pos='v') == 0 54 | 55 | 56 | @pytest.mark.usefixtures('mini_db') 57 | def test_hypernym_paths(): 58 | information = wn.synsets('information')[0] 59 | example = wn.synsets('example')[0] 60 | sample = wn.synsets('sample')[0] 61 | random_sample = wn.synsets('random sample')[0] 62 | assert hypernym_paths(information) == [] 63 | assert hypernym_paths(example) == [[information]] 64 | assert hypernym_paths(sample) == [[example, information]] 65 | assert hypernym_paths(random_sample) == [[sample, example, information]] 66 | 67 | 68 | @pytest.mark.usefixtures('mini_db') 69 | def test_interlingual_hypernym_paths(): 70 | información = wn.synsets('información')[0] 71 | ejemplo = wn.synsets('ejemplo')[0] 72 | sample = wn.synsets('sample', lexicon='test-en:1')[0] 73 | inferred = wn.Synset.empty('*INFERRED*', ili=sample.ili.id, _lexicon='test-es:1') 74 | muestra_aleatoria = wn.synsets('muestra aleatoria')[0] 75 | assert hypernym_paths(información) == [] 76 | assert hypernym_paths(ejemplo) == [[información]] 77 | assert hypernym_paths(muestra_aleatoria) == [[inferred, ejemplo, información]] 78 | 79 | 80 | @pytest.mark.usefixtures('mini_db') 81 | def test_shortest_path(): 82 | information = wn.synsets('information')[0] 83 | 
example = wn.synsets('example')[0] 84 | sample = wn.synsets('sample')[0] 85 | random_sample = wn.synsets('random sample')[0] 86 | datum = wn.synsets('datum')[0] 87 | exemplify = wn.synsets('exemplify')[0] 88 | inferred_root = wn.Synset.empty('*ROOT*', _lexicon='test-en:1') 89 | assert shortest_path(information, information) == [] 90 | assert shortest_path(information, datum) == [datum] 91 | assert shortest_path(information, sample) == [example, sample] 92 | assert shortest_path(sample, information) == [example, information] 93 | assert shortest_path(random_sample, datum) == [sample, example, information, datum] 94 | with pytest.raises(wn.Error): 95 | shortest_path(example, exemplify) 96 | assert shortest_path(example, exemplify, simulate_root=True) == [ 97 | information, inferred_root, exemplify 98 | ] 99 | 100 | 101 | @pytest.mark.usefixtures('mini_db') 102 | def test_min_depth(): 103 | assert min_depth(wn.synsets('information')[0]) == 0 104 | assert min_depth(wn.synsets('example')[0]) == 1 105 | assert min_depth(wn.synsets('sample')[0]) == 2 106 | assert min_depth(wn.synsets('random sample')[0]) == 3 107 | 108 | 109 | @pytest.mark.usefixtures('mini_db') 110 | def test_max_depth(): 111 | assert max_depth(wn.synsets('information')[0]) == 0 112 | assert max_depth(wn.synsets('example')[0]) == 1 113 | assert max_depth(wn.synsets('sample')[0]) == 2 114 | assert max_depth(wn.synsets('random sample')[0]) == 3 115 | -------------------------------------------------------------------------------- /docs/guides/nltk-migration.rst: -------------------------------------------------------------------------------- 1 | Migrating from the NLTK 2 | ======================= 3 | 4 | This guide is for users of the `NLTK `_\ 's 5 | ``nltk.corpus.wordnet`` module who are migrating to Wn. It is not 6 | guaranteed that Wn will produce the same results as the NLTK's module, 7 | but with some care its behavior can be very similar. 8 | 9 | Overview 10 | -------- 11 | 12 | One important thing to note is that Wn will search all wordnets in the 13 | database by default where the NLTK would only search the English. 14 | 15 | >>> from nltk.corpus import wordnet as nltk_wn 16 | >>> nltk_wn.synsets('chat') # only English 17 | >>> nltk_wn.synsets('chat', lang='fra') # only French 18 | >>> import wn 19 | >>> wn.synsets('chat') # all wordnets 20 | >>> wn.synsets('chat', lang='fr') # only French 21 | 22 | With Wn it helps to create a :class:`wn.Wordnet` object to pre-filter 23 | the results by language or lexicon. 24 | 25 | >>> en = wn.Wordnet('omw-en:1.4') 26 | >>> en.synsets('chat') # only the OMW English Wordnet 27 | 28 | Equivalent Operations 29 | --------------------- 30 | 31 | The following table lists equivalent API calls for the NLTK's wordnet 32 | module and Wn assuming the respective modules have been instantiated 33 | (in separate Python sessions) as follows: 34 | 35 | NLTK: 36 | 37 | >>> from nltk.corpus import wordnet as wn 38 | >>> ss = wn.synsets("chat", pos="v")[0] 39 | 40 | Wn: 41 | 42 | >>> import wn 43 | >>> en = wn.Wordnet('omw-en:1.4') 44 | >>> ss = en.synsets("chat", pos="v")[0] 45 | 46 | .. 
default-role:: python 47 | 48 | Primary Queries 49 | ''''''''''''''' 50 | 51 | ========================================= =============================================== 52 | NLTK Wn 53 | ========================================= =============================================== 54 | `wn.langs()` `[lex.language for lex in wn.lexicons()]` 55 | `wn.lemmas("chat")` -- 56 | -- `en.words("chat")` 57 | -- `en.senses("chat")` 58 | `wn.synsets("chat")` `en.synsets("chat")` 59 | `wn.synsets("chat", pos="v")` `en.synsets("chat", pos="v")` 60 | `wn.all_synsets()` `en.synsets()` 61 | `wn.all_synsets(pos="v")` `en.synsets(pos="v")` 62 | ========================================= =============================================== 63 | 64 | Synsets -- Basic 65 | '''''''''''''''' 66 | 67 | =================== ================= 68 | NLTK Wn 69 | =================== ================= 70 | `ss.lemmas()` -- 71 | -- `ss.senses()` 72 | -- `ss.words()` 73 | `ss.lemmas_names()` `ss.lemmas()` 74 | `ss.definition()` `ss.definition()` 75 | `ss.examples()` `ss.examples()` 76 | `ss.pos()` `ss.pos` 77 | =================== ================= 78 | 79 | Synsets -- Relations 80 | '''''''''''''''''''' 81 | 82 | ========================================== ===================================== 83 | NLTK Wn 84 | ========================================== ===================================== 85 | `ss.hypernyms()` `ss.get_related("hypernym")` 86 | `ss.instance_hypernyms()` `ss.get_related("instance_hypernym")` 87 | `ss.hypernyms() + ss.instance_hypernyms()` `ss.hypernyms()` 88 | `ss.hyponyms()` `ss.get_related("hyponym")` 89 | `ss.member_holonyms()` `ss.get_related("holo_member")` 90 | `ss.member_meronyms()` `ss.get_related("mero_member")` 91 | `ss.closure(lambda x: x.hypernyms())` `ss.closure("hypernym")` 92 | ========================================== ===================================== 93 | 94 | Synsets -- Taxonomic Structure 95 | '''''''''''''''''''''''''''''' 96 | 97 | ================================ ========================================================= 98 | NLTK Wn 99 | ================================ ========================================================= 100 | `ss.min_depth()` `ss.min_depth()` 101 | `ss.max_depth()` `ss.max_depth()` 102 | `ss.hypernym_paths()` `[list(reversed([ss] + p)) for p in ss.hypernym_paths()]` 103 | `ss.common_hypernyms(ss)` `ss.common_hypernyms(ss)` 104 | `ss.lowest_common_hypernyms(ss)` `ss.lowest_common_hypernyms(ss)` 105 | `ss.shortest_path_distance(ss)` `len(ss.shortest_path(ss))` 106 | ================================ ========================================================= 107 | 108 | .. reset default role 109 | .. default-role:: 110 | 111 | (these tables are incomplete) 112 | -------------------------------------------------------------------------------- /wn/_db.py: -------------------------------------------------------------------------------- 1 | """ 2 | Storage back-end interface. 3 | """ 4 | 5 | from importlib import resources 6 | from pathlib import Path 7 | import json 8 | import sqlite3 9 | import logging 10 | 11 | import wn 12 | from wn._types import AnyPath 13 | from wn._util import short_hash, format_lexicon_specifier 14 | 15 | 16 | logger = logging.getLogger('wn') 17 | 18 | 19 | # Module Constants 20 | 21 | DEBUG = False 22 | 23 | # This stores hashes of the schema to check for version differences. 24 | # When the schema changes, the hash will change. If the new hash is 25 | # not added here, the 'test_schema_compatibility' test will fail. 
It 26 | # is the developer's responsibility to only add compatible schema 27 | # hashes here. If the schema change is not backwards-compatible, then 28 | # clear all old hashes and only put the latest hash here. A hash can 29 | # be generated like this: 30 | # 31 | # >>> import sqlite3 32 | # >>> import wn 33 | # >>> conn = sqlite3.connect(wn.config.database_path) 34 | # >>> wn._db.schema_hash(conn) 35 | # 36 | COMPATIBLE_SCHEMA_HASHES = { 37 | '4c8ad03af5422d6979039ee2b80838d07c12d2c8', # Original schema 38 | '01909cb2d0cdee19ed687dbd95c5983d7b68f807', # Added form_lexicon_index 39 | '4c2728bb7999685d9748ad6245638a210d0f099d', # Added form_lexicon_form_covering_index 40 | 'c1ef1e74d47810fd313383cdb8ecb9a2d9aef7db', # Migrated database with covering index 41 | } 42 | 43 | 44 | # Optional metadata is stored as a JSON string 45 | 46 | def _adapt_dict(d: dict) -> bytes: 47 | return json.dumps(d).encode('utf-8') 48 | 49 | 50 | def _convert_dict(s: bytes) -> dict: 51 | return json.loads(s) 52 | 53 | 54 | def _convert_boolean(s: bytes) -> bool: 55 | return bool(int(s)) 56 | 57 | 58 | sqlite3.register_adapter(dict, _adapt_dict) 59 | sqlite3.register_converter('meta', _convert_dict) 60 | sqlite3.register_converter('boolean', _convert_boolean) 61 | 62 | 63 | # The pool is a cache of open connections. Unless the database path is 64 | # changed, there should only be zero or one. 65 | pool: dict[AnyPath, sqlite3.Connection] = {} 66 | 67 | 68 | # The connect() function should be used for all connections 69 | 70 | def connect() -> sqlite3.Connection: 71 | dbpath = wn.config.database_path 72 | if dbpath not in pool: 73 | if not wn.config.data_directory.exists(): 74 | wn.config.data_directory.mkdir(parents=True, exist_ok=True) 75 | initialized = dbpath.is_file() 76 | conn = sqlite3.connect( 77 | str(dbpath), 78 | detect_types=sqlite3.PARSE_DECLTYPES, 79 | check_same_thread=not wn.config.allow_multithreading, 80 | ) 81 | # foreign key support needs to be enabled for each connection 82 | conn.execute('PRAGMA foreign_keys = ON') 83 | if DEBUG: 84 | conn.set_trace_callback(print) 85 | if not initialized: 86 | logger.info('initializing database: %s', dbpath) 87 | _init_db(conn) 88 | _check_schema_compatibility(conn, dbpath) 89 | 90 | pool[dbpath] = conn 91 | return pool[dbpath] 92 | 93 | 94 | def _init_db(conn: sqlite3.Connection) -> None: 95 | schema = (resources.files('wn') / 'schema.sql').read_text() 96 | conn.executescript(schema) 97 | with conn: 98 | conn.executemany('INSERT INTO ili_statuses VALUES (null,?)', 99 | [('presupposed',), ('proposed',)]) 100 | 101 | 102 | def _check_schema_compatibility(conn: sqlite3.Connection, dbpath: Path) -> None: 103 | hash = schema_hash(conn) 104 | 105 | # if the hash is known, then we're all good here 106 | if hash in COMPATIBLE_SCHEMA_HASHES: 107 | return 108 | 109 | logger.debug('current schema hash:\n %s', hash) 110 | logger.debug('compatible schema hashes:\n %s', 111 | '\n '.join(COMPATIBLE_SCHEMA_HASHES)) 112 | # otherwise, try to raise a helpful error message 113 | msg = ("Wn's schema has changed and is no longer compatible with the " 114 | f"database. 
Please move or delete {dbpath} and rebuild it.") 115 | try: 116 | specs = conn.execute('SELECT id, version FROM lexicons').fetchall() 117 | except sqlite3.OperationalError as exc: 118 | raise wn.DatabaseError(msg) from exc 119 | else: 120 | if specs: 121 | installed = '\n '.join( 122 | format_lexicon_specifier(id, ver) 123 | for id, ver in specs 124 | ) 125 | msg += f" Lexicons currently installed:\n {installed}" 126 | else: 127 | msg += ' No lexicons are currently installed.' 128 | raise wn.DatabaseError(msg) 129 | 130 | 131 | def schema_hash(conn: sqlite3.Connection) -> str: 132 | query = 'SELECT sql FROM sqlite_master WHERE NOT sql ISNULL' 133 | schema = '\n\n'.join(row[0] for row in conn.execute(query)) 134 | return short_hash(schema) 135 | 136 | 137 | def clear_connections() -> None: 138 | """Close and delete any open database connections.""" 139 | for path in list(pool): 140 | pool[path].close() 141 | del pool[path] 142 | -------------------------------------------------------------------------------- /bench/conftest.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from collections.abc import Iterator 3 | from itertools import product, cycle 4 | from pathlib import Path 5 | 6 | import pytest 7 | 8 | import wn 9 | from wn import lmf 10 | 11 | 12 | @pytest.fixture 13 | def clean_db(): 14 | 15 | def clean_db(): 16 | wn.remove("*") 17 | dummy_lex = lmf.Lexicon( 18 | id="dummy", 19 | version="1", 20 | label="placeholder to initialize the db", 21 | language="zxx", 22 | email="", 23 | license="", 24 | ) 25 | wn.add_lexical_resource( 26 | lmf.LexicalResource(lmf_version="1.3", lexicons=[dummy_lex]) 27 | ) 28 | 29 | return clean_db 30 | 31 | 32 | @pytest.fixture(scope="session") 33 | def datadir(): 34 | return Path(__file__).parent.parent / "tests" / "data" 35 | 36 | 37 | @pytest.fixture 38 | def empty_db(clean_db): 39 | with tempfile.TemporaryDirectory('wn_data_empty') as dir: 40 | with pytest.MonkeyPatch.context() as m: 41 | m.setattr(wn.config, 'data_directory', dir) 42 | clean_db() 43 | yield 44 | 45 | 46 | @pytest.fixture(scope="session") 47 | def mock_lmf(): 48 | synsets: list[lmf.Synset] = [ 49 | * _make_synsets("n", 20000), 50 | * _make_synsets("v", 10000), 51 | * _make_synsets("a", 2000), 52 | * _make_synsets("r", 1000), 53 | ] 54 | entries = _make_entries(synsets) 55 | lexicon = lmf.Lexicon( 56 | id="mock", 57 | version="1", 58 | label="", 59 | language="zxx", 60 | email="", 61 | license="", 62 | entries=entries, 63 | synsets=synsets, 64 | ) 65 | return lmf.LexicalResource(lmf_version="1.3", lexicons=[lexicon]) 66 | 67 | 68 | @pytest.fixture(scope="session") 69 | def mock_db_dir(mock_lmf): 70 | with tempfile.TemporaryDirectory("wn_data_empty") as dir: 71 | with pytest.MonkeyPatch.context() as m: 72 | m.setattr(wn.config, 'data_directory', dir) 73 | wn.add_lexical_resource(mock_lmf, progress_handler=None) 74 | wn._db.clear_connections() 75 | 76 | yield Path(dir) 77 | 78 | 79 | @pytest.fixture 80 | def mock_db(monkeypatch, mock_db_dir): 81 | with monkeypatch.context() as m: 82 | m.setattr(wn.config, "data_directory", mock_db_dir) 83 | yield 84 | wn._db.clear_connections() 85 | 86 | 87 | def _make_synsets(pos: str, n: int) -> list[lmf.Synset]: 88 | synsets: list[lmf.Synset] = [ 89 | lmf.Synset( 90 | id=f"{i}-{pos}", 91 | ili="", 92 | partOfSpeech=pos, 93 | relations=[], 94 | meta={}, 95 | ) 96 | for i in range(1, n+1) 97 | ] 98 | # add relations for nouns and verbs 99 | if pos in "nv": 100 | total = len(synsets) 101 | tgt_i = 
1 # index of next target synset 102 | n = cycle([2]) # how many targets to relate 103 | for cur_i in range(total): 104 | if tgt_i <= cur_i: 105 | tgt_i = cur_i + 1 106 | source = synsets[cur_i] 107 | for cur_k in range(tgt_i, tgt_i + next(n)): 108 | if cur_k >= total: 109 | break 110 | target = synsets[cur_k] 111 | source["relations"].append( 112 | lmf.Relation(target=target["id"], relType="hyponym", meta={}) 113 | ) 114 | target["relations"].append( 115 | lmf.Relation(target=source["id"], relType="hypernym", meta={}) 116 | ) 117 | tgt_i = cur_k + 1 118 | 119 | return synsets 120 | 121 | 122 | def _words() -> Iterator[str]: 123 | consonants = "kgtdpbfvszrlmnhw" 124 | vowels = "aeiou" 125 | while True: 126 | yield from map("".join, product(consonants, vowels, consonants, vowels)) 127 | 128 | 129 | def _make_entries(synsets: list[lmf.Synset]) -> list[lmf.LexicalEntry]: 130 | words = _words() 131 | member_count = cycle(range(1, 4)) # 1, 2, or 3 synset members 132 | entries: dict[str, lmf.LexicalEntry] = {} 133 | prev_synsets: list[lmf.Synset] = [] 134 | for synset in synsets: 135 | ssid = synset["id"] 136 | pos = synset["partOfSpeech"] 137 | 138 | for _ in range(next(member_count)): 139 | word = next(words) 140 | senses = [lmf.Sense(id=f"{word}-{ssid}", synset=ssid, meta={})] 141 | # add some polysemy 142 | if prev_synsets: 143 | ssid2 = prev_synsets.pop()["id"] 144 | senses.append(lmf.Sense(id=f"{word}-{ssid2}", synset=ssid2, meta={})) 145 | eid = f"{word}-{pos}" 146 | if eid not in entries: 147 | entries[eid] = lmf.LexicalEntry( 148 | id=eid, 149 | lemma=lmf.Lemma( 150 | writtenForm=word, 151 | partOfSpeech=pos, 152 | ), 153 | senses=[], 154 | meta={}, 155 | ) 156 | entries[eid]["senses"].extend(senses) 157 | 158 | prev_synsets.append(synset) 159 | 160 | return list(entries.values()) 161 | -------------------------------------------------------------------------------- /wn/_download.py: -------------------------------------------------------------------------------- 1 | 2 | from collections.abc import Sequence 3 | from typing import Optional 4 | from pathlib import Path 5 | import logging 6 | 7 | import httpx 8 | 9 | import wn 10 | from wn._util import is_url 11 | from wn.util import ProgressHandler, ProgressBar 12 | from wn._add import add as add_to_db 13 | from wn import config 14 | 15 | 16 | CHUNK_SIZE = 8 * 1024 # how many KB to read at a time 17 | TIMEOUT = 10 # number of seconds to wait for a server response 18 | 19 | 20 | logger = logging.getLogger('wn') 21 | 22 | 23 | def download( 24 | project_or_url: str, 25 | add: bool = True, 26 | progress_handler: Optional[type[ProgressHandler]] = ProgressBar, 27 | ) -> Path: 28 | """Download the resource specified by *project_or_url*. 29 | 30 | First the URL of the resource is determined and then, depending on 31 | the parameters, the resource is downloaded and added to the 32 | database. The function then returns the path of the cached file. 33 | 34 | If *project_or_url* starts with `'http://'` or `'https://'`, then 35 | it is taken to be the URL for the resource. Otherwise, 36 | *project_or_url* is taken as a :ref:`project specifier 37 | ` and the URL is taken from a matching entry 38 | in Wn's project index. If no project matches the specifier, 39 | :exc:`wn.Error` is raised. 40 | 41 | If the URL has been downloaded and cached before, the cached file 42 | is used. Otherwise the URL is retrieved and stored in the cache. 43 | 44 | If the *add* paramter is ``True`` (default), the downloaded 45 | resource is added to the database. 
46 | 47 | >>> wn.download('ewn:2020') 48 | Added ewn:2020 (English WordNet) 49 | 50 | The *progress_handler* parameter takes a subclass of 51 | :class:`wn.util.ProgressHandler`. An instance of the class will be 52 | created, used, and closed by this function. 53 | 54 | """ 55 | if progress_handler is None: 56 | progress_handler = ProgressHandler 57 | progress = progress_handler(message='Download', unit=' bytes') 58 | 59 | cache_path, urls = _get_cache_path_and_urls(project_or_url) 60 | 61 | try: 62 | if cache_path and cache_path.exists(): 63 | progress.flash(f'Cached file found: {cache_path!s}') 64 | path = cache_path 65 | elif urls: 66 | path = _download(urls, progress) 67 | else: 68 | raise wn.Error('no urls to download') 69 | finally: 70 | progress.close() 71 | 72 | if add: 73 | try: 74 | add_to_db(path, progress_handler=progress_handler) 75 | except wn.Error as exc: 76 | raise wn.Error( 77 | f'could not add downloaded file: {path}\n You might try ' 78 | 'deleting the cached file and trying the download again.' 79 | ) from exc 80 | 81 | return path 82 | 83 | 84 | def _get_cache_path_and_urls(project_or_url: str) -> tuple[Optional[Path], list[str]]: 85 | if is_url(project_or_url): 86 | return config.get_cache_path(project_or_url), [project_or_url] 87 | else: 88 | info = config.get_project_info(project_or_url) 89 | return info.get('cache'), info['resource_urls'] 90 | 91 | 92 | def _download(urls: Sequence[str], progress: ProgressHandler) -> Path: 93 | client = httpx.Client(timeout=TIMEOUT, follow_redirects=True) 94 | try: 95 | for i, url in enumerate(urls, 1): 96 | path = config.get_cache_path(url) 97 | logger.info('download url: %s', url) 98 | logger.info('download cache path: %s', path) 99 | try: 100 | with open(path, 'wb') as f: 101 | progress.set(status='Requesting', count=0) 102 | with client.stream("GET", url) as response: 103 | response.raise_for_status() 104 | total = int(response.headers.get('Content-Length', 0)) 105 | count = response.num_bytes_downloaded 106 | progress.set(count=count, total=total, status='Receiving') 107 | for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE): 108 | if chunk: 109 | f.write(chunk) 110 | progress.update(response.num_bytes_downloaded - count) 111 | count = response.num_bytes_downloaded 112 | progress.set(status='Complete') 113 | except httpx.RequestError as exc: 114 | path.unlink(missing_ok=True) 115 | last_count = progress.kwargs['count'] 116 | if i == len(urls): 117 | raise wn.Error(f'download failed at {last_count} bytes') from exc 118 | else: 119 | logger.info( 120 | 'download failed at %d bytes; trying next url', last_count 121 | ) 122 | else: 123 | break # success 124 | 125 | except KeyboardInterrupt as exc: 126 | path.unlink(missing_ok=True) 127 | last_count = progress.kwargs['count'] 128 | raise wn.Error(f'download cancelled at {last_count} bytes') from exc 129 | except Exception: 130 | path.unlink(missing_ok=True) 131 | raise 132 | finally: 133 | client.close() 134 | 135 | return path 136 | -------------------------------------------------------------------------------- /wn/morphy.py: -------------------------------------------------------------------------------- 1 | 2 | """A simple English lemmatizer that finds and removes known suffixes. 
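A module-level instance, ``morphy``, is created without a :class:`wn.Wordnet` object, so its results are not checked against any lexicon; see the :class:`Morphy` class below for details and examples.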
3 | 4 | """ 5 | 6 | from typing import Optional 7 | from enum import Flag, auto 8 | 9 | import wn 10 | from wn._types import LemmatizeResult 11 | from wn.constants import NOUN, VERB, ADJ, ADJ_SAT, ADV, PARTS_OF_SPEECH 12 | 13 | POSExceptionMap = dict[str, set[str]] 14 | ExceptionMap = dict[str, POSExceptionMap] 15 | 16 | 17 | class _System(Flag): 18 | """Flags to track suffix rules in various implementations of Morphy.""" 19 | PWN = auto() 20 | NLTK = auto() 21 | WN = auto() 22 | ALL = PWN | NLTK | WN 23 | 24 | 25 | _PWN = _System.PWN 26 | _NLTK = _System.NLTK 27 | _WN = _System.WN 28 | _ALL = _System.ALL 29 | 30 | 31 | Rule = tuple[str, str, _System] 32 | 33 | DETACHMENT_RULES: dict[str, list[Rule]] = { 34 | NOUN: [ 35 | ("s", "", _ALL), 36 | ("ces", "x", _WN), 37 | ("ses", "s", _ALL), 38 | ("ves", "f", _NLTK | _WN), 39 | ("ives", "ife", _WN), 40 | ("xes", "x", _ALL), 41 | ("xes", "xis", _WN), 42 | ("zes", "z", _ALL), 43 | ("ches", "ch", _ALL), 44 | ("shes", "sh", _ALL), 45 | ("men", "man", _ALL), 46 | ("ies", "y", _ALL), 47 | ], 48 | VERB: [ 49 | ("s", "", _ALL), 50 | ("ies", "y", _ALL), 51 | ("es", "e", _ALL), 52 | ("es", "", _ALL), 53 | ("ed", "e", _ALL), 54 | ("ed", "", _ALL), 55 | ("ing", "e", _ALL), 56 | ("ing", "", _ALL), 57 | ], 58 | ADJ: [ 59 | ("er", "", _ALL), 60 | ("est", "", _ALL), 61 | ("er", "e", _ALL), 62 | ("est", "e", _ALL), 63 | ], 64 | ADV: [], 65 | } 66 | DETACHMENT_RULES[ADJ_SAT] = DETACHMENT_RULES[ADJ] 67 | 68 | 69 | class Morphy: 70 | """The Morphy lemmatizer class. 71 | 72 | Objects of this class are callables that take a wordform and an 73 | optional part of speech and return a dictionary mapping parts of 74 | speech to lemmas. If objects of this class are not created with a 75 | :class:`wn.Wordnet` object, the returned lemmas may be invalid. 
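This is because the detachment rules are then applied blindly, without checking the resulting forms against the lemmas in a lexicon.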
76 | 77 | Arguments: 78 | wordnet: optional :class:`wn.Wordnet` instance 79 | 80 | Example: 81 | 82 | >>> import wn 83 | >>> from wn.morphy import Morphy 84 | >>> ewn = wn.Wordnet('ewn:2020') 85 | >>> m = Morphy(ewn) 86 | >>> m('axes', pos='n') 87 | {'n': {'axe', 'ax', 'axis'}} 88 | >>> m('geese', pos='n') 89 | {'n': {'goose'}} 90 | >>> m('gooses') 91 | {'n': {'goose'}, 'v': {'goose'}} 92 | >>> m('goosing') 93 | {'v': {'goose'}} 94 | 95 | """ 96 | 97 | def __init__(self, wordnet: Optional[wn.Wordnet] = None): 98 | self._rules = { 99 | pos: [rule for rule in rules if rule[2] & _System.WN] 100 | for pos, rules in DETACHMENT_RULES.items() 101 | } 102 | exceptions: ExceptionMap = {pos: {} for pos in PARTS_OF_SPEECH} 103 | all_lemmas: dict[str, set[str]] = {pos: set() for pos in PARTS_OF_SPEECH} 104 | if wordnet: 105 | for word in wordnet.words(): 106 | pos = word.pos 107 | pos_exc = exceptions[pos] 108 | lemma, *others = word.forms() 109 | # store every lemma whether it has other forms or not 110 | all_lemmas[pos].add(lemma) 111 | # those with other forms map to the original lemmas 112 | for other in others: 113 | if other in pos_exc: 114 | pos_exc[other].add(lemma) 115 | else: 116 | pos_exc[other] = {lemma} 117 | self._initialized = True 118 | else: 119 | self._initialized = False 120 | self._exceptions = exceptions 121 | self._all_lemmas = all_lemmas 122 | 123 | def __call__(self, form: str, pos: Optional[str] = None) -> LemmatizeResult: 124 | result = {} 125 | if not self._initialized: 126 | result[pos] = {form} # always include original when not initialized 127 | 128 | if pos is None: 129 | pos_list = list(DETACHMENT_RULES) 130 | elif pos in DETACHMENT_RULES: 131 | pos_list = [pos] 132 | else: 133 | pos_list = [] # not handled by morphy 134 | 135 | no_pos_forms = result.get(None, set()) # avoid unnecessary duplicates 136 | for _pos in pos_list: 137 | candidates = self._morphstr(form, _pos) - no_pos_forms 138 | if candidates: 139 | result.setdefault(_pos, set()).update(candidates) 140 | 141 | return result 142 | 143 | def _morphstr(self, form: str, pos: str) -> set[str]: 144 | candidates: set[str] = set() 145 | 146 | initialized = self._initialized 147 | if initialized: 148 | all_lemmas = self._all_lemmas[pos] 149 | if form in all_lemmas: 150 | candidates.add(form) 151 | candidates.update(self._exceptions[pos].get(form, set())) 152 | else: 153 | all_lemmas = set() 154 | 155 | for suffix, repl, _ in self._rules[pos]: 156 | # avoid applying rules that perform full suppletion 157 | if form.endswith(suffix) and len(suffix) < len(form): 158 | candidate = f'{form[:-len(suffix)]}{repl}' 159 | if not initialized or candidate in all_lemmas: 160 | candidates.add(candidate) 161 | 162 | return candidates 163 | 164 | 165 | morphy = Morphy() 166 | -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | FAQ 2 | === 3 | 4 | Is Wn related to the NLTK's `nltk.corpus.wordnet` module? 5 | --------------------------------------------------------- 6 | 7 | Only in spirit. There was an effort to develop the `NLTK`_\ 's module as a 8 | standalone package (see https://github.com/nltk/wordnet/), but 9 | development had slowed. Wn has the same broad goals and a similar API 10 | as that standalone package, but fundamental architectural differences 11 | demanded a complete rewrite, so Wn was created as a separate 12 | project. 
With approval from the other package's maintainer, Wn 13 | acquired the `wn `_ project on PyPI and 14 | can be seen as its successor. 15 | 16 | Is Wn compatible with the NLTK's module? 17 | ---------------------------------------- 18 | 19 | The API is intentionally similar, but not exactly the same (for 20 | instance see the next question), and there are differences in the ways 21 | that results are retrieved, particularly for non-English wordnets. See 22 | :doc:`guides/nltk-migration` for more information. Also see 23 | :ref:`princeton-wordnet`. 24 | 25 | Where are the ``Lemma`` objects? What are ``Word`` and ``Sense`` objects? 26 | ------------------------------------------------------------------------- 27 | 28 | Unlike the original `WNDB`_ data format of the original WordNet, the 29 | `WN-LMF`_ XML format grants words (called *lexical entries* in WN-LMF 30 | and a :class:`~wn.Word` object in Wn) and word senses 31 | (:class:`~wn.Sense` in Wn) explicit, first-class status alongside 32 | synsets. While senses are essentially links between words and 33 | synsets, they may contain metadata and be the source or target of 34 | sense relations, so in some ways they are more like nodes than edges 35 | when the wordnet is viewed as a graph. The `NLTK`_\ 's module, using 36 | the WNDB format, combines the information of a word and a sense into a 37 | single object called a ``Lemmas``. Wn also has an unrelated concept 38 | called a :meth:`~wn.Word.lemma`, but it is merely the canonical form 39 | of a word. 40 | 41 | .. _princeton-wordnet: 42 | 43 | Where is the Princeton WordNet data? 44 | ------------------------------------ 45 | 46 | The original English wordnet, named simply *WordNet* but often 47 | referred to as the *Princeton WordNet* to better distinguish it from 48 | other projects, is specifically the data distributed by Princeton in 49 | the `WNDB`_ format. The `Open Multilingual Wordnet `_ (OMW) 50 | packages an export of the WordNet data as the *OMW English Wordnet 51 | based on WordNet 3.0* which is used by Wn (with the lexicon ID 52 | ``omw-en``). It also has a similar export for WordNet 3.1 data 53 | (``omw-en31``). Both of these are highly compatible with the original 54 | data and can be used as drop-in replacements. 55 | 56 | Prior to Wn version 0.9 (and, correspondingly, prior to the `OMW 57 | data`_ version 1.4), the ``pwn:3.0`` and ``pwn:3.1`` English wordnets 58 | distributed by OMW were incorrectly called the *Princeton WordNet* 59 | (for WordNet 3.0 and 3.1, respectively). From Wn version 0.9 (and from 60 | version 1.4 of the OMW data), these are called the *OMW English 61 | Wordnet based on WordNet 3.0/3.1* (``omw-en:1.4`` and 62 | ``omw-en31:1.4``, respectively). These lexicons are intentionally 63 | compatible with the original WordNet data, and the 1.4 versions are 64 | even more compatible than the previous ``pwn:3.0`` and ``pwn:3.1`` 65 | lexicons, so it is strongly recommended to use them over the previous 66 | versions. 67 | 68 | .. _OMW data: https://github.com/omwn/omw-data 69 | 70 | Why don't all wordnets share the same synsets? 71 | ---------------------------------------------- 72 | 73 | The `Open Multilingual Wordnet `_ (OMW) contains wordnets for 74 | many languages created using the *expand* methodology [VOSSEN1998]_, 75 | where non-English wordnets provide words on top of the English 76 | wordnet's synset structure. 
This allows new wordnets to be built in 77 | much less time than starting from scratch, but with a few drawbacks, 78 | such as that words cannot be added if they do not have a synset in the 79 | English wordnet, and that it is difficult to version the wordnets 80 | independently (e.g., for reproducibility of experiments involving 81 | wordnet data) as all are interconnected. Wn, therefore, creates new 82 | synsets for each wordnet added to its database, and synsets then 83 | specify which resource they belong to. Queries can specify which 84 | resources may be examined. Also see :doc:`guides/interlingual`. 85 | 86 | Why does Wn's database get so big? 87 | ---------------------------------- 88 | 89 | The *OMW English Wordnet based on WordNet 3.0* takes about 114 MiB of 90 | disk space in Wn's database, which is only about 8 MiB more than it 91 | takes as a `WN-LMF`_ XML file. The `NLTK`_, however, uses the obsolete 92 | `WNDB`_ format which is more compact, requiring only 35 MiB of disk 93 | space. The difference with the Open Multilingual Wordnet 1.4 is more 94 | striking: it takes about 659 MiB of disk space in the database, but 95 | only 49 MiB in the NLTK. Part of the difference here is that the OMW 96 | files in the NLTK are simple tab-separated-value files listing only 97 | the words added to each synset for each language. In addition, Wn 98 | creates new synsets for each wordnet added (see the previous 99 | question). One more reason is that Wn creates various indexes in the 100 | database for efficient lookup. 101 | 102 | .. _NLTK: https://www.nltk.org/ 103 | .. _OMW: http://github.com/omwn 104 | .. [VOSSEN1998] Piek Vossen. 1998. *Introduction to EuroWordNet.* Computers and the Humanities, 32(2): 73--89. 105 | .. _Open English Wordnet 2021: https://en-word.net/ 106 | .. _WNDB: https://wordnet.princeton.edu/documentation/wndb5wn 107 | .. 
_WN-LMF: https://globalwordnet.github.io/schemas/ 108 | -------------------------------------------------------------------------------- /wn/__main__.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import argparse 4 | from pathlib import Path 5 | import json 6 | import logging 7 | 8 | import wn 9 | from wn.project import iterpackages 10 | from wn import lmf 11 | from wn.validate import validate 12 | from wn._util import format_lexicon_specifier 13 | 14 | 15 | def _download(args): 16 | if args.index: 17 | wn.config.load_index(args.index) 18 | for target in args.target: 19 | wn.download(target, add=args.add) 20 | 21 | 22 | def _lexicons(args): 23 | for lex in wn.lexicons(lang=args.lang, lexicon=args.lexicon): 24 | print('\t'.join((lex.id, lex.version, f'[{lex.language}]', lex.label))) 25 | 26 | 27 | def _projects(args): 28 | for info in wn.projects(): 29 | key = 'i' 30 | key += 'c' if info['cache'] else '-' 31 | # key += 'a' if False else '-' # TODO: check if project is added to db 32 | print( 33 | '\t'.join(( 34 | key, 35 | info['id'], 36 | info['version'], 37 | f"[{info['language'] or '---'}]", 38 | info['label'] or '---', 39 | )) 40 | ) 41 | 42 | 43 | def _validate(args): 44 | all_valid = True 45 | selectseq = [check.strip() for check in args.select.split(',')] 46 | for package in iterpackages(args.FILE): 47 | resource = lmf.load(package.resource_file()) 48 | for lexicon in resource['lexicons']: 49 | spec = format_lexicon_specifier(lexicon["id"], lexicon["version"]) 50 | print(f'{spec:<20}', end='') 51 | report = validate(lexicon, select=selectseq) 52 | if not any(check.get('items', []) for check in report.values()): 53 | print('passed') 54 | else: 55 | print('failed') 56 | all_valid = False 57 | # clean up report 58 | for code in list(report): 59 | if not report[code].get('items'): 60 | del report[code] 61 | if args.output_file: 62 | with open(args.output_file, 'w') as outfile: 63 | json.dump(report, outfile, indent=2) 64 | else: 65 | for _code, check in report.items(): 66 | if not check['items']: 67 | continue 68 | print(f' {check["message"]}') 69 | for id, context in check['items'].items(): 70 | print(f' {id}: {context}' if context else f' {id}') 71 | 72 | sys.exit(0 if all_valid else 1) 73 | 74 | 75 | def _path_type(arg): 76 | return Path(arg) 77 | 78 | 79 | def _file_path_type(arg): 80 | path = Path(arg) 81 | if not path.is_file(): 82 | raise argparse.ArgumentTypeError(f'cannot file file: {arg}') 83 | return path 84 | 85 | 86 | parser = argparse.ArgumentParser( 87 | prog='python3 -m wn', 88 | description="Manage Wn's wordnet data from the command line.", 89 | ) 90 | parser.add_argument( 91 | '-V', '--version', action='version', version=f'Wn {wn.__version__}' 92 | ) 93 | parser.add_argument( 94 | '-v', '--verbose', action='count', dest='verbosity', default=0, 95 | help='increase verbosity (can repeat: -vv, -vvv)' 96 | ) 97 | parser.add_argument( 98 | '-d', '--dir', 99 | type=_path_type, 100 | help="data directory for Wn's database and cache", 101 | ) 102 | parser.set_defaults(func=lambda _: parser.print_help()) 103 | sub_parsers = parser.add_subparsers(title='subcommands') 104 | 105 | 106 | parser_download = sub_parsers.add_parser( 107 | 'download', 108 | description="Download wordnets and add them to Wn's database.", 109 | help='download wordnets', 110 | ) 111 | parser_download.add_argument( 112 | 'target', nargs='+', help='project specifiers or URLs' 113 | ) 114 | parser_download.add_argument( 115 | '--index', 
type=_file_path_type, help='project index to use for downloading' 116 | ) 117 | parser_download.add_argument( 118 | '--no-add', action='store_false', dest='add', 119 | help='download and cache without adding to the database' 120 | ) 121 | parser_download.set_defaults(func=_download) 122 | 123 | 124 | parser_lexicons = sub_parsers.add_parser( 125 | 'lexicons', 126 | description="Display a list of installed lexicons.", 127 | help='list installed lexicons', 128 | ) 129 | parser_lexicons.add_argument( 130 | '-l', '--lang', help='BCP 47 language code' 131 | ) 132 | parser_lexicons.add_argument( 133 | '--lexicon', help='lexicon specifiers' 134 | ) 135 | parser_lexicons.set_defaults(func=_lexicons) 136 | 137 | 138 | parser_projects = sub_parsers.add_parser( 139 | 'projects', 140 | description=( 141 | "Display a list of known projects. The first column shows the " 142 | "status for a project (i=indexed, c=cached)." 143 | ), 144 | help='list known projects', 145 | ) 146 | parser_projects.set_defaults(func=_projects) 147 | 148 | 149 | parser_validate = sub_parsers.add_parser( 150 | 'validate', 151 | description=( 152 | "Validate a WN-LMF lexicon" 153 | ), 154 | help='validate a lexicon', 155 | ) 156 | parser_validate.add_argument( 157 | 'FILE', type=_file_path_type, help='WN-LMF (XML) lexicon file to validate' 158 | ) 159 | parser_validate.add_argument( 160 | '--select', metavar='CHECKS', default='E,W', 161 | help='comma-separated list of checks to run (default: E,W)' 162 | ) 163 | parser_validate.add_argument( 164 | '--output-file', metavar='FILE', 165 | help='write report to a JSON file' 166 | ) 167 | parser_validate.set_defaults(func=_validate) 168 | 169 | 170 | args = parser.parse_args() 171 | 172 | logging.basicConfig(level=logging.ERROR - (min(args.verbosity, 3) * 10)) 173 | 174 | if args.dir: 175 | wn.config.data_directory = args.dir 176 | 177 | args.func(args) 178 | -------------------------------------------------------------------------------- /tests/data/mini-lmf-1.1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 14 | 15 | 16 | 17 | 18 | 19 |
20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | tatoe 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 53 | 54 | 56 | 57 | 59 | 60 | 61 | 62 | 63 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | "the artist illustrated the story beautifully" 92 | 93 | 94 | 95 | 96 | 97 | 98 | INF 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 118 | depict something in a visual medium 119 | 120 | 121 | 123 | terminate employment 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /tests/lmf_test.py: -------------------------------------------------------------------------------- 1 | 2 | from xml.etree import ElementTree as ET 3 | 4 | from wn import lmf 5 | 6 | 7 | def test_is_lmf(datadir): 8 | assert lmf.is_lmf(datadir / 'mini-lmf-1.0.xml') 9 | assert lmf.is_lmf(str(datadir / 'mini-lmf-1.0.xml')) 10 | assert not lmf.is_lmf(datadir / 'README.md') 11 | assert not lmf.is_lmf(datadir / 'missing.xml') 12 | assert lmf.is_lmf(datadir / 'mini-lmf-1.1.xml') 13 | 14 | 15 | def test_scan_lexicons(datadir): 16 | assert lmf.scan_lexicons(datadir / 'mini-lmf-1.0.xml') == [ 17 | { 18 | 'id': 'test-en', 19 | 'version': '1', 20 | 'label': 'Testing English WordNet', 21 | 'extends': None, 22 | }, 23 | { 24 | 'id': 'test-es', 25 | 'version': '1', 26 | 'label': 'Testing Spanish WordNet', 27 | 'extends': None, 28 | }, 29 | ] 30 | 31 | assert lmf.scan_lexicons(datadir / 'mini-lmf-1.1.xml') == [ 32 | { 33 | 'id': 'test-ja', 34 | 'version': '1', 35 | 'label': 'Testing Japanese WordNet', 36 | 'extends': None, 37 | }, 38 | { 39 | 'id': 'test-en-ext', 40 | 'version': '1', 41 | 'label': 'Testing English Extension', 42 | 'extends': { 43 | 'id': 'test-en', 44 | 'version': '1', 45 | }, 46 | }, 47 | ] 48 | 49 | 50 | def test_load_1_0(datadir): 51 | resource = lmf.load(datadir / 'mini-lmf-1.0.xml') 52 | lexicons = resource['lexicons'] 53 | assert len(lexicons) == 2 54 | lexicon = lexicons[0] 55 | 56 | assert lexicon['id'] == 'test-en' 57 | assert lexicon['label'] == 'Testing English WordNet' 58 | assert lexicon['language'] == 'en' 59 | assert lexicon['email'] == 'maintainer@example.com' 60 | assert lexicon['license'] == 'https://creativecommons.org/licenses/by/4.0/' 61 | assert lexicon['version'] == '1' 62 | assert lexicon['url'] == 'https://example.com/test-en' 63 | 64 | assert len(lexicon['entries']) == 9 65 | le = lexicon['entries'][0] 66 | assert le['id'] == 'test-en-information-n' 67 | 68 | assert le['lemma']['writtenForm'] == 'information' 69 | assert le['lemma']['partOfSpeech'] == 'n' 70 | assert le['lemma']['script'] == 'Latn' 71 | assert len(le['lemma']['tags']) == 1 72 | 73 | assert len(le.get('forms', [])) == 0 74 | 75 | assert len(le['senses']) == 1 76 | sense = le['senses'][0] 77 | assert sense['id'] == 'test-en-information-n-0001-01' 78 | assert sense['synset'] == 'test-en-0001-n' 79 | assert len(sense.get('relations', [])) == 0 80 | # assert sense['relations'][0]['target'] == 'test-en-exemplify-v-01023137-01' 81 | # assert sense['relations'][0]['type'] == 'derivation' 82 | 83 | assert len(lexicon.get('frames', [])) == 0 # frames are on lexical entry 84 | assert len(lexicon['entries'][6]['frames']) == 2 85 | frames = lexicon['entries'][6]['frames'] 86 | assert frames[0]['subcategorizationFrame'] == 'Somebody ----s something' 87 | assert frames[0]['senses'] == 
['test-en-illustrate-v-0003-01'] 88 | 89 | assert len(lexicon['synsets']) == 8 90 | 91 | assert lexicons[1]['id'] == 'test-es' 92 | 93 | 94 | def test_load_1_1(datadir): 95 | resource = lmf.load(datadir / 'mini-lmf-1.1.xml') 96 | lexicons = resource['lexicons'] 97 | assert len(lexicons) == 2 98 | lexicon = lexicons[0] 99 | assert lexicon['id'] == 'test-ja' 100 | assert lexicon['version'] == '1' 101 | # assert lexicon.logo == 'logo.svg' 102 | assert lexicon.get('requires') == [{'id': 'test-en', 'version': '1'}] 103 | 104 | lexicon = lexicons[1] 105 | assert lexicon['id'] == 'test-en-ext' 106 | assert lexicon.get('extends') == {'id': 'test-en', 'version': '1'} 107 | 108 | 109 | def test_load_1_3(datadir): 110 | resource = lmf.load(datadir / 'mini-lmf-1.3.xml') 111 | lexicons = resource['lexicons'] 112 | assert len(lexicons) == 1 113 | lexicon = lexicons[0] 114 | synsets = lexicon['synsets'] 115 | assert synsets[0]['definitions'][0]['text'] == 'one two three' 116 | assert synsets[1]['definitions'][0]['text'] == 'one two three' 117 | assert synsets[2]['definitions'][0]['text'] == ''' 118 | one 119 | two 120 | three 121 | ''' 122 | 123 | 124 | def test_load_1_4(datadir): 125 | resource = lmf.load(datadir / 'mini-lmf-1.4.xml') 126 | lexicons = resource['lexicons'] 127 | assert len(lexicons) == 1 128 | lexicon = lexicons[0] 129 | assert lexicon['entries'][0].get('index') == 'foo_bar' 130 | assert lexicon['entries'][1].get('index') == 'foo_bar' 131 | assert lexicon['entries'][2].get('index') is None 132 | assert lexicon['entries'][3].get('index') == 'baz' 133 | assert lexicon['entries'][4].get('index') is None 134 | assert lexicon['entries'][5].get('index') == 'baz' 135 | 136 | assert lexicon['entries'][0]['senses'][0].get('n') == 3 137 | assert lexicon['entries'][1]['senses'][0].get('n') == 2 138 | assert lexicon['entries'][1]['senses'][1].get('n') == 1 139 | assert lexicon['entries'][2]['senses'][0].get('n') is None 140 | assert lexicon['entries'][3]['senses'][0].get('n') == 2 141 | assert lexicon['entries'][4]['senses'][0].get('n') == 2 142 | assert lexicon['entries'][4]['senses'][1].get('n') is None 143 | assert lexicon['entries'][5]['senses'][0].get('n') == 1 144 | 145 | 146 | def test_dump(datadir, tmp_path): 147 | tmpdir = tmp_path / 'test_dump' 148 | tmpdir.mkdir() 149 | tmppath = tmpdir / 'mini_lmf_dump.xml' 150 | 151 | def assert_xml_equal(mini_lmf, dump_lmf): 152 | orig = ET.canonicalize(from_file=mini_lmf, strip_text=True) 153 | temp = ET.canonicalize(from_file=dump_lmf, strip_text=True) 154 | # additional transformation to help with debugging 155 | orig = orig.replace('<', '\n<') 156 | temp = temp.replace('<', '\n<') 157 | assert orig == temp 158 | 159 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.0.xml'), tmppath) 160 | assert_xml_equal(datadir / 'mini-lmf-1.0.xml', tmppath) 161 | 162 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.1.xml'), tmppath) 163 | assert_xml_equal(datadir / 'mini-lmf-1.1.xml', tmppath) 164 | 165 | lmf.dump(lmf.load(datadir / 'mini-lmf-1.4.xml'), tmppath) 166 | assert_xml_equal(datadir / 'mini-lmf-1.4.xml', tmppath) 167 | -------------------------------------------------------------------------------- /docs/guides/wordnet.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 | 4 | 5 | 6 | The Structure of a Wordnet 7 | ========================== 8 | A **wordnet** is an online lexicon which is organized by concepts. 
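Wn exposes these building blocks (words, senses, and synsets, described below) directly in its API. The short sketch below shows them in code; it assumes an English lexicon such as ``ewn:2020`` (used elsewhere in this documentation) has already been downloaded and added, and that the word *bank* is present in it.

.. code-block:: python

    import wn

    en = wn.Wordnet('ewn:2020')                 # an installed lexicon (assumed)
    word = en.words(form='bank', pos='n')[0]    # a word (lexical entry)
    for sense in word.senses():                 # each sense links the word...
        synset = sense.synset()                 # ...to one synset
        print(synset.lemmas(), synset.definition())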
9 | 10 | The basic unit of a wordnet is the synonym set (**synset**), a group of words that all refer to the 11 | same concept. Words and synsets are linked by means of conceptual-semantic relations to form the 12 | structure of the wordnet. 13 | 14 | Words, Senses, and Synsets 15 | -------------------------- 16 | **Words** are the basic building blocks of a language. A word has two parts: its form and its meaning.
17 | In natural languages, however, word forms and word meanings do not line up in a neat one-to-one match:
18 | a single word form may be connected to several different meanings. **Senses** are therefore needed
19 | as the unit of word meaning. For example, the word *bank* has at least two senses: 20 |
21 | 1. bank\ :sup:`1`\: financial institution, like *City Bank*; 22 | 2. bank\ :sup:`2`\: sloping land, like *river bank*; 23 |
24 | Since a **synset** is a group of words sharing the same concept, bank\ :sup:`1`\ and bank\ :sup:`2`\ are members of 25 | two different synsets, even though they have the same word form. 26 |
27 | On the other hand, different word forms may also convey the same concept, such as *cab* and *taxi*;
28 | word forms that share a concept are grouped together into one synset. 29 |
30 | .. raw:: html 31 | :file: images/word-sense-synset.svg 32 | 33 | 34 | .. role:: center 35 | :class: center 36 | 37 | :center:`Figure: relations between words, senses and synsets` 38 | 39 |
40 | Synset Relations 41 | ---------------- 42 | In a wordnet, synsets are linked to each other by various kinds of relations. For example, if
43 | the concept expressed by one synset is more general than that of a given synset, then it is in a
44 | *hypernym* relation with the given synset. As shown in the figure below, the synset with *car*, *auto* and *automobile* as its
45 | members is the *hypernym* of the other synset with *cab*, *taxi* and *hack*. Relations built on
46 | the synset level like this are categorized as synset relations. 47 |
48 | .. raw:: html 49 | :file: images/synset-synset.svg 50 | 51 | :center:`Figure: example of synset relations` 52 |
53 | Sense Relations 54 | --------------- 55 |
56 | Some relations in a wordnet are built on the sense level. These can be further divided into two types:
57 | relations that link a sense with another sense, and relations that link a sense with a synset. 58 |
59 | .. note:: In a wordnet, synset relations and sense relations can both employ a particular
60 | relation type, such as `domain topic `_. 61 |
62 | **Sense-Sense** 63 |
64 | Sense-to-sense relations capture the connections between individual senses, especially when dealing
65 | with morphologically related words. For example, *behavioral* is the adjective derived from the noun *behavior*,
66 | and is said to be in the *pertainym* relation with *behavior*. However, no such relation holds between
67 | *behavioral* and *conduct*, even though *conduct* is a synonym of *behavior* and is in the same synset. *Pertainym*
68 | is therefore a sense-sense relation. 69 |
70 | .. raw:: html 71 | :file: images/sense-sense.svg 72 | 73 | :center:`Figure: example of sense-sense relations` 74 |
75 | **Sense-Synset** 76 |
77 | Sense-synset relations connect a particular sense with a synset. For example, *cursor* is a term in the
78 | *computer science* discipline, so in a wordnet it is in the *has domain topic* relation with the
79 | *computer science* synset. *Pointer*, which is in the same synset as *cursor*, is not such a term and thus
80 | has no such relation with the *computer science* synset. 81 | 82 | ..
raw:: html 83 | :file: images/sense-synset.svg 84 | 85 | :center:`Figure: example of sense-synset relations` 86 |
87 | Other Information 88 | ----------------- 89 | A wordnet should be encoded in an appropriate form; two schemas are accepted: 90 |
91 | * XML schema based on the Lexical Markup Framework (LMF) 92 | * JSON-LD using the Lexicon Model for Ontologies 93 |
94 | The structure of a wordnet should contain the following information: 95 |
96 | **Definition** 97 |
98 | A definition describes a sense or synset in a wordnet, and it is given in the language
99 | of the wordnet it comes from. 100 |
101 | **Example** 102 |
103 | An example clarifies a sense or synset in a wordnet; with a given example, users can understand
104 | the definition more clearly. 105 |
106 | **Metadata** 107 |
108 | A wordnet has its own metadata, based on the `Dublin Core `_, which states its
109 | basic information. The table below lists all of the items in the metadata of a wordnet: 110 |
111 | +------------------+-----------+-----------+ | Field | Required | Type | +==================+===========+===========+
112 | | contributor | Optional | str | 113 | +------------------+-----------+-----------+
114 | | coverage | Optional | str | 115 | +------------------+-----------+-----------+
116 | | creator | Optional | str | 117 | +------------------+-----------+-----------+
118 | | date | Optional | str | 119 | +------------------+-----------+-----------+
120 | | description | Optional | str | 121 | +------------------+-----------+-----------+
122 | | format | Optional | str | 123 | +------------------+-----------+-----------+
124 | | identifier | Optional | str | 125 | +------------------+-----------+-----------+
126 | | publisher | Optional | str | 127 | +------------------+-----------+-----------+
128 | | relation | Optional | str | 129 | +------------------+-----------+-----------+
130 | | rights | Optional | str | 131 | +------------------+-----------+-----------+
132 | | source | Optional | str | 133 | +------------------+-----------+-----------+
134 | | subject | Optional | str | 135 | +------------------+-----------+-----------+
136 | | title | Optional | str | 137 | +------------------+-----------+-----------+
138 | | type | Optional | str | 139 | +------------------+-----------+-----------+
140 | | status | Optional | str | 141 | +------------------+-----------+-----------+
142 | | note | Optional | str | 143 | +------------------+-----------+-----------+
144 | | confidence | Optional | float | 145 | +------------------+-----------+-----------+ -------------------------------------------------------------------------------- /wn/_module_functions.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | import wn 4 | from wn._util import format_lexicon_specifier 5 | 6 | 7 | def projects() -> list[dict]: 8 | """Return the list of indexed projects. 9 | 10 | This returns the same dictionaries of information as 11 | :meth:`wn.config.get_project_info 12 | `, but for all indexed 13 | projects.
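Each dictionary includes keys such as ``id``, ``version``, ``label``, ``language``, ``cache``, and ``resource_urls``.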
14 | 15 | Example: 16 | 17 | >>> infos = wn.projects() 18 | >>> len(infos) 19 | 36 20 | >>> infos[0]['label'] 21 | 'Open English WordNet' 22 | 23 | """ 24 | index = wn.config.index 25 | return [ 26 | wn.config.get_project_info(format_lexicon_specifier(project_id, version)) 27 | for project_id, project_info in index.items() 28 | for version in project_info.get('versions', []) 29 | if 'resource_urls' in project_info['versions'][version] 30 | ] 31 | 32 | 33 | def lexicons( 34 | *, 35 | lexicon: Optional[str] = "*", 36 | lang: Optional[str] = None 37 | ) -> list[wn.Lexicon]: 38 | """Return the lexicons matching a language or lexicon specifier. 39 | 40 | Example: 41 | 42 | >>> wn.lexicons(lang='en') 43 | [, ] 44 | 45 | """ 46 | try: 47 | w = wn.Wordnet(lang=lang, lexicon=lexicon or '*') 48 | except wn.Error: 49 | return [] 50 | else: 51 | return w.lexicons() 52 | 53 | 54 | def word( 55 | id: str, 56 | *, 57 | lexicon: Optional[str] = None, 58 | lang: Optional[str] = None 59 | ) -> wn.Word: 60 | """Return the word with *id* in *lexicon*. 61 | 62 | This will create a :class:`Wordnet` object using the *lang* and 63 | *lexicon* arguments. The *id* argument is then passed to the 64 | :meth:`Wordnet.word` method. 65 | 66 | >>> wn.word('ewn-cell-n') 67 | Word('ewn-cell-n') 68 | 69 | """ 70 | return wn.Wordnet(lang=lang, lexicon=lexicon).word(id) 71 | 72 | 73 | def words( 74 | form: Optional[str] = None, 75 | pos: Optional[str] = None, 76 | *, 77 | lexicon: Optional[str] = None, 78 | lang: Optional[str] = None, 79 | ) -> list[wn.Word]: 80 | """Return the list of matching words. 81 | 82 | This will create a :class:`Wordnet` object using the *lang* and 83 | *lexicon* arguments. The remaining arguments are passed to the 84 | :meth:`Wordnet.words` method. 85 | 86 | >>> len(wn.words()) 87 | 282902 88 | >>> len(wn.words(pos='v')) 89 | 34592 90 | >>> wn.words(form="scurry") 91 | [Word('ewn-scurry-n'), Word('ewn-scurry-v')] 92 | 93 | """ 94 | return wn.Wordnet(lang=lang, lexicon=lexicon).words(form=form, pos=pos) 95 | 96 | 97 | def synset( 98 | id: str, 99 | *, 100 | lexicon: Optional[str] = None, 101 | lang: Optional[str] = None 102 | ) -> wn.Synset: 103 | """Return the synset with *id* in *lexicon*. 104 | 105 | This will create a :class:`Wordnet` object using the *lang* and 106 | *lexicon* arguments. The *id* argument is then passed to the 107 | :meth:`Wordnet.synset` method. 108 | 109 | >>> wn.synset('ewn-03311152-n') 110 | Synset('ewn-03311152-n') 111 | 112 | """ 113 | return wn.Wordnet(lang=lang, lexicon=lexicon).synset(id=id) 114 | 115 | 116 | def synsets( 117 | form: Optional[str] = None, 118 | pos: Optional[str] = None, 119 | ili: Optional[Union[str, wn.ILI]] = None, 120 | *, 121 | lexicon: Optional[str] = None, 122 | lang: Optional[str] = None, 123 | ) -> list[wn.Synset]: 124 | """Return the list of matching synsets. 125 | 126 | This will create a :class:`Wordnet` object using the *lang* and 127 | *lexicon* arguments. The remaining arguments are passed to the 128 | :meth:`Wordnet.synsets` method. 129 | 130 | >>> len(wn.synsets('couch')) 131 | 4 132 | >>> wn.synsets('couch', pos='v') 133 | [Synset('ewn-00983308-v')] 134 | 135 | """ 136 | return wn.Wordnet(lang=lang, lexicon=lexicon).synsets(form=form, pos=pos, ili=ili) 137 | 138 | 139 | def senses( 140 | form: Optional[str] = None, 141 | pos: Optional[str] = None, 142 | *, 143 | lexicon: Optional[str] = None, 144 | lang: Optional[str] = None, 145 | ) -> list[wn.Sense]: 146 | """Return the list of matching senses. 
147 | 148 | This will create a :class:`Wordnet` object using the *lang* and 149 | *lexicon* arguments. The remaining arguments are passed to the 150 | :meth:`Wordnet.senses` method. 151 | 152 | >>> len(wn.senses('twig')) 153 | 3 154 | >>> wn.senses('twig', pos='n') 155 | [Sense('ewn-twig-n-13184889-02')] 156 | 157 | """ 158 | return wn.Wordnet(lang=lang, lexicon=lexicon).senses(form=form, pos=pos) 159 | 160 | 161 | def sense( 162 | id: str, 163 | *, 164 | lexicon: Optional[str] = None, 165 | lang: Optional[str] = None 166 | ) -> wn.Sense: 167 | """Return the sense with *id* in *lexicon*. 168 | 169 | This will create a :class:`Wordnet` object using the *lang* and 170 | *lexicon* arguments. The *id* argument is then passed to the 171 | :meth:`Wordnet.sense` method. 172 | 173 | >>> wn.sense('ewn-flutter-v-01903884-02') 174 | Sense('ewn-flutter-v-01903884-02') 175 | 176 | """ 177 | return wn.Wordnet(lang=lang, lexicon=lexicon).sense(id=id) 178 | 179 | 180 | def ili( 181 | id: str, 182 | *, 183 | lexicon: Optional[str] = None, 184 | lang: Optional[str] = None 185 | ) -> wn.ILI: 186 | """Return the interlingual index with *id*. 187 | 188 | This will create a :class:`Wordnet` object using the *lang* and 189 | *lexicon* arguments. The *id* argument is then passed to the 190 | :meth:`Wordnet.ili` method. 191 | 192 | """ 193 | return wn.Wordnet(lang=lang, lexicon=lexicon).ili(id=id) 194 | 195 | 196 | def ilis( 197 | status: Optional[str] = None, 198 | *, 199 | lexicon: Optional[str] = None, 200 | lang: Optional[str] = None, 201 | ) -> list[wn.ILI]: 202 | """Return the list of matching interlingual indices. 203 | 204 | This will create a :class:`Wordnet` object using the *lang* and 205 | *lexicon* arguments. The remaining arguments are passed to the 206 | :meth:`Wordnet.ilis` method. 207 | 208 | >>> len(wn.ilis()) 209 | 120071 210 | >>> len(wn.ilis(status='proposed')) 211 | 2573 212 | >>> wn.ilis(status='proposed')[-1].definition() 213 | 'the neutrino associated with the tau lepton.' 
214 | 215 | """ 216 | return wn.Wordnet(lang=lang, lexicon=lexicon).ilis(status=status) 217 | -------------------------------------------------------------------------------- /tests/secondary_query_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | 6 | 7 | @pytest.mark.usefixtures('mini_db') 8 | def test_word_senses(): 9 | assert len(wn.word('test-en-information-n').senses()) == 1 10 | assert len(wn.word('test-es-información-n').senses()) == 1 11 | 12 | 13 | @pytest.mark.usefixtures('mini_db') 14 | def test_word_synsets(): 15 | assert len(wn.word('test-en-information-n').synsets()) == 1 16 | assert len(wn.word('test-es-información-n').synsets()) == 1 17 | 18 | 19 | @pytest.mark.usefixtures('mini_db') 20 | def test_word_translate(): 21 | assert len(wn.word('test-en-example-n').translate(lang='es')) == 1 22 | assert len(wn.word('test-es-ejemplo-n').translate(lang='en')) == 1 23 | 24 | 25 | @pytest.mark.usefixtures('mini_db') 26 | def test_sense_word(): 27 | assert (wn.sense('test-en-information-n-0001-01').word() 28 | == wn.word('test-en-information-n')) 29 | assert (wn.sense('test-es-información-n-0001-01').word() 30 | == wn.word('test-es-información-n')) 31 | 32 | 33 | @pytest.mark.usefixtures('mini_db') 34 | def test_sense_synset(): 35 | assert (wn.sense('test-en-information-n-0001-01').synset() 36 | == wn.synset('test-en-0001-n')) 37 | assert (wn.sense('test-es-información-n-0001-01').synset() 38 | == wn.synset('test-es-0001-n')) 39 | 40 | 41 | @pytest.mark.usefixtures('mini_db') 42 | def test_sense_issue_157(): 43 | # https://github.com/goodmami/wn/issues/157 44 | sense = wn.sense('test-en-information-n-0001-01') 45 | # This test uses non-public members, which is not ideal, but there 46 | # is currently no better alternative. 
47 | assert sense._lexconf is sense.word()._lexconf 48 | assert sense._lexconf is sense.synset()._lexconf 49 | 50 | 51 | @pytest.mark.usefixtures('mini_db') 52 | def test_sense_examples(): 53 | assert wn.sense('test-en-information-n-0001-01').examples() == [] 54 | assert wn.sense('test-es-información-n-0001-01').examples() == [] 55 | 56 | 57 | @pytest.mark.usefixtures('mini_db') 58 | def test_sense_lexicalized(): 59 | assert wn.sense('test-en-information-n-0001-01').lexicalized() 60 | assert wn.sense('test-es-información-n-0001-01').lexicalized() 61 | 62 | 63 | @pytest.mark.usefixtures('mini_db') 64 | def test_sense_frames(): 65 | assert wn.sense('test-en-illustrate-v-0003-01').frames() == [ 66 | 'Somebody ----s something', 67 | 'Something ----s something', 68 | ] 69 | assert wn.sense('test-es-ilustrar-v-0003-01').frames() == [] 70 | 71 | 72 | @pytest.mark.usefixtures('mini_db_1_1') 73 | def test_sense_frames_issue_156(): 74 | # https://github.com/goodmami/wn/issues/156 75 | assert wn.sense('test-ja-示す-v-0003-01').frames() == [ 76 | 'ある人が何かを----', 77 | ] 78 | assert wn.sense('test-ja-事例-n-0002-01').frames() == [] 79 | 80 | 81 | @pytest.mark.usefixtures('mini_db') 82 | def test_sense_translate(): 83 | assert len(wn.sense('test-en-information-n-0001-01').translate(lang='es')) == 1 84 | assert len(wn.sense('test-es-información-n-0001-01').translate(lang='en')) == 1 85 | 86 | 87 | @pytest.mark.usefixtures('mini_db') 88 | def test_synset_senses(): 89 | assert len(wn.synset('test-en-0003-v').senses()) == 2 90 | assert len(wn.synset('test-es-0003-v').senses()) == 2 91 | 92 | 93 | @pytest.mark.usefixtures('mini_db') 94 | def test_synset_words(): 95 | assert len(wn.synset('test-en-0003-v').words()) == 2 96 | assert len(wn.synset('test-es-0003-v').words()) == 2 97 | 98 | 99 | @pytest.mark.usefixtures('mini_db') 100 | def test_synset_lemmas(): 101 | assert wn.synset('test-en-0003-v').lemmas() == ['exemplify', 'illustrate'] 102 | assert wn.synset('test-es-0003-v').lemmas() == ['ejemplificar', 'ilustrar'] 103 | 104 | 105 | @pytest.mark.usefixtures('mini_db') 106 | def test_synset_ili(): 107 | assert isinstance(wn.synset('test-en-0001-n').ili, wn.ILI) 108 | assert wn.synset('test-en-0001-n').ili.id == 'i67447' 109 | assert wn.synset('test-en-0001-n').ili.status == 'presupposed' 110 | assert wn.synset('test-en-0008-n').ili is None 111 | assert wn.synset('test-en-0007-v').ili.id is None 112 | assert wn.synset('test-en-0007-v').ili.status == 'proposed' 113 | 114 | 115 | @pytest.mark.usefixtures('mini_db') 116 | def test_synset_definition(): 117 | assert wn.synset('test-en-0001-n').definition() == 'something that informs' 118 | defn = wn.synset('test-en-0001-n').definition(data=True) 119 | assert defn.source_sense_id == 'test-en-information-n-0001-01' 120 | assert wn.synset('test-es-0001-n').definition() == 'algo que informa' 121 | 122 | 123 | @pytest.mark.usefixtures('mini_db') 124 | def test_synset_definitions(): 125 | assert wn.synset('test-en-0001-n').definitions() == ['something that informs'] 126 | defns = wn.synset('test-en-0001-n').definitions(data=True) 127 | assert defns[0].source_sense_id == 'test-en-information-n-0001-01' 128 | assert wn.synset('test-es-0001-n').definitions() == ['algo que informa'] 129 | 130 | 131 | @pytest.mark.usefixtures('mini_db') 132 | def test_synset_examples(): 133 | assert wn.synset('test-en-0001-n').examples() == ['"this is information"'] 134 | ex = wn.synset('test-en-0001-n').examples(data=True)[0] 135 | assert ex.text == '"this is information"' 136 | assert 
wn.synset('test-es-0001-n').examples() == ['"este es la información"'] 137 | 138 | 139 | @pytest.mark.usefixtures('mini_db') 140 | def test_synset_lexicalized(): 141 | assert wn.synset('test-en-0001-n').lexicalized() 142 | assert wn.synset('test-es-0001-n').lexicalized() 143 | 144 | 145 | @pytest.mark.usefixtures('mini_db') 146 | def test_synset_translate(): 147 | assert len(wn.synset('test-en-0001-n').translate(lang='es')) == 1 148 | assert len(wn.synset('test-es-0001-n').translate(lang='en')) == 1 149 | 150 | 151 | @pytest.mark.usefixtures('uninitialized_datadir') 152 | def test_word_sense_order(datadir): 153 | wn.add(datadir / 'sense-member-order.xml') 154 | assert [s.id for s in wn.word('test-foo-n').senses()] == [ 155 | "test-01-foo-n", "test-02-foo-n", 156 | ] 157 | assert [s.id for s in wn.word('test-bar-n').senses()] == [ 158 | "test-02-bar-n", "test-01-bar-n", 159 | ] 160 | 161 | 162 | @pytest.mark.usefixtures('uninitialized_datadir') 163 | def test_synset_member_order(datadir): 164 | wn.add(datadir / 'sense-member-order.xml') 165 | assert [s.id for s in wn.synset('test-01-n').senses()] == [ 166 | "test-01-bar-n", "test-01-foo-n", 167 | ] 168 | assert [s.id for s in wn.synset('test-02-n').senses()] == [ 169 | "test-02-bar-n", "test-02-foo-n", 170 | ] 171 | -------------------------------------------------------------------------------- /tests/relations_test.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | import wn 5 | 6 | 7 | @pytest.mark.usefixtures('mini_db') 8 | def test_word_derived_words(): 9 | assert len(wn.word('test-en-example-n').derived_words()) == 1 10 | assert len(wn.word('test-es-ejemplo-n').derived_words()) == 1 11 | 12 | 13 | @pytest.mark.usefixtures('mini_db') 14 | def test_synset_hypernyms(): 15 | assert wn.synset('test-en-0002-n').hypernyms() == [ 16 | wn.synset('test-en-0001-n') 17 | ] 18 | assert wn.synset('test-en-0001-n').hypernyms() == [] 19 | 20 | 21 | @pytest.mark.usefixtures('mini_db') 22 | def test_synset_hypernyms_expand_default(): 23 | assert wn.synset('test-es-0002-n').hypernyms() == [ 24 | wn.synset('test-es-0001-n') 25 | ] 26 | assert wn.synset('test-es-0001-n').hypernyms() == [] 27 | 28 | 29 | @pytest.mark.usefixtures('mini_db') 30 | def test_synset_hypernyms_expand_empty(): 31 | w = wn.Wordnet(lang='es', expand='') 32 | assert w.synset('test-es-0002-n').hypernyms() == [] 33 | 34 | 35 | @pytest.mark.usefixtures('mini_db') 36 | def test_synset_hypernyms_expand_specified(): 37 | w = wn.Wordnet(lang='es', expand='test-en') 38 | assert w.synset('test-es-0002-n').hypernyms() == [ 39 | w.synset('test-es-0001-n') 40 | ] 41 | 42 | 43 | @pytest.mark.usefixtures('mini_db') 44 | def test_synset_relations(): 45 | w = wn.Wordnet(lang='en') 46 | assert w.synset('test-en-0002-n').relations() == { 47 | 'hypernym': [w.synset('test-en-0001-n')], 48 | 'hyponym': [w.synset('test-en-0004-n')] 49 | } 50 | 51 | 52 | @pytest.mark.usefixtures('mini_db') 53 | def test_sense_get_related(): 54 | w = wn.Wordnet('test-en') 55 | assert w.sense('test-en-example-n-0002-01').get_related() == [ 56 | w.sense('test-en-exemplify-v-0003-01') 57 | ] 58 | 59 | 60 | @pytest.mark.usefixtures('mini_db') 61 | def test_sense_relations(): 62 | w = wn.Wordnet('test-en') 63 | assert w.sense('test-en-example-n-0002-01').relations() == { 64 | 'derivation': [w.sense('test-en-exemplify-v-0003-01')] 65 | } 66 | 67 | 68 | @pytest.mark.usefixtures('mini_db_1_1') 69 | def test_extension_relations(): 70 | # default mode 71 | assert 
wn.synset('test-en-0007-v').hypernyms() == [ 72 | wn.synset('test-en-ext-0009-v') 73 | ] 74 | assert wn.synset('test-en-ext-0009-v').hyponyms() == [ 75 | wn.synset('test-en-0007-v') 76 | ] 77 | assert wn.sense('test-en-information-n-0001-01').get_related('pertainym') == [ 78 | wn.sense('test-en-ext-info-n-0001-01') 79 | ] 80 | assert wn.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [ 81 | wn.sense('test-en-information-n-0001-01') 82 | ] 83 | 84 | # restricted to base 85 | w = wn.Wordnet(lexicon='test-en') 86 | assert w.synset('test-en-0007-v').hypernyms() == [] 87 | assert w.sense('test-en-information-n-0001-01').get_related('pertainym') == [] 88 | 89 | # base and extension 90 | w = wn.Wordnet(lexicon='test-en test-en-ext') 91 | assert w.synset('test-en-0007-v').hypernyms() == [ 92 | w.synset('test-en-ext-0009-v') 93 | ] 94 | assert w.synset('test-en-ext-0009-v').hyponyms() == [ 95 | w.synset('test-en-0007-v') 96 | ] 97 | assert w.sense('test-en-information-n-0001-01').get_related('pertainym') == [ 98 | w.sense('test-en-ext-info-n-0001-01') 99 | ] 100 | assert w.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [ 101 | w.sense('test-en-information-n-0001-01') 102 | ] 103 | 104 | # restricted to extension 105 | w = wn.Wordnet(lexicon='test-en-ext') 106 | assert w.synset('test-en-ext-0009-v').hyponyms() == [] 107 | assert w.sense('test-en-ext-info-n-0001-01').get_related('pertainym') == [] 108 | 109 | 110 | @pytest.mark.usefixtures('mini_db_1_1') 111 | def test_sense_synset_issue_168(): 112 | # https://github.com/goodmami/wn/issues/168 113 | ja = wn.Wordnet(lexicon='test-ja', expand='') 114 | assert ja.synset('test-ja-0001-n').get_related() == [] 115 | assert ja.sense('test-ja-情報-n-0001-01').synset().get_related() == [] 116 | 117 | 118 | @pytest.mark.usefixtures('mini_db') 119 | def test_synset_relations_issue_169(): 120 | # https://github.com/goodmami/wn/issues/169 121 | en = wn.Wordnet('test-en') 122 | assert list(en.synset("test-en-0001-n").relations('hyponym')) == ['hyponym'] 123 | es = wn.Wordnet('test-es', expand='test-en') 124 | assert list(es.synset("test-es-0001-n").relations('hyponym')) == ['hyponym'] 125 | 126 | 127 | @pytest.mark.usefixtures('mini_db') 128 | def test_synset_relations_issue_177(): 129 | # https://github.com/goodmami/wn/issues/177 130 | assert 'hyponym' in wn.synset('test-es-0001-n').relations() 131 | 132 | 133 | @pytest.mark.usefixtures('mini_db') 134 | def test_sense_relation_map(): 135 | en = wn.Wordnet('test-en') 136 | assert en.sense('test-en-information-n-0001-01').relation_map() == {} 137 | relmap = en.sense('test-en-illustrate-v-0003-01').relation_map() 138 | # only sense-sense relations by default 139 | assert len(relmap) == 3 140 | assert all(isinstance(tgt, wn.Sense) for tgt in relmap.values()) 141 | assert {rel.name for rel in relmap} == {'derivation', 'other'} 142 | assert {rel.target_id for rel in relmap} == {'test-en-illustration-n-0002-01'} 143 | # sense relations targets should always have same ids as resolved targets 144 | assert all(rel.target_id == tgt.id for rel, tgt in relmap.items()) 145 | 146 | 147 | @pytest.mark.usefixtures('mini_db') 148 | def test_synset_relation_map(): 149 | en = wn.Wordnet('test-en') 150 | assert en.synset('test-en-0003-v').relation_map() == {} 151 | relmap = en.synset('test-en-0002-n').relation_map() 152 | assert len(relmap) == 2 153 | assert {rel.name for rel in relmap} == {'hypernym', 'hyponym'} 154 | assert {rel.target_id for rel in relmap} == {'test-en-0001-n', 'test-en-0004-n'} 155 
| # synset relation targets have same ids as resolved targets in same lexicon 156 | assert all(rel.target_id == tgt.id for rel, tgt in relmap.items()) 157 | assert all(rel.lexicon().id == 'test-en' for rel in relmap) 158 | 159 | # interlingual synset relation targets show original target ids 160 | es = wn.Wordnet('test-es', expand='test-en') 161 | relmap = es.synset('test-es-0002-n').relation_map() 162 | assert len(relmap) == 2 163 | assert {rel.name for rel in relmap} == {'hypernym', 'hyponym'} 164 | assert {rel.target_id for rel in relmap} == {'test-en-0001-n', 'test-en-0004-n'} 165 | assert all(rel.target_id != tgt.id for rel, tgt in relmap.items()) 166 | assert all(rel.lexicon().id == 'test-en' for rel in relmap) 167 | -------------------------------------------------------------------------------- /docs/api/wn.similarity.rst: -------------------------------------------------------------------------------- 1 | wn.similarity 2 | ============= 3 | 4 | .. automodule:: wn.similarity 5 | 6 | Taxonomy-based Metrics 7 | ---------------------- 8 | 9 | The `Path `_, `Leacock-Chodorow `_, and `Wu-Palmer `_ similarity 11 | metrics work by finding path distances in the hypernym/hyponym 12 | taxonomy. As such, they are most useful when the synsets are, in fact, 13 | arranged in a taxonomy. For the Princeton WordNet and derivative 14 | wordnets, such as the `Open English Wordnet`_ and `OMW English Wordnet 15 | based on WordNet 3.0`_ available to Wn, synsets for nouns and verbs 16 | are arranged taxonomically: the nouns mostly form a single structure 17 | with a single root while verbs form many smaller structures with many 18 | roots. Synsets for the other parts of speech do not use 19 | hypernym/hyponym relations at all. This situation may be different for 20 | other wordnet projects or future versions of the English wordnets. 21 | 22 | .. _Open English Wordnet: https://en-word.net 23 | .. _OMW English Wordnet based on WordNet 3.0: https://github.com/omwn/omw-data 24 | 25 | The similarity metrics tend to fail when the synsets are not connected 26 | by some path. When the synsets are in different parts of speech, or 27 | even in separate lexicons, this failure is acceptable and 28 | expected. But for cases like the verbs in the Princeton WordNet, it 29 | might be more useful to pretend that there is some unique root for all 30 | verbs so as to create a path connecting any two of them. For this 31 | purpose, the *simulate_root* parameter is available on the 32 | :func:`path`, :func:`lch`, and :func:`wup` functions, where it is 33 | passed on to calls to :meth:`wn.Synset.shortest_path` and 34 | :meth:`wn.Synset.lowest_common_hypernyms`. Setting *simulate_root* to 35 | :python:`True` can, however, give surprising results if the words are 36 | from a different lexicon. Currently, computing similarity for synsets 37 | from a different part of speech raises an error. 38 | 39 | 40 | Path Similarity 41 | ''''''''''''''' 42 | 43 | When :math:`p` is the length of the shortest path between two synsets, 44 | the path similarity is: 45 | 46 | .. math:: 47 | 48 | \frac{1}{p + 1} 49 | 50 | The similarity score ranges between 0.0 and 1.0, where the higher the 51 | score is, the more similar the synsets are. The score is 1.0 when a 52 | synset is compared to itself, and 0.0 when there is no path between 53 | the two synsets (i.e., the path distance is infinite). 54 | 55 | .. autofunction:: path 56 | 57 | 58 | .. 
_leacock-chodorow-similarity: 59 | 60 | Leacock-Chodorow Similarity 61 | ''''''''''''''''''''''''''' 62 | 63 | When :math:`p` is the length of the shortest path between two synsets 64 | and :math:`d` is the maximum taxonomy depth, the Leacock-Chodorow 65 | similarity is: 66 | 67 | .. math:: 68 | 69 | -\text{log}\left(\frac{p + 1}{2d}\right) 70 | 71 | .. autofunction:: lch 72 | 73 | 74 | Wu-Palmer Similarity 75 | '''''''''''''''''''' 76 | 77 | When *LCS* is the lowest common hypernym (also called "least common 78 | subsumer") between two synsets, :math:`i` is the shortest path 79 | distance from the first synset to *LCS*, :math:`j` is the shortest 80 | path distance from the second synset to *LCS*, and :math:`k` is the 81 | number of nodes (distance + 1) from *LCS* to the root node, then the 82 | Wu-Palmer similarity is: 83 | 84 | .. math:: 85 | 86 | \frac{2k}{i + j + 2k} 87 | 88 | .. autofunction:: wup 89 | 90 | 91 | Information Content-based Metrics 92 | --------------------------------- 93 | 94 | The `Resnik `_, `Jiang-Conrath `_, and `Lin `_ similarity metrics work 96 | by computing the information content of the synsets and/or that of 97 | their lowest common hypernyms. They therefore require information 98 | content weights (see :mod:`wn.ic`), and the values returned 99 | necessarily depend on the weights used. 100 | 101 | 102 | Resnik Similarity 103 | ''''''''''''''''' 104 | 105 | The Resnik similarity (`Resnik 1995 106 | `_) is the maximum 107 | information content value of the common subsumers (hypernym ancestors) 108 | of the two synsets. Formally it is defined as follows, where 109 | :math:`c_1` and :math:`c_2` are the two synsets being compared. 110 | 111 | .. math:: 112 | 113 | \text{max}_{c \in \text{S}(c_1, c_2)} \text{IC}(c) 114 | 115 | Since a synset's information content is always equal or greater than 116 | the information content of its hypernyms, :math:`S(c_1, c_2)` above is 117 | more efficiently computed using the lowest common hypernyms instead of 118 | all common hypernyms. 119 | 120 | .. autofunction:: res 121 | 122 | 123 | Jiang-Conrath Similarity 124 | '''''''''''''''''''''''' 125 | 126 | The Jiang-Conrath similarity metric (`Jiang and Conrath, 1997 127 | `_) combines the ideas 128 | of the taxonomy-based and information content-based metrics. It is 129 | defined as follows, where :math:`c_1` and :math:`c_2` are the two 130 | synsets being compared and :math:`c_0` is the lowest common hypernym 131 | of the two with the highest information content weight: 132 | 133 | .. math:: 134 | 135 | \frac{1}{\text{IC}(c_1) + \text{IC}(c_2) - 2(\text{IC}(c_0))} 136 | 137 | This equation is the simplified form given in the paper were several 138 | parameterized terms are cancelled out because the full form is not 139 | often used in practice. 140 | 141 | There are two special cases: 142 | 143 | 1. If the information content of :math:`c_0`, :math:`c_1`, and 144 | :math:`c_2` are all zero, the metric returns zero. This occurs when 145 | both :math:`c_1` and :math:`c_2` are the root node, but it can also 146 | occur if the synsets did not occur in the corpus and the smoothing 147 | value was set to zero. 148 | 149 | 2. Otherwise if :math:`c_1 + c_2 = 2c_0`, the metric returns 150 | infinity. This occurs when the two synsets are the same, one is a 151 | descendant of the other, etc., such that they have the same 152 | frequency as each other and as their lowest common hypernym. 153 | 154 | .. 
.. autofunction:: jcn 155 | 156 | 157 | Lin Similarity 158 | '''''''''''''' 159 | 160 | Another formulation of information content-based similarity is the Lin 161 | metric (`Lin 1997 `_), 162 | which is defined as follows, where :math:`c_1` and :math:`c_2` are the 163 | two synsets being compared and :math:`c_0` is the lowest common 164 | hypernym with the highest information content weight: 165 | 166 | .. math:: 167 | 168 | \frac{2(\text{IC}(c_0))}{\text{IC}(c_1) + \text{IC}(c_2)} 169 | 170 | One special case is if either synset has an information content value 171 | of zero, in which case the metric returns zero. 172 | 173 | .. autofunction:: lin 174 | -------------------------------------------------------------------------------- /wn/util.py: -------------------------------------------------------------------------------- 1 | """Wn utility classes.""" 2 | from collections.abc import Callable 3 | from typing import TextIO 4 | import sys 5 | 6 | 7 | def synset_id_formatter( 8 | fmt: str = '{prefix}-{offset:08}-{pos}', 9 | **kwargs 10 | ) -> Callable: 11 | """Return a function for formatting synset ids. 12 | 13 | The *fmt* argument can be customized. It will be formatted using 14 | any other keyword arguments given to this function and any given 15 | to the resulting function. By default, the format string expects a 16 | ``prefix`` string argument for the namespace (such as a lexicon 17 | id), an ``offset`` integer argument (such as a WNDB offset), and a 18 | ``pos`` string argument. 19 | 20 | Arguments: 21 | fmt: A Python format string 22 | **kwargs: Keyword arguments for the format string. 23 | 24 | Example: 25 | 26 | >>> pwn_synset_id = synset_id_formatter(prefix='pwn') 27 | >>> pwn_synset_id(offset=1174, pos='n') 28 | 'pwn-00001174-n' 29 | 30 | """ 31 | 32 | def format_synset_id(**_kwargs) -> str: 33 | return fmt.format(**kwargs, **_kwargs) 34 | 35 | return format_synset_id 36 | 37 | 38 | class ProgressHandler: 39 | """An interface for updating progress in long-running processes. 40 | 41 | Long-running processes in Wn, such as :func:`wn.download` and 42 | :func:`wn.add`, call to a progress handler object as they go. The 43 | default progress handler used by Wn is :class:`ProgressBar`, which 44 | updates progress by formatting and printing a textual bar to 45 | stderr. The :class:`ProgressHandler` class may be used directly, 46 | which does nothing, or users may create their own subclasses for, 47 | e.g., updating a GUI or some other handler. 48 | 49 | The initialization parameters, except for ``file``, are stored in 50 | a :attr:`kwargs` member and may be updated after the handler is 51 | created through the :meth:`set` method. The :meth:`update` method 52 | is the primary way a counter is updated. The :meth:`flash` method 53 | is sometimes called for simple messages. When the process is 54 | complete, the :meth:`close` method is called. 55 | 56 | 57 | """ 58 | 59 | def __init__( 60 | self, 61 | *, 62 | message: str = '', 63 | count: int = 0, 64 | total: int = 0, 65 | refresh_interval: int = 0, 66 | unit: str = '', 67 | status: str = '', 68 | file: TextIO = sys.stderr, 69 | ): 70 | self.file = file 71 | self.kwargs = { 72 | 'count': count, 73 | 'total': total, 74 | 'refresh_interval': refresh_interval, 75 | 'message': message, 76 | 'unit': unit, 77 | 'status': status, 78 | } 79 | self._refresh_quota: int = refresh_interval 80 | 81 | def update(self, n: int = 1, force: bool = False) -> None: 82 | """Update the counter with the increment value *n*.
83 | 84 | This method should update the ``count`` key of :attr:`kwargs` 85 | with the increment value *n*. After this, it is expected to 86 | update some user-facing progress indicator. 87 | 88 | If *force* is :python:`True`, any indicator will be refreshed 89 | regardless of the value of the refresh interval. 90 | 91 | """ 92 | self.kwargs['count'] += n  # type: ignore 93 | 94 | def set(self, **kwargs) -> None: 95 | """Update progress handler parameters. 96 | 97 | Calling this method also runs :meth:`update` with an increment 98 | of 0, which causes a refresh of any indicator without changing 99 | the counter. 100 | 101 | """ 102 | self.kwargs.update(**kwargs) 103 | self.update(0, force=True) 104 | 105 | def flash(self, message: str) -> None: 106 | """Issue a message unrelated to the current counter. 107 | 108 | This may be useful for multi-stage processes to indicate the 109 | move to a new stage, or to log unexpected situations. 110 | 111 | """ 112 | pass 113 | 114 | def close(self) -> None: 115 | """Close the progress handler. 116 | 117 | This might be useful for closing file handles or cleaning up 118 | resources. 119 | 120 | """ 121 | pass 122 | 123 | 124 | class ProgressBar(ProgressHandler): 125 | """A :class:`ProgressHandler` subclass for printing a progress bar. 126 | 127 | Example: 128 | >>> p = ProgressBar(message='Progress: ', total=10, unit=' units') 129 | >>> p.update(3) 130 | Progress: [######### ] (3/10 units) 131 | 132 | See :meth:`format` for a description of how the progress bar is 133 | formatted. 134 | 135 | """ 136 | 137 | #: The default formatting template. 138 | FMT = '\r{message}{bar}{counter}{status}' 139 | 140 | def update(self, n: int = 1, force: bool = False) -> None: 141 | """Increment the count by *n* and print the reformatted bar.""" 142 | self.kwargs['count'] += n  # type: ignore 143 | self._refresh_quota -= n 144 | if force or self._refresh_quota <= 0: 145 | self._refresh_quota = self.kwargs['refresh_interval']  # type: ignore 146 | s = self.format() 147 | if self.file: 148 | print('\r\033[K', end='', file=self.file) 149 | print(s, end='', file=self.file) 150 | 151 | def format(self) -> str: 152 | """Format and return the progress bar. 153 | 154 | The bar is formatted according to :attr:`FMT`, using 155 | variables from :attr:`kwargs` and two computed variables: 156 | 157 | - ``bar``: visualization of the progress bar, empty when 158 | ``total`` is 0 159 | 160 | - ``counter``: display of ``count``, ``total``, and ``unit`` 161 | 162 | >>> p = ProgressBar(message='Progress', count=2, total=10, unit='K') 163 | >>> p.format() 164 | '\\rProgress [###### ] (2/10K) ' 165 | >>> p = ProgressBar(count=2, status='Counting...') 166 | >>> p.format() 167 | '\\r (2) Counting...'
168 | 169 | """ 170 | _kw = self.kwargs 171 | width = 30 172 | total: int = _kw['total'] # type: ignore 173 | count: int = _kw['count'] # type: ignore 174 | 175 | if total > 0: 176 | num = min(count, total) * width 177 | fill = (num // total) * '#' 178 | part = ((num % total) * 3) // total 179 | if part: 180 | fill += '-='[part-1] 181 | bar = f' [{fill:<{width}}]' 182 | counter = f' ({count}/{total}{_kw["unit"]}) ' 183 | else: 184 | bar = '' 185 | counter = f' ({count}{_kw["unit"]}) ' 186 | 187 | return self.FMT.format(bar=bar, counter=counter, **_kw) 188 | 189 | def flash(self, message: str) -> None: 190 | """Overwrite the progress bar with *message*.""" 191 | print(f'\r\033[K{message}', end='', file=self.file) 192 | 193 | def close(self) -> None: 194 | """Print a newline so the last printed bar remains on screen.""" 195 | print(file=self.file) 196 | -------------------------------------------------------------------------------- /tests/similarity_test.py: -------------------------------------------------------------------------------- 1 | 2 | from math import log 3 | 4 | import pytest 5 | 6 | import wn 7 | from wn import similarity as sim 8 | from wn.taxonomy import taxonomy_depth 9 | from wn.ic import information_content as infocont 10 | 11 | 12 | def get_synsets(w): 13 | return { 14 | 'information': w.synset('test-en-0001-n'), 15 | 'example': w.synset('test-en-0002-n'), 16 | 'sample': w.synset('test-en-0004-n'), 17 | 'random sample': w.synset('test-en-0005-n'), 18 | 'random sample2': w.synset('test-en-0008-n'), 19 | 'datum': w.synset('test-en-0006-n'), 20 | 'exemplify': w.synset('test-en-0003-v'), 21 | } 22 | 23 | 24 | # some fake information content; computed using: 25 | # words = ['example', 'example', 'sample', 'random sample', 'illustrate'] 26 | # ic = compute(words, wn.Wordnet('test-en'), distribute_weight=False) 27 | 28 | ic = { 29 | 'n': {'test-en-0001-n': 5.0, # information 30 | 'test-en-0002-n': 5.0, # example, illustration 31 | 'test-en-0004-n': 3.0, # sample 32 | 'test-en-0005-n': 2.0, # random sample 33 | 'test-en-0008-n': 2.0, # random sample 2 34 | 'test-en-0006-n': 1.0, # datum 35 | None: 6.0}, 36 | 'v': {'test-en-0003-v': 2.0, # exemplify, illustrate 37 | 'test-en-0007-v': 1.0, # resignate 38 | None: 2.0}, 39 | 'a': {None: 1.0}, 40 | 'r': {None: 1.0} 41 | } 42 | 43 | 44 | @pytest.mark.usefixtures('mini_db') 45 | def test_path(): 46 | ss = get_synsets(wn.Wordnet('test-en')) 47 | assert sim.path(ss['information'], ss['information']) == 1/1 48 | assert sim.path(ss['information'], ss['example']) == 1/2 49 | assert sim.path(ss['information'], ss['sample']) == 1/3 50 | assert sim.path(ss['information'], ss['random sample']) == 1/4 51 | assert sim.path(ss['random sample'], ss['datum']) == 1/5 52 | assert sim.path(ss['random sample2'], ss['datum']) == 0 53 | assert sim.path(ss['random sample2'], ss['datum'], simulate_root=True) == 1/4 54 | assert sim.path( 55 | ss['random sample'], ss['random sample2'], simulate_root=True 56 | ) == 1/6 57 | with pytest.raises(wn.Error): 58 | sim.path(ss['example'], ss['exemplify']) 59 | with pytest.raises(wn.Error): 60 | sim.wup(ss['example'], ss['exemplify'], simulate_root=True) 61 | 62 | 63 | @pytest.mark.usefixtures('mini_db') 64 | def test_wup(): 65 | ss = get_synsets(wn.Wordnet('test-en')) 66 | assert sim.wup(ss['information'], ss['information']) == (2*1) / (0+0+2*1) 67 | assert sim.wup(ss['information'], ss['example']) == (2*1) / (0+1+2*1) 68 | assert sim.wup(ss['information'], ss['sample']) == (2*1) / (0+2+2*1) 69 | assert 
sim.wup(ss['information'], ss['random sample']) == (2*1) / (0+3+2*1) 70 | assert sim.wup(ss['random sample'], ss['datum']) == (2*1) / (3+1+2*1) 71 | with pytest.raises(wn.Error): 72 | assert sim.wup(ss['random sample2'], ss['datum']) 73 | assert (sim.wup(ss['random sample2'], ss['datum'], simulate_root=True) 74 | == (2*1) / (1+2+2*1)) 75 | assert (sim.wup(ss['random sample'], ss['random sample2'], simulate_root=True) 76 | == (2*1) / (4+1+2*1)) 77 | with pytest.raises(wn.Error): 78 | sim.wup(ss['example'], ss['exemplify']) 79 | with pytest.raises(wn.Error): 80 | sim.wup(ss['example'], ss['exemplify'], simulate_root=True) 81 | 82 | 83 | @pytest.mark.usefixtures('mini_db') 84 | def test_lch(): 85 | w = wn.Wordnet('test-en') 86 | ss = get_synsets(w) 87 | d_n = taxonomy_depth(w, 'n') 88 | assert sim.lch(ss['information'], ss['information'], d_n) == -log((0+1) / (2*d_n)) 89 | assert sim.lch(ss['information'], ss['example'], d_n) == -log((1+1) / (2*d_n)) 90 | assert sim.lch(ss['information'], ss['sample'], d_n) == -log((2+1) / (2*d_n)) 91 | assert sim.lch(ss['information'], ss['random sample'], d_n) == -log((3+1) / (2*d_n)) 92 | assert sim.lch(ss['random sample'], ss['datum'], d_n) == -log((4+1) / (2*d_n)) 93 | with pytest.raises(wn.Error): 94 | assert sim.lch(ss['random sample2'], ss['datum'], d_n) 95 | assert (sim.lch(ss['random sample2'], ss['datum'], d_n, simulate_root=True) 96 | == -log((3+1) / (2*d_n))) 97 | assert (sim.lch(ss['random sample'], ss['random sample2'], d_n, simulate_root=True) 98 | == -log((5+1) / (2*d_n))) 99 | with pytest.raises(wn.Error): 100 | sim.lch(ss['example'], ss['exemplify'], d_n) 101 | with pytest.raises(wn.Error): 102 | sim.lch(ss['example'], ss['exemplify'], d_n, simulate_root=True) 103 | 104 | 105 | @pytest.mark.usefixtures('mini_db') 106 | def test_res(): 107 | w = wn.Wordnet('test-en') 108 | ss = get_synsets(w) 109 | assert (sim.res(ss['information'], ss['information'], ic) 110 | == infocont(ss['information'], ic)) 111 | assert (sim.res(ss['information'], ss['example'], ic) 112 | == infocont(ss['information'], ic)) 113 | assert (sim.res(ss['information'], ss['sample'], ic) 114 | == infocont(ss['information'], ic)) 115 | assert (sim.res(ss['information'], ss['random sample'], ic) 116 | == infocont(ss['information'], ic)) 117 | assert (sim.res(ss['random sample'], ss['datum'], ic) 118 | == infocont(ss['information'], ic)) 119 | with pytest.raises(wn.Error): 120 | sim.res(ss['random sample2'], ss['datum'], ic) 121 | with pytest.raises(wn.Error): 122 | sim.res(ss['example'], ss['exemplify'], ic) 123 | 124 | 125 | @pytest.mark.usefixtures('mini_db') 126 | def test_jcn(): 127 | w = wn.Wordnet('test-en') 128 | ss = get_synsets(w) 129 | info_ic = infocont(ss['information'], ic) 130 | assert (sim.jcn(ss['information'], ss['information'], ic) 131 | == float('inf')) 132 | assert (sim.jcn(ss['information'], ss['example'], ic) 133 | == float('inf')) 134 | assert (sim.jcn(ss['information'], ss['sample'], ic) 135 | == 1 / ((info_ic + infocont(ss['sample'], ic)) - 2 * info_ic)) 136 | assert (sim.jcn(ss['information'], ss['random sample'], ic) 137 | == 1 / ((info_ic + infocont(ss['random sample'], ic)) - 2 * info_ic)) 138 | assert (sim.jcn(ss['random sample'], ss['datum'], ic) 139 | == 1 / ( 140 | (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic)) 141 | - 2 * info_ic)) 142 | with pytest.raises(wn.Error): 143 | sim.jcn(ss['random sample2'], ss['datum'], ic) 144 | with pytest.raises(wn.Error): 145 | sim.jcn(ss['example'], ss['exemplify'], ic) 146 | 147 | 148 | 
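# The assertions below exercise the Lin formula, 2*IC(c0) / (IC(c1) + IC(c2)),
# where c0 is the lowest common hypernym with the highest IC weight; a synset
# compared with itself, or with an ancestor sharing the same weight, scores 1.0.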
@pytest.mark.usefixtures('mini_db') 149 | def test_lin(): 150 | w = wn.Wordnet('test-en') 151 | ss = get_synsets(w) 152 | info_ic = infocont(ss['information'], ic) 153 | assert (sim.lin(ss['information'], ss['information'], ic) 154 | == 1.0) 155 | assert (sim.lin(ss['information'], ss['example'], ic) 156 | == 1.0) 157 | assert (sim.lin(ss['information'], ss['sample'], ic) 158 | == (2 * info_ic) / (info_ic + infocont(ss['sample'], ic))) 159 | assert (sim.lin(ss['information'], ss['random sample'], ic) 160 | == (2 * info_ic) / (info_ic + infocont(ss['random sample'], ic))) 161 | assert (sim.lin(ss['random sample'], ss['datum'], ic) 162 | == ((2 * info_ic) 163 | / (infocont(ss['random sample'], ic) + infocont(ss['datum'], ic)))) 164 | with pytest.raises(wn.Error): 165 | sim.lin(ss['random sample2'], ss['datum'], ic) 166 | with pytest.raises(wn.Error): 167 | sim.lin(ss['example'], ss['exemplify'], ic) 168 | -------------------------------------------------------------------------------- /wn/ic.py: -------------------------------------------------------------------------------- 1 | 2 | """Information Content is a corpus-based metric of synset or sense 3 | specificity. 4 | 5 | """ 6 | 7 | from typing import Optional, TextIO 8 | from pathlib import Path 9 | from collections import Counter 10 | from collections.abc import Callable, Iterable, Iterator 11 | from math import log 12 | 13 | from wn._types import AnyPath 14 | from wn._core import Synset, Wordnet 15 | from wn.constants import NOUN, VERB, ADJ, ADV, ADJ_SAT 16 | from wn.util import synset_id_formatter 17 | 18 | 19 | # Just use a subset of all available parts of speech 20 | IC_PARTS_OF_SPEECH = frozenset((NOUN, VERB, ADJ, ADV)) 21 | Freq = dict[str, dict[Optional[str], float]] 22 | 23 | 24 | def information_content(synset: Synset, freq: Freq) -> float: 25 | """Calculate the Information Content value for a synset. 26 | 27 | The information content of a synset is the negative log of the 28 | synset probability (see :func:`synset_probability`). 29 | 30 | """ 31 | return -log(synset_probability(synset, freq)) 32 | 33 | 34 | def synset_probability(synset: Synset, freq: Freq) -> float: 35 | """Calculate the synset probability. 36 | 37 | The synset probability is defined as freq(ss)/N where freq(ss) is 38 | the IC weight for the synset and N is the total IC weight for all 39 | synsets with the same part of speech. 40 | 41 | Note: this function is not generally used directly, but indirectly 42 | through :func:`information_content`. 43 | 44 | """ 45 | pos_freq = freq[synset.pos] 46 | return pos_freq[synset.id] / pos_freq[None] 47 | 48 | 49 | def _initialize( 50 | wordnet: Wordnet, 51 | smoothing: float, 52 | ) -> Freq: 53 | """Populate an Information Content weight mapping to a smoothing value. 54 | 55 | All synsets in *wordnet* are inserted into the dictionary and 56 | mapped to *smoothing*. 57 | 58 | """ 59 | freq: Freq = { 60 | pos: {synset.id: smoothing for synset in wordnet.synsets(pos=pos)} 61 | for pos in IC_PARTS_OF_SPEECH 62 | } 63 | # pretend ADJ_SAT is just ADJ 64 | for synset in wordnet.synsets(pos=ADJ_SAT): 65 | freq[ADJ][synset.id] = smoothing 66 | # also initialize totals (when synset is None) for each part-of-speech 67 | for pos in IC_PARTS_OF_SPEECH: 68 | freq[pos][None] = smoothing 69 | return freq 70 | 71 | 72 | def compute( 73 | corpus: Iterable[str], 74 | wordnet: Wordnet, 75 | distribute_weight: bool = True, 76 | smoothing: float = 1.0 77 | ) -> Freq: 78 | """Compute Information Content weights from a corpus.
79 | 80 | Arguments: 81 | corpus: An iterable of string tokens. This is a flat list of 82 | words and the order does not matter. Tokens may be single 83 | words or multiple words separated by a space. 84 | 85 | wordnet: An instantiated :class:`wn.Wordnet` object, used to 86 | look up synsets from words. 87 | 88 | distribute_weight: If :python:`True`, the counts for a word 89 | are divided evenly among all synsets for the word. 90 | 91 | smoothing: The initial value given to each synset. 92 | 93 | Example: 94 | >>> import wn, wn.ic, wn.morphy 95 | >>> ewn = wn.Wordnet('ewn:2020', lemmatizer=wn.morphy.morphy) 96 | >>> freq = wn.ic.compute(["Dogs", "run", ".", "Cats", "sleep", "."], ewn) 97 | >>> dog = ewn.synsets('dog', pos='n')[0] 98 | >>> cat = ewn.synsets('cat', pos='n')[0] 99 | >>> frog = ewn.synsets('frog', pos='n')[0] 100 | >>> freq['n'][dog.id] 101 | 1.125 102 | >>> freq['n'][cat.id] 103 | 1.1 104 | >>> freq['n'][frog.id] # no occurrence; smoothing value only 105 | 1.0 106 | >>> carnivore = dog.lowest_common_hypernyms(cat)[0] 107 | >>> freq['n'][carnivore.id] 108 | 1.3250000000000002 109 | """ 110 | freq = _initialize(wordnet, smoothing) 111 | counts = Counter(corpus) 112 | 113 | hypernym_cache: dict[Synset, list[Synset]] = {} 114 | for word, count in counts.items(): 115 | synsets = wordnet.synsets(word) 116 | num = len(synsets) 117 | if num == 0: 118 | continue 119 | 120 | weight = float(count / num if distribute_weight else count) 121 | 122 | for synset in synsets: 123 | pos = synset.pos 124 | if pos == ADJ_SAT: 125 | pos = ADJ 126 | if pos not in IC_PARTS_OF_SPEECH: 127 | continue 128 | 129 | freq[pos][None] += weight 130 | 131 | # The following while-loop is equivalent to: 132 | # 133 | # freq[pos][synset.id] += weight 134 | # for path in synset.hypernym_paths(): 135 | # for ss in path: 136 | # freq[pos][ss.id] += weight 137 | # 138 | # ...but it caches hypernym lookups for speed 139 | 140 | agenda: list[tuple[Synset, set[Synset]]] = [(synset, set())] 141 | while agenda: 142 | ss, seen = agenda.pop() 143 | 144 | # avoid cycles 145 | if ss in seen: 146 | continue 147 | 148 | freq[pos][ss.id] += weight 149 | 150 | if ss not in hypernym_cache: 151 | hypernym_cache[ss] = ss.hypernyms() 152 | agenda.extend((hyp, seen | {ss}) for hyp in hypernym_cache[ss]) 153 | 154 | return freq 155 | 156 | 157 | def load( 158 | source: AnyPath, 159 | wordnet: Wordnet, 160 | get_synset_id: Optional[Callable] = None, 161 | ) -> Freq: 162 | """Load an Information Content mapping from a file. 163 | 164 | Arguments: 165 | 166 | source: A path to an information content weights file. 167 | 168 | wordnet: A :class:`wn.Wordnet` instance with synset 169 | identifiers matching the offsets in the weights file. 170 | 171 | get_synset_id: A callable that takes a synset offset and part 172 | of speech and returns a synset ID valid in *wordnet*. 173 | 174 | Raises: 175 | 176 | :class:`wn.Error`: If *wordnet* does not have exactly one 177 | lexicon. 
178 | 179 | Example: 180 | 181 | >>> import wn, wn.ic 182 | >>> pwn = wn.Wordnet('pwn:3.0') 183 | >>> path = '~/nltk_data/corpora/wordnet_ic/ic-brown-resnik-add1.dat' 184 | >>> freq = wn.ic.load(path, pwn) 185 | 186 | """ 187 | source = Path(source).expanduser().resolve(strict=True) 188 | assert len(wordnet.lexicons()) == 1 189 | lexid = wordnet.lexicons()[0].id 190 | if get_synset_id is None: 191 | get_synset_id = synset_id_formatter(prefix=lexid) 192 | 193 | freq = _initialize(wordnet, 0.0) 194 | 195 | with source.open() as icfile: 196 | for offset, pos, weight, is_root in _parse_ic_file(icfile): 197 | ssid = get_synset_id(offset=offset, pos=pos) 198 | # synset = wordnet.synset(ssid) 199 | freq[pos][ssid] = weight 200 | if is_root: 201 | freq[pos][None] += weight 202 | return freq 203 | 204 | 205 | def _parse_ic_file(icfile: TextIO) -> Iterator[tuple[int, str, float, bool]]: 206 | """Parse the Information Content file. 207 | 208 | A sample of the format is:: 209 | 210 | wnver::eOS9lXC6GvMWznF1wkZofDdtbBU 211 | 1740n 1915712 ROOT 212 | 1930n 859272 213 | 2137n 1055337 214 | 215 | """ 216 | next(icfile)  # skip header 217 | for line in icfile: 218 | ssinfo, value, *isroot = line.split() 219 | yield (int(ssinfo[:-1]), 220 | ssinfo[-1], 221 | float(value), 222 | bool(isroot)) 223 | -------------------------------------------------------------------------------- /docs/api/wn.ic.rst: -------------------------------------------------------------------------------- 1 | 2 | wn.ic 3 | ===== 4 | 5 | .. automodule:: wn.ic 6 | 7 | The mathematical formulae for information content are defined in 8 | `Formal Description`_, and the corresponding Python API functions are 9 | described in `Calculating Information Content`_. These functions 10 | require information content weights obtained either by `computing them 11 | from a corpus `_, or by `loading 12 | pre-computed weights from a file `_. 14 | 15 | .. note:: 16 | 17 | The term *information content* can be ambiguous. It often, and most 18 | accurately, refers to the result of the :func:`information_content` 19 | function (:math:`\text{IC}(c)` in the mathematical notation), but 20 | is also sometimes used to refer to the corpus frequencies/weights 21 | (:math:`\text{freq}(c)` in the mathematical notation) returned by 22 | :func:`load` or :func:`compute`, as these weights are the basis of 23 | the value computed by :func:`information_content`. The Wn 24 | documentation tries to consistently refer to the former as the 25 | *information content value*, or just *information content*, and the 26 | latter as *information content weights*, or *weights*. 27 | 28 | 29 | Formal Description 30 | ------------------ 31 | 32 | The Information Content (IC) of a concept (synset) is a measure of its 33 | specificity computed from the wordnet's taxonomy structure and corpus 34 | frequencies. It is defined by Resnik 1995 ([RES95]_), following 35 | information theory, as the negative log-probability of a concept: 36 | 37 | .. math:: 38 | 39 | \text{IC}(c) = -\log{p(c)} 40 | 41 | A concept's probability is the empirical probability over a corpus: 42 | 43 | .. math:: 44 | 45 | p(c) = \frac{\text{freq}(c)}{N} 46 | 47 | Here, :math:`N` is the total count of words of the same category as 48 | concept :math:`c` ([RES95]_ only considered nouns) where each word has 49 | some representation in the wordnet, and :math:`\text{freq}` is defined 50 | as the sum of corpus counts of words in :math:`\text{words}(c)`, which 51 | is the set of words subsumed by concept :math:`c`: 52 | 53 | ..
math:: 54 | 55 | \text{freq}(c) = \sum_{w \in \text{words}(c)}{\text{count}(w)} 56 | 57 | It is common for :math:`\text{freq}` to not contain actual frequencies 58 | but instead weights distributed evenly among the synsets for a 59 | word. These weights are calculated as the word frequency divided by 60 | the number of synsets for the word: 61 | 62 | .. math:: 63 | 64 | \text{freq}_{\text{distributed}}(c) 65 | = \sum_{w \in \text{words}(c)}{\frac{\text{count}(w)}{|\text{synsets}(w)|}} 66 | 67 | .. [RES95] Resnik, Philip. "Using information content to evaluate 68 | semantic similarity." In Proceedings of the 14th International 69 | Joint Conference on Artificial Intelligence (IJCAI-95), Montreal, 70 | Canada, pp. 448-453. 1995. 71 | 72 | 73 | Example 74 | ------- 75 | 76 | In the Princeton WordNet 3.0 (hereafter *WordNet*, but note that the 77 | equivalent lexicon in Wn is the *OMW English Wordnet based on WordNet 78 | 3.0* with specifier ``omw-en:1.4``), the frequency of a concept like 79 | **stone fruit** is not just the number of occurrences of *stone 80 | fruit*, but also includes the counts of the words for its hyponyms 81 | (*almond*, *olive*, etc.) and other taxonomic descendants (*Jordan 82 | almond*, *green olive*, etc.). The word *almond* has two synsets: one 83 | for the fruit or nut, another for the plant. Thus, if the word 84 | *almond* is encountered :math:`n` times in a corpus, then the weight 85 | (either the frequency :math:`n` or distributed weight 86 | :math:`\frac{n}{2}`) is added to the total weights for both synsets 87 | and to those of their ancestors, but not for descendant synsets, such 88 | as for **Jordan almond**. The fruit/nut synset of almond has two 89 | hypernym paths which converge on **fruit**: 90 | 91 | 1. **almond** ⊃ **stone fruit** ⊃ **fruit** 92 | 2. **almond** ⊃ **nut** ⊃ **seed** ⊃ **fruit** 93 | 94 | The weight is added to each ancestor (**stone fruit**, **nut**, 95 | **seed**, **fruit**, ...) once. That is, the weight is not added to 96 | the convergent ancestor for **fruit** twice, but only once. 97 | 98 | 99 | Calculating Information Content 100 | ------------------------------- 101 | 102 | .. autofunction:: information_content 103 | .. autofunction:: synset_probability 104 | 105 | 106 | Computing Corpus Weights 107 | ------------------------ 108 | 109 | If pre-computed weights are not available for a wordnet or for some 110 | domain, they can be computed given a corpus and a wordnet. 111 | 112 | The corpus is an iterable of words. For large corpora it may help to 113 | use a generator for this iterable, but the entire vocabulary (i.e., 114 | unique words and counts) will be held at once in memory. Multi-word 115 | expressions are also possible if they exist in the wordnet. For 116 | instance, WordNet has *stone fruit*, with a single space delimiting 117 | the words, as an entry. 118 | 119 | The :class:`wn.Wordnet` object must be instantiated with a single 120 | lexicon, although it may have expand-lexicons for relation 121 | traversal. For best results, the wordnet should use a lemmatizer to 122 | help it deal with inflected wordforms from running text. 123 | 124 | .. autofunction:: compute 125 | 126 | 127 | Reading Pre-computed Information Content Files 128 | ---------------------------------------------- 129 | 130 | The :func:`load` function reads pre-computed information content 131 | weights files as used by the `WordNet::Similarity 132 | `_ Perl module or the `NLTK 133 | `_ Python package. 
These files are computed for 134 | a specific version of a wordnet using the synset offsets from the 135 | `WNDB `_ format, 136 | which Wn does not use. These offsets therefore must be converted into 137 | an identifier that matches those used by the wordnet. By default, 138 | :func:`load` uses the lexicon identifier from its *wordnet* argument 139 | with synset offsets (padded with 0s to make 8 digits) and 140 | parts-of-speech from the weights file to format an identifier, such as 141 | ``omw-en-00001174-n``. For wordnets that use a different identifier 142 | scheme, the *get_synset_id* parameter of :func:`load` can be given a 143 | callable created with :func:`wn.util.synset_id_formatter`. It can also 144 | be given another callable with the same signature as shown below: 145 | 146 | .. code-block:: python 147 | 148 | get_synset_id(*, offset: int, pos: str) -> str 149 | 150 | 151 | When loading pre-computed information content files, it is recommended 152 | to use the ones with smoothing (i.e., ``*-add1.dat`` or 153 | ``*-resnik-add1.dat``) to avoid math domain errors when computing the 154 | information content value. 155 | 156 | .. warning:: 157 | 158 | The weights files are only valid for the version of wordnet for 159 | which they were created. Files created for WordNet 3.0 do not work 160 | for WordNet 3.1 because the offsets used in its identifiers are 161 | different, although the *get_synset_id* parameter of :func:`load` 162 | could be given a function that performs a suitable mapping. Some 163 | `Open Multilingual Wordnet `_ 164 | wordnets use the WordNet 3.0 offsets in their identifiers and can 165 | therefore technically use the weights, but this usage is 166 | discouraged because the distributional properties of text in 167 | another language and the structure of the other wordnet will not be 168 | compatible with that of the English WordNet. For these cases, it is 169 | recommended to compute new weights using :func:`compute`. 170 | 171 | .. autofunction:: load 172 | --------------------------------------------------------------------------------