├── .gitattributes
├── .github
└── workflows
│ ├── python-package.yml
│ └── wheels.yml
├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── doc
├── _static
│ ├── GreynirLogo220.png
│ ├── GreynirLogo400.png
│ ├── GreynirTreeExample.png
│ ├── GreynirTreeExampleSmall.png
│ ├── GreynirTreeExampleSmall2.png
│ ├── LitlaGula.png
│ ├── MideindLogoVert100.png
│ ├── MideindLogoVert300.png
│ ├── MideindLogoVert400.png
│ ├── ReynirLogo216.png
│ ├── ReynirLogo400.png
│ ├── annotation_instructions.pdf
│ ├── custom.css
│ ├── favicon.ico
│ ├── greynir-favicon-32x32.png
│ ├── greynir-logo-large.png
│ └── mideind-horizontal-small.png
├── conf.py
├── copyright.rst
├── index.rst
├── installation.rst
├── nonterminals.rst
├── overview.rst
├── patterns.rst
├── quickstart.rst
├── reference.rst
├── simpletree.rst
└── terminals.rst
├── old
├── build_wheels.sh
├── release.sh
└── wheels.sh
├── setup.py
├── src
└── reynir
│ ├── Greynir.grammar
│ ├── __init__.py
│ ├── _eparser.cpp
│ ├── baseparser.py
│ ├── basics.py
│ ├── bindb.py
│ ├── binparser.py
│ ├── bintokenizer.py
│ ├── cache.py
│ ├── config
│ ├── Abbrev_errors.conf
│ ├── AdjectivePredicates.conf
│ ├── Adjectives.conf
│ ├── GreynirEngine.conf
│ ├── Names.conf
│ ├── NounPredicates.conf
│ ├── Phrases.conf
│ ├── Prefs.conf
│ ├── Prepositions.conf
│ └── Verbs.conf
│ ├── eparser.cpp
│ ├── eparser.h
│ ├── eparser_build.py
│ ├── fastparser.py
│ ├── glock.py
│ ├── grammar.py
│ ├── ifdtagger.py
│ ├── incparser.py
│ ├── lemmatize.py
│ ├── matcher.py
│ ├── nounphrase.py
│ ├── py.typed
│ ├── reducer.py
│ ├── resources
│ ├── ord.auka.csv
│ └── systematic_additions.csv
│ ├── reynir.py
│ ├── settings.py
│ ├── simpletree.py
│ └── verbframe.py
└── test
├── test_cases.py
├── test_matcher.py
├── test_no_multiply_numbers.py
├── test_original.py
├── test_parse.py
├── test_reynir.py
└── test_serializers.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | src/reynir/resources/ordalisti-*.bin filter=lfs diff=lfs merge=lfs -text
2 | src/reynir/resources/ord.compressed filter=lfs diff=lfs merge=lfs -text
3 |
4 | # Set the default line ending behavior to auto
5 | * text=auto
6 |
7 | # Source files should only have LF endings
8 | *.py text eol=lf
9 | *.c text eol=lf
10 | *.h text eol=lf
11 | *.cpp text eol=lf
12 | *.hpp text eol=lf
13 | *.csv text eol=lf
14 | *.grammar text eol=lf
15 | *.yml text eol=lf
16 | *.sh text eol=lf
17 | *.rst text eol=lf
18 | *.md text eol=lf
19 | *.in text eol=lf
20 | *.conf text eol=lf
21 |
22 | # Declare files that will always have CRLF line endings on checkout
23 | *.sln text eol=crlf
24 | *.bat text eol=crlf
25 |
26 | # Denote all files that are truly binary and should not be modified
27 | *.png binary
28 | *.jpg binary
29 | *.bin binary
30 | *.compressed binary
31 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
# CI workflow: installs GreynirEngine in editable mode and runs the pytest
# suite once per interpreter listed in the matrix below.
name: tests

on:
  push:
    branches: ["*"]
  pull_request:
    branches: ["*"]

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        # Versions are quoted so that e.g. "3.10" is not parsed as the float 3.1.
        # "3.13" selects the latest 3.13.x release instead of pinning the
        # outdated 3.13.0-rc.1 pre-release.
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "pypy-3.9", "pypy-3.10"]

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install GreynirEngine
        run: |
          python -m pip install --upgrade pip wheel setuptools pytest
          python -m pip install -e .
      - name: Test with pytest
        run: |
          python -m pytest
      - name: Slack notification
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          author_name: Integration Testing (Python ${{ matrix.python-version }})
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        if: failure()  # Pick up event if the job fails
--------------------------------------------------------------------------------
/.github/workflows/wheels.yml:
--------------------------------------------------------------------------------
# Release workflow: for every pushed tag, builds binary wheels with
# cibuildwheel on each operating system in the matrix and uploads them
# as workflow artifacts.
name: wheels

on:
  push:
    tags:
      - '**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # The macos-12 hosted runner image has been retired by GitHub;
        # macos-latest builds both architectures listed in CIBW_ARCHS_MACOS.
        os: [macos-latest, ubuntu-latest, windows-latest]

    steps:
      # Check out repository using git-lfs
      - uses: actions/checkout@v4
        with:
          lfs: true

      # Python used to run cibuildwheel
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install cibuildwheel
        run: python -m pip install --upgrade pip wheel setuptools cibuildwheel

      - name: Build wheels
        run: python -m cibuildwheel --output-dir wheelhouse
        # Options (https://cibuildwheel.readthedocs.io/en/stable/options/)
        env:
          CIBW_SKIP: 'cp36-* cp37-* cp38-* *pp37-* pp38-* *musllinux*'
          CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi
          CIBW_ARCHS_MACOS: "x86_64 arm64"
          CIBW_ARCHS_WINDOWS: "AMD64"
          CIBW_ARCHS_LINUX: "x86_64"
          CIBW_BUILD_FRONTEND: "build"
          # CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9"

      # upload-artifact v3 is deprecated. v4 artifacts are immutable, so each
      # matrix job must upload under its own unique name.
      - uses: actions/upload-artifact@v4
        with:
          name: wheels-${{ matrix.os }}
          path: ./wheelhouse/*.whl
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | .cache/
4 | .pytest_cache/
5 | .mypy_cache/
6 | .idea/
7 | .vscode/
8 | *.py[cod]
9 | *.o
10 | *.obj
11 | *.exp
12 | *.lib
13 | *.so
14 | *.exe
15 |
16 | # Compressed DAWG trees
17 | *.dawg
18 |
19 | # Pickled DAWG trees
20 | *.pickle
21 |
22 | # Training data files
23 | *.gz
24 | *.zip
25 |
26 | # Experimental Python stuff
27 | test.py
28 | *.new.py
29 | *.old.py
30 |
31 | # Mypy
32 | mypy.ini
33 |
34 | # Doc stuff
35 | doc/Makefile
36 | doc/make.bat
37 | doc/_build/
38 |
39 | # Various resource files
40 | src/reynir/resources/*.csv
41 | src/reynir/resources/*.tsv
42 | src/reynir/resources/*.txt
43 | src/reynir/resources/*.py
44 | !src/reynir/resources/ord.auka.csv
45 | !src/reynir/resources/systematic_additions.csv
46 | src/reynir/resources/extra/
47 |
48 | # Test files
49 | test/test_corpus/handpsd/annotaldLog.txt
50 |
51 | # Scratch work files
52 | src/reynir/_eparser.cpp
53 | *.bin
54 | !ordalisti-*.dawg.bin
55 |
56 | *.sublime-project
57 | *.sublime-workspace
58 | *.code-workspace
59 | *.bak
60 | *.profile
61 | *.log
62 | t
63 | txt
64 | input.txt
65 | nohup.out
66 |
67 | # Distribution / packaging
68 | .Python
69 | env/
70 | bin/
71 | build/
72 | develop-eggs/
73 | dist/
74 | eggs/
75 | lib/
76 | lib64/
77 | parts/
78 | sdist/
79 | var/
80 | .eggs/
81 | *.egg-info/
82 | .installed.cfg
83 | *.egg
84 | log/
85 | console/
86 |
87 | # Virtual environments
88 | venv
89 | p35/
90 | p358/
91 | p359/
92 | p37/
93 | pypy*
94 |
95 | # Installer logs
96 | pip-log.txt
97 | pip-delete-this-directory.txt
98 | deploy_done.py
99 |
100 | # Windows image file caches
101 | Thumbs.db
102 | ehthumbs.db
103 |
104 | # Folder config file
105 | Desktop.ini
106 |
107 | # Recycle Bin used on file shares
108 | $RECYCLE.BIN/
109 |
110 | # Windows Installer files
111 | *.cab
112 | *.msi
113 | *.msm
114 | *.msp
115 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Greynir is *copyright © 2016-2024 by Miðeind ehf.*
3 | The original author of this software is *Vilhjálmur Þorsteinsson*.
4 |
5 | This software is licensed under the MIT License:
6 |
7 | Permission is hereby granted, free of charge, to any person
8 | obtaining a copy of this software and associated documentation
9 | files (the "Software"), to deal in the Software without restriction,
10 | including without limitation the rights to use, copy, modify, merge,
11 | publish, distribute, sublicense, and/or sell copies of the Software,
12 | and to permit persons to whom the Software is furnished to do so,
13 | subject to the following conditions:
14 |
15 | The above copyright notice and this permission notice shall be
16 | included in all copies or substantial portions of the Software.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft src
2 | prune src/reynir/__pycache__
3 | prune src/reynir/.mypy_cache
4 | prune test/test_corpus
5 | include src/reynir/Greynir.grammar
6 | exclude src/reynir/Greynir.*.bin
7 | include src/reynir/eparser.h
8 | exclude src/reynir/_eparser.cpp
9 | include src/reynir/config/*.conf
10 | exclude src/reynir/resources/*.csv
11 | exclude src/reynir/resources/*.txt
12 | exclude src/reynir/resources/.DS_Store
13 | exclude src/reynir/*.o
14 | exclude src/reynir/*.so
15 | exclude src/reynir/.DS_Store
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://opensource.org/licenses/MIT)
2 | [](https://www.python.org/downloads/release/python-390/)
3 | 
4 | 
5 | []()
6 |
7 |
8 |
9 | # GreynirEngine
10 |
11 | **A fast, efficient natural language processing engine for Icelandic**
12 |
13 | ## Overview
14 |
15 | Greynir is a Python 3 (>=3.9) package,
16 | published by [Miðeind ehf.](https://mideind.is), for
17 | **working with Icelandic natural language text**.
18 | Greynir can parse text into **sentence trees**, find **lemmas**,
19 | inflect **noun phrases**, assign **part-of-speech tags** and much more.
20 |
21 | Greynir's sentence trees can *inter alia* be used to extract
22 | information from text, for instance about people, titles, entities, facts,
23 | actions and opinions.
24 |
25 | Full documentation for Greynir is [available here](https://greynir.is/doc/).
26 |
27 | Greynir is the engine of [Greynir.is](https://greynir.is),
28 | a natural-language front end for a database of over 10 million
29 | sentences parsed from Icelandic news articles, and
30 | [Embla](https://embla.is), a voice-driven virtual assistant app
31 | for smart devices such as iOS and Android phones.
32 |
33 | Greynir includes a hand-written
34 | [context-free grammar](https://github.com/mideind/GreynirEngine/blob/master/src/reynir/Greynir.grammar)
35 | for the Icelandic language, consisting of over 7,000 lines of grammatical
36 | productions in [extended Backus-Naur format](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form).
37 | Its fast C++ parser core is able to cope with long and ambiguous sentences,
38 | using an [Earley-type parser](https://en.wikipedia.org/wiki/Earley_parser)
39 | as [enhanced by Scott and Johnstone](https://www.sciencedirect.com/science/article/pii/S0167642309000951).
40 |
41 | Greynir employs the [Tokenizer](https://pypi.org/project/tokenizer/) package,
42 | by the same authors, to tokenize text, and
43 | uses [BinPackage](https://pypi.org/project/islenska/) as its database of
44 | Icelandic vocabulary and morphology.
45 |
46 | ## Examples
47 |
48 | ### Use Greynir to easily inflect noun phrases
49 |
50 | ````python
51 | from reynir import NounPhrase as Nl
52 |
53 | # Create a NounPhrase ('nafnliður') object
54 | karfa = Nl("þrír lúxus-miðar á Star Wars og tveir brimsaltir pokar af poppi")
55 |
56 | # Print the NounPhrase in the correct case for each context
57 | # (þf=þolfall/accusative, þgf=þágufall/dative). Note that
58 | # the NounPhrase class implements __format__(), allowing you
59 | # to use the case as a format specification, for instance in f-strings.
60 |
61 | print(f"Þú keyptir {karfa:þf}.")
62 | print(f"Hér er kvittunin þín fyrir {karfa:þgf}.")
63 | ````
64 |
65 | The program outputs the following text, correctly inflected:
66 |
67 | ````text
68 | Þú keyptir þrjá lúxus-miða á Star Wars og tvo brimsalta poka af poppi.
69 | Hér er kvittunin þín fyrir þremur lúxus-miðum á Star Wars og tveimur brimsöltum pokum af poppi.
70 | ````
71 |
72 | ### Use Greynir to parse a sentence
73 |
74 | ````python
75 | >>> from reynir import Greynir
76 | >>> g = Greynir()
77 | >>> sent = g.parse_single("Ása sá sól.")
78 | >>> print(sent.tree.view)
79 | P # Root
80 | +-S-MAIN # Main sentence
81 | +-IP # Inflected phrase
82 | +-NP-SUBJ # Noun phrase, subject
83 | +-no_et_nf_kvk: 'Ása' # Noun, singular, nominative, feminine
84 | +-VP # Verb phrase containing arguments
85 | +-VP # Verb phrase containing verb
86 | +-so_1_þf_et_p3: 'sá' # Verb, 1 accusative arg, singular, 3rd p
87 | +-NP-OBJ # Noun phrase, object
88 | +-no_et_þf_kvk: 'sól' # Noun, singular, accusative, feminine
89 | +-'.' # Punctuation
90 | >>> sent.tree.nouns
91 | ['Ása', 'sól']
92 | >>> sent.tree.verbs
93 | ['sjá']
94 | >>> sent.tree.flat
95 | 'P S-MAIN IP NP-SUBJ no_et_nf_kvk /NP-SUBJ VP so_1_þf_et_p3
96 | NP-OBJ no_et_þf_kvk /NP-OBJ /VP /IP /S-MAIN p /P'
97 | >>> # The subject noun phrase (S.IP.NP also works)
98 | >>> sent.tree.S.IP.NP_SUBJ.lemmas
99 | ['Ása']
100 | >>> # The verb phrase
101 | >>> sent.tree.S.IP.VP.lemmas
102 | ['sjá', 'sól']
103 | >>> # The object within the verb phrase (S.IP.VP.NP also works)
104 | >>> sent.tree.S.IP.VP.NP_OBJ.lemmas
105 | ['sól']
106 | ````
107 |
108 | ## Prerequisites
109 |
110 | This package runs on CPython 3.9 or newer, and on PyPy 3.9 or newer.
111 |
112 | To find out which version of Python you have, enter:
113 |
114 | ````sh
115 | python --version
116 | ````
117 |
118 | If a binary wheel package isn't available on [PyPI](https://pypi.org)
119 | for your system, you may need to have the `python3-dev` package
120 | (or its Windows equivalent) installed on your
121 | system to set up Greynir successfully. This is
122 | because a source distribution install requires a C++ compiler and linker:
123 |
124 | ````sh
125 | # Debian or Ubuntu
126 | sudo apt-get install python3-dev
127 | ````
128 |
129 | Depending on your system, you may also need to install `libffi-dev`:
130 |
131 | ````sh
132 | # Debian or Ubuntu
133 | sudo apt-get install libffi-dev
134 | ````
135 |
136 | ## Installation
137 |
138 | To install this package, assuming Python 3 is your default Python:
139 |
140 | ````sh
141 | pip install reynir
142 | ````
143 |
144 | If you have **git** installed and want to be able to edit
145 | the source, do the following:
146 |
147 | ````sh
148 | git clone https://github.com/mideind/GreynirEngine
149 | cd GreynirEngine
150 | # [ Activate your virtualenv here if you have one ]
151 | pip install -e .
152 | ````
153 |
154 | The package source code is in `GreynirEngine/src/reynir`.
155 |
156 | ## Tests
157 |
158 | To run the built-in tests, install [pytest](https://docs.pytest.org/en/latest),
159 | `cd` to your `GreynirEngine` subdirectory (and optionally activate your
160 | virtualenv), then run:
161 |
162 | ````sh
163 | python -m pytest
164 | ````
165 |
166 | ## Evaluation
167 |
168 | A parsing test pipeline for different parsing schemas, including the Greynir schema,
169 | has been developed. It is available [here](https://github.com/mideind/ParsingTestPipe).
170 |
171 | ## Documentation
172 |
173 | Please consult [Greynir's documentation](https://greynir.is/doc/) for detailed
174 | [installation instructions](https://greynir.is/doc/installation.html),
175 | a [quickstart guide](https://greynir.is/doc/quickstart.html),
176 | and [reference information](https://greynir.is/doc/reference.html),
177 | as well as important information about
178 | [copyright and licensing](https://greynir.is/doc/copyright.html).
179 |
180 | ## Troubleshooting
181 |
182 | If parsing seems to hang, it is possible that a lock file that GreynirEngine
183 | uses has been left locked. This can happen if a Python process that uses
184 | GreynirEngine is killed abruptly. The solution is to delete the lock file
185 | and try again:
186 |
187 | On Linux and macOS:
188 |
189 | ````sh
190 | rm /tmp/greynir-grammar # May require sudo privileges
191 | ````
192 |
193 | On Windows:
194 |
195 | ````cmd
196 | del %TEMP%\greynir-grammar
197 | ````
198 |
199 | ## Copyright and licensing
200 |
201 | Greynir is Copyright © 2016-2024 by [Miðeind ehf.](https://mideind.is).
202 | The original author of this software is *Vilhjálmur Þorsteinsson*.
203 |
204 |
206 |
207 | This software is licensed under the **MIT License**:
208 |
209 | *Permission is hereby granted, free of charge, to any person*
210 | *obtaining a copy of this software and associated documentation*
211 | *files (the "Software"), to deal in the Software without restriction,*
212 | *including without limitation the rights to use, copy, modify, merge,*
213 | *publish, distribute, sublicense, and/or sell copies of the Software,*
214 | *and to permit persons to whom the Software is furnished to do so,*
215 | *subject to the following conditions:*
216 |
217 | **The above copyright notice and this permission notice shall be**
218 | **included in all copies or substantial portions of the Software.**
219 |
220 | *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,*
221 | *EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF*
222 | *MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*
223 | *IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY*
224 | *CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,*
225 | *TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE*
226 | *SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*
227 |
228 | If you would like to use this software in ways that are incompatible
229 | with the standard MIT license, [contact Miðeind ehf.](mailto:mideind@mideind.is)
230 | to negotiate custom arrangements.
231 |
232 | ----
233 |
234 | GreynirEngine indirectly embeds the [Database of Icelandic Morphology](https://bin.arnastofnun.is)
235 | ([Beygingarlýsing íslensks nútímamáls](https://bin.arnastofnun.is)), abbreviated BÍN.
236 | GreynirEngine does not claim any endorsement by the BÍN authors or copyright holders.
237 |
238 | The BÍN source data are publicly available under the
239 | [CC BY-SA 4.0 license](https://creativecommons.org/licenses/by-sa/4.0/), as further
240 | detailed [here in English](https://bin.arnastofnun.is/DMII/LTdata/conditions/)
241 | and [here in Icelandic](https://bin.arnastofnun.is/gogn/mimisbrunnur/).
242 |
243 | In accordance with the BÍN license terms, credit is hereby given as follows:
244 |
245 | *Beygingarlýsing íslensks nútímamáls. Stofnun Árna Magnússonar í íslenskum fræðum.*
246 | *Höfundur og ritstjóri Kristín Bjarnadóttir.*
247 |
--------------------------------------------------------------------------------
/doc/_static/GreynirLogo220.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirLogo220.png
--------------------------------------------------------------------------------
/doc/_static/GreynirLogo400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirLogo400.png
--------------------------------------------------------------------------------
/doc/_static/GreynirTreeExample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirTreeExample.png
--------------------------------------------------------------------------------
/doc/_static/GreynirTreeExampleSmall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirTreeExampleSmall.png
--------------------------------------------------------------------------------
/doc/_static/GreynirTreeExampleSmall2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirTreeExampleSmall2.png
--------------------------------------------------------------------------------
/doc/_static/LitlaGula.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/LitlaGula.png
--------------------------------------------------------------------------------
/doc/_static/MideindLogoVert100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/MideindLogoVert100.png
--------------------------------------------------------------------------------
/doc/_static/MideindLogoVert300.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/MideindLogoVert300.png
--------------------------------------------------------------------------------
/doc/_static/MideindLogoVert400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/MideindLogoVert400.png
--------------------------------------------------------------------------------
/doc/_static/ReynirLogo216.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/ReynirLogo216.png
--------------------------------------------------------------------------------
/doc/_static/ReynirLogo400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/ReynirLogo400.png
--------------------------------------------------------------------------------
/doc/_static/annotation_instructions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/annotation_instructions.pdf
--------------------------------------------------------------------------------
/doc/_static/custom.css:
--------------------------------------------------------------------------------
/*
    custom.css

    Copyright © 2023 Miðeind ehf.
    See the Greynir GitHub repository at
    https://github.com/mideind/GreynirEngine
    for copyright and licensing information.

    This style sheet overrides parts of the standard Sphinx Alabaster
    style to make it more compliant with the Mideind/Greynir branding.

*/

@import url("https://fonts.googleapis.com/css?family=Lato:300,300i,400,400i,700,700i&display=swap");

/* Base typography: light weight body text */
body {
    font-weight: 300;
    font-size: 16px;
}

strong {
    font-weight: 400;
}

/* Headings */
div.body h1,
div.body h2,
div.body h3,
div.body h4,
div.body h5,
div.body h6 {
    font-family: "Lato", "Garamond", "Georgia", serif;
    font-weight: normal;
}

div.body h1 {
    font-weight: 700;
    font-style: italic;
    font-size: 220%;
    color: #006eff;
    margin-top: 20px;
}

div.body h2 {
    color: #006eff;
    font-size: 160%;
    margin-top: 42px;
}

/* Running text */
div.body p,
div.body dd,
div.body li {
    line-height: 1.5em;
}

/* Code and preformatted text */
pre,
tt,
code {
    font-size: 14px;
    line-height: 1.25em;
}

/* Sidebar */
div.sphinxsidebarwrapper p.blurb {
    margin-top: 5px;
    margin-bottom: 10px;
    font-size: 13px;
}

div.sphinxsidebar h3,
div.sphinxsidebar h4 {
    font-family: "Lato", "Garamond", "Georgia", serif;
    font-weight: 400;
    font-size: 22px;
    color: #006eff;
    margin-top: 15px;
    margin-bottom: 5px;
}

div.sphinxsidebar ul li.toctree-l1 > a {
    font-size: 110%;
    line-height: 1.7;
}

/* Figure captions. The first selector matches the legacy <div class="figure">
   markup; the second matches the HTML5 <figure>/<figcaption> markup, which
   the first selector alone would leave unstyled. */
div.figure p.caption span.caption-text,
figcaption span.caption-text {
    font-style: italic;
}

/* API reference entries */
dl.py.class,
dl.py.method,
dl.py.attribute {
    margin-top: 1.5em;
    margin-bottom: 0.8em;
}

dl.py.method dt,
dl.py.attribute dt {
    margin-bottom: 0.8em;
    /* Hanging indent for signatures that wrap onto several lines */
    padding-left: 2em;
    text-indent: -2em;
}

tt.descname,
tt.descclassname,
code.descname,
code.descclassname {
    font-size: 1.05em;
    color: #006eff;
}

dl.field-list > dt {
    font-weight: normal;
    color: #888888;
}
--------------------------------------------------------------------------------
/doc/_static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/favicon.ico
--------------------------------------------------------------------------------
/doc/_static/greynir-favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/greynir-favicon-32x32.png
--------------------------------------------------------------------------------
/doc/_static/greynir-logo-large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/greynir-logo-large.png
--------------------------------------------------------------------------------
/doc/_static/mideind-horizontal-small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/mideind-horizontal-small.png
--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Greynir documentation build configuration file, created by
# sphinx-quickstart on Sun Apr  8 01:20:08 2018.
#
# Sphinx executes this file with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#

from typing import TYPE_CHECKING, Mapping, Any

import os
from datetime import datetime

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = []

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# General information about the project.
year = datetime.now().year
project = "Greynir"
copyright = "{0} Miðeind ehf".format(year)
author = "Miðeind ehf."

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.

# Get version string from "../src/reynir/version.py"
basepath, _ = os.path.split(os.path.realpath(__file__))
version_path = os.path.join(basepath, "..", "src", "reynir", "version.py")

if TYPE_CHECKING:
    # Static type checkers do not follow exec(), so give them a placeholder
    __version__ = ""
elif os.path.isfile(version_path):
    # Executing version.py is expected to define __version__ in this namespace.
    # The file is opened as UTF-8 and closed deterministically.
    with open(version_path, encoding="utf-8") as version_file:
        exec(version_file.read())
else:
    # NOTE(review): version.py does not appear in the repository listing, so
    # fall back to the metadata of the installed 'reynir' distribution.
    # Confirm that this is the intended source of the version string.
    import importlib.metadata

    __version__ = importlib.metadata.version("reynir")

# The full version, including alpha/beta/rc tags.
release = __version__  # pylint: disable=undefined-variable
# The short X.Y version.
version = ".".join(__version__.split(".")[:2])  # pylint: disable=undefined-variable

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Recent Sphinx versions warn about 'language = None' and fall back to "en",
# so the language code is stated explicitly.
language = "en"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = "alabaster"

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
html_sidebars = {
    "**": ["about.html", "navigation.html", "relations.html", "searchbox.html"]
}
html_theme_options: Mapping[str, Any] = {
    "logo": "GreynirLogo400.png",
    "logo_name": False,
    "logo_text_align": "center",
    "description": "Natural Language Processing for Icelandic",
    "github_user": "mideind",
    "github_repo": "GreynirEngine",
    "github_button": True,
    "sidebar_collapse": False,
    "fixed_sidebar": True,
    "font_family": (
        "Lato, Georgia, 'goudy old style', 'minion pro', "
        "'bell mt', 'Hiragino Mincho Pro', serif"
    ),
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

# Set the favicon
html_favicon = "_static/greynir-favicon-32x32.png"

# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = "Greynirdoc"


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    "papersize": "a4paper",
    # The font size ('10pt', '11pt' or '12pt').
    "pointsize": "10pt",
    # Additional stuff for the LaTeX preamble.
    "preamble": "",
    # Latex figure (float) alignment
    "figure_align": "htbp",
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, "Greynir.tex", "Greynir Documentation", "Miðeind ehf.", "manual")
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "greynir", "Greynir Documentation", [author], 1)]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (
        master_doc,
        "Greynir",
        "Greynir Documentation",
        author,
        "Greynir",
        "Natural language processing for Icelandic",
        "NLP",
    )
]
--------------------------------------------------------------------------------
/doc/copyright.rst:
--------------------------------------------------------------------------------
1 | .. _copyright:
2 |
3 | Copyright and licensing
4 | =======================
5 |
6 | .. figure:: _static/MideindLogoVert100.png
7 | :align: left
8 | :alt: Miðeind ehf.
9 |
10 | GreynirEngine is *copyright © 2023 Miðeind ehf.*, Reykjavík, Iceland.
11 |
12 | The project's original author is *Vilhjálmur Þorsteinsson*.
13 |
14 | This software is licensed under the MIT License:
15 |
16 | *Permission is hereby granted, free of charge, to any person*
17 | *obtaining a copy of this software and associated documentation*
18 | *files (the "Software"), to deal in the Software without restriction,*
19 | *including without limitation the rights to use, copy, modify, merge,*
20 | *publish, distribute, sublicense, and/or sell copies of the Software,*
21 | *and to permit persons to whom the Software is furnished to do so,*
22 | *subject to the following conditions:*
23 |
24 | *The above copyright notice and this permission notice shall be*
25 | *included in all copies or substantial portions of the Software.*
26 |
27 | *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,*
28 | *EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF*
29 | *MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*
30 | *IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY*
31 | *CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,*
32 | *TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE*
33 | *SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.*
34 |
35 | .. note::
36 |
37 | GreynirEngine indirectly embeds the `Database of Modern Icelandic Inflection `_
38 | (`Beygingarlýsing íslensks nútímamáls `_), abbreviated BÍN.
39 |
40 | The BÍN source data are publicly available under the CC-BY-4.0 license, as further
41 | detailed `here in English `_
42 | and `here in Icelandic `_.
43 |
44 | In accordance with the BÍN license terms, credit is hereby given as follows:
45 |
46 | *Beygingarlýsing íslensks nútímamáls. Stofnun Árna Magnússonar í íslenskum fræðum. Höfundur og ritstjóri Kristín Bjarnadóttir.*
47 |
48 | GreynirEngine accesses BÍN data through another package from the same authors
49 | called BinPackage, and further information is available in that package's
50 | `GitHub repository `_.
51 |
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | .. Greynir documentation master file, created by
2 | sphinx-quickstart on Sun Apr 8 01:20:08 2018.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 |
7 | Welcome to Greynir
8 | ==================
9 |
10 | *Til að gagnast sem flestum er skjölun Greynis á ensku. - In order to serve
11 | the widest possible audience, Greynir's documentation is in English.*
12 |
13 | Greynir is a Python >= 3.9 package for **working with Icelandic text**,
14 | including parsing it into **sentence trees**, finding **lemmas**,
15 | inflecting **noun phrases**, assigning **part-of-speech tags** and much more.
16 |
17 | Greynir's sentence trees can *inter alia* be used to extract information
18 | from text, for instance about people, titles, entities, facts, actions
19 | and opinions.
20 |
21 | .. figure:: _static/GreynirTreeExampleSmall2.png
22 | :align: center
23 | :alt: An example of a parse tree
24 |
25 | Greynir is the engine of `Embla `_ and `Greynir.is `_.
26 |
27 | Greynir has been used to parse text from Icelandic news websites since 2015,
28 | processing over 10 million sentences in over 515,000 articles. Its
29 | optimized C++ parsing core is fast and efficient enough to parse
30 | real-world text according to a
31 | `context-free grammar for the Icelandic
32 | language `_
33 | with over 22,000 productions.
34 |
35 | To get acquainted with Greynir, we recommend that you start with
36 | the :ref:`overview`, proceed with the :ref:`installation` instructions,
37 | and then look at the :ref:`quickstart`. For further reference, consult
38 | the :ref:`reference` section.
39 |
40 | This documentation also contains :ref:`important information about copyright
41 | and licensing <copyright>`.
42 |
43 | Batteries included
44 | ------------------
45 |
46 | To start using Greynir with Python, you (usually) need
47 | :ref:`only one command <installation>`::
48 |
49 | $ pip install reynir
50 |
51 | **No database to set up, no further data to download.**
52 |
53 | Greynir indirectly embeds the `Database of Modern Icelandic
54 | Inflection `_
55 | (`Beygingarlýsing íslensks nútímamáls `_),
56 | with over 6 million entries, in compressed form.
57 | By looking up word forms in this database and applying context-free
58 | grammar rules (productions) and scoring heuristics, Greynir is able to
59 | infer what the most likely lemmas are, how they are inflected in the
60 | parsed text, and where they fit in the overall sentence structure.
61 |
62 | Greynir is thoroughly documented, and its source code is of course
63 | `available on GitHub <https://github.com/mideind/GreynirEngine>`_.
64 |
65 | Enabling your application
66 | -------------------------
67 |
68 | Greynir can serve as an enabling component of applications such as:
69 |
70 | * Natural language query systems
71 | * Bots and conversational systems
72 | * Information extraction tools
73 | * Intelligent search tools
74 | * Grammatical pattern analyzers
75 | * Text similarity
76 | * Author identification
77 | * Sentiment analysis
78 | * Content summarization
79 | * Content category labeling
80 | * Part-of-speech (POS) taggers and lemmatizers
81 | * Generation of training corpora for machine learning
82 |
83 | About Greynir
84 | -------------
85 |
86 | Greynir is a project and product of Miðeind ehf. of Reykjavík, Iceland.
87 | It is a free open source software project (:ref:`MIT license <copyright>`),
88 | started in mid-2015 by its original author, Vilhjálmur Þorsteinsson.
89 | Its aim is to produce an **industrial-strength Natural Language**
90 | **Processing toolset for Icelandic**, with the hope of supporting the
91 | language on the digital front in times of rapid advances in language
92 | technology; changes that may leave low-resource languages at a
93 | disadvantage unless explicit action is taken to strengthen their position.
94 |
95 | Greynir and associated projects received grants from the Icelandic
96 | Language Technology Fund *(Máltæknisjóður)* in 2016 and 2017, and have
97 | been partially supported by the Icelandic Government's Language
98 | Technology Programme. The LT Programme is managed by
99 | `Almannarómur `_ and funded by the Ministry
100 | of Education, Science and Culture.
101 |
102 |
103 | .. toctree::
104 | :maxdepth: 1
105 | :hidden:
106 |
107 | overview
108 | installation
109 | quickstart
110 | reference
111 | patterns
112 | nonterminals
113 | terminals
114 | copyright
115 |
116 |
--------------------------------------------------------------------------------
/doc/installation.rst:
--------------------------------------------------------------------------------
1 | .. _installation:
2 |
3 | Installation
4 | ============
5 |
6 | Prerequisites
7 | -------------
8 |
9 | Greynir runs on **CPython 3.9** or newer, and on **PyPy 3.9**
10 | or newer (more info on PyPy `here `_).
11 |
12 | On GNU/Linux and similar systems, you may need to have ``python3-dev``
13 | installed on your system:
14 |
15 | .. code-block:: bash
16 |
17 | # Debian or Ubuntu:
18 | $ sudo apt-get install python3-dev
19 |
20 | Depending on your system, you may also need to install ``libffi-dev``:
21 |
22 | .. code-block:: bash
23 |
24 | # Debian or Ubuntu:
25 | $ sudo apt-get install libffi-dev
26 |
27 | On Windows, you may need the latest
28 | `Visual Studio Build Tools `_,
29 | specifically the Visual C++ build tools, installed on your PC along
30 | with the Windows 10 SDK.
31 |
32 |
33 | Install with pip
34 | ----------------
35 |
36 | To install Greynir:
37 |
38 | .. code-block:: bash
39 |
40 | $ pip install reynir
41 |
42 | ...or if you have both Python2 and Python3 available on your system:
43 |
44 | .. code-block:: bash
45 |
46 | $ pip3 install reynir
47 |
48 | ...or if you want to be able to edit Greynir's source code in-place,
49 | install ``git`` and do the following (note the final dot in the last line):
50 |
51 | .. code-block:: bash
52 |
53 | $ mkdir ~/github
54 | $ cd ~/github
55 | $ git clone https://github.com/mideind/GreynirEngine
56 | $ cd GreynirEngine
57 | $ git pull
58 | $ pip install -e .
59 |
60 | On most common Linux x86_64/amd64 systems, ``pip`` will download and
61 | install a binary wheel. On other systems, a source distribution will be
62 | downloaded and compiled to binary. This requires a standard, Python-supported
63 | C/C++ compiler to be present on the system.
64 |
65 | Greynir's binary wheels are in the ``manylinux2010`` format (or newer).
66 | This means that you will need version 19.0 or newer of ``pip`` to be able
67 | to install a Greynir wheel. Versions of Python from 3.7 onwards include a
68 | new-enough ``pip``.
69 |
70 | Pull requests are welcome in the project's
71 | `GitHub repository <https://github.com/mideind/GreynirEngine>`_.
72 |
73 |
74 | Install into a virtualenv
75 | -------------------------
76 |
77 | In many cases, you will want to maintain a separate Python environment for
78 | your project that uses Greynir. For this, you can use *virtualenv*
79 | (if you haven't already, install it with ``pip install virtualenv``):
80 |
81 | .. code-block:: bash
82 |
83 | $ virtualenv -p python3 venv
84 |
85 | # Enter the virtual environment
86 | $ source venv/bin/activate
87 |
88 | # Install Greynir into it
89 | $ pip install reynir
90 |
91 | $ python
92 | [ Use Python with Greynir ]
93 |
94 | # Leave the virtual environment
95 | $ deactivate
96 |
97 | On Windows:
98 |
99 | .. code-block:: batch
100 |
101 | C:\MyProject> virtualenv venv
102 |
103 | REM Enter the virtual environment
104 | C:\MyProject> venv\Scripts\activate
105 |
106 | REM Install Greynir into it
107 | (venv) C:\MyProject> pip install reynir
108 |
109 | (venv) C:\MyProject> python
110 | REM [ Use Python with Greynir ]
111 |
112 | REM Leave the virtual environment
113 | (venv) C:\MyProject> deactivate
114 |
115 | More information about *virtualenv* is `available
116 | here `_.
117 |
--------------------------------------------------------------------------------
/doc/nonterminals.rst:
--------------------------------------------------------------------------------
1 | .. _nonterminals:
2 |
3 | Nonterminals
4 | ============
5 |
6 | This section lists the nonterminals that can occur within simplified
7 | sentence trees, i.e. instances of the :py:class:`SimpleTree` class.
8 | The nonterminal name of a tree node can be read from the
9 | :py:attr:`SimpleTree.tag` property.
10 |
11 | Sentences and paragraphs
12 | ------------------------
13 |
14 | *Setningar, málsgreinar og efnisgreinar*
15 |
16 | +--------------+----------------------------------------------------------+
17 | | S0 | Root of tree |
18 | +--------------+----------------------------------------------------------+
19 | | S-MAIN | Main clause (aðalsetning) |
20 | +--------------+----------------------------------------------------------+
21 | | S-HEADING | Sentence-heading (fyrirsögn) |
22 | +--------------+----------------------------------------------------------+
23 | | S-PREFIX | Prefix clause (*Með öðrum orðum:* Páll sá kött) |
24 | +--------------+----------------------------------------------------------+
25 | | S-QUE | Question clause (spurnarsetning) |
26 | | | („*Hvaða stjaka viltu*?“) |
27 | +--------------+----------------------------------------------------------+
28 | | CP-THT | Complement clause (skýringarsetning) |
29 | | | (Páll veit *að kötturinn kemur heim*) |
30 | +--------------+----------------------------------------------------------+
31 | | CP-QUE | Question subclause (spurnaraukasetning) |
32 | | | (Páll spurði *hvaða stjaka hún vildi*) |
33 | +--------------+----------------------------------------------------------+
34 | | CP-REL | Relative clause (tilvísunarsetning) |
35 | | | (Páll, *sem kom inn*, klappaði kettinum) |
36 | +--------------+----------------------------------------------------------+
37 | | CP-ADV-TEMP | Adverbial temporal phrase (tíðarsetning) |
38 | | | (Páll fór út *á meðan kötturinn mjálmaði*) |
39 | +--------------+----------------------------------------------------------+
40 | | CP-ADV-PURP | Adverbial purpose phrase (tilgangssetning) |
41 | | | (Fuglinn flaug *til þess að ná sér í mat*) |
42 | +--------------+----------------------------------------------------------+
43 | | CP-ADV-ACK | Adverbial acknowledgement phrase (viðurkenningarsetning) |
44 | | | (Páll fór út, *þó að hann væri þreyttur*) |
45 | +--------------+----------------------------------------------------------+
46 | | CP-ADV-CONS | Adverbial consequence phrase (afleiðingarsetning) |
47 | | | (Páll fór út, *þannig að hann er þreyttur*) |
48 | +--------------+----------------------------------------------------------+
49 | | CP-ADV-CAUSE | Adverbial causal phrase (orsakarsetning) |
50 | | | (Páll fór út, *þar sem hann er þreyttur*) |
51 | +--------------+----------------------------------------------------------+
52 | | CP-ADV-COND | Adverbial conditional phrase (skilyrðissetning) |
53 | | | (Páll færi út, *ef hann gæti*) |
54 | +--------------+----------------------------------------------------------+
55 | | CP-ADV-CMP | Adverbial comparative phrase (samanburðarsetning) |
56 | +--------------+----------------------------------------------------------+
57 | | CP-QUOTE | Direct quote (bein tilvitnun) |
58 | | | („*Þetta er fínt*,“ sagði Páll) |
59 | +--------------+----------------------------------------------------------+
60 |
61 |
62 | Inflectional phrases
63 | --------------------
64 |
65 | *Beygingarliðir*
66 |
67 | +------------+---------------------------------------------------+
68 | | IP | Inflectional phrase (beygingarliður) |
69 | +------------+---------------------------------------------------+
70 | | IP-INF | Infinitival inflectional phrase |
71 | +------------+---------------------------------------------------+
72 |
73 |
74 | Noun phrases
75 | ------------
76 |
77 | *Nafnliðir*
78 |
79 | +------------+---------------------------------------------------+
80 | | NP | Noun phrase |
81 | +------------+---------------------------------------------------+
82 | | NP-SUBJ | Noun phrase - subject (*Páll* sá sólina) |
83 | +------------+---------------------------------------------------+
84 | | NP-OBJ | Noun phrase - direct object (Páll sá *sólina*) |
85 | +------------+---------------------------------------------------+
86 | | NP-IOBJ | Noun phrase - indirect object |
87 | | | (Páll sýndi *barninu* bókina) |
88 | +------------+---------------------------------------------------+
89 | | NP-PRD | Noun phrase - predicate (Páll er *formaður*) |
90 | +------------+---------------------------------------------------+
91 | | NP-ADP | Noun phrase - adjectival object (líkur *Páli*) |
92 | +------------+---------------------------------------------------+
93 | | NP-POSS | Noun phrase - possessive (köttur *Páls*) |
94 | +------------+---------------------------------------------------+
95 | | NP-ADDR | Noun phrase - address (*Fiskislóð 31*) |
96 | +------------+---------------------------------------------------+
97 | | NP-TITLE | Noun phrase - title (Páll Jónsson *ritari*) |
98 | +------------+---------------------------------------------------+
99 | | NP-COMPANY | Noun phrase - company (*Samherji hf.*) |
100 | +------------+---------------------------------------------------+
101 | | NP-MEASURE | Noun phrase - quantity |
102 | +------------+---------------------------------------------------+
103 | | NP-AGE | Noun phrase - age |
104 | +------------+---------------------------------------------------+
105 |
106 |
107 | Adjective phrases
108 | -----------------
109 |
110 | *Lýsingarliðir*
111 |
112 | +------------+---------------------------------------------------+
113 | | ADJP | Adjective phrase (Páll er *góður og gegn* maður) |
114 | +------------+---------------------------------------------------+
115 |
116 | Verb phrases
117 | ------------
118 |
119 | *Sagnliðir*
120 |
121 | +------------+---------------------------------------------------+
122 | | VP | Verb phrase |
123 | +------------+---------------------------------------------------+
124 | | VP-AUX | Auxiliary verb phrase (hjálparsögn) |
125 | | | (Páll *hefur* klappað kettinum) |
126 | +------------+---------------------------------------------------+
127 |
128 | Prepositional phrases
129 | ---------------------
130 |
131 | *Forsetningarliðir*
132 |
133 | +------------+---------------------------------------------------+
134 | | PP | Prepositional phrase |
135 | +------------+---------------------------------------------------+
136 |
137 | Adverbial phrases
138 | -----------------
139 |
140 | *Atviksliðir*
141 |
142 | +--------------------+-------------------------------------------+
143 | | ADVP | Adverbial phrase |
144 | +--------------------+-------------------------------------------+
145 | | ADVP-DIR | Directional adverbial phrase |
146 | +--------------------+-------------------------------------------+
147 | | ADVP-DATE-ABS | Absolute date phrase |
148 | +--------------------+-------------------------------------------+
149 | | ADVP-DATE-REL | Relative date phrase |
150 | +--------------------+-------------------------------------------+
151 | | ADVP-TIMESTAMP-ABS | Absolute timestamp |
152 | +--------------------+-------------------------------------------+
153 | | ADVP-TIMESTAMP-REL | Relative timestamp |
154 | +--------------------+-------------------------------------------+
155 | | ADVP-TMP-SET | Temporal frequency phrase |
156 | +--------------------+-------------------------------------------+
157 | | ADVP-DUR-ABS | Absolute duration |
158 | +--------------------+-------------------------------------------+
159 | | ADVP-DUR-REL | Relative duration |
160 | +--------------------+-------------------------------------------+
161 | | ADVP-DUR-TIME | Time period phrase |
162 | +--------------------+-------------------------------------------+
163 |
164 |
165 | Other phrases
166 | -------------
167 |
168 | *Aðrir liðir*
169 |
170 | +--------+---------------------------------------------------+
171 | | P | Preposition |
172 | +--------+---------------------------------------------------+
173 | | TO | Infinitive particle |
174 | +--------+---------------------------------------------------+
175 | | C | Conjunction |
176 | +--------+---------------------------------------------------+
177 |
--------------------------------------------------------------------------------
/doc/overview.rst:
--------------------------------------------------------------------------------
1 | .. _overview:
2 |
3 | Overview
4 | ========
5 |
6 | **Greynir** parses sentences of Icelandic text into **parse trees**.
7 | A parse tree recursively describes the grammatical structure
8 | of the sentence, including its noun phrases, verb phrases,
9 | prepositional phrases, etc.
10 |
11 | The individual tokens (words, numbers, punctuation, etc.) of the sentence
12 | correspond to leaves in the parse tree.
13 |
14 | .. figure:: _static/LitlaGula.png
15 | :align: center
16 | :alt: A parse tree
17 |
18 | *The parse tree for the sentence "Litla gula hænan fann fræ".*
19 |
20 | By examining and processing the parse tree, information and meaning
21 | can be extracted from the sentence.
22 |
23 | Example
24 | -------
25 |
26 | Here is a short example of what can be done with Greynir::
27 |
28 | >>> from reynir import Greynir
29 | >>> g = Greynir()
30 | >>> sent = g.parse_single("Ása sá sól.")
31 | >>> print(sent.tree.view)
32 | S0 # Root
33 | +-S-MAIN # Main sentence
34 | +-IP # Inflected phrase
35 | +-NP-SUBJ # Noun phrase, subject
36 | +-no_et_nf_kvk: 'Ása' # Noun, singular, nominative, feminine
37 | +-VP # Verb phrase containing arguments
38 | +-VP # Verb phrase containing verb
39 | +-so_1_þf_et_p3: 'sá' # Verb, 1 accusative arg, singular, 3rd p
40 | +-NP-OBJ # Noun phrase, object
41 | +-no_et_þf_kvk: 'sól' # Noun, singular, accusative, feminine
42 | +-'.' # Punctuation
43 | >>> sent.tree.nouns
44 | ['Ása', 'sól']
45 | >>> sent.tree.verbs
46 | ['sjá']
47 | >>> # Show the subject noun phrase
48 | >>> sent.tree.S.IP.NP_SUBJ.lemmas
49 | ['Ása']
50 | >>> # Show the verb phrase
51 | >>> sent.tree.S.IP.VP.lemmas
52 | ['sjá', 'sól']
53 | >>> # Show the object of the verb
54 | >>> sent.tree.S.IP.VP.NP_OBJ.lemmas
55 | ['sól']
56 |
57 | Here, ``S`` stands for sentence *(málsgrein)*, ``IP`` for inflectional
58 | phrase *(beygingarliður)*, ``VP`` is a verb phrase *(sagnliður)*,
59 | ``NP_SUBJ`` is a subject noun phrase *(frumlag)* and
60 | ``NP_OBJ`` is an object noun phrase *(andlag)*.
61 | Nonterminal names are listed in the :ref:`nonterminals` section.
62 |
63 | What Greynir does
64 | -----------------
65 |
66 | Greynir starts by **tokenizing** your text, i.e. dividing it up into individual words,
67 | numbers, punctuation and other tokens. For this, it uses the separate
68 | `Tokenizer `_ package, by the
69 | same authors, which is automatically installed with Greynir.
70 |
71 | After tokenization, Greynir proceeds to **parse** the text according to a
72 | `context-free grammar `_
73 | for the modern Icelandic language. This grammar contains rules describing
74 | how sentences and the various subparts thereof can be validly constructed.
75 |
76 | Almost all sentences are **ambiguous**. This means that there are multiple
77 | parse trees that can validly describe the sentence according to the grammar
78 | rules. Greynir thus has to choose a single best tree from the forest of
79 | possible trees. It does this with a scoring heuristic which assigns higher
80 | scores to common word forms and grammatical constructs, and lower scores to
81 | rare word forms and uncommon constructs. The parse tree with the highest
82 | overall score wins and is returned from the :py:meth:`Greynir.parse_single()`
83 | function.
84 |
85 | Once the best parse tree has been found, it is available for various kinds
86 | of **queries**. You can access word lemmas, extract noun and verb phrases
87 | as shown above, look for patterns via wildcard matching, and much more.
88 | This is described in detail in the :ref:`reference`.
89 |
90 |
--------------------------------------------------------------------------------
/doc/patterns.rst:
--------------------------------------------------------------------------------
1 | .. _patterns:
2 |
3 | Patterns
4 | ========
5 |
6 | This section describes grammatical matching patterns that can be used with the
7 | :py:meth:`SimpleTree.match()`, :py:meth:`SimpleTree.first_match()`,
8 | :py:meth:`SimpleTree.all_matches()` and :py:meth:`SimpleTree.top_matches()`
9 | methods.
10 |
11 | Overview
12 | --------
13 |
14 | The above-mentioned methods can be used to find trees and subtrees that match
15 | a specific grammatical pattern, within a sentence. The pattern can include
16 | conditions that apply to the root of each subtree as well as its children,
17 | direct or indirect.
18 |
19 | The patterns are given as strings, with pattern tokens separated by whitespace.
20 | :ref:`examples` are given below.
21 |
22 | See the documentation of each method for a further explanation of how the
23 | given pattern is matched in each case, and how results are returned.
24 |
25 | Simple matches
26 | --------------
27 |
28 | * A ``"literal"`` within *double quotes* matches a subtree that covers exactly
29 | the given literal text, although using a case-neutral comparison.
30 | ``"Icelandic"`` thus matches ``icelandic`` and ``ICELANDIC``.
31 | The literal may have multiple words, separated by spaces:
32 | ``"borgarstjóri reykjavíkur"`` matches a subtree that covers these two
33 | word forms. The matched subtree can be a nonterminal or a terminal node.
34 |
35 | * A ``'literal'`` within *single quotes* matches a subtree that covers exactly
36 | the given word lemma(s), using a case-neutral comparison.
37 | ``'hestur'`` thus matches ``hests`` and ``Hestinum``.
38 | The literal may have multiple words, separated by spaces:
39 | ``'borgarstjóri reykjavík'`` matches a subtree that covers these
40 | two lemmas. (``'borgarstjóri reykjavíkur'`` would never match anything
41 | as ``reykjavíkur`` is not the lemma of any word form.) The matched subtree
42 | can be a nonterminal or a terminal node.
43 |
44 | * A ``@"literal"`` within *double quotes* and *prefixed with the @ symbol* matches
45 | a *terminal node* that corresponds to a token having
46 | the given literal text, although using a case-neutral comparison.
47 | ``@"Icelandic"`` thus matches ``icelandic`` and ``ICELANDIC``.
48 |
49 | * A ``@'literal'`` within *single quotes* and *prefixed with the @ symbol* matches
50 | a *terminal node* that corresponds to a token having the given word lemma,
51 | using a case-neutral comparison. ``@'hestur'`` thus matches ``hests``
52 | and ``Hestinum``.
53 |
54 | * A ``NONTERMINAL`` identifier in upper case matches nodes associated with
55 | that nonterminal, as well as subcategories thereof. ``NP`` thus matches
56 | ``NP`` as well as ``NP-OBJ`` and ``NP-SUBJ``. ``NP-OBJ`` only matches
57 | ``NP-OBJ`` and subcategories thereof.
58 |
59 | * A ``terminal`` identifier in lower case matches nodes associated with
60 | the specified category of terminal, and having at least the variants given,
61 | if any. ``no`` thus matches all noun terminals, while ``no_nf_et``
62 | only matches noun terminals in nominative case, singular (but any
63 | gender, since a gender variant is not specified). ``p`` matches a
64 | punctuation terminal.
65 |
66 | Wildcard match
67 | --------------
68 |
69 | * A dot ``.`` matches any single tree node, which can be a terminal or nonterminal.
70 |
71 | OR match
72 | --------
73 |
74 | * ``(Any1 | Any2 | ...)`` matches if anything between the parentheses matches.
75 | The options are separated by vertical bars ``|``.
76 |
77 | Sequence matches
78 | ----------------
79 |
80 | * ``Any1 Any2 Any3`` matches the given sequence of matches if each
81 | element matches in exactly the given order. The match must be exhaustive,
82 | i.e. no child nodes may be left unmatched at the end of the list.
83 |
84 | * ``Any+`` matches one or more sequential instances of the given ``Any`` match.
85 |
86 | * ``Any*`` matches zero or more sequential instances of the
87 | given ``Any`` match.
88 |
89 | * ``Any?`` matches zero or one instances of the given ``Any`` match.
90 |
91 | * ``.*`` thus matches any number of any nodes and is an often-used construct.
92 |
93 | * ``[ Any1 Any2 ]`` matches any node sequence that starts with the two given
94 | matches. It does not matter whether the sequence contains more nodes.
95 |
96 | * ``[ Any1 Any2 $ ]`` matches any node sequence where ``Any1``
97 | and ``Any2`` match and there are no further nodes in the sequence.
98 | The ``$`` sign is an end-of-sequence marker.
99 |
100 | * ``[ Any1 .* Any2 $ ]`` matches only sequences that start with ``Any1`` and
101 | end with ``Any2``.
102 |
103 | Hierarchical matches
104 | --------------------
105 |
106 | * ``Any1 > { Any2 Any3 ... }`` matches if ``Any1`` matches and has *immediate*
107 | (direct) children that include ``Any2``, ``Any3`` *and* other given arguments
108 | (irrespective of order). This is a *set-like* operator.
109 |
110 | * ``Any1 >> { Any2 Any3 ... }`` matches if ``Any1`` matches and has children
111 | *at any sublevel*, that include ``Any2``, ``Any3`` *and* other given arguments
112 | (irrespective of order). However, subtrees of ``IP`` nonterminals are skipped,
113 | so nested inflectional phrases are excluded from the search.
114 | This is a *set-like* operator.
115 |
116 | * ``Any1 >>> { Any2 Any3 ... }`` matches if ``Any1`` matches and has children
117 | *at any sublevel*, that include ``Any2``, ``Any3`` *and* other given arguments
118 | (irrespective of order). Unlike the ``>>`` operator, subtrees of ``IP`` are
119 | included in the search. This is a *set-like* operator.
120 |
121 | * ``Any1 > [ Any2 Any3 ... ]`` matches if ``Any1`` matches and has immediate
122 | children that include ``Any2``, ``Any3`` *and* other given arguments
123 | *in the order specified*. This is a *list-like* operator.
124 |
125 | .. _examples:
126 |
127 | Examples
128 | --------
129 |
130 | This pattern will match the root subtree of any sentence that has a verb phrase
131 | that refers to a person as an argument::
132 |
133 | "S >> { VP >> { NP-OBJ >> person }}"
134 |
135 | This pattern will match any sentence that has a verb phrase that refers to
136 | a male person as an argument::
137 |
138 | "S >> { VP >> { NP-OBJ >> person_kk }}"
139 |
140 | Here is a short program using some of the matching features::
141 |
142 | from reynir import Greynir
143 | g = Greynir()
144 | my_text = ("Reynt er að efla áhuga ungs fólks á borgarstjórnarmálum "
145 | "með framboðsfundum og skuggakosningum en þótt kjörstaðirnir "
146 | "í þeim séu færðir inn í framhaldsskólana er þátttakan lítil.")
147 | s = g.parse_single(my_text)
148 | print("Parse tree:")
149 | print(s.tree.view)
150 | print("\nAll subjects:\n")
151 | for d in s.tree.descendants:
152 | if d.match_tag("NP-SUBJ"):
153 | print(d.text)
154 | print("\nAll masculine noun and pronoun phrases:\n")
155 | for m in s.tree.all_matches("NP > { (no_kk | pfn_kk) } "):
156 | print(m.canonical_np)
157 |
158 | Output:
159 |
160 | .. code-block:: none
161 |
162 | Parse tree:
163 | S0
164 | +-S-MAIN
165 | +-IP
166 | +-VP
167 | +-VP
168 | +-so_sagnb: 'Reynt'
169 | +-VP
170 | +-so_et_p3: 'er'
171 | +-IP-INF
172 | +-TO
173 | +-nhm: 'að'
174 | +-VP
175 | +-VP
176 | +-so_1_þf_nh: 'efla'
177 | +-NP-OBJ
178 | +-no_et_þf_kk: 'áhuga'
179 | +-NP-POSS
180 | +-lo_ef_et_hk: 'ungs'
181 | +-no_et_ef_hk: 'fólks'
182 | +-PP
183 | +-P
184 | +-fs_þgf: 'á'
185 | +-NP
186 | +-no_ft_þgf_hk: 'borgarstjórnarmálum'
187 | +-PP
188 | +-P
189 | +-fs_þgf: 'með'
190 | +-NP
191 | +-no_ft_þgf_kk: 'framboðsfundum'
192 | +-C
193 | +-st: 'og'
194 | +-no_ft_þgf_kvk: 'skuggakosningum'
195 | +-C
196 | +-st: 'en'
197 | +-S-MAIN
198 | +-CP-ADV-ACK
199 | +-C
200 | +-st: 'þótt'
201 | +-IP
202 | +-NP-SUBJ
203 | +-no_ft_nf_kk: 'kjörstaðirnir'
204 | +-PP
205 | +-P
206 | +-fs_þgf: 'í'
207 | +-NP
208 | +-pfn_kvk_ft_þgf: 'þeim'
209 | +-VP
210 | +-VP
211 | +-so_ft_p3: 'séu'
212 | +-NP-PRD
213 | +-NP-PRD
214 | +-VP
215 | +-so_lhþt_sb_nf_ft_kk: 'færðir'
216 | +-PP
217 | +-ADVP-DIR
218 | +-ao: 'inn'
219 | +-P
220 | +-fs_þf: 'í'
221 | +-NP
222 | +-no_ft_þf_kk: 'framhaldsskólana'
223 | +-IP
224 | +-VP
225 | +-VP
226 | +-so_et_p3: 'er'
227 | +-NP-SUBJ
228 | +-no_et_nf_kvk: 'þátttakan'
229 | +-NP-PRD
230 | +-lo_sb_nf_et_kvk: 'lítil'
231 | +-'.'
232 |
233 | All subjects:
234 |
235 | kjörstaðirnir í þeim
236 | þátttakan
237 |
238 | All masculine noun and pronoun phrases:
239 |
240 | áhugi
241 | framboðsfundur og skuggakosning
242 | kjörstaður
243 | framhaldsskóli
244 |
245 |
--------------------------------------------------------------------------------
/doc/quickstart.rst:
--------------------------------------------------------------------------------
1 | .. _quickstart:
2 |
3 | Quickstart
4 | ==========
5 |
6 | After :ref:`installing Greynir <installation>`, fire up your
7 | Python 3 interpreter::
8 |
9 | $ python3
10 |
11 | ...and try something like the following::
12 |
13 | from reynir import NounPhrase as Nl
14 |
15 | # Create a NounPhrase ('nafnliður') object
16 | nl = Nl("þrír lúxus-miðar á Star Wars og tveir brimsaltir pokar af poppi")
17 |
18 | # Print the NounPhrase in the correct case for each context
19 | # (þf=þolfall/accusative, þgf=þágufall/dative)
20 |
21 | print("Þú keyptir {nl:þf}.".format(nl=nl))
22 | print("Hér er kvittunin þín fyrir {nl:þgf}.".format(nl=nl))
23 |
24 | The program outputs the following text, correctly inflected::
25 |
26 | Þú keyptir þrjá lúxus-miða á Star Wars og tvo brimsalta poka af poppi.
27 | Hér er kvittunin þín fyrir þremur lúxus-miðum á Star Wars og tveimur brimsöltum pokum af poppi.
28 |
29 | Use the :py:class:`NounPhrase` class to easily inflect Icelandic noun phrases
30 | and to convert them between cases, for instance in user interfaces, in chatbot
31 | conversations and in printouts.
32 |
33 | A more detailed, lower-level example is as follows::
34 |
35 | from reynir import Greynir
36 |
37 | my_text = "Litla gula hænan fann fræ. Það var hveitifræ."
38 |
39 | # Initialize the Greynir parser and submit the text as a parse job
40 | g = Greynir()
41 | job = g.submit(my_text)
42 |
43 | # Iterate through sentences and parse each one
44 | for sent in job:
45 | sent.parse()
46 | print("Sentence: {0}".format(sent.tidy_text))
47 | print("Lemmas: {0}".format(sent.lemmas))
48 | print("Parse tree:\n{0}\n".format(sent.tree.view))
49 |
50 | The output of the program is::
51 |
52 | Sentence: Litla gula hænan fann fræ.
53 | Lemmas: ['lítill', 'gulur', 'hæna', 'finna', 'fræ', '.']
54 | Parse tree:
55 | S0
56 | +-S-MAIN
57 | +-IP
58 | +-NP-SUBJ
59 | +-lo_nf_et_kvk: 'Litla'
60 | +-lo_nf_et_kvk: 'gula'
61 | +-no_et_nf_kvk: 'hænan'
62 | +-VP
63 | +-VP
64 | +-so_1_þf_et_p3: 'fann'
65 | +-NP-OBJ
66 | +-no_et_þf_hk: 'fræ'
67 | +-'.'
68 | Sentence: Það var hveitifræ.
69 | Lemmas: ['það', 'vera', 'hveitifræ', '.']
70 | Parse tree:
71 | S0
72 | +-S-MAIN
73 | +-IP
74 | +-NP-SUBJ
75 | +-pfn_hk_et_nf: 'Það'
76 | +-VP
77 | +-VP
78 | +-so_1_nf_et_p3: 'var'
79 | +-NP-PRD
80 | +-no_et_nf_hk: 'hveitifræ'
81 | +-'.'
82 |
83 | The code first creates an instance of the :py:class:`Greynir` class
84 | and assigns it to the ``g`` object. The :py:class:`Greynir` class is
85 | Greynir's main service interface.
86 |
87 | Next, the program submits a piece of text containing two sentences to
88 | Greynir, which returns a job object. Each job object encapsulates a
89 | stream of sentences that will be, or have been, processed through
90 | Greynir's tokenizer and parser.
91 |
92 | A job object is a Python generator, and the ``for`` loop iterates through
93 | the job's sentence stream, returning each sentence in turn in the ``sent``
94 | object.
95 |
96 | The ``for`` loop body parses the sentence by calling ``sent.parse()``.
97 | This function returns ``True`` if the sentence was successfully parsed, i.e.
98 | at least one valid parse tree was found for it, or ``False`` otherwise.
99 |
100 | The sentence object has a number of properties, including ``sent.tidy_text``
101 | which returns a normalized form of the tokenized sentence.
102 |
103 | If the sentence was successfully parsed, the ``sent.tree`` property
104 | (of type :py:class:`SimpleTree`)
105 | contains its best parse tree. This tree can be further queried via
106 | properties such as ``sent.lemmas``, which returns a list of the
107 | word lemmas in the sentence, and ``sent.tree.view``, which
108 | returns a string with an "ASCII art" representation of the parse tree.
109 |
110 | The parse tree contains grammar **nonterminals** in uppercase, such
111 | as ``S0`` (root), ``S-MAIN`` (main sentence), ``IP`` (inflected
112 | phrase - *beygingarliður*), ``NP-SUBJ`` (noun phrase - subject,
113 | *frumlag*), ``VP`` (verb phrase - *sagnliður*), etc.
114 |
115 | Nonterminals are listed and explained in the :ref:`nonterminals` section.
116 |
117 | The tree also shows grammar **terminals** (leaves, corresponding to
118 | tokens) in lowercase, as well as their :ref:`grammatical variants <variants>`
119 | (features). Examples are ``pfn_hk_et_nf`` (personal pronoun,
120 | neuter gender, singular, nominative case), and ``so_1_nf_et_p3``
121 | (verb, one argument in nominative case, singular, 3rd person).
122 |
123 | Terminals and variants are listed and explained in the :ref:`terminals`
124 | section.
125 |
126 | The sentence and tree properties and functions are further
127 | detailed and described in the :ref:`reference` section.
128 |
129 |
--------------------------------------------------------------------------------
/doc/terminals.rst:
--------------------------------------------------------------------------------
1 | .. _terminals:
2 |
3 | Terminals
4 | =========
5 |
6 | This section lists the terminals that can occur within simplified
7 | sentence trees, i.e. instances of the :py:class:`SimpleTree` class. The
8 | terminal associated with a tree node is available in the
9 | :py:attr:`SimpleTree.terminal` property.
10 |
11 | A terminal node always corresponds to a single token from the input text.
12 |
13 | A typical terminal string looks like this (for instance matching
14 | the word *hestur*)::
15 |
16 | 'no_kk_nf_et' # Noun, masculine, nominative case, singular
17 |
18 | The terminal category, i.e. the first part of the terminal name (``no`` in the
19 | example), is available
20 | in the :py:attr:`SimpleTree.tcat` property. The grammatical variants of the
21 | terminal are stored in the list :py:attr:`SimpleTree.variants`,
22 | which is ``[ 'kk', 'nf', 'et' ]`` in the example.
23 |
24 | To obtain the entire set of variants (features) associated with a word form,
25 | use the property :py:attr:`SimpleTree.all_variants`.
26 |
27 | The terminal categories and grammatical variants are listed below.
28 |
29 | .. _categories:
30 |
31 | Word categories
32 | ---------------
33 |
34 | +------------+---------------------------------------------------+
35 | | no | Noun (nafnorð) |
36 | +------------+---------------------------------------------------+
37 | | so | Verb (sagnorð) |
38 | +------------+---------------------------------------------------+
39 | | lo | Adjective (lýsingarorð) |
40 | +------------+---------------------------------------------------+
41 | | fs | Preposition (forsetning) |
42 | +------------+---------------------------------------------------+
43 | | nhm | Verb infinitive indicator (nafnháttarmerki, *að*) |
44 | +------------+---------------------------------------------------+
45 | | gr | Definite article (laus greinir, *hinn/hin/hið*) |
46 | +------------+---------------------------------------------------+
47 | | uh | Exclamation (upphrópun) |
48 | +------------+---------------------------------------------------+
49 | | ao | Adverb (atviksorð) |
50 | +------------+---------------------------------------------------+
51 | | eo | Qualifying adverb (atviksorð sem stendur með |
52 | | | nafnorði í einkunn) |
53 | +------------+---------------------------------------------------+
54 | | st | Conjunction (samtenging) |
55 | +------------+---------------------------------------------------+
56 | | stt | Connective conjunction (sem/er-samtenging) |
57 | +------------+---------------------------------------------------+
58 | | fn | Pronoun (fornafn) |
59 | +------------+---------------------------------------------------+
60 | | pfn | Personal pronoun (persónufornafn) |
61 | +------------+---------------------------------------------------+
62 | | abfn | Reflexive pronoun (afturbeygt fornafn) |
63 | +------------+---------------------------------------------------+
64 | | person | Person name (mannsnafn) |
65 | +------------+---------------------------------------------------+
66 | | sérnafn | Proper name (sérnafn) |
67 | +------------+---------------------------------------------------+
68 | | entity | Proper name of recognized named entity |
69 | +------------+---------------------------------------------------+
70 | | fyrirtæki | Company name (fyrirtækisnafn) |
71 | +------------+---------------------------------------------------+
72 | | gata | Street name (götuheiti) |
73 | +------------+---------------------------------------------------+
74 | | to | Number word, inflectable (beygjanlegt töluorð) |
75 | | | Only *núll, einn, tveir, þrír, fjórir* |
76 | +------------+---------------------------------------------------+
77 | | töl | Number word, uninflectable (óbeygjanlegt töluorð) |
78 | +------------+---------------------------------------------------+
79 |
80 | Number categories
81 | -----------------
82 |
83 | +----------------+---------------------------------------------------+
84 | | tala | Number |
85 | +----------------+---------------------------------------------------+
86 | | prósenta | Percentage |
87 | +----------------+---------------------------------------------------+
88 | | ártal | Year |
89 | +----------------+---------------------------------------------------+
90 | | raðnr | Ordinal number |
91 | +----------------+---------------------------------------------------+
92 | | talameðbókstaf | Number followed by letter: *15B* |
93 | +----------------+---------------------------------------------------+
94 | | sequence | Sequence: *1, 2, 3..., a, b, c..., i, ii, iii...* |
95 | +----------------+---------------------------------------------------+
96 |
97 | Date and time categories
98 | ------------------------
99 |
100 | +------------+---------------------------------------------------+
101 | | dagsföst | Absolute date (year, month, day) |
102 | +------------+---------------------------------------------------+
103 | | dagsafs | Relative date |
104 | | | (year, month, day - at least one value missing) |
105 | +------------+---------------------------------------------------+
106 | | tími | Time (hour, minute, second) |
107 | +------------+---------------------------------------------------+
108 | | tímapunktur| Time point |
109 | | | (year, month, day, hour, minute, second) |
110 | +------------+---------------------------------------------------+
111 |
112 | Other
113 | -----------
114 | +---------------+------------------------------------------------+
115 | | lén | *greynir.is* |
116 | +---------------+------------------------------------------------+
117 | | myllumerki | *#lífiðeryndislegt* |
118 | +---------------+------------------------------------------------+
119 | | tölvupóstfang | *gervi@greynir.is* |
120 | +---------------+------------------------------------------------+
121 |
122 |
123 |
124 | Punctuation
125 | -----------
126 |
127 | +------------+---------------------------------------------------+
128 | | grm | Punctuation |
129 | +------------+---------------------------------------------------+
130 |
131 |
132 |
133 | .. _variants:
134 |
135 | Variants
136 | ========
137 |
138 | This section lists grammatical variants (features) that are
139 | included as parts of terminal names, separated by underscores (``_``).
140 |
141 | Gender
142 | ------
143 |
144 | +------------+---------------------------------------------------+
145 | | kk | Masculine (karlkyn) |
146 | +------------+---------------------------------------------------+
147 | | kvk | Feminine (kvenkyn) |
148 | +------------+---------------------------------------------------+
149 | | hk | Neuter (hvorugkyn) |
150 | +------------+---------------------------------------------------+
151 |
152 | Number
153 | ------
154 |
155 | +------------+---------------------------------------------------+
156 | | et | Singular (eintala) |
157 | +------------+---------------------------------------------------+
158 | | ft | Plural (fleirtala) |
159 | +------------+---------------------------------------------------+
160 |
161 | Case
162 | ----
163 |
164 | The *case* variants may occur with nouns, pronouns, adjectives, prepositions
165 | and verbs (``lhþt`` and ``subj``). In the case of prepositions, the
166 | variant indicates which case the preposition controls.
167 |
168 | +------------+---------------------------------------------------+
169 | | nf | Nominative (nefnifall) |
170 | +------------+---------------------------------------------------+
171 | | þf | Accusative (þolfall) |
172 | +------------+---------------------------------------------------+
173 | | þgf | Dative (þágufall) |
174 | +------------+---------------------------------------------------+
175 | | ef | Genitive (eignarfall) |
176 | +------------+---------------------------------------------------+
177 |
178 | Arguments
179 | ---------
180 |
181 | Verb terminals, other than ``lhþt`` and ``subj``, indicate the number
182 | and cases of the verb's arguments as follows::
183 |
184 | 'so_0_et_p3_gm' # No argument, singular/3rd person/active voice
185 | 'so_1_þf_et_p3_gm' # Same, but with one argument in accusative case
186 | 'so_2_þgf_þf_et_p3_gm' # Two arguments, dative and accusative
187 |
188 | An example of a verb that matches the last terminal would be
189 | *skrifaði* (wrote) in the sentence *"Hann skrifaði konunni bréf"*
190 | ("He wrote a letter to the woman").
191 |
192 | +------------+---------------------------------------------------+
193 | | 0 | No argument |
194 | +------------+---------------------------------------------------+
195 | | 1 | One argument, whose case is in the following |
196 | | | variant |
197 | +------------+---------------------------------------------------+
198 | | 2 | Two arguments, whose cases are in the following |
199 | | | two variants |
200 | +------------+---------------------------------------------------+
201 |
202 | Person
203 | ------
204 |
205 | Occurs with verbs (``so`` terminal category) only.
206 |
207 | +------------+---------------------------------------------------+
208 | | p1 | First person *(Ég er / Við erum)* |
209 | +------------+---------------------------------------------------+
210 | | p2 | Second person *(Þú ert / Þið eruð)* |
211 | +------------+---------------------------------------------------+
212 | | p3 | Third person *(Það er / Þau eru)* |
213 | +------------+---------------------------------------------------+
214 |
215 | Degree
216 | ------
217 |
218 | Occurs with adjectives (``lo`` terminal category), and in the
219 | case of ``mst`` with certain adverbs (``ao`` terminal category).
220 |
221 | +------------+---------------------------------------------------+
222 | | mst | Comparative *(stærri)* |
223 | +------------+---------------------------------------------------+
224 | | esb | Superlative, indefinite *(maðurinn er stærstur)* |
225 | +------------+---------------------------------------------------+
226 | | evb | Superlative, definite *(stærsti maðurinn)* |
227 | +------------+---------------------------------------------------+
228 |
229 | Adjective object case
230 | ---------------------
231 |
232 | Occurs with adjectives (``lo`` terminal category) only.
233 |
234 | +------------+---------------------------------------------------+
235 | | sþf | Accusative (viðstaddur *hátíðina*) |
236 | +------------+---------------------------------------------------+
237 | | sþgf | Dative (líkur *Páli*) |
238 | +------------+---------------------------------------------------+
239 | | sef | Genitive (fullur *orku*) |
240 | +------------+---------------------------------------------------+
241 |
242 | Verb forms
243 | ----------
244 |
245 | These variants occur with verbs (``so`` terminal category) only.
246 |
247 | +------------+---------------------------------------------------------+
248 | | gm | Active voice (germynd) |
249 | +------------+---------------------------------------------------------+
250 | | mm | Middle voice (miðmynd) |
251 | +------------+---------------------------------------------------------+
252 | | nh | Infinitive (nafnháttur) |
253 | +------------+---------------------------------------------------------+
254 | | fh | Indicative (framsöguháttur) |
255 | +------------+---------------------------------------------------------+
256 | | bh | Imperative (boðháttur) |
257 | +------------+---------------------------------------------------------+
258 | | vh | Subjunctive (viðtengingarháttur) |
259 | +------------+---------------------------------------------------------+
260 | | nt | Present tense (nútíð) |
261 | +------------+---------------------------------------------------------+
262 | | þt | Past tense (þátíð) |
263 | +------------+---------------------------------------------------------+
264 | | lh | | Present participle (lýsingarháttur nútíðar) |
265 | | | | (note that the ``nt`` variant will also be present) |
266 | +------------+---------------------------------------------------------+
267 | | lhþt | | Past participle (lýsingarháttur þátíðar) |
268 | | | | (note that the ``þt`` variant will NOT be present) |
269 | +------------+---------------------------------------------------------+
270 | | sagnb | Supine (sagnbót) |
271 | +------------+---------------------------------------------------------+
272 | | sb | Indefinite (sterk beyging), |
273 | | | only occurs with ``lhþt`` |
274 | +------------+---------------------------------------------------------+
275 | | vb | Definite (veik beyging), |
276 | | | only occurs with ``lhþt`` |
277 | +------------+---------------------------------------------------------+
278 | | op | Impersonal verb (ópersónuleg sögn) |
279 | +------------+---------------------------------------------------------+
280 | | subj | Verb that requires the subject's case to be |
281 | | | non-nominative (sögn sem krefst frumlags í |
282 | | | aukafalli) |
283 | +------------+---------------------------------------------------------+
284 | | expl | Expletive (leppur), matches verb forms that can be used |
285 | | | with an expletive (*það rignir*) |
286 | +------------+---------------------------------------------------------+
287 |
288 | Noun qualifiers
289 | ---------------
290 |
291 | These variants occur with noun terminals (``no`` category) only.
292 |
293 | +------------+---------------------------------------------------+
294 | | gr | Definite, attached to noun (viðskeyttur greinir |
295 | | | með nafnorði) |
296 | +------------+---------------------------------------------------+
297 | | abbrev | Abbreviation (skammstöfun) |
298 | +------------+---------------------------------------------------+
299 |
300 | Word or lemma endings
301 | ---------------------
302 |
303 | These variants can be used to constrain matching to word forms or lemmas
304 | with particular endings only. They are used to detect certain forms of
305 | grammatical errors.
306 |
307 | +------------+---------------------------------------------------+
308 | | xir | Matches only words with lemmas that end with |
309 | | | *ir* (e.g., *læknir*, *kælir*) |
310 | +------------+---------------------------------------------------+
311 | | zana | Matches only word forms that end with |
312 | | | *ana* (e.g., *flokkana*, *bílana*) |
313 | +------------+---------------------------------------------------+
314 |
315 |
--------------------------------------------------------------------------------
/old/build_wheels.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Build the reynir wheels on the CentOS5/6 base manylinux1/manylinux2010 platform
3 | # This script should be executed inside the Docker container!
4 | # It is invoked indirectly from wheels.sh, which passes PLAT via 'docker run -e'
5 |
6 | # Stop execution upon error; show executed commands
7 | set -e -x
8 |
9 | # Create wheels for Python 3.7 (its interpreter directory carries an 'm' suffix)
10 | for PYBIN in cp37; do
11 |     "/opt/python/${PYBIN}-${PYBIN}m/bin/pip" wheel /io/ -w wheelhouse/
12 | done
13 | # Create wheels for Python >= 3.8
14 | for PYBIN in cp38 cp39; do
15 |     "/opt/python/${PYBIN}-${PYBIN}/bin/pip" wheel /io/ -w wheelhouse/
16 | done
17 | # Create wheels for PyPy3 (>=3.7)
18 | for PYBIN in /opt/pypy/pypy3.*/bin; do
19 |     "${PYBIN}/pip" wheel /io/ -w wheelhouse/
20 | done
21 |
22 | # Bundle external shared libraries into the wheels; abort with a clear message if PLAT is unset or empty
23 | for whl in wheelhouse/reynir-*.whl; do
24 |     auditwheel repair "$whl" --plat "${PLAT:?PLAT must be set, e.g. manylinux2010_x86_64}" -w /io/wheelhouse/
25 | done
26 |
27 | # Set read/write permissions on the wheels
28 | chmod 666 /io/wheelhouse/*
29 |
--------------------------------------------------------------------------------
/old/release.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build a GreynirEngine release and upload it to PyPI
3 | if [ -z "$1" ]; then
4 |     echo "Version name argument missing"
5 |     exit 1
6 | fi
7 | echo "Upload a new GreynirEngine version:" "$1"
8 | # Fix permission bits (clear the executable bit on source and data files)
9 | chmod -x src/reynir/*.py
10 | chmod -x src/reynir/*.cpp
11 | chmod -x src/reynir/*.grammar
12 | chmod -x src/reynir/config/*
13 | chmod -x src/reynir/resources/*
14 | # Remove binary grammar files as they may be out of date (-f: not an error if none exist)
15 | rm -f src/reynir/Greynir.*.bin
16 | # Create the base source distribution
17 | rm -rf build/*
18 | python3 setup.py sdist
19 | # Create the binary wheels
20 | source wheels.sh
21 | # Upload the new release; the version is quoted so it cannot be word-split or globbed
22 | twine upload dist/reynir-"$1"*
23 | echo "Upload of" "$1" "done"
24 |
--------------------------------------------------------------------------------
/old/wheels.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | echo "Building manylinux2010 wheels..."
3 | # Build manylinux2010 versions via a Docker CentOS6 image
4 | # See https://github.com/pypa/python-manylinux-demo/blob/master/.travis.yml
5 | # and https://github.com/pypy/manylinux
6 | mkdir -p /tmp/io  # Staging directory, mounted into the container as /io below
7 | chmod 777 /tmp/io
8 | chgrp docker /tmp/io
9 | rm -rf /tmp/io/*  # Start from an empty staging directory
10 | mkdir -p /tmp/io/src
11 | mkdir -p /tmp/io/test
12 | mkdir -p /tmp/io/wheelhouse  # The container writes the finished wheels here
13 | chmod 777 /tmp/io/wheelhouse
14 | chgrp docker /tmp/io/wheelhouse
15 | # Fresh copy everything to the /tmp/io temporary subdirectory,
16 | # expanding symlinks (cp -L); only src and test are copied recursively
17 | cp -L ./* /tmp/io
18 | cp -L -r ./src/* /tmp/io/src
19 | cp -L -r ./test/* /tmp/io/test
20 | # Pull the latest pypywheels/manylinux2010 Docker image
21 | docker pull pypywheels/manylinux2010-pypy_x86_64
22 | # Run the Docker image: PLAT is passed to build_wheels.sh through the environment
23 | docker run --rm -e PLAT=manylinux2010_x86_64 -it -v /tmp/io:/io pypywheels/manylinux2010-pypy_x86_64 bash /io/build_wheels.sh
24 | # Copy the finished wheels
25 | mkdir -p ./dist
26 | mv /tmp/io/wheelhouse/reynir* ./dist
27 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Greynir: Natural language processing for Icelandic
4 |
5 | Setup.py
6 |
7 | Copyright © 2023 Miðeind ehf.
8 | Original Author: Vilhjálmur Þorsteinsson
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 |
32 | This module sets up the Greynir package. It uses the cffi_modules
33 | parameter, available in recent versions of setuptools, to
34 | automatically compile the eparser.cpp module to eparser.*.so/.pyd
35 | and build the required CFFI Python wrapper via eparser_build.py.
36 | The same applies to bin.cpp -> bin.*.so and bin_build.py.
37 |
38 | Note that installing under PyPy >= 3.9 is supported (and recommended
39 | for best performance).
40 |
41 | """
42 |
43 | from glob import glob
44 | from os.path import basename, splitext
45 |
46 | from setuptools import find_packages
47 | from setuptools import setup # type: ignore
48 |
49 |
50 | with open("README.md", "r", encoding="utf-8") as fh:
51 | long_description = fh.read()
52 |
53 | setup(
54 | name="reynir",
55 | version="3.5.7",
56 | license="MIT",
57 | description="A natural language parser for Icelandic",
58 | long_description=long_description,
59 | long_description_content_type="text/markdown",
60 | author="Miðeind ehf",
61 | author_email="mideind@mideind.is",
62 | url="https://github.com/mideind/GreynirEngine",
63 | packages=find_packages("src"),
64 | package_dir={"": "src"},
65 | py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")],
66 | package_data={"reynir": ["py.typed"]},
67 | include_package_data=True,
68 | zip_safe=True,
69 | classifiers=[
70 | # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers
71 | "Development Status :: 5 - Production/Stable",
72 | "Intended Audience :: Developers",
73 | "Intended Audience :: Science/Research",
74 | "License :: OSI Approved :: MIT License",
75 | "Operating System :: Unix",
76 | "Operating System :: POSIX",
77 | "Operating System :: Microsoft :: Windows",
78 | "Operating System :: MacOS",
79 | "Natural Language :: Icelandic",
80 | "Programming Language :: Python",
81 | "Programming Language :: Python :: 3",
82 | "Programming Language :: Python :: 3.9",
83 | "Programming Language :: Python :: 3.10",
84 | "Programming Language :: Python :: 3.11",
85 | "Programming Language :: Python :: 3.12",
86 | "Programming Language :: Python :: 3.13",
87 | "Programming Language :: Python :: Implementation :: CPython",
88 | "Programming Language :: Python :: Implementation :: PyPy",
89 | "Topic :: Software Development :: Libraries :: Python Modules",
90 | "Topic :: Utilities",
91 | "Topic :: Text Processing :: Linguistic",
92 | ],
93 | keywords=["nlp", "parser", "icelandic"],
94 | # Note: cffi 1.15.1 is the version built into PyPy 3.9.
95 | # Do not specify a higher version as that would prevent installation on PyPy 3.9,
96 | # unless you know what you're doing.
97 | setup_requires=["cffi>=1.15.1"],
98 | install_requires=[
99 | "cffi>=1.15.1",
100 | "tokenizer>=3.4.5",
101 | "islenska>=1.0.3",
102 | "typing_extensions",
103 | ],
104 | cffi_modules=["src/reynir/eparser_build.py:ffibuilder"],
105 | )
106 |
--------------------------------------------------------------------------------
/src/reynir/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | Copyright © 2023 Miðeind ehf.
6 | Original author: Vilhjálmur Þorsteinsson
7 |
8 | This software is licensed under the MIT License:
9 |
10 | Permission is hereby granted, free of charge, to any person
11 | obtaining a copy of this software and associated documentation
12 | files (the "Software"), to deal in the Software without restriction,
13 | including without limitation the rights to use, copy, modify, merge,
14 | publish, distribute, sublicense, and/or sell copies of the Software,
15 | and to permit persons to whom the Software is furnished to do so,
16 | subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be
19 | included in all copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 |
29 | This module exposes the Greynir API, i.e. the identifiers that are
30 | directly accessible via the reynir module object after importing it.
31 |
32 | """
33 |
34 | # Expose the Greynir API
35 |
36 | import importlib.metadata
37 |
38 | from .reynir import (
39 | Greynir,
40 | Reynir,
41 | Terminal,
42 | LemmaTuple,
43 | ProgressFunc,
44 | ParseResult,
45 | Sentence,
46 | Paragraph,
47 | ICELANDIC_RATIO,
48 | )
49 |
50 | # Import the following _underscored classes to be able to use them
51 | # in type signatures in derived classes
52 | from .reynir import (
53 | _Job,
54 | _Sentence,
55 | _Paragraph,
56 | )
57 | from .nounphrase import NounPhrase
58 | from .fastparser import ParseForestPrinter, ParseForestDumper, ParseForestFlattener
59 | from .fastparser import ParseError, ParseForestNavigator
60 | from .settings import Settings
61 | from .bintokenizer import tokenize, TokenList
62 |
63 | # Expose the tokenizer API
64 |
65 | from tokenizer import (
66 | TOK,
67 | Tok,
68 | paragraphs,
69 | correct_spaces,
70 | mark_paragraphs,
71 | TP_LEFT,
72 | TP_CENTER,
73 | TP_RIGHT,
74 | TP_NONE,
75 | TP_WORD,
76 | KLUDGY_ORDINALS_PASS_THROUGH,
77 | KLUDGY_ORDINALS_MODIFY,
78 | KLUDGY_ORDINALS_TRANSLATE,
79 | )
80 | from tokenizer.abbrev import Abbreviations
81 |
82 | __author__ = "Miðeind ehf."
83 | __copyright__ = "© 2023 Miðeind ehf."
84 | __version__ = importlib.metadata.version("reynir")  # Version string of the installed "reynir" distribution
85 |
86 | __all__ = (
87 | "TP_LEFT",
88 | "TP_RIGHT",
89 | "TP_CENTER",
90 | "TP_NONE",
91 | "TP_WORD",
92 | "KLUDGY_ORDINALS_MODIFY",
93 | "KLUDGY_ORDINALS_PASS_THROUGH",
94 | "KLUDGY_ORDINALS_TRANSLATE",
95 | "Greynir",
96 | "Reynir",
97 | "Terminal",
98 | "LemmaTuple",
99 | "ProgressFunc",
100 | "ParseResult",
101 | "Sentence",
102 | "Paragraph",
103 | "ICELANDIC_RATIO",
104 | "TOK",
105 | "Tok",
106 | "paragraphs",
107 | "correct_spaces",
108 | "mark_paragraphs",
109 | "_Job",
110 | "_Sentence",
111 | "_Paragraph",
112 | "NounPhrase",
113 | "ParseForestPrinter",
114 | "ParseForestDumper",
115 | "ParseForestFlattener",
116 | "ParseError",
117 | "ParseForestNavigator",
118 | "Settings",
119 | "tokenize",
120 | "TokenList",
121 | "__version__",
122 | "__author__",
123 | "__copyright__",
124 | )
125 |
126 | Abbreviations.initialize()  # Runs at import time; NOTE(review): presumably loads the tokenizer's abbreviation data - confirm
127 | Settings.read("config/GreynirEngine.conf")  # Runs at import time; NOTE(review): path presumably resolved relative to the package - confirm
128 |
--------------------------------------------------------------------------------
/src/reynir/baseparser.py:
--------------------------------------------------------------------------------
1 | """
2 | Greynir: Natural language processing for Icelandic
3 |
4 | Parser base module
5 |
6 | Copyright © 2023 Miðeind ehf.
7 |
8 | This software is licensed under the MIT License:
9 |
10 | Permission is hereby granted, free of charge, to any person
11 | obtaining a copy of this software and associated documentation
12 | files (the "Software"), to deal in the Software without restriction,
13 | including without limitation the rights to use, copy, modify, merge,
14 | publish, distribute, sublicense, and/or sell copies of the Software,
15 | and to permit persons to whom the Software is furnished to do so,
16 | subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be
19 | included in all copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 |
29 | This module defines a base parser class. The base is used in
30 | BIN_Parser (see binparser.py) which is again the base of the
31 | C++ Earley parser Fast_Parser (see fastparser.py)
32 |
33 | """
34 |
35 | from typing import Dict, List, Iterator, Optional
36 |
37 | from .grammar import Grammar, GrammarItem, Terminal, Nonterminal, Production
38 |
39 |
class _PackedProduction:

    """Wraps a grammar Production together with its relative priority.
    The component terminals and nonterminals of the production are
    exposed as a sequence of integer indices, taken from the
    production's 'prod' attribute."""

    def __init__(self, priority: int, production: Production) -> None:
        # The relative priority of this production within its nonterminal,
        # along with a reference to the original production object
        self._priority = priority
        self._production = production
        # The packed list of indices and its length, which is cached
        # since it is consulted on every indexing operation
        packed = production.prod
        self._ix_list = packed
        self._len = len(packed)

    @property
    def priority(self) -> int:
        """The relative priority given at construction time"""
        return self._priority

    @property
    def production(self) -> Production:
        """The original (unpacked) production"""
        return self._production

    def __len__(self) -> int:
        """The number of items in the packed index list"""
        return self._len

    def __iter__(self) -> Iterator[int]:
        """Iterate over the packed indices"""
        return iter(self._ix_list)

    def __getitem__(self, index: int) -> int:
        """Return the packed index at the given position. Positions outside
        of 0..len-1 (including negative ones) yield 0 instead of raising
        an IndexError."""
        if index < 0 or index >= self._len:
            return 0
        return self._ix_list[index]
72 |
73 |
class Base_Parser:

    """Base class for parsers. Holds a grammar in an integer-indexed
    form: the index of the root nonterminal, a dictionary of packed
    productions keyed by nonterminal index, and lookup tables that map
    indices back to Nonterminal and Terminal objects. The parsing
    algorithm itself is not implemented in this class.
    """

    def __init__(self) -> None:
        # All of these are filled in by init_from_grammar()
        self._root: Optional[int] = None
        self._nt_dict: Dict[int, Optional[List[_PackedProduction]]] = {}
        self._nonterminals: Dict[int, Nonterminal] = {}
        self._terminals: Dict[int, Terminal] = {}

    def init_from_grammar(self, g: Grammar) -> None:
        """Load the grammar g into this parser, replacing any
        previously loaded grammar. The grammar must have a production
        dictionary and a root that is present in that dictionary."""
        productions = g.nt_dict
        root = g.root
        assert productions is not None
        assert root is not None
        assert root in productions
        # From here on, the grammar is referred to by integer indices, for speed
        self._root = root.index
        # Build a fresh dictionary, keyed by nonterminal index, where each
        # (priority, production) pair has been wrapped in a _PackedProduction
        self._nt_dict = {}
        packed = self._nt_dict
        for nonterminal, prio_list in productions.items():
            packed[nonterminal.index] = [
                _PackedProduction(priority, production)
                for priority, production in prio_list
            ]
        # The index-to-object tables are taken over from the grammar as-is
        self._nonterminals = g.nonterminals_by_ix
        self._terminals = g.terminals_by_ix

    @classmethod
    def for_grammar(cls, g: Grammar) -> "Base_Parser":
        """Construct a parser instance and initialize it from the grammar g"""
        parser = cls()
        parser.init_from_grammar(g)
        return parser

    def _lookup(self, ix: int) -> GrammarItem:
        """Map a production item index to the grammar object it denotes"""
        # Negative indices denote nonterminals and positive indices
        # denote terminals; zero is never a valid index
        assert ix != 0
        if ix < 0:
            return self._nonterminals[ix]
        return self._terminals[ix]
120 |
--------------------------------------------------------------------------------
/src/reynir/basics.py:
--------------------------------------------------------------------------------
1 | """
2 | Greynir: Natural language processing for Icelandic
3 |
4 | Basic classes module
5 |
6 | Copyright © 2023 Miðeind ehf.
7 |
8 | This software is licensed under the MIT License:
9 |
10 | Permission is hereby granted, free of charge, to any person
11 | obtaining a copy of this software and associated documentation
12 | files (the "Software"), to deal in the Software without restriction,
13 | including without limitation the rights to use, copy, modify, merge,
14 | publish, distribute, sublicense, and/or sell copies of the Software,
15 | and to permit persons to whom the Software is furnished to do so,
16 | subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be
19 | included in all copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 |
29 |
30 | This module contains basic functions that are used by the settings
31 | module and other modules. These functions have been extracted from the
32 | settings module to avoid circular imports or module references.
33 |
34 | """
35 |
36 | from typing import (
37 | Callable,
38 | Iterable,
39 | Iterator,
40 | List,
41 | Optional,
42 | )
43 |
44 | import os
45 | import locale
46 |
47 | from contextlib import contextmanager
48 | import importlib.resources as importlib_resources
49 |
50 |
# The locale used by default in the changedlocale function, given as
# a tuple that is passed on to locale.setlocale().
# NOTE(review): the language code is spelled "IS_is" here, whereas the
# comment within changedlocale() shows 'is_IS' - confirm which is intended
_DEFAULT_LOCALE = ("IS_is", "UTF-8")

# A set of all valid verb argument cases
# (presumably nominative, accusative, dative and genitive - TODO confirm)
ALL_CASES = frozenset(("nf", "þf", "þgf", "ef"))
# Sets of the valid gender and number abbreviations
# (presumably masculine/feminine/neuter and singular/plural - TODO confirm)
ALL_GENDERS = frozenset(("kk", "kvk", "hk"))
ALL_NUMBERS = frozenset(("et", "ft"))
# NOTE(review): the names suggest these are the recognized subclause
# types for verb arguments - confirm against the users of this set
SUBCLAUSES = frozenset(("nh", "nhx", "falls", "spurns"))
# Reflexive pronoun forms: REFLPRN maps each form to a full variant
# string, while REFLPRN_CASE maps it to its case abbreviation only
REFLPRN = {"sig": "sig_hk_et_þf", "sér": "sig_hk_et_þgf", "sín": "sig_hk_et_ef"}
REFLPRN_CASE = {"sig": "þf", "sér": "þgf", "sín": "ef"}
# The reflexive pronoun forms themselves, for fast membership tests
REFLPRN_SET = frozenset(REFLPRN.keys())

# BÍN compressed file format version (used in tools/binpack.py and bincompress.py)
BIN_COMPRESSOR_VERSION = b"Greynir 02.00.00"
# The version signature must be exactly 16 bytes long
assert len(BIN_COMPRESSOR_VERSION) == 16
# Name of the compressed BÍN file
BIN_COMPRESSED_FILE = "ord.compressed"
67 |
68 |
@contextmanager
def changedlocale(
    new_locale: Optional[str] = None, category: str = "LC_COLLATE"
) -> Iterator[Callable[[str], str]]:
    """Temporarily switch the locale of a single category for the
    duration of a with-statement. The locale that was in effect on
    entry is reinstated on exit, also if the with-body raises.
    The context manager yields locale.strxfrm, which can be used
    as a key function for locale-aware sorting."""
    # new_locale is passed directly to locale.setlocale(); if it is
    # missing or empty, the module default _DEFAULT_LOCALE is used.
    # category is the name of a locale module constant,
    # such as 'LC_COLLATE', 'LC_TIME' or 'LC_NUMERIC'.
    lc_category = getattr(locale, category)
    # Remember the current setting so that it can be restored afterwards
    saved_locale = locale.getlocale(lc_category)
    try:
        target = new_locale if new_locale else _DEFAULT_LOCALE
        locale.setlocale(lc_category, target)
        # Hand the string transformation function to the with-body
        yield locale.strxfrm
    finally:
        locale.setlocale(lc_category, saved_locale)
83 |
84 |
def sort_strings(strings: Iterable[str], loc: Optional[str] = None) -> List[str]:
    """Return a new list containing the given strings, sorted in the
    collation order of the locale loc (or of the default locale used
    by changedlocale() if loc is not given)"""
    # The locale is only changed while the sort is being performed
    with changedlocale(loc) as collation_key:
        ordered = list(strings)
        ordered.sort(key=collation_key)
    return ordered
90 |
91 |
class ConfigError(Exception):
    """Exception raised for errors in configuration files. It can
    optionally carry the name of the file and the number of the line
    where the error occurred, which are then included in the string
    representation of the exception."""

    def __init__(self, s: str) -> None:
        super().__init__(s)
        # No position information is known initially
        self.line = 0
        self.fname: Optional[str] = None

    def set_pos(self, fname: str, line: int) -> None:
        """Record the file name and line number of the error. This is a
        no-op if a (non-empty) file name has already been recorded, so
        the position that is set first is the one that is kept."""
        if self.fname:
            return
        self.fname = fname
        self.line = line

    def __str__(self) -> str:
        """Return the error message, prefixed with the file name
        and line number if they are known"""
        message = super().__str__()
        if self.fname:
            return f"File {self.fname}, line {self.line}: {message}"
        return message
112 |
113 |
class LineReader:
    """Read lines from a UTF-8 encoded text file, joining lines that
    end with a backslash with the following line, and recognizing
    $include directives, whose target files are read recursively and
    spliced into the line stream.

    The file is either read from the file system or, if a package name
    is given, from the resources of the package."""

    def __init__(
        self,
        fname: str,
        *,
        package_name: Optional[str] = None,
        outer_fname: Optional[str] = None,
        outer_line: int = 0
    ) -> None:
        """Create a reader for the file fname. If package_name is given
        (and non-empty), the file is opened as a package resource instead
        of as a regular file. outer_fname and outer_line give the position
        of the $include directive that refers to this file, if any; they
        are only used for error reporting."""
        self._fname = fname
        self._package_name = package_name
        # Number of physical lines read so far from this file
        self._line = 0
        # Reader for the include file currently being read, if any
        self._inner_rdr: Optional[LineReader] = None
        self._outer_fname = outer_fname
        self._outer_line = outer_line

    def fname(self) -> str:
        """The name of the file being read, i.e. the innermost
        include file if an include is in progress"""
        return self._fname if self._inner_rdr is None else self._inner_rdr.fname()

    def line(self) -> int:
        """The number of the current line within the file being read
        (cf. fname())"""
        return self._line if self._inner_rdr is None else self._inner_rdr.line()

    def lines(self) -> Iterator[str]:
        """Generator yielding logical lines from the file, including the
        lines of any included files. Lines are yielded with their line
        terminators intact.

        Raises ConfigError if the file (or an include file) cannot be
        opened or read, or if an $include directive lacks a file name."""
        self._line = 0
        try:
            if self._package_name:
                # Note that the resource is always looked up relative to the
                # 'reynir' package; the value of the package name only
                # selects resource reading over regular file reading
                ref = importlib_resources.files("reynir").joinpath(self._fname)
                stream = ref.open("rb")
            else:
                stream = open(self._fname, "rb")
            with stream as inp:
                # Read the file line-by-line
                accumulator = ""
                for b in inp:
                    # We get byte strings; convert from utf-8 to Python strings
                    s = b.decode("utf-8")
                    self._line += 1
                    if s.rstrip().endswith("\\"):
                        # Backslash at end of line: continuation in next line
                        accumulator += s.strip()[:-1]
                        continue
                    if accumulator:
                        # Add accumulated text from preceding
                        # backslash-terminated lines, but drop leading whitespace
                        s = accumulator + s.lstrip()
                        accumulator = ""
                    # Check for include directive: $include filename.txt
                    if s.startswith("$") and s.lower().startswith("$include "):
                        parts = s.split(maxsplit=1)
                        if len(parts) < 2:
                            # The directive is followed by whitespace only:
                            # report this as a configuration error at the
                            # current position, instead of letting an
                            # IndexError escape
                            err = ConfigError(
                                "Missing file name in $include directive"
                            )
                            err.set_pos(self._fname, self._line)
                            raise err
                        iname = parts[1].strip()
                        # Do some path magic to allow the included path
                        # to be relative to the current file path, or a
                        # fresh (absolute) path by itself
                        head, _ = os.path.split(self._fname)
                        iname = os.path.join(head, iname)
                        rdr = self._inner_rdr = LineReader(
                            iname,
                            package_name=self._package_name,
                            outer_fname=self._fname,
                            outer_line=self._line,
                        )
                        # While the include file is being read, fname()
                        # and line() report the position within it
                        yield from rdr.lines()
                        self._inner_rdr = None
                    else:
                        yield s
                if accumulator:
                    # Catch corner case where last line of file ends with a backslash
                    yield accumulator
        except (IOError, OSError) as io_err:
            if self._outer_fname:
                # This is an include file within an outer config file:
                # point to the $include directive in the outer file
                c = ConfigError(
                    "Error while opening or reading include file '{0}'".format(
                        self._fname
                    )
                )
                c.set_pos(self._outer_fname, self._outer_line)
            else:
                # This is an outermost config file
                c = ConfigError(
                    "Error while opening or reading config file '{0}'".format(
                        self._fname
                    )
                )
            # Keep the underlying I/O error as the explicit cause
            raise c from io_err
203 |
--------------------------------------------------------------------------------
/src/reynir/bindb.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | BinDb module
6 |
7 | Copyright © 2023 Miðeind ehf.
8 |
9 | This software is licensed under the MIT License:
10 |
11 | Permission is hereby granted, free of charge, to any person
12 | obtaining a copy of this software and associated documentation
13 | files (the "Software"), to deal in the Software without restriction,
14 | including without limitation the rights to use, copy, modify, merge,
15 | publish, distribute, sublicense, and/or sell copies of the Software,
16 | and to permit persons to whom the Software is furnished to do so,
17 | subject to the following conditions:
18 |
19 | The above copyright notice and this permission notice shall be
20 | included in all copies or substantial portions of the Software.
21 |
22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 |
30 | This module implements a thin wrapper on top of the GreynirBin
31 | class from BinPackage, as well as a couple of basic data classes.
32 |
33 | """
34 |
35 | from typing import Any, List, Optional, Tuple
36 | from functools import lru_cache
37 |
38 | from islenska.basics import make_bin_entry, ALL_CASES
39 | from islenska.bindb import GreynirBin as GBin, PERSON_NAME_FL
40 |
41 | from tokenizer.definitions import BIN_Tuple
42 |
43 | from .settings import StaticPhrases
44 |
# SHSnid tuple as seen by the Greynir compatibility layer:
# a pair consisting of a word and a list of BIN_Tuple instances.
# This is the return type of GreynirBin.lookup_g().
ResultTuple = Tuple[str, List[BIN_Tuple]]


# Size of name cache for lookup_name_gender
# (passed as maxsize to functools.lru_cache)
_NAME_GENDER_CACHE_SIZE = 128
51 |
52 |
class GreynirBin(GBin):

    """Overridden class that adds a singleton instance of GreynirBin
    and a context manager protocol, as well as lookup functions that
    return BIN_Tuple instances"""

    # The shared instance, created on demand by get_db()
    _singleton: Optional["GreynirBin"] = None

    @classmethod
    def get_db(cls) -> "GreynirBin":
        """Return the shared GreynirBin instance, creating it on first use"""
        db = cls._singleton
        if db is None:
            db = cls._singleton = GreynirBin()
        return db

    def __enter__(self) -> "GreynirBin":
        """Allow this class to be used in a with statement"""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Nothing is released on exit, and exceptions are not suppressed"""
        return None

    def lookup_g(
        self, w: str, at_sentence_start: bool = False, auto_uppercase: bool = False
    ) -> ResultTuple:
        """Look up the word w and return a pair of the word, as returned
        by the underlying lookup, and a list of BIN_Tuple instances,
        which are the Greynir version of islenska.BinEntry"""
        word, entries = self._lookup(
            w,
            at_sentence_start,
            auto_uppercase,
            self._meanings_cache_lookup,
            make_bin_entry,
        )
        return word, [BIN_Tuple._make(entry) for entry in entries]

    def lookup_nominative_g(self, w: str, **options: Any) -> List[BIN_Tuple]:
        """Returns the results of lookup_nominative() as BIN_Tuple
        instances, i.e. the Greynir version of islenska.BinEntry"""
        entries = super().lookup_nominative(w, **options)
        return [BIN_Tuple._make(entry) for entry in entries]

    def lookup_accusative_g(self, w: str, **options: Any) -> List[BIN_Tuple]:
        """Returns the results of lookup_accusative() as BIN_Tuple
        instances, i.e. the Greynir version of islenska.BinEntry"""
        entries = super().lookup_accusative(w, **options)
        return [BIN_Tuple._make(entry) for entry in entries]

    def lookup_dative_g(self, w: str, **options: Any) -> List[BIN_Tuple]:
        """Returns the results of lookup_dative() as BIN_Tuple
        instances, i.e. the Greynir version of islenska.BinEntry"""
        entries = super().lookup_dative(w, **options)
        return [BIN_Tuple._make(entry) for entry in entries]

    def lookup_genitive_g(self, w: str, **options: Any) -> List[BIN_Tuple]:
        """Returns the results of lookup_genitive() as BIN_Tuple
        instances, i.e. the Greynir version of islenska.BinEntry"""
        entries = super().lookup_genitive(w, **options)
        return [BIN_Tuple._make(entry) for entry in entries]

    def meanings(self, w: str) -> List[BIN_Tuple]:
        """Low-level lookup of BIN_Tuple instances for the given word"""
        result: List[BIN_Tuple] = []
        for k in self._ksnid_lookup(w):
            # Pick the six BIN_Tuple fields out of each entry
            result.append(
                BIN_Tuple(k.ord, k.bin_id, k.ofl, k.hluti, k.bmynd, k.mark)
            )
        return result

    # Note that the cache is keyed on (self, name, preferred_case)
    # and thus keeps the instance alive while an entry refers to it
    @lru_cache(maxsize=_NAME_GENDER_CACHE_SIZE)
    def lookup_name_gender(self, name: str, preferred_case: str = "nf") -> str:
        """Given a person name, lookup its gender. The gender is returned
        as the word category of a matching person name entry, or as "hk"
        if the gender is unknown. preferred_case must be one of the
        case abbreviations in ALL_CASES."""
        assert preferred_case in ALL_CASES

        # Split off the first name. An empty or whitespace-only name has
        # no first name at all, so the gender is unknown in that case.
        parts = name.split(maxsplit=1) if name else []
        if not parts:
            return "hk"  # Unknown gender

        # Find all meanings of the first name that can be person names
        name_meanings = [
            x for x in self.meanings(parts[0]) if x.fl in PERSON_NAME_FL
        ]
        if name_meanings:
            # A name meaning in the preferred case takes precedence...
            for x in name_meanings:
                if x.beyging.lower().startswith(preferred_case):
                    return x.ordfl
            # ...but otherwise we make do with the first name meaning
            return name_meanings[0].ordfl

        # The first name was not found: check whether the full name is
        # in the static phrases
        m = StaticPhrases.lookup(name)
        if m is not None and m.fl in PERSON_NAME_FL:
            return m.ordfl
        return "hk"  # Unknown gender
139 |
--------------------------------------------------------------------------------
/src/reynir/cache.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | cache.py
4 |
5 | Cache utility classes
6 |
7 | The LRU_Cache and LFU_Cache classes herein are
8 | copyright © 2011 by Raymond Hettinger
9 |
10 | cf. http://code.activestate.com/recipes/577970-simplified-lru-cache/
11 | http://code.activestate.com/recipes/498245-lru-and-lfu-cache-decorators/
12 |
13 | MIT license:
14 |
15 | Permission is hereby granted, free of charge, to any person obtaining a copy of
16 | this software and associated documentation files (the "Software"), to deal in
17 | the Software without restriction, including without limitation the rights to use,
18 | copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
19 | Software, and to permit persons to whom the Software is furnished to do so,
20 | subject to the following conditions:
21 |
22 | The above copyright notice and this permission notice shall be included
23 | in all copies or substantial portions of the Software.
24 |
25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
31 | IN THE SOFTWARE.
32 |
33 | ---
34 |
35 | The classes have been modified from their original versions,
36 | which are available from the URLs given above.
37 |
38 | """
39 |
40 | from typing import List, Dict, Any, Callable, TypeVar, Generic, cast
41 |
42 | from heapq import nsmallest
43 | from operator import itemgetter
44 | import threading
45 | from functools import wraps
46 |
47 |
# Default maximum sizes (number of entries) of the LRU and LFU caches
LRU_DEFAULT = 1024
LFU_DEFAULT = 512


# Type variables for the key and value types of the generic cache classes
_K = TypeVar("_K")
_V = TypeVar("_V")
54 |
55 |
class LRU_Cache(Generic[_V]):
    """Least-recently-used (LRU) cache wrapping a function. The cache is
    a callable; calling it with positional arguments returns the cached
    result for those arguments, or calls the wrapped function and caches
    its result, replacing the least recently used entry.

    The entries are kept in a circular doubly linked list of fixed
    size, where each link is a list of the form [PREV, NEXT, KEY, RESULT].
    The root link is an empty slot; the link after it is the least
    recently used entry and the link before it the most recently used."""

    def __init__(
        self, user_function: Callable[..., _V], maxsize: int = LRU_DEFAULT
    ) -> None:
        # Link layout: [PREV, NEXT, KEY, RESULT]
        root: List[Any] = [None, None, None, None]
        cache: Dict[Any, List[Any]] = {}
        self.root = root
        self.user_function = user_function
        self.cache = cache

        # Fill the ring with maxsize placeholder links, each keyed by a
        # unique dummy object that can never match a real argument tuple
        tail: List[Any] = root
        for _ in range(maxsize):
            placeholder = object()
            node: List[Any] = [tail, root, placeholder, None]
            cache[placeholder] = node
            tail[1] = node
            tail = node
        # Close the ring
        root[0] = tail

    def __call__(self, *key: Any) -> _V:
        """Return the result of the wrapped function for the given
        positional arguments, from the cache if available"""
        cache = self.cache
        root = self.root
        link = cache.get(key)
        if link is None:
            # Cache miss: call the function and store the key and the
            # result in the (empty) root slot
            result = self.user_function(*key)
            root[2] = key
            root[3] = result
            filled = root
            # The least recently used link becomes the new root: empty it
            # and drop its key from the dictionary
            root = self.root = root[1]
            evicted_key = root[2]
            root[2] = None
            root[3] = None
            del cache[evicted_key]
            cache[key] = filled
            return result
        # Cache hit: unlink the entry from its current position...
        prev_link, next_link, _, result = link
        prev_link[1] = next_link
        next_link[0] = prev_link
        # ...and reinsert it just before the root, i.e. as most recently used
        tail = root[0]
        tail[1] = link
        root[0] = link
        link[0] = tail
        link[1] = root
        return result
96 |
97 |
class Counter(Dict[_K, int], Generic[_K]):
    """A dictionary of integer counts, where looking up
    a key that is not present yields zero"""

    def __missing__(self, key: _K) -> int:
        # Called by dict.__getitem__() for absent keys;
        # note that the key is not inserted into the dictionary
        return 0
103 |
104 |
class LFU_Cache(Generic[_K, _V]):

    """Least-frequently-used (LFU) cache for word lookups.
    Based on a pattern by Raymond Hettinger

    Results are stored by key, along with a count of the number of
    times each key has been looked up. When the number of entries
    exceeds maxsize, the least frequently used entries are purged.
    """

    def __init__(self, maxsize: int = LFU_DEFAULT) -> None:
        # Mapping of keys to results
        self.cache: Dict[_K, _V] = {}
        # Times each key has been accessed
        self.use_count: Counter[_K] = Counter()
        self.maxsize = maxsize
        self.hits = self.misses = 0
        # The cache may be accessed in parallel by multiple threads
        self.lock = threading.Lock()

    def lookup(self, key: _K, func: Callable[[_K], _V]) -> _V:
        """Lookup a key in the cache, calling func(key)
        to obtain the data if not already there.

        func is called while the cache lock is held. If it raises an
        exception, the exception is propagated to the caller and the
        state of the cache is left unchanged."""
        with self.lock:
            # Get cache entry or compute if not found
            try:
                result = self.cache[key]
            except KeyError:
                # The use count is only updated once func has returned,
                # so that a failing func cannot leave behind a count for
                # a key that has no cache entry
                result = func(key)
                self.cache[key] = result
                self.misses += 1
                self.use_count[key] += 1

                # Purge the 10% least frequently used cache entries
                # (but at least one entry, so that small caches are
                # also kept within bounds)
                if len(self.cache) > self.maxsize:
                    n_purge = max(1, self.maxsize // 10)
                    for stale_key, _ in nsmallest(
                        n_purge, self.use_count.items(), key=itemgetter(1)
                    ):
                        # Tolerate a key that is missing from either mapping
                        self.cache.pop(stale_key, None)
                        self.use_count.pop(stale_key, None)
            else:
                self.hits += 1
                self.use_count[key] += 1

            return result
144 |
145 |
# Define a type variable to allow MyPy to infer the relationship
# between intermediate types in cached and cached_property
_T = TypeVar("_T")

# Define a unique singleton for use as a sentinel; it is compared by
# identity in cached() to detect that no value has been stored yet
_NA = object()
152 |
153 |
def cached(func: Callable[..., _T]) -> Callable[..., _T]:
    """A decorator that calls the decorated function once and returns
    the result of that call from then on. The result is stored as the
    '_cache' attribute of the function itself and does not depend on
    the call arguments: all subsequent calls return the result of the
    first call, whatever arguments they are given."""

    @wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> _T:
        stored = getattr(func, "_cache", _NA)
        if stored is not _NA:
            # A result has already been stored
            return cast(_T, stored)
        # First call: compute the result and store it on the function
        computed = func(*args, **kwargs)
        setattr(func, "_cache", computed)
        return computed

    return wrapper
166 |
167 |
class cached_property(Generic[_T]):

    """A decorator for instance properties that are computed once, on
    first access, and then stored in the instance's __dict__"""

    def __init__(self, func: Callable[..., _T]) -> None:
        self.func = func
        # Expose the docstring of the wrapped function on the property
        self.__doc__ = getattr(func, "__doc__")

    def __get__(self, obj: Any, cls: Any) -> _T:
        if obj is None:
            # Accessed via the class, not an instance:
            # return the descriptor itself
            return cast(_T, self)  # Hack to satisfy mypy/Pylance
        # Compute the property value and store it in the instance's
        # dict, under the name of the original function
        value = self.func(obj)
        obj.__dict__[self.func.__name__] = value
        return value
183 |
--------------------------------------------------------------------------------
/src/reynir/config/Abbrev_errors.conf:
--------------------------------------------------------------------------------
1 |
2 | þ.á.m. = "þar á meðal" ao frasi # Algeng villa
3 | n.k.* = "næstkomandi" lo # Prentvilla, en talin nógu saklaus til að leyfa
4 |
--------------------------------------------------------------------------------
/src/reynir/config/AdjectivePredicates.conf:
--------------------------------------------------------------------------------
1 | # Greynir: Natural language processing for Icelandic
2 |
3 | # From Kristín Þóra Pétursdóttir's Master's thesis, http://hdl.handle.net/1946/17722,
4 | # with additions
5 |
6 | [adjective_predicates]
7 |
8 | aðgengilegur þgf
9 | andsnúinn þgf
10 | andstreymur þgf
11 | beinisamur þgf
12 | bundinn þgf
13 | feginn þgf
14 | fráhverfur þgf
15 | fyrirlitlegur þgf
16 | frændhollur þgf
17 | góðviljaður þgf
18 | haldinn þgf
19 | harmdauði þgf
20 | háður þgf
21 | óháður þgf
22 | hliðhollur þgf
23 | hollur þgf
24 | liðsinnaður þgf
25 | lokaður þgf
26 | mótdrægur þgf
27 | mótfallinn þgf
28 | mótgjarn þgf
29 | mótsnúinn þgf
30 | opinn þgf
31 | óbrigður þgf
32 | reiðubúinn þgf
33 | skæður þgf
34 | tilgefinn þgf
35 | vandabundinn þgf
36 | viljaður þgf
37 | vingæfur þgf
38 | vinhallur þgf
39 | vinhollur þgf
40 | vinveittur þgf
41 | vorkunnugur þgf
42 |
43 | alkunnugur þgf
44 | alkunnur þgf
45 | áþekkur þgf
46 | fjarlægur þgf
47 | fjarstæður þgf
48 | frábrugðinn þgf
49 | jafn þgf
50 | jafnaldra þgf
51 | jafnborinn þgf
52 | jafnfætis þgf
53 | jafngamall þgf
54 | jafnkosta þgf
55 | jafnkristinn þgf
56 | jafnliða þgf
57 | jafnlíkur þgf
58 | kunnugur þgf
59 | kynlíkur þgf
60 | ólíkur þgf
61 | líkur þgf
62 | málkunnugur þgf
63 | merktur þgf
64 | nafnkunnugur þgf
65 | náinn þgf
66 | nágöngull þgf
67 | nákominn þgf
68 | nálægur þgf
69 | nástæður þgf
70 | nærgengur þgf
71 | nærstandandi þgf
72 | nærri þgf
73 | ókunnur þgf
74 | sambærilegur þgf
75 | sameiginlegur þgf
76 | samferða þgf
77 | samhljóða þgf
78 | samhliða þgf
79 | samhuga þgf
80 | samkynja þgf
81 | ósamjafn þgf
82 | samjafn þgf
83 | samkvæmur þgf
84 | samlaga þgf
85 | samlendur þgf
86 | sammála þgf
87 | sammæddur þgf
88 | samnefndur þgf
89 | samsekur þgf
90 | samsíða þgf
91 | samskipa þgf
92 | samskóla þgf
93 | samstunda þgf
94 | samtengdur þgf
95 | samtíða þgf
96 | samvista þgf
97 | samþykkur þgf
98 | sifjaður þgf
99 | skaplíkur þgf
100 | óskyldur þgf
101 | skyldur þgf
102 | svipaður þgf
103 | tengdur þgf
104 | ótengdur þgf
105 | fasttengdur þgf
106 |
107 | alskipaður þgf
108 | ataður þgf
109 | auðráðinn þgf
110 | auðráður þgf
111 | áfastur þgf
112 | ánafnaður þgf
113 | blandaður þgf
114 | blandinn þgf
115 | byggður þgf
116 | búinn þgf
117 | eignaður þgf
118 | firrtur þgf
119 | gróinn þgf
120 | gyrtur þgf
121 | gæddur þgf
122 | heyrilegur þgf
123 | hjúpaður þgf
124 | hlaðinn þgf
125 | hulinn þgf
126 | innborinn þgf
127 | fagurskrýddur þgf
128 | falur þgf
129 | gildur þgf
130 | óheimill þgf
131 | heimill þgf
132 | kafhlaðinn þgf
133 | kafinn þgf
134 | klæddur þgf
135 | íklæddur þgf
136 | knúinn þgf
137 | litaður þgf
138 | rammskipaður þgf
139 | rúinn þgf
140 | skreyttur þgf
141 | sleginn þgf
142 | smurður þgf
143 | gersneyddur þgf
144 | sneyddur þgf
145 | sveipaður þgf
146 | sýnilegur þgf
147 | umvafinn þgf
148 | undanskilinn þgf
149 | undanþeginn þgf
150 | vaxinn þgf
151 | þakinn þgf
152 | vafinn þgf
153 | úðaður þgf
154 |
155 | ástfólginn þgf
156 | einhlítur þgf
157 | frábitinn þgf
158 | gagnlegur þgf
159 | hagfelldur þgf
160 | haldsamur þgf
161 | hjartfólginn þgf
162 | hugfelldur þgf
163 | hugleikinn þgf
164 | hugnæmur þgf
165 | hugstæður þgf
166 | hugþekkur þgf
167 | kær þgf
168 | ljós þgf
169 | leiður þgf
170 | maklegur þgf
171 | náttúrulegur þgf
172 | óskapfelldur þgf
173 | tamur þgf
174 | vandalaus þgf
175 |
176 | afhuga þgf
177 | andvígur þgf
178 | ástúðlegur þgf
179 | blíður þgf
180 | eftirlátur þgf
181 | fráskila þgf
182 | fylgisamur þgf
183 | fylginn þgf
184 | góður þgf
185 | góðviljaður þgf
186 | gramur þgf
187 | grimmur þgf
188 | handgenginn þgf
189 | harður þgf
190 | hjálplegur þgf
191 | hlynntur þgf
192 | hlýðinn þgf
193 | hægur þgf
194 | hættulegur þgf
195 | leiðitamur þgf
196 | miskunnsamur þgf
197 | mjúkur þgf
198 | reiður þgf
199 | skuldbundinn þgf
200 | traustur þgf
201 | ótrúr þgf
202 | trúr þgf
203 | ótryggur þgf
204 | tryggur þgf
205 | undirgefinn þgf
206 | undirlátur þgf
207 | viðbúinn þgf
208 | vondur þgf
209 | óþakklátur þgf
210 | þakklátur þgf
211 | þekkur þgf
212 | þýður þgf
213 | þægur þgf
214 | þægilegur þgf
215 | æfur þgf
216 |
217 | ástúðlegur /við þf
218 | blíður /við þf
219 | bundinn /við þf
220 | duglegur /við þf
221 | eftirlátur /við þf
222 | fráskila /við þf
223 | fylgisamur /við þf
224 | fylginn /við þf
225 | góður /við þf
226 | góðviljaður /við þf
227 | gramur /við þf
228 | grimmur /við þf
229 | handgenginn /við þf
230 | harður /við þf
231 | hjálplegur /við þf
232 | hlynntur /við þf
233 | hlýðinn /við þf
234 | hægur /við þf
235 | hættulegur /við þf
236 | leiðitamur /við þf
237 | miskunnsamur /við þf
238 | mjúkur /við þf
239 | reiður /við þf
240 | skuldbundinn /við þf
241 | traustur /við þf
242 | ótrúr /við þf
243 | trúr /við þf
244 | ótryggur /við þf
245 | tryggur /við þf
246 | undirgefinn /við þf
247 | undirlátur /við þf
248 | viðbúinn /við þf
249 | vondur /við þf
250 | óþakklátur /við þf
251 | þakklátur /við þf
252 | þekkur /við þf
253 | þýður /við þf
254 | þægur /við þf
255 | þægilegur /við þf
256 | æfur /við þf
257 |
258 | auðveldur þgf
259 | dýrmætur þgf
260 | eiginlegur þgf
261 | erfiður þgf
262 | óhagstæður þgf
263 | hagstæður þgf
264 | harðleikinn þgf
265 | óhentugur þgf
266 | hentugur þgf
267 | hættur þgf
268 | illur þgf
269 | nytsamur þgf
270 | nytsamlegur þgf
271 | skaðlaus þgf
272 | skaðlegur þgf
273 | skaðsamlegur þgf
274 | skaðsamur þgf
275 | skaðvænlegur þgf
276 | skaðvænn þgf
277 | torveldur þgf
278 | óþarfur þgf
279 | þarfur þgf
280 | þolanlegur þgf
281 |
282 | auðveldur /við þf
283 | auðveldur /fyrir þf
284 | dýrmætur /fyrir þf
285 | eiginlegur /fyrir þf
286 | erfiður /við þf
287 | erfiður /fyrir þf
288 | óhagstæður /fyrir þf
289 | hagstæður /fyrir þf
290 | harðleikinn /fyrir þf
291 | óhentugur /fyrir þf
292 | hentugur /fyrir þf
293 | hættur /við þf
294 | illur /við þf
295 | nytsamur /fyrir þf
296 | nytsamlegur /fyrir þf
297 | skaðlaus /fyrir þf
298 | skaðlegur /fyrir þf
299 | skaðsamlegur /fyrir þf
300 | skaðsamur /fyrir þf
301 | skaðvænlegur /fyrir þf
302 | skaðvænn /fyrir þf
303 | torveldur /fyrir þf
304 | óþarfur /fyrir þf
305 | þarfur /fyrir þf
306 | þolanlegur /fyrir þf
307 |
308 | viðriðinn þf
309 | viðstaddur þf
310 | viðloðandi þf
311 | viðloðinn þf
312 | # varðandi þf # Virðist vera gripið með so_lh_nt
313 |
314 | verður ef
315 | fullur ef
316 | frjáls ef # frjáls ferða sinna, frjáls skoðana sinna
317 | fullviss ef # þess fullviss að...
318 | meðvitaður ef # þess meðvitaður að...
319 |
320 | # að/af errors
321 |
322 | auðugur /að þgf
323 | auðugur /af þgf $error(WRONG-PP, að)
324 | kunnur /að þgf
325 | kunnur /af þgf $error(WRONG-PP, að)
326 | ólétt /að þgf
327 | ólétt /af þgf $error(WRONG-PP, að)
328 | ófrísk /að þgf
329 | ófrísk /af þgf $error(WRONG-PP, að)
330 | vanfær /að þgf
331 | vanfær /af þgf $error(WRONG-PP, að)
332 | rammur /að þgf
333 | rammur /af þgf $error(WRONG-PP, að)
334 | uppvís /að þgf
335 | uppvís /af þgf $error(WRONG-PP, að)
336 | ríkur /að þgf
337 | ríkur /af þgf $error(WRONG-PP, að)
338 | snauður /að þgf
339 | snauður /af þgf $error(WRONG-PP, að)
340 | þekktur /að þgf
341 | þekktur /af þgf $error(WRONG-PP, að)
342 |
--------------------------------------------------------------------------------
/src/reynir/config/GreynirEngine.conf:
--------------------------------------------------------------------------------
1 | #
2 | # GreynirEngine.conf
3 | #
4 | # Configuration file for GreynirEngine ('reynir' on PyPI)
5 | #
6 | # Copyright © 2023 Miðeind ehf
7 | #
8 | # This software is licensed under the MIT License:
9 | #
10 | # Permission is hereby granted, free of charge, to any person
11 | # obtaining a copy of this software and associated documentation
12 | # files (the "Software"), to deal in the Software without restriction,
13 | # including without limitation the rights to use, copy, modify, merge,
14 | # publish, distribute, sublicense, and/or sell copies of the Software,
15 | # and to permit persons to whom the Software is furnished to do so,
16 | # subject to the following conditions:
17 | #
18 | # The above copyright notice and this permission notice shall be
19 | # included in all copies or substantial portions of the Software.
20 | #
21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 |
29 | # This file is read in the package's __init__.py, via the settings.py module.
30 | # It omits settings that are not relevant to Greynir in its package form,
31 | # such as web server and database configuration.
32 |
33 | [settings]
34 |
35 | debug = false
36 |
37 | # Phrases.conf should be included before Prepositions.conf
38 |
39 | $include Phrases.conf
40 |
41 | $include Prepositions.conf
42 |
43 | $include Prefs.conf
44 |
45 | $include Names.conf
46 |
47 | $include Verbs.conf
48 |
49 | $include Adjectives.conf
50 |
51 | $include AdjectivePredicates.conf
52 |
--------------------------------------------------------------------------------
/src/reynir/config/Names.conf:
--------------------------------------------------------------------------------
1 |
2 | # Greynir: Natural language processing for Icelandic
3 |
4 | # Additional information and configuration for person names
5 |
6 | # Copyright © 2023 Miðeind ehf.
7 |
8 | [disallowed_names]
9 |
10 | # Orðmyndir sem eru ekki teknar gildar sem byrjun nafna í því falli sem gefið er
11 |
12 | Almarr þf þgf ef
13 | Annar nf þf ef # Nánast alltaf fornafnið
14 | Annarr þf þgf ef # Nánast alltaf fornafnið
15 | Ara nf
16 | Án nf þf
17 | Ásti þf þgf ef
18 | Birnir þf þgf
19 | Bjarna nf
20 | Elína þf þgf
21 | Ernir þf þgf
22 | Donaldur þf þgf ef
23 | Finn þf þgf ef
24 | Fjalarr þf þgf ef
25 | Frár þf þgf
26 | Gamli nf þf þgf ef # Nánast alltaf lýsingarorð
27 | Gerða þf þgf ef
28 | Grein nf þf þgf
29 | Guðna nf
30 | Guðmund nf þf þgf ef
31 | Guðrúnn þf
32 | Gunnur ef
33 | Harald nf
34 | Heiðarr þf þgf ef
35 | Hildir þf þgf
36 | Hnikarr þf þgf ef
37 | Ísarr þf þgf ef
38 | Ísrael nf þg þgf ef # Nánast alltaf örnefnið
39 | Karli nf
40 | Konráður þf þgf ef
41 | Kristína þf þgf ef
42 | Leif þf þgf ef
43 | Minning nf þf þgf ef # Villa í BÍN?
44 | Oddnýr þf þgf
45 | Ormarr þf þgf ef
46 | Óttarr þf þgf ef
47 | Rögn ef
48 | Sali þf þgf ef
49 | Sigmund þf þgf ef
50 | Sigurð þf þgf ef
51 | Skúla nf
52 | Sólveigur þf þgf
53 | Steinarr þf þgf ef
54 | Styrr þf þgf ef
55 | Sævarr þf þgf ef
56 | Vörður þgf
57 | Ýrr þf þgf ef
58 | Þórr þf þgf ef
59 | Ævarr þf þgf ef
60 | Örvarr þf þgf ef
61 |
62 |
63 | # Margræð orð sem skilja á sem nöfn í byrjun setningar þó þau standi ein
64 |
65 | [name_preferences]
66 |
67 | Aðalberg
68 | Aðalbergi
69 | Aðalbergs
70 | Aðalbjargar
71 | Aðalbjörg
72 | Aðalráð
73 | Aðalráði
74 | Aðalráðs
75 | Agata
76 | Agða
77 | Agga
78 | Aggi
79 | Agli
80 | Agna
81 | Agnar
82 | Akurrós
83 | Akurrósar
84 | Alba
85 | Alda
86 | Andra
87 | Andri
88 | Ara
89 | Ari
90 | Arnar
91 | Arngeir
92 | Arngeiri
93 | Arngeirs
94 | Aski
95 | Asks
96 | Askur
97 | Aspar
98 | Assa
99 | Atla
100 | Atlas
101 | Atlasar
102 | Atlass
103 | Atli
104 | #Auðar
105 | Auðari
106 | Auðnu
107 | Ámunda
108 | Áni
109 | Áns
110 | Árbót
111 | Árbótar
112 | Árdís
113 | Árdísar
114 | Ármann
115 | Ármanni
116 | Ármanns
117 | #Árnes # Mannsnafn??
118 | #Árnesi # Mannsnafn??
119 | Ársól
120 | Ársólar
121 | Ársæl
122 | Ársælar
123 | Ársæli
124 | Ársæll
125 | Ársæls
126 | Árveig
127 | Árveigar
128 | Ásmund
129 | Ásmundar
130 | Ásmundi
131 | Ásmundur
132 | Ástráð
133 | Ástráði
134 | Ástráðs
135 | Ástríður
136 | Baldri
137 | Baldur
138 | Baldurs
139 | Barbara
140 | Barra
141 | #Bassa
142 | #Bassi
143 | Bella
144 | Benediktína
145 | Benta
146 | Bents
147 | Bergdís
148 | Bergdísar
149 | Berglind
150 | Berglindar
151 | Bergstein
152 | Bergsteina
153 | Bergsteini
154 | Bergsteinn
155 | Bergsteins
156 | Bersa
157 | Bersi
158 | Beru
159 | Bessa
160 | Bessi
161 | Beta
162 | Betu
163 | Birkir
164 | Birna
165 | Birnis
166 | Birnu
167 | Birtingi
168 | Birtings
169 | Birtingur
170 | Bjarglindar
171 | Bjarglindi
172 | Bjólan
173 | Blöku
174 | Bos
175 | Braga
176 | Bragi
177 | Brand
178 | Brandi
179 | Brandís
180 | Brands
181 | Brár
182 | Breka
183 | Breki
184 | Bretting
185 | Brima
186 | Brimar
187 | Brimari
188 | Brimi
189 | Bubba
190 | #Burkna
191 | #Burkni
192 | Bæring
193 | Böðvar
194 | Dagfara
195 | Dagfari
196 | Dagga
197 | Daggar
198 | Dagheiðar
199 | Dagheiði
200 | Dagheiður
201 | Dagmey
202 | Dagmeyjar
203 | Dagmeyju
204 | Dalrós
205 | Dalrósar
206 | Danna
207 | Danni
208 | Darra
209 | Darri
210 | Davíða
211 | Davíðu
212 | Dofra
213 | Dofri
214 | Drafnar
215 | Draumrún
216 | Draumrúnar
217 | Droplaug
218 | Droplaugar
219 | Dröfn
220 | Dúa
221 | Dúi
222 | Dæju
223 | Döllu
224 | Ebba
225 | Ebbi
226 | Edda
227 | Eddu
228 | Editar
229 | Eiðunnar
230 | Eiðunni
231 | Eiðvarar
232 | Eiðvör
233 | Eiðvöru
234 | Eldborg
235 | Eldborgar
236 | Eldey
237 | Eldeyjar
238 | Eldeyju
239 | Elfa
240 | Elfar
241 | Elfi
242 | Elfu
243 | Elliða
244 | Elliði
245 | Elna
246 | Emma
247 | Emmu
248 | Erla
249 | Erlar
250 | Erlu
251 | Erna
252 | Erni
253 | Ernu
254 | Esja
255 | Esju
256 | Eski
257 | Etna
258 | Etnu
259 | Eygló
260 | Eyglóar
261 | Fía
262 | Fjalar
263 | Fjölvar
264 | Fjölvari
265 | Fjölvars
266 | Fjörni
267 | Fjörnir
268 | Fjörnis
269 | Flosa
270 | Flosi
271 | Fransiska
272 | Fransisku
273 | Friðgerðar
274 | Friðmann
275 | Friðmanni
276 | Friðmanns
277 | Fúsa
278 | Fúsi
279 | Fylkis
280 | Galti
281 | Garra
282 | Garri
283 | Gassa
284 | Gassi
285 | Gaut
286 | Gauta
287 | Gauti
288 | Gauts
289 | Gautur
290 | Gefn
291 | Geir
292 | Geirs
293 | Gelli
294 | Gellir
295 | Gellis
296 | Gerðari
297 | Gígja
298 | Gígjari
299 | Gígju
300 | Glóey
301 | Glóeyjar
302 | Gnúp
303 | Gnúpi
304 | Gnúps
305 | Gnúpur
306 | #Góa
307 | #Góu
308 | Grana
309 | Grani
310 | Greipi
311 | Greips
312 | Greipur
313 | Grettis
314 | Grétu
315 | Grímar
316 | Grími
317 | Gudda
318 | Guddu
319 | Gullbrá
320 | Gullbrár
321 | Gullveig
322 | Gullveigar
323 | Gumma
324 | Gummi
325 | Gunnar
326 | Gunni
327 | Gunnlaðar
328 | Gunnlöð
329 | Gunnur
330 | Gylfa
331 | Gylfi
332 | Gyrðis
333 | Gýgjar
334 | Gými
335 | Gýmir
336 | Gýmis
337 | Hadd
338 | Hadda
339 | Haddar
340 | Haddi
341 | Hadds
342 | Haðar
343 | Hafborg
344 | Hafborgar
345 | Hafdís
346 | Hafdísar
347 | Hafliða
348 | Hafliði
349 | Hafnari
350 | #Haföldu
351 | Hall
352 | Hansa
353 | Harra
354 | Harri
355 | Hauður
356 | Hauk
357 | Hávar
358 | Hedda
359 | Heiðmann
360 | Heiðmanni
361 | Heiðmanns
362 | Heimis
363 | Hein
364 | Hekla
365 | Heklu
366 | Helma
367 | Helmu
368 | Herborg
369 | Herborgar
370 | Hergarð
371 | Hergarði
372 | Hergarðs
373 | Herjólf
374 | Herjólfi
375 | Herjólfs
376 | Herjólfur
377 | Hermann
378 | Hermanni
379 | Hermanns
380 | Hersi
381 | Hersir
382 | Hersis
383 | Héðin
384 | Héðinn
385 | Héðins
386 | Héðni
387 | Hilda
388 | Hildar
389 | Hildi
390 | Hildir
391 | Hildis
392 | Hildur
393 | Hilmi
394 | Hilmir
395 | Hilmis
396 | Hjalta
397 | Hjalti
398 | Hjartar
399 | Hjálmrún
400 | Hjálmrúnar
401 | Hjört
402 | Hjörvar
403 | Hleinar
404 | Hlífari
405 | Hlín
406 | Hlínar
407 | Hlöð
408 | Hlöður
409 | Hraunar
410 | Hróa
411 | Hróar
412 | Hrói
413 | Hrund
414 | Hrundar
415 | Hugborg
416 | Hugborgar
417 | Huld
418 | #Huldar
419 | Huldari
420 | Höddu
421 | Höð
422 | Inga
423 | Ingi
424 | Innu
425 | Irpa
426 | Irpu
427 | Íma
428 | Ími
429 | Ímu
430 | Íris
431 | Írisar
432 | Ísafold
433 | Ísafoldar
434 | Ísbjörg
435 | Ísey
436 | Íseyjar
437 | Íseyju
438 | Ísfold
439 | Ísfoldar
440 | Ísgerðar
441 | Íslilja
442 | Íslilju
443 | Jakobína
444 | Jan
445 | Jans
446 | Jara
447 | Járnbrá
448 | Járnbrár
449 | Jóa
450 | Jóanna
451 | Jódís
452 | Jódísar
453 | Jón
454 | Jóna
455 | Jónanna
456 | Jóni
457 | Jónu
458 | Jórunn
459 | Jórunnar
460 | Júlla
461 | Júllu
462 | Jústa
463 | Jústu
464 | Jöru
465 | #Kalla # Kalla þurfti til lögreglu ...
466 | Kalli
467 | Kamilla
468 | Kamillu
469 | Kamma
470 | #Kana
471 | #Kani
472 | Kara
473 | Kata
474 | Katla
475 | Katli
476 | Kára
477 | Kári
478 | Ketil
479 | Ketill
480 | Ketils
481 | Kidda
482 | Kiddi
483 | Kiljan
484 | Kjalar
485 | Kolbrún
486 | Kolbrúnar
487 | Kolbrúnu
488 | Kolur
489 | Kolþerna
490 | Kolþernu
491 | Krissa
492 | Krissi
493 | Kristmann
494 | Kristmanni
495 | Kristmanns
496 | Köllu
497 | Lana
498 | Lara
499 | Lasarus
500 | Lasarusar
501 | Lasarusi
502 | Laufar
503 | Lár
504 | Lára
505 | Lárs
506 | Leif
507 | Leifi
508 | Leó
509 | Leós
510 | Lill
511 | Lillar
512 | Linnar
513 | Línar
514 | Líneik
515 | Líneikur
516 | Lofn
517 | Lofnar
518 | Lotta
519 | Lottu
520 | Lyngheiðar
521 | Lyngheiði
522 | Lýra
523 | Lýru
524 | Maja
525 | Makan
526 | Makans
527 | Malinu
528 | Manga
529 | Mangi
530 | Mardallar
531 | Mardöll
532 | Marías
533 | Maríuerla
534 | Maríuerlu
535 | Marjas
536 | Marsa
537 | Mánadís
538 | Mánadísar
539 | Mist
540 | Mistar
541 | Mími
542 | Mímir
543 | Mortína
544 | Mortínu
545 | Móna
546 | Mónu
547 | Muggi
548 | Myrra
549 | Myrru
550 | Nanna
551 | Natans
552 | Nikulásar
553 | Nikulási
554 | Njála
555 | Njálu
556 | Njóla
557 | Njólu
558 | Nóa
559 | Nói
560 | Nóna
561 | #Nóni
562 | Nóra
563 | Nóru
564 | Nóu
565 | Núp
566 | Núpan
567 | Núpi
568 | Núps
569 | Núpur
570 | Nökkva
571 | Nökkvi
572 | Nönnu
573 | Oddbjargar
574 | Oddbjörg
575 | Oddhildar
576 | Oddvari
577 | Olla
578 | Orra
579 | Orri
580 | Otra
581 | Otri
582 | Otur
583 | Oturs
584 | Óðinn
585 | Óðrík
586 | Óðríki
587 | Óðríks
588 | Óðríkur
589 | Órækja
590 | Órækju
591 | Ósvífri
592 | Ósvífur
593 | Pál
594 | Pála
595 | Páli
596 | Páll
597 | Pálmari
598 | Pálmu
599 | Páls
600 | Pers
601 | Pésa
602 | Pési
603 | Pétri
604 | Pétur
605 | Péturs
606 | Regin
607 | Reginbjörg
608 | Reinar
609 | Reynis
610 | Reyrs
611 | Rikka
612 | Ritu
613 | Rín
614 | Rínar
615 | Ríta
616 | Rósalinda
617 | Rósalindar
618 | Rósalindi
619 | Rósalín
620 | Rósanna
621 | Rósfríð
622 | Rósfríðar
623 | Rósfríði
624 | Rósfríður
625 | Rósu
626 | Rúnari
627 | Rúnu
628 | Röðli
629 | Röðul
630 | Röðull
631 | Röðuls
632 | Salvar
633 | Seif
634 | Seifi
635 | Selju
636 | Sigga
637 | Siggi
638 | Siggu
639 | Sigmann
640 | Sigmanni
641 | Sigmanns
642 | Signari
643 | Signu
644 | Sigurrún
645 | Sigurrúnar
646 | Sigurstein
647 | Sigursteina
648 | Sigursteini
649 | Sigursteinn
650 | Sigursteins
651 | Sigurvarðar
652 | Sigurvarði
653 | Silla
654 | Sindra
655 | Sindri
656 | Síta
657 | Sjafnar
658 | Sjóborg
659 | Sjóborgar
660 | #Skafta
661 | Skírnir
662 | Skæring
663 | Smyril
664 | Smyrill
665 | Smyrils
666 | Smyrli
667 | Snærós
668 | Snærósar
669 | Sólbrá
670 | Sólbrár
671 | Sólbrún
672 | Sólbrúnar
673 | Sólbrúnu
674 | Sóldaggar
675 | Sóldís
676 | Sóldísar
677 | Sóldögg
678 | Sólrún
679 | Sólrúnar
680 | Stefnis
681 | Steinborg
682 | Steinborgar
683 | Steinka
684 | Steinku
685 | Stella
686 | Stellu
687 | Styrmi
688 | Styrmir
689 | Sumarliða
690 | Sumarliði
691 | Sumarlín
692 | Sunna
693 | Sunnu
694 | Svarthöfða
695 | Svarthöfði
696 | Sverri
697 | Sverrir
698 | Sverris
699 | Svía
700 | Sæbirni
701 | Sæbjarnar
702 | Sæbjörn
703 | Sæborg
704 | Sæborgar
705 | Sædís
706 | Sædísar
707 | Sæfinn
708 | Sæmu
709 | Sölva
710 | Sölvar
711 | Sölvi
712 | Sörla
713 | Sörli
714 | Tandra
715 | Tandri
716 | Teit
717 | Teits
718 | Teitur
719 | Tila
720 | Tíbrá
721 | Tíbrár
722 | Tína
723 | Tínu
724 | Todda
725 | Tór
726 | Tórs
727 | Tóta
728 | Tóti
729 | Tótu
730 | Trjámann
731 | Trjámanni
732 | Trjámanns
733 | Trúmann
734 | Trúmanni
735 | Trúmanns
736 | Tyrfing
737 | Tý
738 | Týr
739 | Týs
740 | Unnari
741 | Urður
742 | Úlfhéðin
743 | Úlfhéðinn
744 | Úlfhéðins
745 | Úlfhéðni
746 | Úlla
747 | Vagnborg
748 | Vagnborgar
749 | Valbjarkar
750 | Valbjörk
751 | Valborg
752 | Valborgar
753 | Valdís
754 | Valdísar
755 | Valur
756 | Vatnar
757 | Veigalín
758 | Veigalínar
759 | Veigs
760 | Verónika
761 | Veróniku
762 | Veturliða
763 | Veturliði
764 | Viðar
765 | Vigni
766 | Vignir
767 | Vignis
768 | Virgil
769 | Virgill
770 | Virgils
771 | Virgli
772 | Vífil
773 | Vífill
774 | Vífils
775 | Vífli
776 | Vordís
777 | Vordísar
778 | Vögnu
779 | Völund
780 | Völundar
781 | Völundi
782 | Völundur
783 | Yngva
784 | Yngvar
785 | Yngvi
786 | Yrja
787 | Yrju
788 | Yrsa
789 | Yrsu
790 | Ými
791 | Ýmir
792 | Ýr
793 | Ýrar
794 | Ýri
795 | Þallar
796 | Þengil
797 | Þengill
798 | Þengils
799 | Þengli
800 | Þiðrandi
801 | Þjóðvarðar
802 | Þormar
803 | Þrastar
804 | Þráinn
805 | Þránd
806 | Þrándar
807 | Þrándi
808 | Þrándur
809 | Þresti
810 | Þrym
811 | Þrymi
812 | Þryms
813 | Þrymur
814 | Þura
815 | Þuru
816 | Þyri
817 | Þöll
818 | Æsu
819 | Ævar
820 | Ögðu
821 | Öggu
822 | Ölbu
823 | Ölni
824 | Ölnir
825 | Ölrún
826 | Ölrúnar
827 | Ölveig
828 | Ölveigar
829 | Ölvi
830 | Ölvir
831 | Össu
832 | Össur
833 |
--------------------------------------------------------------------------------
/src/reynir/config/NounPredicates.conf:
--------------------------------------------------------------------------------
1 | # Greynir: Natural language processing for Icelandic
2 |
3 | # Copyright © 2023 Miðeind ehf
4 |
5 | # Work in progress; handling of this data has not
6 | # been implemented as of yet.
7 |
8 | afborgun /af þgf $error(PP, á)
9 | afborgun /á þgf
10 | affall /af þgf
11 | affall /að þgf $error(AÐAF, af)
12 | afskriftir /af þgf
13 | afskriftir /að þgf $error(AÐAF, af)
14 | eftirsjá /af þgf $error(AÐAF, að)
15 | eftirsjá /að þgf
16 | frásögn /af þgf
17 | frásögn /um þf $error(PP-ALL, /af þgf)
18 | fyrirmynd /af þgf $error(AÐAF, að)
19 | fyrirmynd /að þgf
20 | færi /á þgf
21 | hlutdeild /að þgf $error(PP, í)
22 | hlutdeild /í þgf
23 | hætta /á þgf
24 | höfundur /af þgf $error(AÐAF, að)
25 | höfundur /að þgf
26 | hús /við þf
27 | húsnæði /við þf
28 | innsýn /inn_í þf $error(PP, í)
29 | innsýn /í þf
30 | karl /í þf # karl í krapinu, karlinn í tunglinu
31 | kaupandi /af þgf $error(AÐAF, að)
32 | kaupandi /að þgf
33 | lykill /af þgf $error(AÐAF, að)
34 | lykill /að þgf
35 | meðferð /gegn þgf $error(PP, við)
36 | meðferð /við þgf
37 | ofnæmi /fyrir þgf
38 | ofnæmi /gegn þgf $error(PP, fyrir)
39 | sveifla /á þgf $error(PP, í)
40 | sveifla /í þgf
41 | teikning /af þgf $error(AÐAF, að)
42 | teikning /að þgf
43 | tækifæri /til ef
44 | tækifæri /á þgf $error(PP-ALL, /til ef)
45 | uppdráttur /af þgf $error(AÐAF, að)
46 | uppdráttur /að þgf
47 | uppskrift /af þgf $error(AÐAF, að)
48 | uppskrift /að þgf
49 | uppástunga /að þgf $error(PP-ALL, /um þf)
50 | uppástunga /um þf
51 | virðing /fyrir þgf
52 | virðing /við þf $error(PP-ALL, /fyrir þgf)
53 | vitni /af þgf $error(AÐAF, að)
54 | vitni /að þgf
55 | vörn /gegn þgf
56 | vörn /við þgf $error(PP, gegn)
57 | áfangi /af þgf $error(AÐAF, að)
58 | áfangi /að þgf
59 | áhyggjur /af þgf
60 | áhyggjur /að þgf $error(AÐAF, af)
61 | áhætta /á þgf $error(ALL, hætta, /á þf)
62 | áskrifandi /af þgf $error(AÐAF, að)
63 | áskrifandi /að þgf
64 | ávöxtun /á þf $error(PP-TO-OBJ, ef)
65 | ávöxtun /á þgf $error(PP-TO-OBJ, ef)
66 | útgáfa /af þgf
67 | útgáfa /að þgf $error(AÐAF, af)
68 |
--------------------------------------------------------------------------------
/src/reynir/config/Prepositions.conf:
--------------------------------------------------------------------------------
1 |
2 | # Greynir: Natural language processing for Icelandic
3 |
4 | # Copyright © 2023 Miðeind ehf.
5 |
6 | # Prepositions.conf
7 |
8 | # Forsetningar
9 |
10 | # Forsetningar merktar með 'nh' geta staðið á
11 | # undan sagnlið í nafnhætti:
12 | # 'Beiðnin um að handtaka manninn var send lögreglunni'
13 |
14 | # Stjörnumerktar forsetningar geta komið fyrir í
15 | # [ambiguous_phrases] kaflanum í Phrases.conf, þannig að
16 | # þær hafi áhrif til niðurskurðar mögulegra merkinga.
17 | # Þær verða að vera merktar sem forsetningar ('fs') í BÍN.
18 |
19 | [prepositions]
20 |
21 | að* þgf
22 | af* þgf nh
23 | allfjarri þgf
24 | andspænis þgf
25 | andstætt þgf
26 | auk ef
27 | austan ef
28 | austur þf
29 | á* þf nh
30 | á* þgf
31 | án ef
32 | árla ef
33 | ásamt þgf
34 | bak þgf
35 | eftir* þf nh
36 | eftir* þgf
37 | fjarri þgf
38 | fjær þgf
39 | fram þf
40 | frá þgf
41 | fyrir* þf nh
42 | fyrir* þgf
43 | gagnstætt þgf
44 | gagnvart þgf
45 | gegn þgf
46 | gegnt þgf
47 | gegnum þf nh
48 | handa þgf
49 | handan ef
50 | hjá þgf
51 | inn þf nh
52 | innan ef
53 | í* þf nh
54 | í* þgf
55 | jafnframt þgf
56 | jafnhliða þgf
57 | kring þgf
58 | kringum þf nh
59 | með* þf nh
60 | með* þgf
61 | meðal ef
62 | meðfram þgf
63 | meður þgf
64 | milli ef
65 | millum ef
66 | mót þgf
67 | móti þgf
68 | nálægt þgf
69 | neðan ef
70 | niður þf
71 | norðan ef
72 | norður þf
73 | nær þgf
74 | nærri þgf
75 | næst þgf
76 | #næstum nf # Frekar eo!
77 | #of
78 | ofan ef
79 | ofar þgf # 'ofar hverri kröfu'
80 | óháð þgf # 'hefðu alltaf greitt óháð nauðasamningi'
81 | ólíkt þgf # 'þeir fá enga styrki ólíkt frambjóðendum til þings'
82 | órafjarri þgf
83 | sakir ef
84 | samanber þf nh # 'samanber yfirlýsingu ríkisstjórnarinnar frá 3. júní'
85 | samfara þgf
86 | samhliða þgf
87 | samkvæmt þgf
88 | sammála þgf
89 | samsíðis þgf
90 | samskipa þgf
91 | samstíga þgf
92 | samtímis þgf
93 | #sem nf # 'í krafti stöðu minnar sem leikhússtjóri'
94 | #sem þf # 'margir hafa hvatt mig til að bjóða mig fram sem forseta Íslands'
95 | #síðan
96 | síðla ef
97 | snemma ef
98 | suður þf
99 | sunnan ef
100 | sökum ef
101 | til* ef nh # 'tilraunir til að skilgreina vandann'
102 | um* þf nh
103 | umfram þf nh
104 | umhverfis þf
105 | undan þgf # !!! á undan
106 | undir þf
107 | undir þgf
108 | upp þf # !!! upp á
109 | # !!! Note: In Verbs.conf, there are several instances of '/upp þgf',
110 | # !!! but 'upp' is not defined here as a preposition with a dative (þgf) argument.
111 | utan ef
112 | úr þgf # !!! upp úr
113 | út þf
114 | varðandi þf
115 | vegna ef
116 | vestan ef
117 | vestur þf
118 | víðsfjarri þgf
119 | við* þf nh
120 | við* þgf # Hard-coded in reducer.py to have less priority than við + þf
121 | yfir* þf nh
122 | yfir* þgf # yfir honum var helgisvipur
123 |
124 | # Multiword prepositions
125 | # These phrases should also be included in Phrases.conf,
126 | # in most cases as 'ao frasi'
127 | # Note that these prepositions can be associated with verbs
128 | # in Verbs.conf using underscores, for example
129 | # 'keppa /fyrir_hönd ef'
130 |
131 | fyrir aftan þf
132 | fyrir austan þf
133 | fyrir framan þf
134 | fyrir handan þf
135 | fyrir innan þf
136 | fyrir neðan þf
137 | fyrir norðan þf
138 | fyrir ofan þf
139 | fyrir sunnan þf
140 | fyrir utan þf
141 | fyrir vestan þf
142 | fyrir hönd ef
143 | #á móti þgf
144 | #á eftir þgf
145 | #á undan þgf
146 | #á meðal ef
147 | #á milli ef
148 | #á hendur þgf
149 | #á fætur þgf
150 | í kringum þf
151 | í gegnum þf
152 | fyrir sakir ef
153 | á móts við þf
154 | innan við þf
155 | samanborið við þf
156 | #miðað við þf
157 | með tilliti til ef
158 | þrátt fyrir þf
159 | það sem af er þgf
160 | það sem eftir er ef
161 | til og frá þgf
162 | upp úr þgf
163 | þvert á þf
164 | austur fyrir þf
165 | vestur fyrir þf
166 | norður fyrir þf
167 | suður fyrir þf
168 | skömmu fyrir þf
169 | skömmu eftir þf
170 | örskömmu fyrir þf
171 | örskömmu eftir þf
172 |
173 | # Other multiword prepositional phrases that
174 | # were written in one word but have been split up.
175 | # This information is still needed to know which
176 | # case the composite preposition governs.
177 |
178 | austan undir þf
179 | fram undir þf # 'fram undir kvöld'
180 | innan undir þf nh
181 | út undan þgf
182 | út yfir þf
183 |
184 | # Ambiguous erroneous multiword prepositions
185 | # Should be disambiguated into different things
186 | # based on what case they govern.
187 |
188 | # 'fram á eyrina'
189 | frammá þf nh $error(FORM-fram_á)
190 | # 'frammi á gangi'
191 | frammá þgf $error(FORM-frammi_á)
192 | # 'fram í hellinn'
193 | frammí þf $error(FORM-fram_í)
194 | # 'frammi í bílnum'
195 | frammí þgf $error(FORM-frammi_í)
196 | # 'inn á völlinn'
197 | inná þf nh $error(FORM-inn_á)
198 | # 'inni á vellinum'
199 | inná þgf $error(FORM-inni_á)
200 | # 'inn í hellinn'
201 | inní þf nh $error(FORM-inn_í)
202 | # 'inni í hellinum'
203 | inní þgf $error(FORM-inni_í)
204 | # 'niður á lækjarbakkann'
205 | niðrá þf nh $error(FORM-niður_á)
206 | # 'niðri á gólfinu'
207 | niðrá þgf $error(FORM-niðri_á)
208 | # 'niður í myrkrið'
209 | niðrí þf nh $error(FORM-niður_í)
210 | # 'niðri í myrkrinu'
211 | niðrí þgf $error(FORM-niðri_í)
212 | # 'upp á hestinn'
213 | uppá þf $error(FORM-upp_á)
214 | # 'uppi á borðinu'
215 | uppá þgf $error(FORM-uppi_á)
216 | # 'upp í bílinn'
217 | uppí þf $error(FORM-upp_í)
218 | # 'uppi í kastalanum'
219 | uppí þgf $error(FORM-uppi_í)
220 | # 'út á ystu nöf'
221 | útá þf nh $error(FORM-út_á)
222 | # 'úti á túninu'
223 | útá þgf $error(FORM-úti_á)
224 | # 'út í laugina'
225 | útí þf nh $error(FORM-út_í)
226 | # 'úti í náttúrunni'
227 | útí þgf $error(FORM-úti_í)
228 |
229 | # Compound prepositions that should be split into two words
230 |
231 | alltað þgf $error(FORM-allt_að)
232 | austanundir þf $error(FORM-austan_undir)
233 | framhjá þgf $error(FORM-fram_hjá)
234 | framundir þf $error(FORM-fram_undir) # 'framundir kvöld munu björgunarsveitir aðstoða fólk'
235 | innanum þf nh $error(FORM-innan_um)
236 | innanundir þf nh $error(FORM-innan_undir)
237 | innum þf nh $error(FORM-inn_um)
238 | útaf þgf $error(FORM-út_af)
239 | útundan þgf $error(FORM-út_undan)
240 | útúr þgf $error(FORM-út_úr)
241 | útyfir þf $error(FORM-út_yfir)
242 |
243 |
--------------------------------------------------------------------------------
/src/reynir/eparser.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | C++ Earley parser module
6 |
7 | Copyright © 2023 Miðeind ehf.
8 |
9 | This software is licensed under the MIT License:
10 |
11 | Permission is hereby granted, free of charge, to any person
12 | obtaining a copy of this software and associated documentation
13 | files (the "Software"), to deal in the Software without restriction,
14 | including without limitation the rights to use, copy, modify, merge,
15 | publish, distribute, sublicense, and/or sell copies of the Software,
16 | and to permit persons to whom the Software is furnished to do so,
17 | subject to the following conditions:
18 |
19 | The above copyright notice and this permission notice shall be
20 | included in all copies or substantial portions of the Software.
21 |
22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 |
30 | This module implements an optimized Earley parser in C++.
31 | It is designed to be called from Python code with
32 | already parsed and packed grammar structures.
33 |
34 | The Earley parser used here is the improved version described by Scott & Johnstone,
35 | referencing Tomita. This allows worst-case cubic (O(n^3)) order, where n is the
36 | length of the input sentence, while still returning all possible parse trees
37 | for an ambiguous grammar.
38 |
39 | See Elizabeth Scott, Adrian Johnstone:
40 | "Recognition is not parsing — SPPF-style parsing from cubic recognisers"
41 | Science of Computer Programming, Volume 75, Issues 1–2, 1 January 2010, Pages 55–70
42 |
43 | */
44 |
45 | #include
46 | #include
47 | #include
48 |
49 |
50 | // Assert macro
51 | #ifdef DEBUG
52 | #define ASSERT(x) assert(x)
53 | #else
54 | #define ASSERT(x)
55 | #endif
56 |
57 |
58 | typedef unsigned int UINT;
59 | typedef int INT;
60 | typedef wchar_t WCHAR;
61 | typedef char CHAR;
62 | typedef unsigned char BYTE;
63 | typedef bool BOOL;
64 |
65 |
66 | class Production;
67 | class Parser;
68 | class State;
69 | class Column;
70 | class NodeDict;
71 | class Label;
72 | struct StateChunk;
73 |
74 |
75 | class AllocCounter {
76 |
77 | // Instance counter for an instrumented class. Declare one as a
78 | // static member of the watched class (conventionally named 'ac'),
79 | // then invoke ac++ from that class's constructor and ac-- from
80 | // its destructor. The two tallies and their difference can be
81 | // queried through the accessors below.
82 |
83 | private:
84 |
85 | UINT m_nAllocs; // Number of ac++ calls so far
86 | UINT m_nFrees; // Number of ac-- calls so far
87 |
88 | public:
89 |
90 | AllocCounter()
91 | : m_nAllocs(0), m_nFrees(0)
92 | { }
93 | ~AllocCounter()
94 | { }
95 |
96 | void operator++(int)
97 | { ++m_nAllocs; }
98 | void operator--(int)
99 | {
100 | ASSERT(m_nFrees < m_nAllocs);
101 | ++m_nFrees;
102 | }
103 | UINT numAllocs() const
104 | { return m_nAllocs; }
105 | UINT numFrees() const
106 | { return m_nFrees; }
107 | INT getBalance() const
108 | { return static_cast<INT>(m_nAllocs - m_nFrees); }
109 |
110 | };
111 |
112 |
113 | class Nonterminal {
114 |
115 | // Each Nonterminal owns a list of Productions, starting at getHead()
116 |
117 | friend class AllocReporter;
118 |
119 | private:
120 |
121 | WCHAR* m_pwzName; // Name, as returned by getName()
122 | Production* m_pProd; // Head of the production list, as returned by getHead()
123 |
124 | static AllocCounter ac; // Instance counter (see AllocCounter)
125 |
126 | protected:
127 |
128 | public:
129 |
130 | Nonterminal(const WCHAR* pwzName);
131 |
132 | ~Nonterminal();
133 |
134 | void addProduction(Production* p);
135 |
136 | // Return the head (first) right-hand-side production of this nonterminal
137 | Production* getHead() const
138 | { return m_pProd; }
139 |
140 | WCHAR* getName() const
141 | { return m_pwzName; }
142 |
143 | };
144 |
145 |
146 | class Production {
147 |
148 | // A Production holds its own copy of an item array. An item
149 | // is either a negative number (a nonterminal index) or a
150 | // positive number (a terminal index). Indexing beyond the
151 | // last item of the production gives 0.
152 |
153 | friend class AllocReporter;
154 |
155 | private:
156 |
157 | UINT m_nId; // 0-based integer id, unique to this production
158 | UINT m_nPriority; // Priority of this production relative to others
159 | UINT m_n; // Item count
160 | INT* m_pList; // The items themselves
161 | Production* m_pNext; // Following production of the same nonterminal
162 |
163 | static AllocCounter ac; // Instance counter (see AllocCounter)
164 |
165 | protected:
166 |
167 | public:
168 |
169 | Production(UINT nId, UINT nPriority, UINT n, const INT* pList);
170 |
171 | ~Production();
172 |
173 | void setNext(Production* p);
174 | Production* getNext() const
175 | { return m_pNext; }
176 |
177 | UINT getId() const
178 | { return m_nId; }
179 | UINT getLength() const
180 | { return m_n; }
181 | BOOL isEpsilon() const
182 | { return !m_n; }
183 | UINT getPriority() const
184 | { return m_nPriority; }
185 |
186 | // Item located at dot position nDot within the production
187 | INT operator[] (UINT nDot) const;
188 |
189 | };
190 |
191 |
192 | class Grammar {
193 |
194 | // A Grammar holds a set of Nonterminals,
195 | // each with its Productions.
196 |
197 | friend class AllocReporter;
198 |
199 | private:
200 |
201 | UINT m_nNonterminals; // Nonterminal count
202 | UINT m_nTerminals; // Terminal count (terminals are indexed from 1)
203 | INT m_iRoot; // Root nonterminal index (a negative number)
204 | Nonterminal** m_nts; // Nonterminal pointer array; the Grammar owns it
205 |
206 | static AllocCounter ac; // Instance counter (see AllocCounter)
207 |
208 | protected:
209 |
210 | public:
211 |
212 | Grammar(UINT nNonterminals, UINT nTerminals, INT iRoot = -1);
213 | Grammar();
214 | ~Grammar();
215 |
216 | void reset();
217 |
218 | BOOL readBinary(const CHAR* pszFilename);
219 |
220 | UINT getNumNonterminals() const
221 | { return m_nNonterminals; }
222 | UINT getNumTerminals() const
223 | { return m_nTerminals; }
224 | INT getRoot() const
225 | { return m_iRoot; }
226 |
227 | void setNonterminal(INT iIndex, Nonterminal*);
228 |
229 | Nonterminal* operator[] (INT iIndex) const;
230 |
231 | const WCHAR* nameOfNt(INT iNt) const;
232 |
233 | };
234 |
235 |
236 | class Label {
237 |
238 | // A Label is associated with a Node; Node::hasLabel() compares Labels with operator==.
239 |
240 | friend class Node;
241 |
242 | private:
243 |
244 | INT m_iNt; // NOTE(review): presumably a nonterminal/terminal index - confirm in eparser.cpp
245 | UINT m_nDot; // NOTE(review): presumably the dot position within m_pProd - confirm
246 | Production* m_pProd;
247 | UINT m_nI; // NOTE(review): m_nI and m_nJ look like the covered token span - confirm
248 | UINT m_nJ;
249 |
250 | public:
251 |
252 | Label(INT iNt, UINT nDot, Production* pProd, UINT nI, UINT nJ)
253 | : m_iNt(iNt), m_nDot(nDot), m_pProd(pProd), m_nI(nI), m_nJ(nJ)
254 | { }
255 | // Member-wise equality of all five fields; unlike memcmp(), this never reads padding bytes
256 | BOOL operator==(const Label& other) const
257 | { return m_iNt == other.m_iNt && m_nDot == other.m_nDot && m_pProd == other.m_pProd
258 | && m_nI == other.m_nI && m_nJ == other.m_nJ; }
259 | };
260 |
261 |
262 | class Node {
263 | // A Node carries a Label, a linked list of FamilyEntry records and a reference count.
264 | friend class AllocReporter;
265 |
266 | private:
267 |
268 | struct FamilyEntry {
269 | Production* pProd;
270 | Node* p1;
271 | Node* p2;
272 | FamilyEntry* pNext; // Link to the next entry in the list
273 | };
274 |
275 | Label m_label; // Compared by hasLabel()
276 | FamilyEntry* m_pHead; // First FamilyEntry of this node
277 | UINT m_nRefCount; // Incremented by addRef()
278 |
279 | static AllocCounter ac; // Instance counter (see AllocCounter)
280 |
281 | void _dump(Grammar*, UINT nIndent);
282 |
283 | protected:
284 |
285 | public:
286 |
287 | Node(const Label&);
288 | ~Node();
289 |
290 | void addRef()
291 | { ++m_nRefCount; }
292 | void delRef();
293 |
294 | void addFamily(Production*, Node* pW, Node* pV);
295 |
296 | BOOL hasLabel(const Label& label) const
297 | { return m_label == label; }
298 |
299 | void dump(Grammar*);
300 |
301 | static UINT numCombinations(Node*);
302 |
303 | };
304 |
305 |
306 | // Token-terminal matching function
307 | typedef BOOL (*MatchingFunc)(UINT nHandle, UINT nToken, UINT nTerminal);
308 |
309 | // Allocator for token/terminal matching cache
310 | typedef BYTE* (*AllocFunc)(UINT nHandle, UINT nToken, UINT nTerminals);
311 |
312 | // Default matching function that simply
313 | // compares the token value with the terminal number
314 | BOOL defaultMatcher(UINT nHandle, UINT nToken, UINT nTerminal);
315 |
316 |
317 | class Parser {
318 |
319 | // Earley-Scott parser operating on one Grammar
320 |
321 | friend class AllocReporter;
322 | friend class Column;
323 |
324 | private:
325 |
326 | // The Parser does not own the Grammar it points to
327 | Grammar* m_pGrammar;
328 | MatchingFunc m_pMatchingFunc;
329 | AllocFunc m_pAllocFunc;
330 |
331 | void push(UINT nHandle, State*, Column*, State*&, StateChunk*);
332 |
333 | Node* makeNode(State* pState, UINT nEnd, Node* pV, NodeDict& ndV);
334 |
335 | // Management of the internal token/terminal matching cache
336 | BYTE* allocCache(UINT nHandle, UINT nToken, BOOL* pbNeedsRelease);
337 | void releaseCache(BYTE* abCache);
338 |
339 | protected:
340 |
341 | public:
342 |
343 | Parser(Grammar*, MatchingFunc = defaultMatcher, AllocFunc = NULL);
344 | ~Parser();
345 |
346 | UINT getNumTerminals() const
347 | { return m_pGrammar->getNumTerminals(); }
348 | UINT getNumNonterminals() const
349 | { return m_pGrammar->getNumNonterminals(); }
350 | MatchingFunc getMatchingFunc() const
351 | { return m_pMatchingFunc; }
352 | Grammar* getGrammar() const
353 | { return m_pGrammar; }
354 |
355 | // A NULL pnToklist means the integer sequence 0..nTokens-1 is used instead
356 | Node* parse(UINT nHandle, INT iStartNt, UINT* pnErrorToken,
357 | UINT nTokens, const UINT pnToklist[] = NULL);
358 |
359 | };
360 |
361 | // Print a report on memory allocation
362 | extern "C" void printAllocationReport(void);
363 |
364 | // Parse a token stream
365 | extern "C" Node* earleyParse(Parser*, UINT nTokens, INT iRoot, UINT nHandle, UINT* pnErrorToken);
366 |
367 | extern "C" Grammar* newGrammar(const CHAR* pszGrammarFile);
368 |
369 | extern "C" void deleteGrammar(Grammar*);
370 |
371 | extern "C" Parser* newParser(Grammar*, MatchingFunc fpMatcher = defaultMatcher, AllocFunc fpAlloc = NULL);
372 |
373 | extern "C" void deleteParser(Parser*);
374 |
375 | extern "C" void deleteForest(Node*);
376 |
377 | extern "C" void dumpForest(Node*, Grammar*);
378 |
379 | extern "C" UINT numCombinations(Node*);
380 |
381 |
--------------------------------------------------------------------------------
/src/reynir/eparser_build.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | CFFI builder for _eparser module
6 |
7 | Copyright © 2023 Miðeind ehf.
8 | Author: Vilhjálmur Þorsteinsson
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 | This module only runs at setup/installation time. It is invoked
32 | from setup.py as requested by the cffi_modules=[] parameter of the
33 | setup() function. It causes the _eparser.*.so CFFI wrapper library
34 | to be built from its source in eparser.cpp.
35 |
36 | """
37 |
38 | import os
39 | import platform
40 | import cffi
41 |
42 | # Don't change the name of this variable unless you
43 | # change it in setup.py as well
44 | ffibuilder = cffi.FFI()
45 |
46 | WINDOWS = platform.system() == "Windows"
47 | MACOS = platform.system() == "Darwin"
48 | IMPLEMENTATION = platform.python_implementation()
49 |
50 | # What follows is the actual Python-wrapped C interface to eparser.*.so
51 |
52 | declarations = """
53 |
54 | typedef unsigned int UINT;
55 | typedef int INT;
56 | typedef int BOOL; // Different from C++
57 | typedef char CHAR;
58 | typedef unsigned char BYTE;
59 |
60 | struct Grammar {
61 | UINT nNonterminals; // Number of nonterminals
62 | UINT nTerminals; // Number of terminals (indexed from 1)
63 | INT iRoot; // Index of root nonterminal (negative)
64 | };
65 |
66 | struct Parser {
67 | struct Grammar* pGrammar;
68 | };
69 |
70 | struct Production {
71 | UINT nId;
72 | UINT nPriority;
73 | UINT n;
74 | INT* pList;
75 | };
76 |
77 | struct Label {
78 | INT iNt;
79 | UINT nDot;
80 | struct Production* pProd;
81 | UINT nI;
82 | UINT nJ;
83 | };
84 |
85 | struct FamilyEntry {
86 | struct Production* pProd;
87 | struct Node* p1;
88 | struct Node* p2;
89 | struct FamilyEntry* pNext;
90 | };
91 |
92 | struct Node {
93 | struct Label label;
94 | struct FamilyEntry* pHead;
95 | UINT nRefCount;
96 | };
97 |
98 | typedef BOOL (*MatchingFunc)(UINT nHandle, UINT nToken, UINT nTerminal);
99 | typedef BYTE* (*AllocFunc)(UINT nHandle, UINT nToken, UINT nSize);
100 |
101 | struct Node* earleyParse(struct Parser*, UINT nTokens, INT iRoot, UINT nHandle, UINT* pnErrorToken);
102 | struct Grammar* newGrammar(const CHAR* pszGrammarFile);
103 | void deleteGrammar(struct Grammar*);
104 | struct Parser* newParser(struct Grammar*, MatchingFunc fpMatcher, AllocFunc fpAlloc);
105 | void deleteParser(struct Parser*);
106 | void deleteForest(struct Node*);
107 | void dumpForest(struct Node*, struct Grammar*);
108 | UINT numCombinations(struct Node*);
109 |
110 | void printAllocationReport(void);
111 |
112 | """
113 |
114 | # Declare the Python callbacks from fastparser.py that will be called by the C code
115 | # See: https://cffi.readthedocs.io/en/latest/using.html#extern-python-new-style-callbacks
116 |
117 | callbacks = """
118 |
119 | extern "Python" BOOL matching_func(UINT, UINT, UINT);
120 | extern "Python" BYTE* alloc_func(UINT, UINT, UINT);
121 |
122 | """
123 |
124 | # Do the magic CFFI incantations necessary to get CFFI and setuptools
125 | # to compile eparser.cpp at setup time, generate a .so library and
126 | # wrap it so that it is callable from Python and PyPy as _eparser
127 |
# Choose the compiler flags for the platform we are building on
if WINDOWS:
    extra_compile_args = ["/Zc:offsetof-"]
elif not MACOS:
    # Linux and other Unix-like systems
    extra_compile_args = ["-std=c++11"]
else:
    # macOS: the environment is adjusted as well as the compiler flags
    os.environ["CFLAGS"] = "-stdlib=libc++"  # Fixes PyPy build on macOS 10.15.6+
    os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9"
    extra_compile_args = ["-mmacosx-version-min=10.9", "-stdlib=libc++"]

# Under PyPy, some systems need the linker to be told explicitly to use
# the C++ compiler, due to changes in the default behaviour of distutils.
if IMPLEMENTATION == "PyPy":
    os.environ["LDCXXSHARED"] = "c++ -shared"

# Tell CFFI about the C interface plus the Python callbacks
ffibuilder.cdef(declarations + callbacks)

# eparser.cpp is written in C++ but must export a pure C interface,
# so the declarations are wrapped in an "extern 'C' { ... }" block.
_wrapped_declarations = "".join(('extern "C" {\n', declarations, "\n}\n"))

ffibuilder.set_source(
    "reynir._eparser",
    _wrapped_declarations,
    source_extension=".cpp",
    sources=["src/reynir/eparser.cpp"],
    extra_compile_args=extra_compile_args,
)

if __name__ == "__main__":
    # Allow the module to be built directly, outside of setup.py
    ffibuilder.compile(verbose=True)
156 |
--------------------------------------------------------------------------------
/src/reynir/glock.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | GlobalLock utility class
6 |
7 | Copyright © 2023 Miðeind ehf.
8 | Original author: Vilhjálmur Þorsteinsson
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 | This module implements the GlobalLock class, providing
32 | interprocess locks within a server.
33 |
34 | A GlobalLock is implemented as a file in the system temporary
35 | directory, as returned by tempfile.gettempdir().
36 |
37 | A quick and easy way to use a blocking GlobalLock is as follows:
38 |
39 | with GlobalLock('somestring'):
40 | code_that_only_one_process_can_run_simultaneously()
41 |
42 | """
43 |
44 | from typing import Any, IO, Optional
45 |
46 | import os
47 | import stat
48 | import tempfile
49 |
50 |
class LockError(Exception):
    """Raised when a lock file cannot be opened, locked or unlocked"""
55 |
56 |
# Flipped to True further down if the fcntl module can be imported
POSIX: bool = False

try:
    # First choice: Linux/POSIX locking via fcntl
    import fcntl
except ImportError:

    try:
        # Second choice: Windows locking via the Microsoft C runtime
        import msvcrt
    except ImportError:

        # Neither module is available: locking always fails

        def _lock_file(file: IO[str], block: bool) -> None:
            """Placeholder for platforms without file locking; always raises"""
            raise TypeError("File locking not supported on this platform")

        def _unlock_file(file: IO[str]) -> None:
            """Placeholder for platforms without file locking; always raises"""
            raise TypeError("File locking not supported on this platform")

    else:

        # Windows

        def _lock_file(file: IO[str], block: bool) -> None:
            """Lock one byte of the file, waiting for it if block is True.
            Raises LockError if the lock cannot be obtained."""
            while True:
                try:
                    msvcrt.locking(  # type: ignore
                        file.fileno(),
                        msvcrt.LK_LOCK if block else msvcrt.LK_NBLCK,  # type: ignore
                        1,
                    )
                except OSError as err:
                    if block and err.errno == 36:
                        # Windows says 'resource deadlock avoided', but we
                        # truly want a longer blocking wait: go around again
                        continue
                    raise LockError(
                        "Couldn't lock {0}, errno is {1}".format(file.name, err.errno)
                    )
                # The lock call went through
                return

        def _unlock_file(file: IO[str]) -> None:
            """Unlock the first byte of the file.
            Raises LockError if the unlock call fails."""
            try:
                file.seek(0)
                msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, 1)  # type: ignore
            except OSError as err:
                raise LockError(
                    "Couldn't unlock {0}, errno is {1}".format(file.name, err.errno)
                )

else:

    # Linux/POSIX

    POSIX = True  # type: ignore

    def _lock_file(file: IO[str], block: bool) -> None:
        """Take an exclusive flock() on the file, waiting for it only
        if block is True. Raises LockError if the lock is not obtained."""
        try:
            if block:
                flags = fcntl.LOCK_EX
            else:
                flags = fcntl.LOCK_EX | fcntl.LOCK_NB
            fcntl.flock(file.fileno(), flags)
        except IOError:
            raise LockError("Couldn't lock {0}".format(file.name))

    def _unlock_file(file: IO[str]) -> None:
        """Nothing to do here: the file is automatically unlocked on close"""
        return None
125 |
126 |
class GlobalLock:

    """A named interprocess lock, backed by a lock file in the system
    temporary directory. While the lock is held, self._fp refers to the
    open lock file; otherwise it is None. Can be used as a context manager,
    in which case the lock is acquired in blocking mode."""

    _TMP_DIR = tempfile.gettempdir()

    def __init__(self, lockname: str) -> None:
        """Initialize a global lock with the given name"""
        assert lockname and isinstance(lockname, str)
        # Locate global locks in the system temporary directory
        # (should work on both Windows and Unix/POSIX)
        self._path = os.path.join(self._TMP_DIR, "greynir-" + lockname)
        self._fp: Optional[IO[str]] = None

    def acquire(self, block: bool = True) -> None:
        """Acquire a global lock, blocking if block = True.
        Does nothing if this object already holds the lock.
        Raises LockError if the lock file can't be opened or created;
        any exception from the platform locking call is propagated,
        and in that case the object is left in the unlocked state."""

        if self._fp is not None:
            # Already hold the lock
            return

        path = self._path
        try:
            # Try to open for writing without truncation:
            fp = open(path, "r+")
        except IOError:
            # If the file doesn't exist, we'll get an IO error, try a+
            # Note that there may be a race here. Multiple processes
            # could fail on the r+ open and open the file a+, but only
            # one will get the lock and write a pid.
            try:
                fp = open(path, "a+")
            except IOError:
                raise LockError("Couldn't open or create lock file {0}".format(path))
            if POSIX:
                # Make sure that the file is readable and writable by others
                try:
                    os.fchmod(
                        fp.fileno(),
                        stat.S_IRUSR
                        | stat.S_IWUSR
                        | stat.S_IRGRP
                        | stat.S_IWGRP
                        | stat.S_IROTH
                        | stat.S_IWOTH,
                    )
                except IOError:
                    # Don't leak the handle of a file we can't set up
                    fp.close()
                    raise LockError(
                        "Couldn't open or create lock file {0}".format(path)
                    )

        try:
            _lock_file(fp, block)
        except BaseException:
            # We did not get the lock: close the file and leave self._fp
            # as None, so that a later acquire() makes a genuine new attempt
            # instead of concluding that the lock is already held
            fp.close()
            raise

        # The lock is ours: only now do we record the file as held
        self._fp = fp

        # Once acquired, write the process id to the file
        fp.write(" %s\n" % os.getpid())
        fp.truncate()
        fp.flush()

    def release(self) -> None:
        """Release the lock; does nothing if the lock is not held"""
        fp = self._fp
        if fp is None:
            return
        # Mark the lock as released up front, and close the file even if
        # the explicit unlock call fails, so that no handle is left behind
        self._fp = None
        try:
            _unlock_file(fp)
        finally:
            fp.close()

    def __enter__(self):
        """Python context manager protocol"""
        self.acquire(block=True)
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
        """Python context manager protocol"""
        self.release()
        # Never suppress an exception raised inside the with block
        return False
202 |
--------------------------------------------------------------------------------
/src/reynir/incparser.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | Utility class for incremental parsing of token streams
6 |
7 | Copyright © 2023 Miðeind ehf.
8 | Original author: Vilhjálmur Þorsteinsson
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 | This module implements a simple utility class for parsing token
32 | streams into paragraphs and sentences. The parse is incremental so
33 | that the client can take action on each paragraph and sentence as
34 | it is processed. Also, time.sleep(0) is called between sentences
35 | to make multi-threaded parses proceed more smoothly and evenly.
36 |
37 | """
38 |
39 | import time
40 | from typing import Iterable, Iterator, List, Optional, Tuple
41 |
42 | from tokenizer import paragraphs, Tok
43 |
44 | from .bintokenizer import tokens_are_foreign
45 | from .fastparser import Fast_Parser, Node, ParseError
46 | from .reducer import Reducer
47 | from .settings import Settings
48 |
49 |
50 | # Number of tree combinations that must be exceeded for a verbose
51 | # parse dump to include the sentence text (as opposed to just basic stats)
52 | _VERBOSE_AMBIGUITY_THRESHOLD = 1000
53 |
54 | # The ratio of words in a sentence that must be found in BÍN
55 | # for it to be analyzed as an Icelandic sentence
56 | ICELANDIC_RATIO = 0.5
57 |
58 |
59 | # The same type is defined in the Tokenizer module
60 | SentenceTuple = Tuple[int, List[Tok]]
61 |
62 |
class IncrementalParser:

    """Utility class to parse a token list as a sequence of paragraphs
    containing sentences. Statistics are accumulated as each sentence
    is parsed. Typical usage:

        toklist = tokenize(text)
        fp = Fast_Parser()
        ip = IncrementalParser(fp, toklist)
        for p in ip.paragraphs():
            for sent in p.sentences():
                if sent.parse():
                    # sentence parsed successfully
                    # do something with sent.tree
                else:
                    # an error occurred in the parse
                    # the error token index is at sent.err_index
        num_sentences = ip.num_sentences
        num_parsed = ip.num_parsed
        ambiguity = ip.ambiguity
        parse_time = ip.parse_time

    """

    class _IncrementalSentence:

        """An internal sentence representation class"""

        def __init__(self, ip: "IncrementalParser", s: List[Tok]) -> None:
            self._ip = ip
            self._s = s
            self._len = len(s)
            assert self._len > 0  # Input should be already sanitized
            self._err_index: Optional[int] = None
            self._tree: Optional[Node] = None
            self._score = 0
            self._error: Optional[ParseError] = None

        def __len__(self) -> int:
            """Return the number of tokens in the sentence"""
            return self._len

        def parse(self) -> bool:
            """Parse the sentence, store the resulting tree, score and
            error (if any) on this object and register the outcome with
            the owning IncrementalParser. Returns True if the sentence
            was parsed successfully."""
            num = 0
            score = 0
            forest: Optional[Node] = None
            try:
                if tokens_are_foreign(self._s, min_icelandic_ratio=ICELANDIC_RATIO):
                    raise ParseError(
                        "Sentence is probably not in Icelandic", token_index=0
                    )
                forest = self._ip._parser.go(self._s)
                # Defensive: if the parser hands back no forest, leave num at
                # zero so that a missing tree is never reported as a success
                if forest is not None:
                    num = Fast_Parser.num_combinations(forest)
                    if num > 1:
                        # Ambiguous parse: reduce the forest to a single tree
                        forest, score = self._ip._reducer.go_with_score(forest)
            except ParseError as e:
                # The ParseError may originate in the reducer.go_with_score()
                # function, and in that case, forest is not None; be sure to reset it
                forest = None
                score = 0
                num = 0
                self._err_index = e.token_index
                self._error = e
            self._tree = forest
            self._score = score
            self._ip._add_sentence(self, num)
            return num > 0

        @property
        def tokens(self) -> List[Tok]:
            """The token list of the sentence"""
            return self._s

        @property
        def tree(self) -> Optional[Node]:
            """The parse tree, or None if not (successfully) parsed"""
            return self._tree

        @property
        def score(self) -> int:
            """The score assigned to the tree by the reducer, or 0"""
            return self._score

        @property
        def error(self) -> Optional[ParseError]:
            """The ParseError raised during the parse, if any"""
            return self._error

        @property
        def err_index(self) -> int:
            """The token index recorded by a parse error; if none was
            recorded, the index of the last token in the sentence"""
            return self._len - 1 if self._err_index is None else self._err_index

        @property
        def text(self) -> str:
            """The non-empty token texts, joined by spaces"""
            return " ".join(t.txt for t in self._s if t.txt)

        def __str__(self) -> str:
            return self.text

    class _IncrementalParagraph:

        """An internal paragraph representation class"""

        def __init__(self, ip: "IncrementalParser", p: List[SentenceTuple]) -> None:
            self._ip = ip
            self._p = p

        def sentences(self) -> Iterator["IncrementalParser._IncrementalSentence"]:
            """Yield the sentences within the paragraph, nicely wrapped"""
            Sent = IncrementalParser._IncrementalSentence
            for _, sent in self._p:
                # Call time.sleep(0) to yield the current thread, i.e.
                # enable the threading subsystem and/or eventlet under Gunicorn
                # to switch threads at this point - since the parsing of an
                # entire article can take a long time
                time.sleep(0)
                yield Sent(self._ip, sent)

    def __init__(
        self, parser: Fast_Parser, toklist: Iterable[Tok], verbose: bool = False
    ) -> None:
        """Set up an incremental parse of the given tokens, using the
        given parser. If verbose is True (and Settings.DEBUG is set),
        a line of statistics is printed for each parsed sentence."""
        self._parser = parser
        self._reducer = Reducer(parser.grammar)
        self._num_sent = 0
        self._num_parsed_sent = 0
        self._num_tokens = 0
        self._num_combinations = 0
        self._total_score = 0
        self._total_ambig = 0.0
        self._total_tokens = 0
        self._start_time = self._last_time = time.time()
        self._verbose = verbose
        # Materialize the token stream, which may be a generator
        self._toklist = list(toklist)

    def _add_sentence(
        self, s: "IncrementalParser._IncrementalSentence", num: int
    ) -> None:
        """Add a processed sentence to the statistics; num is the number
        of parse tree combinations found, or 0 if the parse failed"""
        slen = len(s)
        self._num_sent += 1
        self._num_tokens += slen
        if num > 0:
            # The sentence was parsed successfully
            self._num_parsed_sent += 1
            self._num_combinations += num
            # The ambiguity factor is the slen-th root of the number of
            # combinations; it is accumulated weighted by sentence length
            ambig_factor = num ** (1 / slen)
            self._total_ambig += ambig_factor * slen
            self._total_tokens += slen
            self._total_score += s.score
        # Debugging output, if requested and enabled
        if self._verbose and Settings.DEBUG:
            current_time = time.time()
            print(
                "Parsed sentence of length {0} with {1} combinations{3} "
                "in {4:.1f} seconds{2}".format(
                    slen,
                    num,
                    ("\n" + s.text) if num >= _VERBOSE_AMBIGUITY_THRESHOLD else "",
                    " and score " + str(s.score) if num >= 1 else "",
                    current_time - self._last_time,
                )
            )
            self._last_time = current_time

    def paragraphs(self) -> Iterator["IncrementalParser._IncrementalParagraph"]:
        """Yield the paragraphs from the token stream"""
        Para = IncrementalParser._IncrementalParagraph
        for p in paragraphs(self._toklist):
            yield Para(self, p)

    @property
    def num_tokens(self) -> int:
        """Number of tokens in the sentences processed so far"""
        return self._num_tokens

    @property
    def num_sentences(self) -> int:
        """Number of sentences processed so far"""
        return self._num_sent

    @property
    def num_parsed(self) -> int:
        """Number of sentences successfully parsed so far"""
        return self._num_parsed_sent

    @property
    def num_combinations(self) -> int:
        """Sum of tree combinations over the successfully parsed sentences"""
        return self._num_combinations

    @property
    def total_score(self) -> int:
        """Sum of scores over the successfully parsed sentences"""
        return self._total_score

    @property
    def ambiguity(self) -> float:
        """Token-weighted average ambiguity factor of the successfully
        parsed sentences, or 1.0 if none has been parsed yet"""
        return (
            (self._total_ambig / self._total_tokens) if self._total_tokens > 0 else 1.0
        )

    @property
    def parse_time(self) -> float:
        """Seconds elapsed since this object was created"""
        return time.time() - self._start_time
257 |
--------------------------------------------------------------------------------
/src/reynir/lemmatize.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | Copyright © 2023 Miðeind ehf.
6 |
7 | This software is licensed under the MIT License:
8 |
9 | Permission is hereby granted, free of charge, to any person
10 | obtaining a copy of this software and associated documentation
11 | files (the "Software"), to deal in the Software without restriction,
12 | including without limitation the rights to use, copy, modify, merge,
13 | publish, distribute, sublicense, and/or sell copies of the Software,
14 | and to permit persons to whom the Software is furnished to do so,
15 | subject to the following conditions:
16 |
17 | The above copyright notice and this permission notice shall be
18 | included in all copies or substantial portions of the Software.
19 |
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 |
28 | This module contains a function to (simplistically) lemmatize text
29 | without parsing it.
30 |
31 | """
32 |
33 | from typing import Optional, Union, Callable, Tuple, List, Iterator, TypeVar, cast
34 |
35 | from abc import abstractmethod, ABCMeta
36 |
37 | from .bindb import BIN_Tuple
38 | from .bintokenizer import tokenize, TOK
39 |
40 |
41 | # TODO: In Python >= 3.8, the base class could be typing.Protocol
class Comparable(metaclass=ABCMeta):
    """Abstract base for annotating types that can be ordered with <"""

    @abstractmethod
    def __lt__(self: "CT", other: "CT") -> bool:
        """Return True if self sorts before other"""
47 |
48 |
49 | CT = TypeVar("CT", bound=Comparable)
50 |
51 | LemmaTuple = Tuple[str, str] # Lemma, category (ordfl)
52 |
53 |
def simple_lemmatize(
    txt: str,
    *,
    all_lemmas: bool = False,
    sortkey: Optional[Callable[[LemmaTuple], Comparable]] = None,
) -> Union[Iterator[LemmaTuple], Iterator[List[LemmaTuple]]]:
    """Tokenize the given text and generate lemmas for its tokens,
    without parsing. Word, person, entity and company tokens produce
    output; all other tokens are skipped. For each such token, the
    generator yields the first (lemma, category) tuple by default, or
    the whole list of distinct candidate tuples if all_lemmas is True.
    If a sortkey function is given, the candidates are sorted by it
    before the first one is picked or the list is yielded."""
    for tok in tokenize(txt):
        kind = tok.kind
        if kind == TOK.WORD:
            if not tok.val:
                # Unknown word: assume it's an entity
                candidates = [(tok.txt, "entity")]
            else:
                # Known word
                meanings = cast(List[BIN_Tuple], tok.val)
                if "-" in tok.txt:
                    # The hyphen is already there in the original word:
                    # keep the lemmas unchanged
                    candidates = [(m.stofn, m.ordfl) for m in meanings]
                else:
                    # No hyphen in the original word, so any hyphens in the
                    # lemmas were put there by the compounding algorithm
                    candidates = [
                        (m.stofn.replace("-", ""), m.ordfl) for m in meanings
                    ]
        elif kind == TOK.PERSON:
            assert tok.person_names
            # Person name w. gender, defaulting to "hk" if no gender is given
            first = tok.person_names[0]
            candidates = [(first.name, "person_" + (first.gender or "hk"))]
        elif kind in (TOK.ENTITY, TOK.COMPANY):
            # Entity or company name
            candidates = [(tok.txt, "entity")]
        else:
            # No lemma is generated for other kinds of tokens
            continue
        # Drop duplicate candidates, keeping the first occurrence of each
        lemmas: List[LemmaTuple] = list(dict.fromkeys(candidates))
        if sortkey is not None:
            lemmas = sorted(lemmas, key=sortkey)
        # Either all candidates, or naively just the first one
        yield lemmas if all_lemmas else lemmas[0]
101 |
--------------------------------------------------------------------------------
/src/reynir/nounphrase.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Greynir: Natural language processing for Icelandic
4 |
5 | NounPhrase class implementation
6 |
7 | Copyright © 2023 Miðeind ehf.
8 | Original author: Vilhjálmur Þorsteinsson
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 | This module implements the NounPhrase class, a handy container
32 | for noun phrases (nafnliður) allowing them to be easily inflected
33 | and formatted.
34 |
35 | """
36 |
37 | from typing import Optional, Mapping, Callable
38 |
39 | import operator
40 |
41 | from .reynir import Greynir, _NounPhrase, SimpleTree
42 |
43 |
44 | # Format specifiers and how they relate to properties
45 | # of the contained NounPhrase object
_FMT: Mapping[str, Callable[["_NounPhrase"], str]] = {
    spec: operator.attrgetter(prop)
    for spec, prop in (
        # Icelandic format specifiers
        ("nf", "nominative"),
        ("þf", "accusative"),
        ("þgf", "dative"),
        ("ef", "genitive"),
        ("ángr", "indefinite"),
        ("stofn", "canonical"),
        # English/international format specifiers
        ("nom", "nominative"),
        ("acc", "accusative"),
        ("dat", "dative"),
        ("gen", "genitive"),
        ("ind", "indefinite"),
        ("can", "canonical"),
    )
}
62 |
63 |
64 | class NounPhrase:
65 |
66 | """A handy container for a noun phrase (nafnliður),
67 | allowing it to be easily inflected and formatted"""
68 |
69 | # Singleton parser instance
70 | _greynir: Optional[Greynir] = None
71 |
72 | def __init__(self, np_string: str, *, force_number: Optional[str] = None) -> None:
73 | """Initialize a NounPhrase from a text string.
74 | If force_number is set to "et" or "singular", we only
75 | consider singular interpretations of the string.
76 | If force_number is set to "ft" or "plural", we only
77 | consider plural interpretations of the string."""
78 | self._np_string = np_string or ""
79 | self._number: Optional[str] = None
80 | self._person: Optional[str] = None
81 | self._case: Optional[str] = None
82 | self._gender: Optional[str] = None
83 | self._np: Optional[_NounPhrase] = None
84 | if self._np_string:
85 | if self._greynir is None:
86 | # Initialize our parser singleton
87 | # When parsing noun phrases, we don't assume that they
88 | # start a sentence - so we don't attempt to interpret the
89 | # first word as a lowercase word, as we would otherwise
90 | self.__class__._greynir = Greynir(no_sentence_start=True)
91 | # Parse the noun phrase string into a _NounPhrase object
92 | assert self._greynir is not None
93 | self._np = self._greynir.parse_noun_phrase(
94 | self._np_string, force_number=force_number
95 | )
96 | if self._np is not None and self._np.deep_tree is not None:
97 | # Access the first child of the root 'Nl' nonterminal
98 | # of the deep parse tree
99 | nt = next(self._np.deep_tree.enum_child_nodes()).nonterminal.name
100 | # Sanity check
101 | assert nt.startswith("Nl_") or nt.startswith("NlEind_")
102 | # Extract the variants of the nonterminal
103 | variants = set(nt.split("_")[1:])
104 | self._number = (variants & {"et", "ft"}).pop()
105 | self._person = (variants & {"p1", "p2", "p3"}).pop()
106 | self._case = (variants & {"nf", "þf", "þgf", "ef"}).pop()
107 | self._gender = (variants & {"kk", "kvk", "hk"}).pop()
108 |
109 | def __str__(self) -> str:
110 | """Return the contained string as-is"""
111 | return self._np_string
112 |
113 | def __repr__(self) -> str:
114 | return "".format(
115 | self._np_string, "parsed" if self.parsed else "not parsed"
116 | )
117 |
118 | def __len__(self) -> int:
119 | """Provide len() for convenience"""
120 | return self._np_string.__len__()
121 |
122 | def __format__(self, format_spec: str) -> str:
123 | """Return the contained string after inflecting it according
124 | to the format specification, if given"""
125 | # Examples:
126 | # >>> np = NounPhrase('skjótti hesturinn')
127 | # >>> f"Hér er {np:nf}"
128 | # 'Hér er skjótti hesturinn'
129 | # >>> f"Um {np:þf}"
130 | # 'Um skjótta hestinn'
131 | # >>> f"Frá {np:þgf}"
132 | # 'Frá skjótta hestinum'
133 | # >>> f"Til {np:ef}"
134 | # 'Til skjótta hestsins'
135 | # >>> f"Hér er {np:ángr}"
136 | # 'Hér er skjóttur hestur'
137 | # np = NounPhrase("þrír skjóttir hestar")
138 | # >>> f"Umræðuefnið er {np:stofn}"
139 | # 'Umræðuefnið er skjóttur hestur'
140 | if not format_spec or not self.parsed:
141 | return self._np_string
142 | # Find the attrgetter (property access function)
143 | # corresponding to the format spec
144 | fmt = _FMT.get(format_spec)
145 | if fmt is None:
146 | # We don't recognize this format specifier
147 | raise ValueError(
148 | "Invalid format specifier for NounPhrase: '{0}'".format(format_spec)
149 | )
150 | # Extract the requested property and return it
151 | assert self._np is not None
152 | return fmt(self._np)
153 |
154 | @property
155 | def parsed(self) -> bool:
156 | """Return True if the noun phrase was successfully parsed"""
157 | return self._np is not None and self._np.tree is not None
158 |
159 | @property
160 | def tree(self) -> Optional[SimpleTree]:
161 | """Return the SimpleTree object corresponding to the noun phrase"""
162 | return None if self._np is None else self._np.tree
163 |
164 | @property
165 | def case(self) -> Optional[str]:
166 | """Return the case of the noun phrase, as originally parsed"""
167 | return self._case
168 |
169 | @property
170 | def number(self) -> Optional[str]:
171 | """Return the number (singular='et'/plural='ft') of the noun phrase,
172 | as originally parsed"""
173 | return self._number
174 |
175 | @property
176 | def person(self) -> Optional[str]:
177 | """Return the person ('p1', 'p2', 'p3') of the noun phrase,
178 | as originally parsed"""
179 | return self._person
180 |
181 | @property
182 | def gender(self) -> Optional[str]:
183 | """Return the gender (masculine='kk', feminine='kvk', neutral='hk')
184 | of the noun phrase, as originally parsed"""
185 | return self._gender
186 |
187 | @property
188 | def nominative(self) -> Optional[str]:
189 | """Return nominative form (nefnifall)"""
190 | return None if self._np is None else self._np.nominative
191 |
192 | @property
def indefinite(self) -> Optional[str]:
    """Indefinite form (nefnifall án greinis) of the noun phrase,
    or None if there is no underlying `_np` object"""
    np = self._np
    if np is None:
        return None
    return np.indefinite
196 |
197 | @property
def canonical(self) -> Optional[str]:
    """Canonical form (nefnifall eintölu án greinis) of the noun
    phrase, or None if there is no underlying `_np` object"""
    np = self._np
    if np is None:
        return None
    return np.canonical
201 |
202 | @property
def accusative(self) -> Optional[str]:
    """Accusative form (þolfall) of the noun phrase,
    or None if there is no underlying `_np` object"""
    np = self._np
    if np is None:
        return None
    return np.accusative
206 |
207 | @property
def dative(self) -> Optional[str]:
    """Dative form (þágufall) of the noun phrase,
    or None if there is no underlying `_np` object"""
    np = self._np
    if np is None:
        return None
    return np.dative
211 |
212 | @property
def genitive(self) -> Optional[str]:
    """Genitive form (eignarfall) of the noun phrase,
    or None if there is no underlying `_np` object"""
    np = self._np
    if np is None:
        return None
    return np.genitive
216 |
--------------------------------------------------------------------------------
/src/reynir/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/src/reynir/py.typed
--------------------------------------------------------------------------------
/test/test_matcher.py:
--------------------------------------------------------------------------------
1 | # type: ignore
2 | """
3 |
4 | test_matcher.py
5 |
6 | Tests for the SimpleTree matching functionality in matcher.py
7 |
8 | Copyright © 2023 by Miðeind ehf.
9 | Original author: Vilhjálmur Þorsteinsson
10 |
11 | This software is licensed under the MIT License:
12 |
13 | Permission is hereby granted, free of charge, to any person
14 | obtaining a copy of this software and associated documentation
15 | files (the "Software"), to deal in the Software without restriction,
16 | including without limitation the rights to use, copy, modify, merge,
17 | publish, distribute, sublicense, and/or sell copies of the Software,
18 | and to permit persons to whom the Software is furnished to do so,
19 | subject to the following conditions:
20 |
21 | The above copyright notice and this permission notice shall be
22 | included in all copies or substantial portions of the Software.
23 |
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 |
32 | """
33 |
34 | import pytest
35 |
36 | from tokenizer.definitions import AmountTuple, DateTimeTuple
37 |
38 | from reynir import Greynir
39 | from reynir.reynir import Terminal
40 |
41 |
@pytest.fixture(scope="module")
def r():
    """Module-scoped fixture: create a single Greynir instance, hand it
    to the tests, and call the class-level cleanup() during teardown"""
    engine = Greynir()
    yield engine
    # Everything after the yield is the fixture's teardown
    engine.__class__.cleanup()
49 |
50 |
51 | def test_matcher(r: Greynir, verbose: bool = False) -> None:
52 |
53 | s = r.parse_single("Hún á heiðurinn að þessu.")
54 | m = list(
55 | s.tree.all_matches(
56 | "( "
57 | "VP > [ .* VP > { ( 'eiga'|'fá'|'hljóta' ) } .* NP-OBJ > { 'heiður' PP > { 'að' } } ] "
58 | "| "
59 | "VP > [ .* VP > { ( 'eiga'|'fá'|'hljóta' ) } .* NP-OBJ > { 'heiður' } PP > { 'að' } ] "
60 | ") "
61 | )
62 | )
63 | assert len(m) == 1
64 |
65 | # Simple condition, correct sentence (vh in both subtrees)
66 | s = r.parse_single("Ég hefði farið út ef Jón hefði hegðað sér vel.")
67 | m = list(
68 | s.tree.all_matches(
69 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}"
70 | )
71 | )
72 | assert len(m) == 0
73 |
74 | # Simple condition, incorrect sentence (fh in conditional subtree)
75 | s = r.parse_single("Ég hefði farið út ef Jón hafði hegðað sér vel.")
76 | m = list(
77 | s.tree.all_matches(
78 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}"
79 | )
80 | )
81 | assert len(m) == 1
82 |
83 | # Complex condition, incorrect sentence (fh in complex subsentence, fh in conditional subtree)
84 | s = r.parse_single(
85 | "Ég hefði farið út ef Jón, sem Anna elskaði heitt, hafði hegðað sér vel."
86 | )
87 | # There are two potential attachments of the CP-ADV-COND subtree
88 | m = list(
89 | s.tree.all_matches(
90 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}"
91 | )
92 | ) + list(
93 | s.tree.all_matches(
94 | " IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}"
95 | )
96 | )
97 | assert len(m) == 1
98 |
99 | # Complex condition, incorrect sentence (vh in complex subsentence, fh in conditional subtree)
100 | s = r.parse_single(
101 | "Ég hefði farið út ef Jón, sem Anna hefði elskað heitt, hafði hegðað sér vel."
102 | )
103 | # There are two potential attachments of the CP-ADV-COND subtree
104 | m = list(
105 | s.tree.all_matches(
106 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}"
107 | )
108 | ) + list(
109 | s.tree.all_matches(
110 | "IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}"
111 | )
112 | )
113 | assert len(m) == 1
114 |
115 | # Complex condition, correct sentence (fh in complex subsentence, vh in conditional subtree)
116 | s = r.parse_single(
117 | "Ég hefði farið út ef Jón, sem Anna elskaði heitt, hefði hegðað sér vel."
118 | )
119 | # There are two potential attachments of the CP-ADV-COND subtree
120 | m = list(
121 | s.tree.all_matches(
122 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}"
123 | )
124 | ) + list(
125 | s.tree.all_matches(
126 | "IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}"
127 | )
128 | )
129 | assert len(m) == 0
130 |
131 | # Complex condition, correct sentence (vh in complex subsentence, vh in conditional subtree)
132 | s = r.parse_single(
133 | "Ég hefði farið út ef Jón, sem Anna hefði elskað heitt, hefði hegðað sér vel."
134 | )
135 | # There are two potential attachments of the CP-ADV-COND subtree
136 | m = list(
137 | s.tree.all_matches(
138 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}"
139 | )
140 | ) + list(
141 | s.tree.all_matches(
142 | "IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}"
143 | )
144 | )
145 | assert len(m) == 0
146 |
--------------------------------------------------------------------------------
/test/test_no_multiply_numbers.py:
--------------------------------------------------------------------------------
1 | # type: ignore
2 | """
3 |
4 | test_no_multiply_numbers.py
5 |
6 | Tests for Greynir no_multiply_numbers flag functionality
7 |
8 | Copyright © 2023 by Miðeind ehf.
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 | """
32 |
33 | import pytest
34 |
35 | from reynir import Greynir
36 |
37 | # Import tests from other files directly into namespace
38 | # (they get run again with the new Greynir instance from the r function below)
39 | # in order to see if flag affects other functionality than just written numbers
40 | from test_cases import test_addresses, test_cases, test_noun_phrases
41 | from test_matcher import test_matcher
42 | from test_original import test_original
43 |
44 | # Too many to comfortably write, instead we
45 | # overwrite the only affected tests and the function r
46 | from test_parse import *
47 | from test_reynir import (
48 | test_augment_terminal,
49 | test_auto_uppercase,
50 | test_compounds,
51 | test_compounds_with_numbers,
52 | test_lemmas,
53 | test_names,
54 | test_sentence_split,
55 | )
56 | from test_serializers import test_annotree, test_serializers
57 |
58 |
@pytest.fixture(scope="module")
def r():
    """Module-scoped fixture: a Greynir instance created with
    no_multiply_numbers=True, cleaned up during teardown"""
    engine = Greynir(no_multiply_numbers=True)
    yield engine
    # Everything after the yield is the fixture's teardown
    engine.__class__.cleanup()
66 |
67 |
def check_terminal(t, text, lemma, category, variants):
    """Assert that terminal `t` has the given text, lemma, category
    and set of variants (the order of the variants is irrelevant).

    The categories "töl" and "to" are treated leniently, see below.
    """
    assert t.text == text
    assert t.lemma == lemma
    if category == "töl":
        # Undeclinable number words: some of them have both "no" and
        # "töl" categories in BÍN, so accept either one - and don't
        # look at the variants at all
        assert t.category in ("no", "töl")
        return
    if category == "to":
        # Declinable number words have both "no" and "to"
        # categories in BÍN, so accept either one
        assert t.category in ("no", "to")
    else:
        assert t.category == category
    assert set(t.variants) == set(variants)
84 |
85 |
# Override the tests pulled in by the star import from test_parse that
# involve numbers and assume the no_multiply_numbers flag is not set:
# rebinding their names to a no-op makes them pass trivially in this module
test_amounts = test_terminals = test_single = lambda r: None
88 |
89 |
def test_no_multiply_numbers(r: Greynir):
    """Test no_multiply_numbers flag

    Each case consists of a sentence, the number of terminals expected
    in its parse, and (index, text, lemma, category, variants) tuples
    for the terminals that are checked via check_terminal().
    """
    cases = [
        (
            "Tjónið nam 10 milljörðum króna.",
            6,
            [
                (2, "10", "10", "tala", ["þgf", "kk", "ft"]),
                (3, "milljörðum", "milljarður", "no", ["þgf", "kk", "ft"]),
                (4, "króna", "króna", "no", ["ef", "kvk", "ft"]),
            ],
        ),
        (
            "Tjónið þann 22. maí nam einum milljarði króna.",
            8,
            [
                (4, "einum", "einn", "to", ["et", "þgf", "kk"]),
                (5, "milljarði", "milljarður", "no", ["et", "þgf", "kk"]),
                (6, "króna", "króna", "no", ["ft", "ef", "kvk"]),
            ],
        ),
        (
            "Tjónið nam tuttugu og einum milljarði króna.",
            8,
            [
                (2, "tuttugu", "tuttugu", "töl", []),
                (4, "einum", "einn", "to", ["et", "þgf", "kk"]),
                (5, "milljarði", "milljarður", "no", ["et", "þgf", "kk"]),
                (6, "króna", "króna", "no", ["ft", "ef", "kvk"]),
            ],
        ),
        (
            "Fjöldi stjarna í Vetrarbrautinni skiptir hundruðum milljarða.",
            8,
            [
                (5, "hundruðum", "hundrað", "no", ["ft", "þgf", "hk"]),
                (6, "milljarða", "milljarður", "no", ["ft", "ef", "kk"]),
            ],
        ),
        (
            "Sex hundruð áttatíu og þrír leikmenn mættu á blakmótið.",
            10,
            [
                (0, "Sex", "sex", "töl", []),
                (1, "hundruð", "hundrað", "no", ["ft", "hk", "nf"]),
                (2, "áttatíu", "áttatíu", "töl", []),
                (3, "og", "og", "st", []),
                (4, "þrír", "þrír", "to", ["ft", "kk", "nf"]),
            ],
        ),
        (
            "Tjónið nam tólf hundruðum punda.",
            6,
            [
                (2, "tólf", "tólf", "töl", []),
                (3, "hundruðum", "hundrað", "no", ["ft", "þgf", "hk"]),
                (4, "punda", "pund", "no", ["ft", "ef", "hk"]),
            ],
        ),
        (
            "Sjötíu þúsund manns söfnuðust fyrir á torginu.",
            8,
            [
                (0, "Sjötíu", "sjötíu", "töl", ["ft", "nf", "hk"]),
                # The choice between töl and no seems a bit random
                (1, "þúsund", "þúsund", "no", ["ft", "nf", "hk"]),
            ],
        ),
        (
            "7 milljón borðtenniskúlur.",
            4,
            [
                (0, "7", "7", "tala", ["kvk", "ft", "nf"]),
                (1, "milljón", "milljón", "töl", []),  # ["kvk", "ft", "nf"]
            ],
        ),
        (
            "Árið áttatíu þúsund sextíu og tvö er í framtíðinni.",
            10,
            [
                (1, "áttatíu", "áttatíu", "töl", ["ft", "nf", "hk"]),
                (2, "þúsund", "þúsund", "töl", ["ft", "nf", "hk"]),
                (3, "sextíu", "sextíu", "töl", ["ft", "nf", "hk"]),
                (5, "tvö", "tveir", "to", ["ft", "nf", "hk"]),
            ],
        ),
        (
            "Árið átján hundruð níutíu og þrjú er í fortíðinni.",
            10,
            [
                (1, "átján", "átján", "töl", ["ft", "nf", "hk"]),
                (2, "hundruð", "hundrað", "no", ["ft", "nf", "hk"]),
                (3, "níutíu", "níutíu", "töl", ["ft", "nf", "hk"]),
                (5, "þrjú", "þrír", "to", ["ft", "nf", "hk"]),
            ],
        ),
        (
            "Tvö hundruð þúsund og þrír leikmenn mættu á blakmótið.",
            10,
            [
                (0, "Tvö", "tveir", "to", ["ft", "hk", "nf"]),
                (1, "hundruð", "hundrað", "to", ["ft", "hk", "nf"]),
                (2, "þúsund", "þúsund", "töl", ["ft", "hk", "nf"]),
                (3, "og", "og", "st", []),
                (4, "þrír", "þrír", "to", ["ft", "kk", "nf"]),
            ],
        ),
        (
            "Þúsundir mættu á blakmótið.",
            5,
            [
                (0, "Þúsundir", "þúsund", "no", ["ft", "kvk", "nf"]),
            ],
        ),
    ]

    for sentence, terminal_count, expected in cases:
        s = r.parse_single(sentence)
        assert s is not None
        t = s.terminals or []
        assert len(t) == terminal_count
        for ix, text, lemma, category, variants in expected:
            check_terminal(
                t[ix],
                text=text,
                lemma=lemma,
                category=category,
                variants=variants,
            )
418 |
--------------------------------------------------------------------------------
/test/test_original.py:
--------------------------------------------------------------------------------
1 | # type: ignore
2 | """
3 |
4 | test_original.py
5 |
6 | Tests for Greynir module
7 |
8 | Copyright © 2023 Miðeind ehf.
9 | Original author: Vilhjálmur Þorsteinsson
10 |
11 | This software is licensed under the MIT License:
12 |
13 | Permission is hereby granted, free of charge, to any person
14 | obtaining a copy of this software and associated documentation
15 | files (the "Software"), to deal in the Software without restriction,
16 | including without limitation the rights to use, copy, modify, merge,
17 | publish, distribute, sublicense, and/or sell copies of the Software,
18 | and to permit persons to whom the Software is furnished to do so,
19 | subject to the following conditions:
20 |
21 | The above copyright notice and this permission notice shall be
22 | included in all copies or substantial portions of the Software.
23 |
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 |
32 | """
33 |
34 | import pytest
35 |
36 | from reynir import Greynir
37 | from reynir.bintokenizer import tokenize
38 |
39 |
@pytest.fixture(scope="module")
def r():
    """Module-scoped fixture: create a single Greynir instance, hand it
    to the tests, and call the class-level cleanup() during teardown"""
    engine = Greynir()
    yield engine
    # Everything after the yield is the fixture's teardown
    engine.__class__.cleanup()
47 |
48 |
def test_original(r: Greynir) -> None:
    """Verify that the `original` attributes of the tokens produced by
    tokenize() add up, in length, to the length of the input text.

    The fixture `r` is requested but the tokenizer is called directly.
    """
    texts = [
        "Ég keypti 1000 EUR þann 23. 5. 2011 og græddi 10,5 % .",
        " Friðjón Pálsson hitti Friðbert \tJ. Ástráðsson í gær.",
        " \t Casey Holdman \n og Luke Skywalker fóru saman á bar .",
        " Hver á USD 5,75 sem ég fann í grasinu með 5,558 prósent?",
        " Virkjunin var \t 600 MW og var á Reynimel 40C í Reykjavík .",
        " Katrín Júlíusdóttir var iðnaðar- \n\t og \t\t viðskiptaráðherra",
        " Friðbert Marsillíus Jónsson keypti hlutabréf í Eimskip hf. fyrir 100 milljónir í gær",
        " Jens \tStoltenberg keypti hlutabréf nú síðdegis fyrir 100 milljónir króna kl. 12:30 30. júlí 2002 og Jens er stoltur af því.",
        "Gengi danskrar krónu féll um 2.000 EUR kl. 14:00 30. desember .",
        "Dómsmála- , iðnaðar- og viðskiptaráðherra gerði víðreist um landið",
        " Dagur Bergþóruson Eggertsson hefur verið farsæll borgarstjóri .",
        " Formaður framkvæmdastjórnarinnar er Ursula \t\t van der Leyen .",
        " Angela Merkel hefur lengi vel verið kanslari V-Þýskalands .",
    ]
    for s in texts:
        tlist = list(tokenize(s))
        # A token without an original text contributes zero characters
        total = 0
        for t in tlist:
            total += len(t.original or "")
        assert total == len(s)
102 |
103 |
if __name__ == "__main__":
    # When this file is run directly as a script rather than through pytest,
    # no fixture is available: create the Greynir instance by hand, run the
    # test once, and then call the class-level cleanup() ourselves
    from reynir import Greynir

    greynir = Greynir()
    test_original(greynir)
    greynir.__class__.cleanup()
111 |
--------------------------------------------------------------------------------
/test/test_serializers.py:
--------------------------------------------------------------------------------
1 | # type: ignore
2 | """
3 |
4 | test_serializers.py
5 |
6 | Tests for JSON serialization of sentences
7 |
8 | Copyright © 2023 by Miðeind ehf.
9 |
10 | This software is licensed under the MIT License:
11 |
12 | Permission is hereby granted, free of charge, to any person
13 | obtaining a copy of this software and associated documentation
14 | files (the "Software"), to deal in the Software without restriction,
15 | including without limitation the rights to use, copy, modify, merge,
16 | publish, distribute, sublicense, and/or sell copies of the Software,
17 | and to permit persons to whom the Software is furnished to do so,
18 | subject to the following conditions:
19 |
20 | The above copyright notice and this permission notice shall be
21 | included in all copies or substantial portions of the Software.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 |
31 | """
32 |
33 | import json
34 |
35 | import pytest
36 |
37 |
@pytest.fixture(scope="module")
def r():
    """Module-scoped fixture: create a single Greynir instance, hand it
    to the tests, and call the class-level cleanup() during teardown"""
    # Greynir is imported here, inside the fixture, not at module level
    from reynir import Greynir

    engine = Greynir()
    yield engine
    # Everything after the yield is the fixture's teardown
    engine.__class__.cleanup()
47 |
48 |
49 | def test_serializers(r):
50 | sents = [
51 | "Ég fór niðrá bryggjuna með Reyni Vilhjálmssyni í gær.",
52 | "Það var 17. júní árið 2020.",
53 | "Við sáum tvo seli og örugglega fleiri en 100 máva.",
54 | "Klukkan var orðin tólf þegar við fórum heim.",
55 | "Bíllinn kostaði €30.000 en ég greiddi 25500 USD fyrir hann.",
56 | "Morguninn eftir vaknaði ég kl. 07:30.",
57 | "Ég var fyrstur á fætur en Þuríður Hálfdánardóttir var númer 2.",
58 | ]
59 | for sent in sents:
60 | orig = r.parse_single(sent)
61 | assert orig.tree is not None
62 |
63 | json_str = r.dumps_single(orig, indent=2)
64 | new = r.loads_single(json_str)
65 |
66 | assert new.tree is not None
67 |
68 | assert all(ot.equal(nt) for ot, nt in zip(orig.tokens, new.tokens))
69 | assert orig.terminals == new.terminals
70 |
71 | assert orig.tree.flat_with_all_variants == orig.tree.flat_with_all_variants
72 | cls = r.__class__
73 | assert json.loads(orig.dumps(cls, indent=2)) == json.loads(
74 | new.dumps(cls, indent=2)
75 | )
76 |
77 |
def test_annotree():
    """Build an AnnoTree from an annotated tree in text form, convert it
    to a simple tree and check the text, nouns and verbs of the result"""
    # The annotated tree: a META section followed by the sentence tree (S0),
    # where each terminal is followed by its word form and a (lemma ...) node
    s = """
(META (ID-CORPUS 43bf66f3-51c4-11e6-8438-04014c605401.10)
(ID-LOCAL greynir_corpus_00003.psd,.1)
(URL http://www.mbl.is/sport/efstadeild/2016/07/24/ia_ibv_stadan_er_1_0/))
(S0 (S-HEADING (IP (NP-SUBJ (fn_ft_kk_nf Engir (lemma enginn))
(no_ft_kk_nf atburðir (lemma atburður)))
(NP-PRD (VP (so_ft_kk_lhþt_nf_sb skráðir (lemma skrá))))
(ADVP (ao enn (lemma enn))))))

"""
    from reynir.simpletree import AnnoTree

    atree = AnnoTree(s)
    stree = atree.as_simple_tree()
    assert stree is not None
    # The text consists of the word forms of the terminals, in order
    assert stree.text == "Engir atburðir skráðir enn"
    assert stree.tidy_text == "Engir atburðir skráðir enn"
    # Nouns and verbs are given by their lemmas
    assert stree.nouns == ["atburður"]
    assert stree.verbs == ["skrá"]
98 |
99 |
if __name__ == "__main__":
    # When this file is run directly as a script rather than through pytest,
    # no fixture is available: create the Greynir instance by hand, run the
    # serialization test once, and then call the class-level cleanup()
    from reynir import Greynir

    g = Greynir()
    test_serializers(g)
    g.__class__.cleanup()
107 |
--------------------------------------------------------------------------------