├── .gitattributes ├── .github └── workflows │ ├── python-package.yml │ └── wheels.yml ├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── doc ├── _static │ ├── GreynirLogo220.png │ ├── GreynirLogo400.png │ ├── GreynirTreeExample.png │ ├── GreynirTreeExampleSmall.png │ ├── GreynirTreeExampleSmall2.png │ ├── LitlaGula.png │ ├── MideindLogoVert100.png │ ├── MideindLogoVert300.png │ ├── MideindLogoVert400.png │ ├── ReynirLogo216.png │ ├── ReynirLogo400.png │ ├── annotation_instructions.pdf │ ├── custom.css │ ├── favicon.ico │ ├── greynir-favicon-32x32.png │ ├── greynir-logo-large.png │ └── mideind-horizontal-small.png ├── conf.py ├── copyright.rst ├── index.rst ├── installation.rst ├── nonterminals.rst ├── overview.rst ├── patterns.rst ├── quickstart.rst ├── reference.rst ├── simpletree.rst └── terminals.rst ├── old ├── build_wheels.sh ├── release.sh └── wheels.sh ├── setup.py ├── src └── reynir │ ├── Greynir.grammar │ ├── __init__.py │ ├── _eparser.cpp │ ├── baseparser.py │ ├── basics.py │ ├── bindb.py │ ├── binparser.py │ ├── bintokenizer.py │ ├── cache.py │ ├── config │ ├── Abbrev_errors.conf │ ├── AdjectivePredicates.conf │ ├── Adjectives.conf │ ├── GreynirEngine.conf │ ├── Names.conf │ ├── NounPredicates.conf │ ├── Phrases.conf │ ├── Prefs.conf │ ├── Prepositions.conf │ └── Verbs.conf │ ├── eparser.cpp │ ├── eparser.h │ ├── eparser_build.py │ ├── fastparser.py │ ├── glock.py │ ├── grammar.py │ ├── ifdtagger.py │ ├── incparser.py │ ├── lemmatize.py │ ├── matcher.py │ ├── nounphrase.py │ ├── py.typed │ ├── reducer.py │ ├── resources │ ├── ord.auka.csv │ └── systematic_additions.csv │ ├── reynir.py │ ├── settings.py │ ├── simpletree.py │ └── verbframe.py └── test ├── test_cases.py ├── test_matcher.py ├── test_no_multiply_numbers.py ├── test_original.py ├── test_parse.py ├── test_reynir.py └── test_serializers.py /.gitattributes: -------------------------------------------------------------------------------- 1 | src/reynir/resources/ordalisti-*.bin filter=lfs diff=lfs merge=lfs -text 2 | src/reynir/resources/ord.compressed filter=lfs diff=lfs merge=lfs -text 3 | 4 | # Set the default line ending behavior to auto 5 | * text=auto 6 | 7 | # Source files should only have LF endings 8 | *.py text eol=lf 9 | *.c text eol=lf 10 | *.h text eol=lf 11 | *.cpp text eol=lf 12 | *.hpp text eol=lf 13 | *.csv text eol=lf 14 | *.grammar text eol=lf 15 | *.yml text eol=lf 16 | *.sh text eol=lf 17 | *.rst text eol=lf 18 | *.md text eol=lf 19 | *.in text eol=lf 20 | *.conf text eol=lf 21 | 22 | # Declare files that will always have CRLF line endings on checkout 23 | *.sln text eol=crlf 24 | *.bat text eol=crlf 25 | 26 | # Denote all files that are truly binary and should not be modified 27 | *.png binary 28 | *.jpg binary 29 | *.bin binary 30 | *.compressed binary 31 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [ "*" ] 6 | pull_request: 7 | branches: [ "*" ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install GreynirEngine 25 | run: | 26 | python -m pip install --upgrade pip wheel setuptools pytest 27 | python -m pip install -e . 28 | - name: Test with pytest 29 | run: | 30 | python -m pytest 31 | - name: Slack notification 32 | uses: 8398a7/action-slack@v3 33 | with: 34 | status: ${{ job.status }} 35 | author_name: Integration Testing (Python ${{ matrix.python-version }}) 36 | env: 37 | SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} 38 | if: failure() # Pick up event if the job fails 39 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: wheels 2 | 3 | on: 4 | push: 5 | tags: 6 | - '**' 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [macos-12, ubuntu-latest, windows-latest] 15 | 16 | steps: 17 | # Check out repository using git-lfs 18 | - uses: actions/checkout@v4 19 | with: 20 | lfs: true 21 | 22 | # Python used to run cibuildwheel 23 | - uses: actions/setup-python@v5 24 | with: 25 | python-version: '3.10' 26 | 27 | - name: Install cibuildwheel 28 | run: python -m pip install --upgrade pip wheel setuptools cibuildwheel 29 | 30 | - name: Build wheels 31 | run: python -m cibuildwheel --output-dir wheelhouse 32 | # Options (https://cibuildwheel.readthedocs.io/en/stable/options/) 33 | env: 34 | CIBW_SKIP: cp36-* cp37-* cp38-* *pp37-* pp38-* *musllinux* 35 | CIBW_BEFORE_BUILD_MACOS: python3 -m pip install --upgrade setuptools wheel cffi 36 | CIBW_ARCHS_MACOS: "x86_64 arm64" 37 | CIBW_ARCHS_WINDOWS: "AMD64" 38 | CIBW_ARCHS_LINUX: "x86_64" 39 | CIBW_BUILD_FRONTEND: "build" 40 | # CIBW_PROJECT_REQUIRES_PYTHON: ">=3.9" 41 | 42 | - uses: actions/upload-artifact@v3 43 | with: 44 | path: ./wheelhouse/*.whl 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .cache/ 4 | .pytest_cache/ 5 | .mypy_cache/ 6 | .idea/ 7 | .vscode/ 8 | *.py[cod] 9 | *.o 10 | *.obj 11 | *.exp 12 | *.lib 13 | *.so 14 | *.exe 15 | 16 | # Compressed DAWG trees 17 | *.dawg 18 | 19 | # Pickled DAWG trees 20 | *.pickle 21 | 22 | # Training data files 23 | *.gz 24 | *.zip 25 | 26 | # Experimental Python stuff 27 | test.py 28 | *.new.py 29 | *.old.py 30 | 31 | # Mypy 32 | mypy.ini 33 | 34 | # Doc stuff 35 | doc/Makefile 36 | doc/make.bat 37 | doc/_build/ 38 | 39 | # Various resource files 40 | src/reynir/resources/*.csv 41 | src/reynir/resources/*.tsv 42 | src/reynir/resources/*.txt 43 | src/reynir/resources/*.py 44 | !src/reynir/resources/ord.auka.csv 45 | !src/reynir/resources/systematic_additions.csv 46 | src/reynir/resources/extra/ 47 | 48 | # Test files 49 | test/test_corpus/handpsd/annotaldLog.txt 50 | 51 | # Scratch work files 52 | src/reynir/_eparser.cpp 53 | *.bin 54 | !ordalisti-*.dawg.bin 55 | 56 | *.sublime-project 57 | *.sublime-workspace 58 | *.code-workspace 59 | *.bak 60 | *.profile 61 | *.log 62 | t 63 | txt 64 | input.txt 65 | nohup.out 66 | 67 | # Distribution / packaging 68 | .Python 69 | env/ 70 | bin/ 71 | build/ 72 | develop-eggs/ 73 | dist/ 74 | eggs/ 75 | lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | .eggs/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | log/ 85 | console/ 86 | 87 | # Virtual environments 88 | venv 89 | p35/ 90 | p358/ 91 | p359/ 92 | p37/ 93 | pypy* 94 | 95 | # Installer logs 96 | pip-log.txt 97 | pip-delete-this-directory.txt 98 | deploy_done.py 99 | 100 | # Windows image file caches 101 | Thumbs.db 102 | ehthumbs.db 103 | 104 | # Folder config file 105 | Desktop.ini 106 | 107 | # Recycle Bin used on file shares 108 | $RECYCLE.BIN/ 109 | 110 | # Windows Installer files 111 | *.cab 112 | *.msi 113 | *.msm 114 | *.msp 115 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Greynir is *copyright © 2016-2024 by Miðeind ehf.* 3 | The original author of this software is *Vilhjálmur Þorsteinsson*. 4 | 5 | This software is licensed under the MIT License: 6 | 7 | Permission is hereby granted, free of charge, to any person 8 | obtaining a copy of this software and associated documentation 9 | files (the "Software"), to deal in the Software without restriction, 10 | including without limitation the rights to use, copy, modify, merge, 11 | publish, distribute, sublicense, and/or sell copies of the Software, 12 | and to permit persons to whom the Software is furnished to do so, 13 | subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft src 2 | prune src/reynir/__pycache__ 3 | prune src/reynir/.mypy_cache 4 | prune test/test_corpus 5 | include src/reynir/Greynir.grammar 6 | exclude src/reynir/Greynir.*.bin 7 | include src/reynir/eparser.h 8 | exclude src/reynir/_eparser.cpp 9 | include src/reynir/config/*.conf 10 | exclude src/reynir/resources/*.csv 11 | exclude src/reynir/resources/*.txt 12 | exclude src/reynir/resources/.DS_Store 13 | exclude src/reynir/*.o 14 | exclude src/reynir/*.so 15 | exclude src/reynir/.DS_Store 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 2 | [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-3817/) 3 | ![Release](https://shields.io/github/v/release/mideind/GreynirEngine?display_name=tag) 4 | ![PyPI](https://img.shields.io/pypi/v/reynir) 5 | [![Build](https://github.com/mideind/GreynirEngine/actions/workflows/python-package.yml/badge.svg)]() 6 | 7 | Greynir 8 | 9 | # GreynirEngine 10 | 11 | **A fast, efficient natural language processing engine for Icelandic** 12 | 13 | ## Overview 14 | 15 | Greynir is a Python 3 (>=3.9) package, 16 | published by [Miðeind ehf.](https://mideind.is), for 17 | **working with Icelandic natural language text**. 18 | Greynir can parse text into **sentence trees**, find **lemmas**, 19 | inflect **noun phrases**, assign **part-of-speech tags** and much more. 20 | 21 | Greynir's sentence trees can *inter alia* be used to extract 22 | information from text, for instance about people, titles, entities, facts, 23 | actions and opinions. 24 | 25 | Full documentation for Greynir is [available here](https://greynir.is/doc/). 26 | 27 | Greynir is the engine of [Greynir.is](https://greynir.is), 28 | a natural-language front end for a database of over 10 million 29 | sentences parsed from Icelandic news articles, and 30 | [Embla](https://embla.is), a voice-driven virtual assistant app 31 | for smart devices such as iOS and Android phones. 32 | 33 | Greynir includes a hand-written 34 | [context-free grammar](https://github.com/mideind/GreynirEngine/blob/master/src/reynir/Greynir.grammar) 35 | for the Icelandic language, consisting of over 7,000 lines of grammatical 36 | productions in [extended Backus-Naur format](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form). 37 | Its fast C++ parser core is able to cope with long and ambiguous sentences, 38 | using an [Earley-type parser](https://en.wikipedia.org/wiki/Earley_parser) 39 | as [enhanced by Scott and Johnstone](https://www.sciencedirect.com/science/article/pii/S0167642309000951). 40 | 41 | Greynir employs the [Tokenizer](https://pypi.org/project/tokenizer/) package, 42 | by the same authors, to tokenize text, and 43 | uses [BinPackage](https://pypi.org/project/islenska/) as its database of 44 | Icelandic vocabulary and morphology. 45 | 46 | ## Examples 47 | 48 | ### Use Greynir to easily inflect noun phrases 49 | 50 | ````python 51 | from reynir import NounPhrase as Nl 52 | 53 | # Create a NounPhrase ('nafnliður') object 54 | karfa = Nl("þrír lúxus-miðar á Star Wars og tveir brimsaltir pokar af poppi") 55 | 56 | # Print the NounPhrase in the correct case for each context 57 | # (þf=þolfall/accusative, þgf=þágufall/dative). Note that 58 | # the NounPhrase class implements __format__(), allowing you 59 | # to use the case as a format specification, for instance in f-strings. 60 | 61 | print(f"Þú keyptir {karfa:þf}.") 62 | print(f"Hér er kvittunin þín fyrir {karfa:þgf}.") 63 | ```` 64 | 65 | The program outputs the following text, correctly inflected: 66 | 67 | ````text 68 | Þú keyptir þrjá lúxus-miða á Star Wars og tvo brimsalta poka af poppi. 69 | Hér er kvittunin þín fyrir þremur lúxus-miðum á Star Wars og tveimur brimsöltum pokum af poppi. 70 | ```` 71 | 72 | ### Use Greynir to parse a sentence 73 | 74 | ````python 75 | >>> from reynir import Greynir 76 | >>> g = Greynir() 77 | >>> sent = g.parse_single("Ása sá sól.") 78 | >>> print(sent.tree.view) 79 | P # Root 80 | +-S-MAIN # Main sentence 81 | +-IP # Inflected phrase 82 | +-NP-SUBJ # Noun phrase, subject 83 | +-no_et_nf_kvk: 'Ása' # Noun, singular, nominative, feminine 84 | +-VP # Verb phrase containing arguments 85 | +-VP # Verb phrase containing verb 86 | +-so_1_þf_et_p3: 'sá' # Verb, 1 accusative arg, singular, 3rd p 87 | +-NP-OBJ # Noun phrase, object 88 | +-no_et_þf_kvk: 'sól' # Noun, singular, accusative, feminine 89 | +-'.' # Punctuation 90 | >>> sent.tree.nouns 91 | ['Ása', 'sól'] 92 | >>> sent.tree.verbs 93 | ['sjá'] 94 | >>> sent.tree.flat 95 | 'P S-MAIN IP NP-SUBJ no_et_nf_kvk /NP-SUBJ VP so_1_þf_et_p3 96 | NP-OBJ no_et_þf_kvk /NP-OBJ /VP /IP /S-MAIN p /P' 97 | >>> # The subject noun phrase (S.IP.NP also works) 98 | >>> sent.tree.S.IP.NP_SUBJ.lemmas 99 | ['Ása'] 100 | >>> # The verb phrase 101 | >>> sent.tree.S.IP.VP.lemmas 102 | ['sjá', 'sól'] 103 | >>> # The object within the verb phrase (S.IP.VP.NP also works) 104 | >>> sent.tree.S.IP.VP.NP_OBJ.lemmas 105 | ['sól'] 106 | ```` 107 | 108 | ## Prerequisites 109 | 110 | This package runs on CPython 3.9 or newer, and on PyPy 3.9 or newer. 111 | 112 | To find out which version of Python you have, enter: 113 | 114 | ````sh 115 | python --version 116 | ```` 117 | 118 | If a binary wheel package isn't available on [PyPI](https://pypi.org>) 119 | for your system, you may need to have the `python3-dev` package 120 | (or its Windows equivalent) installed on your 121 | system to set up Greynir successfully. This is 122 | because a source distribution install requires a C++ compiler and linker: 123 | 124 | ````sh 125 | # Debian or Ubuntu 126 | sudo apt-get install python3-dev 127 | ```` 128 | 129 | Depending on your system, you may also need to install `libffi-dev`: 130 | 131 | ````sh 132 | # Debian or Ubuntu 133 | sudo apt-get install libffi-dev 134 | ```` 135 | 136 | ## Installation 137 | 138 | To install this package, assuming Python 3 is your default Python: 139 | 140 | ````sh 141 | pip install reynir 142 | ```` 143 | 144 | If you have **git** installed and want to be able to edit 145 | the source, do like so: 146 | 147 | ````sh 148 | git clone https://github.com/mideind/GreynirEngine 149 | cd GreynirEngine 150 | # [ Activate your virtualenv here if you have one ] 151 | pip install -e . 152 | ```` 153 | 154 | The package source code is in `GreynirEngine/src/reynir`. 155 | 156 | ## Tests 157 | 158 | To run the built-in tests, install [pytest](https://docs.pytest.org/en/latest), 159 | `cd` to your `GreynirEngine` subdirectory (and optionally activate your 160 | virtualenv), then run: 161 | 162 | ````sh 163 | python -m pytest 164 | ```` 165 | 166 | ## Evaluation 167 | 168 | A parsing test pipeline for different parsing schemas, including the Greynir schema, 169 | has been developed. It is available [here](https://github.com/mideind/ParsingTestPipe). 170 | 171 | ## Documentation 172 | 173 | Please consult [Greynir's documentation](https://greynir.is/doc/) for detailed 174 | [installation instructions](https://greynir.is/doc/installation.html), 175 | a [quickstart guide](https://greynir.is/doc/quickstart.html), 176 | and [reference information](https://greynir.is/doc/reference.html), 177 | as well as important information about 178 | [copyright and licensing](https://greynir.is/doc/copyright.html). 179 | 180 | ## Troubleshooting 181 | 182 | If parsing seems to hang, it is possible that a lock file that GreynirEngine 183 | uses has been left locked. This can happen if a Python process that uses 184 | GreynirEngine is killed abruptly. The solution is to delete the lock file 185 | and try again: 186 | 187 | On Linux and macOS: 188 | 189 | ````sh 190 | rm /tmp/greynir-grammar # May require sudo privileges 191 | ```` 192 | 193 | On Windows: 194 | 195 | ````cmd 196 | del %TEMP%\greynir-grammar 197 | ```` 198 | 199 | ## Copyright and licensing 200 | 201 | Greynir is Copyright © 2016-2024 by [Miðeind ehf.](https://mideind.is). 202 | The original author of this software is *Vilhjálmur Þorsteinsson*. 203 | 204 | Miðeind ehf. 206 | 207 | This software is licensed under the **MIT License**: 208 | 209 | *Permission is hereby granted, free of charge, to any person* 210 | *obtaining a copy of this software and associated documentation* 211 | *files (the "Software"), to deal in the Software without restriction,* 212 | *including without limitation the rights to use, copy, modify, merge,* 213 | *publish, distribute, sublicense, and/or sell copies of the Software,* 214 | *and to permit persons to whom the Software is furnished to do so,* 215 | *subject to the following conditions:* 216 | 217 | **The above copyright notice and this permission notice shall be** 218 | **included in all copies or substantial portions of the Software.** 219 | 220 | *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,* 221 | *EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF* 222 | *MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.* 223 | *IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY* 224 | *CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,* 225 | *TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE* 226 | *SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.* 227 | 228 | If you would like to use this software in ways that are incompatible 229 | with the standard MIT license, [contact Miðeind ehf.](mailto:mideind@mideind.is) 230 | to negotiate custom arrangements. 231 | 232 | ---- 233 | 234 | GreynirEngine indirectly embeds the [Database of Icelandic Morphology](https://bin.arnastofnun.is), 235 | ([Beygingarlýsing íslensks nútímamáls](https://bin.arnastofnun.is)), abbreviated BÍN. 236 | GreynirEngine does not claim any endorsement by the BÍN authors or copyright holders. 237 | 238 | The BÍN source data are publicly available under the 239 | [CC BY-SA 4.0 license](https://creativecommons.org/licenses/by-sa/4.0/), as further 240 | detailed [here in English](https://bin.arnastofnun.is/DMII/LTdata/conditions/) 241 | and [here in Icelandic](https://bin.arnastofnun.is/gogn/mimisbrunnur/). 242 | 243 | In accordance with the BÍN license terms, credit is hereby given as follows: 244 | 245 | *Beygingarlýsing íslensks nútímamáls. Stofnun Árna Magnússonar í íslenskum fræðum.* 246 | *Höfundur og ritstjóri Kristín Bjarnadóttir.* 247 | -------------------------------------------------------------------------------- /doc/_static/GreynirLogo220.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirLogo220.png -------------------------------------------------------------------------------- /doc/_static/GreynirLogo400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirLogo400.png -------------------------------------------------------------------------------- /doc/_static/GreynirTreeExample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirTreeExample.png -------------------------------------------------------------------------------- /doc/_static/GreynirTreeExampleSmall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirTreeExampleSmall.png -------------------------------------------------------------------------------- /doc/_static/GreynirTreeExampleSmall2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/GreynirTreeExampleSmall2.png -------------------------------------------------------------------------------- /doc/_static/LitlaGula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/LitlaGula.png -------------------------------------------------------------------------------- /doc/_static/MideindLogoVert100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/MideindLogoVert100.png -------------------------------------------------------------------------------- /doc/_static/MideindLogoVert300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/MideindLogoVert300.png -------------------------------------------------------------------------------- /doc/_static/MideindLogoVert400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/MideindLogoVert400.png -------------------------------------------------------------------------------- /doc/_static/ReynirLogo216.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/ReynirLogo216.png -------------------------------------------------------------------------------- /doc/_static/ReynirLogo400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/ReynirLogo400.png -------------------------------------------------------------------------------- /doc/_static/annotation_instructions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/annotation_instructions.pdf -------------------------------------------------------------------------------- /doc/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* 2 | custom.css 3 | 4 | Copyright © 2023 Miðeind ehf. 5 | See the Greynir GitHub repository at 6 | https://github.com/mideind/GreynirEngine 7 | for copyright and licensing information. 8 | 9 | This style sheet overrides stuff from the standard Sphinx Alabaster 10 | style to make it more compliant with the Mideind/Greynir branding 11 | 12 | */ 13 | 14 | @import url("https://fonts.googleapis.com/css?family=Lato:300,300i,400,400i,700,700i&display=swap"); 15 | 16 | body { 17 | font-weight: 300; 18 | font-size: 16px; 19 | } 20 | 21 | strong { 22 | font-weight: 400; 23 | } 24 | 25 | div.body h1, 26 | div.body h2, 27 | div.body h3, 28 | div.body h4, 29 | div.body h5, 30 | div.body h6 { 31 | font-family: "Lato", "Garamond", "Georgia", serif; 32 | font-weight: normal; 33 | } 34 | 35 | div.body h1 { 36 | font-weight: 700; 37 | font-style: italic; 38 | font-size: 220%; 39 | color: #006eff; 40 | margin-top: 20px; 41 | } 42 | 43 | div.body h2 { 44 | color: #006eff; 45 | font-size: 160%; 46 | margin-top: 42px; 47 | } 48 | 49 | div.body p, 50 | div.body dd, 51 | div.body li { 52 | line-height: 1.5em; 53 | } 54 | 55 | pre, 56 | tt, 57 | code { 58 | font-size: 14px; 59 | line-height: 1.25em; 60 | } 61 | 62 | div.sphinxsidebarwrapper p.blurb { 63 | margin-top: 5px; 64 | margin-bottom: 10px; 65 | font-size: 13px; 66 | } 67 | 68 | div.sphinxsidebar h3, 69 | div.sphinxsidebar h4 { 70 | font-family: "Lato", "Garamond", "Georgia", serif; 71 | font-weight: 400; 72 | font-size: 22px; 73 | color: #006eff; 74 | margin-top: 15px; 75 | margin-bottom: 5px; 76 | } 77 | 78 | div.sphinxsidebar ul li.toctree-l1 > a { 79 | font-size: 110%; 80 | line-height: 1.7; 81 | } 82 | 83 | div.figure p.caption span.caption-text { 84 | font-style: italic; 85 | } 86 | 87 | dl.py.class, 88 | dl.py.method, 89 | dl.py.attribute { 90 | margin-top: 1.5em; 91 | margin-bottom: 0.8em; 92 | } 93 | 94 | dl.py.method dt, 95 | dl.py.attribute dt { 96 | margin-bottom: 0.8em; 97 | padding-left: 2em; 98 | text-indent: -2em; 99 | } 100 | 101 | tt.descname, 102 | tt.descclassname, 103 | code.descname, 104 | code.descclassname { 105 | font-size: 1.05em; 106 | color: #006eff; 107 | } 108 | 109 | dl.field-list > dt { 110 | font-weight: normal; 111 | color: #888888; 112 | } 113 | -------------------------------------------------------------------------------- /doc/_static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/favicon.ico -------------------------------------------------------------------------------- /doc/_static/greynir-favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/greynir-favicon-32x32.png -------------------------------------------------------------------------------- /doc/_static/greynir-logo-large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/greynir-logo-large.png -------------------------------------------------------------------------------- /doc/_static/mideind-horizontal-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/doc/_static/mideind-horizontal-small.png -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Greynir documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Apr 8 01:20:08 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | 21 | from typing import TYPE_CHECKING, Mapping, Any 22 | 23 | import os 24 | from datetime import datetime 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ["_templates"] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = ".rst" 45 | 46 | # The master toctree document. 47 | master_doc = "index" 48 | 49 | # General information about the project. 50 | year = datetime.now().year 51 | project = "Greynir" 52 | copyright = "{0} Miðeind ehf".format(year) 53 | author = "Miðeind ehf." 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | 59 | # Get version string from "../src/reynir/version.py" 60 | basepath, _ = os.path.split(os.path.realpath(__file__)) 61 | version_path = os.path.join(basepath, "..", "src", "reynir", "version.py") 62 | 63 | if TYPE_CHECKING: 64 | __version__ = "" 65 | else: 66 | exec(open(version_path).read()) 67 | 68 | # The full version, including alpha/beta/rc tags. 69 | release = __version__ # pylint: disable=undefined-variable 70 | # The short X.Y version. 71 | version = ".".join(__version__.split(".")[:2]) # pylint: disable=undefined-variable 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = None 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This patterns also effect to html_static_path and html_extra_path 83 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 84 | 85 | # The name of the Pygments (syntax highlighting) style to use. 86 | pygments_style = "sphinx" 87 | 88 | # If true, `todo` and `todoList` produce output, else they produce nothing. 89 | todo_include_todos = False 90 | 91 | 92 | # -- Options for HTML output ---------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | html_theme = "alabaster" 98 | 99 | # Theme options are theme-specific and customize the look and feel of a theme 100 | # further. For a list of options available for each theme, see the 101 | # documentation. 102 | # 103 | html_sidebars = { 104 | "**": ["about.html", "navigation.html", "relations.html", "searchbox.html"] 105 | } 106 | html_theme_options: Mapping[str, Any] = { 107 | "logo": "GreynirLogo400.png", 108 | "logo_name": False, 109 | "logo_text_align": "center", 110 | "description": "Natural Language Processing for Icelandic", 111 | "github_user": "mideind", 112 | "github_repo": "GreynirEngine", 113 | "github_button": True, 114 | "sidebar_collapse": False, 115 | "fixed_sidebar": True, 116 | "font_family": ( 117 | "Lato, Georgia, 'goudy old style', 'minion pro', " 118 | "'bell mt', 'Hiragino Mincho Pro', serif" 119 | ), 120 | } 121 | 122 | # Add any paths that contain custom static files (such as style sheets) here, 123 | # relative to this directory. They are copied after the builtin static files, 124 | # so a file named "default.css" will overwrite the builtin "default.css". 125 | html_static_path = ["_static"] 126 | 127 | # Set the favicon 128 | html_favicon = "_static/greynir-favicon-32x32.png" 129 | 130 | # -- Options for HTMLHelp output ------------------------------------------ 131 | 132 | # Output file base name for HTML help builder. 133 | htmlhelp_basename = "Greynirdoc" 134 | 135 | 136 | # -- Options for LaTeX output --------------------------------------------- 137 | 138 | latex_elements = { 139 | # The paper size ('letterpaper' or 'a4paper'). 140 | "papersize": "a4paper", 141 | # The font size ('10pt', '11pt' or '12pt'). 142 | "pointsize": "10pt", 143 | # Additional stuff for the LaTeX preamble. 144 | "preamble": "", 145 | # Latex figure (float) alignment 146 | "figure_align": "htbp", 147 | } 148 | 149 | # Grouping the document tree into LaTeX files. List of tuples 150 | # (source start file, target name, title, 151 | # author, documentclass [howto, manual, or own class]). 152 | latex_documents = [ 153 | (master_doc, "Greynir.tex", "Greynir Documentation", "Miðeind ehf.", "manual") 154 | ] 155 | 156 | 157 | # -- Options for manual page output --------------------------------------- 158 | 159 | # One entry per manual page. List of tuples 160 | # (source start file, name, description, authors, manual section). 161 | man_pages = [(master_doc, "greynir", "Greynir Documentation", [author], 1)] 162 | 163 | 164 | # -- Options for Texinfo output ------------------------------------------- 165 | 166 | # Grouping the document tree into Texinfo files. List of tuples 167 | # (source start file, target name, title, author, 168 | # dir menu entry, description, category) 169 | texinfo_documents = [ 170 | ( 171 | master_doc, 172 | "Greynir", 173 | "Greynir Documentation", 174 | author, 175 | "Greynir", 176 | "Natural language processing for Icelandic", 177 | "NLP", 178 | ) 179 | ] 180 | -------------------------------------------------------------------------------- /doc/copyright.rst: -------------------------------------------------------------------------------- 1 | .. _copyright: 2 | 3 | Copyright and licensing 4 | ======================= 5 | 6 | .. figure:: _static/MideindLogoVert100.png 7 | :align: left 8 | :alt: Miðeind ehf. 9 | 10 | GreynirEngine is *copyright © 2023 Miðeind ehf.*, Reykjavík, Iceland. 11 | 12 | The project's original author is *Vilhjálmur Þorsteinsson*. 13 | 14 | This software is licensed under the MIT License: 15 | 16 | *Permission is hereby granted, free of charge, to any person* 17 | *obtaining a copy of this software and associated documentation* 18 | *files (the "Software"), to deal in the Software without restriction,* 19 | *including without limitation the rights to use, copy, modify, merge,* 20 | *publish, distribute, sublicense, and/or sell copies of the Software,* 21 | *and to permit persons to whom the Software is furnished to do so,* 22 | *subject to the following conditions:* 23 | 24 | *The above copyright notice and this permission notice shall be* 25 | *included in all copies or substantial portions of the Software.* 26 | 27 | *THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,* 28 | *EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF* 29 | *MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.* 30 | *IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY* 31 | *CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,* 32 | *TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE* 33 | *SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.* 34 | 35 | .. note:: 36 | 37 | GreynirEngine indirectly embeds the `Database of Modern Icelandic Inflection `_ 38 | (`Beygingarlýsing íslensks nútímamáls `_), abbreviated BÍN. 39 | 40 | The BÍN source data are publicly available under the CC-BY-4.0 license, as further 41 | detailed `here in English `_ 42 | and `here in Icelandic `_. 43 | 44 | In accordance with the BÍN license terms, credit is hereby given as follows: 45 | 46 | *Beygingarlýsing íslensks nútímamáls. Stofnun Árna Magnússonar í íslenskum fræðum. Höfundur og ritstjóri Kristín Bjarnadóttir.* 47 | 48 | GreynirEngine accesses BÍN data through another package from the same authors 49 | called BinPackage, and further information is available in that package's 50 | `GitHub repository `_. 51 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. Greynir documentation master file, created by 2 | sphinx-quickstart on Sun Apr 8 01:20:08 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | Welcome to Greynir 8 | ================== 9 | 10 | *Til að gagnast sem flestum er skjölun Greynis á ensku. - In order to serve 11 | the widest possible audience, Greynir's documentation is in English.* 12 | 13 | Greynir is a Python >= 3.9 package for **working with Icelandic text**, 14 | including parsing it into **sentence trees**, finding **lemmas**, 15 | inflecting **noun phrases**, assigning **part-of-speech tags** and much more. 16 | 17 | Greynir's sentence trees can *inter alia* be used to extract information 18 | from text, for instance about people, titles, entities, facts, actions 19 | and opinions. 20 | 21 | .. figure:: _static/GreynirTreeExampleSmall2.png 22 | :align: center 23 | :alt: An example of a parse tree 24 | 25 | Greynir is the engine of `Embla `_ and `Greynir.is `_. 26 | 27 | Greynir has been used to parse text from Icelandic news websites since 2015, 28 | processing over 10 million sentences in over 515.000 articles. Its 29 | optimized C++ parsing core is fast and efficient enough to parse 30 | real-world text according to a 31 | `context-free grammar for the Icelandic 32 | language `_ 33 | with over 22,000 productions. 34 | 35 | To get acquainted with Greynir, we recommend that you start with 36 | the :ref:`overview`, proceed with the :ref:`installation` instructions, 37 | and then look at the :ref:`quickstart`. For further reference, consult 38 | the :ref:`reference` section. 39 | 40 | This documentation also contains :ref:`important information about copyright 41 | and licensing `. 42 | 43 | Batteries included 44 | ------------------ 45 | 46 | To start using Greynir with Python, you (usually) need 47 | :ref:`ony one command `:: 48 | 49 | $ pip install reynir 50 | 51 | **No database to set up, no further data to download.** 52 | 53 | Greynir indirectly embeds the `Database of Modern Icelandic 54 | Inflection `_ 55 | (`Beygingarlýsing íslensks nútímamáls `_), 56 | with over 6 million entries, in compressed form. 57 | By looking up word forms in this database and applying context-free 58 | grammar rules (productions) and scoring heuristics, Greynir is able to 59 | infer what the most likely lemmas are, how they are inflected in the 60 | parsed text, and where they fit in the overall sentence structure. 61 | 62 | Greynir is thoroughly documented, and its source code is of course 63 | `available on GitHub `_. 64 | 65 | Enabling your application 66 | ------------------------- 67 | 68 | Greynir can serve as an enabling component of applications such as: 69 | 70 | * Natural language query systems 71 | * Bots and conversational systems 72 | * Information extraction tools 73 | * Intelligent search tools 74 | * Grammatical pattern analyzers 75 | * Text similarity 76 | * Author identification 77 | * Sentiment analysis 78 | * Content summarization 79 | * Content category labeling 80 | * Part-of-speech (POS) taggers and lemmatizers 81 | * Generation of training corpora for machine learning 82 | 83 | About Greynir 84 | ------------- 85 | 86 | Greynir is a project and product of Miðeind ehf. of Reykjavík, Iceland. 87 | It is a free open source software project (:ref:`MIT license `), 88 | started in mid-2015 by its original author, Vilhjálmur Þorsteinsson. 89 | Its aim is to produce an **industrial-strength Natural Language** 90 | **Processing toolset for Icelandic**, with the hope of supporting the 91 | language on the digital front in times of rapid advances in language 92 | technology; changes that may leave low-resource languages at a 93 | disadvantage unless explicit action is taken to strengthen their position. 94 | 95 | Greynir and associated projects received grants from the Icelandic 96 | Language Technology Fund *(Máltæknisjóður)* in 2016 and 2017, and have 97 | been partially supported by the Icelandic Government's Language 98 | Technology Programme The LT Programme is managed by 99 | `Almannarómur `_ and funded by the Ministry 100 | of Education, Science and Culture. 101 | 102 | 103 | .. toctree:: 104 | :maxdepth: 1 105 | :hidden: 106 | 107 | overview 108 | installation 109 | quickstart 110 | reference 111 | patterns 112 | nonterminals 113 | terminals 114 | copyright 115 | 116 | -------------------------------------------------------------------------------- /doc/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | Prerequisites 7 | ------------- 8 | 9 | Greynir runs on **CPython 3.9** or newer, and on **PyPy 3.9** 10 | or newer (more info on PyPy `here `_). 11 | 12 | On GNU/Linux and similar systems, you may need to have ``python3-dev`` 13 | installed on your system: 14 | 15 | .. code-block:: bash 16 | 17 | # Debian or Ubuntu: 18 | $ sudo apt-get install python3-dev 19 | 20 | Depending on your system, you may also need to install ``libffi-dev``: 21 | 22 | .. code-block:: bash 23 | 24 | # Debian or Ubuntu: 25 | $ sudo apt-get install libffi-dev 26 | 27 | On Windows, you may need the latest 28 | `Visual Studio Build Tools `_, 29 | specifically the Visual C++ build tools, installed on your PC along 30 | with the Windows 10 SDK. 31 | 32 | 33 | Install with pip 34 | ---------------- 35 | 36 | To install Greynir: 37 | 38 | .. code-block:: bash 39 | 40 | $ pip install reynir 41 | 42 | ...or if you have both Python2 and Python3 available on your system: 43 | 44 | .. code-block:: bash 45 | 46 | $ pip3 install reynir 47 | 48 | ...or if you want to be able to edit Greynir's source code in-place, 49 | install ``git`` and do the following (note the final dot in the last line): 50 | 51 | .. code-block:: bash 52 | 53 | $ mkdir ~/github 54 | $ cd ~/github 55 | $ git clone https://github.com/mideind/GreynirEngine 56 | $ cd GreynirEngine 57 | $ git pull 58 | $ pip install -e . 59 | 60 | On most common Linux x86_64/amd64 systems, ``pip`` will download and 61 | install a binary wheel. On other systems, a source distribution will be 62 | downloaded and compiled to binary. This requires a standard, Python-supported 63 | C/C++ compiler to be present on the system. 64 | 65 | Greynir's binary wheels are in the ``manylinux2010`` format (or newer). 66 | This means that you will need version 19.0 or newer of ``pip`` to be able 67 | to install a Greynir wheel. Versions of Python from 3.7 onwards include a 68 | new-enough ``pip``. 69 | 70 | Pull requests are welcome in the project's 71 | `GitHub repository `_. 72 | 73 | 74 | Install into a virtualenv 75 | ------------------------- 76 | 77 | In many cases, you will want to maintain a separate Python environment for 78 | your project that uses Greynir. For this, you can use *virtualenv* 79 | (if you haven't already, install it with ``pip install virtualenv``): 80 | 81 | .. code-block:: bash 82 | 83 | $ virtualenv -p python3 venv 84 | 85 | # Enter the virtual environment 86 | $ source venv/bin/activate 87 | 88 | # Install Greynir into it 89 | $ pip install reynir 90 | 91 | $ python 92 | [ Use Python with Greynir ] 93 | 94 | # Leave the virtual environment 95 | $ deactivate 96 | 97 | On Windows: 98 | 99 | .. code-block:: batch 100 | 101 | C:\MyProject> virtualenv venv 102 | 103 | REM Enter the virtual environment 104 | C:\MyProject> venv/Scripts/activate 105 | 106 | REM Install Greynir into it 107 | (venv) C:\MyProject> pip install reynir 108 | 109 | (venv) C:\MyProject> python 110 | REM [ Use Python with Greynir ] 111 | 112 | REM Leave the virtual environment 113 | (venv) C:\MyProject> deactivate 114 | 115 | More information about *virtualenv* is `available 116 | here `_. 117 | -------------------------------------------------------------------------------- /doc/nonterminals.rst: -------------------------------------------------------------------------------- 1 | .. _nonterminals: 2 | 3 | Nonterminals 4 | ============ 5 | 6 | This section lists the nonterminals that can occur within simplified 7 | sentence trees, i.e. instances of the :py:class:`SimpleTree` class. 8 | The nonterminal name of a tree node can be read from the 9 | :py:attr:`SimpleTree.tag` property. 10 | 11 | Sentences and paragraphs 12 | ------------------------ 13 | 14 | *Setningar, málsgreinar og efnisgreinar* 15 | 16 | +--------------+----------------------------------------------------------+ 17 | | S0 | Root of tree | 18 | +--------------+----------------------------------------------------------+ 19 | | S-MAIN | Main clause (aðalsetning) | 20 | +--------------+----------------------------------------------------------+ 21 | | S-HEADING | Sentence-heading (fyrirsögn) | 22 | +--------------+----------------------------------------------------------+ 23 | | S-PREFIX | Prefix clause (*Með öðrum orðum:* Páll sá kött) | 24 | +--------------+----------------------------------------------------------+ 25 | | S-QUE | Question clause (spurnarsetning) | 26 | | | („*Hvaða stjaka viltu*?“) | 27 | +--------------+----------------------------------------------------------+ 28 | | CP-THT | Complement clause (skýringarsetning) | 29 | | | (Páll veit *að kötturinn kemur heim*) | 30 | +--------------+----------------------------------------------------------+ 31 | | CP-QUE | Question subclause (spurnaraukasetning) | 32 | | | (Páll spurði *hvaða stjaka hún vildi*) | 33 | +--------------+----------------------------------------------------------+ 34 | | CP-REL | Relative clause (tilvísunarsetning) | 35 | | | (Páll, *sem kom inn*, klappaði kettinum) | 36 | +--------------+----------------------------------------------------------+ 37 | | CP-ADV-TEMP | Adverbial temporal phrase (tíðarsetning) | 38 | | | (Páll fór út *á meðan kötturinn mjálmaði*) | 39 | +--------------+----------------------------------------------------------+ 40 | | CP-ADV-PURP | Adverbial purpose phrase (tilgangssetning) | 41 | | | (Fuglinn flaug *til þess að ná sér í mat*) | 42 | +--------------+----------------------------------------------------------+ 43 | | CP-ADV-ACK | Adverbial acknowledgement phrase (viðurkenningarsetning) | 44 | | | (Páll fór út, *þó að hann væri þreyttur*) | 45 | +--------------+----------------------------------------------------------+ 46 | | CP-ADV-CONS | Adverbial consequence phrase (afleiðingarsetning) | 47 | | | (Páll fór út, *þannig að hann er þreyttur*) | 48 | +--------------+----------------------------------------------------------+ 49 | | CP-ADV-CAUSE | Adverbial causal phrase (orsakarsetning) | 50 | | | (Páll fór út, *þar sem hann er þreyttur*) | 51 | +--------------+----------------------------------------------------------+ 52 | | CP-ADV-COND | Adverbial conditional phrase (skilyrðissetning) | 53 | | | (Páll færi út, *ef hann gæti*) | 54 | +--------------+----------------------------------------------------------+ 55 | | CP-ADV-CMP | Adverbial comparative phrase (samanburðarsetning) | 56 | +--------------+----------------------------------------------------------+ 57 | | CP-QUOTE | Direct quote (bein tilvitnun) | 58 | | | („*Þetta er fínt*,“ sagði Páll) | 59 | +--------------+----------------------------------------------------------+ 60 | 61 | 62 | Inflectional phrases 63 | -------------------- 64 | 65 | *Beygingarliðir* 66 | 67 | +------------+---------------------------------------------------+ 68 | | IP | Inflectional phrase (beygingarliður) | 69 | +------------+---------------------------------------------------+ 70 | | IP-INF | Infinitival inflectional phrase | 71 | +------------+---------------------------------------------------+ 72 | 73 | 74 | Noun phrases 75 | ------------ 76 | 77 | *Nafnliðir* 78 | 79 | +------------+---------------------------------------------------+ 80 | | NP | Noun phrase | 81 | +------------+---------------------------------------------------+ 82 | | NP-SUBJ | Noun phrase - subject (*Páll* sá sólina) | 83 | +------------+---------------------------------------------------+ 84 | | NP-OBJ | Noun phrase - direct object (Páll sá *sólina*) | 85 | +------------+---------------------------------------------------+ 86 | | NP-IOBJ | Noun phrase - indirect object | 87 | | | (Páll sýndi *barninu* bókina) | 88 | +------------+---------------------------------------------------+ 89 | | NP-PRD | Noun phrase - predicate (Páll er *formaður*) | 90 | +------------+---------------------------------------------------+ 91 | | NP-ADP | Noun phrase - adjectival object (líkur *Páli*) | 92 | +------------+---------------------------------------------------+ 93 | | NP-POSS | Noun phrase - possessive (köttur *Páls*) | 94 | +------------+---------------------------------------------------+ 95 | | NP-ADDR | Noun phrase - address (*Fiskislóð 31*) | 96 | +------------+---------------------------------------------------+ 97 | | NP-TITLE | Noun phrase - title (Páll Jónsson *ritari*) | 98 | +------------+---------------------------------------------------+ 99 | | NP-COMPANY | Noun phrase - company (*Samherji hf.*) | 100 | +------------+---------------------------------------------------+ 101 | | NP-MEASURE | Noun phrase - quantity | 102 | +------------+---------------------------------------------------+ 103 | | NP-AGE | Noun phrase - age | 104 | +------------+---------------------------------------------------+ 105 | 106 | 107 | Adjective phrases 108 | ----------------- 109 | 110 | *Lýsingarliðir* 111 | 112 | +------------+---------------------------------------------------+ 113 | | ADJP | Adjective phrase (Páll er *góður og gegn* maður) | 114 | +------------+---------------------------------------------------+ 115 | 116 | Verb phrases 117 | ------------ 118 | 119 | *Sagnliðir* 120 | 121 | +------------+---------------------------------------------------+ 122 | | VP | Verb phrase | 123 | +------------+---------------------------------------------------+ 124 | | VP-AUX | Auxiliary verb phrase (hjálparsögn) | 125 | | | (Páll *hefur* klappað kettinum) | 126 | +------------+---------------------------------------------------+ 127 | 128 | Prepositional phrases 129 | --------------------- 130 | 131 | *Forsetningarliðir* 132 | 133 | +------------+---------------------------------------------------+ 134 | | PP | Prepositional phrase | 135 | +------------+---------------------------------------------------+ 136 | 137 | Adverbial phrases 138 | ----------------- 139 | 140 | *Atviksliðir* 141 | 142 | +--------------------+-------------------------------------------+ 143 | | ADVP | Adverbial phrase | 144 | +--------------------+-------------------------------------------+ 145 | | ADVP-DIR | Directional adverbial phrase | 146 | +--------------------+-------------------------------------------+ 147 | | ADVP-DATE-ABS | Absolute date phrase | 148 | +--------------------+-------------------------------------------+ 149 | | ADVP-DATE-REL | Relative date phrase | 150 | +--------------------+-------------------------------------------+ 151 | | ADVP-TIMESTAMP-ABS | Absolute timestamp | 152 | +--------------------+-------------------------------------------+ 153 | | ADVP-TIMESTAMP-REL | Relative timestamp | 154 | +--------------------+-------------------------------------------+ 155 | | ADVP-TMP-SET | Temporal frequency phrase | 156 | +--------------------+-------------------------------------------+ 157 | | ADVP-DUR-ABS | Absolute duration | 158 | +--------------------+-------------------------------------------+ 159 | | ADVP-DUR-REL | Relative duration | 160 | +--------------------+-------------------------------------------+ 161 | | ADVP-DUR-TIME | Time period phrase | 162 | +--------------------+-------------------------------------------+ 163 | 164 | 165 | Other phrases 166 | ------------- 167 | 168 | *Aðrir liðir* 169 | 170 | +--------+---------------------------------------------------+ 171 | | P | Preposition | 172 | +--------+---------------------------------------------------+ 173 | | TO | Infinitive particle | 174 | +--------+---------------------------------------------------+ 175 | | C | Conjunction | 176 | +--------+---------------------------------------------------+ 177 | -------------------------------------------------------------------------------- /doc/overview.rst: -------------------------------------------------------------------------------- 1 | .. _overview: 2 | 3 | Overview 4 | ======== 5 | 6 | **Greynir** parses sentences of Icelandic text into **parse trees**. 7 | A parse tree recursively describes the grammatical structure 8 | of the sentence, including its noun phrases, verb phrases, 9 | prepositional phrases, etc. 10 | 11 | The individual tokens (words, numbers, punctuation, etc.) of the sentence 12 | correspond to leaves in the parse tree. 13 | 14 | .. figure:: _static/LitlaGula.png 15 | :align: center 16 | :alt: A parse tree 17 | 18 | *The parse tree for the sentence "Litla gula hænan fann fræ".* 19 | 20 | By examining and processing the parse tree, information and meaning 21 | can be extracted from the sentence. 22 | 23 | Example 24 | ------- 25 | 26 | Here is a short example of what can be done with Greynir:: 27 | 28 | >>> from reynir import Greynir 29 | >>> g = Greynir() 30 | >>> sent = g.parse_single("Ása sá sól.") 31 | >>> print(sent.tree.view) 32 | S0 # Root 33 | +-S-MAIN # Main sentence 34 | +-IP # Inflected phrase 35 | +-NP-SUBJ # Noun phrase, subject 36 | +-no_et_nf_kvk: 'Ása' # Noun, singular, nominative, feminine 37 | +-VP # Verb phrase containing arguments 38 | +-VP # Verb phrase containing verb 39 | +-so_1_þf_et_p3: 'sá' # Verb, 1 accusative arg, singular, 3rd p 40 | +-NP-OBJ # Noun phrase, object 41 | +-no_et_þf_kvk: 'sól' # Noun, singular, accusative, feminine 42 | +-'.' # Punctuation 43 | >>> sent.tree.nouns 44 | ['Ása', 'sól'] 45 | >>> sent.tree.verbs 46 | ['sjá'] 47 | >>> # Show the subject noun phrase 48 | >>> sent.tree.S.IP.NP_SUBJ.lemmas 49 | ['Ása'] 50 | >>> # Show the verb phrase 51 | >>> sent.tree.S.IP.VP.lemmas 52 | ['sjá', 'sól'] 53 | >>> # Show the object of the verb 54 | >>> sent.tree.S.IP.VP.NP_OBJ.lemmas 55 | ['sól'] 56 | 57 | Here, ``S`` stands for sentence *(málsgrein)*, ``IP`` for inflected 58 | phrase *(beygingarliður)*, ``VP`` is a verb phrase *(sagnliður)*, 59 | ``NP_SUBJ`` is a subject noun phrase *(frumlag)* and 60 | ``NP_OBJ`` is an object noun phrase *(andlag)*. 61 | Nonterminal names are listed in the :ref:`nonterminals` section. 62 | 63 | What Greynir does 64 | ----------------- 65 | 66 | Greynir starts by **tokenizing** your text, i.e. dividing it up into individual words, 67 | numbers, punctuation and other tokens. For this, it uses the separate 68 | `Tokenizer `_ package, by the 69 | same authors, which is automatically installed with Greynir. 70 | 71 | After tokenization, Greynir proceeds to **parse** the text according to a 72 | `context-free grammar `_ 73 | for the modern Icelandic language. This grammar contains rules describing 74 | how sentences and the various subparts thereof can be validly constructed. 75 | 76 | Almost all sentences are **ambiguous**. This means that there are multiple 77 | parse trees that can validly describe the sentence according to the grammar 78 | rules. Greynir thus has to choose a single best tree from the forest of 79 | possible trees. It does this with a scoring heuristic which assigns higher 80 | scores to common word forms and grammatical constructs, and lower scores to 81 | rare word forms and uncommon constructs. The parse tree with the highest 82 | overall score wins and is returned from the :py:meth:`Greynir.parse_single()` 83 | function. 84 | 85 | Once the best parse tree has been found, it is available for various kinds 86 | of **queries**. You can access word lemmas, extract noun and verb phrases 87 | as shown above, look for patterns via wildcard matching, and much more. 88 | This is described in detail in the :ref:`reference`. 89 | 90 | -------------------------------------------------------------------------------- /doc/patterns.rst: -------------------------------------------------------------------------------- 1 | .. _patterns: 2 | 3 | Patterns 4 | ======== 5 | 6 | This section describes grammatical matching patterns that can be used with the 7 | :py:meth:`SimpleTree.match()`, :py:meth:`SimpleTree.first_match()`, 8 | :py:meth:`SimpleTree.all_matches()` and :py:meth:`SimpleTree.top_matches()` 9 | methods. 10 | 11 | Overview 12 | -------- 13 | 14 | The above mentioned methods can be used to find trees and subtrees that match 15 | a specific grammatical pattern, within a sentence. The pattern can include 16 | conditions that apply to the root of each subtree as well as its children, 17 | direct or indirect. 18 | 19 | The patterns are given as strings, with pattern tokens separated by whitespace. 20 | :ref:`examples` are given below. 21 | 22 | See the documentation of each method for a further explanation of how the 23 | given pattern is matched in each case, and how results are returned. 24 | 25 | Simple matches 26 | -------------- 27 | 28 | * A ``"literal"`` within *double quotes* matches a subtree that covers exactly 29 | the given literal text, although using a case-neutral comparison. 30 | ``"Icelandic"`` thus matches ``icelandic`` and ``ICELANDIC``. 31 | The literal may have multiple words, separated by spaces: 32 | ``"borgarstjóri reykjavíkur"`` matches a subtree that covers these two 33 | word forms. The matched subtree can be a nonterminal or a terminal node. 34 | 35 | * A ``'literal'`` within *single quotes* matches a subtree that covers exactly 36 | the given word lemma(s), using a case-neutral comparison. 37 | ``'hestur'`` thus matches ``hests`` and ``Hestinum``. 38 | The literal may have multiple words, separated by spaces: 39 | ``'borgarstjóri reykjavík'`` matches a subtree that covers these 40 | two lemmas. (``'borgarstjóri reykjavíkur'`` would never match anything 41 | as ``reykjavíkur`` is not the lemma of any word form.) The matched subtree 42 | can be a nonterminal or a terminal node. 43 | 44 | * A ``@"literal"`` within *double quotes* and *prefixed with the @ symbol* matches 45 | a *terminal node* that corresponds to a token having 46 | the given literal text, although using a case-neutral comparison. 47 | ``@"Icelandic"`` thus matches ``icelandic`` and ``ICELANDIC``. 48 | 49 | * A ``@'literal'`` within *single quotes* and *prefixed with the @ symbol* matches 50 | a *terminal node* that corresponds to a token having the given word lemma, 51 | using a case-neutral comparison. ``@'hestur'`` thus matches ``hests`` 52 | and ``Hestinum``. 53 | 54 | * A ``NONTERMINAL`` identifier in upper case matches nodes associated with 55 | that nonterminal, as well as subcategories thereof. ``NP`` thus matches 56 | ``NP`` as well as ``NP-OBJ`` and ``NP-SUBJ``. ``NP-OBJ`` only matches 57 | ``NP-OBJ`` and subcategories thereof. 58 | 59 | * A ``terminal`` identifier in lower case matches nodes associated with 60 | the specified category of terminal, and having at least the variants given, 61 | if any. ``no`` thus matches all noun terminals, while ``no_nf_et`` 62 | only matches noun terminals in nominative case, singular (but any 63 | gender, since a gender variant is not specified). ``p`` matches a 64 | punctuation terminal. 65 | 66 | Wildcard match 67 | -------------- 68 | 69 | * A dot ``.`` matches any single tree node, which can be a terminal or nonterminal. 70 | 71 | OR match 72 | -------- 73 | 74 | * ``(Any1 | Any2 | ...)`` matches if anything between the parentheses matches. 75 | The options are separated by vertical bars ``|``. 76 | 77 | Sequence matches 78 | ---------------- 79 | 80 | * ``Any1 Any2 Any3`` matches the given sequence of matches if each 81 | element matches in exactly the given order. The match must be exhaustive, 82 | i.e. no child nodes may be left unmatched at the end of the list. 83 | 84 | * ``Any+`` matches one or more sequential instances of the given ``Any`` match. 85 | 86 | * ``Any*`` matches zero or more sequential instances of the 87 | given ``Any`` match. 88 | 89 | * ``Any?`` matches zero or one instances of the given ``Any`` match. 90 | 91 | * ``.*`` thus matches any number of any nodes and is an often-used construct. 92 | 93 | * ``[ Any1 Any2 ]`` matches any node sequence that starts with the two given 94 | matches. It does not matter whether the sequence contains more nodes. 95 | 96 | * ``[ Any1 Any2 $ ]`` matches any node sequence where ``Any1`` 97 | and ``Any2`` match and there are no further nodes in the sequence. 98 | The ``$`` sign is an end-of-sequence marker. 99 | 100 | * ``[ Any1 .* Any2 $ ]`` matches only sequences that start with ``Any1`` and 101 | end with ``Any2``. 102 | 103 | Hierarchical matches 104 | -------------------- 105 | 106 | * ``Any1 > { Any2 Any3 ... }`` matches if ``Any1`` matches and has *immediate* 107 | (direct) children that include ``Any2``, ``Any3`` *and* other given arguments 108 | (irrespective of order). This is a *set-like* operator. 109 | 110 | * ``Any1 >> { Any2 Any3 ... }`` matches if ``Any1`` matches and has children 111 | *at any sublevel*, that include ``Any2``, ``Any3`` *and* other given arguments 112 | (irrespective of order). However, subtrees of ``IP`` nonterminals are skipped, 113 | so nested inflectional phrases are excluded from the search. 114 | This is a *set-like* operator. 115 | 116 | * ``Any1 >>> { Any2 Any3 ... }`` matches if ``Any1`` matches and has children 117 | *at any sublevel*, that include ``Any2``, ``Any3`` *and* other given arguments 118 | (irrespective of order). Unlike the ``>>`` operator, subtrees of ``IP`` are 119 | included in the search. This is a *set-like* operator. 120 | 121 | * ``Any1 > [ Any2 Any3 ... ]`` matches if ``Any1`` matches and has immediate 122 | children that include ``Any2``, ``Any3`` *and* other given arguments 123 | *in the order specified*. This is a *list-like* operator. 124 | 125 | .. _examples: 126 | 127 | Examples 128 | -------- 129 | 130 | This pattern will match the root subtree of any sentence that has a verb phrase 131 | that refers to a person as an argument:: 132 | 133 | "S >> { VP >> { NP-OBJ >> person }}" 134 | 135 | This pattern will match any sentence that has a verb phrase that refers to 136 | a male person as an argument:: 137 | 138 | "S >> { VP >> { NP-OBJ >> person_kk }}" 139 | 140 | Here is a short program using some of the matching features:: 141 | 142 | from reynir import Greynir 143 | g = Greynir() 144 | my_text = ("Reynt er að efla áhuga ungs fólks á borgarstjórnarmálum " 145 | "með framboðsfundum og skuggakosningum en þótt kjörstaðirnir " 146 | "í þeim séu færðir inn í framhaldsskólana er þátttakan lítil.") 147 | s = g.parse_single(my_text) 148 | print("Parse tree:") 149 | print(s.tree.view) 150 | print("\nAll subjects:\n") 151 | for d in s.tree.descendants: 152 | if d.match_tag("NP-SUBJ"): 153 | print(d.text) 154 | print("\nAll masculine noun and pronoun phrases:\n") 155 | for m in s.tree.all_matches("NP > { (no_kk | pfn_kk) } "): 156 | print(m.canonical_np) 157 | 158 | Output: 159 | 160 | .. code-block:: none 161 | 162 | Parse tree: 163 | S0 164 | +-S-MAIN 165 | +-IP 166 | +-VP 167 | +-VP 168 | +-so_sagnb: 'Reynt' 169 | +-VP 170 | +-so_et_p3: 'er' 171 | +-IP-INF 172 | +-TO 173 | +-nhm: 'að' 174 | +-VP 175 | +-VP 176 | +-so_1_þf_nh: 'efla' 177 | +-NP-OBJ 178 | +-no_et_þf_kk: 'áhuga' 179 | +-NP-POSS 180 | +-lo_ef_et_hk: 'ungs' 181 | +-no_et_ef_hk: 'fólks' 182 | +-PP 183 | +-P 184 | +-fs_þgf: 'á' 185 | +-NP 186 | +-no_ft_þgf_hk: 'borgarstjórnarmálum' 187 | +-PP 188 | +-P 189 | +-fs_þgf: 'með' 190 | +-NP 191 | +-no_ft_þgf_kk: 'framboðsfundum' 192 | +-C 193 | +-st: 'og' 194 | +-no_ft_þgf_kvk: 'skuggakosningum' 195 | +-C 196 | +-st: 'en' 197 | +-S-MAIN 198 | +-CP-ADV-ACK 199 | +-C 200 | +-st: 'þótt' 201 | +-IP 202 | +-NP-SUBJ 203 | +-no_ft_nf_kk: 'kjörstaðirnir' 204 | +-PP 205 | +-P 206 | +-fs_þgf: 'í' 207 | +-NP 208 | +-pfn_kvk_ft_þgf: 'þeim' 209 | +-VP 210 | +-VP 211 | +-so_ft_p3: 'séu' 212 | +-NP-PRD 213 | +-NP-PRD 214 | +-VP 215 | +-so_lhþt_sb_nf_ft_kk: 'færðir' 216 | +-PP 217 | +-ADVP-DIR 218 | +-ao: 'inn' 219 | +-P 220 | +-fs_þf: 'í' 221 | +-NP 222 | +-no_ft_þf_kk: 'framhaldsskólana' 223 | +-IP 224 | +-VP 225 | +-VP 226 | +-so_et_p3: 'er' 227 | +-NP-SUBJ 228 | +-no_et_nf_kvk: 'þátttakan' 229 | +-NP-PRD 230 | +-lo_sb_nf_et_kvk: 'lítil' 231 | +-'.' 232 | 233 | All subjects: 234 | 235 | kjörstaðirnir í þeim 236 | þátttakan 237 | 238 | All masculine noun and pronoun phrases: 239 | 240 | áhugi 241 | framboðsfundur og skuggakosning 242 | kjörstaður 243 | framhaldsskóli 244 | 245 | -------------------------------------------------------------------------------- /doc/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========== 5 | 6 | After :ref:`installing Greynir `, fire up your 7 | Python 3 interpreter:: 8 | 9 | $ python3 10 | 11 | ...and try something like the following:: 12 | 13 | from reynir import NounPhrase as Nl 14 | 15 | # Create a NounPhrase ('nafnliður') object 16 | nl = Nl("þrír lúxus-miðar á Star Wars og tveir brimsaltir pokar af poppi") 17 | 18 | # Print the NounPhrase in the correct case for each context 19 | # (þf=þolfall/accusative, þgf=þágufall/dative) 20 | 21 | print("Þú keyptir {nl:þf}.".format(nl=nl)) 22 | print("Hér er kvittunin þín fyrir {nl:þgf}.".format(nl=nl)) 23 | 24 | The program outputs the following text, correctly inflected:: 25 | 26 | Þú keyptir þrjá lúxus-miða á Star Wars og tvo brimsalta poka af poppi. 27 | Hér er kvittunin þín fyrir þremur lúxus-miðum á Star Wars og tveimur brimsöltum pokum af poppi. 28 | 29 | Use the :py:class:`NounPhrase` class to easily inflect Icelandic noun phrases 30 | and to convert them between cases, for instance in user interfaces, in chatbot 31 | conversations and in printouts. 32 | 33 | A more detailed, lower-level example is as follows:: 34 | 35 | from reynir import Greynir 36 | 37 | my_text = "Litla gula hænan fann fræ. Það var hveitifræ." 38 | 39 | # Initialize the Greynir parser and submit the text as a parse job 40 | g = Greynir() 41 | job = g.submit(my_text) 42 | 43 | # Iterate through sentences and parse each one 44 | for sent in job: 45 | sent.parse() 46 | print("Sentence: {0}".format(sent.tidy_text)) 47 | print("Lemmas: {0}".format(sent.lemmas)) 48 | print("Parse tree:\n{0}\n".format(sent.tree.view)) 49 | 50 | The output of the program is:: 51 | 52 | Sentence: Litla gula hænan fann fræ. 53 | Lemmas: ['lítill', 'gulur', 'hæna', 'finna', 'fræ', '.'] 54 | Parse tree: 55 | S0 56 | +-S-MAIN 57 | +-IP 58 | +-NP-SUBJ 59 | +-lo_nf_et_kvk: 'Litla' 60 | +-lo_nf_et_kvk: 'gula' 61 | +-no_et_nf_kvk: 'hænan' 62 | +-VP 63 | +-VP 64 | +-so_1_þf_et_p3: 'fann' 65 | +-NP-OBJ 66 | +-no_et_þf_hk: 'fræ' 67 | +-'.' 68 | Sentence: Það var hveitifræ. 69 | Lemmas: ['það', 'vera', 'hveitifræ', '.'] 70 | Parse tree: 71 | S0 72 | +-S-MAIN 73 | +-IP 74 | +-NP-SUBJ 75 | +-pfn_hk_et_nf: 'Það' 76 | +-VP 77 | +-VP 78 | +-so_1_nf_et_p3: 'var' 79 | +-NP-PRD 80 | +-no_et_nf_hk: 'hveitifræ' 81 | +-'.' 82 | 83 | The code first creates an instance of the :py:class:`Greynir` class 84 | and assigns it to the ``g`` object. The :py:class:`Greynir` class is 85 | Greynir's main service interface. 86 | 87 | Next, the program submits a piece of text containing two sentences to 88 | Greynir, which returns a job object. Each job object encapsulates a 89 | stream of sentences that will be, or have been, processed through 90 | Greynir's tokenizer and parser. 91 | 92 | A job object is a Python generator, and the ``for`` loop iterates through 93 | the job's sentence stream, returning each sentence in turn in the ``sent`` 94 | object. 95 | 96 | The ``for`` loop body parses the sentence by calling ``sent.parse()``. 97 | This function returns ``True`` if the sentence was successfully parsed, i.e. 98 | at least one valid parse tree was found for it, or ``False`` otherwise. 99 | 100 | The sentence object has a number of properties, including ``sent.tidy_text`` 101 | which returns a normalized form of the tokenized sentence. 102 | 103 | If the sentence was successfully parsed, the ``sent.tree`` property 104 | (of type :py:class:`SimpleTree`) 105 | contains its best parse tree. This tree can be further queried via 106 | properties such as ``sent.lemmas``, which returns a list of the 107 | word lemmas in the sentence, and ``sent.tree.view``, which 108 | returns a string with an "ASCII art" representation of the parse tree. 109 | 110 | The parse tree contains grammar **nonterminals** in uppercase, such 111 | as ``S0`` (root), ``S-MAIN`` (main sentence), ``IP`` (inflected 112 | phrase - *beygingarliður*), ``NP-SUBJ`` (noun phrase - subject, 113 | *frumlag*), ``VP`` (verb phrase - *sagnliður*), etc. 114 | 115 | Nonterminals are listed and explained in the :ref:`nonterminals` section. 116 | 117 | The tree also shows grammar **terminals** (leaves, corresponding to 118 | tokens) in lowercase, as well as their :ref:`grammatical variants ` 119 | (features). Examples are ``pfn_hk_et_nf`` (personal pronoun, 120 | neutral gender, singular, nominative case), and ``so_1_nf_et_p3`` 121 | (verb, one argument in nominative case, singular, 3rd person). 122 | 123 | Terminals and variants are listed and explained in the :ref:`terminals` 124 | section. 125 | 126 | The sentence and tree properties and functions are further 127 | detailed and described in the :ref:`reference` section. 128 | 129 | -------------------------------------------------------------------------------- /doc/terminals.rst: -------------------------------------------------------------------------------- 1 | .. _terminals: 2 | 3 | Terminals 4 | ========= 5 | 6 | This section lists the terminals that can occur within simplified 7 | sentence trees, i.e. instances of the :py:class:`SimpleTree` class. The 8 | terminal associated with a tree node is available in the 9 | :py:attr:`SimpleTree.terminal` property. 10 | 11 | A terminal node always corresponds to a single token from the input text. 12 | 13 | A typical terminal string looks like this (for instance matching 14 | the word *hestur*):: 15 | 16 | 'no_kk_nf_et' # Noun, masculine, nominative case, singular 17 | 18 | The terminal category, i.e. the first part of the terminal name (``no`` in the 19 | example), is available 20 | in the :py:attr:`SimpleTree.tcat` property. The grammatical variants of the 21 | terminal are stored in the list :py:attr:`SimpleTree.variants`, 22 | which is ``[ 'kk', 'nf', 'et' ]`` in the example. 23 | 24 | To obtain the entire set of variants (features) associated with a word form, 25 | use the property :py:attr:`SimpleTree.all_variants`. 26 | 27 | The terminal categories and grammatical variants are listed below. 28 | 29 | .. _categories: 30 | 31 | Word categories 32 | --------------- 33 | 34 | +------------+---------------------------------------------------+ 35 | | no | Noun (nafnorð) | 36 | +------------+---------------------------------------------------+ 37 | | so | Verb (sagnorð) | 38 | +------------+---------------------------------------------------+ 39 | | lo | Adjective (lýsingarorð) | 40 | +------------+---------------------------------------------------+ 41 | | fs | Preposition (forsetning) | 42 | +------------+---------------------------------------------------+ 43 | | nhm | Verb infinitive indicator (nafnháttarmerki, *að*) | 44 | +------------+---------------------------------------------------+ 45 | | gr | Definite article (laus greinir, *hinn/hin/hið*) | 46 | +------------+---------------------------------------------------+ 47 | | uh | Exclamation (upphrópun) | 48 | +------------+---------------------------------------------------+ 49 | | ao | Adverb (atviksorð) | 50 | +------------+---------------------------------------------------+ 51 | | eo | Qualifying adverb (atviksorð sem stendur með | 52 | | | nafnorði í einkunn) | 53 | +------------+---------------------------------------------------+ 54 | | st | Conjunction (samtenging) | 55 | +------------+---------------------------------------------------+ 56 | | stt | Connective conjunction (sem/er-samtenging) | 57 | +------------+---------------------------------------------------+ 58 | | fn | Pronoun (fornafn) | 59 | +------------+---------------------------------------------------+ 60 | | pfn | Personal pronoun (persónufornafn) | 61 | +------------+---------------------------------------------------+ 62 | | abfn | Reflexive pronoun (afturbeygt fornafn) | 63 | +------------+---------------------------------------------------+ 64 | | person | Person name (mannsnafn) | 65 | +------------+---------------------------------------------------+ 66 | | sérnafn | Proper name (sérnafn) | 67 | +------------+---------------------------------------------------+ 68 | | entity | Proper name of recognized named entity | 69 | +------------+---------------------------------------------------+ 70 | | fyrirtæki | Company name (fyrirtækisnafn) | 71 | +------------+---------------------------------------------------+ 72 | | gata | Street name (götuheiti) | 73 | +------------+---------------------------------------------------+ 74 | | to | Number word, inflectable (beygjanlegt töluorð) | 75 | | | Only *núll, einn, tveir, þrír, fjórir* | 76 | +------------+---------------------------------------------------+ 77 | | töl | Number word, uninflectable (óbeygjanlegt töluorð) | 78 | +------------+---------------------------------------------------+ 79 | 80 | Number categories 81 | ----------------- 82 | 83 | +----------------+---------------------------------------------------+ 84 | | tala | Number | 85 | +----------------+---------------------------------------------------+ 86 | | prósenta | Percentage | 87 | +----------------+---------------------------------------------------+ 88 | | ártal | Year | 89 | +----------------+---------------------------------------------------+ 90 | | raðnr | Ordinal number | 91 | +----------------+---------------------------------------------------+ 92 | | talameðbókstaf | Number followed by letter: *15B* | 93 | +----------------+---------------------------------------------------+ 94 | | sequence | Sequence: *1, 2, 3..., a, b, c..., i, ii, iii...* | 95 | +----------------+---------------------------------------------------+ 96 | 97 | Date and time categories 98 | ------------------------ 99 | 100 | +------------+---------------------------------------------------+ 101 | | dagsföst | Absolute date (year, month, day) | 102 | +------------+---------------------------------------------------+ 103 | | dagsafs | Relative date | 104 | | | (year, month, day - at least one value missing) | 105 | +------------+---------------------------------------------------+ 106 | | tími | Time (hour, minute, second) | 107 | +------------+---------------------------------------------------+ 108 | | tímapunktur| Time point | 109 | | | (year, month, day, hour, minute, second) | 110 | +------------+---------------------------------------------------+ 111 | 112 | Other 113 | ----------- 114 | +---------------+------------------------------------------------+ 115 | | lén | *greynir.is* | 116 | +---------------+------------------------------------------------+ 117 | | myllumerki | *#lífiðeryndislegt* | 118 | +---------------+------------------------------------------------+ 119 | | tölvupóstfang | *gervi@greynir.is* | 120 | +---------------+------------------------------------------------+ 121 | 122 | 123 | 124 | Punctuation 125 | ----------- 126 | 127 | +------------+---------------------------------------------------+ 128 | | grm | Punctuation | 129 | +------------+---------------------------------------------------+ 130 | 131 | 132 | 133 | .. _variants: 134 | 135 | Variants 136 | ======== 137 | 138 | This section lists grammatical variants (features) that are 139 | included as parts of terminal names, separated by underscores (``_``). 140 | 141 | Gender 142 | ------ 143 | 144 | +------------+---------------------------------------------------+ 145 | | kk | Masculine (karlkyn) | 146 | +------------+---------------------------------------------------+ 147 | | kvk | Feminine (kvenkyn) | 148 | +------------+---------------------------------------------------+ 149 | | hk | Neutral (hvorugkyn) | 150 | +------------+---------------------------------------------------+ 151 | 152 | Number 153 | ------ 154 | 155 | +------------+---------------------------------------------------+ 156 | | et | Singular (eintala) | 157 | +------------+---------------------------------------------------+ 158 | | ft | Plural (fleirtala) | 159 | +------------+---------------------------------------------------+ 160 | 161 | Case 162 | ---- 163 | 164 | The *case* variants may occur with nouns, pronouns, adjectives, prepositions 165 | and verbs (``lhþt`` and ``subj``). In the case of prepositions, the 166 | variant indicates which case the preposition controls. 167 | 168 | +------------+---------------------------------------------------+ 169 | | nf | Nominative (nefnifall) | 170 | +------------+---------------------------------------------------+ 171 | | þf | Accusative (þolfall) | 172 | +------------+---------------------------------------------------+ 173 | | þgf | Dative (þágufall) | 174 | +------------+---------------------------------------------------+ 175 | | ef | Genitive (eignarfall) | 176 | +------------+---------------------------------------------------+ 177 | 178 | Arguments 179 | --------- 180 | 181 | Verb terminals, other than ``lhþt`` and ``subj``, indicate the number 182 | and cases of the verb's arguments as follows:: 183 | 184 | 'so_0_et_p3_gm' # No argument, singular/3rd person/active voice 185 | 'so_1_þf_et_p3_gm' # Same, but with one argument in accusative case 186 | 'so_2_þgf_þf_et_p3_gm' # Two arguments, dative and accusative 187 | 188 | An example of a verb that matches the last terminal would be 189 | *skrifaði* (wrote) in the sentence *"Hann skrifaði konunni bréf"* 190 | ("He wrote a letter to the woman"). 191 | 192 | +------------+---------------------------------------------------+ 193 | | 0 | No argument | 194 | +------------+---------------------------------------------------+ 195 | | 1 | One argument, whose case is in the following | 196 | | | variant | 197 | +------------+---------------------------------------------------+ 198 | | 2 | Two arguments, whose cases are in the following | 199 | | | two variants | 200 | +------------+---------------------------------------------------+ 201 | 202 | Person 203 | ------ 204 | 205 | Occurs with verbs (``so`` terminal category) only. 206 | 207 | +------------+---------------------------------------------------+ 208 | | p1 | First person *(Ég er / Við erum)* | 209 | +------------+---------------------------------------------------+ 210 | | p2 | Second person *(Þú ert / Þið eruð)* | 211 | +------------+---------------------------------------------------+ 212 | | p3 | Third person *(Það er / Þau eru)* | 213 | +------------+---------------------------------------------------+ 214 | 215 | Degree 216 | ------ 217 | 218 | Occurs with adjectives (``lo`` terminal category), and in the 219 | case of ``mst`` with certain adverbs (``ao`` terminal category). 220 | 221 | +------------+---------------------------------------------------+ 222 | | mst | Comparative *(stærri)* | 223 | +------------+---------------------------------------------------+ 224 | | esb | Superlative, indefinite *(maðurinn er stærstur)* | 225 | +------------+---------------------------------------------------+ 226 | | evb | Superlative, definite *(stærsti maðurinn)* | 227 | +------------+---------------------------------------------------+ 228 | 229 | Adjective object case 230 | --------------------- 231 | 232 | Occurs with adjectives (``lo`` terminal category) only. 233 | 234 | +------------+---------------------------------------------------+ 235 | | sþf | Accusative (viðstaddur *hátíðina*) | 236 | +------------+---------------------------------------------------+ 237 | | sþgf | Dative (líkur *Páli*) | 238 | +------------+---------------------------------------------------+ 239 | | sef | Genitive (fullur *orku*) | 240 | +------------+---------------------------------------------------+ 241 | 242 | Verb forms 243 | ---------- 244 | 245 | These variants occur with verbs (``so`` terminal category) only. 246 | 247 | +------------+---------------------------------------------------------+ 248 | | gm | Active voice (germynd) | 249 | +------------+---------------------------------------------------------+ 250 | | mm | Middle voice (miðmynd) | 251 | +------------+---------------------------------------------------------+ 252 | | nh | Infinitive (nafnháttur) | 253 | +------------+---------------------------------------------------------+ 254 | | fh | Indicative (framsöguháttur) | 255 | +------------+---------------------------------------------------------+ 256 | | bh | Imperative (boðháttur) | 257 | +------------+---------------------------------------------------------+ 258 | | vh | Subjunctive (viðtengingarháttur) | 259 | +------------+---------------------------------------------------------+ 260 | | nt | Present tense (nútíð) | 261 | +------------+---------------------------------------------------------+ 262 | | þt | Past tense (þátíð) | 263 | +------------+---------------------------------------------------------+ 264 | | lh | | Present participle (lýsingarháttur nútíðar) | 265 | | | | (note that the ``nt`` variant will also be present) | 266 | +------------+---------------------------------------------------------+ 267 | | lhþt | | Past participle (lýsingarþáttur þátíðar) | 268 | | | | (note that the ``þt`` variant will NOT be present) | 269 | +------------+---------------------------------------------------------+ 270 | | sagnb | Supine (sagnbót) | 271 | +------------+---------------------------------------------------------+ 272 | | sb | Indefinite (sterk beyging), | 273 | | | only occurs with ``lhþt`` | 274 | +------------+---------------------------------------------------------+ 275 | | vb | Definite (veik beyging), | 276 | | | only occurs with ``lhþt`` | 277 | +------------+---------------------------------------------------------+ 278 | | op | Impersonal verb (ópersónuleg sögn) | 279 | +------------+---------------------------------------------------------+ 280 | | subj | Verb that requires the subject's case to be | 281 | | | non-nominative (sögn sem krefst frumlags í | 282 | | | aukafalli) | 283 | +------------+---------------------------------------------------------+ 284 | | expl | Expletive (leppur), matches verb forms that can be used | 285 | | | with an expletive (*það rignir*) | 286 | +------------+---------------------------------------------------------+ 287 | 288 | Noun qualifiers 289 | --------------- 290 | 291 | These variants occur with noun terminals (``no`` category) only. 292 | 293 | +------------+---------------------------------------------------+ 294 | | gr | Definite, attached to noun (viðskeyttur greinir | 295 | | | með nafnorði) | 296 | +------------+---------------------------------------------------+ 297 | | abbrev | Abbreviation (skammstöfun) | 298 | +------------+---------------------------------------------------+ 299 | 300 | Word or lemma endings 301 | --------------------- 302 | 303 | These variants can be used to constrain matching to word forms or lemmas 304 | with particular endings only. They are used to detect certain forms of 305 | grammatical errors. 306 | 307 | +------------+---------------------------------------------------+ 308 | | xir | Matches only words with lemmas that end with | 309 | | | *ir* (e.g., *læknir*, *kælir*) | 310 | +------------+---------------------------------------------------+ 311 | | zana | Matches only word forms that end with | 312 | | | *ana* (e.g., *flokkana*, *bílana*) | 313 | +------------+---------------------------------------------------+ 314 | 315 | -------------------------------------------------------------------------------- /old/build_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Build the reynir wheels on the CentOS5/6 base manylinux1/manylinux2010 platform 3 | # This script should be executed inside the Docker container! 4 | # It is invoked indirectly from wheels.sh 5 | 6 | # Stop execution upon error; show executed commands 7 | set -e -x 8 | 9 | # Create wheels for Python 3.7 10 | for PYBIN in cp37; do 11 | "/opt/python/${PYBIN}-${PYBIN}m/bin/pip" wheel /io/ -w wheelhouse/ 12 | done 13 | # Create wheels for Python >= 3.8 14 | for PYBIN in cp38 cp39; do 15 | "/opt/python/${PYBIN}-${PYBIN}/bin/pip" wheel /io/ -w wheelhouse/ 16 | done 17 | # Create wheels for PyPy3 (>=3.7) 18 | for PYBIN in /opt/pypy/pypy3.*/bin; do 19 | "${PYBIN}/pip" wheel /io/ -w wheelhouse/ 20 | done 21 | 22 | # Bundle external shared libraries into the wheels 23 | for whl in wheelhouse/reynir-*.whl; do 24 | auditwheel repair "$whl" --plat $PLAT -w /io/wheelhouse/ 25 | done 26 | 27 | # Set read/write permissions on the wheels 28 | chmod 666 /io/wheelhouse/* 29 | -------------------------------------------------------------------------------- /old/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build a GreynirEngine release and upload it to PyPi 3 | if [ "$1" = "" ]; then 4 | echo "Version name argument missing" 5 | exit 1 6 | fi 7 | echo "Upload a new GreynirEngine version:" "$1" 8 | # Fix permission bits 9 | chmod -x src/reynir/*.py 10 | chmod -x src/reynir/*.cpp 11 | chmod -x src/reynir/*.grammar 12 | chmod -x src/reynir/config/* 13 | chmod -x src/reynir/resources/* 14 | # Remove binary grammar files as they may be out of date 15 | rm src/reynir/Greynir.*.bin 16 | # Create the base source distribution 17 | rm -rf build/* 18 | python3 setup.py sdist 19 | # Create the binary wheels 20 | source wheels.sh 21 | # Upload the new release 22 | twine upload dist/reynir-$1* 23 | echo "Upload of" "$1" "done" 24 | -------------------------------------------------------------------------------- /old/wheels.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "Building manylinux2010 wheels..." 3 | # Build manylinux2010 versions via a Docker CentOS6 image 4 | # See https://github.com/pypa/python-manylinux-demo/blob/master/.travis.yml 5 | # and https://github.com/pypy/manylinux 6 | mkdir -p /tmp/io 7 | chmod 777 /tmp/io 8 | chgrp docker /tmp/io 9 | rm -rf /tmp/io/* 10 | mkdir -p /tmp/io/src 11 | mkdir -p /tmp/io/test 12 | mkdir -p /tmp/io/wheelhouse 13 | chmod 777 /tmp/io/wheelhouse 14 | chgrp docker /tmp/io/wheelhouse 15 | # Fresh copy everything to the /tmp/io temporary subdirectory, 16 | # expanding symlinks 17 | cp -L ./* /tmp/io 18 | cp -L -r ./src/* /tmp/io/src 19 | cp -L -r ./test/* /tmp/io/test 20 | # Pull the latest pypywheels/manylinux2010 Docker image 21 | docker pull pypywheels/manylinux2010-pypy_x86_64 22 | # Run the Docker image 23 | docker run --rm -e PLAT=manylinux2010_x86_64 -it -v /tmp/io:/io pypywheels/manylinux2010-pypy_x86_64 bash /io/build_wheels.sh 24 | # Copy the finished wheels 25 | mkdir -p ./dist 26 | mv /tmp/io/wheelhouse/reynir* ./dist 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Greynir: Natural language processing for Icelandic 4 | 5 | Setup.py 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | Original Author: Vilhjálmur Þorsteinsson 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | 32 | This module sets up the Greynir package. It uses the cffi_modules 33 | parameter, available in recent versions of setuptools, to 34 | automatically compile the eparser.cpp module to eparser.*.so/.pyd 35 | and build the required CFFI Python wrapper via eparser_build.py. 36 | The same applies to bin.cpp -> bin.*.so and bin_build.py. 37 | 38 | Note that installing under PyPy >= 3.9 is supported (and recommended 39 | for best performance). 40 | 41 | """ 42 | 43 | from glob import glob 44 | from os.path import basename, splitext 45 | 46 | from setuptools import find_packages 47 | from setuptools import setup # type: ignore 48 | 49 | 50 | with open("README.md", "r", encoding="utf-8") as fh: 51 | long_description = fh.read() 52 | 53 | setup( 54 | name="reynir", 55 | version="3.5.7", 56 | license="MIT", 57 | description="A natural language parser for Icelandic", 58 | long_description=long_description, 59 | long_description_content_type="text/markdown", 60 | author="Miðeind ehf", 61 | author_email="mideind@mideind.is", 62 | url="https://github.com/mideind/GreynirEngine", 63 | packages=find_packages("src"), 64 | package_dir={"": "src"}, 65 | py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], 66 | package_data={"reynir": ["py.typed"]}, 67 | include_package_data=True, 68 | zip_safe=True, 69 | classifiers=[ 70 | # complete classifier list: http://pypi.python.org/pypi?%3Aaction=list_classifiers 71 | "Development Status :: 5 - Production/Stable", 72 | "Intended Audience :: Developers", 73 | "Intended Audience :: Science/Research", 74 | "License :: OSI Approved :: MIT License", 75 | "Operating System :: Unix", 76 | "Operating System :: POSIX", 77 | "Operating System :: Microsoft :: Windows", 78 | "Operating System :: MacOS", 79 | "Natural Language :: Icelandic", 80 | "Programming Language :: Python", 81 | "Programming Language :: Python :: 3", 82 | "Programming Language :: Python :: 3.9", 83 | "Programming Language :: Python :: 3.10", 84 | "Programming Language :: Python :: 3.11", 85 | "Programming Language :: Python :: 3.12", 86 | "Programming Language :: Python :: 3.13", 87 | "Programming Language :: Python :: Implementation :: CPython", 88 | "Programming Language :: Python :: Implementation :: PyPy", 89 | "Topic :: Software Development :: Libraries :: Python Modules", 90 | "Topic :: Utilities", 91 | "Topic :: Text Processing :: Linguistic", 92 | ], 93 | keywords=["nlp", "parser", "icelandic"], 94 | # Note: cffi 1.15.1 is the version built into PyPy 3.9. 95 | # Do not specify a higher version as that would prevent installation on PyPy 3.9, 96 | # unless you know what you're doing. 97 | setup_requires=["cffi>=1.15.1"], 98 | install_requires=[ 99 | "cffi>=1.15.1", 100 | "tokenizer>=3.4.5", 101 | "islenska>=1.0.3", 102 | "typing_extensions", 103 | ], 104 | cffi_modules=["src/reynir/eparser_build.py:ffibuilder"], 105 | ) 106 | -------------------------------------------------------------------------------- /src/reynir/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | Copyright © 2023 Miðeind ehf. 6 | Original author: Vilhjálmur Þorsteinsson 7 | 8 | This software is licensed under the MIT License: 9 | 10 | Permission is hereby granted, free of charge, to any person 11 | obtaining a copy of this software and associated documentation 12 | files (the "Software"), to deal in the Software without restriction, 13 | including without limitation the rights to use, copy, modify, merge, 14 | publish, distribute, sublicense, and/or sell copies of the Software, 15 | and to permit persons to whom the Software is furnished to do so, 16 | subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be 19 | included in all copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28 | 29 | This module exposes the Greynir API, i.e. the identifiers that are 30 | directly accessible via the reynir module object after importing it. 31 | 32 | """ 33 | 34 | # Expose the Greynir API 35 | 36 | import importlib.metadata 37 | 38 | from .reynir import ( 39 | Greynir, 40 | Reynir, 41 | Terminal, 42 | LemmaTuple, 43 | ProgressFunc, 44 | ParseResult, 45 | Sentence, 46 | Paragraph, 47 | ICELANDIC_RATIO, 48 | ) 49 | 50 | # Import the following _underscored classes to be able to use them 51 | # in type signatures in derived classes 52 | from .reynir import ( 53 | _Job, 54 | _Sentence, 55 | _Paragraph, 56 | ) 57 | from .nounphrase import NounPhrase 58 | from .fastparser import ParseForestPrinter, ParseForestDumper, ParseForestFlattener 59 | from .fastparser import ParseError, ParseForestNavigator 60 | from .settings import Settings 61 | from .bintokenizer import tokenize, TokenList 62 | 63 | # Expose the tokenizer API 64 | 65 | from tokenizer import ( 66 | TOK, 67 | Tok, 68 | paragraphs, 69 | correct_spaces, 70 | mark_paragraphs, 71 | TP_LEFT, 72 | TP_CENTER, 73 | TP_RIGHT, 74 | TP_NONE, 75 | TP_WORD, 76 | KLUDGY_ORDINALS_PASS_THROUGH, 77 | KLUDGY_ORDINALS_MODIFY, 78 | KLUDGY_ORDINALS_TRANSLATE, 79 | ) 80 | from tokenizer.abbrev import Abbreviations 81 | 82 | __author__ = "Miðeind ehf." 83 | __copyright__ = "© 2023 Miðeind ehf." 84 | __version__ = importlib.metadata.version("reynir") 85 | 86 | __all__ = ( 87 | "TP_LEFT", 88 | "TP_RIGHT", 89 | "TP_CENTER", 90 | "TP_NONE", 91 | "TP_WORD", 92 | "KLUDGY_ORDINALS_MODIFY", 93 | "KLUDGY_ORDINALS_PASS_THROUGH", 94 | "KLUDGY_ORDINALS_TRANSLATE", 95 | "Greynir", 96 | "Reynir", 97 | "Terminal", 98 | "LemmaTuple", 99 | "ProgressFunc", 100 | "ParseResult", 101 | "Sentence", 102 | "Paragraph", 103 | "ICELANDIC_RATIO", 104 | "TOK", 105 | "Tok", 106 | "paragraphs", 107 | "correct_spaces", 108 | "mark_paragraphs", 109 | "_Job", 110 | "_Sentence", 111 | "_Paragraph", 112 | "NounPhrase", 113 | "ParseForestPrinter", 114 | "ParseForestDumper", 115 | "ParseForestFlattener", 116 | "ParseError", 117 | "ParseForestNavigator", 118 | "Settings", 119 | "tokenize", 120 | "TokenList", 121 | "__version__", 122 | "__author__", 123 | "__copyright__", 124 | ) 125 | 126 | Abbreviations.initialize() 127 | Settings.read("config/GreynirEngine.conf") 128 | -------------------------------------------------------------------------------- /src/reynir/baseparser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Greynir: Natural language processing for Icelandic 3 | 4 | Parser base module 5 | 6 | Copyright © 2023 Miðeind ehf. 7 | 8 | This software is licensed under the MIT License: 9 | 10 | Permission is hereby granted, free of charge, to any person 11 | obtaining a copy of this software and associated documentation 12 | files (the "Software"), to deal in the Software without restriction, 13 | including without limitation the rights to use, copy, modify, merge, 14 | publish, distribute, sublicense, and/or sell copies of the Software, 15 | and to permit persons to whom the Software is furnished to do so, 16 | subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be 19 | included in all copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28 | 29 | This module defines a base parser class. The base is used in 30 | BIN_Parser (see binparser.py) which is again the base of the 31 | C++ Earley parser Fast_Parser (see fastparser.py) 32 | 33 | """ 34 | 35 | from typing import Dict, List, Iterator, Optional 36 | 37 | from .grammar import Grammar, GrammarItem, Terminal, Nonterminal, Production 38 | 39 | 40 | class _PackedProduction: 41 | 42 | """A container for a packed production, i.e. a grammar Production 43 | where the component terminals and nonterminals have been packed 44 | into a list of integer indices""" 45 | 46 | def __init__(self, priority: int, production: Production) -> None: 47 | # Store the relative priority of this production within its nonterminal 48 | self._priority = priority 49 | # Keep a reference to the original production 50 | self._production = production 51 | # Store the packed list of indices 52 | self._ix_list = production.prod 53 | # Cache the length 54 | self._len = len(self._ix_list) 55 | 56 | @property 57 | def production(self) -> Production: 58 | return self._production 59 | 60 | @property 61 | def priority(self) -> int: 62 | return self._priority 63 | 64 | def __getitem__(self, index: int) -> int: 65 | return self._ix_list[index] if 0 <= index < self._len else 0 66 | 67 | def __len__(self) -> int: 68 | return self._len 69 | 70 | def __iter__(self) -> Iterator[int]: 71 | return iter(self._ix_list) 72 | 73 | 74 | class Base_Parser: 75 | 76 | """Parses a sequence of tokens according to a given grammar and 77 | a root nonterminal within that grammar, returning a forest of 78 | possible parses. The parses uses an optimized Earley algorithm. 79 | """ 80 | 81 | def __init__(self) -> None: 82 | self._root: Optional[int] = None 83 | self._nt_dict: Dict[int, Optional[List[_PackedProduction]]] = {} 84 | self._nonterminals: Dict[int, Nonterminal] = {} 85 | self._terminals: Dict[int, Terminal] = {} 86 | 87 | def init_from_grammar(self, g: Grammar) -> None: 88 | """Initialize the parser with the given grammar""" 89 | nt_d = g.nt_dict 90 | r = g.root 91 | assert nt_d is not None 92 | assert r is not None 93 | assert r in nt_d 94 | # Convert the grammar to integer index representation for speed 95 | self._root = r.index 96 | # Make new grammar dictionary, keyed by nonterminal index and 97 | # containing packed productions with integer indices 98 | self._nt_dict = {} 99 | for nt, plist in nt_d.items(): 100 | self._nt_dict[nt.index] = ( 101 | [_PackedProduction(prio, p) for prio, p in plist] 102 | ) 103 | self._nonterminals = g.nonterminals_by_ix 104 | self._terminals = g.terminals_by_ix 105 | 106 | @classmethod 107 | def for_grammar(cls, g: Grammar) -> "Base_Parser": 108 | """Create a parser for the Grammar in g""" 109 | p = cls() 110 | p.init_from_grammar(g) 111 | return p 112 | 113 | def _lookup(self, ix: int) -> GrammarItem: 114 | """Convert a production item from an index to an object reference""" 115 | # Terminals have positive indices 116 | # Nonterminals have negative indices 117 | # A zero index is not allowed 118 | assert ix != 0 119 | return self._nonterminals[ix] if ix < 0 else self._terminals[ix] 120 | -------------------------------------------------------------------------------- /src/reynir/basics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Greynir: Natural language processing for Icelandic 3 | 4 | Basic classes module 5 | 6 | Copyright © 2023 Miðeind ehf. 7 | 8 | This software is licensed under the MIT License: 9 | 10 | Permission is hereby granted, free of charge, to any person 11 | obtaining a copy of this software and associated documentation 12 | files (the "Software"), to deal in the Software without restriction, 13 | including without limitation the rights to use, copy, modify, merge, 14 | publish, distribute, sublicense, and/or sell copies of the Software, 15 | and to permit persons to whom the Software is furnished to do so, 16 | subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be 19 | included in all copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28 | 29 | 30 | This module contains basic functions that are used by the settings 31 | module and other modules. These functions have been extracted from the 32 | settings module to avoid circular imports or module references. 33 | 34 | """ 35 | 36 | from typing import ( 37 | Callable, 38 | Iterable, 39 | Iterator, 40 | List, 41 | Optional, 42 | ) 43 | 44 | import os 45 | import locale 46 | 47 | from contextlib import contextmanager 48 | import importlib.resources as importlib_resources 49 | 50 | 51 | # The locale used by default in the changedlocale function 52 | _DEFAULT_LOCALE = ("IS_is", "UTF-8") 53 | 54 | # A set of all valid verb argument cases 55 | ALL_CASES = frozenset(("nf", "þf", "þgf", "ef")) 56 | ALL_GENDERS = frozenset(("kk", "kvk", "hk")) 57 | ALL_NUMBERS = frozenset(("et", "ft")) 58 | SUBCLAUSES = frozenset(("nh", "nhx", "falls", "spurns")) 59 | REFLPRN = {"sig": "sig_hk_et_þf", "sér": "sig_hk_et_þgf", "sín": "sig_hk_et_ef"} 60 | REFLPRN_CASE = {"sig": "þf", "sér": "þgf", "sín": "ef"} 61 | REFLPRN_SET = frozenset(REFLPRN.keys()) 62 | 63 | # BÍN compressed file format version (used in tools/binpack.py and bincompress.py) 64 | BIN_COMPRESSOR_VERSION = b"Greynir 02.00.00" 65 | assert len(BIN_COMPRESSOR_VERSION) == 16 66 | BIN_COMPRESSED_FILE = "ord.compressed" 67 | 68 | 69 | @contextmanager 70 | def changedlocale( 71 | new_locale: Optional[str] = None, category: str = "LC_COLLATE" 72 | ) -> Iterator[Callable[[str], str]]: 73 | """Change locale for collation temporarily within a context (with-statement)""" 74 | # The newone locale parameter should be a tuple: ('is_IS', 'UTF-8') 75 | # The category should be a string such as 'LC_TIME', 'LC_NUMERIC' etc. 76 | cat = getattr(locale, category) 77 | old_locale = locale.getlocale(cat) 78 | try: 79 | locale.setlocale(cat, new_locale or _DEFAULT_LOCALE) 80 | yield locale.strxfrm # Function to transform string for sorting 81 | finally: 82 | locale.setlocale(cat, old_locale) 83 | 84 | 85 | def sort_strings(strings: Iterable[str], loc: Optional[str] = None) -> List[str]: 86 | """Sort a list of strings using the specified locale's collation order""" 87 | # Change locale temporarily for the sort 88 | with changedlocale(loc) as strxfrm: 89 | return sorted(strings, key=strxfrm) 90 | 91 | 92 | class ConfigError(Exception): 93 | """Exception class for configuration errors""" 94 | 95 | def __init__(self, s: str) -> None: 96 | super().__init__(s) 97 | self.fname: Optional[str] = None 98 | self.line = 0 99 | 100 | def set_pos(self, fname: str, line: int) -> None: 101 | """Set file name and line information, if not already set""" 102 | if not self.fname: 103 | self.fname = fname 104 | self.line = line 105 | 106 | def __str__(self) -> str: 107 | """Return a string representation of this exception""" 108 | s = Exception.__str__(self) 109 | if not self.fname: 110 | return s 111 | return "File {0}, line {1}: {2}".format(self.fname, self.line, s) 112 | 113 | 114 | class LineReader: 115 | """Read lines from a text file, recognizing $include directives""" 116 | 117 | def __init__( 118 | self, 119 | fname: str, 120 | *, 121 | package_name: Optional[str] = None, 122 | outer_fname: Optional[str] = None, 123 | outer_line: int = 0 124 | ) -> None: 125 | self._fname = fname 126 | self._package_name = package_name 127 | self._line = 0 128 | self._inner_rdr: Optional[LineReader] = None 129 | self._outer_fname = outer_fname 130 | self._outer_line = outer_line 131 | 132 | def fname(self) -> str: 133 | """The name of the file being read""" 134 | return self._fname if self._inner_rdr is None else self._inner_rdr.fname() 135 | 136 | def line(self) -> int: 137 | """The number of the current line within the file""" 138 | return self._line if self._inner_rdr is None else self._inner_rdr.line() 139 | 140 | def lines(self) -> Iterator[str]: 141 | """Generator yielding lines from a text file""" 142 | self._line = 0 143 | try: 144 | if self._package_name: 145 | ref = importlib_resources.files("reynir").joinpath(self._fname) 146 | stream = ref.open("rb") 147 | else: 148 | stream = open(self._fname, "rb") 149 | with stream as inp: 150 | # Read config file line-by-line from the package resources 151 | accumulator = "" 152 | for b in inp: 153 | # We get byte strings; convert from utf-8 to Python strings 154 | s = b.decode("utf-8") 155 | self._line += 1 156 | if s.rstrip().endswith("\\"): 157 | # Backslash at end of line: continuation in next line 158 | accumulator += s.strip()[:-1] 159 | continue 160 | if accumulator: 161 | # Add accumulated text from preceding 162 | # backslash-terminated lines, but drop leading whitespace 163 | s = accumulator + s.lstrip() 164 | accumulator = "" 165 | # Check for include directive: $include filename.txt 166 | if s.startswith("$") and s.lower().startswith("$include "): 167 | iname = s.split(maxsplit=1)[1].strip() 168 | # Do some path magic to allow the included path 169 | # to be relative to the current file path, or a 170 | # fresh (absolute) path by itself 171 | head, _ = os.path.split(self._fname) 172 | iname = os.path.join(head, iname) 173 | rdr = self._inner_rdr = LineReader( 174 | iname, 175 | package_name=self._package_name, 176 | outer_fname=self._fname, 177 | outer_line=self._line, 178 | ) 179 | yield from rdr.lines() 180 | self._inner_rdr = None 181 | else: 182 | yield s 183 | if accumulator: 184 | # Catch corner case where last line of file ends with a backslash 185 | yield accumulator 186 | except (IOError, OSError): 187 | if self._outer_fname: 188 | # This is an include file within an outer config file 189 | c = ConfigError( 190 | "Error while opening or reading include file '{0}'".format( 191 | self._fname 192 | ) 193 | ) 194 | c.set_pos(self._outer_fname, self._outer_line) 195 | else: 196 | # This is an outermost config file 197 | c = ConfigError( 198 | "Error while opening or reading config file '{0}'".format( 199 | self._fname 200 | ) 201 | ) 202 | raise c 203 | -------------------------------------------------------------------------------- /src/reynir/bindb.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | BinDb module 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | 9 | This software is licensed under the MIT License: 10 | 11 | Permission is hereby granted, free of charge, to any person 12 | obtaining a copy of this software and associated documentation 13 | files (the "Software"), to deal in the Software without restriction, 14 | including without limitation the rights to use, copy, modify, merge, 15 | publish, distribute, sublicense, and/or sell copies of the Software, 16 | and to permit persons to whom the Software is furnished to do so, 17 | subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be 20 | included in all copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 29 | 30 | This module implements a thin wrapper on top of the GreynirBin 31 | class from BinPackage, as well as a couple of basic data classes. 32 | 33 | """ 34 | 35 | from typing import Any, List, Optional, Tuple 36 | from functools import lru_cache 37 | 38 | from islenska.basics import make_bin_entry, ALL_CASES 39 | from islenska.bindb import GreynirBin as GBin, PERSON_NAME_FL 40 | 41 | from tokenizer.definitions import BIN_Tuple 42 | 43 | from .settings import StaticPhrases 44 | 45 | # SHSnid tuple as seen by the Greynir compatibility layer 46 | ResultTuple = Tuple[str, List[BIN_Tuple]] 47 | 48 | 49 | # Size of name cache for lookup_name_gender 50 | _NAME_GENDER_CACHE_SIZE = 128 51 | 52 | 53 | class GreynirBin(GBin): 54 | 55 | """Overridden class that adds a singleton instance of GreynirBin 56 | and a context manager protocol""" 57 | 58 | _singleton: Optional["GreynirBin"] = None 59 | 60 | @classmethod 61 | def get_db(cls) -> "GreynirBin": 62 | if cls._singleton is None: 63 | cls._singleton = GreynirBin() 64 | return cls._singleton 65 | 66 | def __enter__(self) -> "GreynirBin": 67 | """Allow this class to be used in a with statement""" 68 | return self 69 | 70 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: 71 | pass 72 | 73 | def lookup_g( 74 | self, w: str, at_sentence_start: bool = False, auto_uppercase: bool = False 75 | ) -> ResultTuple: 76 | """Returns BIN_Tuple instances, which are the Greynir version 77 | of islenska.BinEntry""" 78 | w, m = self._lookup( 79 | w, 80 | at_sentence_start, 81 | auto_uppercase, 82 | self._meanings_cache_lookup, 83 | make_bin_entry, 84 | ) 85 | return w, [BIN_Tuple._make(mm) for mm in m] 86 | 87 | def lookup_nominative_g(self, w: str, **options: Any) -> List[BIN_Tuple]: 88 | """Returns the Greynir version of islenska.BinEntry""" 89 | return [BIN_Tuple._make(mm) for mm in super().lookup_nominative(w, **options)] 90 | 91 | def lookup_accusative_g(self, w: str, **options: Any) -> List[BIN_Tuple]: 92 | """Returns the Greynir version of islenska.BinEntry""" 93 | return [BIN_Tuple._make(mm) for mm in super().lookup_accusative(w, **options)] 94 | 95 | def lookup_dative_g(self, w: str, **options: Any) -> List[BIN_Tuple]: 96 | """Returns the Greynir version of islenska.BinEntry""" 97 | return [BIN_Tuple._make(mm) for mm in super().lookup_dative(w, **options)] 98 | 99 | def lookup_genitive_g(self, w: str, **options: Any) -> List[BIN_Tuple]: 100 | """Returns the Greynir version of islenska.BinEntry""" 101 | return [BIN_Tuple._make(mm) for mm in super().lookup_genitive(w, **options)] 102 | 103 | def meanings(self, w: str) -> List[BIN_Tuple]: 104 | """Low-level lookup of BIN_Tuple instances for the given word""" 105 | return [ 106 | BIN_Tuple(k.ord, k.bin_id, k.ofl, k.hluti, k.bmynd, k.mark) 107 | for k in self._ksnid_lookup(w) 108 | ] 109 | 110 | @lru_cache(maxsize=_NAME_GENDER_CACHE_SIZE) 111 | def lookup_name_gender(self, name: str, preferred_case: str = "nf") -> str: 112 | """Given a person name, lookup its gender""" 113 | assert preferred_case in ALL_CASES 114 | 115 | if not name: 116 | return "hk" # Unknown gender 117 | 118 | w = name.split(maxsplit=1)[0] # Get first name 119 | m = self.meanings(w) # Look up meanings 120 | if m: 121 | # Find all meanings that can be person names 122 | nl = [x for x in m if x.fl in PERSON_NAME_FL] 123 | if nl: 124 | # Find all meanings in the preferred case 125 | prefc = [x for x in nl if x.beyging.lower().startswith(preferred_case)] 126 | if prefc: 127 | # Found a name meaning in the preferred case 128 | return prefc[0].ordfl 129 | # Found a name meaning *not* in the preferred case 130 | return nl[0].ordfl 131 | 132 | # The first name was not found: check whether the full name is 133 | # in the static phrases 134 | m = StaticPhrases.lookup(name) 135 | if m is not None: 136 | if m.fl in PERSON_NAME_FL: 137 | return m.ordfl 138 | return "hk" # Unknown gender 139 | -------------------------------------------------------------------------------- /src/reynir/cache.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | cache.py 4 | 5 | Cache utility classes 6 | 7 | The LRU_Cache and LFU_Cache classes herein are 8 | copyright © 2011 by Raymond Hettinger 9 | 10 | cf. http://code.activestate.com/recipes/577970-simplified-lru-cache/ 11 | http://code.activestate.com/recipes/498245-lru-and-lfu-cache-decorators/ 12 | 13 | MIT license: 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy of 16 | this software and associated documentation files (the "Software"), to deal in 17 | the Software without restriction, including without limitation the rights to use, 18 | copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the 19 | Software, and to permit persons to whom the Software is furnished to do so, 20 | subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included 23 | in all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 28 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 31 | IN THE SOFTWARE. 32 | 33 | --- 34 | 35 | The classes have been modified from their original versions, 36 | which are available from the URLs given above. 37 | 38 | """ 39 | 40 | from typing import List, Dict, Any, Callable, TypeVar, Generic, cast 41 | 42 | from heapq import nsmallest 43 | from operator import itemgetter 44 | import threading 45 | from functools import wraps 46 | 47 | 48 | LRU_DEFAULT = 1024 49 | LFU_DEFAULT = 512 50 | 51 | 52 | _K = TypeVar("_K") 53 | _V = TypeVar("_V") 54 | 55 | 56 | class LRU_Cache(Generic[_V]): 57 | def __init__( 58 | self, user_function: Callable[..., _V], maxsize: int = LRU_DEFAULT 59 | ) -> None: 60 | # Link layout: [PREV, NEXT, KEY, RESULT] 61 | root: List[Any] = [None, None, None, None] 62 | self.root = root 63 | self.user_function = user_function 64 | cache: Dict[Any, List[Any]] = {} 65 | self.cache = cache 66 | 67 | last: List[Any] = root 68 | for _ in range(maxsize): 69 | key = object() 70 | cache[key] = last[1] = last = [last, root, key, None] 71 | root[0] = last 72 | 73 | def __call__(self, *key: Any) -> _V: 74 | cache = self.cache 75 | root = self.root 76 | link = cache.get(key) 77 | if link is not None: 78 | link_prev, link_next, _, result = link 79 | link_prev[1] = link_next 80 | link_next[0] = link_prev 81 | last = root[0] 82 | last[1] = root[0] = link 83 | link[0] = last 84 | link[1] = root 85 | return result 86 | result = self.user_function(*key) 87 | root[2] = key 88 | root[3] = result 89 | oldroot = root 90 | root = self.root = root[1] 91 | root[2], oldkey = None, root[2] 92 | root[3] = None 93 | del cache[oldkey] 94 | cache[key] = oldroot 95 | return result 96 | 97 | 98 | class Counter(Dict[_K, int], Generic[_K]): 99 | """Mapping where default values are zero""" 100 | 101 | def __missing__(self, key: _K) -> int: 102 | return 0 103 | 104 | 105 | class LFU_Cache(Generic[_K, _V]): 106 | 107 | """Least-frequently-used (LFU) cache for word lookups. 108 | Based on a pattern by Raymond Hettinger 109 | """ 110 | 111 | def __init__(self, maxsize: int = LFU_DEFAULT) -> None: 112 | # Mapping of keys to results 113 | self.cache: Dict[_K, _V] = {} 114 | # Times each key has been accessed 115 | self.use_count: Counter[_K] = Counter() 116 | self.maxsize = maxsize 117 | self.hits = self.misses = 0 118 | # The cache may be accessed in parallel by multiple threads 119 | self.lock = threading.Lock() 120 | 121 | def lookup(self, key: _K, func: Callable[[_K], _V]) -> _V: 122 | """Lookup a key in the cache, calling func(key) 123 | to obtain the data if not already there""" 124 | with self.lock: 125 | self.use_count[key] += 1 126 | # Get cache entry or compute if not found 127 | try: 128 | result = self.cache[key] 129 | self.hits += 1 130 | except KeyError: 131 | result = func(key) 132 | self.cache[key] = result 133 | self.misses += 1 134 | 135 | # Purge the 10% least frequently used cache entries 136 | if len(self.cache) > self.maxsize: 137 | for key, _ in nsmallest( 138 | self.maxsize // 10, self.use_count.items(), key=itemgetter(1) 139 | ): 140 | 141 | del self.cache[key], self.use_count[key] 142 | 143 | return result 144 | 145 | 146 | # Define a type variable to allow MyPy to infer the relationship 147 | # between intermediate types in cached and cached_property 148 | _T = TypeVar("_T") 149 | 150 | # Define a unique singleton for use as a sentinel 151 | _NA = object() 152 | 153 | 154 | def cached(func: Callable[..., _T]) -> Callable[..., _T]: 155 | """A decorator for caching function calls""" 156 | 157 | @wraps(func) 158 | def wrapper(*args: Any, **kwargs: Any) -> _T: 159 | val = cast(_T, getattr(func, "_cache", _NA)) 160 | if val is _NA: 161 | val = func(*args, **kwargs) 162 | setattr(func, "_cache", val) 163 | return val 164 | 165 | return wrapper 166 | 167 | 168 | class cached_property(Generic[_T]): 169 | 170 | """A decorator for caching instance properties""" 171 | 172 | def __init__(self, func: Callable[..., _T]) -> None: 173 | self.__doc__ = getattr(func, "__doc__") 174 | self.func = func 175 | 176 | def __get__(self, obj: Any, cls: Any) -> _T: 177 | if obj is None: 178 | return cast(_T, self) # Hack to satisfy mypy/Pylance 179 | # Get the property value and put it into the instance's 180 | # dict instead of the original function 181 | val = obj.__dict__[self.func.__name__] = self.func(obj) 182 | return val 183 | -------------------------------------------------------------------------------- /src/reynir/config/Abbrev_errors.conf: -------------------------------------------------------------------------------- 1 | 2 | þ.á.m. = "þar á meðal" ao frasi # Algeng villa 3 | n.k.* = "næstkomandi" lo # Prentvilla, en talin nógu saklaus til að leyfa 4 | -------------------------------------------------------------------------------- /src/reynir/config/AdjectivePredicates.conf: -------------------------------------------------------------------------------- 1 | # Greynir: Natural language processing for Icelandic 2 | 3 | # From Kristín Þóra Pétursdóttir's Master's thesis, http://hdl.handle.net/1946/17722, 4 | # with additions 5 | 6 | [adjective_predicates] 7 | 8 | aðgengilegur þgf 9 | andsnúinn þgf 10 | andstreymur þgf 11 | beinisamur þgf 12 | bundinn þgf 13 | feginn þgf 14 | fráhverfur þgf 15 | fyrirlitlegur þgf 16 | frændhollur þgf 17 | góðviljaður þgf 18 | haldinn þgf 19 | harmdauði þgf 20 | háður þgf 21 | óháður þgf 22 | hliðhollur þgf 23 | hollur þgf 24 | liðsinnaður þgf 25 | lokaður þgf 26 | mótdrægur þgf 27 | mótfallinn þgf 28 | mótgjarn þgf 29 | mótsnúinn þgf 30 | opinn þgf 31 | óbrigður þgf 32 | reiðubúinn þgf 33 | skæður þgf 34 | tilgefinn þgf 35 | vandabundinn þgf 36 | viljaður þgf 37 | vingæfur þgf 38 | vinhallur þgf 39 | vinhollur þgf 40 | vinveittur þgf 41 | vorkunnugur þgf 42 | 43 | alkunnugur þgf 44 | alkunnur þgf 45 | áþekkur þgf 46 | fjarlægur þgf 47 | fjarstæður þgf 48 | frábrugðinn þgf 49 | jafn þgf 50 | jafnaldra þgf 51 | jafnborinn þgf 52 | jafnfætis þgf 53 | jafngamall þgf 54 | jafnkosta þgf 55 | jafnkristinn þgf 56 | jafnliða þgf 57 | jafnlíkur þgf 58 | kunnugur þgf 59 | kynlíkur þgf 60 | ólíkur þgf 61 | líkur þgf 62 | málkunnugur þgf 63 | merktur þgf 64 | nafnkunnugur þgf 65 | náinn þgf 66 | nágöngull þgf 67 | nákominn þgf 68 | nálægur þgf 69 | nástæður þgf 70 | nærgengur þgf 71 | nærstandandi þgf 72 | nærri þgf 73 | ókunnur þgf 74 | sambærilegur þgf 75 | sameiginlegur þgf 76 | samferða þgf 77 | samhljóða þgf 78 | samhliða þgf 79 | samhuga þgf 80 | samkynja þgf 81 | ósamjafn þgf 82 | samjafn þgf 83 | samkvæmur þgf 84 | samlaga þgf 85 | samlendur þgf 86 | sammála þgf 87 | sammæddur þgf 88 | samnefndur þgf 89 | samsekur þgf 90 | samsíða þgf 91 | samskipa þgf 92 | samskóla þgf 93 | samstunda þgf 94 | samtengdur þgf 95 | samtíða þgf 96 | samvista þgf 97 | samþykkur þgf 98 | sifjaður þgf 99 | skaplíkur þgf 100 | óskyldur þgf 101 | skyldur þgf 102 | svipaður þgf 103 | tengdur þgf 104 | ótengdur þgf 105 | fasttengdur þgf 106 | 107 | alskipaður þgf 108 | ataður þgf 109 | auðráðinn þgf 110 | auðráður þgf 111 | áfastur þgf 112 | ánafnaður þgf 113 | blandaður þgf 114 | blandinn þgf 115 | byggður þgf 116 | búinn þgf 117 | eignaður þgf 118 | firrtur þgf 119 | gróinn þgf 120 | gyrtur þgf 121 | gæddur þgf 122 | heyrilegur þgf 123 | hjúpaður þgf 124 | hlaðinn þgf 125 | hulinn þgf 126 | innborinn þgf 127 | fagurskrýddur þgf 128 | falur þgf 129 | gildur þgf 130 | óheimill þgf 131 | heimill þgf 132 | kafhlaðinn þgf 133 | kafinn þgf 134 | klæddur þgf 135 | íklæddur þgf 136 | knúinn þgf 137 | litaður þgf 138 | rammskipaður þgf 139 | rúinn þgf 140 | skreyttur þgf 141 | sleginn þgf 142 | smurður þgf 143 | gersneyddur þgf 144 | sneyddur þgf 145 | sveipaður þgf 146 | sýnilegur þgf 147 | umvafinn þgf 148 | undanskilinn þgf 149 | undanþeginn þgf 150 | vaxinn þgf 151 | þakinn þgf 152 | vafinn þgf 153 | úðaður þgf 154 | 155 | ástfólginn þgf 156 | einhlítur þgf 157 | frábitinn þgf 158 | gagnlegur þgf 159 | hagfelldur þgf 160 | haldsamur þgf 161 | hjartfólginn þgf 162 | hugfelldur þgf 163 | hugleikinn þgf 164 | hugnæmur þgf 165 | hugstæður þgf 166 | hugþekkur þgf 167 | kær þgf 168 | ljós þgf 169 | leiður þgf 170 | maklegur þgf 171 | náttúrulegur þgf 172 | óskapfelldur þgf 173 | tamur þgf 174 | vandalaus þgf 175 | 176 | afhuga þgf 177 | andvígur þgf 178 | ástúðlegur þgf 179 | blíður þgf 180 | eftirlátur þgf 181 | fráskila þgf 182 | fylgisamur þgf 183 | fylginn þgf 184 | góður þgf 185 | góðviljaður þgf 186 | gramur þgf 187 | grimmur þgf 188 | handgenginn þgf 189 | harður þgf 190 | hjálplegur þgf 191 | hlynntur þgf 192 | hlýðinn þgf 193 | hægur þgf 194 | hættulegur þgf 195 | leiðitamur þgf 196 | miskunnsamur þgf 197 | mjúkur þgf 198 | reiður þgf 199 | skuldbundinn þgf 200 | traustur þgf 201 | ótrúr þgf 202 | trúr þgf 203 | ótryggur þgf 204 | tryggur þgf 205 | undirgefinn þgf 206 | undirlátur þgf 207 | viðbúinn þgf 208 | vondur þgf 209 | óþakklátur þgf 210 | þakklátur þgf 211 | þekkur þgf 212 | þýður þgf 213 | þægur þgf 214 | þægilegur þgf 215 | æfur þgf 216 | 217 | ástúðlegur /við þf 218 | blíður /við þf 219 | bundinn /við þf 220 | duglegur /við þf 221 | eftirlátur /við þf 222 | fráskila /við þf 223 | fylgisamur /við þf 224 | fylginn /við þf 225 | góður /við þf 226 | góðviljaður /við þf 227 | gramur /við þf 228 | grimmur /við þf 229 | handgenginn /við þf 230 | harður /við þf 231 | hjálplegur /við þf 232 | hlynntur /við þf 233 | hlýðinn /við þf 234 | hægur /við þf 235 | hættulegur /við þf 236 | leiðitamur /við þf 237 | miskunnsamur /við þf 238 | mjúkur /við þf 239 | reiður /við þf 240 | skuldbundinn /við þf 241 | traustur /við þf 242 | ótrúr /við þf 243 | trúr /við þf 244 | ótryggur /við þf 245 | tryggur /við þf 246 | undirgefinn /við þf 247 | undirlátur /við þf 248 | viðbúinn /við þf 249 | vondur /við þf 250 | óþakklátur /við þf 251 | þakklátur /við þf 252 | þekkur /við þf 253 | þýður /við þf 254 | þægur /við þf 255 | þægilegur /við þf 256 | æfur /við þf 257 | 258 | auðveldur þgf 259 | dýrmætur þgf 260 | eiginlegur þgf 261 | erfiður þgf 262 | óhagstæður þgf 263 | hagstæður þgf 264 | harðleikinn þgf 265 | óhentugur þgf 266 | hentugur þgf 267 | hættur þgf 268 | illur þgf 269 | nytsamur þgf 270 | nytsamlegur þgf 271 | skaðlaus þgf 272 | skaðlegur þgf 273 | skaðsamlegur þgf 274 | skaðsamur þgf 275 | skaðvænlegur þgf 276 | skaðvænn þgf 277 | torveldur þgf 278 | óþarfur þgf 279 | þarfur þgf 280 | þolanlegur þgf 281 | 282 | auðveldur /við þf 283 | auðveldur /fyrir þf 284 | dýrmætur /fyrir þf 285 | eiginlegur /fyrir þf 286 | erfiður /við þf 287 | erfiður /fyrir þf 288 | óhagstæður /fyrir þf 289 | hagstæður /fyrir þf 290 | harðleikinn /fyrir þf 291 | óhentugur /fyrir þf 292 | hentugur /fyrir þf 293 | hættur /við þf 294 | illur /við þf 295 | nytsamur /fyrir þf 296 | nytsamlegur /fyrir þf 297 | skaðlaus /fyrir þf 298 | skaðlegur /fyrir þf 299 | skaðsamlegur /fyrir þf 300 | skaðsamur /fyrir þf 301 | skaðvænlegur /fyrir þf 302 | skaðvænn /fyrir þf 303 | torveldur /fyrir þf 304 | óþarfur /fyrir þf 305 | þarfur /fyrir þf 306 | þolanlegur /fyrir þf 307 | 308 | viðriðinn þf 309 | viðstaddur þf 310 | viðloðandi þf 311 | viðloðinn þf 312 | # varðandi þf # Virðist vera gripið með so_lh_nt 313 | 314 | verður ef 315 | fullur ef 316 | frjáls ef # frjáls ferða sinna, frjáls skoðana sinna 317 | fullviss ef # þess fullviss að... 318 | meðvitaður ef # þess meðvitaður að... 319 | 320 | # að/af errors 321 | 322 | auðugur /að þgf 323 | auðugur /af þgf $error(WRONG-PP, að) 324 | kunnur /að þgf 325 | kunnur /af þgf $error(WRONG-PP, að) 326 | ólétt /að þgf 327 | ólétt /af þgf $error(WRONG-PP, að) 328 | ófrísk /að þgf 329 | ófrísk /af þgf $error(WRONG-PP, að) 330 | vanfær /að þgf 331 | vanfær /af þgf $error(WRONG-PP, að) 332 | rammur /að þgf 333 | rammur /af þgf $error(WRONG-PP, að) 334 | uppvís /að þgf 335 | uppvís /af þgf $error(WRONG-PP, að) 336 | ríkur /að þgf 337 | ríkur /af þgf $error(WRONG-PP, að) 338 | snauður /að þgf 339 | snauður /af þgf $error(WRONG-PP, að) 340 | þekktur /að þgf 341 | þekktur /af þgf $error(WRONG-PP, að) 342 | -------------------------------------------------------------------------------- /src/reynir/config/GreynirEngine.conf: -------------------------------------------------------------------------------- 1 | # 2 | # GreynirEngine.conf 3 | # 4 | # Configuration file for GreynirEngine ('reynir' on PyPI) 5 | # 6 | # Copyright © 2023 Miðeind ehf 7 | # 8 | # This software is licensed under the MIT License: 9 | # 10 | # Permission is hereby granted, free of charge, to any person 11 | # obtaining a copy of this software and associated documentation 12 | # files (the "Software"), to deal in the Software without restriction, 13 | # including without limitation the rights to use, copy, modify, merge, 14 | # publish, distribute, sublicense, and/or sell copies of the Software, 15 | # and to permit persons to whom the Software is furnished to do so, 16 | # subject to the following conditions: 17 | # 18 | # The above copyright notice and this permission notice shall be 19 | # included in all copies or substantial portions of the Software. 20 | # 21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 24 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 25 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 26 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 27 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 28 | 29 | # This file is read in the package's __init__.py, via the settings.py module. 30 | # It omits settings that are not relevant to Greynir in its package form, 31 | # such as web server and database configuration. 32 | 33 | [settings] 34 | 35 | debug = false 36 | 37 | # Phrases.conf should be included before Prepositions.conf 38 | 39 | $include Phrases.conf 40 | 41 | $include Prepositions.conf 42 | 43 | $include Prefs.conf 44 | 45 | $include Names.conf 46 | 47 | $include Verbs.conf 48 | 49 | $include Adjectives.conf 50 | 51 | $include AdjectivePredicates.conf 52 | -------------------------------------------------------------------------------- /src/reynir/config/Names.conf: -------------------------------------------------------------------------------- 1 | 2 | # Greynir: Natural language processing for Icelandic 3 | 4 | # Additional information and configuration for person names 5 | 6 | # Copyright © 2023 Miðeind ehf. 7 | 8 | [disallowed_names] 9 | 10 | # Orðmyndir sem eru ekki teknar gildar sem byrjun nafna í því falli sem gefið er 11 | 12 | Almarr þf þgf ef 13 | Annar nf þf ef # Nánast alltaf fornafnið 14 | Annarr þf þgf ef # Nánast alltaf fornafnið 15 | Ara nf 16 | Án nf þf 17 | Ásti þf þgf ef 18 | Birnir þf þgf 19 | Bjarna nf 20 | Elína þf þgf 21 | Ernir þf þgf 22 | Donaldur þf þgf ef 23 | Finn þf þgf ef 24 | Fjalarr þf þgf ef 25 | Frár þf þgf 26 | Gamli nf þf þgf ef # Nánast alltaf lýsingarorð 27 | Gerða þf þgf ef 28 | Grein nf þf þgf 29 | Guðna nf 30 | Guðmund nf þf þgf ef 31 | Guðrúnn þf 32 | Gunnur ef 33 | Harald nf 34 | Heiðarr þf þgf ef 35 | Hildir þf þgf 36 | Hnikarr þf þgf ef 37 | Ísarr þf þgf ef 38 | Ísrael nf þg þgf ef # Nánast alltaf örnefnið 39 | Karli nf 40 | Konráður þf þgf ef 41 | Kristína þf þgf ef 42 | Leif þf þgf ef 43 | Minning nf þf þgf ef # Villa í BÍN? 44 | Oddnýr þf þgf 45 | Ormarr þf þgf ef 46 | Óttarr þf þgf ef 47 | Rögn ef 48 | Sali þf þgf ef 49 | Sigmund þf þgf ef 50 | Sigurð þf þgf ef 51 | Skúla nf 52 | Sólveigur þf þgf 53 | Steinarr þf þgf ef 54 | Styrr þf þgf ef 55 | Sævarr þf þgf ef 56 | Vörður þgf 57 | Ýrr þf þgf ef 58 | Þórr þf þgf ef 59 | Ævarr þf þgf ef 60 | Örvarr þf þgf ef 61 | 62 | 63 | # Margræð orð sem skilja á sem nöfn í byrjun setningar þó þau standi ein 64 | 65 | [name_preferences] 66 | 67 | Aðalberg 68 | Aðalbergi 69 | Aðalbergs 70 | Aðalbjargar 71 | Aðalbjörg 72 | Aðalráð 73 | Aðalráði 74 | Aðalráðs 75 | Agata 76 | Agða 77 | Agga 78 | Aggi 79 | Agli 80 | Agna 81 | Agnar 82 | Akurrós 83 | Akurrósar 84 | Alba 85 | Alda 86 | Andra 87 | Andri 88 | Ara 89 | Ari 90 | Arnar 91 | Arngeir 92 | Arngeiri 93 | Arngeirs 94 | Aski 95 | Asks 96 | Askur 97 | Aspar 98 | Assa 99 | Atla 100 | Atlas 101 | Atlasar 102 | Atlass 103 | Atli 104 | #Auðar 105 | Auðari 106 | Auðnu 107 | Ámunda 108 | Áni 109 | Áns 110 | Árbót 111 | Árbótar 112 | Árdís 113 | Árdísar 114 | Ármann 115 | Ármanni 116 | Ármanns 117 | #Árnes # Mannsnafn?? 118 | #Árnesi # Mannsnafn?? 119 | Ársól 120 | Ársólar 121 | Ársæl 122 | Ársælar 123 | Ársæli 124 | Ársæll 125 | Ársæls 126 | Árveig 127 | Árveigar 128 | Ásmund 129 | Ásmundar 130 | Ásmundi 131 | Ásmundur 132 | Ástráð 133 | Ástráði 134 | Ástráðs 135 | Ástríður 136 | Baldri 137 | Baldur 138 | Baldurs 139 | Barbara 140 | Barra 141 | #Bassa 142 | #Bassi 143 | Bella 144 | Benediktína 145 | Benta 146 | Bents 147 | Bergdís 148 | Bergdísar 149 | Berglind 150 | Berglindar 151 | Bergstein 152 | Bergsteina 153 | Bergsteini 154 | Bergsteinn 155 | Bergsteins 156 | Bersa 157 | Bersi 158 | Beru 159 | Bessa 160 | Bessi 161 | Beta 162 | Betu 163 | Birkir 164 | Birna 165 | Birnis 166 | Birnu 167 | Birtingi 168 | Birtings 169 | Birtingur 170 | Bjarglindar 171 | Bjarglindi 172 | Bjólan 173 | Blöku 174 | Bos 175 | Braga 176 | Bragi 177 | Brand 178 | Brandi 179 | Brandís 180 | Brands 181 | Brár 182 | Breka 183 | Breki 184 | Bretting 185 | Brima 186 | Brimar 187 | Brimari 188 | Brimi 189 | Bubba 190 | #Burkna 191 | #Burkni 192 | Bæring 193 | Böðvar 194 | Dagfara 195 | Dagfari 196 | Dagga 197 | Daggar 198 | Dagheiðar 199 | Dagheiði 200 | Dagheiður 201 | Dagmey 202 | Dagmeyjar 203 | Dagmeyju 204 | Dalrós 205 | Dalrósar 206 | Danna 207 | Danni 208 | Darra 209 | Darri 210 | Davíða 211 | Davíðu 212 | Dofra 213 | Dofri 214 | Drafnar 215 | Draumrún 216 | Draumrúnar 217 | Droplaug 218 | Droplaugar 219 | Dröfn 220 | Dúa 221 | Dúi 222 | Dæju 223 | Döllu 224 | Ebba 225 | Ebbi 226 | Edda 227 | Eddu 228 | Editar 229 | Eiðunnar 230 | Eiðunni 231 | Eiðvarar 232 | Eiðvör 233 | Eiðvöru 234 | Eldborg 235 | Eldborgar 236 | Eldey 237 | Eldeyjar 238 | Eldeyju 239 | Elfa 240 | Elfar 241 | Elfi 242 | Elfu 243 | Elliða 244 | Elliði 245 | Elna 246 | Emma 247 | Emmu 248 | Erla 249 | Erlar 250 | Erlu 251 | Erna 252 | Erni 253 | Ernu 254 | Esja 255 | Esju 256 | Eski 257 | Etna 258 | Etnu 259 | Eygló 260 | Eyglóar 261 | Fía 262 | Fjalar 263 | Fjölvar 264 | Fjölvari 265 | Fjölvars 266 | Fjörni 267 | Fjörnir 268 | Fjörnis 269 | Flosa 270 | Flosi 271 | Fransiska 272 | Fransisku 273 | Friðgerðar 274 | Friðmann 275 | Friðmanni 276 | Friðmanns 277 | Fúsa 278 | Fúsi 279 | Fylkis 280 | Galti 281 | Garra 282 | Garri 283 | Gassa 284 | Gassi 285 | Gaut 286 | Gauta 287 | Gauti 288 | Gauts 289 | Gautur 290 | Gefn 291 | Geir 292 | Geirs 293 | Gelli 294 | Gellir 295 | Gellis 296 | Gerðari 297 | Gígja 298 | Gígjari 299 | Gígju 300 | Glóey 301 | Glóeyjar 302 | Gnúp 303 | Gnúpi 304 | Gnúps 305 | Gnúpur 306 | #Góa 307 | #Góu 308 | Grana 309 | Grani 310 | Greipi 311 | Greips 312 | Greipur 313 | Grettis 314 | Grétu 315 | Grímar 316 | Grími 317 | Gudda 318 | Guddu 319 | Gullbrá 320 | Gullbrár 321 | Gullveig 322 | Gullveigar 323 | Gumma 324 | Gummi 325 | Gunnar 326 | Gunni 327 | Gunnlaðar 328 | Gunnlöð 329 | Gunnur 330 | Gylfa 331 | Gylfi 332 | Gyrðis 333 | Gýgjar 334 | Gými 335 | Gýmir 336 | Gýmis 337 | Hadd 338 | Hadda 339 | Haddar 340 | Haddi 341 | Hadds 342 | Haðar 343 | Hafborg 344 | Hafborgar 345 | Hafdís 346 | Hafdísar 347 | Hafliða 348 | Hafliði 349 | Hafnari 350 | #Haföldu 351 | Hall 352 | Hansa 353 | Harra 354 | Harri 355 | Hauður 356 | Hauk 357 | Hávar 358 | Hedda 359 | Heiðmann 360 | Heiðmanni 361 | Heiðmanns 362 | Heimis 363 | Hein 364 | Hekla 365 | Heklu 366 | Helma 367 | Helmu 368 | Herborg 369 | Herborgar 370 | Hergarð 371 | Hergarði 372 | Hergarðs 373 | Herjólf 374 | Herjólfi 375 | Herjólfs 376 | Herjólfur 377 | Hermann 378 | Hermanni 379 | Hermanns 380 | Hersi 381 | Hersir 382 | Hersis 383 | Héðin 384 | Héðinn 385 | Héðins 386 | Héðni 387 | Hilda 388 | Hildar 389 | Hildi 390 | Hildir 391 | Hildis 392 | Hildur 393 | Hilmi 394 | Hilmir 395 | Hilmis 396 | Hjalta 397 | Hjalti 398 | Hjartar 399 | Hjálmrún 400 | Hjálmrúnar 401 | Hjört 402 | Hjörvar 403 | Hleinar 404 | Hlífari 405 | Hlín 406 | Hlínar 407 | Hlöð 408 | Hlöður 409 | Hraunar 410 | Hróa 411 | Hróar 412 | Hrói 413 | Hrund 414 | Hrundar 415 | Hugborg 416 | Hugborgar 417 | Huld 418 | #Huldar 419 | Huldari 420 | Höddu 421 | Höð 422 | Inga 423 | Ingi 424 | Innu 425 | Irpa 426 | Irpu 427 | Íma 428 | Ími 429 | Ímu 430 | Íris 431 | Írisar 432 | Ísafold 433 | Ísafoldar 434 | Ísbjörg 435 | Ísey 436 | Íseyjar 437 | Íseyju 438 | Ísfold 439 | Ísfoldar 440 | Ísgerðar 441 | Íslilja 442 | Íslilju 443 | Jakobína 444 | Jan 445 | Jans 446 | Jara 447 | Járnbrá 448 | Járnbrár 449 | Jóa 450 | Jóanna 451 | Jódís 452 | Jódísar 453 | Jón 454 | Jóna 455 | Jónanna 456 | Jóni 457 | Jónu 458 | Jórunn 459 | Jórunnar 460 | Júlla 461 | Júllu 462 | Jústa 463 | Jústu 464 | Jöru 465 | #Kalla # Kalla þurfti til lögreglu ... 466 | Kalli 467 | Kamilla 468 | Kamillu 469 | Kamma 470 | #Kana 471 | #Kani 472 | Kara 473 | Kata 474 | Katla 475 | Katli 476 | Kára 477 | Kári 478 | Ketil 479 | Ketill 480 | Ketils 481 | Kidda 482 | Kiddi 483 | Kiljan 484 | Kjalar 485 | Kolbrún 486 | Kolbrúnar 487 | Kolbrúnu 488 | Kolur 489 | Kolþerna 490 | Kolþernu 491 | Krissa 492 | Krissi 493 | Kristmann 494 | Kristmanni 495 | Kristmanns 496 | Köllu 497 | Lana 498 | Lara 499 | Lasarus 500 | Lasarusar 501 | Lasarusi 502 | Laufar 503 | Lár 504 | Lára 505 | Lárs 506 | Leif 507 | Leifi 508 | Leó 509 | Leós 510 | Lill 511 | Lillar 512 | Linnar 513 | Línar 514 | Líneik 515 | Líneikur 516 | Lofn 517 | Lofnar 518 | Lotta 519 | Lottu 520 | Lyngheiðar 521 | Lyngheiði 522 | Lýra 523 | Lýru 524 | Maja 525 | Makan 526 | Makans 527 | Malinu 528 | Manga 529 | Mangi 530 | Mardallar 531 | Mardöll 532 | Marías 533 | Maríuerla 534 | Maríuerlu 535 | Marjas 536 | Marsa 537 | Mánadís 538 | Mánadísar 539 | Mist 540 | Mistar 541 | Mími 542 | Mímir 543 | Mortína 544 | Mortínu 545 | Móna 546 | Mónu 547 | Muggi 548 | Myrra 549 | Myrru 550 | Nanna 551 | Natans 552 | Nikulásar 553 | Nikulási 554 | Njála 555 | Njálu 556 | Njóla 557 | Njólu 558 | Nóa 559 | Nói 560 | Nóna 561 | #Nóni 562 | Nóra 563 | Nóru 564 | Nóu 565 | Núp 566 | Núpan 567 | Núpi 568 | Núps 569 | Núpur 570 | Nökkva 571 | Nökkvi 572 | Nönnu 573 | Oddbjargar 574 | Oddbjörg 575 | Oddhildar 576 | Oddvari 577 | Olla 578 | Orra 579 | Orri 580 | Otra 581 | Otri 582 | Otur 583 | Oturs 584 | Óðinn 585 | Óðrík 586 | Óðríki 587 | Óðríks 588 | Óðríkur 589 | Órækja 590 | Órækju 591 | Ósvífri 592 | Ósvífur 593 | Pál 594 | Pála 595 | Páli 596 | Páll 597 | Pálmari 598 | Pálmu 599 | Páls 600 | Pers 601 | Pésa 602 | Pési 603 | Pétri 604 | Pétur 605 | Péturs 606 | Regin 607 | Reginbjörg 608 | Reinar 609 | Reynis 610 | Reyrs 611 | Rikka 612 | Ritu 613 | Rín 614 | Rínar 615 | Ríta 616 | Rósalinda 617 | Rósalindar 618 | Rósalindi 619 | Rósalín 620 | Rósanna 621 | Rósfríð 622 | Rósfríðar 623 | Rósfríði 624 | Rósfríður 625 | Rósu 626 | Rúnari 627 | Rúnu 628 | Röðli 629 | Röðul 630 | Röðull 631 | Röðuls 632 | Salvar 633 | Seif 634 | Seifi 635 | Selju 636 | Sigga 637 | Siggi 638 | Siggu 639 | Sigmann 640 | Sigmanni 641 | Sigmanns 642 | Signari 643 | Signu 644 | Sigurrún 645 | Sigurrúnar 646 | Sigurstein 647 | Sigursteina 648 | Sigursteini 649 | Sigursteinn 650 | Sigursteins 651 | Sigurvarðar 652 | Sigurvarði 653 | Silla 654 | Sindra 655 | Sindri 656 | Síta 657 | Sjafnar 658 | Sjóborg 659 | Sjóborgar 660 | #Skafta 661 | Skírnir 662 | Skæring 663 | Smyril 664 | Smyrill 665 | Smyrils 666 | Smyrli 667 | Snærós 668 | Snærósar 669 | Sólbrá 670 | Sólbrár 671 | Sólbrún 672 | Sólbrúnar 673 | Sólbrúnu 674 | Sóldaggar 675 | Sóldís 676 | Sóldísar 677 | Sóldögg 678 | Sólrún 679 | Sólrúnar 680 | Stefnis 681 | Steinborg 682 | Steinborgar 683 | Steinka 684 | Steinku 685 | Stella 686 | Stellu 687 | Styrmi 688 | Styrmir 689 | Sumarliða 690 | Sumarliði 691 | Sumarlín 692 | Sunna 693 | Sunnu 694 | Svarthöfða 695 | Svarthöfði 696 | Sverri 697 | Sverrir 698 | Sverris 699 | Svía 700 | Sæbirni 701 | Sæbjarnar 702 | Sæbjörn 703 | Sæborg 704 | Sæborgar 705 | Sædís 706 | Sædísar 707 | Sæfinn 708 | Sæmu 709 | Sölva 710 | Sölvar 711 | Sölvi 712 | Sörla 713 | Sörli 714 | Tandra 715 | Tandri 716 | Teit 717 | Teits 718 | Teitur 719 | Tila 720 | Tíbrá 721 | Tíbrár 722 | Tína 723 | Tínu 724 | Todda 725 | Tór 726 | Tórs 727 | Tóta 728 | Tóti 729 | Tótu 730 | Trjámann 731 | Trjámanni 732 | Trjámanns 733 | Trúmann 734 | Trúmanni 735 | Trúmanns 736 | Tyrfing 737 | Tý 738 | Týr 739 | Týs 740 | Unnari 741 | Urður 742 | Úlfhéðin 743 | Úlfhéðinn 744 | Úlfhéðins 745 | Úlfhéðni 746 | Úlla 747 | Vagnborg 748 | Vagnborgar 749 | Valbjarkar 750 | Valbjörk 751 | Valborg 752 | Valborgar 753 | Valdís 754 | Valdísar 755 | Valur 756 | Vatnar 757 | Veigalín 758 | Veigalínar 759 | Veigs 760 | Verónika 761 | Veróniku 762 | Veturliða 763 | Veturliði 764 | Viðar 765 | Vigni 766 | Vignir 767 | Vignis 768 | Virgil 769 | Virgill 770 | Virgils 771 | Virgli 772 | Vífil 773 | Vífill 774 | Vífils 775 | Vífli 776 | Vordís 777 | Vordísar 778 | Vögnu 779 | Völund 780 | Völundar 781 | Völundi 782 | Völundur 783 | Yngva 784 | Yngvar 785 | Yngvi 786 | Yrja 787 | Yrju 788 | Yrsa 789 | Yrsu 790 | Ými 791 | Ýmir 792 | Ýr 793 | Ýrar 794 | Ýri 795 | Þallar 796 | Þengil 797 | Þengill 798 | Þengils 799 | Þengli 800 | Þiðrandi 801 | Þjóðvarðar 802 | Þormar 803 | Þrastar 804 | Þráinn 805 | Þránd 806 | Þrándar 807 | Þrándi 808 | Þrándur 809 | Þresti 810 | Þrym 811 | Þrymi 812 | Þryms 813 | Þrymur 814 | Þura 815 | Þuru 816 | Þyri 817 | Þöll 818 | Æsu 819 | Ævar 820 | Ögðu 821 | Öggu 822 | Ölbu 823 | Ölni 824 | Ölnir 825 | Ölrún 826 | Ölrúnar 827 | Ölveig 828 | Ölveigar 829 | Ölvi 830 | Ölvir 831 | Össu 832 | Össur 833 | -------------------------------------------------------------------------------- /src/reynir/config/NounPredicates.conf: -------------------------------------------------------------------------------- 1 | # Greynir: Natural language processing for Icelandic 2 | 3 | # Copyright © 2023 Miðeind ehf 4 | 5 | # Work in progress; handling of this data has not 6 | # been implemented as of yet. 7 | 8 | afborgun /af þgf $error(PP, á) 9 | afborgun /á þgf 10 | affall /af þgf 11 | affall /að þgf $error(AÐAF, af) 12 | afskriftir /af þgf 13 | afskriftir /að þgf $error(AÐAF, af) 14 | eftirsjá /af þgf $error(AÐAF, að) 15 | eftirsjá /að þgf 16 | frásögn /af þgf 17 | frásögn /um þf $error(PP-ALL, /af þgf) 18 | fyrirmynd /af þgf $error(AÐAF, að) 19 | fyrirmynd /að þgf 20 | færi /á þgf 21 | hlutdeild /að þgf $error(PP, í) 22 | hlutdeild /í þgf 23 | hætta /á þgf 24 | höfundur /af þgf $error(AÐAF, að) 25 | höfundur /að þgf 26 | hús /við þf 27 | húsnæði /við þf 28 | innsýn /inn_í þf $error(PP, í) 29 | innsýn /í þf 30 | karl /í þf # karl í krapinu, karlinn í tunglinu 31 | kaupandi /af þgf $error(AÐAF, að) 32 | kaupandi /að þgf 33 | lykill /af þgf $error(AÐAF, að) 34 | lykill /að þgf 35 | meðferð /gegn þgf $error(PP, við) 36 | meðferð /við þgf 37 | ofnæmi /fyrir þgf 38 | ofnæmi /gegn þgf $error(PP, fyrir) 39 | sveifla /á þgf $error(PP, á) 40 | sveifla /í þgf 41 | teikning /af þgf $error(AÐAF, að) 42 | teikning /að þgf 43 | tækifæri /til ef 44 | tækifæri /á þgf $error(PP-ALL, /til ef) 45 | uppdráttur /af þgf $error(AÐAF, að) 46 | uppdráttur /að þgf 47 | uppskrift /af þgf $error(AÐAF, að) 48 | uppskrift /að þgf 49 | uppástunga /að þgf $error(PP-ALL, /um þf) 50 | uppástunga /um þf 51 | virðing /fyrir þgf 52 | virðing /við þf $error(PP-ALL, /fyrir þgf) 53 | vitni /af þgf $error(AÐAF, að) 54 | vitni /að þgf 55 | vörn /gegn þgf 56 | vörn /við þgf $error(PP, gegn) 57 | áfangi /af þgf $error(AÐAF, að) 58 | áfangi /að þgf 59 | áhyggjur /af þgf 60 | áhyggjur /að þgf $error(AÐAF, af) 61 | áhætta /á þgf $error(ALL, hætta, /á þf) 62 | áskrifandi /af þgf $error(AÐAF, að) 63 | áskrifandi /að þgf 64 | ávöxtun /á þf $error(PP-TO-OBJ, ef) 65 | ávöxtun /á þgf $error(PP-TO-OBJ, ef) 66 | útgáfa /af þgf 67 | útgáfa /að þgf $error(AÐAF, af) 68 | -------------------------------------------------------------------------------- /src/reynir/config/Prepositions.conf: -------------------------------------------------------------------------------- 1 | 2 | # Greynir: Natural language processing for Icelandic 3 | 4 | # Copyright © 2023 Miðeind ehf. 5 | 6 | # Prepositions.conf 7 | 8 | # Forsetningar 9 | 10 | # Forsetningar merktar með 'nh' geta staðið á 11 | # undan sagnlið í nafnhætti: 12 | # 'Beiðnin um að handtaka manninn var send lögreglunni' 13 | 14 | # Stjörnumerktar forsetningar geta komið fyrir í 15 | # [ambiguous_phrases] kaflanum í Phrases.conf, þannig að 16 | # þær hafi áhrif til niðurskurðar mögulegra merkinga. 17 | # Þær verða að vera merktar sem forsetningar ('fs') í BÍN. 18 | 19 | [prepositions] 20 | 21 | að* þgf 22 | af* þgf nh 23 | allfjarri þgf 24 | andspænis þgf 25 | andstætt þgf 26 | auk ef 27 | austan ef 28 | austur þf 29 | á* þf nh 30 | á* þgf 31 | án ef 32 | árla ef 33 | ásamt þgf 34 | bak þgf 35 | eftir* þf nh 36 | eftir* þgf 37 | fjarri þgf 38 | fjær þgf 39 | fram þf 40 | frá þgf 41 | fyrir* þf nh 42 | fyrir* þgf 43 | gagnstætt þgf 44 | gagnvart þgf 45 | gegn þgf 46 | gegnt þgf 47 | gegnum þf nh 48 | handa þgf 49 | handan ef 50 | hjá þgf 51 | inn þf nh 52 | innan ef 53 | í* þf nh 54 | í* þgf 55 | jafnframt þgf 56 | jafnhliða þgf 57 | kring þgf 58 | kringum þf nh 59 | með* þf nh 60 | með* þgf 61 | meðal ef 62 | meðfram þgf 63 | meður þgf 64 | milli ef 65 | millum ef 66 | mót þgf 67 | móti þgf 68 | nálægt þgf 69 | neðan ef 70 | niður þf 71 | norðan ef 72 | norður þf 73 | nær þgf 74 | nærri þgf 75 | næst þgf 76 | #næstum nf # Frekar eo! 77 | #of 78 | ofan ef 79 | ofar þgf # 'ofar hverri kröfu' 80 | óháð þgf # 'hefðu alltaf greitt óháð nauðasamningi' 81 | ólíkt þgf # 'þeir fá enga styrki ólíkt frambjóðendum til þings' 82 | órafjarri þgf 83 | sakir ef 84 | samanber þf nh # 'samanber yfirlýsingu ríkisstjórnarinnar frá 3. júní' 85 | samfara þgf 86 | samhliða þgf 87 | samkvæmt þgf 88 | sammála þgf 89 | samsíðis þgf 90 | samskipa þgf 91 | samstíga þgf 92 | samtímis þgf 93 | #sem nf # 'í krafti stöðu minnar sem leikhússtjóri' 94 | #sem þf # 'margir hafa hvatt mig til að bjóða mig fram sem forseta Íslands' 95 | #síðan 96 | síðla ef 97 | snemma ef 98 | suður þf 99 | sunnan ef 100 | sökum ef 101 | til* ef nh # 'tilraunir til að skilgreina vandann' 102 | um* þf nh 103 | umfram þf nh 104 | umhverfis þf 105 | undan þgf # !!! á undan 106 | undir þf 107 | undir þgf 108 | upp þf # !!! upp á 109 | # !!! Note: In Verbs.conf, there are several instances of '/upp þgf', 110 | # !!! but 'upp' is not defined here as a preposition with a dative (þgf) argument. 111 | utan ef 112 | úr þgf # !!! upp úr 113 | út þf 114 | varðandi þf 115 | vegna ef 116 | vestan ef 117 | vestur þf 118 | víðsfjarri þgf 119 | við* þf nh 120 | við* þgf # Hard-coded in reducer.py to have less priority than við + þf 121 | yfir* þf nh 122 | yfir* þgf # yfir honum var helgisvipur 123 | 124 | # Multiword prepositions 125 | # These phrases should also be included in Phrases.conf, 126 | # in most cases as 'ao frasi' 127 | # Note that these prepositions can be associated with verbs 128 | # in Verbs.conf using underscores, for example 129 | # 'keppa /fyrir_hönd ef' 130 | 131 | fyrir aftan þf 132 | fyrir austan þf 133 | fyrir framan þf 134 | fyrir handan þf 135 | fyrir innan þf 136 | fyrir neðan þf 137 | fyrir norðan þf 138 | fyrir ofan þf 139 | fyrir sunnan þf 140 | fyrir utan þf 141 | fyrir vestan þf 142 | fyrir hönd ef 143 | #á móti þgf 144 | #á eftir þgf 145 | #á undan þgf 146 | #á meðal ef 147 | #á milli ef 148 | #á hendur þgf 149 | #á fætur þgf 150 | í kringum þf 151 | í gegnum þf 152 | fyrir sakir þf 153 | á móts við þf 154 | innan við þf 155 | samanborið við þf 156 | #miðað við þf 157 | með tilliti til ef 158 | þrátt fyrir þf 159 | það sem af er þgf 160 | það sem eftir er ef 161 | til og frá þgf 162 | upp úr þgf 163 | þvert á þf 164 | austur fyrir þf 165 | vestur fyrir þf 166 | norður fyrir þf 167 | suður fyrir þf 168 | skömmu fyrir þf 169 | skömmu eftir þf 170 | örskömmu fyrir þf 171 | örskömmu eftir þf 172 | 173 | # Other multiword prepositional phrases that 174 | # were written in one word but have been split up. 175 | # This information is still needed to know which 176 | # case the composite preposition governs. 177 | 178 | austan undir þf 179 | fram undir þf # 'fram undir kvöld' 180 | innan undir þf nh 181 | út undan þgf 182 | út yfir þf 183 | 184 | # Ambiguous erroneous multiword prepositions 185 | # Should be disambiguated into different things 186 | # based on what case they govern. 187 | 188 | # 'fram á eyrina' 189 | frammá þf nh $error(FORM-fram_á) 190 | # 'frammi á gangi' 191 | frammá þgf $error(FORM-frammi_á) 192 | # 'fram í hellinn' 193 | frammí þf $error(FORM-fram_í) 194 | # 'frammi í bílnum' 195 | frammí þgf $error(FORM-frammi_í) 196 | # 'inn á völlinn' 197 | inná þf nh $error(FORM-inn_á) 198 | # 'inni á vellinum' 199 | inná þgf $error(FORM-inni_á) 200 | # 'inn í hellinn' 201 | inní þf nh $error(FORM-inn_í) 202 | # 'inni í hellinum' 203 | inní þgf $error(FORM-inni_í) 204 | # 'niður á lækjarbakkann' 205 | niðrá þf nh $error(FORM-niður_á) 206 | # 'niðri á gólfinu' 207 | niðrá þgf $error(FORM-niðri_á) 208 | # 'niður í myrkrið' 209 | niðrí þf nh $error(FORM-niður_í) 210 | # 'niðri í myrkrinu' 211 | niðrí þgf $error(FORM-niðri_í) 212 | # 'upp á hestinn' 213 | uppá þf $error(FORM-upp_á) 214 | # 'uppi á borðinu' 215 | uppá þgf $error(FORM-uppi_á) 216 | # 'upp í bílinn' 217 | uppí þf $error(FORM-upp_í) 218 | # 'uppi í kastalanum' 219 | uppí þgf $error(FORM-uppi_í) 220 | # 'út á ystu nöf' 221 | útá þf nh $error(FORM-út_á) 222 | # 'úti á túninu' 223 | útá þgf $error(FORM-úti_á) 224 | # 'út í laugina' 225 | útí þf nh $error(FORM-út_í) 226 | # 'úti í náttúrunni' 227 | útí þgf $error(FORM-úti_í) 228 | 229 | # Compound prepositions that should be split into two words 230 | 231 | alltað þgf $error(FORM-allt_að) 232 | austanundir þf $error(FORM-austan_undir) 233 | framhjá þgf $error(FORM-fram_hjá) 234 | framundir þf $error(FORM-fram_undir) # 'framundir kvöld munu björgunarsveitir aðstoða fólk' 235 | innanum þf nh $error(FORM-innan_um) 236 | innanundir þf nh $error(FORM-innan_undir) 237 | innum þf nh $error(FORM-inn_um) 238 | útaf þgf $error(FORM-út_af) 239 | útundan þgf $error(FORM-út_undan) 240 | útúr þgf $error(FORM-út_úr) 241 | útyfir þf $error(FORM-út_yfir) 242 | 243 | -------------------------------------------------------------------------------- /src/reynir/eparser.h: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | C++ Earley parser module 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | 9 | This software is licensed under the MIT License: 10 | 11 | Permission is hereby granted, free of charge, to any person 12 | obtaining a copy of this software and associated documentation 13 | files (the "Software"), to deal in the Software without restriction, 14 | including without limitation the rights to use, copy, modify, merge, 15 | publish, distribute, sublicense, and/or sell copies of the Software, 16 | and to permit persons to whom the Software is furnished to do so, 17 | subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be 20 | included in all copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 23 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 24 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 25 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 26 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 27 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 28 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 29 | 30 | This module implements an optimized Earley parser in C++. 31 | It is designed to be called from Python code with 32 | already parsed and packed grammar structures. 33 | 34 | The Earley parser used here is the improved version described by Scott & Johnstone, 35 | referencing Tomita. This allows worst-case cubic (O(n^3)) order, where n is the 36 | length of the input sentence, while still returning all possible parse trees 37 | for an ambiguous grammar. 38 | 39 | See Elizabeth Scott, Adrian Johnstone: 40 | "Recognition is not parsing — SPPF-style parsing from cubic recognisers" 41 | Science of Computer Programming, Volume 75, Issues 1–2, 1 January 2010, Pages 55–70 42 | 43 | */ 44 | 45 | #include 46 | #include 47 | #include 48 | 49 | 50 | // Assert macro 51 | #ifdef DEBUG 52 | #define ASSERT(x) assert(x) 53 | #else 54 | #define ASSERT(x) 55 | #endif 56 | 57 | 58 | typedef unsigned int UINT; 59 | typedef int INT; 60 | typedef wchar_t WCHAR; 61 | typedef char CHAR; 62 | typedef unsigned char BYTE; 63 | typedef bool BOOL; 64 | 65 | 66 | class Production; 67 | class Parser; 68 | class State; 69 | class Column; 70 | class NodeDict; 71 | class Label; 72 | struct StateChunk; 73 | 74 | 75 | class AllocCounter { 76 | 77 | // A utility class to count allocated instances 78 | // of an instrumented class. Add this as a static 79 | // member (named e.g. 'ac') of the class to be watched 80 | // and call ac++ and ac-- in the constructor and destructor, 81 | // respectively. 82 | 83 | private: 84 | 85 | UINT m_nAllocs; 86 | UINT m_nFrees; 87 | 88 | public: 89 | 90 | AllocCounter(void) 91 | : m_nAllocs(0), m_nFrees(0) 92 | { } 93 | ~AllocCounter(void) 94 | { } 95 | 96 | void operator++(int) 97 | { this->m_nAllocs++; } 98 | void operator--(int) 99 | { 100 | ASSERT(this->m_nAllocs > this->m_nFrees); 101 | this->m_nFrees++; 102 | } 103 | UINT numAllocs(void) const 104 | { return this->m_nAllocs; } 105 | UINT numFrees(void) const 106 | { return this->m_nFrees; } 107 | INT getBalance(void) const 108 | { return (INT)(this->m_nAllocs - this->m_nFrees); } 109 | 110 | }; 111 | 112 | 113 | class Nonterminal { 114 | 115 | // A Nonterminal has an associated list of owned Productions 116 | 117 | friend class AllocReporter; 118 | 119 | private: 120 | 121 | WCHAR* m_pwzName; 122 | Production* m_pProd; 123 | 124 | static AllocCounter ac; 125 | 126 | protected: 127 | 128 | public: 129 | 130 | Nonterminal(const WCHAR* pwzName); 131 | 132 | ~Nonterminal(void); 133 | 134 | void addProduction(Production* p); 135 | 136 | // Get the first right-hand-side production of this nonterminal 137 | Production* getHead(void) const 138 | { return this->m_pProd; } 139 | 140 | WCHAR* getName(void) const 141 | { return this->m_pwzName; } 142 | 143 | }; 144 | 145 | 146 | class Production { 147 | 148 | // A Production owns a local copy of an array of items, 149 | // where each item is a negative nonterminal index, or 150 | // positive terminal index. Attempts to index past the 151 | // end of the production yield a 0 item. 152 | 153 | friend class AllocReporter; 154 | 155 | private: 156 | 157 | UINT m_nId; // Unique integer id (0-based) of this production 158 | UINT m_nPriority; // Relative priority of this production 159 | UINT m_n; // Number of items in production 160 | INT* m_pList; // List of items in production 161 | Production* m_pNext; // Next production of same nonterminal 162 | 163 | static AllocCounter ac; 164 | 165 | protected: 166 | 167 | public: 168 | 169 | Production(UINT nId, UINT nPriority, UINT n, const INT* pList); 170 | 171 | ~Production(void); 172 | 173 | void setNext(Production* p); 174 | Production* getNext(void) const 175 | { return this->m_pNext; } 176 | 177 | UINT getId(void) const 178 | { return this->m_nId; } 179 | UINT getLength(void) const 180 | { return this->m_n; } 181 | BOOL isEpsilon(void) const 182 | { return this->m_n == 0; } 183 | UINT getPriority(void) const 184 | { return this->m_nPriority; } 185 | 186 | // Get the item at the dot position within the production 187 | INT operator[] (UINT nDot) const; 188 | 189 | }; 190 | 191 | 192 | class Grammar { 193 | 194 | // A Grammar is a collection of Nonterminals 195 | // with their Productions. 196 | 197 | friend class AllocReporter; 198 | 199 | private: 200 | 201 | UINT m_nNonterminals; // Number of nonterminals 202 | UINT m_nTerminals; // Number of terminals (indexed from 1) 203 | INT m_iRoot; // Index of root nonterminal (negative) 204 | Nonterminal** m_nts; // Array of Nonterminal pointers, owned by the Grammar class 205 | 206 | static AllocCounter ac; 207 | 208 | protected: 209 | 210 | public: 211 | 212 | Grammar(UINT nNonterminals, UINT nTerminals, INT iRoot = -1); 213 | Grammar(void); 214 | ~Grammar(void); 215 | 216 | void reset(void); 217 | 218 | BOOL readBinary(const CHAR* pszFilename); 219 | 220 | UINT getNumNonterminals(void) const 221 | { return this->m_nNonterminals; } 222 | UINT getNumTerminals(void) const 223 | { return this->m_nTerminals; } 224 | INT getRoot(void) const 225 | { return this->m_iRoot; } 226 | 227 | void setNonterminal(INT iIndex, Nonterminal*); 228 | 229 | Nonterminal* operator[] (INT iIndex) const; 230 | 231 | const WCHAR* nameOfNt(INT iNt) const; 232 | 233 | }; 234 | 235 | 236 | class Label { 237 | 238 | // A Label is associated with a Node. 239 | 240 | friend class Node; 241 | 242 | private: 243 | 244 | INT m_iNt; 245 | UINT m_nDot; 246 | Production* m_pProd; 247 | UINT m_nI; 248 | UINT m_nJ; 249 | 250 | public: 251 | 252 | Label(INT iNt, UINT nDot, Production* pProd, UINT nI, UINT nJ) 253 | : m_iNt(iNt), m_nDot(nDot), m_pProd(pProd), m_nI(nI), m_nJ(nJ) 254 | { } 255 | 256 | BOOL operator==(const Label& other) const 257 | { return ::memcmp((void*)this, (void*)&other, sizeof(Label)) == 0; } 258 | 259 | }; 260 | 261 | 262 | class Node { 263 | 264 | friend class AllocReporter; 265 | 266 | private: 267 | 268 | struct FamilyEntry { 269 | Production* pProd; 270 | Node* p1; 271 | Node* p2; 272 | FamilyEntry* pNext; 273 | }; 274 | 275 | Label m_label; 276 | FamilyEntry* m_pHead; 277 | UINT m_nRefCount; 278 | 279 | static AllocCounter ac; 280 | 281 | void _dump(Grammar*, UINT nIndent); 282 | 283 | protected: 284 | 285 | public: 286 | 287 | Node(const Label&); 288 | ~Node(void); 289 | 290 | void addRef(void) 291 | { this->m_nRefCount++; } 292 | void delRef(void); 293 | 294 | void addFamily(Production*, Node* pW, Node* pV); 295 | 296 | BOOL hasLabel(const Label& label) const 297 | { return this->m_label == label; } 298 | 299 | void dump(Grammar*); 300 | 301 | static UINT numCombinations(Node*); 302 | 303 | }; 304 | 305 | 306 | // Token-terminal matching function 307 | typedef BOOL (*MatchingFunc)(UINT nHandle, UINT nToken, UINT nTerminal); 308 | 309 | // Allocator for token/terminal matching cache 310 | typedef BYTE* (*AllocFunc)(UINT nHandle, UINT nToken, UINT nTerminals); 311 | 312 | // Default matching function that simply 313 | // compares the token value with the terminal number 314 | BOOL defaultMatcher(UINT nHandle, UINT nToken, UINT nTerminal); 315 | 316 | 317 | class Parser { 318 | 319 | // Earley-Scott parser for a given Grammar 320 | 321 | friend class AllocReporter; 322 | friend class Column; 323 | 324 | private: 325 | 326 | // Grammar pointer, not owned by the Parser 327 | Grammar* m_pGrammar; 328 | MatchingFunc m_pMatchingFunc; 329 | AllocFunc m_pAllocFunc; 330 | 331 | void push(UINT nHandle, State*, Column*, State*&, StateChunk*); 332 | 333 | Node* makeNode(State* pState, UINT nEnd, Node* pV, NodeDict& ndV); 334 | 335 | // Internal token/terminal matching cache management 336 | BYTE* allocCache(UINT nHandle, UINT nToken, BOOL* pbNeedsRelease); 337 | void releaseCache(BYTE* abCache); 338 | 339 | protected: 340 | 341 | public: 342 | 343 | Parser(Grammar*, MatchingFunc = defaultMatcher, AllocFunc = NULL); 344 | ~Parser(void); 345 | 346 | UINT getNumTerminals(void) const 347 | { return this->m_pGrammar->getNumTerminals(); } 348 | UINT getNumNonterminals(void) const 349 | { return this->m_pGrammar->getNumNonterminals(); } 350 | MatchingFunc getMatchingFunc(void) const 351 | { return this->m_pMatchingFunc; } 352 | Grammar* getGrammar(void) const 353 | { return this->m_pGrammar; } 354 | 355 | // If pnToklist is NULL, a sequence of integers 0..nTokens-1 will be used 356 | Node* parse(UINT nHandle, INT iStartNt, UINT* pnErrorToken, 357 | UINT nTokens, const UINT pnToklist[] = NULL); 358 | 359 | }; 360 | 361 | // Print a report on memory allocation 362 | extern "C" void printAllocationReport(void); 363 | 364 | // Parse a token stream 365 | extern "C" Node* earleyParse(Parser*, UINT nTokens, INT iRoot, UINT nHandle, UINT* pnErrorToken); 366 | 367 | extern "C" Grammar* newGrammar(const CHAR* pszGrammarFile); 368 | 369 | extern "C" void deleteGrammar(Grammar*); 370 | 371 | extern "C" Parser* newParser(Grammar*, MatchingFunc fpMatcher = defaultMatcher, AllocFunc fpAlloc = NULL); 372 | 373 | extern "C" void deleteParser(Parser*); 374 | 375 | extern "C" void deleteForest(Node*); 376 | 377 | extern "C" void dumpForest(Node*, Grammar*); 378 | 379 | extern "C" UINT numCombinations(Node*); 380 | 381 | -------------------------------------------------------------------------------- /src/reynir/eparser_build.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | CFFI builder for _eparser module 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | Author: Vilhjálmur Þorsteinsson 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | This module only runs at setup/installation time. It is invoked 32 | from setup.py as requested by the cffi_modules=[] parameter of the 33 | setup() function. It causes the _eparser.*.so CFFI wrapper library 34 | to be built from its source in eparser.cpp. 35 | 36 | """ 37 | 38 | import os 39 | import platform 40 | import cffi 41 | 42 | # Don't change the name of this variable unless you 43 | # change it in setup.py as well 44 | ffibuilder = cffi.FFI() 45 | 46 | WINDOWS = platform.system() == "Windows" 47 | MACOS = platform.system() == "Darwin" 48 | IMPLEMENTATION = platform.python_implementation() 49 | 50 | # What follows is the actual Python-wrapped C interface to eparser.*.so 51 | 52 | declarations = """ 53 | 54 | typedef unsigned int UINT; 55 | typedef int INT; 56 | typedef int BOOL; // Different from C++ 57 | typedef char CHAR; 58 | typedef unsigned char BYTE; 59 | 60 | struct Grammar { 61 | UINT nNonterminals; // Number of nonterminals 62 | UINT nTerminals; // Number of terminals (indexed from 1) 63 | INT iRoot; // Index of root nonterminal (negative) 64 | }; 65 | 66 | struct Parser { 67 | struct Grammar* pGrammar; 68 | }; 69 | 70 | struct Production { 71 | UINT nId; 72 | UINT nPriority; 73 | UINT n; 74 | INT* pList; 75 | }; 76 | 77 | struct Label { 78 | INT iNt; 79 | UINT nDot; 80 | struct Production* pProd; 81 | UINT nI; 82 | UINT nJ; 83 | }; 84 | 85 | struct FamilyEntry { 86 | struct Production* pProd; 87 | struct Node* p1; 88 | struct Node* p2; 89 | struct FamilyEntry* pNext; 90 | }; 91 | 92 | struct Node { 93 | struct Label label; 94 | struct FamilyEntry* pHead; 95 | UINT nRefCount; 96 | }; 97 | 98 | typedef BOOL (*MatchingFunc)(UINT nHandle, UINT nToken, UINT nTerminal); 99 | typedef BYTE* (*AllocFunc)(UINT nHandle, UINT nToken, UINT nSize); 100 | 101 | struct Node* earleyParse(struct Parser*, UINT nTokens, INT iRoot, UINT nHandle, UINT* pnErrorToken); 102 | struct Grammar* newGrammar(const CHAR* pszGrammarFile); 103 | void deleteGrammar(struct Grammar*); 104 | struct Parser* newParser(struct Grammar*, MatchingFunc fpMatcher, AllocFunc fpAlloc); 105 | void deleteParser(struct Parser*); 106 | void deleteForest(struct Node*); 107 | void dumpForest(struct Node*, struct Grammar*); 108 | UINT numCombinations(struct Node*); 109 | 110 | void printAllocationReport(void); 111 | 112 | """ 113 | 114 | # Declare the Python callbacks from fastparser.py that will be called by the C code 115 | # See: https://cffi.readthedocs.io/en/latest/using.html#extern-python-new-style-callbacks 116 | 117 | callbacks = """ 118 | 119 | extern "Python" BOOL matching_func(UINT, UINT, UINT); 120 | extern "Python" BYTE* alloc_func(UINT, UINT, UINT); 121 | 122 | """ 123 | 124 | # Do the magic CFFI incantations necessary to get CFFI and setuptools 125 | # to compile eparser.cpp at setup time, generate a .so library and 126 | # wrap it so that it is callable from Python and PyPy as _eparser 127 | 128 | if WINDOWS: 129 | extra_compile_args = ["/Zc:offsetof-"] 130 | elif MACOS: 131 | os.environ["CFLAGS"] = "-stdlib=libc++" # Fixes PyPy build on macOS 10.15.6+ 132 | os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" 133 | extra_compile_args = ["-mmacosx-version-min=10.9", "-stdlib=libc++"] 134 | else: 135 | extra_compile_args = ["-std=c++11"] 136 | 137 | # On some systems, the linker needs to be told to use the C++ compiler 138 | # under PyPy due to changes in the default behaviour of distutils. 139 | if IMPLEMENTATION == "PyPy": 140 | os.environ["LDCXXSHARED"] = "c++ -shared" 141 | 142 | ffibuilder.cdef(declarations + callbacks) 143 | 144 | ffibuilder.set_source( 145 | "reynir._eparser", 146 | # eparser.cpp is written in C++ but must export a pure C interface. 147 | # This is the reason for the "extern 'C' { ... }" wrapper. 148 | 'extern "C" {\n' + declarations + "\n}\n", 149 | source_extension=".cpp", 150 | sources=["src/reynir/eparser.cpp"], 151 | extra_compile_args=extra_compile_args, 152 | ) 153 | 154 | if __name__ == "__main__": 155 | ffibuilder.compile(verbose=True) 156 | -------------------------------------------------------------------------------- /src/reynir/glock.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | GlobalLock utility class 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | Original author: Vilhjálmur Þorsteinsson 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | This module implements the GlobalLock class, providing 32 | interprocess locks within a server. 33 | 34 | A GlobalLock is implemented as a file in the /tmp/ directory, 35 | which is assumed to exist (on the current drive in Windows). 36 | 37 | A quick and easy way to use a blocking GlobalLock is as follows: 38 | 39 | with GlobalLock('somestring'): 40 | code_that_only_one_process_can_run_simultaneously() 41 | 42 | """ 43 | 44 | from typing import Any, IO, Optional 45 | 46 | import os 47 | import stat 48 | import tempfile 49 | 50 | 51 | class LockError(Exception): 52 | """Lock could not be obtained""" 53 | 54 | pass 55 | 56 | 57 | POSIX: bool = False 58 | 59 | try: 60 | # Try Linux/POSIX 61 | import fcntl 62 | except ImportError: 63 | 64 | try: 65 | # Try Windows 66 | import msvcrt 67 | except ImportError: 68 | 69 | # Not Unix, not Windows: bail out 70 | def _lock_file(file: IO[str], block: bool) -> None: 71 | raise TypeError("File locking not supported on this platform") 72 | 73 | def _unlock_file(file: IO[str]) -> None: 74 | raise TypeError("File locking not supported on this platform") 75 | 76 | else: 77 | 78 | # Windows 79 | 80 | def _lock_file(file: IO[str], block: bool) -> None: 81 | # Lock just the first byte of the file 82 | retry = True 83 | while retry: 84 | retry = False 85 | try: 86 | msvcrt.locking( # type: ignore 87 | file.fileno(), 88 | msvcrt.LK_LOCK if block else msvcrt.LK_NBLCK, # type: ignore 89 | 1, 90 | ) 91 | except OSError as e: 92 | if block and e.errno == 36: 93 | # Windows says 'resource deadlock avoided', but we truly want 94 | # a longer blocking wait: try again 95 | retry = True 96 | else: 97 | raise LockError( 98 | "Couldn't lock {0}, errno is {1}".format(file.name, e.errno) 99 | ) 100 | 101 | def _unlock_file(file: IO[str]) -> None: 102 | try: 103 | file.seek(0) 104 | msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, 1) # type: ignore 105 | except OSError as e: 106 | raise LockError( 107 | "Couldn't unlock {0}, errno is {1}".format(file.name, e.errno) 108 | ) 109 | 110 | else: 111 | 112 | # Linux/POSIX 113 | 114 | POSIX = True # type: ignore 115 | 116 | def _lock_file(file: IO[str], block: bool) -> None: 117 | try: 118 | fcntl.flock(file.fileno(), fcntl.LOCK_EX | (0 if block else fcntl.LOCK_NB)) 119 | except IOError: 120 | raise LockError("Couldn't lock {0}".format(file.name)) 121 | 122 | def _unlock_file(file: IO[str]) -> None: 123 | # File is automatically unlocked on close 124 | pass 125 | 126 | 127 | class GlobalLock: 128 | 129 | _TMP_DIR = tempfile.gettempdir() 130 | 131 | def __init__(self, lockname: str) -> None: 132 | """Initialize a global lock with the given name""" 133 | assert lockname and isinstance(lockname, str) 134 | # Locate global locks in the system temporary directory 135 | # (should work on both Windows and Unix/POSIX) 136 | self._path = os.path.join(self._TMP_DIR, "greynir-" + lockname) 137 | self._fp: Optional[IO[str]] = None 138 | 139 | def acquire(self, block: bool = True) -> None: 140 | """Acquire a global lock, blocking if block = True""" 141 | 142 | if self._fp is not None: 143 | # Already hold the lock 144 | return 145 | 146 | path = self._path 147 | fp = None 148 | try: 149 | # Try to open for writing without truncation: 150 | fp = open(path, "r+") 151 | except IOError: 152 | # If the file doesn't exist, we'll get an IO error, try a+ 153 | # Note that there may be a race here. Multiple processes 154 | # could fail on the r+ open and open the file a+, but only 155 | # one will get the the lock and write a pid. 156 | try: 157 | fp = open(path, "a+") 158 | # Make sure that the file is readable and writable by others 159 | if POSIX: 160 | os.fchmod( 161 | fp.fileno(), 162 | stat.S_IRUSR 163 | | stat.S_IWUSR 164 | | stat.S_IRGRP 165 | | stat.S_IWGRP 166 | | stat.S_IROTH 167 | | stat.S_IWOTH, 168 | ) 169 | except IOError: 170 | raise LockError("Couldn't open or create lock file {0}".format(path)) 171 | 172 | self._fp = fp 173 | 174 | try: 175 | _lock_file(fp, block) 176 | except: 177 | fp.seek(1) 178 | fp.close() 179 | raise 180 | 181 | # Once acquired, write the process id to the file 182 | fp.write(" %s\n" % os.getpid()) 183 | fp.truncate() 184 | fp.flush() 185 | 186 | def release(self) -> None: 187 | """Release the lock""" 188 | if self._fp is not None: 189 | _unlock_file(self._fp) 190 | self._fp.close() 191 | self._fp = None 192 | 193 | def __enter__(self): 194 | """Python context manager protocol""" 195 | self.acquire(block=True) 196 | return self 197 | 198 | def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any): 199 | """Python context manager protocol""" 200 | self.release() 201 | return False 202 | -------------------------------------------------------------------------------- /src/reynir/incparser.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | Utility class for incremental parsing of token streams 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | Original author: Vilhjálmur Þorsteinsson 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | This module implements a simple utility class for parsing token 32 | streams into paragraphs and sentences. The parse is incremental so 33 | that the client can take action on each paragraph and sentence as 34 | it is processed. Also, time.sleep(0) is called between sentences 35 | to make multi-threaded parses proceed more smoothly and evenly. 36 | 37 | """ 38 | 39 | import time 40 | from typing import Iterable, Iterator, List, Optional, Tuple 41 | 42 | from tokenizer import paragraphs, Tok 43 | 44 | from .bintokenizer import tokens_are_foreign 45 | from .fastparser import Fast_Parser, Node, ParseError 46 | from .reducer import Reducer 47 | from .settings import Settings 48 | 49 | 50 | # Number of tree combinations that must be exceeded for a verbose 51 | # parse dump to include the sentence text (as opposed to just basic stats) 52 | _VERBOSE_AMBIGUITY_THRESHOLD = 1000 53 | 54 | # The ratio of words in a sentence that must be found in BÍN 55 | # for it to be analyzed as an Icelandic sentence 56 | ICELANDIC_RATIO = 0.5 57 | 58 | 59 | # The same type is defined in the Tokenizer module 60 | SentenceTuple = Tuple[int, List[Tok]] 61 | 62 | 63 | class IncrementalParser: 64 | 65 | """Utility class to parse a token list as a sequence of paragraphs 66 | containing sentences. Typical usage: 67 | 68 | toklist = tokenize(text) 69 | fp = Fast_Parser() 70 | ip = IncrementalParser(fp, toklist) 71 | for p in ip.paragraphs(): 72 | for sent in p.sentences(): 73 | if sent.parse(): 74 | # sentence parsed successfully 75 | # do something with sent.tree 76 | else: 77 | # an error occurred in the parse 78 | # the error token index is at sent.err_index 79 | num_sentences = ip.num_sentences 80 | num_parsed = ip.num_parsed 81 | ambiguity = ip.ambiguity 82 | parse_time = ip.parse_time 83 | 84 | """ 85 | 86 | class _IncrementalSentence: 87 | 88 | """An internal sentence representation class""" 89 | 90 | def __init__(self, ip: "IncrementalParser", s: List[Tok]) -> None: 91 | self._ip = ip 92 | self._s = s 93 | self._len = len(s) 94 | assert self._len > 0 # Input should be already sanitized 95 | self._err_index: Optional[int] = None 96 | self._tree: Optional[Node] = None 97 | self._score = 0 98 | self._error: Optional[ParseError] = None 99 | 100 | def __len__(self): 101 | return self._len 102 | 103 | def parse(self) -> bool: 104 | """Parse the sentence""" 105 | num = 0 106 | score = 0 107 | forest: Optional[Node] = None 108 | try: 109 | if tokens_are_foreign(self._s, min_icelandic_ratio=ICELANDIC_RATIO): 110 | raise ParseError( 111 | "Sentence is probably not in Icelandic", token_index=0 112 | ) 113 | forest = self._ip._parser.go(self._s) 114 | num = Fast_Parser.num_combinations(forest) 115 | if num > 1: 116 | forest, score = self._ip._reducer.go_with_score(forest) 117 | except ParseError as e: 118 | # The ParseError may originate in the reducer.go_with_score() 119 | # function, and in that case, forest is not None; be sure to reset it 120 | forest = None 121 | score = 0 122 | num = 0 123 | self._err_index = e.token_index 124 | self._error = e 125 | self._tree = forest 126 | self._score = score 127 | self._ip._add_sentence(self, num) 128 | return num > 0 129 | 130 | @property 131 | def tokens(self) -> List[Tok]: 132 | return self._s 133 | 134 | @property 135 | def tree(self) -> Optional[Node]: 136 | return self._tree 137 | 138 | @property 139 | def score(self) -> int: 140 | return self._score 141 | 142 | @property 143 | def error(self) -> Optional[ParseError]: 144 | return self._error 145 | 146 | @property 147 | def err_index(self) -> int: 148 | return self._len - 1 if self._err_index is None else self._err_index 149 | 150 | @property 151 | def text(self) -> str: 152 | return " ".join(t.txt for t in self._s if t.txt) 153 | 154 | def __str__(self) -> str: 155 | return self.text 156 | 157 | class _IncrementalParagraph: 158 | 159 | """An internal paragraph representation class""" 160 | 161 | def __init__(self, ip: "IncrementalParser", p: List[SentenceTuple]) -> None: 162 | self._ip = ip 163 | self._p = p 164 | 165 | def sentences(self) -> Iterator["IncrementalParser._IncrementalSentence"]: 166 | """Yield the sentences within the paragraph, nicely wrapped""" 167 | Sent = IncrementalParser._IncrementalSentence 168 | for _, sent in self._p: 169 | # Call time.sleep(0) to yield the current thread, i.e. 170 | # enable the threading subsystem and/or eventlet under Gunicorn 171 | # to switch threads at this point - since the parsing of an 172 | # entire article can take a long time 173 | time.sleep(0) 174 | yield Sent(self._ip, sent) 175 | 176 | def __init__( 177 | self, parser: Fast_Parser, toklist: Iterable[Tok], verbose: bool = False 178 | ) -> None: 179 | self._parser = parser 180 | self._reducer = Reducer(parser.grammar) 181 | self._num_sent = 0 182 | self._num_parsed_sent = 0 183 | self._num_tokens = 0 184 | self._num_combinations = 0 185 | self._total_score = 0 186 | self._total_ambig = 0.0 187 | self._total_tokens = 0 188 | self._start_time = self._last_time = time.time() 189 | self._verbose = verbose 190 | self._toklist = list(toklist) 191 | 192 | def _add_sentence( 193 | self, s: "IncrementalParser._IncrementalSentence", num: int 194 | ) -> None: 195 | """Add a processed sentence to the statistics""" 196 | slen = len(s) 197 | self._num_sent += 1 198 | self._num_tokens += slen 199 | if num > 0: 200 | # The sentence was parsed successfully 201 | self._num_parsed_sent += 1 202 | self._num_combinations += num 203 | ambig_factor = num ** (1 / slen) 204 | self._total_ambig += ambig_factor * slen 205 | self._total_tokens += slen 206 | self._total_score += s.score 207 | # Debugging output, if requested and enabled 208 | if self._verbose and Settings.DEBUG: 209 | current_time = time.time() 210 | print( 211 | "Parsed sentence of length {0} with {1} combinations{3} " 212 | "in {4:.1f} seconds{2}".format( 213 | slen, 214 | num, 215 | ("\n" + s.text) if num >= _VERBOSE_AMBIGUITY_THRESHOLD else "", 216 | " and score " + str(s.score) if num >= 1 else "", 217 | current_time - self._last_time, 218 | ) 219 | ) 220 | self._last_time = current_time 221 | 222 | def paragraphs(self) -> Iterator["IncrementalParser._IncrementalParagraph"]: 223 | """Yield the paragraphs from the token stream""" 224 | Para = IncrementalParser._IncrementalParagraph 225 | for p in paragraphs(self._toklist): 226 | yield Para(self, p) 227 | 228 | @property 229 | def num_tokens(self) -> int: 230 | return self._num_tokens 231 | 232 | @property 233 | def num_sentences(self) -> int: 234 | return self._num_sent 235 | 236 | @property 237 | def num_parsed(self) -> int: 238 | return self._num_parsed_sent 239 | 240 | @property 241 | def num_combinations(self) -> int: 242 | return self._num_combinations 243 | 244 | @property 245 | def total_score(self) -> int: 246 | return self._total_score 247 | 248 | @property 249 | def ambiguity(self) -> float: 250 | return ( 251 | (self._total_ambig / self._total_tokens) if self._total_tokens > 0 else 1.0 252 | ) 253 | 254 | @property 255 | def parse_time(self) -> float: 256 | return time.time() - self._start_time 257 | -------------------------------------------------------------------------------- /src/reynir/lemmatize.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | Copyright © 2023 Miðeind ehf. 6 | 7 | This software is licensed under the MIT License: 8 | 9 | Permission is hereby granted, free of charge, to any person 10 | obtaining a copy of this software and associated documentation 11 | files (the "Software"), to deal in the Software without restriction, 12 | including without limitation the rights to use, copy, modify, merge, 13 | publish, distribute, sublicense, and/or sell copies of the Software, 14 | and to permit persons to whom the Software is furnished to do so, 15 | subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be 18 | included in all copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 25 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 26 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | This module contains a function to (simplistically) lemmatize text 29 | without parsing it. 30 | 31 | """ 32 | 33 | from typing import Optional, Union, Callable, Tuple, List, Iterator, TypeVar, cast 34 | 35 | from abc import abstractmethod, ABCMeta 36 | 37 | from .bindb import BIN_Tuple 38 | from .bintokenizer import tokenize, TOK 39 | 40 | 41 | # TODO: In Python >= 3.8, the base class could be typing.Protocol 42 | class Comparable(metaclass=ABCMeta): 43 | """Protocol for annotating comparable types""" 44 | 45 | @abstractmethod 46 | def __lt__(self: "CT", other: "CT") -> bool: ... 47 | 48 | 49 | CT = TypeVar("CT", bound=Comparable) 50 | 51 | LemmaTuple = Tuple[str, str] # Lemma, category (ordfl) 52 | 53 | 54 | def simple_lemmatize( 55 | txt: str, 56 | *, 57 | all_lemmas: bool = False, 58 | sortkey: Optional[Callable[[LemmaTuple], Comparable]] = None, 59 | ) -> Union[Iterator[LemmaTuple], Iterator[List[LemmaTuple]]]: 60 | """Simplistically lemmatize a list of tokens, returning a generator of 61 | (lemma, category) tuples. The default behaviour is to return the 62 | first lemma provided by bintokenizer. If all_lemmas are requested, 63 | returns full list of potential lemmas. A sort function can be provided 64 | to determine the ordering of that list.""" 65 | for t in tokenize(txt): 66 | y: Optional[List[LemmaTuple]] = None 67 | if t.kind == TOK.WORD: 68 | if t.val: 69 | # Known word 70 | if "-" in t.txt: 71 | # The original word already contains a hyphen: leave'em in 72 | y = [(v.stofn, v.ordfl) for v in cast(List[BIN_Tuple], t.val)] 73 | else: 74 | # The original word doesn't contain a hyphen: any hyphens 75 | # in the lemmas must come from the compounding algorithm 76 | y = [ 77 | (v.stofn.replace("-", ""), v.ordfl) 78 | for v in cast(List[BIN_Tuple], t.val) 79 | ] 80 | else: 81 | # Unknown word: assume it's an entity 82 | y = [(t.txt, "entity")] 83 | elif t.kind == TOK.PERSON: 84 | assert t.person_names 85 | # Person name w. gender 86 | person_name = t.person_names[0] 87 | y = [(person_name.name, "person_" + (person_name.gender or "hk"))] 88 | elif t.kind == TOK.ENTITY or t.kind == TOK.COMPANY: 89 | # Entity or company name 90 | y = [(t.txt, "entity")] 91 | if y is not None: 92 | # OK, we're returning one or more lemmas for this token 93 | # Remove duplicates while preserving order 94 | y = list(dict.fromkeys(y)) 95 | if sortkey is not None: 96 | y.sort(key=sortkey) 97 | if all_lemmas: 98 | yield y 99 | else: 100 | yield y[0] # Naively return first lemma 101 | -------------------------------------------------------------------------------- /src/reynir/nounphrase.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Greynir: Natural language processing for Icelandic 4 | 5 | NounPhrase class implementation 6 | 7 | Copyright © 2023 Miðeind ehf. 8 | Original author: Vilhjálmur Þorsteinsson 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | This module implements the NounPhrase class, a handy container 32 | for noun phrases (nafnliður) allowing them to be easily inflected 33 | and formatted. 34 | 35 | """ 36 | 37 | from typing import Optional, Mapping, Callable 38 | 39 | import operator 40 | 41 | from .reynir import Greynir, _NounPhrase, SimpleTree 42 | 43 | 44 | # Format specifiers and how they relate to properties 45 | # of the contained NounPhrase object 46 | _FMT: Mapping[str, Callable[["_NounPhrase"], str]] = { 47 | # Icelandic format specifiers 48 | "nf": operator.attrgetter("nominative"), 49 | "þf": operator.attrgetter("accusative"), 50 | "þgf": operator.attrgetter("dative"), 51 | "ef": operator.attrgetter("genitive"), 52 | "ángr": operator.attrgetter("indefinite"), 53 | "stofn": operator.attrgetter("canonical"), 54 | # English/international format specifiers 55 | "nom": operator.attrgetter("nominative"), 56 | "acc": operator.attrgetter("accusative"), 57 | "dat": operator.attrgetter("dative"), 58 | "gen": operator.attrgetter("genitive"), 59 | "ind": operator.attrgetter("indefinite"), 60 | "can": operator.attrgetter("canonical"), 61 | } 62 | 63 | 64 | class NounPhrase: 65 | 66 | """A handy container for a noun phrase (nafnliður), 67 | allowing it to be easily inflected and formatted""" 68 | 69 | # Singleton parser instance 70 | _greynir: Optional[Greynir] = None 71 | 72 | def __init__(self, np_string: str, *, force_number: Optional[str] = None) -> None: 73 | """Initialize a NounPhrase from a text string. 74 | If force_number is set to "et" or "singular", we only 75 | consider singular interpretations of the string. 76 | If force_number is set to "ft" or "plural", we only 77 | consider plural interpretations of the string.""" 78 | self._np_string = np_string or "" 79 | self._number: Optional[str] = None 80 | self._person: Optional[str] = None 81 | self._case: Optional[str] = None 82 | self._gender: Optional[str] = None 83 | self._np: Optional[_NounPhrase] = None 84 | if self._np_string: 85 | if self._greynir is None: 86 | # Initialize our parser singleton 87 | # When parsing noun phrases, we don't assume that they 88 | # start a sentence - so we don't attempt to interpret the 89 | # first word as a lowercase word, as we would otherwise 90 | self.__class__._greynir = Greynir(no_sentence_start=True) 91 | # Parse the noun phrase string into a _NounPhrase object 92 | assert self._greynir is not None 93 | self._np = self._greynir.parse_noun_phrase( 94 | self._np_string, force_number=force_number 95 | ) 96 | if self._np is not None and self._np.deep_tree is not None: 97 | # Access the first child of the root 'Nl' nonterminal 98 | # of the deep parse tree 99 | nt = next(self._np.deep_tree.enum_child_nodes()).nonterminal.name 100 | # Sanity check 101 | assert nt.startswith("Nl_") or nt.startswith("NlEind_") 102 | # Extract the variants of the nonterminal 103 | variants = set(nt.split("_")[1:]) 104 | self._number = (variants & {"et", "ft"}).pop() 105 | self._person = (variants & {"p1", "p2", "p3"}).pop() 106 | self._case = (variants & {"nf", "þf", "þgf", "ef"}).pop() 107 | self._gender = (variants & {"kk", "kvk", "hk"}).pop() 108 | 109 | def __str__(self) -> str: 110 | """Return the contained string as-is""" 111 | return self._np_string 112 | 113 | def __repr__(self) -> str: 114 | return "".format( 115 | self._np_string, "parsed" if self.parsed else "not parsed" 116 | ) 117 | 118 | def __len__(self) -> int: 119 | """Provide len() for convenience""" 120 | return self._np_string.__len__() 121 | 122 | def __format__(self, format_spec: str) -> str: 123 | """Return the contained string after inflecting it according 124 | to the format specification, if given""" 125 | # Examples: 126 | # >>> np = NounPhrase('skjótti hesturinn') 127 | # >>> f"Hér er {np:nf}" 128 | # 'Hér er skjótti hesturinn' 129 | # >>> f"Um {np:þf}" 130 | # 'Um skjótta hestinn' 131 | # >>> f"Frá {np:þgf}" 132 | # 'Frá skjótta hestinum' 133 | # >>> f"Til {np:ef}" 134 | # 'Til skjótta hestsins' 135 | # >>> f"Hér er {np:ángr}" 136 | # 'Hér er skjóttur hestur' 137 | # np = NounPhrase("þrír skjóttir hestar") 138 | # >>> f"Umræðuefnið er {np:stofn}" 139 | # 'Umræðuefnið er skjóttur hestur' 140 | if not format_spec or not self.parsed: 141 | return self._np_string 142 | # Find the attrgetter (property access function) 143 | # corresponding to the format spec 144 | fmt = _FMT.get(format_spec) 145 | if fmt is None: 146 | # We don't recognize this format specifier 147 | raise ValueError( 148 | "Invalid format specifier for NounPhrase: '{0}'".format(format_spec) 149 | ) 150 | # Extract the requested property and return it 151 | assert self._np is not None 152 | return fmt(self._np) 153 | 154 | @property 155 | def parsed(self) -> bool: 156 | """Return True if the noun phrase was successfully parsed""" 157 | return self._np is not None and self._np.tree is not None 158 | 159 | @property 160 | def tree(self) -> Optional[SimpleTree]: 161 | """Return the SimpleTree object corresponding to the noun phrase""" 162 | return None if self._np is None else self._np.tree 163 | 164 | @property 165 | def case(self) -> Optional[str]: 166 | """Return the case of the noun phrase, as originally parsed""" 167 | return self._case 168 | 169 | @property 170 | def number(self) -> Optional[str]: 171 | """Return the number (singular='et'/plural='ft') of the noun phrase, 172 | as originally parsed""" 173 | return self._number 174 | 175 | @property 176 | def person(self) -> Optional[str]: 177 | """Return the person ('p1', 'p2', 'p3') of the noun phrase, 178 | as originally parsed""" 179 | return self._person 180 | 181 | @property 182 | def gender(self) -> Optional[str]: 183 | """Return the gender (masculine='kk', feminine='kvk', neutral='hk') 184 | of the noun phrase, as originally parsed""" 185 | return self._gender 186 | 187 | @property 188 | def nominative(self) -> Optional[str]: 189 | """Return nominative form (nefnifall)""" 190 | return None if self._np is None else self._np.nominative 191 | 192 | @property 193 | def indefinite(self) -> Optional[str]: 194 | """Return indefinite form (nefnifall án greinis)""" 195 | return None if self._np is None else self._np.indefinite 196 | 197 | @property 198 | def canonical(self) -> Optional[str]: 199 | """Return canonical form (nefnifall eintölu án greinis)""" 200 | return None if self._np is None else self._np.canonical 201 | 202 | @property 203 | def accusative(self) -> Optional[str]: 204 | """Return accusative form (þolfall)""" 205 | return None if self._np is None else self._np.accusative 206 | 207 | @property 208 | def dative(self) -> Optional[str]: 209 | """Return dative form (þágufall)""" 210 | return None if self._np is None else self._np.dative 211 | 212 | @property 213 | def genitive(self) -> Optional[str]: 214 | """Return genitive form (eignarfall)""" 215 | return None if self._np is None else self._np.genitive 216 | -------------------------------------------------------------------------------- /src/reynir/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mideind/GreynirEngine/c827bfbba49eb49971d5bf63cc49f444534ce11b/src/reynir/py.typed -------------------------------------------------------------------------------- /test/test_matcher.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | """ 3 | 4 | test_matcher.py 5 | 6 | Tests for the SimpleTree matching functionality in matcher.py 7 | 8 | Copyright © 2023 by Miðeind ehf. 9 | Original author: Vilhjálmur Þorsteinsson 10 | 11 | This software is licensed under the MIT License: 12 | 13 | Permission is hereby granted, free of charge, to any person 14 | obtaining a copy of this software and associated documentation 15 | files (the "Software"), to deal in the Software without restriction, 16 | including without limitation the rights to use, copy, modify, merge, 17 | publish, distribute, sublicense, and/or sell copies of the Software, 18 | and to permit persons to whom the Software is furnished to do so, 19 | subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be 22 | included in all copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 27 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 28 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 29 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 30 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 31 | 32 | """ 33 | 34 | import pytest 35 | 36 | from tokenizer.definitions import AmountTuple, DateTimeTuple 37 | 38 | from reynir import Greynir 39 | from reynir.reynir import Terminal 40 | 41 | 42 | @pytest.fixture(scope="module") 43 | def r(): 44 | """Provide a module-scoped Greynir instance as a test fixture""" 45 | r = Greynir() 46 | yield r 47 | # Do teardown here 48 | r.__class__.cleanup() 49 | 50 | 51 | def test_matcher(r: Greynir, verbose: bool = False) -> None: 52 | 53 | s = r.parse_single("Hún á heiðurinn að þessu.") 54 | m = list( 55 | s.tree.all_matches( 56 | "( " 57 | "VP > [ .* VP > { ( 'eiga'|'fá'|'hljóta' ) } .* NP-OBJ > { 'heiður' PP > { 'að' } } ] " 58 | "| " 59 | "VP > [ .* VP > { ( 'eiga'|'fá'|'hljóta' ) } .* NP-OBJ > { 'heiður' } PP > { 'að' } ] " 60 | ") " 61 | ) 62 | ) 63 | assert len(m) == 1 64 | 65 | # Simple condition, correct sentence (vh in both subtrees) 66 | s = r.parse_single("Ég hefði farið út ef Jón hefði hegðað sér vel.") 67 | m = list( 68 | s.tree.all_matches( 69 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}" 70 | ) 71 | ) 72 | assert len(m) == 0 73 | 74 | # Simple condition, incorrect sentence (fh in conditional subtree) 75 | s = r.parse_single("Ég hefði farið út ef Jón hafði hegðað sér vel.") 76 | m = list( 77 | s.tree.all_matches( 78 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}" 79 | ) 80 | ) 81 | assert len(m) == 1 82 | 83 | # Complex condition, incorrect sentence (fh in complex subsentence, fh in conditional subtree) 84 | s = r.parse_single( 85 | "Ég hefði farið út ef Jón, sem Anna elskaði heitt, hafði hegðað sér vel." 86 | ) 87 | # There are two potential attachments of the CP-ADV-COND subtree 88 | m = list( 89 | s.tree.all_matches( 90 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}" 91 | ) 92 | ) + list( 93 | s.tree.all_matches( 94 | " IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}" 95 | ) 96 | ) 97 | assert len(m) == 1 98 | 99 | # Complex condition, incorrect sentence (vh in complex subsentence, fh in conditional subtree) 100 | s = r.parse_single( 101 | "Ég hefði farið út ef Jón, sem Anna hefði elskað heitt, hafði hegðað sér vel." 102 | ) 103 | # There are two potential attachments of the CP-ADV-COND subtree 104 | m = list( 105 | s.tree.all_matches( 106 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}" 107 | ) 108 | ) + list( 109 | s.tree.all_matches( 110 | "IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}" 111 | ) 112 | ) 113 | assert len(m) == 1 114 | 115 | # Complex condition, correct sentence (fh in complex subsentence, vh in conditional subtree) 116 | s = r.parse_single( 117 | "Ég hefði farið út ef Jón, sem Anna elskaði heitt, hefði hegðað sér vel." 118 | ) 119 | # There are two potential attachments of the CP-ADV-COND subtree 120 | m = list( 121 | s.tree.all_matches( 122 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}" 123 | ) 124 | ) + list( 125 | s.tree.all_matches( 126 | "IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}" 127 | ) 128 | ) 129 | assert len(m) == 0 130 | 131 | # Complex condition, correct sentence (vh in complex subsentence, vh in conditional subtree) 132 | s = r.parse_single( 133 | "Ég hefði farið út ef Jón, sem Anna hefði elskað heitt, hefði hegðað sér vel." 134 | ) 135 | # There are two potential attachments of the CP-ADV-COND subtree 136 | m = list( 137 | s.tree.all_matches( 138 | "VP > { VP > { so_vh } CP-ADV-COND > { IP > { VP >> so_fh }}}" 139 | ) 140 | ) + list( 141 | s.tree.all_matches( 142 | "IP > { VP > { VP > { so_vh } } CP-ADV-COND > { IP > { VP >> so_fh }}}" 143 | ) 144 | ) 145 | assert len(m) == 0 146 | -------------------------------------------------------------------------------- /test/test_no_multiply_numbers.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | """ 3 | 4 | test_no_multiply_numbers.py 5 | 6 | Tests for Greynir no_multiply_numbers flag functionality 7 | 8 | Copyright © 2023 by Miðeind ehf. 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | """ 32 | 33 | import pytest 34 | 35 | from reynir import Greynir 36 | 37 | # Import tests from other files directly into namespace 38 | # (they get run again with the new Greynir instance from the r function below) 39 | # in order to see if flag affects other functionality than just written numbers 40 | from test_cases import test_addresses, test_cases, test_noun_phrases 41 | from test_matcher import test_matcher 42 | from test_original import test_original 43 | 44 | # Too many to comfortably write, instead we 45 | # overwrite the only affected tests and the function r 46 | from test_parse import * 47 | from test_reynir import ( 48 | test_augment_terminal, 49 | test_auto_uppercase, 50 | test_compounds, 51 | test_compounds_with_numbers, 52 | test_lemmas, 53 | test_names, 54 | test_sentence_split, 55 | ) 56 | from test_serializers import test_annotree, test_serializers 57 | 58 | 59 | @pytest.fixture(scope="module") 60 | def r(): 61 | """Provide module-scoped Greynir instance (which doesn't multiply numbers) as test fixture""" 62 | r = Greynir(no_multiply_numbers=True) 63 | yield r 64 | # Do teardown here 65 | r.__class__.cleanup() 66 | 67 | 68 | def check_terminal(t, text, lemma, category, variants): 69 | assert t.text == text 70 | assert t.lemma == lemma 71 | if category == "töl": 72 | # Ignore variants for undeclinable number words; also, 73 | # allow "no" for the category since some number words have 74 | # both "no" and "töl" categories in BÍN 75 | assert t.category == "no" or t.category == "töl" 76 | elif category == "to": 77 | # Allow "no" for the category since declinable number words have 78 | # both "no" and "to" categories in BÍN 79 | assert t.category == "no" or t.category == "to" 80 | assert set(t.variants) == set(variants) 81 | else: 82 | assert t.category == category 83 | assert set(t.variants) == set(variants) 84 | 85 | 86 | # Overwrite tests from test_parse which use numbers and assume flag is not set 87 | test_amounts = test_terminals = test_single = lambda r: None 88 | 89 | 90 | def test_no_multiply_numbers(r: Greynir): 91 | """Test no_multiply_numbers flag""" 92 | 93 | s = r.parse_single("Tjónið nam 10 milljörðum króna.") 94 | assert s is not None 95 | t: List[Terminal] = s.terminals or [] 96 | assert len(t) == 6 97 | check_terminal( 98 | t[2], 99 | text="10", 100 | lemma="10", 101 | category="tala", 102 | variants=["þgf", "kk", "ft"], 103 | ) 104 | check_terminal( 105 | t[3], 106 | text="milljörðum", 107 | lemma="milljarður", 108 | category="no", 109 | variants=["þgf", "kk", "ft"], 110 | ) 111 | check_terminal( 112 | t[4], 113 | text="króna", 114 | lemma="króna", 115 | category="no", 116 | variants=["ef", "kvk", "ft"], 117 | ) 118 | 119 | s = r.parse_single("Tjónið þann 22. maí nam einum milljarði króna.") 120 | assert s is not None 121 | t = s.terminals or [] 122 | assert len(t) == 8 123 | check_terminal( 124 | t[4], 125 | text="einum", 126 | lemma="einn", 127 | category="to", 128 | variants=["et", "þgf", "kk"], 129 | ) 130 | check_terminal( 131 | t[5], 132 | text="milljarði", 133 | lemma="milljarður", 134 | category="no", 135 | variants=["et", "þgf", "kk"], 136 | ) 137 | check_terminal( 138 | t[6], 139 | text="króna", 140 | lemma="króna", 141 | category="no", 142 | variants=["ft", "ef", "kvk"], 143 | ) 144 | 145 | s = r.parse_single("Tjónið nam tuttugu og einum milljarði króna.") 146 | assert s is not None 147 | t = s.terminals or [] 148 | assert len(t) == 8 149 | check_terminal( 150 | t[2], 151 | text="tuttugu", 152 | lemma="tuttugu", 153 | category="töl", 154 | variants=[], 155 | ) 156 | check_terminal( 157 | t[4], 158 | text="einum", 159 | lemma="einn", 160 | category="to", 161 | variants=["et", "þgf", "kk"], 162 | ) 163 | check_terminal( 164 | t[5], 165 | text="milljarði", 166 | lemma="milljarður", 167 | category="no", 168 | variants=["et", "þgf", "kk"], 169 | ) 170 | check_terminal( 171 | t[6], 172 | text="króna", 173 | lemma="króna", 174 | category="no", 175 | variants=["ft", "ef", "kvk"], 176 | ) 177 | 178 | s = r.parse_single("Fjöldi stjarna í Vetrarbrautinni skiptir hundruðum milljarða.") 179 | assert s is not None 180 | t = s.terminals or [] 181 | assert len(t) == 8 182 | check_terminal( 183 | t[5], 184 | text="hundruðum", 185 | lemma="hundrað", 186 | category="no", 187 | variants=["ft", "þgf", "hk"], 188 | ) 189 | check_terminal( 190 | t[6], 191 | text="milljarða", 192 | lemma="milljarður", 193 | category="no", 194 | variants=["ft", "ef", "kk"], 195 | ) 196 | 197 | s = r.parse_single("Sex hundruð áttatíu og þrír leikmenn mættu á blakmótið.") 198 | assert s is not None 199 | t = s.terminals or [] 200 | assert len(t) == 10 201 | check_terminal( 202 | t[0], 203 | text="Sex", 204 | lemma="sex", 205 | category="töl", 206 | variants=[], 207 | ) 208 | check_terminal( 209 | t[1], 210 | text="hundruð", 211 | lemma="hundrað", 212 | category="no", 213 | variants=["ft", "hk", "nf"], 214 | ) 215 | check_terminal( 216 | t[2], 217 | text="áttatíu", 218 | lemma="áttatíu", 219 | category="töl", 220 | variants=[], 221 | ) 222 | check_terminal( 223 | t[3], 224 | text="og", 225 | lemma="og", 226 | category="st", 227 | variants=[], 228 | ) 229 | check_terminal( 230 | t[4], 231 | text="þrír", 232 | lemma="þrír", 233 | category="to", 234 | variants=["ft", "kk", "nf"], 235 | ) 236 | 237 | s = r.parse_single("Tjónið nam tólf hundruðum punda.") 238 | assert s is not None 239 | t = s.terminals or [] 240 | assert len(t) == 6 241 | check_terminal( 242 | t[2], 243 | text="tólf", 244 | lemma="tólf", 245 | category="töl", 246 | variants=[], 247 | ) 248 | check_terminal( 249 | t[3], 250 | text="hundruðum", 251 | lemma="hundrað", 252 | category="no", 253 | variants=["ft", "þgf", "hk"], 254 | ) 255 | check_terminal( 256 | t[4], 257 | text="punda", 258 | lemma="pund", 259 | category="no", 260 | variants=["ft", "ef", "hk"], 261 | ) 262 | 263 | s = r.parse_single("Sjötíu þúsund manns söfnuðust fyrir á torginu.") 264 | assert s is not None 265 | t = s.terminals or [] 266 | assert len(t) == 8 267 | check_terminal( 268 | t[0], 269 | text="Sjötíu", 270 | lemma="sjötíu", 271 | category="töl", 272 | variants=["ft", "nf", "hk"], 273 | ) 274 | check_terminal( 275 | t[1], 276 | text="þúsund", 277 | lemma="þúsund", 278 | category="no", # "no", # The choice between töl and no seems a bit random 279 | variants=["ft", "nf", "hk"], 280 | ) 281 | 282 | s = r.parse_single("7 milljón borðtenniskúlur.") 283 | assert s is not None 284 | t = s.terminals or [] 285 | assert len(t) == 4 286 | check_terminal( 287 | t[0], 288 | text="7", 289 | lemma="7", 290 | category="tala", 291 | variants=["kvk", "ft", "nf"], 292 | ) 293 | check_terminal( 294 | t[1], 295 | text="milljón", 296 | lemma="milljón", 297 | category="töl", 298 | variants=[], # ["kvk", "ft", "nf"] 299 | ) 300 | 301 | s = r.parse_single("Árið áttatíu þúsund sextíu og tvö er í framtíðinni.") 302 | assert s is not None 303 | t = s.terminals or [] 304 | assert len(t) == 10 305 | check_terminal( 306 | t[1], 307 | text="áttatíu", 308 | lemma="áttatíu", 309 | category="töl", 310 | variants=["ft", "nf", "hk"], 311 | ) 312 | check_terminal( 313 | t[2], 314 | text="þúsund", 315 | lemma="þúsund", 316 | category="töl", 317 | variants=["ft", "nf", "hk"], 318 | ) 319 | check_terminal( 320 | t[3], 321 | text="sextíu", 322 | lemma="sextíu", 323 | category="töl", 324 | variants=["ft", "nf", "hk"], 325 | ) 326 | check_terminal( 327 | t[5], 328 | text="tvö", 329 | lemma="tveir", 330 | category="to", 331 | variants=["ft", "nf", "hk"], 332 | ) 333 | 334 | s = r.parse_single("Árið átján hundruð níutíu og þrjú er í fortíðinni.") 335 | assert s is not None 336 | t = s.terminals or [] 337 | assert len(t) == 10 338 | check_terminal( 339 | t[1], 340 | text="átján", 341 | lemma="átján", 342 | category="töl", 343 | variants=["ft", "nf", "hk"], 344 | ) 345 | check_terminal( 346 | t[2], 347 | text="hundruð", 348 | lemma="hundrað", 349 | category="no", 350 | variants=["ft", "nf", "hk"], 351 | ) 352 | check_terminal( 353 | t[3], 354 | text="níutíu", 355 | lemma="níutíu", 356 | category="töl", 357 | variants=["ft", "nf", "hk"], 358 | ) 359 | check_terminal( 360 | t[5], 361 | text="þrjú", 362 | lemma="þrír", 363 | category="to", 364 | variants=["ft", "nf", "hk"], 365 | ) 366 | 367 | s = r.parse_single("Tvö hundruð þúsund og þrír leikmenn mættu á blakmótið.") 368 | assert s is not None 369 | t = s.terminals or [] 370 | assert len(t) == 10 371 | check_terminal( 372 | t[0], 373 | text="Tvö", 374 | lemma="tveir", 375 | category="to", 376 | variants=["ft", "hk", "nf"], 377 | ) 378 | check_terminal( 379 | t[1], 380 | text="hundruð", 381 | lemma="hundrað", 382 | category="to", 383 | variants=["ft", "hk", "nf"], 384 | ) 385 | check_terminal( 386 | t[2], 387 | text="þúsund", 388 | lemma="þúsund", 389 | category="töl", 390 | variants=["ft", "hk", "nf"], 391 | ) 392 | check_terminal( 393 | t[3], 394 | text="og", 395 | lemma="og", 396 | category="st", 397 | variants=[], 398 | ) 399 | check_terminal( 400 | t[4], 401 | text="þrír", 402 | lemma="þrír", 403 | category="to", 404 | variants=["ft", "kk", "nf"], 405 | ) 406 | 407 | s = r.parse_single("Þúsundir mættu á blakmótið.") 408 | assert s is not None 409 | t = s.terminals or [] 410 | assert len(t) == 5 411 | check_terminal( 412 | t[0], 413 | text="Þúsundir", 414 | lemma="þúsund", 415 | category="no", 416 | variants=["ft", "kvk", "nf"], 417 | ) 418 | -------------------------------------------------------------------------------- /test/test_original.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | """ 3 | 4 | test_original.py 5 | 6 | Tests for Greynir module 7 | 8 | Copyright © 2023 Miðeind ehf. 9 | Original author: Vilhjálmur Þorsteinsson 10 | 11 | This software is licensed under the MIT License: 12 | 13 | Permission is hereby granted, free of charge, to any person 14 | obtaining a copy of this software and associated documentation 15 | files (the "Software"), to deal in the Software without restriction, 16 | including without limitation the rights to use, copy, modify, merge, 17 | publish, distribute, sublicense, and/or sell copies of the Software, 18 | and to permit persons to whom the Software is furnished to do so, 19 | subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be 22 | included in all copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 27 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 28 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 29 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 30 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 31 | 32 | """ 33 | 34 | import pytest 35 | 36 | from reynir import Greynir 37 | from reynir.bintokenizer import tokenize 38 | 39 | 40 | @pytest.fixture(scope="module") 41 | def r(): 42 | """Provide a module-scoped Greynir instance as a test fixture""" 43 | r = Greynir() 44 | yield r 45 | # Do teardown here 46 | r.__class__.cleanup() 47 | 48 | 49 | def test_original(r: Greynir) -> None: 50 | 51 | s = "Ég keypti 1000 EUR þann 23. 5. 2011 og græddi 10,5 % ." 52 | tlist = list(tokenize(s)) 53 | assert sum(len(t.original or "") for t in tlist) == len(s) 54 | 55 | s = " Friðjón Pálsson hitti Friðbert \tJ. Ástráðsson í gær." 56 | tlist = list(tokenize(s)) 57 | assert sum(len(t.original or "") for t in tlist) == len(s) 58 | 59 | s = " \t Casey Holdman \n og Luke Skywalker fóru saman á bar ." 60 | tlist = list(tokenize(s)) 61 | assert sum(len(t.original or "") for t in tlist) == len(s) 62 | 63 | s = " Hver á USD 5,75 sem ég fann í grasinu með 5,558 prósent?" 64 | tlist = list(tokenize(s)) 65 | assert sum(len(t.original or "") for t in tlist) == len(s) 66 | 67 | s = " Virkjunin var \t 600 MW og var á Reynimel 40C í Reykjavík ." 68 | tlist = list(tokenize(s)) 69 | assert sum(len(t.original or "") for t in tlist) == len(s) 70 | 71 | s = " Katrín Júlíusdóttir var iðnaðar- \n\t og \t\t viðskiptaráðherra" 72 | tlist = list(tokenize(s)) 73 | assert sum(len(t.original or "") for t in tlist) == len(s) 74 | 75 | s = " Friðbert Marsillíus Jónsson keypti hlutabréf í Eimskip hf. fyrir 100 milljónir í gær" 76 | tlist = list(tokenize(s)) 77 | assert sum(len(t.original or "") for t in tlist) == len(s) 78 | 79 | s = " Jens \tStoltenberg keypti hlutabréf nú síðdegis fyrir 100 milljónir króna kl. 12:30 30. júlí 2002 og Jens er stoltur af því." 80 | tlist = list(tokenize(s)) 81 | assert sum(len(t.original or "") for t in tlist) == len(s) 82 | 83 | s = "Gengi danskrar krónu féll um 2.000 EUR kl. 14:00 30. desember ." 84 | tlist = list(tokenize(s)) 85 | assert sum(len(t.original or "") for t in tlist) == len(s) 86 | 87 | s = "Dómsmála- , iðnaðar- og viðskiptaráðherra gerði víðreist um landið" 88 | tlist = list(tokenize(s)) 89 | assert sum(len(t.original or "") for t in tlist) == len(s) 90 | 91 | s = " Dagur Bergþóruson Eggertsson hefur verið farsæll borgarstjóri ." 92 | tlist = list(tokenize(s)) 93 | assert sum(len(t.original or "") for t in tlist) == len(s) 94 | 95 | s = " Formaður framkvæmdastjórnarinnar er Ursula \t\t van der Leyen ." 96 | tlist = list(tokenize(s)) 97 | assert sum(len(t.original or "") for t in tlist) == len(s) 98 | 99 | s = " Angela Merkel hefur lengi vel verið kanslari V-Þýskalands ." 100 | tlist = list(tokenize(s)) 101 | assert sum(len(t.original or "") for t in tlist) == len(s) 102 | 103 | 104 | if __name__ == "__main__": 105 | # When invoked as a main module, do a verbose test 106 | from reynir import Greynir 107 | 108 | greynir = Greynir() 109 | test_original(greynir) 110 | greynir.__class__.cleanup() 111 | -------------------------------------------------------------------------------- /test/test_serializers.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | """ 3 | 4 | test_serializers.py 5 | 6 | Tests for JSON serialization of sentences 7 | 8 | Copyright © 2023 by Miðeind ehf. 9 | 10 | This software is licensed under the MIT License: 11 | 12 | Permission is hereby granted, free of charge, to any person 13 | obtaining a copy of this software and associated documentation 14 | files (the "Software"), to deal in the Software without restriction, 15 | including without limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of the Software, 17 | and to permit persons to whom the Software is furnished to do so, 18 | subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be 21 | included in all copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 26 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 27 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 28 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 29 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 30 | 31 | """ 32 | 33 | import json 34 | 35 | import pytest 36 | 37 | 38 | @pytest.fixture(scope="module") 39 | def r(): 40 | """Provide a module-scoped Greynir instance as a test fixture""" 41 | from reynir import Greynir 42 | 43 | r = Greynir() 44 | yield r 45 | # Do teardown here 46 | r.__class__.cleanup() 47 | 48 | 49 | def test_serializers(r): 50 | sents = [ 51 | "Ég fór niðrá bryggjuna með Reyni Vilhjálmssyni í gær.", 52 | "Það var 17. júní árið 2020.", 53 | "Við sáum tvo seli og örugglega fleiri en 100 máva.", 54 | "Klukkan var orðin tólf þegar við fórum heim.", 55 | "Bíllinn kostaði €30.000 en ég greiddi 25500 USD fyrir hann.", 56 | "Morguninn eftir vaknaði ég kl. 07:30.", 57 | "Ég var fyrstur á fætur en Þuríður Hálfdánardóttir var númer 2.", 58 | ] 59 | for sent in sents: 60 | orig = r.parse_single(sent) 61 | assert orig.tree is not None 62 | 63 | json_str = r.dumps_single(orig, indent=2) 64 | new = r.loads_single(json_str) 65 | 66 | assert new.tree is not None 67 | 68 | assert all(ot.equal(nt) for ot, nt in zip(orig.tokens, new.tokens)) 69 | assert orig.terminals == new.terminals 70 | 71 | assert orig.tree.flat_with_all_variants == orig.tree.flat_with_all_variants 72 | cls = r.__class__ 73 | assert json.loads(orig.dumps(cls, indent=2)) == json.loads( 74 | new.dumps(cls, indent=2) 75 | ) 76 | 77 | 78 | def test_annotree(): 79 | s = """ 80 | (META (ID-CORPUS 43bf66f3-51c4-11e6-8438-04014c605401.10) 81 | (ID-LOCAL greynir_corpus_00003.psd,.1) 82 | (URL http://www.mbl.is/sport/efstadeild/2016/07/24/ia_ibv_stadan_er_1_0/)) 83 | (S0 (S-HEADING (IP (NP-SUBJ (fn_ft_kk_nf Engir (lemma enginn)) 84 | (no_ft_kk_nf atburðir (lemma atburður))) 85 | (NP-PRD (VP (so_ft_kk_lhþt_nf_sb skráðir (lemma skrá)))) 86 | (ADVP (ao enn (lemma enn)))))) 87 | 88 | """ 89 | from reynir.simpletree import AnnoTree 90 | 91 | atree = AnnoTree(s) 92 | stree = atree.as_simple_tree() 93 | assert stree is not None 94 | assert stree.text == "Engir atburðir skráðir enn" 95 | assert stree.tidy_text == "Engir atburðir skráðir enn" 96 | assert stree.nouns == ["atburður"] 97 | assert stree.verbs == ["skrá"] 98 | 99 | 100 | if __name__ == "__main__": 101 | # When invoked as a main module, do a verbose test 102 | from reynir import Greynir 103 | 104 | g = Greynir() 105 | test_serializers(g) 106 | g.__class__.cleanup() 107 | --------------------------------------------------------------------------------