├── .gitignore ├── LICENSE ├── README.md ├── data ├── dict │ ├── d.bak │ ├── d.dat │ └── d.dir └── grammar.txt ├── notebooks.ipynb ├── requirements.txt ├── setup.py ├── src └── syntax_analyzer │ ├── __init__.py │ ├── parser.py │ ├── tree.py │ └── utils.py └── tests └── test_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # tests and logs 12 | tests/fixtures/cached_*_text.txt 13 | logs/ 14 | lightning_logs/ 15 | lang_code_data/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | 124 | # vscode 125 | .vs 126 | .vscode 127 | 128 | # Pycharm 129 | .idea 130 | 131 | # TF code 132 | tensorflow_code 133 | 134 | # Models 135 | proc_data 136 | 137 | # examples 138 | runs 139 | /runs_old 140 | /wandb 141 | /examples/runs 142 | /examples/**/*.args 143 | /examples/rag/sweep 144 | 145 | # data 146 | /data 147 | serialization_dir 148 | 149 | # emacs 150 | *.*~ 151 | debug.env 152 | 153 | # vim 154 | .*.swp 155 | 156 | #ctags 157 | tags 158 | 159 | # pre-commit 160 | .pre-commit* 161 | 162 | # .lock 163 | *.lock 164 | 165 | # DS_Store (MacOS) 166 | .DS_Store 167 | 168 | # ruff 169 | .ruff_cache 170 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Constantin Constantinov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Disclaimer 3 | 4 | This project was created by me 5 years ago for educational purposes. For a while it was abandoned and archived, but many people contacted 5 | me to ask for help with using this code. So, I've made a minimal effort to make a proper package from this code while keeping all functionality 6 | and code formatting. No further development is planned. I discourage you from using it for any serious purpose other than education.
7 | 8 | _12-05-2024_ 9 | 10 | # Syntax Analyzer 11 | 12 | This is a syntax analyzer for Russian based on a context-free grammar. 13 | It uses the OpenCorpora dictionary of labelled words, with pymorphy2 as the interface. 14 | 15 | The repository consists of two main parts: 16 | 17 | (1) `tree` - a binary-tree class that represents the structure of a sentence 18 | 19 | (2) `parser` - a parser that takes a raw sentence and returns the possible parse trees 20 | 21 | There are also supplementary files: 22 | 23 | `data/grammar.txt` - context-free grammar for Russian 24 | 25 | `data/dict` - dictionary of some complex phrases (conjunctions, predicatives, adverbs etc.) that are not present in the OpenCorpora dictionary. 26 | 27 | # Getting started 28 | 29 | ## Installation 30 | 31 | Clone the repository 32 | ```bash 33 | git clone 34 | ``` 35 | 36 | Install package 37 | ```bash 38 | pip install . 39 | ``` 40 | 41 | # Example 42 | 43 | ```python 44 | from syntax_analyzer.parser import Parser 45 | 46 | parser = Parser() 47 | 48 | sent = "Мама мыла раму."
49 | 50 | t = parser.parse(sent) 51 | 52 | t[0].display() 53 | ``` 54 | ``` 55 | S 56 | NP[case='nomn'] 57 | Мама ['NOUN', 'sing', 'femn', 'nomn'] 58 | VP[tran] 59 | VP[tran] 60 | мыла ['VERB', 'sing', 'femn', 'tran', 'past'] 61 | NP[case='accs'] 62 | раму ['NOUN', 'sing', 'femn', 'accs'] 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /data/dict/d.bak: -------------------------------------------------------------------------------- 1 | 'conj', (0, 458) 2 | 'pred', (512, 7036) 3 | -------------------------------------------------------------------------------- /data/dict/d.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/konverner/syntax_analyzer/d0a3a43e20ccd3fff4899e554b6c669736349072/data/dict/d.dat -------------------------------------------------------------------------------- /data/dict/d.dir: -------------------------------------------------------------------------------- 1 | 'conj', (0, 458) 2 | 'pred', (512, 7036) 3 | -------------------------------------------------------------------------------- /data/grammar.txt: -------------------------------------------------------------------------------- 1 | S -> NP[case='nomn'] VP[tran] | NP[case='nomn'] VP[intr] | NP[case='nomn'] PRED | NP[case='nomn'] ADJ | NP[case='nomn'] ADJ[case='nomn'] | NP[case='datv'] PRED | NP[case='datv'] NPRO | NP[case='datv'] VP[tran] | NP[case='datv'] VP[intr] 2 | NP[case='nomn'] -> N[case='nomn'] | NUMR[case='nomn'] | NPRO[case='nomn'] | NP[case='nomn'] NP[case='gent'] | ADJ[case='nomn'] NP[case='nomn'] | NP[case='nomn'] PRT[case='nomn'] | NPRO[case='nomn'] | NP[case='nomn'] NP[case='gent'] | NP[case='nomn'] NP[case='nomn'] | NP[case='nomn'] PRT 3 | NP[case='gent'] -> N[case='gent'] | NUMR[case='gent'] | NPRO[case='gent'] | NP[case='gent'] NP[case='gent'] | ADJ[case='gent'] NP[case='gent'] | NP[case='gent'] PRT[case='gent'] | NPRO[case='gent'] | NP[case='gent'] 
NP[case='gent'] | NP[case='gent'] NP[case='gent'] | NP[case='gent'] PRT 4 | NP[case='accs'] -> N[case='accs'] | NUMR[case='accs'] | NPRO[case='accs'] | NP[case='accs'] NP[case='accs'] | ADJ[case='accs'] NP[case='accs'] | NP[case='accs'] PRT[case='accs'] | NPRO[case='accs'] | NP[case='accs'] NP[case='gent'] | NP[case='accs'] NP[case='accs'] | NP[case='accs'] PRT 5 | NP[case='datv'] -> N[case='datv']| NUMR[case='datv'] | NPRO[case='datv'] | NP[case='datv'] NP[case='datv'] | ADJ[case='datv'] NP[case='datv'] | NP[case='datv'] PRT[case='datv'] | NPRO[case='datv'] | NP[case='datv'] NP[case='gent'] | NP[case='datv'] NP[case='datv'] | NP[case='datv'] PRT 6 | NP[case='ablt'] -> N[case='ablt'] | NUMR[case='ablt'] | NPRO[case='ablt'] | NP[case='ablt'] NP[case='ablt'] | ADJ[case='ablt'] NP[case='ablt'] | NP[case='ablt'] PRT[case='ablt'] | NPRO[case='ablt'] | NP[case='ablt'] NP[case='gent'] | NP[case='ablt'] NP[case='ablt'] | NP[case='ablt'] PRT 7 | NP[case="gen2"] -> ADJ[case='gent'] N[case='gen2'] | NP[case='gen2'] ADJ[case='gent'] | ADJ[case='accs'] N[case='accs'] ADJ[case='gent'] N[case='gen2'] | ADJ[case='gent'] N[case='gen2'] ADJ[case='accs'] N[case='accs'] 8 | NP[case='loct'] -> N[case='loct'] | NP[case='loct'] ADJ[case='loct'] 9 | VP[tran] -> INFN[tran] | VP[intr] VP[tran] | VP[tran] VP[tran] | V[tran] | VP[tran] ADJ | VP[tran] QUES | VP[tran] NP[case='ablt'] | VP[tran] ADJ[case='ablt'] | V[tran] PP | ADVB VP[tran] | VP[tran] ADVB | VP[tran] PRCL | GRND VP[tran] | VP[tran] NP[case='accs'] | VP[tran] NP[case='datv'] | VP[tran] NP[case='gent'] | VP[tran] PP | VP[tran] PRED 10 | VP[intr] -> INFN[intr] | VP[intr] VP[intr] | V[intr] | VP[intr] ADJ | VP[intr] QUES | VP[intr] NP[case='ablt'] | VP[intr] ADJ[case='ablt'] | ADVB VP[intr] | VP[intr] ADVB | VP[intr] NP[case='datv'] | VP[intr] NP[case='accs'] | VP[intr] NP[case='ablt'] | VP[intr] NP[case='gent'] | VP[intr] PRCL | GRND VP[intr] | VP[intr] PP | VP[intr] PRED 11 | PP -> PREP NP[case='gent'] | PREP NP[case='accs'] | 
PREP NP[case='datv'] | PREP NP[case='ablt'] | PREP NP[case='loct'] | PREP ADVB 12 | N[case='nomn'] -> [NOUN,?numb,?per,?gend,nomn,None,None] 13 | N[case='accs'] -> [NOUN,?numb,?per,?gend,accs,None,None] 14 | N[case='datv'] -> [NOUN,?numb,?per,?gend,datv,None,None] 15 | N[case='ablt'] -> [NOUN,?numb,?per,?gend,ablt,None,None] 16 | N[case='gent'] -> [NOUN,?numb,?per,?gend,gent,None,None] 17 | N[case='loct'] -> [NOUN,?numb,?per,?gend,loct,None,None] 18 | ADJ[case='nomn'] -> [ADJF,?numb,?per,?gend,nomn,None,None] | ADJ[case='nomn'] ADJ[case='nomn'] | ADVB ADJ[case='nomn'] 19 | ADJ[case='gent'] -> [ADJF,?numb,?per,?gend,gent,None,None] | ADJ[case='gent'] ADJ[case='gent'] | ADVB ADJ[case='gent'] 20 | ADJ[case='accs'] -> [ADJF,?numb,?per,?gend,accs,None,None] | ADJ[case='accs'] ADJ[case='accs'] | ADVB ADJ[case='accs'] 21 | ADJ[case='datv'] -> [ADJF,?numb,?per,?gend,datv,None,None] | ADJ[case='datv'] ADJ[case='datv'] | ADVB ADJ[case='datv'] 22 | ADJ[case='ablt'] -> [ADJF,?numb,?per,?gend,ablt,None,None] | ADJ[case='ablt'] ADJ[case='ablt'] | ADVB ADJ[case='ablt'] 23 | ADJ[case='loct'] -> [ADJF,?numb,?per,?gend,loct,None,None] | ADJ[case='loct'] ADJ[case='loct'] | ADVB ADJ[case='loct'] 24 | ADJ -> [ADJS,?numb,?per,?gend,None,None,None] 25 | PRT[case='nomn'] -> [PRTF,?numb,?per,?gend,nomn,None,?tense] | [PRTF,?numb,?per,?gend,nomn,tran,?tense] | [PRTF,?numb,?per,?gend,nomn,intr,?tense] | PRT[case='nomn'] PRT[case='nomn'] | PRT[case='nomn'] ADVB | PRT[case='nomn'] NP[case='ablt'] 26 | PRT[case='gent'] -> [PRTF,?numb,?per,?gend,gent,None,?tense] | [PRTF,?numb,?per,?gend,gent,tran,?tense] | [PRTF,?numb,?per,?gend,gent,intr,?tense] | PRT[case='gent'] PRT[case='gent'] | PRT[case='gent'] ADVB | PRT[case='gent'] NP[case='ablt'] 27 | PRT[case='datv'] -> [PRTF,?numb,?per,?gend,datv,None,?tense] | [PRTF,?numb,?per,?gend,datv,tran,?tense] | [PRTF,?numb,?per,?gend,datv,intr,?tense] | PRT[case='datv'] PRT[case='datv'] | PRT[case='datv'] ADVB | PRT[case='datv'] NP[case='ablt'] 28 | 
PRT[case='accs'] -> [PRTF,?numb,?per,?gend,accs,None,?tense] | [PRTF,?numb,?per,?gend,accs,tran,?tense] | [PRTF,?numb,?per,?gend,accs,intr,?tense] | [PRTF,?numb,?per,?gend,accs,intr,?tense] | PRT[case='accs'] PRT[case='accs'] | PRT[case='accs'] ADVB | PRT[case='accs'] NP[case='ablt'] 29 | PRT -> [PRTS,?numb,None,?gend,None,None,?tense] | [PRTS,?numb,None,None,None,None,?tense] | [PRTS,?numb,?per,?gend,None,None,?tense] | [PRTS,?numb,None,?gend,None,tran,?tense] | [PRTS,?numb,None,?gend,None,intr,?tense] | PRT NP[case='ablt'] | VP[intr] PRT | ADVB PRT 30 | V[intr] -> [VERB,?numb,?per,?gend,None,intr,?tense] | [VERB,?numb,?per,?gend,None,intr,None] | V[intr] PRCL 31 | V[tran] -> [VERB,?numb,?per,?gend,None,tran,?tense] | [VERB,?numb,?per,?gend,None,tran,None] | V[tran] PRCL 32 | INFN[tran] -> [INFN,?numb,?per,?gend,None,tran,None] 33 | INFN[intr] -> [INFN,?numb,?per,?gend,None,intr,None] 34 | GRND -> [GRND,?numb,?per,?gend,None,intr,?tense] | [GRND,?numb,?per,?gend,None,tran,?tense] 35 | ADVB -> [ADVB,?numb,?per,?gend,None,None,None] 36 | PREP -> [PREP,?numb,?per,?gend,None,None,None] 37 | PRCL -> [PRCL,?numb,?per,?gend,None,None,None] 38 | PRED -> [PRED,?numb,?per,?gend,None,None,None] | [PRED,?numb,?per,?gend,None,None,?tense] | PRED PRCL | PRED NP[case='gent'] 39 | CONJ -> [CONJ,?numb,?per,?gend,None,None,None] 40 | INTJ -> [INTJ,?numb,?per,?gend,None,None,None] 41 | QUES -> [QUES,?numb,?per,?gend,None,None,None] 42 | NUMR[case='nomn'] -> [NUMR,?numb,?per,?gend,nomn,None,None] 43 | NUMR[case='gent'] -> [NUMR,?numb,?per,?gend,gent,None,None] 44 | NUMR[case='datv'] -> [NUMR,?numb,?per,?gend,datv,None,None] 45 | NUMR[case='accs'] -> [NUMR,?numb,?per,?gend,accs,None,None] 46 | NUMR[case='ablt'] -> [NUMR,?numb,?per,?gend,ablt,None,None] 47 | NUMR[case='loct'] -> [NUMR,?numb,?per,?gend,loct,None,None] 48 | NPRO[case='nomn'] -> [NPRO,?numb,?per,?gend,nomn,None,None] 49 | NPRO[case='gent'] -> [NPRO,?numb,?per,?gend,gent,None,None] 50 | NPRO[case='datv'] -> 
[NPRO,?numb,?per,?gend,datv,None,None] 51 | NPRO[case='accs'] -> [NPRO,?numb,?per,?gend,accs,None,None] 52 | NPRO[case='ablt'] -> [NPRO,?numb,?per,?gend,ablt,None,None] 53 | NPRO[case='loct'] -> [NPRO,?numb,?per,?gend,loct,None,None] -------------------------------------------------------------------------------- /notebooks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2024-05-12T21:01:27.459715900Z", 10 | "start_time": "2024-05-12T21:01:27.297689700Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "from syntax_analyzer.parser import Parser\n", 16 | "\n", 17 | "parser = Parser()\n", 18 | "\n", 19 | "sent = \"Мама мыла раму.\"\n", 20 | "\n", 21 | "t = parser.parse(sent)\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 5, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "S\n", 33 | " NP[case='nomn'] \n", 34 | " Мама ['NOUN', 'sing', 'femn', 'nomn']\n", 35 | " VP[tran]\n", 36 | " VP[tran] \n", 37 | " мыла ['VERB', 'sing', 'femn', 'tran', 'past']\n", 38 | " NP[case='accs'] \n", 39 | " раму ['NOUN', 'sing', 'femn', 'accs']\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "t[0].display()" 45 | ], 46 | "metadata": { 47 | "collapsed": false, 48 | "ExecuteTime": { 49 | "end_time": "2024-05-12T21:01:39.314815700Z", 50 | "start_time": "2024-05-12T21:01:39.304082600Z" 51 | } 52 | } 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "outputs": [], 58 | "source": [], 59 | "metadata": { 60 | "collapsed": false 61 | } 62 | } 63 | ], 64 | "metadata": { 65 | "kernelspec": { 66 | "display_name": "Python 3", 67 | "language": "python", 68 | "name": "python3" 69 | }, 70 | "language_info": { 71 | "codemirror_mode": { 72 | "name": "ipython", 73 | "version": 2 74 | }, 75 | 
"file_extension": ".py", 76 | "mimetype": "text/x-python", 77 | "name": "python", 78 | "nbconvert_exporter": "python", 79 | "pygments_lexer": "ipython2", 80 | "version": "2.7.6" 81 | } 82 | }, 83 | "nbformat": 4, 84 | "nbformat_minor": 0 85 | } 86 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pymorphy2==0.9.1 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='syntax_analyzer', 5 | version='0.1.0', 6 | description='Free Context Grammar Syntax Analyzer for Russian', 7 | author='Konstantin VERNER', 8 | author_email='konst.verner@gmail.com', 9 | package_dir={"": "src"}, 10 | packages=find_packages("src"), 11 | install_requires=[ 12 | 'pymorphy2==0.9.1' 13 | ], 14 | classifiers=[ 15 | 'Development Status :: 3 - Alpha', 16 | 'Intended Audience :: Developers', 17 | 'License :: OSI Approved :: MIT License', 18 | 'Operating System :: OS Independent', 19 | 'Programming Language :: Python :: 3', 20 | 'Programming Language :: Python :: 3.7', 21 | 'Programming Language :: Python :: 3.8', 22 | 'Programming Language :: Python :: 3.9', 23 | 'Programming Language :: Python :: 3.10', 24 | 'Topic :: Software Development :: Libraries :: Python Modules', 25 | ], 26 | python_requires='>=3.7', 27 | ) -------------------------------------------------------------------------------- /src/syntax_analyzer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/konverner/syntax_analyzer/d0a3a43e20ccd3fff4899e554b6c669736349072/src/syntax_analyzer/__init__.py -------------------------------------------------------------------------------- /src/syntax_analyzer/parser.py: 
def tag(self, word):
    """Return one 7-slot grammeme list for every morphological reading of *word*.

    Each reading is ``[pos, number, person, gender, case, valency, tense]``;
    a slot stays ``None`` when the reading carries no grammeme of that kind.
    The allowed values for each slot come from ``self.pos``, ``self.numb``,
    ``self.per``, ``self.gend``, ``self.cases``, ``self.valency`` and
    ``self.tense``.

    Args:
        word: Token to analyse. A token whose characters 1..4 spell ``pred``
            or ``conj`` is treated as a ready-made predicative/conjunction
            marker and is not sent to the morphological analyser
            (presumably such markers are substituted for multi-word phrases
            by the caller -- TODO confirm against the sentence preprocessing).

    Returns:
        list of 7-element lists, one per reading returned by
        ``self.morph.tag(word)`` (exactly one for the marker tokens).
    """
    # Multi-word phrase markers: fixed tag, no dictionary lookup.
    marker = word[1:5]
    if marker == 'pred':
        return [["PRED", None, None, None, None, None, None]]
    if marker == 'conj':
        return [["CONJ", None, None, None, None, None, None]]

    result = []
    for reading in self.morph.tag(word):
        tags_set = [None] * 7
        for grammeme in reading.grammemes:
            if grammeme in self.numb:
                tags_set[1] = grammeme
            if grammeme in self.per:
                tags_set[2] = grammeme
            if grammeme in self.gend:
                tags_set[3] = grammeme
            if grammeme in self.cases:
                tags_set[4] = grammeme
            if grammeme in self.valency:
                tags_set[5] = grammeme
            if grammeme in self.tense:
                tags_set[6] = grammeme

            # The first part-of-speech grammeme seen wins.
            if grammeme in self.pos and tags_set[0] is None:
                tags_set[0] = grammeme

            # Compare by value: the previous ``is "Anph"`` / ``is "Prdx"``
            # identity checks only succeed for interned strings, so these
            # overrides were effectively dead code.
            if grammeme == "Anph":
                # anaphoric word -> treat as a pronoun
                tags_set[0] = "NPRO"
            if grammeme == "Prdx":
                # word usable as a predicative
                tags_set[0] = "PRED"

        result.append(tags_set)

    return result
1], self.nodes[i]) 140 | 141 | if (rule != None and (self.grammar[rule][:3] == 'ADJ' or self.grammar[rule][:3] == 'PRT')): 142 | self.unite_nodes(self.nodes[i - 1], self.nodes[i], self.grammar[rule]) 143 | self.reduce_ADJ() 144 | 145 | rule = None 146 | 147 | if (i + 1 < len(self.nodes)): 148 | j = i + 1 149 | if (self.nodes[i + 1].tag == "CONJ" and i + 2 < len(self.nodes)): 150 | if (self.nodes[i + 2].tag[:3] == "ADJ" and self.nodes[i].tag[:3] == "ADJ"): 151 | j = i + 2 152 | else: 153 | rule2 = None 154 | if (self.nodes[i + 2].tag[:3] == "PRT" and self.nodes[i].tag[:3] == "PRT"): 155 | j = i + 2 156 | else: 157 | rule2 = None 158 | 159 | rule = self.find_rule(self.nodes[i], self.nodes[j]) 160 | 161 | if (rule != None and (self.grammar[rule][:3] == 'ADJ' or self.grammar[rule][:3] == 'PRT')): 162 | self.unite_nodes(self.nodes[i], self.nodes[j], self.grammar[rule]) 163 | self.reduce_ADJ() 164 | 165 | def reduce_NP(self): 166 | for i, node in enumerate(self.nodes): 167 | if (self.nodes[i].tag[:2] == "NP"): 168 | 169 | rule = None 170 | 171 | if (i > 0): 172 | rule = self.find_rule(self.nodes[i - 1], self.nodes[i]) 173 | 174 | if (rule != None and self.grammar[rule][:2] == 'NP'): 175 | self.unite_nodes(self.nodes[i - 1], self.nodes[i], self.grammar[rule]) 176 | self.reduce_NP() 177 | 178 | rule = None 179 | 180 | if (i + 1 < len(self.nodes)): 181 | j = i + 1 182 | if (self.nodes[i + 1].tag == "CONJ" and i + 2 < len(self.nodes)): 183 | if (self.nodes[i + 2].tag[:2] == "NP" and self.nodes[i].tag[:2] == "NP"): 184 | j = i + 2 185 | else: 186 | rule2 = None 187 | 188 | rule = self.find_rule(self.nodes[i], self.nodes[j]) 189 | 190 | if (rule != None and self.grammar[rule][:2] == 'NP'): 191 | self.unite_nodes(self.nodes[i], self.nodes[j], self.grammar[rule]) 192 | self.reduce_NP() 193 | 194 | def reduce_PP(self): 195 | for i, node in enumerate(self.nodes): 196 | if (self.nodes[i].tag == "PREP" or \ 197 | self.nodes[i].tag == "PP"): 198 | 199 | rule = None 200 | 201 | if (i 
> 0): 202 | rule = self.find_rule(self.nodes[i - 1], self.nodes[i]) 203 | 204 | if (rule != None and self.grammar[rule][:2] == 'PP'): 205 | self.unite_nodes(self.nodes[i - 1], self.nodes[i], self.grammar[rule]) 206 | self.reduce_PP() 207 | 208 | rule = None 209 | 210 | if (i + 1 < len(self.nodes)): 211 | j = i + 1 212 | if (self.nodes[i + 1].tag == "CONJ" and i + 2 < len(self.nodes)): 213 | if (self.nodes[i + 2].tag[:2] == "PP"): 214 | j = i + 2 215 | else: 216 | rule2 = None 217 | 218 | rule = self.find_rule(self.nodes[i], self.nodes[j]) 219 | 220 | if (rule != None and self.grammar[rule][:2] == 'PP'): 221 | self.unite_nodes(self.nodes[i], self.nodes[j], self.grammar[rule]) 222 | self.reduce_PP() 223 | 224 | def reduce_S(self): 225 | for i, node in enumerate(self.nodes): 226 | for j in range(i + 1, len(self.nodes)): 227 | if (i < len(self.nodes) and j < len(self.nodes)): 228 | rule = self.find_rule(self.nodes[i], self.nodes[j]) 229 | if (rule != None and self.grammar[rule][:2] == 'S' and self.agreement(self.nodes[i], 230 | self.nodes[j])): 231 | self.unite_nodes(self.nodes[i], self.nodes[j], self.grammar[rule]) 232 | self.reduce_S() 233 | for i, node in enumerate(self.nodes): 234 | if (node.tag == "CONJ"): 235 | self.nodes.remove(node) 236 | 237 | def create_root(self, subtrees): 238 | self.root = Root() 239 | for subtree in subtrees: 240 | for node in subtree.nodes: 241 | if (node.tag != 'S' and node.tag[:2] != 'VP'): 242 | return False 243 | else: 244 | # complete sentence 245 | if (node.tag == 'S'): 246 | self.root.sentences.append(node) 247 | # incomplete sentence 248 | if (node.tag[:2] == "VP"): 249 | self.root.sentences.append(node) 250 | if (len(self.root.sentences) == 0): 251 | return False 252 | else: 253 | return True 254 | 255 | def _display(self, node, space): 256 | if (node): 257 | if (node.tag == "S"): 258 | print(node.tag) 259 | if (node.tag != "S" and not node.leaf): 260 | print(" " * space + node.tag) 261 | if (node.leaf): 262 | print(' ' * space 
def create_rules(lhs, rhs):
    """Map every alternative of a production's right-hand side to its left-hand side.

    Args:
        lhs: Left-hand side symbol of the production, e.g. ``"PP"``.
        rhs: Right-hand side text whose alternatives are separated by ``|``.

    Returns:
        dict whose keys are the whitespace-stripped alternatives and whose
        values are all ``lhs``.
    """
    rules = {}
    # Split on the bare bar, not ' | ': an alternative written without a
    # space before the bar ("A| B") must still be separated.
    for product in rhs.split('|'):
        product = product.strip()
        if product:  # an empty alternative is not a usable lookup key
            rules[product] = lhs
    return rules


def load_grammar(file_path):
    """Read a grammar file into a ``{right-hand side: left-hand side}`` dict.

    The file holds one production per line in the form
    ``LHS -> alt1 | alt2 | ...``. Blank lines (including a trailing newline
    at the end of the file) are ignored.

    Args:
        file_path: Path of the grammar text file.

    Returns:
        dict mapping each alternative to the left-hand side that produces
        it; when the same alternative occurs under several productions the
        last one in the file wins.

    Raises:
        ValueError: if a non-blank line contains no ``->``.
    """
    result = {}
    # Explicit encoding keeps the result independent of the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                continue
            lhs, arrow, rhs = line.partition('->')
            if not arrow:
                raise ValueError(
                    "line %d of %s is not a production: %r" % (number, file_path, line)
                )
            result.update(create_rules(lhs.strip(), rhs))
    return result