├── .github └── FUNDING.yml ├── .gitignore ├── .travis.yml ├── CITATION.cff ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py ├── syntaxmaker ├── __init__.py ├── adposition_tool.py ├── converter.py ├── data │ ├── postpositions.csv │ └── prepositions.csv ├── grammar.json ├── head.py ├── inflector.py ├── locative_case.json ├── noun_tool.py ├── phrase.py ├── pronoun_tool.py ├── syntax_maker.py ├── ud_map.json ├── verb_valence.py └── verb_valences_new.json ├── test ├── 100verbs.txt ├── generate_sentences.py ├── results.csv ├── results.xlsx └── wiktionary_verbs.py ├── testi.py └── travis_test.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [mikahama] 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | *.pyc 3 | .idea/* 4 | syntaxmaker.egg-info/* 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | # command to install dependencies 5 | install: 6 | - pip install python-coveralls 7 | - python setup.py install 8 | # command to run tests 9 | script: 10 | - coverage run travis_test.py 11 | after_success: 12 | - coveralls 13 | 14 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Syntax Maker 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 
9 | type: software 10 | authors: 11 | - given-names: Mika 12 | family-names: Hämäläinen 13 | - given-names: Jack 14 | family-names: Rueter 15 | identifiers: 16 | - type: doi 17 | value: 10.5281/zenodo.1143056 18 | description: Zenodo 19 | repository-code: 'https://github.com/mikahama/syntaxmaker' 20 | date-released: '2018-01-09' 21 | preferred-citation: 22 | type: article 23 | authors: 24 | - family-names: "Hämäläinen" 25 | given-names: "Mika" 26 | - family-names: "Rueter" 27 | given-names: "Jack" 28 | journal: "Proceedings of the Fourth International Workshop on Computational Linguistics of Uralic Languages" 29 | title: "Development of an Open Source Natural Language Generation Tool for Finnish" 30 | year: 2018 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015-2019 Mika Hämäläinen 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include DESCRIPTION.rst 2 | 3 | # Include the test suite (FIXME: does not work yet) 4 | # recursive-include tests * 5 | 6 | # If using Python 2.6 or less, then have to include package data, even though 7 | # it's already declared in setup.py 8 | include verb_valences_new.json 9 | include data/postpositions.csv 10 | include data/prepositions.csv 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Syntax maker 2 | ======= 3 | 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3483626.svg)](https://doi.org/10.5281/zenodo.3483626) 5 | 6 | The NLG tool for Finnish by [Mika Hämäläinen](https://mikakalevi.com) 7 | 8 | Syntax maker is the natural language generation tool for generating syntactically 
correct sentences in Finnish automatically. The tool is especially useful in the case of Finnish which has such a high diversity in its morphosyntax. All you need to know are the lemmas and their parts-of-speech and syntax maker will take care of the rest. 9 | 10 | For instance, just throw in words `rantaleijona`, `uneksia`, `korkea` and `aalto` and you will get `rantaleijonat uneksivat korkeista aalloista`. So you will get the morphology right automatically! Don't believe me? [Just take a look at this tutorial to find out how.](https://github.com/mikahama/syntaxmaker/wiki/Creating-a-sentence,-the-basics) 11 | 12 | # Installing 13 | Run 14 | 15 | pip install syntaxmaker 16 | python -m uralicNLP.download -l fin 17 | 18 | 19 | # Usage 20 | 21 | An example for generating a sentence in Finnish: 22 | 23 | from syntaxmaker.syntax_maker import * 24 | vp = create_verb_pharse("antaa") 25 | subject = create_phrase("NP", "hevonen", {"NUM": "PL"}) 26 | 27 | dobject = create_phrase("NP", "lahja", {"NUM": "PL"}) 28 | dobject.components["attribute"] = create_phrase("AP", "mahtava") 29 | dobject.components["attribute"].components["attribute"] = create_phrase("AdvP", "erittäin") 30 | 31 | indobject = create_phrase("NP", "lehmä") 32 | vp.components["subject"] = subject 33 | vp.components["dir_object"] = dobject 34 | vp.components["indir_object"] = indobject 35 | print(vp) 36 | >> hevoset antavat erittäin mahtavia lahjoja lehmälle 37 | 38 | Go to [Creating a sentence, the basics](https://github.com/mikahama/syntaxmaker/wiki/Creating-a-sentence,-the-basics) for a quick start guide. 39 | 40 | A good source of example code with the expected output can be found in [the Travis test file](https://github.com/mikahama/syntaxmaker/blob/master/travis_test.py). 41 | 42 | Don't forget to [read the Wiki](https://github.com/mikahama/syntaxmaker/wiki) for more instructions. 
43 | 44 | # Cite 45 | 46 | If you use Syntax Maker in any academic publication, please cite it as follows: 47 | 48 | Hämäläinen, Mika and Rueter, Jack 2018. [Development of an Open Source Natural Language Generation Tool for Finnish](http://aclweb.org/anthology/W18-0205). In *Proceedings of the Fourth International Workshop on Computational Linguistics of Uralic Languages*, 51–58. 49 | 50 | # More information? 51 | 52 | Just go ahead and [take a look at the wiki](https://github.com/mikahama/syntaxmaker/wiki) or my [blog post about Syntax maker](https://mikalikes.men/create-finnish-sentences-computationally-in-python-nlg/). 53 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 5 | universal=1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """A setuptools based setup module. 3 | See: 4 | https://packaging.python.org/en/latest/distributing.html 5 | https://github.com/pypa/sampleproject 6 | """ 7 | 8 | # Always prefer setuptools over distutils 9 | from setuptools import setup, find_packages 10 | # To use a consistent encoding 11 | from codecs import open 12 | from os import path 13 | 14 | here = path.abspath(path.dirname(__file__)) 15 | 16 | # Get the long description from the relevant file 17 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | setup( 21 | name='syntaxmaker', 22 | 23 | # Versions should comply with PEP440. 
For a discussion on single-sourcing 24 | # the version across setup.py and the project code, see 25 | # https://packaging.python.org/en/latest/single_source_version.html 26 | version='2.0.1', 27 | zip_safe=False, 28 | description='The NLG tool for Finnish', 29 | long_description=long_description, 30 | long_description_content_type="text/markdown", 31 | # The project's main homepage. 32 | url='https://github.com/mikahama/syntaxmaker/', 33 | 34 | # Author details 35 | author='Mika Hämäläinen', 36 | author_email='mika@rootroo.com', 37 | 38 | # Choose your license 39 | license='Apache License, Version 2.0', 40 | 41 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 42 | classifiers=[ 43 | # How mature is this project? Common values are 44 | # 3 - Alpha 45 | # 4 - Beta 46 | # 5 - Production/Stable 47 | 'Development Status :: 5 - Production/Stable', 48 | 49 | # Indicate who your project is intended for 50 | 'Intended Audience :: Developers', 51 | 'Topic :: Text Processing', 52 | "Natural Language :: Finnish", 53 | 54 | # Specify the Python versions you support here. In particular, ensure 55 | # that you indicate whether you support Python 2, Python 3 or both. 56 | 'Programming Language :: Python :: 3', 57 | 58 | ], 59 | 60 | # What does your project relate to? 61 | keywords='NLG Finnish', 62 | 63 | # You can just specify the packages manually here if your project is 64 | # simple. Or you can use find_packages(). 65 | packages=["syntaxmaker"], 66 | package_dir={'syntaxmaker': 'syntaxmaker'}, 67 | 68 | # List run-time dependencies here. These will be installed by pip when 69 | # your project is installed. For an analysis of "install_requires" vs pip's 70 | # requirements files see: 71 | # https://packaging.python.org/en/latest/requirements.html 72 | install_requires=["uralicNLP>=1.2.2"], 73 | 74 | # List additional groups of dependencies here (e.g. development 75 | # dependencies). 
You can install these using the following syntax, 76 | # for example: 77 | # $ pip install -e .[dev,test] 78 | extras_require={}, 79 | 80 | # If there are data files included in your packages that need to be 81 | # installed, specify them here. If using Python 2.6 or less, then these 82 | # have to be included in MANIFEST.in as well. 83 | package_data={ 84 | 'syntaxmaker': ['verb_valences_new.json', 'data/*.csv', '*.json'], 85 | }, 86 | 87 | # Although 'package_data' is the preferred approach, in some case you may 88 | # need to place data files outside of your packages. See: 89 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 90 | # In this case, 'data_file' will be installed into '/my_data' 91 | data_files=[], 92 | 93 | # To provide executable scripts, use entry points in preference to the 94 | # "scripts" keyword. Entry points provide cross-platform support and allow 95 | # pip to create the appropriate form of executable for the target platform. 96 | entry_points={}, 97 | ) 98 | -------------------------------------------------------------------------------- /syntaxmaker/__init__.py: -------------------------------------------------------------------------------- 1 | import os, codecs 2 | import json 3 | from uralicNLP import uralicApi 4 | 5 | valence_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'verb_valences_new.json') 6 | valences = json.load(codecs.open(valence_path, "r", encoding="utf-8")) 7 | 8 | locative_cases_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'locative_case.json') 9 | locative_cases = json.load(codecs.open(locative_cases_path, "r", encoding="utf-8")) 10 | 11 | if not uralicApi.is_language_installed("fin"): 12 | print("Finnish morphology is missing\nStarting download... 
def get_an_adposition():
    """Return a randomly chosen adposition.

    A fair coin first decides between the postposition table and the
    preposition table; one key of the selected table (the adposition
    itself) is then drawn at random.

    Returns:
        A key of ``postpositions`` or ``prepositions``.

    Raises:
        IndexError: if the selected table is empty.
    """
    pool = postpositions if random.choice([True, False]) else prepositions
    # random.choice() indexes its argument by integer position, which raises
    # KeyError on a dict keyed by strings; draw from a list of the keys.
    return random.choice(list(pool))
pos == "VERB": 17 | pass 18 | elif pos == "ADP": 19 | return create_adposition_phrase(UD_node.lemma, np=None) 20 | elif pos == "AUX": 21 | pass 22 | elif pos == "CCONJ": 23 | pass 24 | elif pos == "DET": 25 | pass 26 | elif pos == "NUM": 27 | pass 28 | elif pos == "PART": 29 | pass 30 | elif pos == "PRON": 31 | pass 32 | elif pos == "SCONJ": 33 | pass 34 | elif pos == "PUNCT" or pos == "SYM" or pos == "X" or pos == "INTJ": 35 | return create_phrase("GENERIC_P", UD_node.lemma) 36 | 37 | def _noun_morphology(UD_node): 38 | ud_morphs = UD_node.feats.split("|") 39 | morphology = {} 40 | psor_n = None 41 | psor_p = None 42 | for ud_morph in ud_morphs: 43 | if ud_morph.startswith("Case="): 44 | morphology["CASE"] = ud_morph.replace("Case=", "") 45 | elif ud_morph == "Number=Sing": 46 | morphology["NUM"] = "Sg" 47 | elif ud_morph == "Number=Plur": 48 | morphology["NUM"] = "Pl" 49 | elif "Number[psor]" in ud_morph: 50 | if "Sing" in ud_morph: 51 | psor_n = "Sg" 52 | else: 53 | psor_n = "Pl" 54 | elif "Person[psor]" in ud_morph: 55 | psor_p = ud_morph[-1] 56 | elif ud_morph == "Degree=Cmp": 57 | morphology["DEGREE"] = "Comp" 58 | elif ud_morph == "Degree=Sup": 59 | morphology["DEGREE"] = "Superl" 60 | if psor_n is not None and psor_p is not None: 61 | morphology["POSS"] = "Px" + psor_n + psor_p 62 | return morphology 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /syntaxmaker/data/postpositions.csv: -------------------------------------------------------------------------------- 1 | aikana,Gen 2 | alapuolella,Gen 3 | alapuolelle,Gen 4 | alapuolelta,Gen 5 | ali,Gen 6 | alla,Gen 7 | alle,Gen 8 | asti,Ill 9 | edelle,Gen 10 | edellä,Gen 11 | perässä,Gen 12 | edessä,Gen 13 | johdosta,Gen 14 | jälkeen,Gen 15 | kanssa,Gen 16 | kautta,Gen 17 | edestä,Gen 18 | eteen,Gen 19 | kesken,Gen 20 | taakse,Gen 21 | kohtaan,Par 22 | kohti,Par 23 | lisäksi,Gen 24 | luo,Gen 25 | luokse,Gen 26 | luona,Gen 27 | 
luota,Gen 28 | lähtien,Ela 29 | läpi,Gen 30 | mennessä,Ill 31 | mukaan,Gen 32 | mukaisesti,Gen 33 | myötä,Gen 34 | nähden,Gen 35 | oheen,Gen 36 | ohella,Gen 37 | ohelle,Gen 38 | ohelta,Gen 39 | ohessa,Gen 40 | ohesta,Gen 41 | perästä,Gen 42 | perään,Gen 43 | poikitse,Gen 44 | puolesta,Gen 45 | päin,Par 46 | päähän,Gen 47 | päällä,Gen 48 | päältä,Gen 49 | päästä,Gen 50 | sijaan,Gen 51 | sisäpuolella,Gen 52 | sisäpuolelta,Gen 53 | sisäpuolelle,Gen 54 | suhteen,Gen 55 | takaa,Gen 56 | takana,Gen 57 | tykö,Gen 58 | tykönä,Gen 59 | tyköä,Gen 60 | ulkopuolella,Gen 61 | ulkopuolelle,Gen 62 | ulkopuolelta,Gen 63 | varrella,Gen 64 | varten,Par 65 | vastaan,Par 66 | vasten,Par 67 | verran,Gen 68 | viereen,Gen 69 | vierelle,Gen 70 | vierellä,Gen 71 | viereltä,Gen 72 | vieressä,Gen 73 | vierestä,Gen 74 | vuoksi,Gen 75 | välillä,Gen 76 | välissä,Gen 77 | välistä,Gen 78 | yläpuolella,Gen 79 | yläpuolelle,Gen 80 | yläpuolelta,Gen 81 | ympäri,Gen 82 | ympärillä,Gen 83 | -------------------------------------------------------------------------------- /syntaxmaker/data/prepositions.csv: -------------------------------------------------------------------------------- 1 | ennen,Par 2 | ilman,Par 3 | keskelle,Par 4 | keskellä,Par 5 | keskeltä,Par 6 | kohti,Par 7 | loitolla,Ela 8 | loitolle,Ela 9 | loitolta,Ela 10 | päin,Par 11 | riippumatta,Ela 12 | vastoin,Par 13 | ylle,Gen 14 | ympäri,Par 15 | -------------------------------------------------------------------------------- /syntaxmaker/grammar.json: -------------------------------------------------------------------------------- 1 | { 2 | "VP0": { 3 | "components": null, 4 | "head": "V" 5 | }, 6 | "VP1": { 7 | "components": { 8 | "subject": "NP" 9 | }, 10 | "order": ["subject", "head"], 11 | "head": "V", 12 | "agreement": { 13 | "subject": ["PERS", "NUM"] 14 | } 15 | }, 16 | "VP_COPULA": { 17 | "components": { 18 | "subject": "NP", 19 | "predicative": "NP" 20 | }, 21 | "order": ["subject", "head", "predicative"], 22 | "head": "V", 23 
| "agreement": { 24 | "subject": ["PERS", "NUM"] 25 | } 26 | }, 27 | "VP2": { 28 | "components": { 29 | "subject": "NP", 30 | "dir_object": "NP" 31 | }, 32 | "order": ["subject", "head", "dir_object"], 33 | "head": "V", 34 | "agreement": { 35 | "subject": ["PERS", "NUM"] 36 | } 37 | }, 38 | "VP3": { 39 | "components": { 40 | "subject": "NP", 41 | "dir_object": "NP", 42 | "indir_object": "NP" 43 | }, 44 | "order": ["subject", "head", "dir_object", "indir_object"], 45 | "head": "V", 46 | "agreement": { 47 | "subject": ["PERS", "NUM"] 48 | } 49 | }, 50 | "GENERIC_P":{ 51 | "components": null, 52 | "head": "GENERIC", 53 | "order": ["head"] 54 | }, 55 | "NP": { 56 | "components": { 57 | "attribute": "AP*" 58 | }, 59 | "order": ["attribute", "head"], 60 | "head": "N" 61 | }, 62 | "AP": { 63 | "components": { 64 | "attribute": "AdvP?" 65 | }, 66 | "order": ["attribute", "head"], 67 | "head": "A", 68 | "agreement": { 69 | "parent": ["CASE", "NUM"] 70 | } 71 | }, 72 | "AdvP": { 73 | "components": { 74 | "attribute": "AdvP?" 75 | }, 76 | "order": ["attribute", "head"], 77 | "head": "Adv" 78 | }, 79 | "PrepP":{ 80 | "components": { 81 | "complement": "NP" 82 | }, 83 | "order": ["head", "complement"], 84 | "head": "Prep" 85 | }, 86 | "PostP":{ 87 | "components": { 88 | "complement": "NP" 89 | }, 90 | "order": ["complement", "head"], 91 | "head": "Post" 92 | } 93 | 94 | } -------------------------------------------------------------------------------- /syntaxmaker/head.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | __author__ = 'Mika Hämäläinen' 3 | from . 
class Head:
    """Lexical head of a phrase: a lemma plus the part-of-speech tag used to inflect it."""

    def __init__(self, head, pos):
        """Store the lemma (``head``, may be ``None`` for a dropped head) and its POS tag."""
        self.lemma = head
        self.pos = pos

    def get_form(self, governance, agreement):
        """Return the inflected surface form of the head.

        ``governance`` and ``agreement`` are feature dictionaries; on a key
        clash the agreement value wins.  A head without a lemma renders as
        the empty string.
        """
        if self.lemma is None:
            return ""
        # Merge into a fresh dict so neither caller-owned dictionary is
        # modified by the merge or by the inflector.
        features = dict(governance)
        features.update(agreement)
        return inflector.inflect(self.lemma, self.pos, features)

    def __str__(self):
        # __str__ must return a string even when the head has no lemma.
        return "" if self.lemma is None else self.lemma
| tense = "" 50 | else: 51 | if "TENSE" not in args: 52 | tense = "+Prs" 53 | elif args["TENSE"] == "PRESENT": 54 | tense = "+Prs" 55 | elif args["TENSE"] == "PAST": 56 | tense = "+Prt" 57 | 58 | if word == "ei": 59 | ei_form = ei_forms[args["NUM"]+args["PERS"]] 60 | if "CLIT" in args and args["CLIT"] == "QST": 61 | ei_form = ei_form + "kö" 62 | return beginning + ei_form 63 | 64 | if "INF" in args: 65 | if args["INF"] == "A": 66 | #syödä, juoda 67 | return beginning + word 68 | else: 69 | #syömään, juomaan 70 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE="+voice+"][INF="+args["INF"]+"][CASE=ILL]" 71 | omorfi_query = word + "+V+Act+Inf"+args["INF"].title()+"+Sg+Ill" 72 | elif "NEG" in args and args["NEG"]: 73 | #(en) syö, juo... 74 | if "TEMPAUX" in args and args["PERS"] == "4" and tense == "+Prs": 75 | # ei ole syöty 76 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE=ACT][MOOD="+ args["MOOD"] +"]"+tense+"[NEG=CON]" 77 | omorfi_query = word+"+V+Pss+"+ args["MOOD"].title() +"+Prt+ConNeg" 78 | else: 79 | omorfi_query = word+"+V+"+voice.title()+"+"+args["MOOD"].title()+tense.title()+"+ConNeg" 80 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE="+voice+"][MOOD="+ args["MOOD"] +"]"+tense+ pers_string+"[NEG=CON]" 81 | else: 82 | #syön, juon 83 | if "TEMPAUX" in args and args["PERS"] == "4": 84 | #on syöty 85 | omorfi_query = word + "+V+Act+"+args["MOOD"].title()+tense+"+Sg3" 86 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE=ACT][MOOD="+ args["MOOD"] +"]"+tense+"[PERS=SG3]" 87 | elif "PERS" in args and args["PERS"] == "4": 88 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE="+voice+"][MOOD="+ args["MOOD"] +"]"+tense+pers_string +clit 89 | omorfi_query = word + "+V+Pss+"+args["MOOD"].title()+tense+"+Pe4" +clit 90 | else: 91 | omorfi_query = word + "+V+"+voice.title()+"+"+ args["MOOD"].title() +tense+ "+" + args["NUM"].title() + args["PERS"]+ clit 92 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE="+voice+"][MOOD="+ args["MOOD"] 
+"]"+tense+"[PERS="+args["NUM"]+args["PERS"]+"]" +clit 93 | elif pos == "PPron": 94 | #personal pronoun 95 | if "CASE" in args and args["CASE"] == "Gen": 96 | args["CASE"] = "ACC" 97 | if "CASE" in args and args["CASE"] == "TrueGen": 98 | args["CASE"] = "GEN" 99 | else: 100 | args["CASE"] = args["CASE"].upper() 101 | omorfi_query = word + "+Pron+Pers+"+args["NUM"].title()+args["PERS"]+"+"+ args["CASE"].title() + clit 102 | #omorfi_query = "[WORD_ID="+word+"][POS=PRONOUN][SUBCAT=PERSONAL][PERS="+args["NUM"]+args["PERS"]+"][NUM="+args["NUM"]+"][CASE="+args["CASE"]+"]" 103 | elif pos == "PastParticiple": 104 | #participle, syönyt, syöneet, syöty 105 | if args["NUM"] == "PE": 106 | #passive 107 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE=PSS][MOOD=INDV][TENSE=PAST][PERS=PE4][NEG=CON]" 108 | omorfi_query = word + "+V+Pss+PrfPrc+Sg+Nom" 109 | else: 110 | #active 111 | num = args["NUM"] 112 | #omorfi_query = "[WORD_ID="+word+"][POS=VERB][VOICE=ACT][MOOD=INDV][TENSE=PAST][NUM="+num+"][NEG=CON]" 113 | omorfi_query = word+"+V+Act+PrfPrc+"+num.title()+"+Nom" 114 | elif pos == "RelPron": 115 | case = args["CASE"] 116 | #omorfi_query = "[WORD_ID="+word+"][POS=PRONOUN][SUBCAT=RELATIVE][NUM="+args["NUM"]+"][CASE="+case+"]" 117 | omorfi_query = word + "+Pron+Rel+"+args["NUM"].title()+"+" + case.title() 118 | else: 119 | degree = "" 120 | if pos == "N": 121 | pos = "N" 122 | elif pos == "N+Prop": 123 | pass 124 | elif pos == "Adv": 125 | if "DEGREE" in args: 126 | pos = "A" 127 | args["CASE"] = "Ins" 128 | args["NUM"] = "Pl" 129 | else: 130 | pos = "A" 131 | 132 | if "CASE" not in args: 133 | args["CASE"] = "NOM" 134 | else: 135 | args["CASE"] = args["CASE"].upper() 136 | if "DEGREE" in args: 137 | degree = "+" + args["DEGREE"] 138 | possessive = "" 139 | if "POSS" in args: 140 | possessive = "+" + args["POSS"] 141 | #omorfi_query = "[WORD_ID="+word+"][POS="+pos+"][NUM="+args["NUM"]+"][CASE="+args["CASE"]+"]" 142 | omorfi_query = word +"+" +pos+ degree +"+" + 
args["NUM"].title() +"+" + args["CASE"].title() + possessive + clit 143 | word_form = _filter_generated(uralicApi.generate(omorfi_query, "fin"), word) 144 | if len(word_form) == 0: 145 | #Generation failed! 146 | if pos == "N": 147 | return inflect(beginning + "|" + word, "N+Prop", args) 148 | else: 149 | return beginning + backup_inflect(word, pos, args) 150 | else: 151 | return beginning + word_form[0][0] 152 | 153 | def _filter_generated(res, lemma): 154 | if len(res) < 2: 155 | return res 156 | for r in res: 157 | r_as = uralicApi.analyze(r[0], "fin", dictionary_forms=True) 158 | for r_a in r_as: 159 | r_a = r_a[0] 160 | if "+Use/Arch" not in r_a and "+Dial/" not in r_a and r_a.startswith(lemma): 161 | return [r] 162 | 163 | def backup_inflect(word, pos, args): 164 | if pos == "NOUN" or pos == "ADJECTIVE": 165 | #Nouns and adjectives 166 | if pronoun_tool.is_personal_pronoun(word): 167 | return word 168 | return standard_nominal_inflection(word, args["CASE"],args["NUM"]) 169 | else: 170 | return word 171 | 172 | def case_harmony(case, word): 173 | case = case_suffixes[case] 174 | if "A" in case: 175 | if has_back_vowels(word): 176 | return case.replace("A", "a") 177 | else: 178 | return case.replace("A", "ä") 179 | else: 180 | return case 181 | 182 | def has_back_vowels(word): 183 | word = word[::-1] 184 | for letter in word: 185 | if letter in back_vowels: 186 | return True 187 | elif letter in front_vowels: 188 | return False 189 | return False 190 | 191 | def standard_nominal_inflection(noun, case, number): 192 | if case not in case_suffixes: 193 | return noun 194 | 195 | last_letter = noun[-1] 196 | 197 | if case == "NOM": 198 | if number =="SG": 199 | return noun 200 | else: 201 | if last_letter not in vowels: 202 | return noun + "it" 203 | else: 204 | return noun + "t" 205 | 206 | 207 | if last_letter not in vowels: 208 | noun = noun + "i" 209 | 210 | if number == "PL" and noun[-1] == "i": 211 | if case != "PAR": 212 | noun = noun[:-1] + "ei" 213 | else: 
# Locative series -> direction -> case tag.
locative_map = {
    "external": {"in": "Ade", "to": "All", "from": "Abl"},
    "internal": {"in": "Ine", "to": "Ill", "from": "Ela"},
}


def get_locative(noun):
    """Return "external" or "internal" for ``noun``, or ``None`` if it is unknown.

    The choice compares the noun's "Ade" and "Ine" values in
    ``locative_cases``; "external" is returned only when "Ade" is strictly
    greater.
    """
    if noun not in locative_cases:
        return None
    counts = locative_cases[noun]
    return "external" if counts["Ade"] > counts["Ine"] else "internal"


def resolve_locative_case(locative_category, direction):
    """Look up the case tag for a locative category ("external"/"internal") and a direction ("in"/"to"/"from")."""
    by_direction = locative_map[locative_category]
    return by_direction[direction]
def resolve_agreement(self):
    """Collect the features this phrase's head must agree on.

    ``self.agreement`` maps a source to a list of feature names.  A source is
    ``"parent"`` (the parent phrase), ``"parent->NAME"`` (component ``NAME``
    of the parent) or the name of one of this phrase's own components.

    Returns a dict of feature -> value copied from the sources' morphology.
    If any source cannot be resolved (no parent, unknown component, or a
    component that is still an unset placeholder without morphology), the
    third person singular nominative defaults overlaid with this phrase's own
    morphology are returned instead.
    """
    forms = {}
    for key, features in self.agreement.items():
        source = None
        if key == "parent" and self.parent is not None:
            source = self.parent
        elif key.startswith("parent->") and self.parent is not None:
            source = self.parent.components.get(key[len("parent->"):])
        elif key in self.components:
            source = self.components[key]

        # Unset slots hold a plain string (the expected phrase type), which
        # has no morphology; treat them exactly like a missing source.
        morphology = getattr(source, "morphology", None)
        if morphology is None:
            fallback = {"CASE": "Nom", "NUM": "SG", "PERS": "3"}
            fallback.update(self.morphology)
            return fallback

        for feature in features:
            forms[feature] = morphology[feature]
    return forms
# Default (non-human) personal pronouns keyed by number + person.
pronouns = {"SG1" : "minä", "SG2" : "sinä", "SG3" : "se", "PL1" : "me", "PL2": "te", "PL3": "ne"}

# Third person forms used when the referent is human.
_human_third_person = {"SG3": "hän", "PL3": "he"}


def pronoun(person, human=True):
    """Return the personal pronoun for ``person`` (e.g. "SG1", "PL3").

    For a human referent the third person forms are "hän"/"he" instead of
    "se"/"ne".  Unknown keys yield ``None``.
    """
    # Compare by value: the key is usually built at run time
    # (number + person), so an identity test against a literal fails.
    if human and person in _human_third_person:
        return _human_third_person[person]
    return pronouns.get(person)


def is_personal_pronoun(p_pronoun):
    """Return True if ``p_pronoun`` is one of the personal pronoun lemmas, human forms included."""
    return p_pronoun in pronouns.values() or p_pronoun in _human_third_person.values()
def create_verb_pharse(head):
    """Build a verb phrase for the verb lemma ``head``.

    The phrase template ("VP1", "VP2" or "VP3") is chosen from the verb's
    valency count, and the most frequent direct and indirect object cases are
    stored as the governance of the "dir_object" and "indir_object" slots.
    The returned phrase has its VOICE set to "ACT".
    """
    phrase_type = verb_valence.valency_count(head)
    governance = {
        "dir_object": {
            u"CASE": verb_valence.most_frequent_case(verb_valence.verb_direct_objects(head))
        },
        "indir_object": {
            u"CASE": verb_valence.most_frequent_case(verb_valence.verb_indirect_objects(head))
        },
    }

    # Work on a shallow copy of the template: writing "governance" into the
    # shared grammar entry would leak this verb's governance into every
    # phrase later built from the same template.
    phrase_structure = dict(grammar["VP" + str(phrase_type)])
    phrase_structure["governance"] = governance
    vp = Phrase(head, phrase_structure)
    vp.morphology["VOICE"] = "ACT"
    return vp
def create_phrase(name, head, morphology=None):
    """Create a phrase of grammar type ``name`` around the lemma ``head``.

    Unknown type names fall back to the "GENERIC_P" template.  Noun phrases
    ("NP") receive the values of ``default_np_morphology`` for every feature
    not given in ``morphology``.  The ``morphology`` argument is never
    modified.
    """
    # Copy instead of using a shared mutable default: filling in defaults on
    # the argument itself would carry features over to later calls.
    morphology = {} if morphology is None else dict(morphology)
    structure = grammar[name] if name in grammar else grammar["GENERIC_P"]
    if name == "NP":
        for key, value in default_np_morphology.items():
            morphology.setdefault(key, value)
    return Phrase(head, structure, morphology)


def create_noun_phrase(head, morphology=None, number=None, case=None):
    """Create an "NP"; ``number`` and ``case``, when given, override "NUM" and "CASE" of ``morphology``."""
    # Private copy so the overrides do not leak to the caller or to later calls.
    morphology = {} if morphology is None else dict(morphology)
    if case is not None:
        morphology["CASE"] = case
    if number is not None:
        morphology["NUM"] = number
    return create_phrase("NP", head, morphology=morphology)


def create_adjective_phrase(head, morphology=None, degree=None):
    """Create an "AP"; ``degree``, when given, overrides "DEGREE" of ``morphology``."""
    # Private copy so the override does not leak to the caller or to later calls.
    morphology = {} if morphology is None else dict(morphology)
    if degree is not None:
        morphology["DEGREE"] = degree
    return create_phrase("AP", head, morphology=morphology)
def add_np_object_to_vp(vp, np, indirect=False, check_valency=False):
    """Place ``np`` into an object slot of ``vp``.

    A direct object goes to "predicative" if the phrase has that slot,
    otherwise to "dir_object"; an indirect object goes to "indir_object".
    When the slot is not part of the phrase order it is appended, unless
    ``check_valency`` is set, in which case ``ValencyException`` is raised.
    """
    if indirect:
        slot = "indir_object"
        refusal = " does not accept an indirect object"
    elif "predicative" in vp.order:
        slot = "predicative"
        refusal = " does not accept an object or a predicative"
    else:
        slot = "dir_object"
        refusal = " does not accept an object or a predicative"

    if slot not in vp.order:
        extend_allowed = check_valency == False
        if not extend_allowed:
            raise ValencyException("This verb "+str(vp.head)+refusal)
        vp.order.append(slot)
    vp.components[slot] = np
def add_relative_clause_to_np(np, realtivep, case=None, subject=False):
    """Attach the verb phrase ``realtivep`` to ``np`` as a relative clause.

    A relative pronoun phrase ("joka") is created, placed first in the
    clause, and the clause is wrapped in commas and appended to ``np`` as
    "relative_attribute".

    np        -- the antecedent noun phrase
    realtivep -- the verb phrase acting as the relative clause
    case      -- explicit case for the pronoun, e.g. päivä, jona kävelin kadulla
    subject   -- True if the pronoun is the subject of the clause,
                 e.g. kissa, joka kiipesi puuhun (case defaults to "Nom")

    With neither ``subject`` nor ``case`` the pronoun fills the first unset
    slot among "dir_object", "indir_object" and "predicative",
    e.g. talo, jonka näin.  If no such slot exists, or an explicit case is
    given, it is added to the clause as "relative_pron".
    """
    component = None
    if subject:
        component = "subject"
        if case is None:
            case = "Nom"
    elif case is None:
        for obj in ("dir_object", "indir_object", "predicative"):
            # A slot that has not been filled yet does not hold a Phrase.
            if obj in realtivep.components and not isinstance(realtivep.components[obj], Phrase):
                # Use the slot that was actually found free.
                component = obj
                break

    if component is None:
        component = "relative_pron"

    # The relative pronoun takes its number from the antecedent.
    morphology = {"NUM": np.morphology.get("NUM", "SG")}
    if case:
        morphology["CASE"] = case
    if subject:
        morphology["PERS"] = np.morphology["PERS"]
    rel_pron = create_phrase("NP", "joka", morphology)
    rel_pron.head.pos = "RelPron"

    # The pronoun belongs to the clause, not to the antecedent, and opens it.
    # The slot may not be in the clause's order yet (e.g. "relative_pron"),
    # so it is only removed when present.
    realtivep.components[component] = rel_pron
    if component in realtivep.order:
        realtivep.order.remove(component)
    realtivep.order.insert(0, component)

    # Surround the clause with commas.
    realtivep.components["comma"] = create_phrase("GENERIC_P", ",")
    realtivep.order.insert(0, "comma")
    realtivep.order.append("comma")

    np.components["relative_attribute"] = realtivep
    np.order.append("relative_attribute")
def add_possessive_to_np(np, person, number, prodrop=False, human=False, suffix=True):
    """Mark ``np`` as possessed by the given person and number.

    Unless ``suffix`` is false, or the possessor is a human third person, a
    possessive suffix tag ("Px" + number + person) is stored in the noun
    phrase's "POSS" feature.  Unless ``prodrop`` is set, a personal pronoun
    phrase with CASE "TrueGen" is put in front of the noun phrase as "det".
    """
    use_suffix = suffix and not (human and person == "3")

    owner = create_personal_pronoun_phrase(person, number, prodrop=prodrop, human=human)
    owner.morphology["CASE"] = "TrueGen"

    if use_suffix:
        np.morphology["POSS"] = "".join(["Px", number.title(), person])

    if prodrop:
        return
    np.components["det"] = owner
    np.order.insert(0, "det")
add_relative_clause_to_np(ppp, relpp) 333 | 334 | relp.components["subject"] = ppp 335 | add_relative_clause_to_np(np1,relp) 336 | 337 | vep = create_verb_pharse("juosta") 338 | vep.components["subject"] = np1 339 | 340 | np2 = create_phrase("NP", "silta") 341 | pp = create_adposition_phrase("alla", np2) 342 | 343 | add_advlp_to_vp(vep, pp) 344 | 345 | print(vep) 346 | 347 | 348 | """ 349 | -------------------------------------------------------------------------------- /syntaxmaker/ud_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "Aspect=Perf": "PrfPrc", 3 | 4 | "Connegative=Yes": "ConNeg", 5 | "Interj": "Interj", 6 | "Mood=Cnd": "Cond", 7 | "Mood=Imp": "Imprt", 8 | "Mood=Ind": "Ind", 9 | "Mood=Pot": "Pot", 10 | "N": "N", 11 | "N*": "N", 12 | "Num": "Num", 13 | "NumType=Card": "", 14 | "PUNCT": "CLB", 15 | "Pcle": "Pcle", 16 | "Person=1": "1", 17 | "Person=2": "2", 18 | "Person=3": "3", 19 | "Person[psor]=1": "1", 20 | "Person[psor]=2": "2", 21 | "Person[psor]=3": "3", 22 | "Po": "Po", 23 | "Polarity=Neg": "Neg", 24 | "Pr": "Pr", 25 | "Pron": "Pron", 26 | "PronType=Coll": "", 27 | "PronType=Dem": "", 28 | "PronType=Ind": "", 29 | "PronType=Int": "", 30 | "PronType=Prs": "", 31 | "PronType=Rcp": "", 32 | "PronType=Rel": "", 33 | "Reflex=Yes": "", 34 | "Tense=Past": "Prt", 35 | "Tense=Pres": "Prs", 36 | "V": "V", 37 | "V*": "V", 38 | "VerbForm=Fin": "", 39 | "VerbForm=Ger": "Ger", 40 | "VerbForm=Inf": "Inf", 41 | "VerbForm=Part": "PrfPrc", 42 | "VerbForm=Sup": "Sup", 43 | "Voice=Pass": "Pass", 44 | "_": "" 45 | } -------------------------------------------------------------------------------- /syntaxmaker/verb_valence.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = 'Mika Hämäläinen' 3 | import os 4 | import random 5 | import json 6 | import codecs 7 | from . 
direct_cases = {"Gen", "Par", "Ela", "Ill"}
indirect_cases = {"Ess", "Tra", "Abl", "All", "Ill"}
direct_threshold = 0.23
indirect_threshold = 0.18

stop_adverbs = ["laisesti", "näköisesti", "kuuloisesti", "kaltaisesti"]


def cases_total(verb, cases_list):
    """Return the summed counts of *verb* for the cases in *cases_list*.

    :param verb: key looked up in the module-level ``valences`` table.
    :param cases_list: container of case names to include in the sum.
    :return: the summed count, or ``0`` when the verb is not in ``valences``.
    """
    if verb not in valences:
        return 0
    cases = valences[verb]
    return sum(count for case, count in cases.items() if case in cases_list)


def all_cases_total(verb):
    """Return the indirect-case total plus the direct-case total of *verb*.

    NOTE(review): "Ill" is a member of both ``direct_cases`` and
    ``indirect_cases``, so its count is added twice here. The thresholds may
    have been tuned against this behaviour, so it is kept as is -- confirm
    whether the double count is intended.
    """
    return cases_total(verb, indirect_cases) + cases_total(verb, direct_cases)


def verb_objects(verb, cases_list, threshold):
    """Return the cases of *verb* whose share of the total exceeds *threshold*.

    :param verb: key looked up in the module-level ``valences`` table.
    :param cases_list: candidate case names.
    :param threshold: minimum ratio (exclusive) a case must reach.
    :return: dict mapping case name to its ratio of ``all_cases_total(verb)``;
        empty when the verb is unknown or has no counted occurrences.
    """
    if verb not in valences:
        return {}
    total = float(all_cases_total(verb))
    if total == 0:
        # Every relevant count is zero: no ratio can be computed, and the
        # division below would raise ZeroDivisionError.
        return {}
    object_cases = {}
    verb_cases = valences[verb]
    for case in cases_list:
        if case in verb_cases:
            ratio = verb_cases[case] / total
            if ratio > threshold:
                object_cases[case] = ratio
    return object_cases


def verb_direct_objects(verb):
    """Return the direct-object cases of *verb* that pass ``direct_threshold``."""
    return verb_objects(verb, direct_cases, direct_threshold)


def verb_indirect_objects(verb):
    """Return the indirect-object cases of *verb* that pass ``indirect_threshold``."""
    return verb_objects(verb, indirect_cases, indirect_threshold)


def valency_count(verb):
    """Return the valency of *verb*: 1 (subject only), 2 or 3.

    The decision is based on which direct and indirect object cases pass
    their thresholds for the verb.
    """
    direct_objects = verb_direct_objects(verb)
    indirect_objects = verb_indirect_objects(verb)
    if not direct_objects:
        # If the verb has no direct objects, it can only have a subject
        return 1
    if not indirect_objects:
        # The verb has a direct object but no indirect objects
        if most_frequent_case(direct_objects) == "Gen" and "Par" not in direct_objects:
            # In case of genitive, partitive cases must also be present due
            # to the Finnish syntax. If no partitive cases are present,
            # genitives obtained from bigrams may be something else than
            # direct objects.
            return 1
        return 2
    if most_frequent_case(direct_objects) == most_frequent_case(indirect_objects):
        # Direct and indirect objects would be the same, so only direct object
        return 2
    # The verb has both kinds of objects
    return 3


def most_frequent_case(case_dict):
    """Return the key of *case_dict* with the largest positive value.

    Ties are resolved in favour of the key that is iterated first.

    :return: the winning case name, or ``""`` when the dict is empty or no
        value is greater than zero.
    """
    m_case = ""
    m_count = 0
    for case, count in case_dict.items():
        if count > m_count:
            m_count = count
            m_case = case
    return m_case


def inflect_noun(noun, case):
    """Inflect *noun* in singular *case* by running ``omorfi-generate.sh``.

    The query is written to the tool's standard input instead of being spliced
    into a shell command line, so quotes or shell metacharacters in *noun*
    cannot be interpreted by a shell.

    :param noun: lemma placed in the ``WORD_ID`` field of the query.
    :param case: case name; it is upper-cased before use.
    :return: the second tab-separated field of the tool's output, or ``None``
        when the tool cannot be started, prints no such field, or the field
        still contains ``[`` (generation failed).
    """
    import subprocess  # local import: the file-level import block is not part of this block

    query = "[WORD_ID=" + noun + "][POS=NOUN][NUM=SG][CASE=" + case.upper() + "]"
    try:
        completed = subprocess.run(
            ["omorfi-generate.sh"],
            input=query + "\n",
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError:
        # Tool missing or not executable: report it like any other failure.
        return None
    fields = completed.stdout.split("\t")
    if len(fields) < 2:
        # No generated form in the output (e.g. empty output).
        return None
    word = fields[1]
    if "[" in word:
        # Fail
        return None
    return word


def inflect_objects(verb, direct_object, indirect_object=None):
    """Inflect the object nouns of *verb* in their most frequent cases.

    :param verb: verb whose valency decides which objects are inflected.
    :param direct_object: lemma of the direct object.
    :param indirect_object: optional lemma of the indirect object; it is only
        used when the verb's valency is 3.
    :return: ``[]`` when the verb takes no objects, ``[direct]`` or
        ``[direct, indirect]`` otherwise; an item is ``None`` when its
        inflection failed.
    """
    vals = valency_count(verb)
    if vals < 2:
        # The verb has no objects -> can't inflect
        return []
    direct_case = most_frequent_case(verb_direct_objects(verb))
    indirect_case = None
    if vals == 3:
        indirect_case = most_frequent_case(verb_indirect_objects(verb))
    direct = inflect_noun(direct_object, direct_case)
    if indirect_case is not None and indirect_object is not None:
        return [direct, inflect_noun(indirect_object, indirect_case)]
    return [direct]


def is_copula(verb):
    """Return ``True`` when *verb* is ``"olla"``, the only copula handled."""
    # There's only one copulative verb in Finnish
    return verb == "olla"
#encoding: utf-8
# Generate one sample sentence per verb listed in 100verbs.txt and write
# "verb;sentence;valency" rows to results.csv.
from syntaxmaker.syntax_maker import *
import codecs

results = []
# The context manager closes the verb list even if sentence generation fails.
with codecs.open("100verbs.txt", "r", encoding="utf-8") as f:
    for verb in f:
        verb = verb.rstrip("\r\n")
        if not verb:
            # Skip blank lines instead of asking for a phrase of an empty verb.
            continue
        vp = create_verb_pharse(verb)
        components = vp.components.keys()
        # The number of component slots of the verb phrase is reported as
        # the valency of the verb.
        valency = str(len(components))
        if u"subject" in components:
            vp.components["subject"] = create_phrase("NP", "lehmä")
        if u"dir_object" in components:
            vp.components["dir_object"] = create_phrase("NP", "koira")
        if u"indir_object" in components:
            vp.components["indir_object"] = create_phrase("NP", "kissa")
        phrase = vp.to_string()
        if isinstance(phrase, bytes):
            # Only byte strings need decoding; text strings have no
            # .decode() on Python 3 and are used as they are.
            phrase = phrase.decode('utf-8')
        results.append([verb, phrase, valency])

with codecs.open("results.csv", "w", encoding="utf-8") as fo:
    for result in results:
        fo.write(";".join(result) + "\n")
pystyttää;lehmä pystyttää;1 42 | laittaa;lehmä laittaa koiran kissaan;3 43 | leikata;lehmä leikkaa koiraa;2 44 | kuivata;lehmä kuivaa koiraa;2 45 | hivuttaa;lehmä hivuttaa koiraa;2 46 | jutustella;lehmä jutustelee;1 47 | pyrkiä;lehmä pyrkii;1 48 | mennä;lehmä menee;1 49 | arkailla;lehmä arkailee koiraa;2 50 | kuivua;lehmä kuivuu;1 51 | mädäntyä;lehmä mädäntyy;1 52 | vajota;lehmä vajoaa koiraan;2 53 | pestä;lehmä pesee;1 54 | hankkia;lehmä hankkii koiran;2 55 | pyytää;lehmä pyytää;1 56 | desinfioida;lehmä desinfioi koiran;2 57 | tuntea;lehmä tuntee koiraa;2 58 | poistua;lehmä poistuu koirasta;2 59 | mahtailla;lehmä mahtailee;1 60 | hinata;lehmä hinaa koiraa kissaan;3 61 | musertaa;lehmä musertaa koiran;2 62 | photoshopata;lehmä photoshopata koiraa kissaan;3 63 | esiintyä;lehmä esiintyy;1 64 | torkkua;lehmä torkkuu koiran;2 65 | pyytää;lehmä pyytää;1 66 | tyynnyttää;lehmä tyynnyttää;1 67 | lingota;lehmä linkoaa koiran kissaan;3 68 | armahtaa;lehmä armahtaa koiraa;2 69 | hypistellä;lehmä hypistelee;1 70 | kunnioittaa;lehmä kunnioittaa koiraa;2 71 | korjauttaa;lehmä korjauttaa koiran;2 72 | käsitellä;lehmä käsittelee;1 73 | pakata;lehmä pakkaa koiraan;2 74 | katsoa;lehmä katsoo koiraa;2 75 | ottaa;lehmä ottaa;1 76 | kiinnittää;lehmä kiinnittää;1 77 | hivellä;lehmä hivelee;1 78 | muuttua;lehmä muuttuu;1 79 | paistaa;lehmä paistaa koiraa;2 80 | kutsua;lehmä kutsuu;1 81 | murehduttaa;lehmä murehduttaa koiraa;2 82 | siksakata;lehmä siksakkaa koiraa kissalle;3 83 | päästä;lehmä pääsee;1 84 | juoda;lehmä juo koiraa;2 85 | mäskätä;lehmä mäskää;1 86 | käydä;lehmä käy;1 87 | osata;lehmä osaa koiraa;2 88 | täristää;lehmä täristää;1 89 | tarjota;lehmä tarjoaa koiraa;2 90 | laskea;lehmä laskee koiraa;2 91 | äristä;lehmä ärisee;1 92 | jättää;lehmä jättää;1 93 | säästää;lehmä säästää;1 94 | hankkia;lehmä hankkii koiran;2 95 | masentua;lehmä masentuu koirasta;2 96 | kurittaa;lehmä kurittaa koiraa;2 97 | suurustaa;lehmä suurustaa koiraa;2 98 | muodostaa;lehmä muodostaa;1 99 | 
import codecs
import random


def extract_verbs(lines):
    """Collect verb entries from the lines of a Wiktionary dump.

    After a line starting with ``===Verbi===`` the first following line that
    starts with ``#`` is taken as the entry; lines containing ``{{`` or ``:``
    are ignored. Markup characters ``#``, ``[`` and ``]`` are removed, anything
    from the first ``<`` on is dropped, comma-separated entries are split into
    several verbs and otherwise only the first word is kept.

    :param lines: iterable of text lines.
    :return: list of stripped, non-empty verbs without duplicates, in the
        order they were first seen.
    """
    seen_verb = False
    verbs = []
    known = set()
    for line in lines:
        if line.startswith(u"===Verbi==="):
            seen_verb = True
        elif "{{" in line or ":" in line:
            continue
        elif seen_verb and line.startswith(u"#"):
            entry = line.replace("#", "").replace("[", "").replace("]", "").replace("\n", "")
            entry = entry.strip()
            if "<" in entry:
                entry = entry.split("<")[0]
            if "," in entry:
                candidates = entry.split(",")
            elif " " in entry:
                candidates = [entry.split(" ")[0]]
            else:
                candidates = [entry]
            for candidate in candidates:
                # Strip before de-duplicating so that " juosta" and "juosta"
                # count as the same verb, and drop empty leftovers.
                candidate = candidate.strip()
                if candidate and candidate not in known:
                    known.add(candidate)
                    verbs.append(candidate)
            seen_verb = False
    return verbs


def main():
    """Read the dump, shuffle the extracted verbs and write them to verbs.txt."""
    with codecs.open("fiwiktionary-latest-pages-articles.xml", "r", encoding="utf-8") as f:
        verbs = extract_verbs(f)
    random.shuffle(verbs)
    with codecs.open("verbs.txt", "w", encoding="utf-8") as fo:
        for verb in verbs:
            fo.write(verb + "\n")


if __name__ == "__main__":
    main()
#encoding: utf-8
import unittest
from uralicNLP import uralicApi
# The Finnish models are downloaded once if they are not installed yet.
if not uralicApi.is_language_installed("fin"):
    uralicApi.download("fin",show_progress=False)
from syntaxmaker.syntax_maker import *
import codecs
import copy


class TestFSTS(unittest.TestCase):
    """Render phrases built with syntaxmaker and compare them to expected text.

    ``setUp`` builds a verb phrase for "uneksia" with a plural subject and a
    plural direct object carrying nested attributes; most tests transform a
    deep copy of it and check ``str()`` of the result.
    """

    def setUp(self):
        dreaming = create_verb_pharse("uneksia")
        lions = create_phrase("NP", "rantaleijona", {u"PERS": "3", u"NUM": "PL"})
        waves = create_phrase("NP", "aalto", {u"PERS": "3", u"NUM": "PL"})
        waves.components["attribute"] = create_phrase("AP", "korkea")
        waves.components["attribute"].components["attribute"] = create_phrase("AdvP", "erittäin")
        dreaming.components["subject"] = lions
        dreaming.components["dir_object"] = waves
        self.vp = dreaming

    # -- helpers ---------------------------------------------------------

    def _fresh_vp(self):
        """Return a deep copy of the phrase built in ``setUp``."""
        return copy.deepcopy(self.vp)

    def _assert_renders(self, phrase, expected):
        """Assert that ``str(phrase)`` equals *expected*."""
        self.assertEqual(str(phrase), expected)

    def _giving_vp(self):
        """Build the "antaa" phrase with subject, direct and indirect object."""
        giving = create_verb_pharse("antaa")
        horses = create_phrase("NP", "hevonen", {"NUM": "PL"})

        gifts = create_phrase("NP", "lahja", {"NUM": "PL"})
        gifts.components["attribute"] = create_phrase("AP", "hyvä")
        gifts.components["attribute"].components["attribute"] = create_phrase("AdvP", "erittäin")

        cow = create_phrase("NP", "lehmä")
        giving.components["subject"] = horses
        giving.components["dir_object"] = gifts
        giving.components["indir_object"] = cow
        return giving

    def _singular_dog(self):
        """Return the third person singular "koira" noun phrase."""
        return create_phrase("NP", "koira", {u"PERS": "3", u"NUM": "SG"})

    def _dog_is_animal_with(self, make_attribute):
        """Build "koira on <attribute> eläin"; *make_attribute* creates the attribute."""
        copula = create_copula_phrase()
        dog = self._singular_dog()
        animal = create_phrase("NP", "eläin")
        animal.components["attribute"] = make_attribute()
        copula.components["subject"] = dog
        copula.components["predicative"] = animal
        return copula

    # -- tests on the phrase from setUp -------------------------------------

    def test_sentence(self):
        self._assert_renders(self._fresh_vp(), "rantaleijonat uneksivat erittäin korkeista aalloista")

    def test_sentence_pass(self):
        phrase = self._fresh_vp()
        turn_vp_into_passive(phrase)
        self._assert_renders(phrase, "uneksitaan erittäin korkeista aalloista")

    def test_sentence_neg(self):
        phrase = self._fresh_vp()
        negate_verb_pharse(phrase)
        self._assert_renders(phrase, "rantaleijonat eivät uneksi erittäin korkeista aalloista")

    def test_prefect(self):
        phrase = self._fresh_vp()
        turn_vp_into_prefect(phrase)
        self._assert_renders(phrase, "rantaleijonat ovat uneksineet erittäin korkeista aalloista")

    def test_prefect_pass(self):
        phrase = self._fresh_vp()
        turn_vp_into_prefect(phrase)
        turn_vp_into_passive(phrase)
        self._assert_renders(phrase, "on uneksittu erittäin korkeista aalloista")

    def test_prefect_pass_cond(self):
        phrase = self._fresh_vp()
        turn_vp_into_prefect(phrase)
        turn_vp_into_passive(phrase)
        set_vp_mood_and_tense(phrase, mood="COND")
        self._assert_renders(phrase, "olisi uneksittu erittäin korkeista aalloista")

    def test_prefect_pass_pot(self):
        phrase = self._fresh_vp()
        turn_vp_into_prefect(phrase)
        turn_vp_into_passive(phrase)
        set_vp_mood_and_tense(phrase, mood="POTN")
        self._assert_renders(phrase, "lie uneksittu erittäin korkeista aalloista")

    def test_pot(self):
        phrase = self._fresh_vp()
        set_vp_mood_and_tense(phrase, mood="POTN")
        self._assert_renders(phrase, "rantaleijonat uneksinevat erittäin korkeista aalloista")

    def test_total_plural(self):
        self._assert_renders(self._giving_vp(), "hevoset antavat erittäin hyviä lahjoja lehmälle")

    def test_total_plural_neg(self):
        phrase = self._giving_vp()
        negate_verb_pharse(phrase)
        self._assert_renders(phrase, "hevoset eivät anna erittäin hyviä lahjoja lehmälle")

    def test_adj(self):
        adjective = create_adjective_phrase("kaunis", degree="Comp")
        self._assert_renders(adjective, "kauniimpi")

    def test_cond(self):
        phrase = self._fresh_vp()
        set_vp_mood_and_tense(phrase, mood="COND")
        self._assert_renders(phrase, "rantaleijonat uneksisivat erittäin korkeista aalloista")

    def test_imp(self):
        phrase = self._fresh_vp()
        set_vp_mood_and_tense(phrase, mood="IMPRT")
        self._assert_renders(phrase, "rantaleijonat uneksikoot erittäin korkeista aalloista")

    def test_quest(self):
        phrase = self._fresh_vp()
        turn_vp_into_question(phrase)
        self._assert_renders(phrase, "uneksivatko rantaleijonat erittäin korkeista aalloista")

    # -- relative clauses, copula and adverbials ----------------------------

    def test_rela(self):
        man = create_phrase("NP", "mies")
        watching = create_verb_pharse("katsoa")
        squirrel = create_phrase("NP", "orava")
        stalking = create_verb_pharse("vaania")
        stalking.components["subject"] = create_phrase("NP", "kissa")
        add_relative_clause_to_np(squirrel, stalking)

        watching.components["subject"] = squirrel
        add_relative_clause_to_np(man, watching)

        running = create_verb_pharse("juosta")
        running.components["subject"] = man

        bridge = create_phrase("NP", "silta")
        under_bridge = create_adposition_phrase("alla", bridge)

        add_advlp_to_vp(running, under_bridge)
        self._assert_renders(running, "mies, jota orava, jota kissa vaanii, katsoo, juoksee sillan alla")

    def test_copula_pl(self):
        copula = create_copula_phrase()
        dogs = create_phrase("NP", "koira", {u"NUM": "PL"})
        animal = create_phrase("NP", "eläin")
        copula.components["subject"] = dogs
        copula.components["predicative"] = animal
        self._assert_renders(copula, "koirat ovat eläimiä")

    def test_copula_sg(self):
        copula = create_copula_phrase()
        dog = self._singular_dog()
        animal = create_phrase("NP", "eläin")
        copula.components["subject"] = dog
        copula.components["predicative"] = animal
        self._assert_renders(copula, "koira on eläin")

    def test_adpos(self):
        copula = create_copula_phrase()
        dog = self._singular_dog()
        animal = create_phrase("NP", "eläin")
        without_animal = create_adposition_phrase("ilman", animal)
        copula.components["subject"] = dog
        add_advlp_to_vp(copula, without_animal)
        self._assert_renders(copula, "koira on ilman eläintä")

    def test_place_name(self):
        copula = create_copula_phrase()
        dog = self._singular_dog()
        place = create_phrase("NP", "Venäjä")
        copula.components["subject"] = dog
        add_advlp_to_vp(copula, place, place_type="in")
        self._assert_renders(copula, "koira on Venäjällä")

    def test_possessive_name(self):
        copula = create_copula_phrase()
        dog = self._singular_dog()
        place = create_phrase("NP", "Lontoo")
        add_possessive_to_np(place, "1", "SG")
        copula.components["subject"] = dog
        add_advlp_to_vp(copula, place, place_type="in")
        self._assert_renders(copula, "koira on minun Lontoossani")

    # -- pronouns, negation and tense ----------------------------------------

    def test_pp_acc(self):
        seeing = create_verb_pharse("nähdä")
        add_np_subject_to_vp(seeing, create_noun_phrase("hattu"))
        add_np_object_to_vp(seeing, create_personal_pronoun_phrase("1", "SG"))
        self._assert_renders(seeing, "hattu näkee minut")

    def test_neg(self):
        seeing = create_verb_pharse("nähdä")
        add_np_subject_to_vp(seeing, create_personal_pronoun_phrase("1", "SG"))
        add_np_object_to_vp(seeing, create_personal_pronoun_phrase("2", "SG"))
        negate_verb_pharse(seeing)
        turn_vp_into_question(seeing)
        self._assert_renders(seeing, "enkö minä näe sinua")

    def test_prefect_last_pass(self):
        phrase = self._fresh_vp()
        turn_vp_into_prefect(phrase)
        turn_vp_into_passive(phrase)
        set_vp_mood_and_tense(phrase, tense="PAST")
        self._assert_renders(phrase, "oli uneksittu erittäin korkeista aalloista")

    def test_prefect_last_pass_neg(self):
        phrase = self._fresh_vp()
        turn_vp_into_prefect(phrase)
        turn_vp_into_passive(phrase)
        set_vp_mood_and_tense(phrase, tense="PAST")
        negate_verb_pharse(phrase)
        self._assert_renders(phrase, "ei oltu uneksittu erittäin korkeista aalloista")

    def test_sentence_can(self):
        phrase = self._fresh_vp()
        add_auxiliary_verb_to_vp(phrase, "voida")
        self._assert_renders(phrase, "rantaleijonat voivat uneksia erittäin korkeista aalloista")

    def test_sentence_stay(self):
        phrase = self._fresh_vp()
        add_auxiliary_verb_to_vp(phrase, "jäädä")
        self._assert_renders(phrase, "rantaleijonat jäävät uneksimaan erittäin korkeista aalloista")

    # -- degrees of comparison -----------------------------------------------

    def test_adj_comp(self):
        phrase = self._dog_is_animal_with(lambda: create_adjective_phrase("hieno", degree="Comp"))
        self._assert_renders(phrase, "koira on hienompi eläin")

    def test_adj_superl(self):
        phrase = self._dog_is_animal_with(lambda: create_adjective_phrase("hieno", degree="Superl"))
        self._assert_renders(phrase, "koira on hienoin eläin")

    def test_adv_superl(self):
        phrase = self._dog_is_animal_with(lambda: create_adverb_phrase("yleinen", degree="Superl"))
        self._assert_renders(phrase, "koira on yleisimmin eläin")

    def test_adv_comp(self):
        phrase = self._dog_is_animal_with(lambda: create_adverb_phrase("yleinen", degree="Comp"))
        self._assert_renders(phrase, "koira on yleisemmin eläin")


if __name__ == '__main__':
    unittest.main()