├── tests ├── __init__.py ├── context.py └── main_test.py ├── morphemes ├── utilities │ ├── __init__.py │ └── morpheme_database │ │ └── __init__.py ├── cli │ └── __init__.py ├── config │ └── __init__.py └── __init__.py ├── conda ├── build.bat ├── build.sh └── meta.yaml ├── images └── morphemes-logo.png ├── examples └── basic.py ├── requirements.txt ├── LICENSE.txt ├── LICENSE ├── Makefile ├── .github └── workflows │ ├── python-package-conda.yml │ └── python-publish.yml ├── setup.py ├── .gitignore └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /morphemes/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conda/build.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" setup.py install 2 | if errorlevel 1 exit 1 -------------------------------------------------------------------------------- /conda/build.sh: -------------------------------------------------------------------------------- 1 | $PYTHON setup.py install # Python command to install the script. 
-------------------------------------------------------------------------------- /images/morphemes-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ecscstatsconsulting/morphemes/HEAD/images/morphemes-logo.png -------------------------------------------------------------------------------- /examples/basic.py: -------------------------------------------------------------------------------- 1 | from morphemes import Morphemes 2 | 3 | path = "../data" 4 | 5 | m = Morphemes(path) 6 | print(m.parse("organizationally")) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | morphemes~=0.0.1 2 | setuptools~=57.0.0, 3 | pandas~=1.4.1, 4 | requests~=2.27.1, 5 | tinydb~=4.7.0, 6 | openpyxl~=3.0.9, 7 | appdata~=2.1.2 -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath( 4 | os.path.join(os.path.dirname(__file__), '..'))) 5 | 6 | import morphemes -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set version = "1.0.8" %} 2 | #CURRENTLY DOES NOT WORK BECAUSE OF tinydb IS ONLY AVAILABLE ON CONDA FORGE CHANNEL 3 | package: 4 | name: morphemes 5 | version: {{ version }} 6 | 7 | source: 8 | git_url: https://github.com/ecscstatsconsulting/morphemes.git 9 | git_rev: v1.0.9 10 | 11 | build: 12 | noarch: python 13 | number: 0 14 | script: python -m pip install --no-deps --ignore-installed . 
15 | 16 | requirements: 17 | host: 18 | - python 19 | - pip 20 | - requests 21 | - pandas 22 | run: 23 | - python 24 | 25 | test: 26 | imports: 27 | - morphemes 28 | 29 | about: 30 | home: https://github.com/ecscstatsconsulting/morphemes 31 | license: MIT 32 | summary: 'A practical Python Library for identifying morphemes in the english language.' 33 | description: | 34 | A practical Python Library for identifying morphemes in the english language. 35 | dev_url: https://github.com/ecscstatsconsulting/morphemes 36 | doc_url: https://github.com/ecscstatsconsulting/morphemes#readme 37 | doc_source_url: https://github.com/ecscstatsconsulting/morphemes/blob/main/README.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ECSC, ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ECSC, ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | 3 | define HEADER 4 | ,, 5 | `7MMM. ,MMF' `7MM 6 | MMMb dPMM MM 7 | M YM ,M MM ,pW"Wq.`7Mb,od8 `7MMpdMAo. MMpMMMb. .gP"Ya `7MMpMMMb.pMMMb. 
.gP"Ya ,pP"Ybd 8 | M Mb M' MM 6W' `Wb MM' "' MM `Wb MM MM ,M' Yb MM MM MM ,M' Yb 8I `" 9 | M YM.P' MM 8M M8 MM MM M8 MM MM 8M"""""" MM MM MM 8M"""""" `YMMMa. 10 | M `YM' MM YA. ,A9 MM MM ,AP MM MM YM. , MM MM MM YM. , L. I8 11 | .JML. `' .JMML.`Ybmd9'.JMML. MMbmmd'.JMML JMML.`Mbmmd'.JMML JMML JMML.`Mbmmd' M9mmmP' 12 | MM 13 | .JMML. 14 | 15 | Authors: Enkeleda Çuko & Paul Warren 16 | endef 17 | export HEADER 18 | 19 | _header: 20 | @clear 21 | @echo "$$HEADER" 22 | 23 | _install: 24 | python setup.py install 25 | 26 | _test: 27 | @echo 28 | @echo ================= RUNNING TESTS ================= 29 | @python setup.py -q nosetests -s 30 | 31 | info: _header 32 | install: _header _install 33 | test: _header _test -------------------------------------------------------------------------------- /.github/workflows/python-package-conda.yml: -------------------------------------------------------------------------------- 1 | name: Python Package using Conda 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.9 17 | - name: Add conda to system path 18 | run: | 19 | # $CONDA is an environment variable pointing to the root of the miniconda directory 20 | echo $CONDA/bin >> $GITHUB_PATH 21 | - name: Install dependencies 22 | run: | 23 | conda env update --file environment.yml --name base 24 | - name: Lint with flake8 25 | run: | 26 | conda install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | conda install pytest 34 | pytest 35 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from pathlib import Path 3 | 4 | this_directory = Path(__file__).parent 5 | long_description = (this_directory / "README.md").read_text() 6 | 7 | setup( 8 | name='morphemes', 9 | long_description_content_type='text/markdown', 10 | long_description=long_description, 11 | keywords="morpheme, morphology, nlp", 
12 | version='1.2.0', 13 | install_requires=[ 14 | "pandas>=1.4.1", 15 | "requests>=2.27.1", 16 | "tinydb>=4.7.0", 17 | "openpyxl>=3.0.9", 18 | "click>=8.0.3", 19 | "appdata>=2.1.2", 20 | "tabulate>=0.9.0", 21 | "asciitree>=0.3.3" 22 | ], 23 | packages=[ 24 | 'morphemes', 25 | 'morphemes.config', 26 | 'morphemes.cli', 27 | 'morphemes.utilities', 28 | 'morphemes.utilities.morpheme_database' 29 | ], 30 | entry_points={ 31 | 'console_scripts': [ 32 | 'morphemes=morphemes.cli:main' 33 | ] 34 | }, 35 | url='https://github.com/ecscstatsconsulting/morphemes', 36 | license='MIT', 37 | author='Enkeleda Cuko & Paul Warren', 38 | author_email='ecsctechdepartment@gmail.com', 39 | description="""A practical Python Library for identifying morphemes in the english language.""", 40 | project_urls={ 41 | 'Documentation': 'https://github.com/ecscstatsconsulting/morphemes#readme', 42 | 'Source': 'https://github.com/ecscstatsconsulting/morphemes', 43 | 'Tracker': 'https://github.com/ecscstatsconsulting/morphemes/issues', 44 | } 45 | ) 46 | -------------------------------------------------------------------------------- /morphemes/cli/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import re 3 | from morphemes.config import Config, Settings 4 | from morphemes import Morphemes 5 | import json 6 | 7 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) 8 | 9 | @click.group(context_settings=CONTEXT_SETTINGS) 10 | def main(): 11 | """morphemes cli""" 12 | pass 13 | 14 | @main.group("config") 15 | def config(): 16 | """config settings for the morphemes package""" 17 | 18 | @config.command("list") 19 | def config_list(): 20 | """list all config settings for the morphemes package""" 21 | Config.list() 22 | 23 | 24 | @main.command("word-tree") 25 | @click.argument("input_word") 26 | def word(input_word): 27 | m = Morphemes() 28 | d = m.parse(input_word) 29 | print(json.dumps(d, sort_keys=True, indent=4)) 30 | 31 | 32 | 
@main.command("word-count") 33 | @click.argument("input_word") 34 | def word_count(input_word): 35 | m = Morphemes() 36 | c = m.count(input_word) 37 | print(c) 38 | 39 | 40 | @main.command("count") 41 | @click.argument("filename", type=click.Path(exists=True)) 42 | def count(filename): 43 | print(filename) 44 | m = Morphemes() 45 | full_text = "" 46 | with open(filename) as f: 47 | lines = f.readlines() 48 | full_text = " ".join(lines) 49 | broken_text = full_text.split(" ") 50 | words = [] 51 | count = 0 52 | for part in broken_text: 53 | parse = re.sub(r"[^\w']+", "", part, flags=re.UNICODE) 54 | if len(parse) > 0: 55 | words.append(parse.lower()) 56 | count += m.count(parse.lower()) 57 | print(count) 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | data/ 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | .idea 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .DS_Store -------------------------------------------------------------------------------- /morphemes/config/__init__.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | from enum import Enum 3 | import json 4 | from appdata import AppDataPaths 5 | from tabulate import tabulate 6 | 7 | class Settings(str, Enum): 8 | data_path = "data_path" 9 | 10 | 11 | class Config: 12 | __default_conf = { 13 | "data_path": None 14 | } 15 | 16 | __conf = __default_conf 17 | 18 | __setters = [ 19 | "data_path" 20 | ] 21 | 22 | @staticmethod 23 | def list(): 24 | Config.load() 25 | my_list = [["Key", "Value"]] 26 | for key in Config.__conf: 27 | my_list.append([key, Config.__conf[key]]) 28 | print(tabulate(my_list, tablefmt="grid")) 29 | 30 | @staticmethod 31 | def save(): 32 | data_path = 
Config.get(Settings.data_path) + '/config.json' 33 | with open(data_path, "w") as f: 34 | json.dump(Config.__conf, f, indent=2) 35 | 36 | @staticmethod 37 | def load(): 38 | data_path = Config.get(Settings.data_path) + "/config.json" 39 | if os.path.exists(data_path): 40 | with open(data_path, "r") as f: 41 | Config.__conf = json.load(f) 42 | 43 | @staticmethod 44 | def clear(): 45 | data_path = Config.get(Settings.data_path) + "/config.json" 46 | if os.path.exists(data_path): 47 | os.remove(data_path) 48 | Config.__conf = Config.__default_conf 49 | 50 | @staticmethod 51 | def get(name: Settings): 52 | if name == Settings.data_path and Config.__conf[name] is None: 53 | app_paths = AppDataPaths('morphemes') 54 | Config.__conf[name] = app_paths.app_data_path 55 | Config.save() 56 | return Config.__conf[name] 57 | 58 | @staticmethod 59 | def set(name: Settings, value): 60 | if name in Config.__setters: 61 | Config.__conf[name] = value 62 | Config.save() 63 | else: 64 | raise NameError("Name not accepted in set() method") -------------------------------------------------------------------------------- /morphemes/utilities/morpheme_database/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import pandas as pd 5 | import requests 6 | from tinydb import TinyDB, where 7 | import warnings 8 | 9 | warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl') 10 | 11 | default_morpholex_git_location = "https://github.com/hugomailhot/MorphoLex-en/raw/master/MorphoLEX_en.xlsx" 12 | 13 | 14 | def process_df(df, db): 15 | table = db.table("WORDS") 16 | rows = json.loads(df.reset_index().to_json(orient='records')) 17 | table.insert_multiple(rows) 18 | 19 | 20 | class MorphemeDatabase: 21 | def __init__(self, data_path): 22 | self.data_path = data_path 23 | MorphemeDatabase.db = None 24 | 25 | def get_excel_dictionary_path(self): 26 | filename = self.data_path + 
"/MorphoLEX_en.xlsx" 27 | return filename 28 | 29 | def get_db_path(self): 30 | filename = self.data_path + "/db.json" 31 | return filename 32 | 33 | def create_db(self): 34 | path = self.get_db_path() 35 | if os.path.exists(path): 36 | os.remove(path) 37 | db = TinyDB(path) 38 | return db 39 | 40 | def load_db(self, access_mode="w"): 41 | if MorphemeDatabase.db is None: 42 | path = self.get_db_path() 43 | if not os.path.exists(path): 44 | self.refresh() 45 | MorphemeDatabase.db = TinyDB(path, access_mode=access_mode) 46 | return MorphemeDatabase.db 47 | 48 | def get_excel(self): 49 | path = self.get_excel_dictionary_path() 50 | if os.path.exists(path): 51 | return pd.ExcelFile(self.get_excel_dictionary_path()) 52 | else: 53 | self.download_morpholex_dictionary() 54 | return pd.ExcelFile(self.get_excel_dictionary_path()) 55 | 56 | def download_morpholex_dictionary(self, url=default_morpholex_git_location): 57 | r = requests.get(url) 58 | f = open(self.get_excel_dictionary_path(), "wb") 59 | f.write(r.content) 60 | 61 | def refresh(self): 62 | print("---- Downloading Morpheme Database ----") 63 | db = self.create_db() 64 | xl = self.get_excel() 65 | sheet_names = xl.sheet_names 66 | for sheet_name in sheet_names: 67 | if re.match("^[0-9]-[0-9]-[0-9]$", sheet_name): 68 | df = xl.parse(sheet_name) 69 | process_df(df, db) 70 | 71 | def lookup(self, word): 72 | db = self.load_db("r") 73 | tbl = db.table("WORDS") 74 | result = tbl.search(where("Word").matches("^" + word + "$", flags=re.IGNORECASE)) 75 | return result 76 | -------------------------------------------------------------------------------- /morphemes/__init__.py: -------------------------------------------------------------------------------- 1 | from .utilities.morpheme_database import MorphemeDatabase 2 | from appdata import AppDataPaths 3 | from morphemes.config import Config, Settings 4 | from enum import Enum 5 | 6 | 7 | class MorphoLEXSeperatorType(Enum): 8 | PREFIX = "<" 9 | BOUND = ">" 10 | ROOT_OPEN 
= "(" 11 | ROOT_CLOSE = ")" 12 | SEGMENT_OPEN = "{" 13 | SEGMENT_CLOSE = "}" 14 | 15 | @classmethod 16 | def contains(cls, value): 17 | return value in cls._value2member_map_ 18 | 19 | @classmethod 20 | def token_name(cls, value): 21 | if value == cls.PREFIX.value: 22 | return "prefix" 23 | elif value == cls.BOUND.value: 24 | return "bound" 25 | elif value == cls.ROOT_CLOSE.value: 26 | return "root" 27 | elif value == cls.ROOT_OPEN.value: 28 | return "root" 29 | elif value == cls.SEGMENT_CLOSE.value: 30 | return "segment" 31 | elif value == cls.SEGMENT_OPEN.value: 32 | return "segment" 33 | return "undefined" 34 | 35 | class Morphemes: 36 | 37 | def __init__(self, data_path=None): 38 | if data_path is not None: 39 | Config.set(Settings.data_path, data_path) 40 | self.db = MorphemeDatabase(Config.get(Settings.data_path)) 41 | 42 | def count(self, word): 43 | morph_db_results = self.db.lookup(word) 44 | output = 0 45 | if morph_db_results is not None and len(morph_db_results) > 0: 46 | morph_db_result = morph_db_results[0] 47 | output = morph_db_result["Nmorph"] 48 | return output 49 | 50 | def __parse_segmentation(self, segmentation): 51 | output = None 52 | if segmentation is not None: 53 | in_segment = False 54 | in_root = False 55 | in_add_on = False 56 | current = '' 57 | fragments = [] 58 | cur = None 59 | for c in segmentation: 60 | if c == MorphoLEXSeperatorType.SEGMENT_OPEN.value: 61 | cur = { 62 | "children": [], 63 | "type": "free" 64 | } 65 | in_segment = True 66 | if c == MorphoLEXSeperatorType.ROOT_OPEN.value: 67 | in_root = True 68 | if (c == MorphoLEXSeperatorType.PREFIX.value or c == MorphoLEXSeperatorType.BOUND.value) and in_add_on is False: 69 | in_add_on = True 70 | elif c == MorphoLEXSeperatorType.PREFIX.value or c == MorphoLEXSeperatorType.BOUND.value: 71 | if cur is not None: 72 | cur["children"].append({ 73 | "text": current, 74 | "type": MorphoLEXSeperatorType.token_name(c) 75 | }) 76 | else: 77 | fragments.append({ 78 | "text": current, 79 | 
"type": MorphoLEXSeperatorType.token_name(c) 80 | }) 81 | current = "" 82 | in_add_on = False 83 | if c == MorphoLEXSeperatorType.SEGMENT_CLOSE.value: 84 | current = "" 85 | fragments.append(cur) 86 | cur = None 87 | in_segment = False 88 | if c == MorphoLEXSeperatorType.ROOT_CLOSE.value: 89 | if cur is not None: 90 | cur["children"].append({ 91 | "text": current, 92 | "type": "root" 93 | }) 94 | current = "" 95 | in_root = False 96 | if not MorphoLEXSeperatorType.contains(c): 97 | current = current + c 98 | 99 | output = fragments 100 | return output 101 | 102 | def parse(self, word): 103 | morph_db_results = self.db.lookup(word) 104 | output = {} 105 | if morph_db_results is not None and len(morph_db_results) > 0: 106 | morph_db_result = morph_db_results[0] 107 | output["status"] = "FOUND_IN_DATABASE" 108 | output["word"] = word 109 | output["morpheme_count"] = morph_db_result["Nmorph"] 110 | segmentation = morph_db_result["MorphoLexSegm"] 111 | fragments = self.__parse_segmentation(segmentation) 112 | if fragments is not None: 113 | output["tree"] = fragments 114 | else: 115 | output["status"] = "NOT_FOUND" 116 | output["word"] = word 117 | #not found words, ie names of people/places should be counted as 1 morpheme 118 | output["morpheme_count"] = 1 119 | return output 120 | -------------------------------------------------------------------------------- /tests/main_test.py: -------------------------------------------------------------------------------- 1 | # The test based on unittest module 2 | import unittest 3 | from .context import morphemes 4 | 5 | path = None 6 | 7 | organizationally = { 8 | "word": "organizationally", 9 | "status": "FOUND_IN_DATABASE", 10 | "morpheme_count": 5, 11 | "tree": [ 12 | { 13 | "children": [ 14 | { 15 | "text": "organ", 16 | "type": "root" 17 | }, 18 | { 19 | "text": "ize", 20 | "type": "bound" 21 | } 22 | ], 23 | "type": "free" 24 | }, 25 | { 26 | "text": "ion", 27 | "type": "bound" 28 | }, 29 | { 30 | "text": "al", 31 | 
"type": "bound" 32 | }, 33 | { 34 | "text": "ly", 35 | "type": "bound" 36 | } 37 | ] 38 | } 39 | poop = { 40 | "status": "FOUND_IN_DATABASE", 41 | "word": "poop", 42 | "morpheme_count": 1, 43 | "tree": [ 44 | { 45 | "children": [ 46 | { 47 | "text": "poop", 48 | "type": "root" 49 | } 50 | ], 51 | "type": "free" 52 | } 53 | ] 54 | } 55 | automobile = { 56 | "status": "FOUND_IN_DATABASE", 57 | "word": "automobile", 58 | "morpheme_count": 2, 59 | "tree": [ 60 | { 61 | "children": [ 62 | { 63 | "text": "auto", 64 | "type": "prefix" 65 | }, 66 | { 67 | "text": "mobile", 68 | "type": "root" 69 | } 70 | ], 71 | "type": "free" 72 | } 73 | ] 74 | } 75 | applesauce_not_found = {'morpheme_count': 1, 'status': 'NOT_FOUND', 'word': 'applesauce'} 76 | 77 | premature = { 78 | 'morpheme_count': 2, 79 | 'status': 'FOUND_IN_DATABASE', 80 | 'tree': [ 81 | { 82 | 'children': [ 83 | {'text': 'pre', 'type': 'prefix'}, 84 | {'text': 'mature', 'type': 'root'} 85 | ], 86 | "type": "free" 87 | } 88 | ], 89 | 'word': 'premature' 90 | } 91 | 92 | # may be wrong 93 | overestimating = { 94 | 'morpheme_count': 3, 95 | 'status': 'FOUND_IN_DATABASE', 96 | 'tree': [ 97 | {'text': 'over', 'type': 'prefix'}, 98 | { 99 | 'children': [ 100 | {'text': 'esteem', 'type': 'root'}, 101 | {"text": "ate", "type": "bound"} 102 | ], 103 | "type": "free" 104 | } 105 | ], 106 | 'word': 'overestimating' 107 | } 108 | 109 | 110 | class TestSingleWordMorphemeParse(unittest.TestCase): 111 | def runTest(self): 112 | print("---Single word morpheme parse---") 113 | m = morphemes.Morphemes(path) 114 | output = m.parse("organizationally") 115 | self.assertEqual(output, 116 | organizationally, 117 | "Failed parse of 'organizationally") 118 | 119 | print(" ✓ PASSED") 120 | 121 | 122 | class TestSingleWordMorphemeCount(unittest.TestCase): 123 | def runTest(self): 124 | print("") 125 | print("---Single word morpheme count---") 126 | m = morphemes.Morphemes(path) 127 | output = m.count("organizationally") 128 | 
self.assertEqual(output, 129 | 5, 130 | "Failed count of 'organizationally") 131 | 132 | print(" ✓ PASSED") 133 | 134 | 135 | class TestMultipleWordMorphemeParse(unittest.TestCase): 136 | def runTest(self): 137 | print("") 138 | print("---Multiple word morpheme parse---") 139 | m = morphemes.Morphemes(path) 140 | output = m.parse("organizationally") 141 | self.assertEqual(output, 142 | organizationally, 143 | "Failed parse of 'organizationally") 144 | output = m.parse("poop") 145 | self.assertEqual(output, 146 | poop, 147 | "Failed parse of 'poop") 148 | output = m.parse("automobile") 149 | self.assertEqual(output, 150 | automobile, 151 | "Failed parse of 'automobile") 152 | output = m.parse("premature") 153 | self.assertEqual(output, 154 | premature, 155 | "Failed parse of 'premature") 156 | output = m.parse("overestimating") 157 | self.assertEqual(output, 158 | overestimating, 159 | "Failed parse of 'overestimating") 160 | print(" ✓ PASSED") 161 | 162 | 163 | class TestMultipleWordMorphemeCount(unittest.TestCase): 164 | def runTest(self): 165 | print("") 166 | print("---Multiple word morpheme count---") 167 | m = morphemes.Morphemes(path) 168 | output = m.count("organizationally") 169 | self.assertEqual(output, 170 | 5, 171 | "Failed count of 'organizationally") 172 | output = m.count("poop") 173 | self.assertEqual(output, 174 | 1, 175 | "Failed count of 'poop") 176 | output = m.count("automobile") 177 | self.assertEqual(output, 178 | 2, 179 | "Failed count of 'automobile") 180 | print(" ✓ PASSED") 181 | 182 | 183 | class TestNotFoundMorphemeOutput(unittest.TestCase): 184 | def runTest(self): 185 | print("---Not Found morpheme test---") 186 | m = morphemes.Morphemes(path) 187 | output = m.parse("applesauce") 188 | self.assertEqual(output, 189 | applesauce_not_found, 190 | "Failed not found test using the word 'applesauce") 191 | print(" ✓ PASSED") 192 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 |
2 | 9 | 10 | 11 | 12 | 13 | 20 | [![Contributors][contributors-shield]][contributors-url] 21 | [![Forks][forks-shield]][forks-url] 22 | [![Stargazers][stars-shield]][stars-url] 23 | [![Issues][issues-shield]][issues-url] 24 | [![MIT License][license-shield]][license-url] 25 | 26 | [![Downloads](https://static.pepy.tech/personalized-badge/morphemes?period=total&units=international_system&left_color=brightgreen&right_color=blue&left_text=Downloads)](https://pepy.tech/project/morphemes) 27 | 28 | 29 | 30 | 31 |
32 |
33 | 34 | Logo 35 | 36 | 37 |

morphemes

38 | 39 |

40 | A practical Python Library for identifying morphemes in the english language. 41 |
42 | 43 | [//]: # ( Explore the docs ») 44 | 45 | [//]: # (
) 46 |
47 | 48 | [//]: # ( View Demo) 49 | 50 | [//]: # ( ·) 51 | Report Bug 52 | · 53 | Request Feature 54 |

55 |
56 | 57 | 58 | 59 | 60 |
61 | Table of Contents 62 |
    63 |
  1. 64 | About The Project 65 | 68 |
  2. 69 |
  3. 70 | Getting Started 71 | 75 |
  4. 76 |
  5. Usage
  6. 77 |
  7. Roadmap
  8. 78 |
  9. Contributing
  10. 79 |
  11. License
  12. 80 |
  13. Contact
  14. 81 |
  15. Acknowledgments
  16. 82 |
83 |
84 | 85 | 86 | 87 | 88 | ## About The Project 89 | 90 | A simple and practical solution for obtaining morpheme information 91 | for a word. The majority of the logic uses a simple lookup strategy 92 | based on the [MorphoLex-en](https://github.com/hugomailhot/MorphoLex-en) 93 | project. Unknown words (e.g. names of people and places) are each counted as 1 morpheme. 94 | This is a non-contextual solution intended to feed more complex logic for NLP. 95 | 96 |

(back to top)

97 | 98 | 99 | 100 | ### Built With 101 | 102 | * [MorphoLex-en](https://github.com/hugomailhot/MorphoLex-en) 103 | * [tinydb](https://tinydb.readthedocs.io/en/latest/) 104 | * [pandas](https://pandas.pydata.org/) 105 | 106 |

(back to top)

107 | 108 | 109 | 110 | 111 | ## Getting Started 112 | 113 | Using this library is fairly routine and easy. More detail will be added 114 | to this section as we get closer to the first release. 115 | 116 | ### Prerequisites 117 | 118 | This project was developed with Python 3.9; other versions of Python 3 119 | *should* work. 120 | 121 | ### Installation 122 | 123 | ```sh 124 | pip install morphemes 125 | ``` 126 | 127 |

(back to top)

128 | 129 | 130 | 131 | 132 | ## Usage 133 | Using the morphemes library is very simple. 134 | 1. Import the library 135 | 2. Create an instance of the `Morphemes` class 136 | 1. Optional - Specify a data path where the morphemes database will be stored. If no data path is specified, local app storage will be used. 137 | 3. Use the library by calling the `parse` function. 138 | 139 | Example: 140 | ```python 141 | from morphemes import Morphemes 142 | 143 | path = "./data" 144 | 145 | m = Morphemes(path) #Data path is optional, local storage will be used if left out. 146 | print(m.parse("organizationally")) 147 | ``` 148 | Output: 149 | ```json 150 | { 151 | "word": "organizationally", 152 | "status": "FOUND_IN_DATABASE", 153 | "morpheme_count": 5, 154 | "tree": [ 155 | { 156 | "children": [ 157 | { 158 | "text": "organ", 159 | "type": "root" 160 | }, 161 | { 162 | "text": "ize", 163 | "type": "bound" 164 | } 165 | ], 166 | "type": "free" 167 | }, 168 | { 169 | "text": "ion", 170 | "type": "bound" 171 | }, 172 | { 173 | "text": "al", 174 | "type": "bound" 175 | }, 176 | { 177 | "text": "ly", 178 | "type": "bound" 179 | } 180 | ] 181 | } 182 | ``` 183 | 184 | Type definitions: 185 | - root: Root value of the word (some words may have multiple roots, for example: milkshake) 186 | - bound: Adds to the root morphemes. Does not contribute meaning on its own. 187 | - free: A word which can be used on its own. There can be multiple free types in a single word (example: milkshake) 188 | 189 | Words which are not found are marked with status `NOT_FOUND` and will default 190 | to 1 morpheme. This will be improved in future releases. 191 | 192 | NOTE: the `data` path specified is where the morphemes library will 193 | store a database containing morphemes from [MorphoLex-en](https://github.com/hugomailhot/MorphoLex-en) 194 | along with other lookups to help properly detect morphemes. 195 | 196 |

(back to top)

197 | 198 | 199 | 200 | 201 | ## Roadmap 202 | 203 | - [X] Morpheme detection of known words 204 | - [X] Handling of common names and places (counted as 1 morpheme) 205 | - [ ] Handling of unknown words 206 | 207 | 208 | See the [open issues](https://github.com/ecscstatsconsulting/morphemes/issues) for a full list of 209 | proposed features (and known issues). 210 | 211 |

(back to top)

212 | 213 | ## Developers 214 | 215 | Clone the repo and use the Makefile to build a local version: 216 | `make install` 217 | 218 | 219 | ## Contributing 220 | 221 | Contributions are what make the open source community such an amazing 222 | place to learn, inspire, and create. Any contributions you make are 223 | **greatly appreciated**. 224 | 225 | Do you want other languages supported? Are you a fluent speaker of the 226 | language you want? Help contribute and grow this project into a more 227 | universal morpheme solution! 228 | 229 | If you have a suggestion that would make this better, please fork the repo 230 | and create a pull request. You can also simply open an issue with the tag 231 | "enhancement". Don't forget to give the project a star! Thanks again! 232 | 233 | 1. Fork the Project 234 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 235 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 236 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 237 | 5. Open a Pull Request 238 | 239 |

(back to top)

240 | 241 | 242 | 243 | 244 | ## License 245 | 246 | Distributed under the MIT License. See `LICENSE.txt` for more information. 247 | 248 |

(back to top)

249 | 250 | 251 | 252 | 253 | ## Contact 254 | 255 | ECSC, ltd - ecsctechdepartment@gmail.com 256 | 257 | Project Link: [https://github.com/ecscstatsconsulting/morphemes](https://github.com/ecscstatsconsulting/morphemes) 258 | 259 |

(back to top)

260 | 261 | 262 | 263 | 264 | ## Acknowledgments 265 | 266 | * [Enkeleda Cuko]() 267 | * [Paul Warren](https://github.com/paul0warren) 268 | 269 |

(back to top)

270 | 271 | 272 | 273 | 274 | 275 | [contributors-shield]: https://img.shields.io/github/contributors/ecscstatsconsulting/morphemes.svg?style=for-the-badge 276 | [contributors-url]: https://github.com/ecscstatsconsulting/morphemes/graphs/contributors 277 | [forks-shield]: https://img.shields.io/github/forks/ecscstatsconsulting/morphemes.svg?style=for-the-badge 278 | [forks-url]: https://github.com/ecscstatsconsulting/morphemes/network/members 279 | [stars-shield]: https://img.shields.io/github/stars/ecscstatsconsulting/morphemes.svg?style=for-the-badge 280 | [stars-url]: https://github.com/ecscstatsconsulting/morphemes/stargazers 281 | [issues-shield]: https://img.shields.io/github/issues/ecscstatsconsulting/morphemes.svg?style=for-the-badge 282 | [issues-url]: https://github.com/ecscstatsconsulting/morphemes/issues 283 | [license-shield]: https://img.shields.io/github/license/ecscstatsconsulting/morphemes.svg?style=for-the-badge 284 | [license-url]: https://github.com/ecscstatsconsulting/morphemes/blob/master/LICENSE.txt 285 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555 286 | 287 | [//]: # ([linkedin-url]: https://linkedin.com/in/linkedin_username) 288 | [//]: # ([product-screenshot]: images/screenshot.png) 289 | --------------------------------------------------------------------------------