├── tests
    ├── __init__.py
    ├── context.py
    └── main_test.py
├── morphemes
    ├── utilities
    │   ├── __init__.py
    │   └── morpheme_database
    │   │   └── __init__.py
    ├── cli
    │   └── __init__.py
    ├── config
    │   └── __init__.py
    └── __init__.py
├── conda
    ├── build.bat
    ├── build.sh
    └── meta.yaml
├── images
    └── morphemes-logo.png
├── examples
    └── basic.py
├── requirements.txt
├── LICENSE.txt
├── LICENSE
├── Makefile
├── .github
    └── workflows
    │   ├── python-package-conda.yml
    │   └── python-publish.yml
├── setup.py
├── .gitignore
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/morphemes/utilities/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/conda/build.bat:
--------------------------------------------------------------------------------
1 | "%PYTHON%" setup.py install
2 | if errorlevel 1 exit 1


--------------------------------------------------------------------------------
/conda/build.sh:
--------------------------------------------------------------------------------
1 | $PYTHON setup.py install     # Python command to install the script.


--------------------------------------------------------------------------------
/images/morphemes-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ecscstatsconsulting/morphemes/HEAD/images/morphemes-logo.png


--------------------------------------------------------------------------------
/examples/basic.py:
--------------------------------------------------------------------------------
1 | from morphemes import Morphemes
2 | 
# Directory handed to Morphemes as its data path.
data_dir = "../data"

# Build the parser once, then print the parse result for a sample word.
parser = Morphemes(data_dir)
print(parser.parse("organizationally"))


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | morphemes~=0.0.1
2 | setuptools~=57.0.0
3 | pandas~=1.4.1
4 | requests~=2.27.1
5 | tinydb~=4.7.0
6 | openpyxl~=3.0.9
7 | appdata~=2.1.2


--------------------------------------------------------------------------------
/tests/context.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
# Put the parent directory of this file at the front of sys.path so that
# `import morphemes` below resolves to the in-tree package.
_repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, _repo_root)
5 | 
6 | import morphemes


--------------------------------------------------------------------------------
/conda/meta.yaml:
--------------------------------------------------------------------------------
 1 | {% set version = "1.0.8" %}
 2 | # NOTE: this recipe currently does not work because tinydb is only available on the conda-forge channel.
 3 | package:
 4 |   name: morphemes
 5 |   version: {{ version }}
 6 | 
 7 | source:
 8 |   git_url: https://github.com/ecscstatsconsulting/morphemes.git
 9 |   git_rev: v1.0.9
10 | 
11 | build:
12 |   noarch: python
13 |   number: 0
14 |   script: python -m pip install --no-deps --ignore-installed .
15 | 
16 | requirements:
17 |   host:
18 |     - python
19 |     - pip
20 |     - requests
21 |     - pandas
22 |   run:
23 |     - python
24 | 
25 | test:
26 |   imports:
27 |     - morphemes
28 | 
29 | about:
30 |   home: https://github.com/ecscstatsconsulting/morphemes
31 |   license: MIT
32 |   summary: 'A practical Python Library for identifying morphemes in the english language.'
33 |   description: |
34 |     A practical Python Library for identifying morphemes in the english language.
35 |   dev_url: https://github.com/ecscstatsconsulting/morphemes
36 |   doc_url: https://github.com/ecscstatsconsulting/morphemes#readme
37 |   doc_source_url: https://github.com/ecscstatsconsulting/morphemes/blob/main/README.md


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 ECSC, ltd
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 ECSC, ltd.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all
 2 | 
 3 | define HEADER
 4 |                                             ,,
 5 |   `7MMM.     ,MMF'                         `7MM
 6 |    MMMb    dPMM                             MM
 7 |    M YM   ,M MM  ,pW"Wq.`7Mb,od8 `7MMpdMAo. MMpMMMb.  .gP"Ya `7MMpMMMb.pMMMb.  .gP"Ya  ,pP"Ybd
 8 |    M  Mb  M' MM 6W'   `Wb MM' "'   MM   `Wb MM    MM ,M'   Yb  MM    MM    MM ,M'   Yb 8I   `"
 9 |    M  YM.P'  MM 8M     M8 MM       MM    M8 MM    MM 8M""""""  MM    MM    MM 8M"""""" `YMMMa.
10 |    M  `YM'   MM YA.   ,A9 MM       MM   ,AP MM    MM YM.    ,  MM    MM    MM YM.    , L.   I8
11 |  .JML. `'  .JMML.`Ybmd9'.JMML.     MMbmmd'.JMML  JMML.`Mbmmd'.JMML  JMML  JMML.`Mbmmd' M9mmmP'
12 |                                    MM
13 |                                  .JMML.
14 | 
15 |   Authors: Enkeleda Çuko & Paul Warren
16 | endef
17 | export HEADER
18 | 
# Internal helper: clear the terminal and print the banner exported in $HEADER.
_header:
	@clear
	@echo "$$HEADER"

# Internal helper: install the package through setup.py.
_install:
	python setup.py install

# Internal helper: run the test suite through the setup.py `nosetests` command.
_test:
	@echo
	@echo ================= RUNNING TESTS =================
	@python setup.py -q nosetests -s

# Public targets; each one prints the banner first via _header.
info: _header
install: _header _install
test: _header _test


--------------------------------------------------------------------------------
/.github/workflows/python-package-conda.yml:
--------------------------------------------------------------------------------
 1 | name: Python Package using Conda
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build-linux:
 7 |     runs-on: ubuntu-latest
 8 |     strategy:
 9 |       max-parallel: 5
10 | 
11 |     steps:
12 |     - uses: actions/checkout@v2
13 |     - name: Set up Python 3.9
14 |       uses: actions/setup-python@v2
15 |       with:
16 |         python-version: 3.9
17 |     - name: Add conda to system path
18 |       run: |
19 |         # $CONDA is an environment variable pointing to the root of the miniconda directory
20 |         echo $CONDA/bin >> $GITHUB_PATH
21 |     - name: Install dependencies
22 |       run: |
23 |         conda env update --file environment.yml --name base
24 |     - name: Lint with flake8
25 |       run: |
26 |         conda install flake8
27 |         # stop the build if there are Python syntax errors or undefined names
28 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
29 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
30 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
31 |     - name: Test with pytest
32 |       run: |
33 |         conda install pytest
34 |         pytest
35 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | jobs:
16 |   deploy:
17 | 
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v2
22 |     - name: Set up Python
23 |       uses: actions/setup-python@v2
24 |       with:
25 |         python-version: '3.x'
26 |     - name: Install dependencies
27 |       run: |
28 |         python -m pip install --upgrade pip
29 |         pip install build
30 |     - name: Build package
31 |       run: python -m build
32 |     - name: Publish package
33 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
34 |       with:
35 |         user: __token__
36 |         password: ${{ secrets.PYPI_API_TOKEN }}
37 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from pathlib import Path
 3 | 
 4 | this_directory = Path(__file__).parent
 5 | long_description = (this_directory / "README.md").read_text()
 6 | 
 7 | setup(
 8 |     name='morphemes',
 9 |     long_description_content_type='text/markdown',
10 |     long_description=long_description,
11 |     keywords="morpheme, morphology, nlp",
12 |     version='1.2.0',
13 |     install_requires=[
14 |         "pandas>=1.4.1",
15 |         "requests>=2.27.1",
16 |         "tinydb>=4.7.0",
17 |         "openpyxl>=3.0.9",
18 |         "click>=8.0.3",
19 |         "appdata>=2.1.2",
20 |         "tabulate>=0.9.0",
21 |         "asciitree>=0.3.3"
22 |     ],
23 |     packages=[
24 |         'morphemes',
25 |         'morphemes.config',
26 |         'morphemes.cli',
27 |         'morphemes.utilities',
28 |         'morphemes.utilities.morpheme_database'
29 |     ],
30 |     entry_points={
31 |         'console_scripts': [
32 |             'morphemes=morphemes.cli:main'
33 |         ]
34 |     },
35 |     url='https://github.com/ecscstatsconsulting/morphemes',
36 |     license='MIT',
37 |     author='Enkeleda Cuko & Paul Warren',
38 |     author_email='ecsctechdepartment@gmail.com',
39 |     description="""A practical Python Library for identifying morphemes in the english language.""",
40 |     project_urls={
41 |         'Documentation': 'https://github.com/ecscstatsconsulting/morphemes#readme',
42 |         'Source': 'https://github.com/ecscstatsconsulting/morphemes',
43 |         'Tracker': 'https://github.com/ecscstatsconsulting/morphemes/issues',
44 |     }
45 | )
46 | 


--------------------------------------------------------------------------------
/morphemes/cli/__init__.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | import re
 3 | from morphemes.config import Config, Settings
 4 | from morphemes import Morphemes
 5 | import json
 6 | 
 7 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
 8 | 
 9 | @click.group(context_settings=CONTEXT_SETTINGS)
10 | def main():
11 |     """morphemes cli"""
12 |     pass
13 | 
# `config` sub-group of the root command. It has no behaviour of its own and
# only hosts the config sub-commands registered below; the docstring is the
# help text click prints for the group.
@main.group("config")
def config():
    """config settings for the morphemes package"""
17 | 
# `config list`: delegates to Config.list(), which prints the settings table.
@config.command("list")
def config_list():
    """list all config settings for the morphemes package"""
    Config.list()
22 | 
23 | 
# `word-tree INPUT_WORD`: print the parse result of a single word as JSON.
# (No docstring on purpose: click would surface it as help text.)
@main.command("word-tree")
@click.argument("input_word")
def word(input_word):
    parsed = Morphemes().parse(input_word)
    rendered = json.dumps(parsed, indent=4, sort_keys=True)
    print(rendered)
30 | 
31 | 
# `word-count INPUT_WORD`: print the morpheme count of a single word.
# (No docstring on purpose: click would surface it as help text.)
@main.command("word-count")
@click.argument("input_word")
def word_count(input_word):
    print(Morphemes().count(input_word))
38 | 
39 | 
@main.command("count")
@click.argument("filename", type=click.Path(exists=True))
def count(filename):
    """count the morphemes of all words in a text file"""
    # Echo the file name first, then the total, as before.
    print(filename)
    analyzer = Morphemes()
    with open(filename) as handle:
        full_text = handle.read()

    total = 0
    # str.split() with no argument splits on ANY whitespace run.  Splitting on
    # a literal " " left tab-separated words glued together, and the
    # punctuation filter below then merged them into a single bogus token.
    for chunk in full_text.split():
        # Drop everything except word characters and apostrophes.
        token = re.sub(r"[^\w']+", "", chunk, flags=re.UNICODE).lower()
        if token:
            total += analyzer.count(token)
    print(total)
58 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | data/
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | db.sqlite3
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # Environments
 87 | .env
 88 | .venv
 89 | env/
 90 | venv/
 91 | ENV/
 92 | env.bak/
 93 | venv.bak/
 94 | .idea
 95 | 
 96 | # Spyder project settings
 97 | .spyderproject
 98 | .spyproject
 99 | 
100 | # Rope project settings
101 | .ropeproject
102 | 
103 | # mkdocs documentation
104 | /site
105 | 
106 | # mypy
107 | .mypy_cache/
108 | 
109 | .DS_Store


--------------------------------------------------------------------------------
/morphemes/config/__init__.py:
--------------------------------------------------------------------------------
 1 | import os.path
 2 | from enum import Enum
 3 | import json
 4 | from appdata import AppDataPaths
 5 | from tabulate import tabulate
 6 | 
 7 | class Settings(str, Enum):
 8 |     data_path = "data_path"
 9 | 
10 | 
11 | class Config:
12 |     __default_conf = {
13 |         "data_path": None
14 |     }
15 | 
16 |     __conf = __default_conf
17 | 
18 |     __setters = [
19 |         "data_path"
20 |     ]
21 | 
22 |     @staticmethod
23 |     def list():
24 |         Config.load()
25 |         my_list = [["Key", "Value"]]
26 |         for key in Config.__conf:
27 |             my_list.append([key, Config.__conf[key]])
28 |         print(tabulate(my_list, tablefmt="grid"))
29 | 
30 |     @staticmethod
31 |     def save():
32 |         data_path = Config.get(Settings.data_path) + '/config.json'
33 |         with open(data_path, "w") as f:
34 |             json.dump(Config.__conf, f, indent=2)
35 | 
36 |     @staticmethod
37 |     def load():
38 |         data_path = Config.get(Settings.data_path) + "/config.json"
39 |         if os.path.exists(data_path):
40 |             with open(data_path, "r") as f:
41 |                 Config.__conf = json.load(f)
42 | 
43 |     @staticmethod
44 |     def clear():
45 |         data_path = Config.get(Settings.data_path) + "/config.json"
46 |         if os.path.exists(data_path):
47 |             os.remove(data_path)
48 |         Config.__conf = Config.__default_conf
49 | 
50 |     @staticmethod
51 |     def get(name: Settings):
52 |         if name == Settings.data_path and Config.__conf[name] is None:
53 |             app_paths = AppDataPaths('morphemes')
54 |             Config.__conf[name] = app_paths.app_data_path
55 |             Config.save()
56 |         return Config.__conf[name]
57 | 
58 |     @staticmethod
59 |     def set(name: Settings, value):
60 |         if name in Config.__setters:
61 |             Config.__conf[name] = value
62 |             Config.save()
63 |         else:
64 |             raise NameError("Name not accepted in set() method")


--------------------------------------------------------------------------------
/morphemes/utilities/morpheme_database/__init__.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import re
 4 | import pandas as pd
 5 | import requests
 6 | from tinydb import TinyDB, where
 7 | import warnings
 8 | 
# Suppress UserWarning messages raised from the openpyxl module.
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

# Default URL that download_morpholex_dictionary() fetches the spreadsheet from.
default_morpholex_git_location = "https://github.com/hugomailhot/MorphoLex-en/raw/master/MorphoLEX_en.xlsx"
12 | 
13 | 
def process_df(df, db):
    """Insert every row of *df* into the ``WORDS`` table of *db*.

    The frame's index is turned into a regular column first, and the rows are
    passed through JSON so that they arrive as a list of plain dicts.
    """
    words_table = db.table("WORDS")
    rows_as_json = df.reset_index().to_json(orient='records')
    words_table.insert_multiple(json.loads(rows_as_json))
18 | 
19 | 
class MorphemeDatabase:
    """TinyDB-backed cache of the MorphoLEX spreadsheet stored under *data_path*.

    The spreadsheet is downloaded on demand, its matching sheets are imported
    into ``db.json`` and lookups are answered from there.
    """

    # Open TinyDB handle shared by all instances; populated lazily by load_db().
    db = None

    def __init__(self, data_path):
        """Remember *data_path* and drop any previously cached handle."""
        self.data_path = data_path
        MorphemeDatabase.db = None

    def __ensure_data_dir(self):
        """Create the data directory when it does not exist yet."""
        if self.data_path:
            os.makedirs(self.data_path, exist_ok=True)

    def get_excel_dictionary_path(self):
        """Return the path of the local MorphoLEX spreadsheet."""
        return os.path.join(self.data_path, "MorphoLEX_en.xlsx")

    def get_db_path(self):
        """Return the path of the TinyDB JSON file."""
        return os.path.join(self.data_path, "db.json")

    def create_db(self):
        """Delete any existing database file and return a fresh TinyDB."""
        path = self.get_db_path()
        self.__ensure_data_dir()
        if os.path.exists(path):
            os.remove(path)
        return TinyDB(path)

    def load_db(self, access_mode="w"):
        """Return the shared TinyDB handle, building the database if missing.

        NOTE(review): *access_mode* is forwarded to TinyDB only when the handle
        is first opened; later calls return the cached handle regardless.
        The default "w" looks like it would truncate the file when TinyDB
        opens it -- lookup() passes "r"; confirm before relying on the default.
        """
        if MorphemeDatabase.db is None:
            path = self.get_db_path()
            if not os.path.exists(path):
                self.refresh()
            MorphemeDatabase.db = TinyDB(path, access_mode=access_mode)
        return MorphemeDatabase.db

    def get_excel(self):
        """Return the spreadsheet as a ``pd.ExcelFile``, downloading it if needed."""
        path = self.get_excel_dictionary_path()
        if not os.path.exists(path):
            self.download_morpholex_dictionary()
        return pd.ExcelFile(path)

    def download_morpholex_dictionary(self, url=default_morpholex_git_location):
        """Download the spreadsheet from *url* into the data directory.

        Raises:
            requests.HTTPError: if the server answers with an error status, so
                that an error page is never saved as the spreadsheet.
        """
        # (connect, read) timeouts so a dead connection cannot hang forever.
        response = requests.get(url, timeout=(10, 120))
        response.raise_for_status()
        self.__ensure_data_dir()
        with open(self.get_excel_dictionary_path(), "wb") as f:
            f.write(response.content)

    def refresh(self):
        """Rebuild ``db.json`` from the MorphoLEX spreadsheet."""
        print("---- Downloading Morpheme Database ----")
        # Open (and if necessary download) the spreadsheet BEFORE touching the
        # database: a failed download must not leave an empty db.json behind,
        # because load_db() would then never trigger another refresh.
        xl = self.get_excel()
        try:
            db = self.create_db()
            try:
                for sheet_name in xl.sheet_names:
                    # Only sheets named like "0-1-0" are imported.
                    if re.match("^[0-9]-[0-9]-[0-9]$", sheet_name):
                        process_df(xl.parse(sheet_name), db)
            finally:
                db.close()
        finally:
            xl.close()

    def lookup(self, word):
        """Return all ``WORDS`` rows whose ``Word`` equals *word*, ignoring case."""
        db = self.load_db("r")
        tbl = db.table("WORDS")
        # Escape the word so regex metacharacters in it are matched literally.
        pattern = "^" + re.escape(word) + "$"
        return tbl.search(where("Word").matches(pattern, flags=re.IGNORECASE))
76 | 


--------------------------------------------------------------------------------
/morphemes/__init__.py:
--------------------------------------------------------------------------------
  1 | from .utilities.morpheme_database import MorphemeDatabase
  2 | from appdata import AppDataPaths
  3 | from morphemes.config import Config, Settings
  4 | from enum import Enum
  5 | 
  6 | 
  7 | class MorphoLEXSeperatorType(Enum):
  8 |     PREFIX = "<"
  9 |     BOUND = ">"
 10 |     ROOT_OPEN = "("
 11 |     ROOT_CLOSE = ")"
 12 |     SEGMENT_OPEN = "{"
 13 |     SEGMENT_CLOSE = "}"
 14 | 
 15 |     @classmethod
 16 |     def contains(cls, value):
 17 |         return value in cls._value2member_map_
 18 | 
 19 |     @classmethod
 20 |     def token_name(cls, value):
 21 |         if value == cls.PREFIX.value:
 22 |             return "prefix"
 23 |         elif value == cls.BOUND.value:
 24 |             return "bound"
 25 |         elif value == cls.ROOT_CLOSE.value:
 26 |             return "root"
 27 |         elif value == cls.ROOT_OPEN.value:
 28 |             return "root"
 29 |         elif value == cls.SEGMENT_CLOSE.value:
 30 |             return "segment"
 31 |         elif value == cls.SEGMENT_OPEN.value:
 32 |             return "segment"
 33 |         return "undefined"
 34 | 
 35 | class Morphemes:
 36 | 
 37 |     def __init__(self, data_path=None):
 38 |         if data_path is not None:
 39 |             Config.set(Settings.data_path, data_path)
 40 |         self.db = MorphemeDatabase(Config.get(Settings.data_path))
 41 | 
 42 |     def count(self, word):
 43 |         morph_db_results = self.db.lookup(word)
 44 |         output = 0
 45 |         if morph_db_results is not None and len(morph_db_results) > 0:
 46 |             morph_db_result = morph_db_results[0]
 47 |             output = morph_db_result["Nmorph"]
 48 |         return output
 49 | 
 50 |     def __parse_segmentation(self, segmentation):
 51 |         output = None
 52 |         if segmentation is not None:
 53 |             in_segment = False
 54 |             in_root = False
 55 |             in_add_on = False
 56 |             current = ''
 57 |             fragments = []
 58 |             cur = None
 59 |             for c in segmentation:
 60 |                 if c == MorphoLEXSeperatorType.SEGMENT_OPEN.value:
 61 |                     cur = {
 62 |                         "children": [],
 63 |                         "type": "free"
 64 |                     }
 65 |                     in_segment = True
 66 |                 if c == MorphoLEXSeperatorType.ROOT_OPEN.value:
 67 |                     in_root = True
 68 |                 if (c == MorphoLEXSeperatorType.PREFIX.value or c == MorphoLEXSeperatorType.BOUND.value) and in_add_on is False:
 69 |                     in_add_on = True
 70 |                 elif c == MorphoLEXSeperatorType.PREFIX.value or c == MorphoLEXSeperatorType.BOUND.value:
 71 |                     if cur is not None:
 72 |                         cur["children"].append({
 73 |                             "text": current,
 74 |                             "type": MorphoLEXSeperatorType.token_name(c)
 75 |                         })
 76 |                     else:
 77 |                         fragments.append({
 78 |                             "text": current,
 79 |                             "type": MorphoLEXSeperatorType.token_name(c)
 80 |                         })
 81 |                     current = ""
 82 |                     in_add_on = False
 83 |                 if c == MorphoLEXSeperatorType.SEGMENT_CLOSE.value:
 84 |                     current = ""
 85 |                     fragments.append(cur)
 86 |                     cur = None
 87 |                     in_segment = False
 88 |                 if c == MorphoLEXSeperatorType.ROOT_CLOSE.value:
 89 |                     if cur is not None:
 90 |                         cur["children"].append({
 91 |                             "text": current,
 92 |                             "type": "root"
 93 |                         })
 94 |                     current = ""
 95 |                     in_root = False
 96 |                 if not MorphoLEXSeperatorType.contains(c):
 97 |                     current = current + c
 98 | 
 99 |             output = fragments
100 |         return output
101 | 
102 |     def parse(self, word):
103 |         morph_db_results = self.db.lookup(word)
104 |         output = {}
105 |         if morph_db_results is not None and len(morph_db_results) > 0:
106 |             morph_db_result = morph_db_results[0]
107 |             output["status"] = "FOUND_IN_DATABASE"
108 |             output["word"] = word
109 |             output["morpheme_count"] = morph_db_result["Nmorph"]
110 |             segmentation = morph_db_result["MorphoLexSegm"]
111 |             fragments = self.__parse_segmentation(segmentation)
112 |             if fragments is not None:
113 |                 output["tree"] = fragments
114 |         else:
115 |             output["status"] = "NOT_FOUND"
116 |             output["word"] = word
117 |             #not found words, ie names of people/places should be counted as 1 morpheme
118 |             output["morpheme_count"] = 1
119 |         return output
120 | 


--------------------------------------------------------------------------------
/tests/main_test.py:
--------------------------------------------------------------------------------
  1 | # The test based on unittest module
  2 | import unittest
  3 | from .context import morphemes
  4 | 
  5 | path = None
  6 | 
  7 | organizationally = {
  8 |                              "word": "organizationally",
  9 |                              "status": "FOUND_IN_DATABASE",
 10 |                              "morpheme_count": 5,
 11 |                              "tree": [
 12 |                                  {
 13 |                                      "children": [
 14 |                                          {
 15 |                                              "text": "organ",
 16 |                                              "type": "root"
 17 |                                          },
 18 |                                          {
 19 |                                              "text": "ize",
 20 |                                              "type": "bound"
 21 |                                          }
 22 |                                      ],
 23 |                                      "type": "free"
 24 |                                  },
 25 |                                  {
 26 |                                      "text": "ion",
 27 |                                      "type": "bound"
 28 |                                  },
 29 |                                  {
 30 |                                      "text": "al",
 31 |                                      "type": "bound"
 32 |                                  },
 33 |                                  {
 34 |                                      "text": "ly",
 35 |                                      "type": "bound"
 36 |                                  }
 37 |                              ]
 38 |                          }
 39 | poop = {
 40 |                              "status": "FOUND_IN_DATABASE",
 41 |                              "word": "poop",
 42 |                              "morpheme_count": 1,
 43 |                              "tree": [
 44 |                                  {
 45 |                                      "children": [
 46 |                                          {
 47 |                                              "text": "poop",
 48 |                                              "type": "root"
 49 |                                          }
 50 |                                      ],
 51 |                                      "type": "free"
 52 |                                  }
 53 |                              ]
 54 |                          }
 55 | automobile = {
 56 |                              "status": "FOUND_IN_DATABASE",
 57 |                              "word": "automobile",
 58 |                              "morpheme_count": 2,
 59 |                              "tree": [
 60 |                                  {
 61 |                                      "children": [
 62 |                                          {
 63 |                                              "text": "auto",
 64 |                                              "type": "prefix"
 65 |                                          },
 66 |                                          {
 67 |                                              "text": "mobile",
 68 |                                              "type": "root"
 69 |                                          }
 70 |                                      ],
 71 |                                      "type": "free"
 72 |                                  }
 73 |                              ]
 74 |                          }
 75 | applesauce_not_found = {'morpheme_count': 1, 'status': 'NOT_FOUND', 'word': 'applesauce'}
 76 | 
 77 | premature = {
 78 |     'morpheme_count': 2,
 79 |     'status': 'FOUND_IN_DATABASE',
 80 |     'tree': [
 81 |         {
 82 |             'children': [
 83 |                 {'text': 'pre', 'type': 'prefix'},
 84 |                 {'text': 'mature', 'type': 'root'}
 85 |             ],
 86 |             "type": "free"
 87 |         }
 88 |     ],
 89 |     'word': 'premature'
 90 | }
 91 | 
 92 | # may be wrong
 93 | overestimating = {
 94 |     'morpheme_count': 3,
 95 |     'status': 'FOUND_IN_DATABASE',
 96 |     'tree': [
 97 |         {'text': 'over', 'type': 'prefix'},
 98 |         {
 99 |             'children': [
100 |                 {'text': 'esteem', 'type': 'root'},
101 |                 {"text": "ate", "type": "bound"}
102 |             ],
103 |             "type": "free"
104 |         }
105 |     ],
106 |     'word': 'overestimating'
107 | }
108 | 
109 | 
class TestSingleWordMorphemeParse(unittest.TestCase):
    """parse() of one known word must equal the `organizationally` fixture."""

    def runTest(self):
        print("---Single word morpheme parse---")
        parsed = morphemes.Morphemes(path).parse("organizationally")
        self.assertEqual(parsed, organizationally, "Failed parse of 'organizationally")

        print("  ✓ PASSED")
120 | 
121 | 
class TestSingleWordMorphemeCount(unittest.TestCase):
    """Check the morpheme count reported for one known word."""

    def runTest(self):
        """Count the morphemes of 'organizationally' and expect 5."""
        print("")
        print("---Single word morpheme count---")
        m = morphemes.Morphemes(path)
        output = m.count("organizationally")
        # The failure message now closes the quote around the word.
        self.assertEqual(output,
                         5,
                         "Failed count of 'organizationally'")

        print("  ✓ PASSED")
133 | 
134 | 
class TestMultipleWordMorphemeParse(unittest.TestCase):
    """Check that one Morphemes instance parses several words in a row."""

    def runTest(self):
        """Parse each word and compare the result with its fixture."""
        print("")
        print("---Multiple word morpheme parse---")
        m = morphemes.Morphemes(path)
        # (word, expected parse result) pairs, checked in this order; the
        # first mismatch fails the test, exactly as the unrolled version did.
        cases = [
            ("organizationally", organizationally),
            ("poop", poop),
            ("automobile", automobile),
            ("premature", premature),
            ("overestimating", overestimating),
        ]
        for word, expected in cases:
            self.assertEqual(m.parse(word),
                             expected,
                             f"Failed parse of '{word}'")
        print("  ✓ PASSED")
161 | 
162 | 
class TestMultipleWordMorphemeCount(unittest.TestCase):
    """Check that one Morphemes instance counts several words in a row."""

    def runTest(self):
        """Count the morphemes of each word and compare with the expected value."""
        print("")
        print("---Multiple word morpheme count---")
        m = morphemes.Morphemes(path)
        # (word, expected morpheme count) pairs, checked in this order; the
        # first mismatch fails the test, exactly as the unrolled version did.
        cases = [
            ("organizationally", 5),
            ("poop", 1),
            ("automobile", 2),
        ]
        for word, expected in cases:
            self.assertEqual(m.count(word),
                             expected,
                             f"Failed count of '{word}'")
        print("  ✓ PASSED")
181 | 
182 | 
class TestNotFoundMorphemeOutput(unittest.TestCase):
    """Check the result returned for a word expected to be missing."""

    def runTest(self):
        """Parse 'applesauce' and compare it with the NOT_FOUND fixture."""
        print("---Not Found morpheme test---")
        m = morphemes.Morphemes(path)
        output = m.parse("applesauce")
        # The failure message now closes the quote around the word.
        self.assertEqual(output,
                         applesauce_not_found,
                         "Failed not found test using the word 'applesauce'")
        print("  ✓ PASSED")
192 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div id="top"></div>
  2 | <!--
  3 | *** Thanks for checking out the Best-README-Template. If you have a suggestion
  4 | *** that would make this better, please fork the repo and create a pull request
  5 | *** or simply open an issue with the tag "enhancement".
  6 | *** Don't forget to give the project a star!
  7 | *** Thanks again! Now go create something AMAZING! :D
  8 | -->
  9 | 
 10 | 
 11 | 
 12 | <!-- PROJECT SHIELDS -->
 13 | <!--
 14 | *** I'm using markdown "reference style" links for readability.
 15 | *** Reference links are enclosed in brackets [ ] instead of parentheses ( ).
 16 | *** See the bottom of this document for the declaration of the reference variables
 17 | *** for contributors-url, forks-url, etc. This is an optional, concise syntax you may use.
 18 | *** https://www.markdownguide.org/basic-syntax/#reference-style-links
 19 | -->
 20 | [![Contributors][contributors-shield]][contributors-url]
 21 | [![Forks][forks-shield]][forks-url]
 22 | [![Stargazers][stars-shield]][stars-url]
 23 | [![Issues][issues-shield]][issues-url]
 24 | [![MIT License][license-shield]][license-url]
 25 | 
 26 | [![Downloads](https://static.pepy.tech/personalized-badge/morphemes?period=total&units=international_system&left_color=brightgreen&right_color=blue&left_text=Downloads)](https://pepy.tech/project/morphemes)
 27 | 
 28 | 
 29 | 
 30 | <!-- PROJECT LOGO -->
 31 | <br />
 32 | <div align="center">
 33 |   <a href="https://github.com/ecscstatsconsulting/morphemes">
 34 |     <img src="https://raw.githubusercontent.com/ecscstatsconsulting/morphemes/main/images/morphemes-logo.png" alt="Logo" width="200" height="200">
 35 |   </a>
 36 | 
 37 | <h3 align="center">morphemes</h3>
 38 | 
 39 |   <p align="center">
 40 |     A practical Python library for identifying morphemes in the English language.
 41 |     <br />
 42 | 
 43 | [//]: # (    <a href="https://github.com/ecscstatsconsulting/morphemes"><strong>Explore the docs »</strong></a>)
 44 | 
 45 | [//]: # (    <br />)
 46 |     <br />
 47 | 
 48 | [//]: # (    <a href="https://github.com/github_username/repo_name">View Demo</a>)
 49 | 
 50 | [//]: # (    ·)
 51 |     <a href="https://github.com/ecscstatsconsulting/morphemes/issues">Report Bug</a>
 52 |     ·
 53 |     <a href="https://github.com/ecscstatsconsulting/morphemes/issues">Request Feature</a>
 54 |   </p>
 55 | </div>
 56 | 
 57 | 
 58 | 
 59 | <!-- TABLE OF CONTENTS -->
 60 | <details>
 61 |   <summary>Table of Contents</summary>
 62 |   <ol>
 63 |     <li>
 64 |       <a href="#about-the-project">About The Project</a>
 65 |       <ul>
 66 |         <li><a href="#built-with">Built With</a></li>
 67 |       </ul>
 68 |     </li>
 69 |     <li>
 70 |       <a href="#getting-started">Getting Started</a>
 71 |       <ul>
 72 |         <li><a href="#prerequisites">Prerequisites</a></li>
 73 |         <li><a href="#installation">Installation</a></li>
 74 |       </ul>
 75 |     </li>
 76 |     <li><a href="#usage">Usage</a></li>
 77 |     <li><a href="#roadmap">Roadmap</a></li>
 78 |     <li><a href="#contributing">Contributing</a></li>
 79 |     <li><a href="#license">License</a></li>
 80 |     <li><a href="#contact">Contact</a></li>
 81 |     <li><a href="#acknowledgments">Acknowledgments</a></li>
 82 |   </ol>
 83 | </details>
 84 | 
 85 | 
 86 | 
 87 | <!-- ABOUT THE PROJECT -->
 88 | ## About The Project
 89 | 
 90 | A simple and practical solution for obtaining morpheme information
 91 | for a word.  The majority of the logic uses a simple lookup strategy
 92 | based off of the [MorphoLex-en](https://github.com/hugomailhot/MorphoLex-en)
 93 | project.  Unknowns, i.e. names of people & places, are all counted as 1 morpheme.  
 94 | This is a non-contextual solution intended to feed more complex logic for NLP.
 95 | 
 96 | <p align="right">(<a href="#top">back to top</a>)</p>
 97 | 
 98 | 
 99 | 
100 | ### Built With
101 | 
102 | * [MorphoLex-en](https://github.com/hugomailhot/MorphoLex-en)
103 | * [tinydb](https://tinydb.readthedocs.io/en/latest/)
104 | * [pandas](https://pandas.pydata.org/)
105 | 
106 | <p align="right">(<a href="#top">back to top</a>)</p>
107 | 
108 | 
109 | 
110 | <!-- GETTING STARTED -->
111 | ## Getting Started
112 | 
113 | Using this library is fairly routine and easy.  More detail will be added
114 | to this section as we get closer to the first release.
115 | 
116 | ### Prerequisites
117 | 
118 | This project was developed with Python 3.9; other versions of Python 3 
119 | *should* work.
120 | 
121 | ### Installation
122 | 
123 |   ```sh
124 |   pip install morphemes
125 |   ```
126 | 
127 | <p align="right">(<a href="#top">back to top</a>)</p>
128 | 
129 | 
130 | 
131 | <!-- USAGE EXAMPLES -->
132 | ## Usage
133 | Using the morphemes library is very simple.
134 | 1. Import the library
135 | 2. Create an instance of the `Morphemes` class 
136 |    1. Optional - Specify a data path where the morphemes database will be stored.  If no data path is specified local app storage will be used.
137 | 3. Use the library by calling the `parse` function.
138 | 
139 | Example:
140 | ```python
141 | from morphemes import Morphemes
142 | 
143 | path = "./data"
144 | 
145 | m = Morphemes(path) #Data path is optional, local storage will be used if left out.
146 | print(m.parse("organizationally"))
147 | ```
148 | Output:
149 | ```json
150 | {
151 |   "word": "organizationally",
152 |   "status": "FOUND_IN_DATABASE",
153 |   "morpheme_count": 5,
154 |   "tree": [
155 |     {
156 |       "children": [
157 |         {
158 |           "text": "organ",
159 |           "type": "root"
160 |         },
161 |         {
162 |           "text": "ize",
163 |           "type": "bound"
164 |         }
165 |       ],
166 |       "type": "free"
167 |     },
168 |     {
169 |       "text": "ion",
170 |       "type": "bound"
171 |     },
172 |     {
173 |       "text": "al",
174 |       "type": "bound"
175 |     },
176 |     {
177 |       "text": "ly",
178 |       "type": "bound"
179 |     }
180 |   ]
181 | }
182 | ```
183 | 
184 | Types definition:
185 |  - root: Root value of the word.  Some words may have multiple roots (example: milkshake).
186 |  - bound: Adds to the root morphemes.  Does not contribute meaning on its own.
187 |  - free: A word which can be used on its own.  There can be multiple free types in a single word (example: milkshake).
188 | 
189 | Words which are not found are marked with status `NOT_FOUND` and will default
190 | to 1 morpheme.  This will be improved in future releases.
191 | 
192 | NOTE: the `data` path specified is where the morphemes library will
193 | store a database containing morphemes from [MorphoLex-en](https://github.com/hugomailhot/MorphoLex-en)
194 | along with other lookups to help properly detect morphemes.
195 | 
196 | <p align="right">(<a href="#top">back to top</a>)</p>
197 | 
198 | 
199 | 
200 | <!-- ROADMAP -->
201 | ## Roadmap
202 | 
203 | - [X] Morpheme detection of known words
204 | - [X] Handling of common names and places (counted as 1 morpheme)
205 | - [ ] Handling of unknown words
206 | 
207 | 
208 | See the [open issues](https://github.com/ecscstatsconsulting/morphemes/issues) for a full list of 
209 | proposed features (and known issues).
210 | 
211 | <p align="right">(<a href="#top">back to top</a>)</p>
212 | 
213 | ## Developers
214 | 
215 | Clone the repo and use the Makefile to build a local version:
216 | `make install`
217 | 
218 | <!-- CONTRIBUTING -->
219 | ## Contributing
220 | 
221 | Contributions are what make the open source community such an amazing 
222 | place to learn, inspire, and create. Any contributions you make are 
223 | **greatly appreciated**.
224 | 
225 | Do you want other languages supported?  Are you a fluent speaker of the
226 | language you want?  Help contribute and grow this project into a more
227 | universal morpheme solution!
228 | 
229 | If you have a suggestion that would make this better, please fork the repo 
230 | and create a pull request. You can also simply open an issue with the tag 
231 | "enhancement".  Don't forget to give the project a star! Thanks again!
232 | 
233 | 1. Fork the Project
234 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
235 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
236 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
237 | 5. Open a Pull Request
238 | 
239 | <p align="right">(<a href="#top">back to top</a>)</p>
240 | 
241 | 
242 | 
243 | <!-- LICENSE -->
244 | ## License
245 | 
246 | Distributed under the MIT License. See `LICENSE.txt` for more information.
247 | 
248 | <p align="right">(<a href="#top">back to top</a>)</p>
249 | 
250 | 
251 | 
252 | <!-- CONTACT -->
253 | ## Contact
254 | 
255 | ECSC, ltd - ecsctechdepartment@gmail.com
256 | 
257 | Project Link: [https://github.com/ecscstatsconsulting/morphemes](https://github.com/ecscstatsconsulting/morphemes)
258 | 
259 | <p align="right">(<a href="#top">back to top</a>)</p>
260 | 
261 | 
262 | 
263 | <!-- ACKNOWLEDGMENTS -->
264 | ## Acknowledgments
265 | 
266 | * Enkeleda Cuko
267 | * [Paul Warren](https://github.com/paul0warren)
268 | 
269 | <p align="right">(<a href="#top">back to top</a>)</p>
270 | 
271 | 
272 | 
273 | <!-- MARKDOWN LINKS & IMAGES -->
274 | <!-- https://www.markdownguide.org/basic-syntax/#reference-style-links -->
275 | [contributors-shield]: https://img.shields.io/github/contributors/ecscstatsconsulting/morphemes.svg?style=for-the-badge
276 | [contributors-url]: https://github.com/ecscstatsconsulting/morphemes/graphs/contributors
277 | [forks-shield]: https://img.shields.io/github/forks/ecscstatsconsulting/morphemes.svg?style=for-the-badge
278 | [forks-url]: https://github.com/ecscstatsconsulting/morphemes/network/members
279 | [stars-shield]: https://img.shields.io/github/stars/ecscstatsconsulting/morphemes.svg?style=for-the-badge
280 | [stars-url]: https://github.com/ecscstatsconsulting/morphemes/stargazers
281 | [issues-shield]: https://img.shields.io/github/issues/ecscstatsconsulting/morphemes.svg?style=for-the-badge
282 | [issues-url]: https://github.com/ecscstatsconsulting/morphemes/issues
283 | [license-shield]: https://img.shields.io/github/license/ecscstatsconsulting/morphemes.svg?style=for-the-badge
284 | [license-url]: https://github.com/ecscstatsconsulting/morphemes/blob/master/LICENSE.txt
285 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555
286 | 
287 | [//]: # ([linkedin-url]: https://linkedin.com/in/linkedin_username)
288 | [//]: # ([product-screenshot]: images/screenshot.png)
289 | 


--------------------------------------------------------------------------------