├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py ├── tests ├── test_ssplit.py └── test_ssplit │ ├── 000.json │ ├── 001.json │ ├── 002.json │ ├── 003.json │ ├── 004.json │ ├── 005.json │ ├── 006.json │ ├── 007.json │ ├── 008.json │ ├── 009.json │ ├── 010.json │ ├── 011.json │ ├── 012.json │ ├── 013.json │ ├── 014.json │ ├── 015.json │ ├── 016.json │ └── 017.json └── textformatting ├── __init__.py ├── __version__.py └── ssplit.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 2 11 | matrix: 12 | python-version: [3.6, 3.7] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | python setup.py install 24 | - name: Lint with flake8 25 | run: | 26 | pip install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | pip install pytest 34 | pytest 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kurohashi-Kawahara lab, Kyoto University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | graft tests 4 | global-exclude __pycache__ 5 | global-exclude *.py[co] 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-textformatting 2 | 3 | ## Requirements 4 | 5 | - Python 3.6.8 6 | 7 | ## Installation 8 | 9 | ``` 10 | $ python setup.py install 11 | ``` 12 | 13 | ## Example 14 | 15 | ```python 16 | from textformatting import ssplit 17 | 18 | text = "日本語のテキストを文単位に分割します。Pythonで書かれています。" 19 | sentences = ssplit(text) # ['日本語のテキストを文単位に分割します。', 'Pythonで書かれています。'] 20 | ``` 21 | 22 | ## License 23 | 24 | - MIT 25 | 26 | ## Authors 27 | 28 | - Kyoto University (contact [at] nlp.ist.i.kyoto-u.ac.jp) 29 | - Hirokazu Kiyomaru 30 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | addopts = --verbose 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import io 3 | import os 4 | 5 | from setuptools import find_packages, setup 6 | 7 | # package meta data 8 | NAME = 'textformatting' 9 | DESCRIPTION = 'A Japanese text formatter' 10 | EMAIL = 'contact@nlp.ist.i.kyoto-u.ac.jp' 11 | AUTHOR = 'Kurohashi-Kawahara Lab, Kyoto University' 12 | VERSION = '' 13 | 14 | INSTALL_REQUIRES = [] 15 | 16 | SETUP_REQUIRES = [ 17 | 'pytest-runner' 18 | ] 19 | 20 | TEST_REQUIRES = [ 21 | 'pytest==4.6.5' 22 | ] 23 | 24 | here = os.path.abspath(os.path.dirname(__file__)) 25 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: 26 | long_description = '\n' + f.read() 27 | 28 | about = {} 29 | with io.open(os.path.join(here, NAME, '__version__.py'), encoding='utf-8') as f: 30 | exec(f.read(), about) 31 | 32 | setup( 33 | name=NAME, 34 | version=about['__version__'], 35 | description=DESCRIPTION, 36 | long_description=long_description, 37 | long_description_content_type='text/markdown', 38 | author=AUTHOR, 39 | author_email=EMAIL, 40 | packages=find_packages(exclude=('tests',)), 41 | install_requires=INSTALL_REQUIRES, 42 | setup_requires=SETUP_REQUIRES, 43 | tests_require=TEST_REQUIRES, 44 | license='MIT', 45 | classifiers=[ 46 | 'License :: OSI Approved :: MIT License', 47 | 'Programming Language :: Python' 48 | ] 49 | ) 50 | -------------------------------------------------------------------------------- /tests/test_ssplit.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import typing 4 | 5 | import os 6 | import pytest 7 | 8 | from textformatting import ssplit 9 | 10 | 11 | def read_test_file(path): 12 | """Read a test file. 13 | 14 | Parameters 15 | ---------- 16 | path : str 17 | The path to a test file. 18 | 19 | Returns 20 | ------- 21 | typing.Tuple[str, typing.List[str]] 22 | """ 23 | with open(path) as f: 24 | dct = json.load(f) 25 | return dct['text'], dct['sentences'] 26 | 27 | 28 | test_file_path_pattern = os.path.join(os.path.dirname(__file__), 'test_ssplit', '*.json') 29 | test_cases = [read_test_file(path) for path in sorted(glob.glob(test_file_path_pattern))] 30 | 31 | 32 | @pytest.mark.parametrize('test_case', test_cases) 33 | def test_ssplit(test_case): 34 | text, sentences = test_case 35 | assert ssplit(text) == sentences 36 | -------------------------------------------------------------------------------- /tests/test_ssplit/000.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割します。Pythonで書かれています。", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割します。", 5 | "Pythonで書かれています。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/001.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "今何時ですか?次の予定があるので失礼します。", 3 | "sentences": [ 4 | "今何時ですか?", 5 | "次の予定があるので失礼します。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/002.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "お疲れ様です!次の予定があるので失礼します。", 3 | "sentences": [ 4 | "お疲れ様です!", 5 | "次の予定があるので失礼します。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/003.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "お疲れ様です♪次の予定があるので失礼します。", 3 | "sentences": [ 4 | "お疲れ様です♪", 5 | "次の予定があるので失礼します。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/004.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "なるほど…これは難しい問題ですね。", 3 | "sentences": [ 4 | "なるほど…", 5 | "これは難しい問題ですね。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/005.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "今何時ですか???次の予定があるので失礼します。。。", 3 | "sentences": [ 4 | "今何時ですか???", 5 | "次の予定があるので失礼します。。。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/006.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割(ただしカッコ内のテキストは分割しません。)します。Pythonで書かれています。", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割(ただしカッコ内のテキストは分割しません。)します。", 5 | "Pythonで書かれています。" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/007.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。", 3 | "sentences": [ 4 | "テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tests/test_ssplit/008.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割します.Pythonで書かれています.", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割します.", 5 | "Pythonで書かれています." 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/009.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割します。", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割します。" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tests/test_ssplit/010.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割します", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割します" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tests/test_ssplit/011.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割します。Pythonで書かれています", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割します。", 5 | "Pythonで書かれています" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/012.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "日本語のテキストを文単位に分割します\nPythonで書かれています", 3 | "sentences": [ 4 | "日本語のテキストを文単位に分割します", 5 | "Pythonで書かれています" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/013.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "このライブラリは Python で書かれています", 3 | "sentences": [ 4 | "このライブラリは Python で書かれています" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /tests/test_ssplit/014.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "このライブラリは Python で書かれています!テキストを文に区切ります.", 3 | "sentences": [ 4 | "このライブラリは Python で書かれています!", 5 | "テキストを文に区切ります." 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/015.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "このライブラリは Python で書かれていますか? Python から利用したいのですが.", 3 | "sentences": [ 4 | "このライブラリは Python で書かれていますか?", 5 | "Python から利用したいのですが." 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/016.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "このライブラリは Python で書かれています(笑\n安心してください(笑", 3 | "sentences": [ 4 | "このライブラリは Python で書かれています(笑", 5 | "安心してください(笑" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_ssplit/017.json: -------------------------------------------------------------------------------- 1 | { 2 | "text": "なんと (((;゚Д゚)))))))\nびっくりしました.\n", 3 | "sentences": [ 4 | "なんと (((;゚Д゚)))))))", 5 | "びっくりしました." 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /textformatting/__init__.py: -------------------------------------------------------------------------------- 1 | from textformatting.ssplit import ssplit 2 | -------------------------------------------------------------------------------- /textformatting/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 0, 1) 2 | 3 | __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /textformatting/ssplit.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | import re 4 | 5 | PATTERNS = { 6 | 'period': '。.?!♪…?!' 7 | } 8 | 9 | 10 | def ssplit(text, model='regex'): 11 | """Split text into sentences. 12 | 13 | Parameters 14 | ---------- 15 | text : str 16 | A input text to be split. 17 | model : str 18 | A model name (default: regex). 19 | 20 | Returns 21 | ------- 22 | typing.List[str] 23 | """ 24 | if model == 'regex': 25 | return _ssplit_regex(text) 26 | else: 27 | raise NotImplementedError 28 | 29 | 30 | def _ssplit_regex(text): 31 | """Split text into sentences by regular expressions. 32 | 33 | Parameters 34 | ---------- 35 | text : str 36 | A input text to be split. 37 | 38 | Returns 39 | ------- 40 | typing.List[str] 41 | """ 42 | _base = r'[^%(period)s]*[%(period)s]' % PATTERNS 43 | _eol = r'[^%(period)s]*$' % PATTERNS 44 | _regex = re.compile(r'%(_base)s|%(_eol)s$' % locals()) 45 | _sentence_candidates = [] 46 | for line in text.split('\n'): 47 | _sentence_candidates += re.findall(_regex, line + '\n') 48 | _sentence_candidates = _merge_sentence_candidates(_sentence_candidates) 49 | return _clean_up_sentence_candidates(_sentence_candidates) 50 | 51 | 52 | def _merge_sentence_candidates(sentence_candidates): 53 | """Merge sentence candidates. 54 | 55 | Parameters 56 | ---------- 57 | sentence_candidates : typing.List[str] 58 | A list of sentence candidates. 59 | 60 | Returns 61 | ------- 62 | typing.List[str] 63 | """ 64 | sentence_candidates = _merge_single_periods(sentence_candidates) 65 | sentence_candidates = _merge_parenthesis(sentence_candidates) 66 | return sentence_candidates 67 | 68 | 69 | def _merge_single_periods(sentence_candidates): 70 | """Merge sentence candidates that consist of a single period. 71 | 72 | Parameters 73 | ---------- 74 | sentence_candidates : typing.List[str] 75 | A list of sentence candidates. 76 | 77 | Returns 78 | ------- 79 | typing.List[str] 80 | """ 81 | _regex = re.compile(r'^[%(period)s]$' % PATTERNS) 82 | 83 | merged_sentences = [''] 84 | for sentence_candidate in sentence_candidates: 85 | if re.match(_regex, sentence_candidate): 86 | merged_sentences[-1] += sentence_candidate 87 | else: 88 | merged_sentences.append(sentence_candidate) 89 | 90 | if merged_sentences[0] == '': 91 | merged_sentences.pop(0) # remove the dummy sentence 92 | return merged_sentences 93 | 94 | 95 | def _merge_parenthesis(sentence_candidates): 96 | """Merge sentence candidates so that they save strings in parentheses or brackets. 97 | 98 | Parameters 99 | ---------- 100 | sentence_candidates : typing.List[str] 101 | A list of sentence candidates. 102 | 103 | Returns 104 | ------- 105 | typing.List[str] 106 | """ 107 | parenthesis_level = 0 108 | quotation_level = 0 109 | 110 | merged_sentences = [] 111 | _sentence_candidate = '' 112 | while sentence_candidates: 113 | sentence_candidate = sentence_candidates.pop(0) 114 | 115 | parenthesis_level += sentence_candidate.count('(') + sentence_candidate.count('(') 116 | parenthesis_level -= sentence_candidate.count(')') + sentence_candidate.count(')') 117 | 118 | quotation_level += sentence_candidate.count('「') + sentence_candidate.count('“') 119 | quotation_level -= sentence_candidate.count('」') + sentence_candidate.count('”') 120 | 121 | if parenthesis_level == 0 and quotation_level == 0: 122 | sentence_candidate = _sentence_candidate + sentence_candidate 123 | merged_sentences.append(sentence_candidate) 124 | _sentence_candidate = '' 125 | else: 126 | if '\n' in sentence_candidate: 127 | sentence_candidate, rest = sentence_candidate.split('\n', maxsplit=1) 128 | sentence_candidate = _sentence_candidate + sentence_candidate 129 | merged_sentences.append(sentence_candidate) 130 | _sentence_candidate = '' 131 | sentence_candidates.insert(0, rest) 132 | parenthesis_level = 0 133 | quotation_level = 0 134 | else: 135 | _sentence_candidate += sentence_candidate 136 | 137 | if _sentence_candidate: 138 | merged_sentences.append(_sentence_candidate) 139 | return merged_sentences 140 | 141 | 142 | def _clean_up_sentence_candidates(sentence_candidates): 143 | """Remove empty sentence candidates. 144 | 145 | Parameters 146 | ---------- 147 | sentence_candidates : typing.List[str] 148 | A list of sentence candidates. 149 | 150 | Returns 151 | ------- 152 | typing.List[str] 153 | """ 154 | return [sentence_candidate.strip() for sentence_candidate in sentence_candidates if sentence_candidate.strip()] 155 | --------------------------------------------------------------------------------