├── .github
    └── workflows
    │   └── pythonpackage.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.cfg
├── setup.py
├── tests
    ├── test_ssplit.py
    └── test_ssplit
    │   ├── 000.json
    │   ├── 001.json
    │   ├── 002.json
    │   ├── 003.json
    │   ├── 004.json
    │   ├── 005.json
    │   ├── 006.json
    │   ├── 007.json
    │   ├── 008.json
    │   ├── 009.json
    │   ├── 010.json
    │   ├── 011.json
    │   ├── 012.json
    │   ├── 013.json
    │   ├── 014.json
    │   ├── 015.json
    │   ├── 016.json
    │   └── 017.json
└── textformatting
    ├── __init__.py
    ├── __version__.py
    └── ssplit.py


/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
 1 | name: Python package
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       max-parallel: 2
11 |       matrix:
12 |         python-version: [3.6, 3.7]
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v1
16 |     - name: Set up Python ${{ matrix.python-version }}
17 |       uses: actions/setup-python@v1
18 |       with:
19 |         python-version: ${{ matrix.python-version }}
20 |     - name: Install dependencies
21 |       run: |
22 |         python -m pip install --upgrade pip
23 |         python setup.py install
24 |     - name: Lint with flake8
25 |       run: |
26 |         pip install flake8
27 |         # stop the build if there are Python syntax errors or undefined names
28 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
29 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
30 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
31 |     - name: Test with pytest
32 |       run: |
33 |         pip install pytest
34 |         pytest
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | .static_storage/
 56 | .media/
 57 | local_settings.py
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # pycharm
107 | .idea/
108 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Kurohashi-Kawahara lab, Kyoto University
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | graft tests
4 | global-exclude __pycache__
5 | global-exclude *.py[co]
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # python-textformatting
 2 | 
 3 | ## Requirements
 4 | 
 5 | - Python 3.6.8
 6 | 
 7 | ## Installation
 8 | 
 9 | ```
10 | $ python setup.py install
11 | ```
12 | 
13 | ## Example
14 | 
15 | ```python
16 | from textformatting import ssplit
17 | 
18 | text = "日本語のテキストを文単位に分割します。Pythonで書かれています。"
19 | sentences = ssplit(text)  # ['日本語のテキストを文単位に分割します。', 'Pythonで書かれています。']
20 | ```
21 | 
22 | ## License
23 | 
24 | - MIT
25 | 
26 | ## Authors
27 | 
28 | - Kyoto University (contact [at] nlp.ist.i.kyoto-u.ac.jp)
29 |   - Hirokazu Kiyomaru
30 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest
3 | 
4 | [tool:pytest]
5 | addopts = --verbose
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import io
 3 | import os
 4 | 
 5 | from setuptools import find_packages, setup
 6 | 
 7 | # package meta data
 8 | NAME = 'textformatting'
 9 | DESCRIPTION = 'A Japanese text formatter'
10 | EMAIL = 'contact@nlp.ist.i.kyoto-u.ac.jp'
11 | AUTHOR = 'Kurohashi-Kawahara Lab, Kyoto University'
12 | VERSION = ''
13 | 
14 | INSTALL_REQUIRES = []
15 | 
16 | SETUP_REQUIRES = [
17 |     'pytest-runner'
18 | ]
19 | 
20 | TEST_REQUIRES = [
21 |     'pytest==4.6.5'
22 | ]
23 | 
24 | here = os.path.abspath(os.path.dirname(__file__))
25 | with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
26 |     long_description = '\n' + f.read()
27 | 
28 | about = {}
29 | with io.open(os.path.join(here, NAME, '__version__.py'), encoding='utf-8') as f:
30 |     exec(f.read(), about)
31 | 
32 | setup(
33 |     name=NAME,
34 |     version=about['__version__'],
35 |     description=DESCRIPTION,
36 |     long_description=long_description,
37 |     long_description_content_type='text/markdown',
38 |     author=AUTHOR,
39 |     author_email=EMAIL,
40 |     packages=find_packages(exclude=('tests',)),
41 |     install_requires=INSTALL_REQUIRES,
42 |     setup_requires=SETUP_REQUIRES,
43 |     tests_require=TEST_REQUIRES,
44 |     license='MIT',
45 |     classifiers=[
46 |         'License :: OSI Approved :: MIT License',
47 |         'Programming Language :: Python'
48 |     ]
49 | )
50 | 


--------------------------------------------------------------------------------
/tests/test_ssplit.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import json
 3 | import typing
 4 | 
 5 | import os
 6 | import pytest
 7 | 
 8 | from textformatting import ssplit
 9 | 
10 | 
def read_test_file(path):
    """Read one test case from a JSON file.

    The file must hold a JSON object with the keys ``text`` and ``sentences``.

    Parameters
    ----------
    path : str
        The path to a test file.

    Returns
    -------
    typing.Tuple[str, typing.List[str]]
        The input text and the sentences it is expected to be split into.
    """
    # The test cases contain Japanese text, so decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding (which is not UTF-8 everywhere).
    with open(path, encoding='utf-8') as f:
        dct = json.load(f)
    return dct['text'], dct['sentences']
26 | 
27 | 
# Load every JSON test case stored in the test_ssplit directory next to this module,
# in sorted path order.
test_file_path_pattern = os.path.join(
    os.path.dirname(__file__),
    'test_ssplit',
    '*.json',
)
test_cases = list(map(read_test_file, sorted(glob.glob(test_file_path_pattern))))
30 | 
31 | 
@pytest.mark.parametrize('test_case', test_cases)
def test_ssplit(test_case):
    """Check that ``ssplit`` splits the text of a test case into the expected sentences."""
    text, expected_sentences = test_case
    actual_sentences = ssplit(text)
    assert actual_sentences == expected_sentences
36 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/000.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割します。Pythonで書かれています。",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割します。",
5 |     "Pythonで書かれています。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/001.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "今何時ですか？次の予定があるので失礼します。",
3 |   "sentences": [
4 |     "今何時ですか？",
5 |     "次の予定があるので失礼します。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/002.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "お疲れ様です！次の予定があるので失礼します。",
3 |   "sentences": [
4 |     "お疲れ様です！",
5 |     "次の予定があるので失礼します。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/003.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "お疲れ様です♪次の予定があるので失礼します。",
3 |   "sentences": [
4 |     "お疲れ様です♪",
5 |     "次の予定があるので失礼します。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/004.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "なるほど…これは難しい問題ですね。",
3 |   "sentences": [
4 |     "なるほど…",
5 |     "これは難しい問題ですね。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/005.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "今何時ですか？？？次の予定があるので失礼します。。。",
3 |   "sentences": [
4 |     "今何時ですか？？？",
5 |     "次の予定があるので失礼します。。。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/006.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割(ただしカッコ内のテキストは分割しません。)します。Pythonで書かれています。",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割(ただしカッコ内のテキストは分割しません。)します。",
5 |     "Pythonで書かれています。"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/007.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。",
3 |   "sentences": [
4 |     "テレビで「今年の夏は暑いので、熱中症に注意しましょう。」と言っていた。"
5 |   ]
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/008.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割します．Pythonで書かれています．",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割します．",
5 |     "Pythonで書かれています．"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/009.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割します。",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割します。"
5 |   ]
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/010.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割します",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割します"
5 |   ]
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/011.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割します。Pythonで書かれています",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割します。",
5 |     "Pythonで書かれています"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/012.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "日本語のテキストを文単位に分割します\nPythonで書かれています",
3 |   "sentences": [
4 |     "日本語のテキストを文単位に分割します",
5 |     "Pythonで書かれています"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/013.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "このライブラリは Python で書かれています",
3 |   "sentences": [
4 |     "このライブラリは Python で書かれています"
5 |   ]
6 | }
7 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/014.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "このライブラリは Python で書かれています!テキストを文に区切ります．",
3 |   "sentences": [
4 |     "このライブラリは Python で書かれています!",
5 |     "テキストを文に区切ります．"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/015.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "このライブラリは Python で書かれていますか? Python から利用したいのですが．",
3 |   "sentences": [
4 |     "このライブラリは Python で書かれていますか?",
5 |     "Python から利用したいのですが．"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/016.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "このライブラリは Python で書かれています（笑\n安心してください（笑",
3 |   "sentences": [
4 |     "このライブラリは Python で書かれています（笑",
5 |     "安心してください（笑"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/tests/test_ssplit/017.json:
--------------------------------------------------------------------------------
1 | {
2 |   "text": "なんと (((；ﾟДﾟ)))))))\nびっくりしました．\n",
3 |   "sentences": [
4 |     "なんと (((；ﾟДﾟ)))))))",
5 |     "びっくりしました．"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/textformatting/__init__.py:
--------------------------------------------------------------------------------
1 | from textformatting.ssplit import ssplit
2 | 


--------------------------------------------------------------------------------
/textformatting/__version__.py:
--------------------------------------------------------------------------------
# Version number as a tuple of integers; the dotted string form is derived from it.
VERSION = (0, 0, 1)

__version__ = '.'.join(str(component) for component in VERSION)
4 | 


--------------------------------------------------------------------------------
/textformatting/ssplit.py:
--------------------------------------------------------------------------------
  1 | import typing
  2 | 
  3 | import re
  4 | 
  5 | PATTERNS = {
  6 |     'period': '。．？！♪…?!'
  7 | }
  8 | 
  9 | 
 10 | def ssplit(text, model='regex'):
 11 |     """Split text into sentences.
 12 | 
 13 |     Parameters
 14 |     ----------
 15 |     text : str
 16 |         A input text to be split.
 17 |     model : str
 18 |         A model name (default: regex).
 19 | 
 20 |     Returns
 21 |     -------
 22 |     typing.List[str]
 23 |     """
 24 |     if model == 'regex':
 25 |         return _ssplit_regex(text)
 26 |     else:
 27 |         raise NotImplementedError
 28 | 
 29 | 
 30 | def _ssplit_regex(text):
 31 |     """Split text into sentences by regular expressions.
 32 | 
 33 |     Parameters
 34 |     ----------
 35 |     text : str
 36 |         A input text to be split.
 37 | 
 38 |     Returns
 39 |     -------
 40 |     typing.List[str]
 41 |     """
 42 |     _base = r'[^%(period)s]*[%(period)s]' % PATTERNS
 43 |     _eol = r'[^%(period)s]*$' % PATTERNS
 44 |     _regex = re.compile(r'%(_base)s|%(_eol)s$' % locals())
 45 |     _sentence_candidates = []
 46 |     for line in text.split('\n'):
 47 |         _sentence_candidates += re.findall(_regex, line + '\n')
 48 |     _sentence_candidates = _merge_sentence_candidates(_sentence_candidates)
 49 |     return _clean_up_sentence_candidates(_sentence_candidates)
 50 | 
 51 | 
 52 | def _merge_sentence_candidates(sentence_candidates):
 53 |     """Merge sentence candidates.
 54 | 
 55 |     Parameters
 56 |     ----------
 57 |     sentence_candidates : typing.List[str]
 58 |         A list of sentence candidates.
 59 | 
 60 |     Returns
 61 |     -------
 62 |     typing.List[str]
 63 |     """
 64 |     sentence_candidates = _merge_single_periods(sentence_candidates)
 65 |     sentence_candidates = _merge_parenthesis(sentence_candidates)
 66 |     return sentence_candidates
 67 | 
 68 | 
 69 | def _merge_single_periods(sentence_candidates):
 70 |     """Merge sentence candidates that consist of a single period.
 71 | 
 72 |     Parameters
 73 |     ----------
 74 |     sentence_candidates : typing.List[str]
 75 |         A list of sentence candidates.
 76 | 
 77 |     Returns
 78 |     -------
 79 |     typing.List[str]
 80 |     """
 81 |     _regex = re.compile(r'^[%(period)s]$' % PATTERNS)
 82 | 
 83 |     merged_sentences = ['']
 84 |     for sentence_candidate in sentence_candidates:
 85 |         if re.match(_regex, sentence_candidate):
 86 |             merged_sentences[-1] += sentence_candidate
 87 |         else:
 88 |             merged_sentences.append(sentence_candidate)
 89 | 
 90 |     if merged_sentences[0] == '':
 91 |         merged_sentences.pop(0)  # remove the dummy sentence
 92 |     return merged_sentences
 93 | 
 94 | 
 95 | def _merge_parenthesis(sentence_candidates):
 96 |     """Merge sentence candidates so that they save strings in parentheses or brackets.
 97 | 
 98 |     Parameters
 99 |     ----------
100 |     sentence_candidates : typing.List[str]
101 |         A list of sentence candidates.
102 | 
103 |     Returns
104 |     -------
105 |     typing.List[str]
106 |     """
107 |     parenthesis_level = 0
108 |     quotation_level = 0
109 | 
110 |     merged_sentences = []
111 |     _sentence_candidate = ''
112 |     while sentence_candidates:
113 |         sentence_candidate = sentence_candidates.pop(0)
114 | 
115 |         parenthesis_level += sentence_candidate.count('（') + sentence_candidate.count('(')
116 |         parenthesis_level -= sentence_candidate.count('）') + sentence_candidate.count(')')
117 | 
118 |         quotation_level += sentence_candidate.count('「') + sentence_candidate.count('“')
119 |         quotation_level -= sentence_candidate.count('」') + sentence_candidate.count('”')
120 | 
121 |         if parenthesis_level == 0 and quotation_level == 0:
122 |             sentence_candidate = _sentence_candidate + sentence_candidate
123 |             merged_sentences.append(sentence_candidate)
124 |             _sentence_candidate = ''
125 |         else:
126 |             if '\n' in sentence_candidate:
127 |                 sentence_candidate, rest = sentence_candidate.split('\n', maxsplit=1)
128 |                 sentence_candidate = _sentence_candidate + sentence_candidate
129 |                 merged_sentences.append(sentence_candidate)
130 |                 _sentence_candidate = ''
131 |                 sentence_candidates.insert(0, rest)
132 |                 parenthesis_level = 0
133 |                 quotation_level = 0
134 |             else:
135 |                 _sentence_candidate += sentence_candidate
136 | 
137 |     if _sentence_candidate:
138 |         merged_sentences.append(_sentence_candidate)
139 |     return merged_sentences
140 | 
141 | 
142 | def _clean_up_sentence_candidates(sentence_candidates):
143 |     """Remove empty sentence candidates.
144 | 
145 |     Parameters
146 |     ----------
147 |     sentence_candidates : typing.List[str]
148 |         A list of sentence candidates.
149 | 
150 |     Returns
151 |     -------
152 |     typing.List[str]
153 |     """
154 |     return [sentence_candidate.strip() for sentence_candidate in sentence_candidates if sentence_candidate.strip()]
155 | 


--------------------------------------------------------------------------------