├── .gitignore
├── .travis.yml
├── CHANGES.rst
├── LICENSE
├── MANIFEST.in
├── README.rst
├── sengiri
    ├── __init__.py
    └── sengiri.py
├── setup.py
└── test_sengiri.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Travis CI configuration: run the test suite on each listed Python version.
language: python
python:
  - 3.4
  - 3.5
  - 3.6
  - 3.7
  - 3.8
# Install MeCab 0.996 from individually downloaded .deb packages, then the
# UTF-8 IPADIC dictionary from apt.
# NOTE(review): the .deb files are fetched over plain HTTP from a mirror and
# are not checksum-verified -- consider HTTPS or verifying a digest.
before_install:
  - sudo apt-get update -qq
  - wget http://ftp.sjtu.edu.cn/ubuntu/pool/universe/m/mecab/libmecab2_0.996-1.1_amd64.deb
  - sudo dpkg -i libmecab2_0.996-1.1_amd64.deb
  - wget http://ftp.sjtu.edu.cn/ubuntu/pool/universe/m/mecab/libmecab-dev_0.996-1.1_amd64.deb
  - sudo dpkg -i libmecab-dev_0.996-1.1_amd64.deb
  - wget http://ftp.sjtu.edu.cn/ubuntu/pool/universe/m/mecab/mecab-utils_0.996-1.1_amd64.deb
  - sudo dpkg -i mecab-utils_0.996-1.1_amd64.deb
  - sudo apt-get install -y mecab-ipadic-utf8
# Install the package itself plus the coverage uploader.
install:
  - "python setup.py install"
  - "pip install coveralls"
# Run the tests with coverage measured for the sengiri package only.
script:
  - "nosetests --with-coverage --cover-package=sengiri"
# Upload the coverage report only when the tests passed.
after_success:
  - coveralls
24 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | CHANGES
 2 | =======
 3 | 
 4 | 0.2.2 (2019-10-15)
 5 | ------------------
 6 | 
- The `emoji_threshold` parameter is now available in tokenize()
 8 | - Bugfix
 9 | 
10 | 0.2.1 (2019-10-12)
11 | ------------------
12 | 
- Also works well with text that includes emoticons and "www" (a laughing expression)
- Always treat emoji as delimiters regardless of MeCab's POS
15 | 
16 | 0.1.1 (2019-10-05)
17 | ------------------
18 | 
19 | - First release
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2019 Yukino Ikegami
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.rst
2 | include LICENSE
3 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | sengiri
 2 | ==========
 3 | |travis| |coveralls| |pyversion| |version| |license|
 4 | 
Yet another sentence-level tokenizer for Japanese text
 6 | 
 7 | DEPENDENCIES
 8 | ==============
 9 | 
10 | - MeCab
11 | - emoji
12 | 
13 | INSTALLATION
14 | ==============
15 | 
16 | ::
17 | 
18 |  $ pip install sengiri
19 | 
20 | 
21 | USAGE
22 | ============
23 | 
24 | .. code:: python
25 | 
26 |   import sengiri
27 | 
28 |   print(sengiri.tokenize('うーん🤔🤔🤔どうしよう'))
29 |   #=>['うーん🤔🤔🤔', 'どうしよう']
30 |   print(sengiri.tokenize('モー娘。のコンサートに行った。'))
31 |   #=>['モー娘。のコンサートに行った。']
32 |   print(sengiri.tokenize('ありがとう＾＾ 助かります。'))
33 |   #=>['ありがとう＾＾', '助かります。']
34 |   print(sengiri.tokenize('顔文字テスト(*´ω｀*)うまくいくかな？'))
35 |   #=>['顔文字テスト(*´ω｀*)うまくいくかな？']
36 |   # I recommend using the NEologd dictionary.
37 |   print(sengiri.tokenize('顔文字テスト(*´ω｀*)うまくいくかな？', mecab_args='-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'))
38 |   #=>['顔文字テスト(*´ω｀*)', 'うまくいくかな？']
39 |   print(sengiri.tokenize('子供が大変なことになった。'
40 |                          '（後で聞いたのだが、脅されたらしい）'
41 |                          '（脅迫はやめてほしいと言っているのに）'))
42 |   #=>['子供が大変なことになった。', '（後で聞いたのだが、脅されたらしい）', '（脅迫はやめてほしいと言っているのに）']
43 |   print(sengiri.tokenize('楽しかったw また遊ぼwww'))
44 |   #=>['楽しかったw', 'また遊ぼwww']
45 |   print(sengiri.tokenize('http://www.inpaku.go.jp/'))
46 |   #=>['http://www.inpaku.go.jp/']
47 | 
48 | .. |travis| image:: https://travis-ci.org/ikegami-yukino/sengiri.svg?branch=master
49 |     :target: https://travis-ci.org/ikegami-yukino/sengiri
50 |     :alt: travis-ci.org
51 | 
52 | .. |coveralls| image:: https://coveralls.io/repos/ikegami-yukino/sengiri/badge.svg?branch=master&service=github
53 |     :target: https://coveralls.io/github/ikegami-yukino/sengiri?branch=master
54 |     :alt: coveralls.io
55 | 
56 | .. |pyversion| image:: https://img.shields.io/pypi/pyversions/sengiri.svg
57 | 
58 | .. |version| image:: https://img.shields.io/pypi/v/sengiri.svg
59 |     :target: http://pypi.python.org/pypi/sengiri/
60 |     :alt: latest version
61 | 
62 | .. |license| image:: https://img.shields.io/pypi/l/sengiri.svg
63 |     :target: http://pypi.python.org/pypi/sengiri/
64 |     :alt: license
65 | 


--------------------------------------------------------------------------------
/sengiri/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from .sengiri import tokenize
 3 | """sengiri
 4 | 
 5 | This module provides the Japanese sentence-level tokenizer.
 6 | 
 7 | Author:
 8 |     Yukino Ikegami
 9 | 
10 | Lisence:
11 |     MIT License
12 | """
13 | 
14 | VERSION = (0, 2, 2)
15 | __version__ = '0.2.2'
16 | __all__ = ['tokenize']
17 | 


--------------------------------------------------------------------------------
/sengiri/sengiri.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import emoji
 4 | import MeCab
 5 | 
 6 | EMOJIS = set(emoji.unicode_codes.EMOJI_DATA.keys())
 7 | DELIMITERS = set({'。', '．', '…', '・・・', '...', '！', '!', '？', '?',
 8 |                   '！？', '？！', '!?', '?!'})
 9 | OPEN_BRACKETS = '｢「(（[［【『〈《〔｛{«‹〖〘〚'
10 | CLOSE_BRACKETS = '｣」)）]］】』〉》〕｝}»›〗〙〛'
11 | BRACKETS = set(OPEN_BRACKETS) | set(CLOSE_BRACKETS)
12 | LAUGHING = ('w', 'ww', 'www', 'wwww')
13 | re_parenthesis = None
14 | prev_parenthesis_threshold = 0
15 | 
16 | 
def _has_delimiter(surface, features):
    """Tell whether a token should be treated as a sentence delimiter.

    Parameters
    ----------
    surface : str
        Surface form of the token
    features : str
        Comma-separated feature string of the token

    Return
    ------
    bool
        True when the token is a general symbol ('記号,一般,') other than a
        bracket, when it is listed in DELIMITERS, or when every one of its
        characters is listed in DELIMITERS.
    """
    if features.startswith('記号,一般,') and surface not in BRACKETS:
        return True
    if surface in DELIMITERS:
        return True
    # Covers runs such as '!!' whose characters are delimiters individually.
    return all(ch in DELIMITERS for ch in surface)
21 | 
22 | 
def _analyze_by_mecab(line, mecab_args, emoji_threshold):
    """Split one line of text into sentences using MeCab's tokenization.

    Parameters
    ----------
    line : str
        Text to split
    mecab_args : str
        Arguments for MeCab's Tagger
    emoji_threshold : int
        Number of emoji characters after which an emoji token closes the
        current sentence (provided the next token is not an emoji)

    Return
    ------
    list
        Sentences, each built by concatenating the token surfaces returned
        by MeCab. Empty when MeCab returns no token for the line.
    """
    tagger = MeCab.Tagger(mecab_args)
    # The last output line is dropped (presumably MeCab's "EOS" marker);
    # every remaining line is split into surface and features at the first
    # tab so that an extra tab cannot break the two-element unpacking below.
    pairs = [row.split('\t', 1)
             for row in tagger.parse(line).splitlines()[:-1]]
    if not pairs:
        # No token at all (presumably e.g. a whitespace-only line); the
        # final-token handling below would otherwise raise IndexError.
        return []

    sentences = []
    current = []          # surfaces of the sentence being built
    pending_split = False  # a delimiter was seen; split at the next plain token
    emoji_count = 0        # emoji characters seen since the last emoji split

    # The final token is handled after the loop: it always belongs to the
    # last sentence, and the loop may look one token ahead.
    for (i, (surface, features)) in enumerate(pairs[:-1]):
        if all(c in EMOJIS for c in surface):
            emoji_count += len(surface)
            if emoji_count >= emoji_threshold and pairs[i + 1][0] not in EMOJIS:
                current.append(surface)
                sentences.append(''.join(current))
                current = []
                emoji_count = 0
                # The sentence is already closed; a delimiter seen before
                # the emoji must not trigger a second (empty) split.
                pending_split = False
                continue
        elif surface in BRACKETS:
            pending_split = False
        elif _has_delimiter(surface, features):
            pending_split = True

        # Check www is not in a part of URL
        elif (current and current[-1] not in ('http://', 'https://')
                and surface in LAUGHING):
            pending_split = True
        elif (pending_split and surface == '.'
                and current and current[-1] in LAUGHING):
            # "www." -- the dot cancels the split requested by the "w" run.
            pending_split = False

        elif pending_split:
            sentences.append(''.join(current))
            current = []
            pending_split = False

        current.append(surface)

    current.append(pairs[-1][0])
    sentences.append(''.join(current))
    return sentences
62 | 
63 | 
def tokenize(doc, mecab_args='', emoji_threshold=3, parenthesis_threshold=10):
    """Split document into sentences

    Parameters
    ----------
    doc : str
        Document
    mecab_args : str
        Arguments for MeCab's Tagger
    emoji_threshold : int
        The number of emoji treated as a sentence delimiter
    parenthesis_threshold : int
        The minimum number of characters inside brackets for a bracketed
        span that directly follows a delimiter character to be split off
        as a sentence of its own

    Return
    ------
    list
        Sentences.
    """
    global re_parenthesis, prev_parenthesis_threshold

    # Compile when nothing has been compiled yet OR the threshold changed.
    # Comparing thresholds alone is not enough: the cached threshold starts
    # at 0, so a first call with parenthesis_threshold=0 would otherwise use
    # the still-uncompiled (None) pattern.
    if (re_parenthesis is None
            or prev_parenthesis_threshold != parenthesis_threshold):
        prev_parenthesis_threshold = parenthesis_threshold
        re_parenthesis = re.compile(
            '([%s])([%s][^%s]{%s,}[%s])'
            % (re.escape(''.join(DELIMITERS)), re.escape(OPEN_BRACKETS),
               re.escape(CLOSE_BRACKETS), parenthesis_threshold,
               re.escape(CLOSE_BRACKETS)))

    # Put the bracketed span on a line of its own so that it is analyzed
    # separately from the text before and after it.
    doc = re_parenthesis.sub(r'\1\n\2\n', doc)

    sentences = []
    for line in doc.splitlines():
        if line:
            sentences.extend(
                _analyze_by_mecab(line, mecab_args, emoji_threshold))
    return sentences
98 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from codecs import open
 3 | import os
 4 | import pkgutil
 5 | import re
 6 | from setuptools import setup
 7 | 
 8 | install_requires = ['emoji'] if pkgutil.find_loader('MeCab') else ['emoji', 'mecab']
 9 | 
10 | with open(os.path.join('sengiri', '__init__.py'), 'r', encoding='utf8') as f:
11 |     version = re.compile(
12 |         r".*__version__ = '(.*?)'", re.S).match(f.read()).group(1)
13 | 
14 | setup(
15 |     name='sengiri',
16 |     packages=['sengiri'],
17 |     version=version,
18 |     license='MIT License',
19 |     platforms=['POSIX', 'Windows', 'Unix', 'MacOS'],
20 |     description='Yet another sentence-level tokenizer for the Japanese text',
21 |     author='Yukino Ikegami',
22 |     author_email='yknikgm@gmail.com',
23 |     url='https://github.com/ikegami-yukino/sengiri',
24 |     keywords=['japanese', 'tokenizer', 'sentence', 'sentence-tokenizer'],
25 |     classifiers=[
26 |         'Development Status :: 3 - Alpha',
27 |         'Intended Audience :: Developers',
28 |         'Intended Audience :: Information Technology',
29 |         'License :: OSI Approved :: MIT License',
30 |         'Natural Language :: Japanese',
31 |         'Operating System :: MacOS',
32 |         'Operating System :: Microsoft',
33 |         'Operating System :: POSIX',
34 |         'Programming Language :: Python :: 3.4',
35 |         'Programming Language :: Python :: 3.5',
36 |         'Programming Language :: Python :: 3.6',
37 |         'Programming Language :: Python :: 3.7',
38 |         'Programming Language :: Python :: 3.8',
39 |         'Programming Language :: Python :: 3.9',
40 |         'Programming Language :: Python :: 3.10',
41 |         'Topic :: Text Processing'
42 |         ],
43 |     long_description='%s\n\n%s' % (open('README.rst', encoding='utf8').read(),
44 |                                    open('CHANGES.rst', encoding='utf8').read()),
45 |     install_requires=install_requires,
46 |     tests_require=['nose'],
47 |     test_suite='nose.collector'
48 | )
49 | 


--------------------------------------------------------------------------------
/test_sengiri.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | 
 3 | from nose.tools import assert_equal, assert_true
 4 | import sengiri.sengiri
 5 | 
# Maps an input document to the list of sentences expected from
# sengiri.tokenize() for it.
TEST_CASES = {
    'うーん🤔🤔🤔どうしよう': ['うーん🤔🤔🤔', 'どうしよう'],
    'モー娘。のコンサートに行った。': ['モー娘。のコンサートに行った。'],
    '楽しかったし嬉しかった。すごく充実した!': ['楽しかったし嬉しかった。', 'すごく充実した!'],
    'ありがとう＾＾ 助かります。': ['ありがとう＾＾', '助かります。'],
    '大変なことになった。（後で聞いたのだが、脅されたらしい）（脅迫はやめてほしいと言っているのに）':
        ['大変なことになった。', '（後で聞いたのだが、脅されたらしい）', '（脅迫はやめてほしいと言っているのに）'],
    '楽しかったw また遊ぼwww': ['楽しかったw', 'また遊ぼwww'],
    'http://www.inpaku.go.jp/': ['http://www.inpaku.go.jp/'],
    '機械学習と統計的推論と微分幾何と関数解析と統計力学の動画！😎✌️':
        ['機械学習と統計的推論と微分幾何と関数解析と統計力学の動画！😎✌️'],
    '奇声を発しながら🦑をやっとる…': ['奇声を発しながら🦑をやっとる…'],
    '心肺停止したので寝ます。おやすみなさい。': ['心肺停止したので寝ます。', 'おやすみなさい。'],
    '大学院生「奨学金を…」': ['大学院生「奨学金を…」']
}
21 | 
22 | 
def test_has_delimiter():
    """A general symbol and a full stop are both reported as delimiters."""
    samples = (
        ('♡', '記号,一般,*,*,*,*,♡,,,,'),
        ('。', '記号,句点,*,*,*,*,。,。,。'),
    )
    for surface, features in samples:
        assert_true(sengiri.sengiri._has_delimiter(surface, features))
26 | 
27 | 
def test_analyze_by_mecab():
    """Check _analyze_by_mecab() against the shared cases, minus one.

    The parenthesis case is left out; presumably it only passes through
    tokenize(), which splits bracketed spans before calling this helper.
    """
    excluded = '大変なことになった。（後で聞いたのだが、脅されたらしい）（脅迫はやめてほしいと言っているのに）'
    cases = {src: want for (src, want) in TEST_CASES.items()
             if src != excluded}
    for source in cases:
        assert_equal(sengiri.sengiri._analyze_by_mecab(source, '', 3),
                     cases[source])
34 | 
35 | 
def test_tokenize():
    """Check tokenize() with default arguments against every shared case."""
    for source in TEST_CASES:
        assert_equal(sengiri.tokenize(source), TEST_CASES[source])
40 | 


--------------------------------------------------------------------------------