├── .gitignore ├── .travis.yml ├── CHANGES.rst ├── LICENSE ├── MANIFEST.in ├── README.rst ├── sengiri ├── __init__.py └── sengiri.py ├── setup.py └── test_sengiri.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.4 4 | - 3.5 5 | - 3.6 6 | - 3.7 7 | - 3.8 8 | before_install: 9 | - sudo apt-get update -qq 10 | - wget http://ftp.sjtu.edu.cn/ubuntu/pool/universe/m/mecab/libmecab2_0.996-1.1_amd64.deb 11 | - sudo dpkg -i libmecab2_0.996-1.1_amd64.deb 12 | - wget http://ftp.sjtu.edu.cn/ubuntu/pool/universe/m/mecab/libmecab-dev_0.996-1.1_amd64.deb 13 | - sudo dpkg -i libmecab-dev_0.996-1.1_amd64.deb 14 | - wget http://ftp.sjtu.edu.cn/ubuntu/pool/universe/m/mecab/mecab-utils_0.996-1.1_amd64.deb 15 | - sudo dpkg -i mecab-utils_0.996-1.1_amd64.deb 16 | - sudo apt-get install -y mecab-ipadic-utf8 17 | install: 18 | - "python setup.py install" 19 | - "pip install coveralls" 20 | script: 21 | - "nosetests --with-coverage --cover-package=sengiri" 22 | after_success: 23 | - coveralls 24 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | CHANGES 2 | ======= 3 | 4 | 0.2.2 (2019-10-15) 5 | ------------------ 6 | 7 | - In tokenize() method, `emoji_threshold` parameter is available 8 | - Bugfix 9 | 10 | 0.2.1 (2019-10-12) 11 | ------------------ 12 | 13 | - Works well with also a text including emoticon and www (Laughing expression) 14 | - Always treat emoji to delimiter regardless MeCab's POS 15 | 16 | 0.1.1 (2019-10-05) 17 | ------------------ 18 | 19 | - First release 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Yukino Ikegami 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | sengiri 2 | ========== 3 | |travis| |coveralls| |pyversion| |version| |license| 4 | 5 | Yet another sentence-level tokenizer for the Japanese text 6 | 7 | DEPENDENCIES 8 | ============== 9 | 10 | - MeCab 11 | - emoji 12 | 13 | INSTALLATION 14 | ============== 15 | 16 | :: 17 | 18 | $ pip install sengiri 19 | 20 | 21 | USAGE 22 | ============ 23 | 24 | .. code:: python 25 | 26 | import sengiri 27 | 28 | print(sengiri.tokenize('うーん🤔🤔🤔どうしよう')) 29 | #=>['うーん🤔🤔🤔', 'どうしよう'] 30 | print(sengiri.tokenize('モー娘。のコンサートに行った。')) 31 | #=>['モー娘。のコンサートに行った。'] 32 | print(sengiri.tokenize('ありがとう^^ 助かります。')) 33 | #=>['ありがとう^^', '助かります。'] 34 | print(sengiri.tokenize('顔文字テスト(*´ω`*)うまくいくかな?')) 35 | #=>['顔文字テスト(*´ω`*)うまくいくかな?'] 36 | # I recommend using the NEologd dictionary. 37 | print(sengiri.tokenize('顔文字テスト(*´ω`*)うまくいくかな?', mecab_args='-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')) 38 | #=>['顔文字テスト(*´ω`*)', 'うまくいくかな?'] 39 | print(sengiri.tokenize('子供が大変なことになった。' 40 | '(後で聞いたのだが、脅されたらしい)' 41 | '(脅迫はやめてほしいと言っているのに)')) 42 | #=>['子供が大変なことになった。', '(後で聞いたのだが、脅されたらしい)', '(脅迫はやめてほしいと言っているのに)'] 43 | print(sengiri.tokenize('楽しかったw また遊ぼwww')) 44 | #=>['楽しかったw', 'また遊ぼwww'] 45 | print(sengiri.tokenize('http://www.inpaku.go.jp/')) 46 | #=>['http://www.inpaku.go.jp/'] 47 | 48 | .. |travis| image:: https://travis-ci.org/ikegami-yukino/sengiri.svg?branch=master 49 | :target: https://travis-ci.org/ikegami-yukino/sengiri 50 | :alt: travis-ci.org 51 | 52 | .. |coveralls| image:: https://coveralls.io/repos/ikegami-yukino/sengiri/badge.svg?branch=master&service=github 53 | :target: https://coveralls.io/github/ikegami-yukino/sengiri?branch=master 54 | :alt: coveralls.io 55 | 56 | .. |pyversion| image:: https://img.shields.io/pypi/pyversions/sengiri.svg 57 | 58 | .. |version| image:: https://img.shields.io/pypi/v/sengiri.svg 59 | :target: http://pypi.python.org/pypi/sengiri/ 60 | :alt: latest version 61 | 62 | .. |license| image:: https://img.shields.io/pypi/l/sengiri.svg 63 | :target: http://pypi.python.org/pypi/sengiri/ 64 | :alt: license 65 | -------------------------------------------------------------------------------- /sengiri/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .sengiri import tokenize 3 | """sengiri 4 | 5 | This module provides the Japanese sentence-level tokenizer. 6 | 7 | Author: 8 | Yukino Ikegami 9 | 10 | Lisence: 11 | MIT License 12 | """ 13 | 14 | VERSION = (0, 2, 2) 15 | __version__ = '0.2.2' 16 | __all__ = ['tokenize'] 17 | -------------------------------------------------------------------------------- /sengiri/sengiri.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import emoji 4 | import MeCab 5 | 6 | EMOJIS = set(emoji.unicode_codes.EMOJI_DATA.keys()) 7 | DELIMITERS = set({'。', '.', '…', '・・・', '...', '!', '!', '?', '?', 8 | '!?', '?!', '!?', '?!'}) 9 | OPEN_BRACKETS = '「「(([[【『〈《〔{{«‹〖〘〚' 10 | CLOSE_BRACKETS = '」」))]]】』〉》〕}}»›〗〙〛' 11 | BRACKETS = set(OPEN_BRACKETS) | set(CLOSE_BRACKETS) 12 | LAUGHING = ('w', 'ww', 'www', 'wwww') 13 | re_parenthesis = None 14 | prev_parenthesis_threshold = 0 15 | 16 | 17 | def _has_delimiter(surface, features): 18 | return ((features.startswith('記号,一般,') and surface not in BRACKETS) 19 | or any(surface == d for d in DELIMITERS) 20 | or all(c in DELIMITERS for c in surface)) 21 | 22 | 23 | def _analyze_by_mecab(line, mecab_args, emoji_threshold): 24 | tagger = MeCab.Tagger(mecab_args) 25 | pairs = [l.split('\t') for l in tagger.parse(line).splitlines()[:-1]] 26 | 27 | result = [[]] 28 | has_delimiter_flag = False 29 | emoji_count = 0 30 | 31 | for (i, (surface, features)) in enumerate(pairs[:-1]): 32 | if all(c in EMOJIS for c in surface): 33 | emoji_count += len(surface) 34 | if result and emoji_count >= emoji_threshold and pairs[i+1][0] not in EMOJIS: 35 | result[-1].append(surface) 36 | result[-1] = ''.join(result[-1]) 37 | result.append([]) 38 | emoji_count = 0 39 | continue 40 | elif surface in BRACKETS: 41 | has_delimiter_flag = False 42 | elif _has_delimiter(surface, features): 43 | has_delimiter_flag = True 44 | 45 | # Check www is not in a part of URL 46 | elif (result and result[-1] and result[-1][-1] not in ('http://', 'https://') 47 | and surface in LAUGHING): 48 | has_delimiter_flag = True 49 | elif has_delimiter_flag is True and surface == '.' and result[-1][-1] in LAUGHING: 50 | has_delimiter_flag = False 51 | 52 | elif has_delimiter_flag is True: 53 | result[-1] = ''.join(result[-1]) 54 | result.append([]) 55 | has_delimiter_flag = False 56 | 57 | result[-1].append(surface) 58 | 59 | result[-1].append(pairs[-1][0]) 60 | result[-1] = ''.join(result[-1]) 61 | return result 62 | 63 | 64 | def tokenize(doc, mecab_args='', emoji_threshold=3, parenthesis_threshold=10): 65 | """Split document into sentences 66 | 67 | Parameters 68 | ---------- 69 | doc : str 70 | Document 71 | mecab_args : str 72 | Arguments for MeCab's Tagger 73 | emoji_threshold : int 74 | The numbers of emoji as sentence delimiter 75 | parenthesis_threshold : int 76 | The numbers of characters in parenthesis to delimit doc 77 | 78 | Return 79 | ------ 80 | list 81 | Sentences. 82 | """ 83 | global re_parenthesis, prev_parenthesis_threshold 84 | 85 | if prev_parenthesis_threshold != parenthesis_threshold: 86 | prev_parenthesis_threshold = parenthesis_threshold 87 | re_parenthesis = re.compile('([%s])([%s][^%s]{%s,}[%s])' 88 | % (''.join(DELIMITERS), re.escape(OPEN_BRACKETS), 89 | re.escape(CLOSE_BRACKETS), parenthesis_threshold, 90 | re.escape(CLOSE_BRACKETS))) 91 | 92 | doc = re_parenthesis.sub(lambda m: m.group(1) + '\n' + m.group(2) + '\n', doc) 93 | 94 | result = [] 95 | for line in filter(bool, doc.splitlines()): 96 | result += _analyze_by_mecab(line, mecab_args, emoji_threshold) 97 | return result 98 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from codecs import open 3 | import os 4 | import pkgutil 5 | import re 6 | from setuptools import setup 7 | 8 | install_requires = ['emoji'] if pkgutil.find_loader('MeCab') else ['emoji', 'mecab'] 9 | 10 | with open(os.path.join('sengiri', '__init__.py'), 'r', encoding='utf8') as f: 11 | version = re.compile( 12 | r".*__version__ = '(.*?)'", re.S).match(f.read()).group(1) 13 | 14 | setup( 15 | name='sengiri', 16 | packages=['sengiri'], 17 | version=version, 18 | license='MIT License', 19 | platforms=['POSIX', 'Windows', 'Unix', 'MacOS'], 20 | description='Yet another sentence-level tokenizer for the Japanese text', 21 | author='Yukino Ikegami', 22 | author_email='yknikgm@gmail.com', 23 | url='https://github.com/ikegami-yukino/sengiri', 24 | keywords=['japanese', 'tokenizer', 'sentence', 'sentence-tokenizer'], 25 | classifiers=[ 26 | 'Development Status :: 3 - Alpha', 27 | 'Intended Audience :: Developers', 28 | 'Intended Audience :: Information Technology', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Natural Language :: Japanese', 31 | 'Operating System :: MacOS', 32 | 'Operating System :: Microsoft', 33 | 'Operating System :: POSIX', 34 | 'Programming Language :: Python :: 3.4', 35 | 'Programming Language :: Python :: 3.5', 36 | 'Programming Language :: Python :: 3.6', 37 | 'Programming Language :: Python :: 3.7', 38 | 'Programming Language :: Python :: 3.8', 39 | 'Programming Language :: Python :: 3.9', 40 | 'Programming Language :: Python :: 3.10', 41 | 'Topic :: Text Processing' 42 | ], 43 | long_description='%s\n\n%s' % (open('README.rst', encoding='utf8').read(), 44 | open('CHANGES.rst', encoding='utf8').read()), 45 | install_requires=install_requires, 46 | tests_require=['nose'], 47 | test_suite='nose.collector' 48 | ) 49 | -------------------------------------------------------------------------------- /test_sengiri.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from nose.tools import assert_equal, assert_true 4 | import sengiri.sengiri 5 | 6 | TEST_CASES = { 7 | 'うーん🤔🤔🤔どうしよう': ['うーん🤔🤔🤔', 'どうしよう'], 8 | 'モー娘。のコンサートに行った。': ['モー娘。のコンサートに行った。'], 9 | '楽しかったし嬉しかった。すごく充実した!': ['楽しかったし嬉しかった。', 'すごく充実した!'], 10 | 'ありがとう^^ 助かります。': ['ありがとう^^', '助かります。'], 11 | '大変なことになった。(後で聞いたのだが、脅されたらしい)(脅迫はやめてほしいと言っているのに)': 12 | ['大変なことになった。', '(後で聞いたのだが、脅されたらしい)', '(脅迫はやめてほしいと言っているのに)'], 13 | '楽しかったw また遊ぼwww': ['楽しかったw', 'また遊ぼwww'], 14 | 'http://www.inpaku.go.jp/': ['http://www.inpaku.go.jp/'], 15 | '機械学習と統計的推論と微分幾何と関数解析と統計力学の動画!😎✌️': 16 | ['機械学習と統計的推論と微分幾何と関数解析と統計力学の動画!😎✌️'], 17 | '奇声を発しながら🦑をやっとる…': ['奇声を発しながら🦑をやっとる…'], 18 | '心肺停止したので寝ます。おやすみなさい。': ['心肺停止したので寝ます。', 'おやすみなさい。'], 19 | '大学院生「奨学金を…」': ['大学院生「奨学金を…」'] 20 | } 21 | 22 | 23 | def test_has_delimiter(): 24 | assert_true(sengiri.sengiri._has_delimiter('♡', '記号,一般,*,*,*,*,♡,,,,')) 25 | assert_true(sengiri.sengiri._has_delimiter('。', '記号,句点,*,*,*,*,。,。,。')) 26 | 27 | 28 | def test_analyze_by_mecab(): 29 | test_cases = copy.copy(TEST_CASES) 30 | del test_cases['大変なことになった。(後で聞いたのだが、脅されたらしい)(脅迫はやめてほしいと言っているのに)'] 31 | for (source, expected) in test_cases.items(): 32 | actual = sengiri.sengiri._analyze_by_mecab(source, '', 3) 33 | assert_equal(actual, expected) 34 | 35 | 36 | def test_tokenize(): 37 | for (source, expected) in TEST_CASES.items(): 38 | actual = sengiri.tokenize(source) 39 | assert_equal(actual, expected) 40 | --------------------------------------------------------------------------------