├── src └── text_cleaning │ ├── __init__.py │ ├── main.py │ └── clean_text.py ├── tests ├── test_cleaning │ ├── 002.json │ ├── 008.json │ ├── 013.json │ ├── 004.json │ ├── 010.json │ ├── 012.json │ ├── 007.json │ ├── 011.json │ ├── 003.json │ ├── 005.json │ ├── 006.json │ ├── 001.json │ ├── 014.json │ ├── 000.json │ └── 009.json └── test_cleaning.py ├── pyproject.toml ├── LICENSE ├── Makefile ├── README.md ├── .gitignore └── poetry.lock /src/text_cleaning/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean_text import clean_text 2 | -------------------------------------------------------------------------------- /tests/test_cleaning/002.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "アドバイスありがとう!✋!", 3 | "output_text": "アドバイスありがとう!" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/008.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "本当ですか?(動揺してる)", 3 | "output_text": "本当ですか?(動揺してる)。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/013.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "Let's goですね", 3 | "output_text": "Let。goですね。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/004.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "面白い笑笑笑笑。笑いが止まらんwwwww", 3 | "output_text": "面白い。笑いが止まらん。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/010.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "あい。)おはよう。ヽ( ゚∀゚)ノ┌┛)`Д゚)・;'━!!", 3 | "output_text": "あい。おはよう。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/012.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "これは本当(ある意味嘘(実は本当))のこと。", 3 | "output_text": "これは本当のこと。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/007.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "((o(゚▽゚*)o))おや_(ˇωˇ」∠)_ スヤァ…(笑)", 3 | "output_text": "おや。スヤァ。(笑)。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/011.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "確かに「嘘でしょww」って笑ってたね", 3 | "output_text": "確かに「嘘でしょ。」って笑ってたね。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/003.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "可愛い!。\(^oao^)/お大事になさってください...", 3 | "output_text": "可愛い!お大事になさってください。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/005.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "幸せいっぱいの一年にしましょう。(。>ㅅ<。)よろしく♥", 3 | "output_text": "幸せいっぱいの一年にしましょう。よろしく。" 4 | } -------------------------------------------------------------------------------- 
/tests/test_cleaning/006.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "(・_・_・_)(。口。)(ーー゛)(。三三。)( ´ ▽ ` )ノ((T_T))ヾ(⌒(ノ*•ω•*)", 3 | "output_text": "" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/001.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "@abcde0123 おっとっとwwそうでした✋!よろしくお願いします♪‼", 3 | "output_text": "おっとっと。そうでした!よろしくお願いします。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/014.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "帰ってきたら新年会の応募が届いておりまして、〆切後に参加したくなって", 3 | "output_text": "帰ってきたら新年会の応募が届いておりまして、〆切後に参加したくなって。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/000.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "それはすご〜〜〜いな!これ見て→\uD83D\uDC46\uD83D\uDC4F\uD83D\uDC95 http://a.bc/defGHIjkl", 3 | "output_text": "それはすごいな!これ見て。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/009.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "I’ve taken her mail, but I could not follow her.", 3 | "output_text": "I’ve taken her mail, but I could not follow her." 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | from typing import Tuple 5 | 6 | import pytest 7 | 8 | from text_cleaning import clean_text 9 | 10 | 11 | def read_test_file(path: str) -> Tuple[str, str]: 12 | with open(path) as f: 13 | dct = json.load(f) 14 | return dct['input_text'], dct['output_text'] 15 | 16 | 17 | test_file_path_pattern = os.path.join(os.path.dirname(__file__), 'test_cleaning', '*.json') 18 | test_cases = [read_test_file(path) for path in sorted(glob.glob(test_file_path_pattern))] 19 | 20 | 21 | @pytest.mark.parametrize('test_case', test_cases) 22 | def test_clean_text(test_case): 23 | input_text, output_text = test_case 24 | assert clean_text(input_text, twitter=True, han2zen=True) == output_text 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "text-cleaning" 3 | version = "1.0.0" 4 | description = "A powerful text cleaner for Japanese web texts" 5 | license = "MIT" 6 | authors = ["Kurohashi-Kawahara Lab, Kyoto Univ "] 7 | maintainers = ["Takashi Kodama ", "Nobuhiro Ueda "] 8 | readme = "README.md" 9 | repository = "https://github.com/ku-nlp/text-cleaning" 10 | keywords = [ 11 | "NLP" 12 | ] 13 | classifiers = [ 14 | # "License :: OSI Approved :: BSD License", 15 | "Programming Language :: Python :: 3", 16 | "Programming Language :: Python :: 3.7", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9" 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.7" 23 | mojimoji = "*" 24 | neologdn = "*" 25 | joblib = "*" 26 | 27 | [tool.poetry.dev-dependencies] 28 | pytest = "^6.2" 29 | 30 | [build-system] 31 | requires = ["poetry-core>=1.0.0"] 32 | build-backend = 
"poetry.core.masonry.api" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Takashi KODAMA and Nobuhiro UEDA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INPUT_DIR := /somewhere/input 2 | OUTPUT_DIR := /somewhere/output 3 | # input file format. txt or csv or tsv 4 | FILE_FORMAT := txt 5 | NUM_JOBS_PER_MACHINE := 10 6 | TWITTER := 7 | PYTHON := 8 | 9 | INPUT_FILES := $(shell find $(INPUT_DIR) -type f) 10 | REL_PATHS := $(patsubst $(INPUT_DIR)/%,%,$(INPUT_FILES)) 11 | INPUT_EXT := $(suffix $(word 1, $(INPUT_FILES))) 12 | 13 | CAT := cat 14 | CATOUT := cat 15 | ifeq ($(INPUT_EXT),.gz) 16 | CAT := zcat 17 | CATOUT := gzip 18 | endif 19 | ifeq ($(INPUT_EXT),.zip) 20 | CAT := zcat 21 | CATOUT := zip 22 | endif 23 | ifeq ($(INPUT_EXT),.bz2) 24 | CAT := bzcat 25 | CATOUT := bzip2 26 | endif 27 | 28 | ifndef PYTHON 29 | PYTHON := $(shell which python) 30 | endif 31 | 32 | CLEANED_FILES := $(addprefix $(OUTPUT_DIR)/,$(REL_PATHS)) 33 | 34 | CLEANING_ARGS = --file-format $(FILE_FORMAT) 35 | CLEANING_ARGS += --n-jobs $(NUM_JOBS_PER_MACHINE) 36 | ifdef TWITTER 37 | CLEANING_ARGS += --twitter 38 | endif 39 | 40 | .PHONY: all 41 | all: $(CLEANED_FILES) 42 | 43 | $(CLEANED_FILES): $(OUTPUT_DIR)/%: $(INPUT_DIR)/% 44 | mkdir -p $(dir $@) 45 | $(CAT) $< | $(PYTHON) src/text_cleaning/main.py $(CLEANING_ARGS) | $(CATOUT) > $@ || rm $@ 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # text-cleaning: A Japanese powerful text cleaner 2 | 3 | ## Description 4 | This project cleans dirty Japanese texts, which include a lot of emoji and kaomoji 5 | in a whitelist method. 
6 | 7 | ## Cleaning Example 8 | 9 | ```text 10 | INPUT: これはサンプルです(≧∇≦*)!見てみて→http://a.bc/defGHIjkl 11 | OUTPUT: これはサンプルです!見てみて。 12 | 13 | INPUT: 一緒に応援してるよ(o^^o)。ありがとう😃 14 | OUTPUT: 一緒に応援してるよ。ありがとう。 15 | 16 | INPUT: いいぞ〜⸜(* ॑꒳ ॑* )⸝⋆* 17 | OUTPUT: いいぞ。 18 | 19 | INPUT: えっ((((;゚Д゚))))))) 20 | OUTPUT: えっ。 21 | 22 | INPUT: 確かに「嘘でしょww」って笑ってたね 23 | OUTPUT: 確かに「嘘でしょ。」って笑ってたね。 24 | 25 | INPUT: おはようございますヽ(*´∀`)ノ。。今日は雨ですね・・・・・(T_T) 26 | OUTPUT: おはようございます。今日は雨ですね。 27 | 28 | INPUT: (灬º﹃º灬)おいしそうです♡ 29 | OUTPUT: おいしそうです。 30 | 31 | INPUT: 今日の夜、友達とラーメン行くよ(((o(*゚▽゚*)o))) 32 | OUTPUT: 今日の夜、友達とラーメン行くよ。 33 | 34 | # When using the twitter option. 35 | INPUT: @abcde0123 おっとっとwwそうでした✋!!よろしくお願いします♪‼ #挨拶 36 | OUTPUT: おっとっと。そうでした!よろしくお願いします。 37 | ``` 38 | 39 | ## Requirements 40 | - Python 3.7+ 41 | - mojimoji 42 | - neologdn 43 | - joblib 44 | 45 | ## How to Run 46 | 47 | ### Using python script directly 48 | 49 | ```zsh 50 | cat input.txt | python src/text_cleaning/main.py > output.txt 51 | ``` 52 | 53 | ### Using makefile 54 | When input files are located hierarchically in directories, you can clean 55 | them while keeping the directory structure by using the Makefile. 56 | If the input files are compressed, the Makefile detects their format from the 57 | suffix and outputs the cleaned files in the same format. 58 | 59 | ```zsh 60 | make INPUT_DIR=/somewhere/in OUTPUT_DIR=/somewhere/out PYTHON=/somewhere/.venv/bin/python 61 | ``` 62 | 63 | Options: 64 | 65 | - FILE_FORMAT=txt: Format of the input files (txt, csv, or tsv) 66 | - NUM_JOBS_PER_MACHINE=10: The maximum number of concurrently running jobs per machine 67 | - TWITTER=1: Perform Twitter-specific cleaning 68 | - PYTHON: Path to the Python interpreter of a virtual environment 69 | -------------------------------------------------------------------------------- /src/text_cleaning/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from argparse import ArgumentParser 3 | from typing import Optional 4 | 5 | from joblib import Parallel, delayed 6 | 7 | from .clean_text import clean_text 8 | 9 | DELIMITER = {'txt': None, 'csv': ',', 'tsv': '\t'} 10 | JOINER = {'txt': '', 'csv': ',', 'tsv': '\t'} 11 | 12 | 13 | def _clean_texts(input_text: str, file_format: str, twitter: bool, han2zen: bool) -> str: 14 | delimiter: Optional[str] = DELIMITER[file_format] 15 | joiner: str = JOINER[file_format] 16 | return joiner.join(clean_text(text, twitter=twitter, han2zen=han2zen) for text in input_text.split(delimiter)) 17 | 18 | 19 | def main(): 20 | parser = ArgumentParser() 21 | parser.add_argument('-f', '--file-format', default='txt', type=str, choices=['txt', 'csv', 'tsv']) 22 | parser.add_argument('-n', '--n-jobs', default=1, type=int) 23 | parser.add_argument('-t', '--twitter', action='store_true', help='perform twitter-specific cleaning') 24 | parser.add_argument('--han2zen', '--h2z', action='store_true', help='convert hankaku characters to zenkaku ones') 25 | parser.add_argument('-i', '--input-file', type=str) 26 | args = parser.parse_args() 27 | 28 | input_texts = [] 29 | with open(args.input_file, 'rb') if args.input_file else sys.stdin.buffer as f: 30 | for line in f: 31 | try: 32 | line = line.decode('utf-8') 33 | except UnicodeDecodeError: 34 | line = '' 35 | input_texts.append(line.strip()) 36 | 37 | if args.n_jobs == 0: 38 | outputs = [_clean_texts(input_text, args.file_format, args.twitter, args.han2zen) for input_text in input_texts] 39 | else: 40 | outputs = Parallel(n_jobs=args.n_jobs, verbose=10)( 41 | [delayed(_clean_texts)(input_text, 
args.file_format, args.twitter, args.han2zen) for input_text in 42 | input_texts]) 43 | for output in outputs: 44 | print(output) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 
90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Pycharm 126 | .idea 127 | -------------------------------------------------------------------------------- /src/text_cleaning/clean_text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import neologdn 4 | from mojimoji import han_to_zen 5 | 6 | ESCAPE_CODES = [r'<', r'>', r'&', r'"', r' ', r'©'] 7 | 8 | HIRAGANA = r'\u3041-\u3096' 9 | KATAKANA = r'\u30A1-\u30F6' 10 | PROLONGED_SOUND_MARK = r'\u30FC' 11 | KANJI = r'\u3006\u4E00-\u9FFF' # U+3006: 〆 12 | REPEATING_MARK = r'\u3005' 13 | 14 | WHITELIST_PTN = re.compile(rf'[a-zA-Z0-9!?()「」、。{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}{REPEATING_MARK}]') 15 | JP_PTN = re.compile(rf'[{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}]') 16 | 17 | 18 | def clean_text(text: str, twitter: bool, han2zen: bool, repeat: int = 3) -> str: 19 | text = _normalize(text=text, repeat=repeat) 20 | if _is_japanese(text): 21 | if twitter is True: 22 | text = _twitter_preprocess(text=text) 23 | text = _filter(text=text) 24 | if han2zen is True: 25 | text = han_to_zen(text) 26 | return text 27 | 28 | 29 | def _normalize(text: str, repeat: int) -> str: 30 | return neologdn.normalize(text, repeat=repeat) 31 | 32 | 33 | def _is_japanese(string: str) -> bool: 34 | al_num = re.compile(r'^[a-zA-Z0-9()!?,.:;\-\'\"\s]+$') 35 | return al_num.match(string) is None 36 | 37 | 38 | def _twitter_preprocess(text: str) -> str: 39 | replaced_text = re.sub(r'[RT]\w+', '', text) 40 | replaced_text = re.sub(r'[@][a-zA-Z0-9_]+', '', replaced_text) 41 | replaced_text = re.sub(r'#(\w+)', '', replaced_text) 42 | return replaced_text 43 | 44 | 45 | def _replace_punctuation(text: str) -> str: 46 | replaced_text = re.sub(r'、+', '、', text) # "、、、" -> "、" 47 | replaced_text = re.sub(r'[、。]*。[、。]*', '。', replaced_text) 48 | replaced_text = re.sub(r'^[、。!?]', '', replaced_text) 49 | replaced_text = re.sub( 50 | rf'。[a-zA-Z0-9!?「」{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}]。', '。', replaced_text) 51 | return replaced_text 52 | 53 | 54 | def _whitelist_filter(text: str) -> str: 55 | """ 56 | あいうw → あいう。 57 | (あいう)w → (あいう)。 58 | あいう → あいう。 59 | あいう☆ → あいう。 60 | あいう。w 。→ あいう。。。 61 | 62 | """ 63 | ptn = re.compile(rf'[0-9。w{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}]') 64 | filtered_text = '' 65 | for i, character in enumerate(text): 66 | if WHITELIST_PTN.match(character) and \ 67 | not (character == 'w' and filtered_text and ptn.match(filtered_text[-1])): 68 | filtered_text += character 69 | continue 70 | filtered_text += '。' 71 | filtered_text += '。' 72 | return filtered_text 73 | 74 | 75 | def _delete_kaomoji(text: str) -> str: 76 | text_ = '' 77 | buff = '' 78 | bracket_counter = 0 79 | for c in text: 80 | buff += c 81 | if c == '(': 82 | bracket_counter += 1 83 | elif c == ')': 84 | bracket_counter -= 1 85 | if bracket_counter == 0: 86 | stripped_buff = buff.lstrip('(').rstrip(')') 87 | if all(JP_PTN.match(c) for c in stripped_buff) and stripped_buff: 88 | text_ += buff 89 | 
buff = '' 90 | continue 91 | if bracket_counter == 0: 92 | text_ += buff 93 | buff = '' 94 | return text_ 95 | 96 | 97 | def _filter(text: str) -> str: 98 | text = re.sub(r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?', '', text) 99 | for escape_code in ESCAPE_CODES: 100 | text = re.sub(escape_code, '', text) 101 | text = _whitelist_filter(text=text) 102 | text = _replace_punctuation(text) 103 | 104 | text = re.sub(r'笑笑+', '笑', text) 105 | text = re.sub(r'笑。', '。', text) 106 | 107 | text = re.sub(r'([!?。])[a-zA-Z0-9]+([!?。])', r'\1\2', text) 108 | text = _replace_punctuation(text) 109 | 110 | text = _delete_kaomoji(text) 111 | text = _replace_punctuation(text) 112 | 113 | text = re.sub(r'(。\))|(\(。)', '。', text) 114 | text = re.sub(r'[。!?][ノシノシ]+[。!?]', '。', text) 115 | text = re.sub(r'。([!?])', r'\1', text) 116 | text = re.sub(r'([!?])。', r'\1', text) 117 | text = _replace_punctuation(text) 118 | 119 | text = re.sub(r'!!+', '!', text) 120 | text = re.sub(r'\?\?+', '?', text) 121 | text = re.sub(r'^.。', '', text) 122 | text = '' if len(text) == 1 else text 123 | 124 | return text 125 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "21.2.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] 19 | docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.4" 26 | description = "Cross-platform colored terminal text." 
27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [[package]] 32 | name = "importlib-metadata" 33 | version = "4.8.1" 34 | description = "Read metadata from Python packages" 35 | category = "dev" 36 | optional = false 37 | python-versions = ">=3.6" 38 | 39 | [package.dependencies] 40 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 41 | zipp = ">=0.5" 42 | 43 | [package.extras] 44 | docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] 45 | perf = ["ipython"] 46 | testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pep517", "pyfakefs", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy", "pytest-perf (>=0.9.2)"] 47 | 48 | [[package]] 49 | name = "iniconfig" 50 | version = "1.1.1" 51 | description = "iniconfig: brain-dead simple config-ini parsing" 52 | category = "dev" 53 | optional = false 54 | python-versions = "*" 55 | 56 | [[package]] 57 | name = "joblib" 58 | version = "1.2.0" 59 | description = "Lightweight pipelining with Python functions" 60 | category = "main" 61 | optional = false 62 | python-versions = ">=3.7" 63 | 64 | [[package]] 65 | name = "mojimoji" 66 | version = "0.0.11" 67 | description = "A fast converter between Japanese hankaku and zenkaku characters" 68 | category = "main" 69 | optional = false 70 | python-versions = "*" 71 | 72 | [[package]] 73 | name = "neologdn" 74 | version = "0.5.1" 75 | description = "Japanese text normalizer for mecab-neologd" 76 | category = "main" 77 | optional = false 78 | python-versions = "*" 79 | 80 | [[package]] 81 | name = "packaging" 82 | version = "21.0" 83 | description = "Core utilities for Python packages" 84 | category = "dev" 85 | optional = false 86 | python-versions = ">=3.6" 87 | 88 | [package.dependencies] 89 | pyparsing = ">=2.0.2" 90 | 91 | [[package]] 92 | name = "pluggy" 93 | version = "1.0.0" 94 | description = "plugin and hook calling mechanisms for python" 95 | category = "dev" 96 | optional = false 97 | python-versions = ">=3.6" 98 | 99 | [package.dependencies] 100 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 101 | 102 | [package.extras] 103 | dev = ["pre-commit", "tox"] 104 | testing = ["pytest", "pytest-benchmark"] 105 | 106 | [[package]] 107 | name = "py" 108 | version = "1.10.0" 109 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 110 | category = "dev" 111 | optional = false 112 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 113 | 114 | [[package]] 115 | name = "pyparsing" 116 | version = "2.4.7" 117 | description = "Python parsing module" 118 | category = "dev" 119 | optional = false 120 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 121 | 122 | [[package]] 123 | name = "pytest" 124 | version = "6.2.5" 125 | description = "pytest: simple powerful testing with Python" 126 | category = "dev" 127 | optional = false 128 | python-versions = ">=3.6" 129 | 130 | [package.dependencies] 131 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 132 | attrs = ">=19.2.0" 133 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 134 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 135 | iniconfig = "*" 136 | packaging = "*" 137 | pluggy = ">=0.12,<2.0" 138 | py = ">=1.8.2" 139 | toml = "*" 140 | 141 | [package.extras] 142 
| testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 143 | 144 | [[package]] 145 | name = "toml" 146 | version = "0.10.2" 147 | description = "Python Library for Tom's Obvious, Minimal Language" 148 | category = "dev" 149 | optional = false 150 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 151 | 152 | [[package]] 153 | name = "typing-extensions" 154 | version = "3.10.0.2" 155 | description = "Backported and Experimental Type Hints for Python 3.5+" 156 | category = "dev" 157 | optional = false 158 | python-versions = "*" 159 | 160 | [[package]] 161 | name = "zipp" 162 | version = "3.6.0" 163 | description = "Backport of pathlib-compatible object wrapper for zip files" 164 | category = "dev" 165 | optional = false 166 | python-versions = ">=3.6" 167 | 168 | [package.extras] 169 | docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] 170 | testing = ["func-timeout", "jaraco.itertools", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy"] 171 | 172 | [metadata] 173 | lock-version = "1.1" 174 | python-versions = "^3.7" 175 | content-hash = "bbac8bba46a07688d50cbea79c1dc1c1a7e1e5364a34c50ab4ea8e16d9d4b093" 176 | 177 | [metadata.files] 178 | atomicwrites = [ 179 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 180 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 181 | ] 182 | attrs = [ 183 | {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, 184 | {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, 185 | ] 186 | colorama = [ 187 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 188 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 189 | ] 190 | importlib-metadata = [ 191 | {file = "importlib_metadata-4.8.1-py3-none-any.whl", hash = "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15"}, 192 | {file = "importlib_metadata-4.8.1.tar.gz", hash = "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"}, 193 | ] 194 | iniconfig = [ 195 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 196 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 197 | ] 198 | joblib = [ 199 | {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, 200 | {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, 201 | ] 202 | mojimoji = [ 203 | {file = "mojimoji-0.0.11-cp35-cp35m-macosx_10_14_x86_64.whl", hash = "sha256:b86ee36240e77a414f796630cf3c80acef4e9528277769d1851d00b94af1cc98"}, 204 | {file = "mojimoji-0.0.11-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3d3756f60754ee95847f419f48246a68271d9024c5296d298054229403804002"}, 205 | {file = "mojimoji-0.0.11-cp35-cp35m-win_amd64.whl", hash = "sha256:e257176fe55332ac5dc69735270840b36738a82f1dda3383758a6f36b5918fb6"}, 206 | {file = 
"mojimoji-0.0.11-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:ece991b6529dddb91ae5035dcf3b62b486af675d4ec94fc3ecf7872b66f3df88"}, 207 | {file = "mojimoji-0.0.11-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b7a1fe5726ad7036b096d44e7d014ceac174aec718199e19c6c6d1c2a757f6b9"}, 208 | {file = "mojimoji-0.0.11-cp36-cp36m-win_amd64.whl", hash = "sha256:ae2fcb9fa35f5890ba888780c9d445074717ef988c3171a058f002cd0f3933f4"}, 209 | {file = "mojimoji-0.0.11-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:e551bb3be777643e2e1f7bc40db4d57baedaa6dac6c7875d9a926ffe66b67fb1"}, 210 | {file = "mojimoji-0.0.11-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:642c5e8642fa53c9556cd4c880e09438786dd955c32c7ce5fa2693e16914cdb6"}, 211 | {file = "mojimoji-0.0.11-cp37-cp37m-win_amd64.whl", hash = "sha256:029c5d2646c8ab36bef61abe26ed6bc698f7ffb6a744d82fafd86e233d6669a2"}, 212 | {file = "mojimoji-0.0.11-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:2fb18323ffae39391d77c77b27ed675cbf6e49e10f884ceaf8399be7ca7e0dc5"}, 213 | {file = "mojimoji-0.0.11-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ea9c3d0c37ae7a8e494cb9681dae404de4599d4218e5b5d13c009ab1856d6409"}, 214 | {file = "mojimoji-0.0.11-cp38-cp38-win_amd64.whl", hash = "sha256:304681e2d99db69c552e1e36992cb41b967b46abd1720c1826f9dec678491d56"}, 215 | {file = "mojimoji-0.0.11.tar.gz", hash = "sha256:adef094e1bfd961e40c1fbd2d4664da1be2426d5b31884e27394226af15d50b5"}, 216 | ] 217 | neologdn = [ 218 | {file = "neologdn-0.5.1-cp36-cp36m-win32.whl", hash = "sha256:67e131890ba1b005f366b998bc63d80ce22fea6a5023930cd6b383ee71456d23"}, 219 | {file = "neologdn-0.5.1-cp36-cp36m-win_amd64.whl", hash = "sha256:3d6208fd3e24a7ad4318e59ba3858944a0a06f1f2d3e592f6644c6625c2eda91"}, 220 | {file = "neologdn-0.5.1-cp37-cp37m-win32.whl", hash = "sha256:fd0517f18a9e818e3ca7e1cae31ecf132556bf3a79d2c8c5530d083cefdb3109"}, 221 | {file = "neologdn-0.5.1-cp37-cp37m-win_amd64.whl", hash = "sha256:55e67e4e1f52589a51ade0ffd2eb772251588757e3f15c60bad372fcb613df50"}, 222 | {file = "neologdn-0.5.1-cp38-cp38-win32.whl", hash = "sha256:bcaa99e5635f6b3f171b5261a2b8965b831f03008adb0521081404e4a7f18226"}, 223 | {file = "neologdn-0.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:eb74d99c20b5864fb389d8a843afe67135285d6867c60db30ff1384fbade1d5d"}, 224 | {file = "neologdn-0.5.1-cp39-cp39-win32.whl", hash = "sha256:636c92852a0156a8746874996cbaf114d8816119962ede8f79cb9f634ba9b97f"}, 225 | {file = "neologdn-0.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:d17277c19901eb53f52ccdc8ec829b49441c314fde612d93528be78b64247e0d"}, 226 | {file = "neologdn-0.5.1.tar.gz", hash = "sha256:206afe3e8de50bbe4a7f3209b8bca4ce8d3ea314467f6ca04a5c553acc8c32ba"}, 227 | ] 228 | packaging = [ 229 | {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, 230 | {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, 231 | ] 232 | pluggy = [ 233 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 234 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 235 | ] 236 | py = [ 237 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 238 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 239 | ] 240 | pyparsing = [ 241 | {file = 
"pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 242 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 243 | ] 244 | pytest = [ 245 | {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, 246 | {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, 247 | ] 248 | toml = [ 249 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 250 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 251 | ] 252 | typing-extensions = [ 253 | {file = "typing_extensions-3.10.0.2-py2-none-any.whl", hash = "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7"}, 254 | {file = "typing_extensions-3.10.0.2-py3-none-any.whl", hash = "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"}, 255 | {file = "typing_extensions-3.10.0.2.tar.gz", hash = "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e"}, 256 | ] 257 | zipp = [ 258 | {file = "zipp-3.6.0-py3-none-any.whl", hash = "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"}, 259 | {file = "zipp-3.6.0.tar.gz", hash = "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832"}, 260 | ] 261 | --------------------------------------------------------------------------------