├── src └── text_cleaning │ ├── __init__.py │ ├── main.py │ └── clean_text.py ├── tests ├── test_cleaning │ ├── 002.json │ ├── 008.json │ ├── 013.json │ ├── 004.json │ ├── 010.json │ ├── 012.json │ ├── 007.json │ ├── 011.json │ ├── 003.json │ ├── 005.json │ ├── 006.json │ ├── 001.json │ ├── 014.json │ ├── 000.json │ └── 009.json └── test_cleaning.py ├── pyproject.toml ├── LICENSE ├── Makefile ├── README.md ├── .gitignore └── poetry.lock /src/text_cleaning/__init__.py: -------------------------------------------------------------------------------- 1 | from .clean_text import clean_text 2 | -------------------------------------------------------------------------------- /tests/test_cleaning/002.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "アドバイスありがとう!✋!", 3 | "output_text": "アドバイスありがとう!" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/008.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "本当ですか?(動揺してる)", 3 | "output_text": "本当ですか?(動揺してる)。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/013.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "Let's goですね", 3 | "output_text": "Let。goですね。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/004.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "面白い笑笑笑笑。笑いが止まらんwwwww", 3 | "output_text": "面白い。笑いが止まらん。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/010.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "あい。)おはよう。ヽ( ゚∀゚)ノ┌┛)`Д゚)・;'━!!", 3 | "output_text": "あい。おはよう。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/012.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "これは本当(ある意味嘘(実は本当))のこと。", 3 | "output_text": "これは本当のこと。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/007.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "((o(゚▽゚*)o))おや_(ˇωˇ」∠)_ スヤァ…(笑)", 3 | "output_text": "おや。スヤァ。(笑)。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/011.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "確かに「嘘でしょww」って笑ってたね", 3 | "output_text": "確かに「嘘でしょ。」って笑ってたね。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/003.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "可愛い!。\(^oao^)/お大事になさってください...", 3 | "output_text": "可愛い!お大事になさってください。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/005.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "幸せいっぱいの一年にしましょう。(。>ㅅ<。)よろしく♥", 3 | "output_text": "幸せいっぱいの一年にしましょう。よろしく。" 4 | } -------------------------------------------------------------------------------- 
/tests/test_cleaning/006.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "(・_・_・_)(。口。)(ーー゛)(。三三。)( ´ ▽ ` )ノ((T_T))ヾ(⌒(ノ*•ω•*)", 3 | "output_text": "" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/001.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "@abcde0123 おっとっとwwそうでした✋!よろしくお願いします♪‼", 3 | "output_text": "おっとっと。そうでした!よろしくお願いします。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/014.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "帰ってきたら新年会の応募が届いておりまして、〆切後に参加したくなって", 3 | "output_text": "帰ってきたら新年会の応募が届いておりまして、〆切後に参加したくなって。" 4 | } 5 | -------------------------------------------------------------------------------- /tests/test_cleaning/000.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "それはすご〜〜〜いな!これ見て→\uD83D\uDC46\uD83D\uDC4F\uD83D\uDC95 http://a.bc/defGHIjkl", 3 | "output_text": "それはすごいな!これ見て。" 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning/009.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": "I’ve taken her mail, but I could not follow her.", 3 | "output_text": "I’ve taken her mail, but I could not follow her." 4 | } -------------------------------------------------------------------------------- /tests/test_cleaning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | from typing import Tuple 5 | 6 | import pytest 7 | 8 | from text_cleaning import clean_text 9 | 10 | 11 | def read_test_file(path: str) -> Tuple[str, str]: 12 | with open(path) as f: 13 | dct = json.load(f) 14 | return dct['input_text'], dct['output_text'] 15 | 16 | 17 | test_file_path_pattern = os.path.join(os.path.dirname(__file__), 'test_cleaning', '*.json') 18 | test_cases = [read_test_file(path) for path in sorted(glob.glob(test_file_path_pattern))] 19 | 20 | 21 | @pytest.mark.parametrize('test_case', test_cases) 22 | def test_clean_text(test_case): 23 | input_text, output_text = test_case 24 | assert clean_text(input_text, twitter=True, han2zen=True) == output_text 25 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "text-cleaning" 3 | version = "1.0.0" 4 | description = "A powerful text cleaner for Japanese web texts" 5 | license = "MIT" 6 | authors = ["Kurohashi-Kawahara Lab, Kyoto Univ "] 7 | maintainers = ["Takashi Kodama ", "Nobuhiro Ueda "] 8 | readme = "README.md" 9 | repository = "https://github.com/ku-nlp/text-cleaning" 10 | keywords = [ 11 | "NLP" 12 | ] 13 | classifiers = [ 14 | # "License :: OSI Approved :: BSD License", 15 | "Programming Language :: Python :: 3", 16 | "Programming Language :: Python :: 3.7", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9" 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.7" 23 | mojimoji = "*" 24 | neologdn = "*" 25 | joblib = "*" 26 | 27 | [tool.poetry.dev-dependencies] 28 | pytest = "^6.2" 29 | 30 | [build-system] 31 | requires = ["poetry-core>=1.0.0"] 32 | build-backend = 
"poetry.core.masonry.api" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Takashi KODAMA and Nobuhiro UEDA 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | INPUT_DIR := /somewhere/input 2 | OUTPUT_DIR := /somewhere/output 3 | # input file format. txt or csv or tsv 4 | FILE_FORMAT := txt 5 | NUM_JOBS_PER_MACHINE := 10 6 | TWITTER := 7 | PYTHON := 8 | 9 | INPUT_FILES := $(shell find $(INPUT_DIR) -type f) 10 | REL_PATHS := $(patsubst $(INPUT_DIR)/%,%,$(INPUT_FILES)) 11 | INPUT_EXT := $(suffix $(word 1, $(INPUT_FILES))) 12 | 13 | CAT := cat 14 | CATOUT := cat 15 | ifeq ($(INPUT_EXT),.gz) 16 | CAT := zcat 17 | CATOUT := gzip 18 | endif 19 | ifeq ($(INPUT_EXT),.zip) 20 | CAT := zcat 21 | CATOUT := zip 22 | endif 23 | ifeq ($(INPUT_EXT),.bz2) 24 | CAT := bzcat 25 | CATOUT := bzip2 26 | endif 27 | 28 | ifndef PYTHON 29 | PYTHON := $(shell which python) 30 | endif 31 | 32 | CLEANED_FILES := $(addprefix $(OUTPUT_DIR)/,$(REL_PATHS)) 33 | 34 | CLEANING_ARGS = --file-format $(FILE_FORMAT) 35 | CLEANING_ARGS += --n-jobs $(NUM_JOBS_PER_MACHINE) 36 | ifdef TWITTER 37 | CLEANING_ARGS += --twitter 38 | endif 39 | 40 | .PHONY: all 41 | all: $(CLEANED_FILES) 42 | 43 | $(CLEANED_FILES): $(OUTPUT_DIR)/%: $(INPUT_DIR)/% 44 | mkdir -p $(dir $@) 45 | $(CAT) $< | $(PYTHON) src/text_cleaning/main.py $(CLEANING_ARGS) | $(CATOUT) > $@ || rm $@ 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # text-cleaning: A Japanese powerful text cleaner 2 | 3 | ## Description 4 | This project cleans dirty Japanese texts, which include a lot of emoji and kaomoji 5 | in a whitelist method. 
6 | 7 | ## Cleaning Example 8 | 9 | ```text 10 | INPUT: これはサンプルです(≧∇≦*)!見てみて→http://a.bc/defGHIjkl 11 | OUTPUT: これはサンプルです!見てみて。 12 | 13 | INPUT: 一緒に応援してるよ(o^^o)。ありがとう😃 14 | OUTPUT: 一緒に応援してるよ。ありがとう。 15 | 16 | INPUT: いいぞ〜⸜(* ॑꒳ ॑* )⸝⋆* 17 | OUTPUT: いいぞ。 18 | 19 | INPUT: えっ((((;゚Д゚))))))) 20 | OUTPUT: えっ。 21 | 22 | INPUT: 確かに「嘘でしょww」って笑ってたね 23 | OUTPUT: 確かに「嘘でしょ。」って笑ってたね。 24 | 25 | INPUT: おはようございますヽ(*´∀`)ノ。。今日は雨ですね・・・・・(T_T) 26 | OUTPUT: おはようございます。今日は雨ですね。 27 | 28 | INPUT: (灬º﹃º灬)おいしそうです♡ 29 | OUTPUT: おいしそうです。 30 | 31 | INPUT: 今日の夜、友達とラーメン行くよ(((o(*゚▽゚*)o))) 32 | OUTPUT: 今日の夜、友達とラーメン行くよ。 33 | 34 | # When using the twitter option. 35 | INPUT: @abcde0123 おっとっとwwそうでした✋!!よろしくお願いします♪‼ #挨拶 36 | OUTPUT: おっとっと。そうでした!よろしくお願いします。 37 | ``` 38 | 39 | ## Requirements 40 | - Python 3.7+ 41 | - mojimoji 42 | - neologdn 43 | - joblib 44 | 45 | ## How to Run 46 | 47 | ### Using python script directly 48 | 49 | ```zsh 50 | cat input.txt | python src/text_cleaning/main.py > output.txt 51 | ``` 52 | 53 | ### Using makefile 54 | When input files are located hierarchically in directories, you can clean 55 | them while keeping the directory structure by using the Makefile. 56 | If the input files are compressed, the Makefile detects their format from the 57 | suffix and outputs the cleaned files in the same format. 58 | 59 | ```zsh 60 | make INPUT_DIR=/somewhere/in OUTPUT_DIR=/somewhere/out PYTHON=/somewhere/.venv/bin/python 61 | ``` 62 | 63 | Options: 64 | 65 | - FILE_FORMAT=txt: Format of the input files (txt, csv, or tsv) 66 | - NUM_JOBS_PER_MACHINE=10: The maximum number of concurrently running jobs per machine 67 | - TWITTER=1: Perform Twitter-specific cleaning 68 | - PYTHON: Path to the Python interpreter of a virtual environment 69 | -------------------------------------------------------------------------------- /src/text_cleaning/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from argparse import ArgumentParser 3 | from typing import Optional 4 | 5 | from joblib import Parallel, delayed 6 | 7 | from .clean_text import clean_text 8 | 9 | DELIMITER = {'txt': None, 'csv': ',', 'tsv': '\t'} 10 | JOINER = {'txt': '', 'csv': ',', 'tsv': '\t'} 11 | 12 | 13 | def _clean_texts(input_text: str, file_format: str, twitter: bool, han2zen: bool) -> str: 14 | delimiter: Optional[str] = DELIMITER[file_format] 15 | joiner: str = JOINER[file_format] 16 | return joiner.join(clean_text(text, twitter=twitter, han2zen=han2zen) for text in input_text.split(delimiter)) 17 | 18 | 19 | def main(): 20 | parser = ArgumentParser() 21 | parser.add_argument('-f', '--file-format', default='txt', type=str, choices=['txt', 'csv', 'tsv']) 22 | parser.add_argument('-n', '--n-jobs', default=1, type=int) 23 | parser.add_argument('-t', '--twitter', action='store_true', help='perform twitter-specific cleaning') 24 | parser.add_argument('--han2zen', '--h2z', action='store_true', help='convert hankaku characters to zenkaku ones') 25 | parser.add_argument('-i', '--input-file', type=str) 26 | args = parser.parse_args() 27 | 28 | input_texts = [] 29 | with open(args.input_file, 'rb') if args.input_file else sys.stdin.buffer as f: 30 | for line in f: 31 | try: 32 | line = line.decode('utf-8') 33 | except UnicodeDecodeError: 34 | line = '' 35 | input_texts.append(line.strip()) 36 | 37 | if args.n_jobs == 0: 38 | outputs = [_clean_texts(input_text, args.file_format, args.twitter, args.han2zen) for input_text in input_texts] 39 | else: 40 | outputs = Parallel(n_jobs=args.n_jobs, verbose=10)( 41 | [delayed(_clean_texts)(input_text, 
args.file_format, args.twitter, args.han2zen) for input_text in 42 | input_texts]) 43 | for output in outputs: 44 | print(output) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 
90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # mkdocs documentation 115 | /site 116 | 117 | # mypy 118 | .mypy_cache/ 119 | .dmypy.json 120 | dmypy.json 121 | 122 | # Pyre type checker 123 | .pyre/ 124 | 125 | # Pycharm 126 | .idea 127 | -------------------------------------------------------------------------------- /src/text_cleaning/clean_text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import neologdn 4 | from mojimoji import han_to_zen 5 | 6 | ESCAPE_CODES = [r'<', r'>', r'&', r'"', r' ', r'©'] 7 | 8 | HIRAGANA = r'\u3041-\u3096' 9 | KATAKANA = r'\u30A1-\u30F6' 10 | PROLONGED_SOUND_MARK = r'\u30FC' 11 | KANJI = r'\u3006\u4E00-\u9FFF' # U+3006: 〆 12 | REPEATING_MARK = r'\u3005' 13 | 14 | WHITELIST_PTN = re.compile(rf'[a-zA-Z0-9!?()「」、。{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}{REPEATING_MARK}]') 15 | JP_PTN = re.compile(rf'[{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}]') 16 | 17 | 18 | def clean_text(text: str, twitter: bool, han2zen: bool, repeat: int = 3) -> str: 19 | text = _normalize(text=text, repeat=repeat) 20 | if _is_japanese(text): 21 | if twitter is True: 22 | text = _twitter_preprocess(text=text) 23 | text = _filter(text=text) 24 | if han2zen is True: 25 | text = han_to_zen(text) 26 | return text 27 | 28 | 29 | def _normalize(text: str, repeat: int) -> str: 30 | return neologdn.normalize(text, repeat=repeat) 31 | 32 | 33 | def _is_japanese(string: str) -> bool: 34 | al_num = re.compile(r'^[a-zA-Z0-9()!?,.:;\-\'\"\s]+$') 35 | return al_num.match(string) is None 36 | 37 | 38 | def _twitter_preprocess(text: str) -> str: 39 | replaced_text = re.sub(r'[RT]\w+', '', text) 40 | replaced_text = re.sub(r'[@][a-zA-Z0-9_]+', '', replaced_text) 41 | replaced_text = re.sub(r'#(\w+)', '', replaced_text) 42 | return replaced_text 43 | 44 | 45 | def _replace_punctuation(text: str) -> str: 46 | replaced_text = re.sub(r'、+', '、', text) # "、、、" -> "、" 47 | replaced_text = re.sub(r'[、。]*。[、。]*', '。', replaced_text) 48 | replaced_text = re.sub(r'^[、。!?]', '', replaced_text) 49 | replaced_text = re.sub( 50 | rf'。[a-zA-Z0-9!?「」{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}]。', '。', replaced_text) 51 | return replaced_text 52 | 53 | 54 | def _whitelist_filter(text: str) -> str: 55 | """ 56 | あいうw → あいう。 57 | (あいう)w → (あいう)。 58 | あいう → あいう。 59 | あいう☆ → あいう。 60 | あいう。w 。→ あいう。。。 61 | 62 | """ 63 | ptn = re.compile(rf'[0-9。w{HIRAGANA}{KATAKANA}{PROLONGED_SOUND_MARK}{KANJI}]') 64 | filtered_text = '' 65 | for i, character in enumerate(text): 66 | if WHITELIST_PTN.match(character) and \ 67 | not (character == 'w' and filtered_text and ptn.match(filtered_text[-1])): 68 | filtered_text += character 69 | continue 70 | filtered_text += '。' 71 | filtered_text += '。' 72 | return filtered_text 73 | 74 | 75 | def _delete_kaomoji(text: str) -> str: 76 | text_ = '' 77 | buff = '' 78 | bracket_counter = 0 79 | for c in text: 80 | buff += c 81 | if c == '(': 82 | bracket_counter += 1 83 | elif c == ')': 84 | bracket_counter -= 1 85 | if bracket_counter == 0: 86 | stripped_buff = buff.lstrip('(').rstrip(')') 87 | if all(JP_PTN.match(c) for c in stripped_buff) and stripped_buff: 88 | text_ += buff 89 | 
buff = '' 90 | continue 91 | if bracket_counter == 0: 92 | text_ += buff 93 | buff = '' 94 | return text_ 95 | 96 | 97 | def _filter(text: str) -> str: 98 | text = re.sub(r'(http|https)://([-\w]+\.)+[-\w]+(/[-\w./?%&=]*)?', '', text) 99 | for escape_code in ESCAPE_CODES: 100 | text = re.sub(escape_code, '', text) 101 | text = _whitelist_filter(text=text) 102 | text = _replace_punctuation(text) 103 | 104 | text = re.sub(r'笑笑+', '笑', text) 105 | text = re.sub(r'笑。', '。', text) 106 | 107 | text = re.sub(r'([!?。])[a-zA-Z0-9]+([!?。])', r'\1\2', text) 108 | text = _replace_punctuation(text) 109 | 110 | text = _delete_kaomoji(text) 111 | text = _replace_punctuation(text) 112 | 113 | text = re.sub(r'(。\))|(\(。)', '。', text) 114 | text = re.sub(r'[。!?][ノシノシ]+[。!?]', '。', text) 115 | text = re.sub(r'。([!?])', r'\1', text) 116 | text = re.sub(r'([!?])。', r'\1', text) 117 | text = _replace_punctuation(text) 118 | 119 | text = re.sub(r'!!+', '!', text) 120 | text = re.sub(r'\?\?+', '?', text) 121 | text = re.sub(r'^.。', '', text) 122 | text = '' if len(text) == 1 else text 123 | 124 | return text 125 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "21.2.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] 19 | docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.4" 26 | description = "Cross-platform colored terminal text." 
27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [[package]] 32 | name = "importlib-metadata" 33 | version = "4.8.1" 34 | description = "Read metadata from Python packages" 35 | category = "dev" 36 | optional = false 37 | python-versions = ">=3.6" 38 | 39 | [package.dependencies] 40 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 41 | zipp = ">=0.5" 42 | 43 | [package.extras] 44 | docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] 45 | perf = ["ipython"] 46 | testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pep517", "pyfakefs", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy", "pytest-perf (>=0.9.2)"] 47 | 48 | [[package]] 49 | name = "iniconfig" 50 | version = "1.1.1" 51 | description = "iniconfig: brain-dead simple config-ini parsing" 52 | category = "dev" 53 | optional = false 54 | python-versions = "*" 55 | 56 | [[package]] 57 | name = "joblib" 58 | version = "1.2.0" 59 | description = "Lightweight pipelining with Python functions" 60 | category = "main" 61 | optional = false 62 | python-versions = ">=3.7" 63 | 64 | [[package]] 65 | name = "mojimoji" 66 | version = "0.0.11" 67 | description = "A fast converter between Japanese hankaku and zenkaku characters" 68 | category = "main" 69 | optional = false 70 | python-versions = "*" 71 | 72 | [[package]] 73 | name = "neologdn" 74 | version = "0.5.1" 75 | description = "Japanese text normalizer for mecab-neologd" 76 | category = "main" 77 | optional = false 78 | python-versions = "*" 79 | 80 | [[package]] 81 | name = "packaging" 82 | version = "21.0" 83 | description = "Core utilities for Python packages" 84 | category = "dev" 85 | optional = false 86 | python-versions = ">=3.6" 87 | 88 | [package.dependencies] 89 | pyparsing = ">=2.0.2" 90 | 91 | [[package]] 92 | name = "pluggy" 93 | version = "1.0.0" 94 | description = "plugin and hook calling mechanisms for python" 95 | category = "dev" 96 | optional = false 97 | python-versions = ">=3.6" 98 | 99 | [package.dependencies] 100 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 101 | 102 | [package.extras] 103 | dev = ["pre-commit", "tox"] 104 | testing = ["pytest", "pytest-benchmark"] 105 | 106 | [[package]] 107 | name = "py" 108 | version = "1.10.0" 109 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 110 | category = "dev" 111 | optional = false 112 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 113 | 114 | [[package]] 115 | name = "pyparsing" 116 | version = "2.4.7" 117 | description = "Python parsing module" 118 | category = "dev" 119 | optional = false 120 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 121 | 122 | [[package]] 123 | name = "pytest" 124 | version = "6.2.5" 125 | description = "pytest: simple powerful testing with Python" 126 | category = "dev" 127 | optional = false 128 | python-versions = ">=3.6" 129 | 130 | [package.dependencies] 131 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 132 | attrs = ">=19.2.0" 133 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 134 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 135 | iniconfig = "*" 136 | packaging = "*" 137 | pluggy = ">=0.12,<2.0" 138 | py = ">=1.8.2" 139 | toml = "*" 140 | 141 | [package.extras] 142 
| testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 143 | 144 | [[package]] 145 | name = "toml" 146 | version = "0.10.2" 147 | description = "Python Library for Tom's Obvious, Minimal Language" 148 | category = "dev" 149 | optional = false 150 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 151 | 152 | [[package]] 153 | name = "typing-extensions" 154 | version = "3.10.0.2" 155 | description = "Backported and Experimental Type Hints for Python 3.5+" 156 | category = "dev" 157 | optional = false 158 | python-versions = "*" 159 | 160 | [[package]] 161 | name = "zipp" 162 | version = "3.6.0" 163 | description = "Backport of pathlib-compatible object wrapper for zip files" 164 | category = "dev" 165 | optional = false 166 | python-versions = ">=3.6" 167 | 168 | [package.extras] 169 | docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] 170 | testing = ["func-timeout", "jaraco.itertools", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy"] 171 | 172 | [metadata] 173 | lock-version = "1.1" 174 | python-versions = "^3.7" 175 | content-hash = "bbac8bba46a07688d50cbea79c1dc1c1a7e1e5364a34c50ab4ea8e16d9d4b093" 176 | 177 | [metadata.files] 178 | atomicwrites = [ 179 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 180 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 181 | ] 182 | attrs = [ 183 | {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, 184 | {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, 185 | ] 186 | colorama = [ 187 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 188 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 189 | ] 190 | importlib-metadata = [ 191 | {file = "importlib_metadata-4.8.1-py3-none-any.whl", hash = "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15"}, 192 | {file = "importlib_metadata-4.8.1.tar.gz", hash = "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"}, 193 | ] 194 | iniconfig = [ 195 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 196 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 197 | ] 198 | joblib = [ 199 | {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, 200 | {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, 201 | ] 202 | mojimoji = [ 203 | {file = "mojimoji-0.0.11-cp35-cp35m-macosx_10_14_x86_64.whl", hash = "sha256:b86ee36240e77a414f796630cf3c80acef4e9528277769d1851d00b94af1cc98"}, 204 | {file = "mojimoji-0.0.11-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3d3756f60754ee95847f419f48246a68271d9024c5296d298054229403804002"}, 205 | {file = "mojimoji-0.0.11-cp35-cp35m-win_amd64.whl", hash = "sha256:e257176fe55332ac5dc69735270840b36738a82f1dda3383758a6f36b5918fb6"}, 206 | {file = 
"mojimoji-0.0.11-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:ece991b6529dddb91ae5035dcf3b62b486af675d4ec94fc3ecf7872b66f3df88"}, 207 | {file = "mojimoji-0.0.11-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b7a1fe5726ad7036b096d44e7d014ceac174aec718199e19c6c6d1c2a757f6b9"}, 208 | {file = "mojimoji-0.0.11-cp36-cp36m-win_amd64.whl", hash = "sha256:ae2fcb9fa35f5890ba888780c9d445074717ef988c3171a058f002cd0f3933f4"}, 209 | {file = "mojimoji-0.0.11-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:e551bb3be777643e2e1f7bc40db4d57baedaa6dac6c7875d9a926ffe66b67fb1"}, 210 | {file = "mojimoji-0.0.11-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:642c5e8642fa53c9556cd4c880e09438786dd955c32c7ce5fa2693e16914cdb6"}, 211 | {file = "mojimoji-0.0.11-cp37-cp37m-win_amd64.whl", hash = "sha256:029c5d2646c8ab36bef61abe26ed6bc698f7ffb6a744d82fafd86e233d6669a2"}, 212 | {file = "mojimoji-0.0.11-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:2fb18323ffae39391d77c77b27ed675cbf6e49e10f884ceaf8399be7ca7e0dc5"}, 213 | {file = "mojimoji-0.0.11-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ea9c3d0c37ae7a8e494cb9681dae404de4599d4218e5b5d13c009ab1856d6409"}, 214 | {file = "mojimoji-0.0.11-cp38-cp38-win_amd64.whl", hash = "sha256:304681e2d99db69c552e1e36992cb41b967b46abd1720c1826f9dec678491d56"}, 215 | {file = "mojimoji-0.0.11.tar.gz", hash = "sha256:adef094e1bfd961e40c1fbd2d4664da1be2426d5b31884e27394226af15d50b5"}, 216 | ] 217 | neologdn = [ 218 | {file = "neologdn-0.5.1-cp36-cp36m-win32.whl", hash = "sha256:67e131890ba1b005f366b998bc63d80ce22fea6a5023930cd6b383ee71456d23"}, 219 | {file = "neologdn-0.5.1-cp36-cp36m-win_amd64.whl", hash = "sha256:3d6208fd3e24a7ad4318e59ba3858944a0a06f1f2d3e592f6644c6625c2eda91"}, 220 | {file = "neologdn-0.5.1-cp37-cp37m-win32.whl", hash = "sha256:fd0517f18a9e818e3ca7e1cae31ecf132556bf3a79d2c8c5530d083cefdb3109"}, 221 | {file = "neologdn-0.5.1-cp37-cp37m-win_amd64.whl", hash = "sha256:55e67e4e1f52589a51ade0ffd2eb772251588757e3f15c60bad372fcb613df50"}, 222 | {file = "neologdn-0.5.1-cp38-cp38-win32.whl", hash = "sha256:bcaa99e5635f6b3f171b5261a2b8965b831f03008adb0521081404e4a7f18226"}, 223 | {file = "neologdn-0.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:eb74d99c20b5864fb389d8a843afe67135285d6867c60db30ff1384fbade1d5d"}, 224 | {file = "neologdn-0.5.1-cp39-cp39-win32.whl", hash = "sha256:636c92852a0156a8746874996cbaf114d8816119962ede8f79cb9f634ba9b97f"}, 225 | {file = "neologdn-0.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:d17277c19901eb53f52ccdc8ec829b49441c314fde612d93528be78b64247e0d"}, 226 | {file = "neologdn-0.5.1.tar.gz", hash = "sha256:206afe3e8de50bbe4a7f3209b8bca4ce8d3ea314467f6ca04a5c553acc8c32ba"}, 227 | ] 228 | packaging = [ 229 | {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, 230 | {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, 231 | ] 232 | pluggy = [ 233 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 234 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 235 | ] 236 | py = [ 237 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 238 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 239 | ] 240 | pyparsing = [ 241 | {file = 
"pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 242 | {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 243 | ] 244 | pytest = [ 245 | {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, 246 | {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, 247 | ] 248 | toml = [ 249 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 250 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 251 | ] 252 | typing-extensions = [ 253 | {file = "typing_extensions-3.10.0.2-py2-none-any.whl", hash = "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7"}, 254 | {file = "typing_extensions-3.10.0.2-py3-none-any.whl", hash = "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"}, 255 | {file = "typing_extensions-3.10.0.2.tar.gz", hash = "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e"}, 256 | ] 257 | zipp = [ 258 | {file = "zipp-3.6.0-py3-none-any.whl", hash = "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"}, 259 | {file = "zipp-3.6.0.tar.gz", hash = "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832"}, 260 | ] 261 | --------------------------------------------------------------------------------