├── .gitignore ├── .vscode ├── env ├── launch.json └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── bareunpy ├── __init__.py ├── _corrector.py ├── _custom_dict.py ├── _custom_dict_client.py ├── _lang_service_client.py ├── _revision_service_client.py ├── _tagger.py └── _tokenizer.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── test_tagger.py └── test_tokenizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.vscode/env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=bareunpy:. -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // IntelliSense를 사용하여 가능한 특성에 대해 알아보세요. 3 | // 기존 특성에 대한 설명을 보려면 가리킵니다. 4 | // 자세한 내용을 보려면 https://go.microsoft.com/fwlink/?linkid=830387을(를) 방문하세요. 
5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: pytest", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "module": "pytest", 12 | "justMyCode": true, 13 | "args": ["tests/"] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.defaultInterpreterPath": "${workspaceFolder}/venv/bin/python", 3 | "python.envFile": "${workspaceFolder}/.vscode/.env", 4 | "python.autoComplete.extraPaths": [ 5 | "bareunpy" 6 | ], 7 | "python.testing.pytestArgs": [ 8 | "tests" 9 | ], 10 | "python.testing.unittestEnabled": true, 11 | "python.testing.pytestEnabled": true 12 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020-2023, BAIKAL AI Inc. and Korea Press Foundation. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: venv-local venv-pip-upgrade clean \ 2 | venv-upgrade-build venv-pytest test build \ 3 | venv-poetry poetry venv 4 | 5 | PIP3=venv/bin/pip3 6 | PY3=venv/bin/python3 7 | POETRY=venv/bin/poetry 8 | 9 | all: build 10 | 11 | venv-local: 12 | @test -d venv || python3 -m venv venv 13 | 14 | venv-pip-upgrade: venv-local 15 | @$(PIP3) install --upgrade pip 2> /dev/null 16 | 17 | venv/req-install.log: requirements.txt venv-pip-upgrade 18 | @$(PY3) -c "import pkg_resources; pkg_resources.require(open('requirements.txt',mode='r'))" 2> /dev/null \ 19 | || (echo "installing"; $(PIP3) install -r $< --log $@) 20 | 21 | venv: venv/req-install.log 22 | @. venv/bin/activate 23 | 24 | 25 | clean: 26 | rm -rf venv 27 | find . 
-iname "*.pyc" -delete 28 | 29 | venv-poetry: venv 30 | @$(PY3) -m pip install --upgrade poetry 2> /dev/null 31 | 32 | build: venv-poetry 33 | @$(POETRY) build 34 | 35 | publish: build 36 | @$(POETRY) publish 37 | 38 | venv-pytest: venv 39 | @$(PY3) -m pip install --upgrade pytest &> /dev/null 40 | 41 | testall: venv 42 | @$(PY3) -m pytest tests/ 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is this? 2 | 3 | `bareunpy` is the python 3 library for bareun. 4 | 5 | Bareun is a Korean NLP, 6 | which provides tokenizing, POS tagging for Korean. 7 | 8 | ## How to install 9 | 10 | ```shell 11 | pip3 install bareunpy 12 | ``` 13 | 14 | ## How to get bareun 15 | - Go to https://bareun.ai/. 16 | - With registration, for the first time, you can get a API-KEY to use it freely. 17 | - With API-KEY, you can install the `bareun1` server. 18 | - Or you can make a call to use this `bareunpy` library to any servers. 19 | - Or use docker image. See https://hub.docker.com/r/bareunai/bareun 20 | ```shell 21 | docker pull bareunai/bareun:latest 22 | ``` 23 | 24 | ## How to use, tagger 25 | 26 | ```python 27 | import sys 28 | import google.protobuf.text_format as tf 29 | from bareunpy import Tagger 30 | 31 | # You can get an API-KEY from https://bareun.ai/ 32 | # Please note that you need to sign up and verify your email. 33 | # 아래에 "https://bareun.ai/"에서 이메일 인증 후 발급받은 API KEY("koba-...")를 입력해주세요. "로그인-내정보 확인" 34 | API_KEY = "koba-ABCDEFG-1234567-LMNOPQR-7654321" # <- 본인의 API KEY로 교체(Replace this with your own API KEY) 35 | 36 | # If you have your own localhost bareun. 37 | my_tagger = Tagger(API_KEY, 'localhost') 38 | # or if you have your own bareun which is running on 10.8.3.211:15656. 39 | my_tagger = Tagger(API_KEY, '10.8.3.211', 15656) 40 | 41 | 42 | # print results. 43 | res = tagger.tags(["안녕하세요.", "반가워요!"]) 44 | 45 | # get protobuf message. 46 | m = res.msg() 47 | tf.PrintMessage(m, out=sys.stdout, as_utf8=True) 48 | print(tf.MessageToString(m, as_utf8=True)) 49 | print(f'length of sentences is {len(m.sentences)}') 50 | ## output : 2 51 | print(f'length of tokens in sentences[0] is {len(m.sentences[0].tokens)}') 52 | print(f'length of morphemes of first token in sentences[0] is {len(m.sentences[0].tokens[0].morphemes)}') 53 | print(f'lemma of first token in sentences[0] is {m.sentences[0].tokens[0].lemma}') 54 | print(f'first morph of first token in sentences[0] is {m.sentences[0].tokens[0].morphemes[0]}') 55 | print(f'tag of first morph of first token in sentences[0] is {m.sentences[0].tokens[0].morphemes[0].tag}') 56 | 57 | ## Advanced usage. 58 | for sent in m.sentences: 59 | for token in sent.tokens: 60 | for m in token.morphemes: 61 | print(f'{m.text.content}/{m.tag}:{m.probability}:{m.out_of_vocab}) 62 | 63 | # get json object 64 | jo = res.as_json() 65 | print(jo) 66 | 67 | # get tuple of pos tagging. 
67 | # get tuple of pos tagging.
68 | pa = res.pos()
69 | print(pa)
70 | # other methods
71 | ma = res.morphs()
72 | print(ma)
73 | na = res.nouns()
74 | print(na)
75 | va = res.verbs()
76 | print(va)
77 |
78 | # custom dictionary
79 | cust_dic = my_tagger.custom_dict("my")
80 | cust_dic.copy_np_set({'내고유명사', '우리집고유명사'})
81 | cust_dic.copy_cp_set({'코로나19'})
82 | cust_dic.copy_cp_caret_set({'코로나^백신', '독감^백신'})
83 | cust_dic.update()
84 |
85 | # load prev custom dict
86 | cust_dict2 = my_tagger.custom_dict("my")
87 | cust_dict2.load()
88 |
89 | my_tagger.set_domain('my')
90 | my_tagger.pos('코로나19는 언제 끝날까요?')
91 | ```
92 |
93 |
94 | ## How to use, tokenizer
95 |
96 | ```python
97 | import sys
98 | import google.protobuf.text_format as tf
99 | from bareunpy import Tokenizer
100 |
101 | # You can get an API-KEY from https://bareun.ai/
102 | # Please note that you need to sign up and verify your email.
103 | # Enter the API KEY ("koba-...") issued at https://bareun.ai/ after e-mail verification ("Login > My Info").
104 | API_KEY = "koba-ABCDEFG-1234567-LMNOPQR-7654321"  # <- Replace this with your own API KEY
105 |
106 | # If you have your own localhost bareun.
107 | my_tokenizer = Tokenizer(API_KEY, 'localhost')
108 | # or if you have your own bareun which is running on 10.8.3.211:15656.
109 | my_tokenizer = Tokenizer(API_KEY, '10.8.3.211', 15656)
110 |
111 |
112 | # print results.
113 | tokenized = my_tokenizer.tokenize_list(["안녕하세요.", "반가워요!"])
114 |
115 | # get protobuf message.
116 | m = tokenized.msg()
117 | tf.PrintMessage(m, out=sys.stdout, as_utf8=True)
118 | print(tf.MessageToString(m, as_utf8=True))
119 | print(f'length of sentences is {len(m.sentences)}')
120 | ## output : 2
121 | print(f'length of tokens in sentences[0] is {len(m.sentences[0].tokens)}')
122 | print(f'length of segments of first token in sentences[0] is {len(m.sentences[0].tokens[0].segments)}')
123 | print(f'tagged of first token in sentences[0] is {m.sentences[0].tokens[0].tagged}')
124 | print(f'first segment of first token in sentences[0] is {m.sentences[0].tokens[0].segments[0]}')
125 | print(f'hint of first morph of first token in sentences[0] is {m.sentences[0].tokens[0].segments[0].hint}')
126 |
127 | ## Advanced usage.
128 | for sent in m.sentences:
129 |     for token in sent.tokens:
130 |         for m in token.segments:
131 |             print(f'{m.text.content}/{m.hint}')
132 |
133 | # get json object
134 | jo = tokenized.as_json()
135 | print(jo)
136 |
137 | # get tuple of segments
138 | ss = tokenized.segments()
139 | print(ss)
140 | ns = tokenized.nouns()
141 | print(ns)
142 | vs = tokenized.verbs()
143 | print(vs)
144 | # postpositions: 조사
145 | ps = tokenized.postpositions()
146 | print(ps)
147 | # Adverbs, 부사
148 | ass = tokenized.adverbs()
149 | print(ass)
150 | ss = tokenized.symbols()
151 | print(ss)
152 |
153 | ```
154 |
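The `Tokenized` result also exposes a few more extractors defined in `bareunpy/_tokenizer.py`. The short sketch below reuses the `tokenized` object from the example above; the variable names are only illustrative.

```python
# endings (어미), prenouns/determiners (관형사), interjections (감탄사)
es = tokenized.endings()
print(es)
mm = tokenized.prenouns()
print(mm)
ij = tokenized.interjections()
print(ij)
```
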
"로그인-내정보 확인" 162 | API_KEY = "koba-ABCDEFG-1234567-LMNOPQR-7654321" # <- 본인의 API KEY로 교체(Replace this with your own API KEY) 163 | 164 | # Initialize Corrector 165 | corrector = Corrector(API_KEY) 166 | 167 | # Single sentence correction 168 | response = corrector.correct_error("영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었다.") 169 | print(f"Original: {response.origin}") 170 | print(f"Corrected: {response.revised}") 171 | corrector.print_results(response) 172 | 173 | # Multiple sentences correction 174 | responses = corrector.correct_error_list([ 175 | "어머니 께서 만들어주신김치찌게가너무맵다며동생이울어버렸다.", 176 | "영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었다." 177 | ]) 178 | for res in responses: 179 | print(f"Original: {res.origin}") 180 | print(f"Corrected: {res.revised}") 181 | 182 | corrector.print_results(responses) 183 | 184 | # JSON output 185 | corrector.print_as_json(response) 186 | 187 | ``` -------------------------------------------------------------------------------- /bareunpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | bareunpy 3 | ===== 4 | Provides 5 | 1. a Korean Part-Of-Speech Tagger as bareun client 6 | 2. Multiple custom dictionaries which is kept in the your bareun server. 7 | 8 | 9 | How to use the documentation 10 | ---------------------------- 11 | Full documentation for bareun is available in 12 | installable tarball or docker images. 13 | - see `docs/intro.html` at installable tarball. 14 | - or `http://localhost:5757/intro.html` after running docker. 15 | 16 | The docstring examples assume that `bareunpy` has been imported as `brn`:: 17 | >>> import bareunpy as brn 18 | 19 | Use the built-in ``help`` function to view a class's docstring:: 20 | >>> help(brn.Tagger) 21 | ... 22 | 23 | Classes 24 | ------- 25 | Tagger 26 | the bareun POS tagger for Korean 27 | `from bareunpy import Tagger` 28 | Tagged 29 | Wrapper for tagged output 30 | `from bareunpy import Tagged` 31 | CustomDict 32 | Custom dictionary for Korean. 33 | `from bareunpy import CustomDict` 34 | 35 | Version 36 | ------- 37 | ``` 38 | import bareunpy as brn 39 | print(brn.version) 40 | print(brn.bareun_version) 41 | ``` 42 | 43 | Get bareun 44 | ---------------------------- 45 | - Use docker, https://hub.docker.com/r/bareunai/bareun 46 | - Or visit https://bareun.ai/ 47 | """ 48 | 49 | import sys 50 | import os 51 | 52 | from bareunpy._tagger import Tagger, Tagged 53 | from bareunpy._tokenizer import Tokenizer, Tokenized 54 | from bareunpy._corrector import Corrector 55 | from bareunpy._custom_dict import CustomDict 56 | from bareunpy._custom_dict_client import CustomDictionaryServiceClient 57 | from bareunpy._lang_service_client import BareunLanguageServiceClient 58 | 59 | version = "1.7.0" 60 | bareun_version = "3.0.0" 61 | -------------------------------------------------------------------------------- /bareunpy/_corrector.py: -------------------------------------------------------------------------------- 1 | import grpc 2 | import json 3 | from sys import stdout 4 | from typing import IO, List, Union 5 | from google.protobuf.json_format import MessageToDict 6 | 7 | import bareun.revision_service_pb2 as pb 8 | import bareun.lang_common_pb2 as lpb 9 | from ._revision_service_client import BareunRevisionServiceClient 10 | 11 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 12 | 13 | 14 | class Corrector: 15 | """ 16 | Corrector는 맞춤법 교정 서비스를 제공하는 클래스입니다. 17 | 18 | .. 
code-block:: python 19 | :emphasize-lines: 1 20 | >>> from bareunpy import Corrector 21 | >>> corrector = Corrector(apikey="koba-YOURKEY") 22 | 23 | >>> response = corrector.correct_error("영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었더니 고은 꽃이 피었다.") 24 | >>> corrector.print_results(response) 25 | === 맞춤법 검사 결과 1=== 26 | 원문: 영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었더니 고은 꽃이 피었다. 27 | 교정문: 영수도 줄기가 얇아서 시들 것 같은 꽃에 물을 주었더니 고운 꽃이 피었다. 28 | 29 | === 교정된 문장들 === 30 | [1] 원문: 영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었더니 고은 꽃이 피었다. 31 | 교정문: 영수도 줄기가 얇아서 시들 것 같은 꽃에 물을 주었더니 고운 꽃이 피었다. 32 | === 수정 블록 === 33 | 1-1 원문: 영수 도 34 | 교정문: 영수도 35 | 수정 세부사항: 36 | - 조사는 그 앞말에 붙여 쓴다. (일반) 37 | 1-2 ... 38 | 39 | 40 | :param apikey: str. Bareun API 키 41 | :param host: str. gRPC 서버 호스트, 로컬에 바른 서버 설치시 'localhost' 입력 (기본값: nlp.bareun.ai) 42 | :param port: int. gRPC 서버 포트 (기본값: 5656) 43 | """ 44 | 45 | def __init__(self, apikey: str, host: str = "", port: int = 5656): 46 | """ 47 | Corrector 초기화 48 | 49 | Args: 50 | apikey (str): API 키 51 | host (str): gRPC 서버 호스트 52 | port (int): gRPC 서버 포트 53 | """ 54 | if host: 55 | host = host.strip() 56 | if apikey: 57 | apikey = apikey.strip() 58 | if host == "" or host is None: 59 | self.host = 'nlp.bareun.ai' 60 | else: 61 | self.host = host 62 | 63 | if port is not None: 64 | self.port = port 65 | else: 66 | self.port = 5656 67 | 68 | self.channel = grpc.insecure_channel( 69 | f"{self.host}:{self.port}", 70 | options=[ 71 | ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 72 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), 73 | ] 74 | ) 75 | self.client = BareunRevisionServiceClient(self.channel, apikey, self.host, self.port) 76 | 77 | def correct_error(self, content: str, 78 | custom_dicts: List[str] = [], 79 | config: Union[pb.RevisionConfig, None] = None) -> pb.CorrectErrorResponse: 80 | """ 81 | 맞춤법 교정 요청 82 | 83 | Args: 84 | content (str): 교정을 요청할 문장 85 | custom_dicts (List[str]): 커스텀 도메인 정보 86 | config Union[pb.RevisionConfig, None] : 요청 설정 87 | 88 | Returns: 89 | pb.CorrectErrorResponse: 교정 결과 90 | """ 91 | request = pb.CorrectErrorRequest( 92 | document=lpb.Document(content=content, language="ko_KR"), 93 | encoding_type=lpb.EncodingType.UTF32, 94 | ) 95 | if len(custom_dicts): 96 | request.custom_dict_names.extend(custom_dicts) 97 | if config != None: 98 | request.config.CopyFrom(config) 99 | 100 | return self.client.correct_error(request) 101 | 102 | def print_results(self, res: pb.CorrectErrorResponse, out: IO = stdout) -> None: 103 | """ 104 | 교정 결과를 출력 105 | 106 | Args: 107 | response pb.CorrectErrorResponse: 교정 결과 또는 교정 결과의 리스트 108 | out (IO): 출력 대상 (기본값: stdout) 109 | """ 110 | print(f'원문: {res.origin}', file=out) 111 | print(f'교정: {res.revised}', file=out) 112 | 113 | print("\n=== 교정된 문장들 ===", file=out) 114 | 115 | for sent in res.revised_sentences: 116 | print(f" 원문: {sent.origin}", file=out) 117 | print(f"교정문: {sent.revised}", file=out) 118 | 119 | for block in res.revised_blocks: 120 | print(f'원문:{block.origin.content} offset:{block.origin.begin_offset}, length:{block.origin.length}', file=out) 121 | print(f'대표 교정: {block.revised}', file=out) 122 | for rev in block.revisions: 123 | print(f' 교정: {rev.revised}, 카테고리:{rev.category}, 도움말 {res.helps[rev.help_id].comment}') 124 | 125 | for cleanup in res.whitespace_cleanup_ranges: 126 | print(f'공백제거: offset:{cleanup.offset} length:{cleanup.length} position: {cleanup.position}') 127 | 128 | 129 | def as_json(self, response: pb.CorrectErrorResponse) -> dict: 130 | """ 131 | 교정 결과를 JSON 형식으로 변환 132 | 133 | Args: 134 | response 
(Union[pb.CorrectErrorResponse, List[pb.CorrectErrorResponse]]): 교정 결과 또는 교정 결과의 리스트 135 | 136 | Returns: 137 | Union[dict, List[dict]]: JSON 형식으로 변환된 결과 138 | """ 139 | return MessageToDict(response, True) 140 | 141 | def as_json_str(self, response: pb.CorrectErrorResponse) -> str: 142 | """ 143 | 교정 결과를 JSON 문자열로 변환 144 | 145 | Args: 146 | response (Union[pb.CorrectErrorResponse, List[pb.CorrectErrorResponse]]): 교정 결과 또는 교정 결과의 리스트 147 | 148 | Returns: 149 | str: JSON 문자열로 변환된 결과 150 | """ 151 | json_data = self.as_json(response) 152 | return json.dumps(json_data, ensure_ascii=False, indent=2) 153 | 154 | def print_as_json(self, response: pb.CorrectErrorResponse, out: IO = stdout) -> None: 155 | """ 156 | 교정 결과를 JSON 형식으로 출력 157 | 158 | Args: 159 | response (Union[pb.CorrectErrorResponse, List[pb.CorrectErrorResponse]]): 교정 결과 또는 교정 결과의 리스트 160 | out (IO): 출력 대상 (기본값: stdout) 161 | """ 162 | json_data = self.as_json(response) 163 | json.dump(json_data, out, ensure_ascii=False, indent=2) 164 | -------------------------------------------------------------------------------- /bareunpy/_custom_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import List 4 | import grpc 5 | from ._custom_dict_client import CustomDictionaryServiceClient 6 | from bareun.custom_dict_pb2 import CustomDictionary 7 | from bareun.dict_common_pb2 import DictSet 8 | 9 | 10 | def read_dic_file(fn :str) -> set: 11 | """ 12 | 사용자 사전의 파일을 읽어들입니다. 13 | 14 | Args: 15 | fn (str): 사용자 사전 파일 이름 16 | 17 | Returns: 18 | set: 사용자 사전을 set 형식으로 만들어서 돌려줍니다. 19 | """ 20 | dict_set = set() 21 | with open(fn, 'r') as r: 22 | while True: 23 | w = r.readline() 24 | if not w: 25 | break 26 | if w[0] != '#': 27 | w2 = w.strip() 28 | if len(w2) > 0: 29 | dict_set.add(w2) 30 | return dict_set 31 | 32 | 33 | def pb_map_to_set(ds: DictSet) -> set: 34 | """ 35 | DictSet을 사전으로 변환합니다. 36 | 37 | Args: 38 | ds (DictSet): DictSet 객체 39 | 40 | Returns: 41 | set: 중복이 없는 사전 객체 42 | """ 43 | ret = set() 44 | for k in ds.items.keys(): 45 | ret.add(k) 46 | return ret 47 | 48 | class CustomDict(): 49 | """ 50 | 사용자 사전을 쉽게 사용하도록 해주는 래퍼(wrapper). 51 | 52 | 'CustomDict' . 53 | :ref:`optional-installations`. 54 | .. code-block:: python 55 | :emphasize-lines: 1 56 | >>> import bareunpy as brn 57 | >>> tagger = brn.Tagger() 58 | >>> cd = tagger.custom_dict("law") 59 | >>> # or 60 | >>> cd = brn.CustomDict("law", "localhost", 5656) 61 | >>> cd.read_cp_set_from_file("my_np_set.txt") 62 | >>> cd.copy_cp_set(set(['새단어', '코로나19', 'K방역'])) 63 | >>> cd.read_cp_caret_set_from_file('my_cp_caret.txt') 64 | >>> cd.copy_vv_set(set(['카톡하', '신박하다'])) 65 | >>> cd.copy_va_set(set(['드라마틱하', '판타스틱하'])) 66 | >>> cd.update() 67 | >>> ## copy data from server 68 | >>> cd2 = tagger.custom_dict("law") 69 | >>> custom_dict = cd2.get() 70 | >>> # cd2.save(dir="my_dir") 71 | """ 72 | 73 | def __init__(self, apikey:str, domain: str, channel: grpc.Channel): 74 | """ 75 | 사용자 사전 래퍼(wrapper)의 생성자 76 | 77 | Args: 78 | domain (str): 사용자 사전의 이름, 반드시 지정되어야 합니다. 79 | channel(grpc.Channel): 원격에 연결할 정보 80 | Raises: 81 | ValueError: 사용자 사전의 이름이 없으면 에러를 발생시킵니다. 
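
        Example:
            A minimal sketch of constructing the wrapper directly; the server address
            below is an assumption, replace it with your own bareun server.

            >>> channel = grpc.insecure_channel("nlp.bareun.ai:5656")
            >>> cd = CustomDict("koba-YOURKEY", "my_domain", channel)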
82 | """ 83 | self.domain = domain 84 | if domain is None: 85 | raise ValueError("domain name must be specified.") 86 | 87 | self.stub = CustomDictionaryServiceClient(channel, apikey) 88 | self.cp_set = set() 89 | self.np_set = set() 90 | self.cp_caret_set = set() 91 | self.vv_set = set() 92 | self.va_set = set() 93 | 94 | def read_np_set_from_file(self, fn: str): 95 | """ 96 | 고유명사 사전을 파일에서 읽어들입니다. 97 | 98 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 99 | 100 | Args: 101 | fn (str): 고유명사 파일 이름 102 | """ 103 | self.np_set = read_dic_file(fn) 104 | 105 | def read_cp_set_from_file(self, fn: str): 106 | """ 107 | 복합명사 사전을 파일에서 읽어들입니다. 108 | 109 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 110 | 111 | Args: 112 | fn (str): 복합명사 파일 이름 113 | """ 114 | self.cp_set = read_dic_file(fn) 115 | 116 | def read_cp_caret_set_from_file(self, fn: str): 117 | """ 118 | 복합명사 분리 사전을 파일에서 읽어들입니다. 119 | 120 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 121 | 122 | Args: 123 | fn (str): 복합명사 분리 사전 파일 이름 124 | """ 125 | self.cp_caret_set = read_dic_file(fn) 126 | 127 | def read_vv_set_from_file(self, fn: str): 128 | """ 129 | 동사 사전을 파일에서 읽어들입니다. 130 | 131 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 132 | 133 | Args: 134 | fn (str): 동사 사전 파일 이름 135 | """ 136 | self.vv_set = read_dic_file(fn) 137 | 138 | def read_va_set_from_file(self, fn: str): 139 | """ 140 | 형용사 사전을 파일에서 읽어들입니다. 141 | 142 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 143 | 144 | Args: 145 | fn (str): 형용사 사전 파일 이름 146 | """ 147 | self.va_set = read_dic_file(fn) 148 | 149 | def copy_np_set(self, dict_set: set): 150 | """ 151 | 집합을 고유명사 사전으로 지정합니다. 152 | 153 | Args: 154 | dict_set (set): 고유명사 사전 155 | """ 156 | self.np_set = dict_set 157 | 158 | def copy_cp_set(self, dict_set: set): 159 | """ 160 | 집합을 복합명사 사전으로 지정합니다. 161 | 162 | Args: 163 | dict_set (set): 복합명사 사전 164 | """ 165 | self.cp_set = dict_set 166 | 167 | def copy_cp_caret_set(self, dict_set: set): 168 | """ 169 | 집합을 복합명사 분리 사전으로 지정합니다. 170 | 171 | Args: 172 | dict_set (set): 복합명사 분리 사전 173 | """ 174 | self.cp_caret_set = dict_set 175 | 176 | def copy_vv_set(self, dict_set: set): 177 | """ 178 | 집합을 동사 사전으로 지정합니다. 179 | 180 | Args: 181 | dict_set (set): 동사 사전 182 | """ 183 | self.vv_set = dict_set 184 | 185 | def copy_va_set(self, dict_set: set): 186 | """ 187 | 집합을 형용사 사전으로 지정합니다. 188 | 189 | Args: 190 | dict_set (set): 형용사 사전 191 | """ 192 | self.va_set = dict_set 193 | 194 | def update(self) -> bool: 195 | """ 196 | 복합명사 사전을 바이칼 NLP 서버에 갱신합니다. 197 | 198 | Raises: 199 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 200 | 201 | Returns: 202 | bool: 갱신이 성공하면 참을 돌려줍니다. 203 | """ 204 | return self.stub.update(self.domain, 205 | self.np_set, 206 | self.cp_set, 207 | self.cp_caret_set, 208 | self.vv_set, 209 | self.va_set) 210 | 211 | def get(self) -> CustomDictionary: 212 | """ 213 | 사용자 사전의 내용을 가져옵니다. 214 | 가져온 결과는 현재 설정된 사전의 내용을 반영하지 않습니다. 215 | 216 | Raises: 217 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 218 | 219 | Returns: 220 | pb.CustomDictionary: 사용자 사전 데이터 전체를 담고 있는 protobuf 메시지 221 | """ 222 | return self.stub.get(self.domain) 223 | 224 | 225 | def load(self): 226 | """ 227 | 서버에 저정되어 있는 사용자 사전을 모두 가져옵니다. 228 | """ 229 | try: 230 | d = self.stub.get(self.domain) 231 | self.np_set = pb_map_to_set(d.np_set) 232 | self.cp_caret_set = pb_map_to_set(d.cp_caret_set) 233 | self.cp_set = pb_map_to_set(d.cp_set) 234 | except Exception as e: 235 | pass 236 | 237 | 238 | def clear(self) -> List[str]: 239 | """ 240 | 사용자 사전의 내용을 삭제합니다. 241 | 242 | Raises: 243 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 
244 | 245 | Returns: 246 | List[str]: 삭제한 사용자 사전의 이름 247 | """ 248 | 249 | self.np_set.clear() 250 | self.cp_set.clear() 251 | self.cp_caret_set.clear() 252 | return self.stub.remove([self.domain]) 253 | -------------------------------------------------------------------------------- /bareunpy/_custom_dict_client.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import grpc 4 | from google.protobuf.empty_pb2 import Empty 5 | 6 | import bareun.custom_dict_pb2 as pb 7 | import bareun.custom_dict_pb2_grpc as cds 8 | import bareun.dict_common_pb2 as common 9 | 10 | 11 | 12 | def build_dict_set(domain: str, name: str, dict_set: set) -> common.DictSet: 13 | """ 14 | 주어진 파라미터를 사용하여 사용자 사전의 한 표현 형태인 DictSet protobuf 메시지를 만듭니다. 15 | 16 | Args: 17 | domain (str): 사용자 사전의 이름 18 | name (str): 사용자 사전에 대한 설명 19 | dict_set (set): 사용자 사전에 들어가야 할 단어들의 잡합 20 | 21 | Returns: 22 | common.DictSet: protobuf DictSet 메시지 23 | """ 24 | ret = common.DictSet() 25 | ret.name = domain + "-" + name 26 | ret.type = common.DictType.WORD_LIST 27 | for v in dict_set: 28 | ret.items[v] = 1 29 | return ret 30 | 31 | 32 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 33 | 34 | 35 | class CustomDictionaryServiceClient: 36 | """ 37 | 커스텀 사전을 생성, 조회, 업데이트, 삭제하는 클라이언트 38 | 39 | The custom dictionary client which can create, update, list, delete your own one. 40 | """ 41 | 42 | def __init__(self, channel: grpc.Channel, apikey:str): 43 | """사용자 사전을 관리하는 클라이언트 객체 생성자 44 | 45 | Args: 46 | remote (grpc.Channel): 미리 만들어 놓은 channel 객체 47 | """ 48 | super().__init__() 49 | self.channel = channel 50 | self.apikey = apikey 51 | self.metadata=( 52 | ('api-key', self.apikey), 53 | ) 54 | 55 | self.stub = cds.CustomDictionaryServiceStub(self.channel) 56 | 57 | 58 | def get_list(self) -> List[pb.CustomDictionaryMeta]: 59 | """사전 목록을 가져옵니다. 60 | 61 | Raises: 62 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 63 | 64 | Returns: 65 | List[pb.CustomDictionaryMeta]: 사전에 대한 정보들을 목록을 배열합니다. 66 | """ 67 | req = Empty() 68 | try: 69 | res, c = self.stub.GetCustomDictionaryList.with_call( 70 | request=req, metadata=self.metadata) 71 | return res.domain_dicts 72 | except grpc.RpcError as e: 73 | raise e 74 | 75 | 76 | def get(self, domain: str) -> pb.CustomDictionary: 77 | """ 78 | 정의된 사용사 사전의 내용 전체를 가져온다. 79 | 80 | Args: 81 | domain (str): 사용자 사전이 이름 82 | 83 | Raises: 84 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 85 | 86 | Returns: 87 | pb.CustomDictionary: 사용자 사전 데이터 전체를 담고 있는 protobuf 메시지 88 | """ 89 | req = pb.GetCustomDictionaryRequest() 90 | req.domain_name = domain 91 | try: 92 | res, c = self.stub.GetCustomDictionary.with_call( 93 | request=req, metadata=self.metadata) 94 | return res.dict 95 | except grpc.RpcError as e: 96 | raise e 97 | 98 | 99 | def update(self, domain: str, np: set, cp: set, cp_caret: set, vv: set, va: set) -> bool: 100 | """ 사용자 사전을 갱신합니다. 101 | 102 | Args: 103 | domain (str): 사용자 사전의 이름 104 | np (set): 고유명사 단어 집합 105 | cp (set): 복합명사 단어 집합 106 | cp_caret (set): 복합명사 분리 단어 집합 107 | vv (set): 동사 단어 집합 108 | va (set): 형용사 단어 집합 109 | 110 | Raises: 111 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 112 | 113 | Returns: 114 | bool: 정상적으로 갱신되면 참을 돌려줍니다. 
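
        Example:
            A brief usage sketch; the domain name and word sets below are illustrative
            only, and `channel` is an existing grpc.Channel to your bareun server.

            >>> client = CustomDictionaryServiceClient(channel, "koba-YOURKEY")
            >>> client.update("my_domain", {"내고유명사"}, {"코로나19"}, {"코로나^백신"}, {"카톡하"}, {"드라마틱하"})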
115 | """ 116 | 117 | req = pb.UpdateCustomDictionaryRequest() 118 | req.domain_name = domain 119 | 120 | req.dict.domain_name = domain 121 | 122 | req.dict.np_set.CopyFrom(build_dict_set(domain, 'np-set', np)) 123 | req.dict.cp_set.CopyFrom(build_dict_set(domain, 'cp-set', cp)) 124 | req.dict.vv_set.CopyFrom(build_dict_set(domain, 'vv-set', vv)) 125 | req.dict.va_set.CopyFrom(build_dict_set(domain, 'va-set', va)) 126 | req.dict.cp_caret_set.CopyFrom( 127 | build_dict_set(domain, 'cp-caret-set', cp_caret)) 128 | 129 | try: 130 | res, c = self.stub.UpdateCustomDictionary.with_call( 131 | request=req, metadata=self.metadata) 132 | return res.updated_domain_name == domain 133 | except grpc.RpcError as e: 134 | raise e 135 | 136 | 137 | """ 138 | :return: 삭제된 도메인의 이름들 139 | """ 140 | def remove_all(self) -> List[str]: 141 | """ 142 | 모든 커스텀 사전을 삭제한 다음 삭제한 사전의 이름을 돌려줍니다. 143 | 144 | Raises: 145 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 146 | 147 | Returns: 148 | List[str]: 삭제한 사전의 이름 149 | """ 150 | req = pb.RemoveCustomDictionariesRequest() 151 | req.all = True 152 | 153 | try: 154 | res, c = self.stub.RemoveCustomDictionaries.with_call( 155 | request=req, metadata=self.metadata) 156 | return res.deleted_domain_names.keys() 157 | except grpc.RpcError as e: 158 | raise e 159 | 160 | """ 161 | 지정한 도메인의 커스텀 사전을 삭제한다. 162 | :param domains: 163 | :return: 164 | """ 165 | def remove(self, domains: List[str]) -> List[str]: 166 | """ 지정한 도메인의 사용지 사전을 삭제한 다음 삭제한 사전의 목록을 반환합니다. 167 | 168 | Args: 169 | domains (List[str]): 삭제할 커스텀 사전의 이름들 170 | 171 | Raises: 172 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 173 | 174 | Returns: 175 | List[str]: 정상 삭제된 도메인의 이름 목록을 돌려줍니다. 176 | """ 177 | req = pb.RemoveCustomDictionariesRequest() 178 | req.domain_names.extend(domains) 179 | req.all = False 180 | try: 181 | res, c = self.stub.RemoveCustomDictionaries.with_call( 182 | request=req, metadata=self.metadata) 183 | return res.deleted_domain_names.keys() 184 | except grpc.RpcError as e: 185 | raise e 186 | -------------------------------------------------------------------------------- /bareunpy/_lang_service_client.py: -------------------------------------------------------------------------------- 1 | import grpc 2 | from typing import List 3 | 4 | import bareunpy 5 | import bareun.language_service_pb2 as pb 6 | import bareun.language_service_pb2_grpc as ls 7 | import bareun.lang_common_pb2 as lpb 8 | 9 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 10 | 11 | 12 | class BareunLanguageServiceClient: 13 | """ 14 | 형태소 분석을 처리하는 클라이언트 15 | """ 16 | 17 | def __init__(self, channel:grpc.Channel, apikey:str, host:str, port:int): 18 | """ 19 | 클라이언트 생성자 20 | 21 | Args: 22 | channel (grpc.Channel): 원격 채널 정보 23 | """ 24 | self.channel = channel 25 | self.apikey = apikey 26 | self.metadata=( 27 | ('api-key', self.apikey), 28 | ('user-agent', f'bareunpy/{bareunpy.version}') 29 | ) 30 | self.host = host 31 | self.port = port 32 | self.stub = ls.LanguageServiceStub(self.channel) 33 | 34 | def _handle_grpc_error(self, e: grpc.RpcError): 35 | """gRPC 에러를 처리하는 메서드""" 36 | details = getattr(e, "details", lambda: None)() 37 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 38 | 39 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 40 | if code == grpc.StatusCode.PERMISSION_DENIED: 41 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 42 | elif code == grpc.StatusCode.UNAVAILABLE: 43 | message = f'\n서버에 연결할 수 없습니다. 
입력한 서버주소 [{self.host}:{self.port}]가 정확한지 확인해 주세요.\n서버 메시지: {server_message}' 44 | elif code == grpc.StatusCode.INVALID_ARGUMENT: 45 | message = f'\n잘못된 요청이 서버로 전송되었습니다. 입력 데이터를 확인하세요.\n서버 메시지: {server_message}' 46 | else: 47 | message = f'알 수 없는 오류가 발생했습니다.\n서버 메시지: {server_message}' 48 | raise e 49 | raise Exception(message) from e 50 | 51 | def analyze_syntax(self, content: str, 52 | custom_dicts: List[str] = [], 53 | auto_split=False, 54 | auto_spacing=True, 55 | auto_jointing=True) -> pb.AnalyzeSyntaxResponse: 56 | """ 57 | 형태소 분석을 수행합니다. 58 | 59 | Args: 60 | content (str): 형태소 분석할 원문, 여러 문장일 경우에 개행문자로 줄바꿈을 하면 됩니다. 61 | domain (str, optional): 사용사 사전의 이름. 기본값은 "". 62 | auto_split (bool, optional): 문장 자동 분리 여부, 기본값은 사용하지 않음. 63 | auto_spacing (bool, optional): 띄어쓰기 보정 기능, 기본값은 사용하도록 함. 64 | auto_jointing (bool, optional): 붙여쓰기 보정 기능, 기본값은 사용하지 않음. 65 | 66 | Raises: 67 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 68 | 69 | Returns: 70 | pb.AnalyzeSyntaxResponse: 형태소 분석 결과 71 | """ 72 | req = pb.AnalyzeSyntaxRequest() 73 | # req.document = pb.Document() 74 | req.document.content = content 75 | req.document.language = "ko_KR" 76 | req.encoding_type = lpb.EncodingType.UTF32 77 | req.auto_split_sentence = auto_split 78 | req.auto_spacing = auto_spacing 79 | req.auto_jointing = auto_jointing 80 | req.custom_dict_names.extend(custom_dicts) 81 | 82 | try: 83 | res, c = self.stub.AnalyzeSyntax.with_call( 84 | request=req, metadata=self.metadata) 85 | return res 86 | except grpc.RpcError as e: 87 | self._handle_grpc_error(e) 88 | except Exception as e2: 89 | import traceback 90 | traceback.print_exc() 91 | raise e2 92 | 93 | def analyze_syntax_list(self, content: List[str], 94 | custom_dicts: List[str] = [], 95 | auto_spacing=True, 96 | auto_jointing=True) -> pb.AnalyzeSyntaxListResponse: 97 | """ 98 | 형태소 분석을 수행하되, 입력된 문장 단위가 일치하도록 반환됩니다. 99 | 문장 분할 기능을 사용하지 않습니다. 100 | 101 | Args: 102 | content (List[str]): 형태소 분석할 원문의 리스트 103 | domain (str, optional): 사용사 사전의 이름. 기본값은 "". 104 | auto_spacing (bool, optional): 띄어쓰기 보정 기능, 기본값은 사용하도록 함. 105 | auto_jointing (bool, optional): 붙여쓰기 보정 기능, 기본값은 사용하지 않음. 106 | 107 | Raises: 108 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 109 | 110 | Returns: 111 | pb.AnalyzeSyntaxListResponse: 형태소 분석 결과 112 | """ 113 | req = pb.AnalyzeSyntaxListRequest() 114 | req.sentences.extend(content) 115 | req.language = "ko_KR" 116 | req.encoding_type = lpb.EncodingType.UTF32 117 | req.auto_spacing = auto_spacing 118 | req.auto_jointing = auto_jointing 119 | req.custom_dict_names.extend(custom_dicts) 120 | 121 | try: 122 | res, c = self.stub.AnalyzeSyntaxList.with_call( 123 | request=req, metadata=self.metadata) 124 | return res 125 | except grpc.RpcError as e: 126 | self._handle_grpc_error(e) 127 | except Exception as e2: 128 | import traceback 129 | traceback.print_exc() 130 | raise e2 131 | 132 | 133 | def tokenize(self, content: str, auto_split=False) -> pb.TokenizeResponse: 134 | """ 135 | 형태소 분석을 수행합니다. 136 | 137 | Args: 138 | content (str): 형태소 분석할 원문, 여러 문장일 경우에 개행문자로 줄바꿈을 하면 됩니다. 139 | domain (str, optional): 사용사 사전의 이름. 기본값은 "". 140 | auto_split (bool, optional): 문장 자동 분리 여부, 기본값은 사용하지 않음. 141 | 142 | Raises: 143 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 
144 | 145 | Returns: 146 | pb.AnalyzeSyntaxResponse: 형태소 분석 결과 147 | """ 148 | req = pb.TokenizeRequest() 149 | # req.document = pb.Document() 150 | req.document.content = content 151 | req.document.language = "ko_KR" 152 | req.encoding_type = lpb.EncodingType.UTF32 153 | req.auto_split_sentence = auto_split 154 | try: 155 | res, c = self.stub.Tokenize.with_call( 156 | request=req, metadata=self.metadata) 157 | return res 158 | except grpc.RpcError as e: 159 | self._handle_grpc_error(e) 160 | except Exception as e2: 161 | import traceback 162 | traceback.print_exc() 163 | raise e2 164 | -------------------------------------------------------------------------------- /bareunpy/_revision_service_client.py: -------------------------------------------------------------------------------- 1 | import grpc 2 | import bareun.revision_service_pb2 as pb 3 | import bareun.revision_service_pb2_grpc as rs_grpc 4 | 5 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 6 | 7 | 8 | class BareunRevisionServiceClient: 9 | """ 10 | 맞춤법 검사를 처리하는 클라이언트 11 | """ 12 | 13 | def __init__(self, channel, apikey: str, host: str, port: int): 14 | """ 15 | RevisionServiceClient 초기화 16 | 17 | Args: 18 | apikey (str): API 키 19 | host (str): gRPC 서버 주소 20 | port (int): gRPC 서버 포트 21 | """ 22 | self.channel = channel 23 | self.apikey = apikey 24 | self.host = host 25 | self.port = port 26 | self.metadata = [ 27 | ('api-key', self.apikey), 28 | ('user-agent', 'bareun-revision-client'), 29 | ] 30 | 31 | 32 | self.stub = rs_grpc.RevisionServiceStub(self.channel) 33 | 34 | def _handle_grpc_error(self, e: grpc.RpcError): 35 | """gRPC 에러를 처리하는 메서드""" 36 | details = getattr(e, "details", lambda: None)() 37 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 38 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 39 | if code == grpc.StatusCode.PERMISSION_DENIED: 40 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 41 | elif code == grpc.StatusCode.UNAVAILABLE: 42 | message = f'\n서버에 연결할 수 없습니다. 입력한 서버주소 [{self.host}:{self.port}]를 확인하세요.\n서버 메시지: {server_message}' 43 | elif code == grpc.StatusCode.INVALID_ARGUMENT: 44 | message = f'\n잘못된 요청이 서버로 전송되었습니다. 입력 데이터를 확인하세요.\n서버 메시지: {server_message}' 45 | else: 46 | message = f'알 수 없는 오류가 발생했습니다.\n서버 메시지: {server_message}' 47 | raise e 48 | raise Exception(message) from e 49 | 50 | def correct_error(self, request: pb.CorrectErrorRequest) -> pb.CorrectErrorResponse: 51 | """ 52 | 맞춤법 교정을 위한 gRPC 호출 53 | 54 | Args: 55 | request (pb.CorrectErrorRequest): gRPC 요청 메시지 56 | 57 | Returns: 58 | pb.CorrectErrorResponse: gRPC 응답 메시지 59 | """ 60 | try: 61 | response, call = self.stub.CorrectError.with_call( 62 | request=request, metadata=self.metadata 63 | ) 64 | return response 65 | except grpc.RpcError as e: 66 | self._handle_grpc_error(e) 67 | -------------------------------------------------------------------------------- /bareunpy/_tagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from sys import stdout 4 | from typing import IO, List, Any, Union 5 | 6 | from google.protobuf.json_format import MessageToDict 7 | import grpc 8 | from bareunpy._custom_dict import CustomDict 9 | from bareunpy._lang_service_client import BareunLanguageServiceClient, MAX_MESSAGE_LENGTH 10 | from bareun.language_service_pb2 import AnalyzeSyntaxResponse, AnalyzeSyntaxListResponse, Morpheme, Sentence, Token 11 | 12 | 13 | class Tagged: 14 | """ 15 | Tagged result. 
16 | It has various output manipulations. 17 | """ 18 | 19 | def __init__(self, phrase: Union[str, List[str]], res: Union[AnalyzeSyntaxResponse, AnalyzeSyntaxListResponse]): 20 | """ 21 | constructor, which is used internally. 22 | :param phrase: requested sentences. 23 | :param res: 24 | """ 25 | super().__init__() 26 | self.phrase = phrase 27 | self.r = res 28 | 29 | # 빈 응답이 있는 경우를 대비해서 값이 없지 않도록 처리한다. 30 | if self.r is None: 31 | self.r = AnalyzeSyntaxResponse() 32 | self.phrase = '' 33 | 34 | def msg(self) -> Union[AnalyzeSyntaxResponse, AnalyzeSyntaxListResponse]: 35 | """ 36 | Protobuf message object containing all of NLP engine. 37 | """ 38 | return self.r 39 | 40 | def sentences(self) -> List[Sentence]: 41 | """ 42 | :return: get sentences from tagged results. 43 | """ 44 | ret = list() 45 | for s in self.r.sentences: 46 | ret.append(s) 47 | return ret 48 | 49 | def as_json(self): 50 | """ 51 | convert the message to a json object. 52 | :return: Json Obejct 53 | """ 54 | return MessageToDict(self.r, True) 55 | 56 | def as_json_str(self) -> str: 57 | """ 58 | a json string representing analyzed sentences. 59 | :return: json string 60 | """ 61 | d = MessageToDict(self.r, True) 62 | return json.dumps(d, ensure_ascii=False, indent=2) 63 | 64 | def print_as_json(self, out: IO = stdout): 65 | """ 66 | print the analysis result 67 | :param out: File, if nothing provided, sys.stdout is used. 68 | :return: None 69 | """ 70 | d = MessageToDict(self.r, True) 71 | json.dump(d, out, ensure_ascii=False, indent=2) 72 | 73 | @staticmethod 74 | def _pos(m: Morpheme, join: bool, detail: bool): 75 | if join: 76 | if detail: 77 | p = f':{m.probability:5.3f}' if m.probability > 0 else '' 78 | oov = f'#{Morpheme.OutOfVocab.Name(m.out_of_vocab)}' if m.out_of_vocab != 0 else '' 79 | return f'{m.text.content}/{Morpheme.Tag.Name(m.tag)}{p}{oov}' 80 | else: 81 | return f'{m.text.content}/{Morpheme.Tag.Name(m.tag)}' 82 | else: 83 | if detail: 84 | return m.text.content,\ 85 | Morpheme.Tag.Name(m.tag),\ 86 | Morpheme.OutOfVocab.Name(m.out_of_vocab),\ 87 | m.probability 88 | else: 89 | return m.text.content, Morpheme.Tag.Name(m.tag) 90 | 91 | def pos(self, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 92 | """ 93 | POS tagger to tuple. 94 | :param flatten : If False, returns original morphs. 95 | :param join : If True, returns joined sets of morph and tag. 96 | :param detail : if True, returns everything of morph result 97 | """ 98 | if flatten: 99 | return [Tagged._pos(m, join, detail) for s in self.r.sentences 100 | for token in s.tokens 101 | for m in token.morphemes] 102 | else: 103 | return [[Tagged._pos(m, join, detail) for m in token.morphemes] 104 | for s in self.r.sentences 105 | for token in s.tokens] 106 | 107 | def morphs(self) -> List: 108 | """Parse phrase to morphemes.""" 109 | return [m.text.content for s in self.r.sentences 110 | for token in s.tokens 111 | for m in token.morphemes] 112 | 113 | def nouns(self) -> List: 114 | """Noun extractor.""" 115 | return [m.text.content for s in self.r.sentences 116 | for token in s.tokens 117 | for m in token.morphemes 118 | if m.tag in {Morpheme.Tag.NNP, Morpheme.Tag.NNG, Morpheme.Tag.NP, Morpheme.Tag.NNB}] 119 | 120 | def verbs(self) -> List: 121 | """Noun extractor.""" 122 | return [m.text.content for s in self.r.sentences 123 | for token in s.tokens 124 | for m in token.morphemes 125 | if m.tag in {Morpheme.Tag.VV}] 126 | 127 | 128 | class Tagger: 129 | """Wrapper for `bareun v1.7.x `_. 
130 | 'bareun' is a morphological analyzer developed by Baikal AI, Inc. and Korea Press Foundation. 131 | 132 | .. code-block:: python 133 | :emphasize-lines: 1 134 | >>> import bareunpy as brn 135 | >>> tagger = brn.Tagger(apikey="kpba-YOURKEY", custom_dicts=["custom", "my"]) 136 | >>> print(tagger.morphs('안녕하세요, 반가워요.')) 137 | ['안녕', '하', '시', '어요', ',', '반갑', '어요', '.'] 138 | >>> print(tagger.nouns('나비 허리에 새파란 초생달이 시리다.')) 139 | ['나비', '허리', '초생달'] 140 | >>> print(tagger.pos('햇빛이 선명하게 나뭇잎을 핥고 있었다.')) 141 | [('햇빛', 'NNG'), ('이', 'JKS'), ('선명', 'NNG'), ('하', 'XSA'), ('게', 'EC'), ('나뭇잎', 'NNG'), 142 | ('을', 'JKO'), ('핥', 'VV'), ('고', 'EC'), ('있', 'VX'), ('었', 'EP'), ('다', 'EF'), ('.', 'SF')] 143 | :param host : str. host name for bareun server 144 | :param port : int. port for bareun server 145 | :param custom_dicts : List[str]. custom dictionary names for analyzing request 146 | """ 147 | 148 | def __init__(self, apikey:str, host: str = "", port: int = 5656, custom_dicts: List[str] = []): 149 | 150 | if host: 151 | host = host.strip() 152 | 153 | if host == "" or host is None: 154 | self.host = 'nlp.bareun.ai' 155 | else: 156 | self.host = host 157 | 158 | if port is not None: 159 | self.port = port 160 | else: 161 | self.port = 5656 162 | 163 | self.channel = grpc.insecure_channel( 164 | f"{self.host}:{self.port}", 165 | options=[ 166 | ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 167 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), 168 | ]) 169 | self.apikey = apikey 170 | 171 | if apikey == None or len(apikey) == 0: 172 | raise ValueError("an apikey must be provided!") 173 | 174 | self.client = BareunLanguageServiceClient(self.channel, apikey, self.host, self.port) 175 | 176 | self.custom_dicts = custom_dicts 177 | self.internal_custom_dicts = {} 178 | 179 | def _handle_grpc_error(self, e: grpc.RpcError): 180 | """gRPC 에러를 처리하는 메서드""" 181 | details = getattr(e, "details", lambda: None)() 182 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 183 | 184 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 185 | if code == grpc.StatusCode.PERMISSION_DENIED: 186 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 187 | elif code == grpc.StatusCode.UNAVAILABLE: 188 | message = f'\n서버에 연결할 수 없습니다. 입력한 서버주소 [{self.host}:{self.port}]가 정확한지 확인해 주세요.\n서버 메시지: {server_message}' 189 | else: 190 | raise e 191 | raise Exception(message) from e 192 | 193 | @DeprecationWarning 194 | def set_domain(self, domain: str): 195 | """ 196 | Set domain of custom dict. 197 | :param domain: domain name of custom dict 198 | """ 199 | if len(self.custom_dicts) == 0: 200 | self.custom_dicts = [] 201 | self.custom_dicts.append(domain) 202 | 203 | def set_custom_dicts(self, custom_dicts: List[str]): 204 | """ 205 | Set domain of custom dict. 
206 | :param domain: domain name of custom dict 207 | """ 208 | if len(custom_dicts) > 0: 209 | self.custom_dicts = custom_dicts 210 | else: 211 | self.custom_dicts = [] 212 | 213 | def custom_dict(self, name: str) -> CustomDict: 214 | # self.domain = domain 215 | if name == "" or name is None: 216 | raise ValueError("invalid name for custom dict") 217 | 218 | if name in self.internal_custom_dicts: 219 | return self.internal_custom_dicts[name] 220 | else: 221 | self.internal_custom_dicts[name] = CustomDict(self.apikey, name, self.channel) 222 | return self.internal_custom_dicts[name] 223 | 224 | def tag(self, phrase: str, auto_split: bool = False, auto_spacing: bool = True, auto_jointing: bool = True) -> Tagged: 225 | if len(phrase) == 0: 226 | print("OOPS, no sentences.") 227 | return Tagged('', AnalyzeSyntaxResponse()) 228 | try: 229 | res = self.client.analyze_syntax(phrase, self.custom_dicts, auto_split=auto_split, auto_spacing=auto_spacing, auto_jointing=auto_jointing) 230 | return Tagged(phrase, res) 231 | except grpc.RpcError as e: 232 | self._handle_grpc_error(e) 233 | 234 | def tags(self, phrase: List[str], auto_split: bool = False, auto_spacing: bool = True, auto_jointing: bool = True) -> Tagged: 235 | """ 236 | tag string array. 237 | :param phrase: array of string 238 | :param auto_split(bool, optional): Whether to automatically perform sentence split 239 | :param auto_spacing(bool, optional): Whether to automatically perform space insertion for typo correction 240 | :param auto_jointing(bool, optional): Whether to automatically perform word joining for typo correction 241 | :return: Tagged result instance 242 | """ 243 | if len(phrase) == 0: 244 | print("OOPS, no sentences.") 245 | return Tagged('', AnalyzeSyntaxResponse()) 246 | p = '\n'.join(phrase) 247 | try: 248 | res = self.client.analyze_syntax(p, self.custom_dicts, auto_split=auto_split, auto_spacing=auto_spacing, auto_jointing=auto_jointing) 249 | return Tagged(p, res) 250 | except grpc.RpcError as e: 251 | self._handle_grpc_error(e) 252 | 253 | def taglist(self, phrase: List[str], auto_spacing: bool = True, auto_jointing: bool = True) -> Tagged: 254 | """ 255 | the array is not being split and the input value is being returned as-is. 256 | :param phrase: array of string 257 | :param auto_split(bool, optional): Whether to automatically perform sentence split 258 | :param auto_spacing(bool, optional): Whether to automatically perform space insertion for typo correction 259 | :param auto_jointing(bool, optional): Whether to automatically perform word joining for typo correction 260 | :return: Tagged result instance 261 | """ 262 | if len(phrase) == 0: 263 | print("OOPS, no sentences.") 264 | return Tagged('', AnalyzeSyntaxListResponse()) 265 | try: 266 | res = self.client.analyze_syntax_list(phrase, self.custom_dicts, auto_spacing=auto_spacing, auto_jointing=auto_jointing) 267 | return Tagged(phrase, res) 268 | except grpc.RpcError as e: 269 | self._handle_grpc_error(e) 270 | 271 | def pos(self, phrase: str, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 272 | """ 273 | POS tagger. 274 | :param phrase : string to analyse 275 | :param flatten : If False, returns original morphs. 276 | :param join : If True, returns joined sets of morph and tag. 
277 | :param detail : if True, returns every things of morph result 278 | """ 279 | return self.tag(phrase).pos(flatten, join, detail) 280 | 281 | def morphs(self, phrase: str) -> List: 282 | """Parse phrase to morphemes.""" 283 | return self.tag(phrase).morphs() 284 | 285 | def nouns(self, phrase: str) -> List: 286 | """Noun extractor.""" 287 | return self.tag(phrase).nouns() 288 | 289 | def verbs(self, phrase: str) -> List: 290 | """Verbs extractor.""" 291 | return self.tag(phrase).verbs() 292 | -------------------------------------------------------------------------------- /bareunpy/_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from sys import stdout 4 | from typing import IO, List, Any 5 | 6 | from google.protobuf.json_format import MessageToDict 7 | import grpc 8 | from bareunpy._lang_service_client import BareunLanguageServiceClient, MAX_MESSAGE_LENGTH 9 | from bareun.language_service_pb2 import TokenizeResponse, Segment, SegmentSentence, SegmentToken 10 | 11 | 12 | class Tokenized: 13 | """ 14 | Tokenized result. 15 | It has various output manipulations. 16 | """ 17 | 18 | def __init__(self, phrase: str, res: TokenizeResponse): 19 | """ 20 | constructor, which is used internally. 21 | :param phrase: requested sentences. 22 | :param res: 23 | """ 24 | super().__init__() 25 | self.phrase = phrase 26 | self.r = res 27 | 28 | # 빈 응답이 있는 경우를 대비해서 값이 없지 않도록 처리한다. 29 | if self.r is None: 30 | self.r = TokenizeResponse() 31 | self.phrase = '' 32 | 33 | def msg(self) -> TokenizeResponse: 34 | """ 35 | Protobuf message object containing all of NLP engine. 36 | """ 37 | return self.r 38 | 39 | def sentences(self) -> List[SegmentSentence]: 40 | """ 41 | :return: get sentences from tagged results. 42 | """ 43 | ret = list() 44 | for s in self.r.sentences: 45 | ret.append(s) 46 | return ret 47 | 48 | def as_json(self): 49 | """ 50 | convert the message to a json object. 51 | :return: Json Obejct 52 | """ 53 | return MessageToDict(self.r, True) 54 | 55 | def as_json_str(self) -> str: 56 | """ 57 | a json string representing analyzed sentences. 58 | :return: json string 59 | """ 60 | d = MessageToDict(self.r, True) 61 | return json.dumps(d, ensure_ascii=False, indent=2) 62 | 63 | def print_as_json(self, out: IO = stdout): 64 | """ 65 | print the analysis result 66 | :param out: File, if nothing provided, sys.stdout is used. 67 | :return: None 68 | """ 69 | d = MessageToDict(self.r, True) 70 | json.dump(d, out, ensure_ascii=False, indent=2) 71 | 72 | @staticmethod 73 | def _segment(m: Segment, join: bool, detail: bool): 74 | if join: 75 | if detail: 76 | return f'{m.text.content}/{m.hint}' 77 | else: 78 | return f'{m.text.content}' 79 | else: 80 | if detail: 81 | return m.text.content, m.hint 82 | else: 83 | return m.text.content 84 | 85 | def seg(self, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 86 | """ 87 | 분절의 결과를 튜플 형태로 반환한다. 88 | :param flatten : If False, returns original morphs. 89 | :param join : If True, returns joined sets of morph and tag. 
90 | :param detail : if True, returns everything of morph result 91 | """ 92 | if flatten: 93 | return [Tokenized._segment(m, join, detail) for s in self.r.sentences 94 | for token in s.tokens 95 | for m in token.segments] 96 | else: 97 | return [[Tokenized._segment(m, join, detail) for m in token.segments] 98 | for s in self.r.sentences 99 | for token in s.tokens] 100 | 101 | def segments(self) -> List: 102 | """문장의 모든 segment들을 반환한다. """ 103 | return [m.text.content for s in self.r.sentences 104 | for token in s.tokens 105 | for m in token.segments] 106 | 107 | def nouns(self) -> List: 108 | """체언을 추출한다.""" 109 | return [m.text.content for s in self.r.sentences 110 | for token in s.tokens 111 | for m in token.segments 112 | if m.hint == 'N'] 113 | 114 | def verbs(self) -> List: 115 | """동사 또는 형용사, 즉, 용언을 추출한다.""" 116 | return [m.text.content for s in self.r.sentences 117 | for token in s.tokens 118 | for m in token.segments 119 | if m.hint == 'V'] 120 | 121 | def predicates(self) -> List: 122 | """용언을 추출한다.""" 123 | return [m.text.content for s in self.r.sentences 124 | for token in s.tokens 125 | for m in token.segments 126 | if m.hint == 'V'] 127 | 128 | def substantives(self) -> List: 129 | """체언을 추출한다.""" 130 | return [m.text.content for s in self.r.sentences 131 | for token in s.tokens 132 | for m in token.segments 133 | if m.hint == 'N'] 134 | 135 | def symbols(self) -> List: 136 | """기호를 추출한다.""" 137 | return [m.text.content for s in self.r.sentences 138 | for token in s.tokens 139 | for m in token.segments 140 | if m.hint == 'S'] 141 | 142 | def adverbs(self) -> List: 143 | """부사를 추출한다..""" 144 | return [m.text.content for s in self.r.sentences 145 | for token in s.tokens 146 | for m in token.segments 147 | if m.hint == 'A'] 148 | 149 | def prenouns(self) -> List: 150 | """관형사를 추출한다.""" 151 | return [m.text.content for s in self.r.sentences 152 | for token in s.tokens 153 | for m in token.segments 154 | if m.hint == 'M'] 155 | 156 | def postpositions(self) -> List: 157 | """감탄사를 추출한다.""" 158 | return [m.text.content for s in self.r.sentences 159 | for token in s.tokens 160 | for m in token.segments 161 | if m.hint == 'J'] 162 | 163 | def interjections(self) -> List: 164 | """감탄사를 추출한다.""" 165 | return [m.text.content for s in self.r.sentences 166 | for token in s.tokens 167 | for m in token.segments 168 | if m.hint == 'I'] 169 | 170 | def endings(self) -> List: 171 | """어미를 반환한다.""" 172 | return [m.text.content for s in self.r.sentences 173 | for token in s.tokens 174 | for m in token.segments 175 | if m.hint == 'E'] 176 | 177 | class Tokenizer: 178 | """Wrapper for `bareun v1.7.x `_. 179 | 'bareun' is a morphological analyzer developed by Baikal AI, Inc. and Korea Press Foundation. 180 | 181 | .. code-block:: python 182 | :emphasize-lines: 1 183 | >>> import bareunpy as brn 184 | >>> tokenizer = brn.Tokenizer() 185 | >>> print(tokenizer.segments('안녕하세요, 반가워요.')) 186 | ['안녕', '하', '시', '어요', ',', '반갑', '어요', '.'] 187 | >>> print(tokenizer.nouns('나비 허리에 새파란 초생달이 시리다.')) 188 | ['나비', '허리', '초생달'] 189 | >>> print(tokenizer.seg('햇빛이 선명하게 나뭇잎을 핥고 있었다.')) 190 | [('햇빛', 'NNG'), ('이', 'JKS'), ('선명', 'NNG'), ('하', 'XSA'), ('게', 'EC'), ('나뭇잎', 'NNG'), 191 | ('을', 'JKO'), ('핥', 'VV'), ('고', 'EC'), ('있', 'VX'), ('었', 'EP'), ('다', 'EF'), ('.', 'SF')] 192 | :param host : str. host name for bareun server 193 | :param port : int. 
port for bareun server 194 | """ 195 | 196 | def __init__(self, apikey:str, host: str = "", port: int = 5656): 197 | 198 | if host: 199 | host = host.strip() 200 | if host == "" or host is None: 201 | self.host = 'nlp.bareun.ai' 202 | else: 203 | self.host = host 204 | 205 | if port is not None: 206 | self.port = port 207 | else: 208 | self.port = 5656 209 | 210 | if apikey == None or len(apikey) == 0: 211 | raise ValueError("a apikey must be provided!") 212 | self.apikey = apikey 213 | self.channel = grpc.insecure_channel( 214 | f"{self.host}:{self.port}", 215 | options=[ 216 | ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 217 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), 218 | ]) 219 | self.client = BareunLanguageServiceClient(self.channel, apikey, host, port) 220 | 221 | def _handle_grpc_error(self, e: grpc.RpcError): 222 | """gRPC 에러를 처리하는 메서드""" 223 | details = getattr(e, "details", lambda: None)() 224 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 225 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 226 | if code == grpc.StatusCode.PERMISSION_DENIED: 227 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 228 | elif code == grpc.StatusCode.UNAVAILABLE: 229 | message = f'\n서버에 연결할 수 없습니다. 입력한 서버주소 [{self.host}:{self.port}]가 정확한지 확인해 주세요.\n서버 메시지: {server_message}' 230 | else: 231 | raise e 232 | raise Exception(message) from e 233 | 234 | def tokenize(self, phrase: str, auto_split: bool = False) -> Tokenized: 235 | if len(phrase) == 0: 236 | print("OOPS, no sentences.") 237 | return Tokenized('', TokenizeResponse()) 238 | try: 239 | res = Tokenized(phrase, 240 | self.client.tokenize(phrase, auto_split)) 241 | return res 242 | except grpc.RpcError as e: 243 | self._handle_grpc_error(e) 244 | 245 | def tokenize_list(self, phrase: List[str]) -> Tokenized: 246 | """ 247 | tag string array. 248 | :param phrase: array of string 249 | :return: Tagged result instance 250 | """ 251 | if len(phrase) == 0: 252 | print("OOPS, no sentences.") 253 | return Tokenized('', TokenizeResponse()) 254 | p = '\n'.join(phrase) 255 | try: 256 | res = Tokenized(p, 257 | self.client.tokenize(p, auto_split=False)) 258 | return res 259 | except grpc.RpcError as e: 260 | self._handle_grpc_error(e) 261 | 262 | def seg(self, phrase: str, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 263 | """ 264 | 분절 하기, 265 | :param phrase : string to analyse 266 | :param flatten : If False, returns original morphs. 267 | :param join : If True, returns joined sets of morph and tag. 268 | :param detail : if True, returns every things of morph result 269 | """ 270 | return self.tokenize(phrase).seg(flatten, join, detail) 271 | 272 | def segments(self, phrase: str) -> List: 273 | """문장을 분절하여 어절 내부의 기본 단위로 만들어 낸다.""" 274 | return self.tokenize(phrase).segments() 275 | 276 | def nouns(self, phrase: str) -> List: 277 | """문장을 분절하여 어절 내부의 기본 단위로 만들어 내고 체언을 뽑아낸다.""" 278 | return self.tokenize(phrase).nouns() 279 | 280 | def verbs(self, phrase: str) -> List: 281 | """문장을 분절하여 어절 내부의 기본 단위로 만들어 내고 용언을 뽑아낸다.""" 282 | return self.tokenize(phrase).verbs() 283 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 
5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "20.3.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"] 19 | docs = ["furo", "sphinx", "zope.interface"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.4" 26 | description = "Cross-platform colored terminal text." 27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [[package]] 32 | name = "grpcio" 33 | version = "1.35.0" 34 | description = "HTTP/2-based RPC framework" 35 | category = "main" 36 | optional = false 37 | python-versions = "*" 38 | 39 | [package.dependencies] 40 | six = ">=1.5.2" 41 | 42 | [package.extras] 43 | protobuf = ["grpcio-tools (>=1.35.0)"] 44 | 45 | [[package]] 46 | name = "importlib-metadata" 47 | version = "3.4.0" 48 | description = "Read metadata from Python packages" 49 | category = "dev" 50 | optional = false 51 | python-versions = ">=3.6" 52 | 53 | [package.dependencies] 54 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 55 | zipp = ">=0.5" 56 | 57 | [package.extras] 58 | docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] 59 | testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] 60 | 61 | [[package]] 62 | name = "iniconfig" 63 | version = "1.1.1" 64 | description = "iniconfig: brain-dead simple config-ini parsing" 65 | category = "dev" 66 | optional = false 67 | python-versions = "*" 68 | 69 | [[package]] 70 | name = "packaging" 71 | version = "20.9" 72 | description = "Core utilities for Python packages" 73 | category = "dev" 74 | optional = false 75 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 76 | 77 | [package.dependencies] 78 | pyparsing = ">=2.0.2" 79 | 80 | [[package]] 81 | name = "pluggy" 82 | version = "0.13.1" 83 | description = "plugin and hook calling mechanisms for python" 84 | category = "dev" 85 | optional = false 86 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 87 | 88 | [package.dependencies] 89 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 90 | 91 | [package.extras] 92 | dev = ["pre-commit", "tox"] 93 | 94 | [[package]] 95 | name = "protobuf" 96 | version = "3.14.0" 97 | description = "Protocol Buffers" 98 | category = "main" 99 | optional = false 100 | python-versions = "*" 101 | 102 | [package.dependencies] 103 | six = ">=1.9" 104 | 105 | [[package]] 106 | name = "py" 107 | version = "1.10.0" 108 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 109 | category = "dev" 110 | optional = false 111 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 112 | 113 | [[package]] 114 | name = "pyparsing" 115 | version = "2.4.7" 116 | description = "Python 
parsing module" 117 | category = "dev" 118 | optional = false 119 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 120 | 121 | [[package]] 122 | name = "pytest" 123 | version = "6.2.2" 124 | description = "pytest: simple powerful testing with Python" 125 | category = "dev" 126 | optional = false 127 | python-versions = ">=3.6" 128 | 129 | [package.dependencies] 130 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 131 | attrs = ">=19.2.0" 132 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 133 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 134 | iniconfig = "*" 135 | packaging = "*" 136 | pluggy = ">=0.12,<1.0.0a1" 137 | py = ">=1.8.2" 138 | toml = "*" 139 | 140 | [package.extras] 141 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 142 | 143 | [[package]] 144 | name = "six" 145 | version = "1.15.0" 146 | description = "Python 2 and 3 compatibility utilities" 147 | category = "main" 148 | optional = false 149 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 150 | 151 | [[package]] 152 | name = "toml" 153 | version = "0.10.2" 154 | description = "Python Library for Tom's Obvious, Minimal Language" 155 | category = "dev" 156 | optional = false 157 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 158 | 159 | [[package]] 160 | name = "typing-extensions" 161 | version = "3.7.4.3" 162 | description = "Backported and Experimental Type Hints for Python 3.5+" 163 | category = "dev" 164 | optional = false 165 | python-versions = "*" 166 | 167 | [[package]] 168 | name = "zipp" 169 | version = "3.4.0" 170 | description = "Backport of pathlib-compatible object wrapper for zip files" 171 | category = "dev" 172 | optional = false 173 | python-versions = ">=3.6" 174 | 175 | [package.extras] 176 | docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] 177 | testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "jaraco.test (>=3.2.0)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] 178 | 179 | [metadata] 180 | lock-version = "1.1" 181 | python-versions = "^3.6" 182 | content-hash = "2189ca77911e79e25a7de2f41b8625e10e836021cbc76cd1431d6da6e3e65339" 183 | 184 | [metadata.files] 185 | atomicwrites = [ 186 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 187 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 188 | ] 189 | attrs = [ 190 | {file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"}, 191 | {file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"}, 192 | ] 193 | colorama = [ 194 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 195 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 196 | ] 197 | grpcio = [ 198 | {file = "grpcio-1.35.0-cp27-cp27m-macosx_10_10_x86_64.whl", hash = "sha256:95cc4d2067deced18dc807442cf8062a93389a86abf8d40741120054389d3f29"}, 199 | {file = "grpcio-1.35.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:d186a0ce291f4386e28a7042ec31c85250b0c2e25d2794b87fa3c15ff473c46c"}, 200 | {file = 
"grpcio-1.35.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:c8d0a6a58a42275c6cb616e7cb9f9fcf5eba1e809996546e561cd818b8f7cff7"}, 201 | {file = "grpcio-1.35.0-cp27-cp27m-win32.whl", hash = "sha256:8d08f90d72a8e8d9af087476337da76d26749617b0a092caff4e684ce267af21"}, 202 | {file = "grpcio-1.35.0-cp27-cp27m-win_amd64.whl", hash = "sha256:0072ec4563ab4268c4c32e936955085c2d41ea175b662363496daedd2273372c"}, 203 | {file = "grpcio-1.35.0-cp27-cp27mu-linux_armv7l.whl", hash = "sha256:aca45d2ccb693c9227fbf21144891422a42dc4b76b52af8dd1d4e43afebe321d"}, 204 | {file = "grpcio-1.35.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:87147b1b306c88fe7dca7e3dff8aefd1e63d6aed86e224f9374ddf283f17d7f1"}, 205 | {file = "grpcio-1.35.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:22edfc278070d54f3ab7f741904e09155a272fe934e842babbf84476868a50de"}, 206 | {file = "grpcio-1.35.0-cp35-cp35m-linux_armv7l.whl", hash = "sha256:f3654a52f72ba28953dbe2e93208099f4903f4b3c07dc7ff4db671c92968111d"}, 207 | {file = "grpcio-1.35.0-cp35-cp35m-macosx_10_10_intel.whl", hash = "sha256:dc2589370ef84eb1cc53530070d658a7011d2ee65f18806581809c11cd016136"}, 208 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:f0c27fd16582a303e5baf6cffd9345c9ac5f855d69a51232664a0b888a77ba80"}, 209 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:b2985f73611b637271b00d9c4f177e65cc3193269bc9760f16262b1a12757265"}, 210 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2014_i686.whl", hash = "sha256:acb489b7aafdcf960f1a0000a1f22b45e5b6ccdf8dba48f97617d627f4133195"}, 211 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2014_x86_64.whl", hash = "sha256:16fd33030944672e49e0530dec2c60cd4089659ccdf327e99569b3b29246a0b6"}, 212 | {file = "grpcio-1.35.0-cp35-cp35m-win32.whl", hash = "sha256:1757e81c09132851e85495b802fe4d4fbef3547e77fa422a62fb4f7d51785be0"}, 213 | {file = "grpcio-1.35.0-cp35-cp35m-win_amd64.whl", hash = "sha256:35b72884e09cbc46c564091f4545a39fa66d132c5676d1a6e827517fff47f2c1"}, 214 | {file = "grpcio-1.35.0-cp36-cp36m-linux_armv7l.whl", hash = "sha256:17940a7dc461066f28816df48be44f24d3b9f150db344308ee2aeae033e1af0b"}, 215 | {file = "grpcio-1.35.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:75ea903edc42a8c6ec61dbc5f453febd79d8bdec0e1bad6df7088c34282e8c42"}, 216 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:b180a3ec4a5d6f96d3840c83e5f8ab49afac9fa942921e361b451d7a024efb00"}, 217 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:e163c27d2062cd3eb07057f23f8d1330925beaba16802312b51b4bad33d74098"}, 218 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:764b50ba1a15a2074cdd1a841238f2dead0a06529c495a46821fae84cb9c7342"}, 219 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:088c8bea0f6b596937fefacf2c8df97712e7a3dd49496975049cc95dbf02af1a"}, 220 | {file = "grpcio-1.35.0-cp36-cp36m-win32.whl", hash = "sha256:1aa53f82362c7f2791fe0cdd9a3b3aec325c11d8f0dfde600f91907dfaa8546b"}, 221 | {file = "grpcio-1.35.0-cp36-cp36m-win_amd64.whl", hash = "sha256:efb3d67405eb8030db6f27920b4be023fabfb5d4e09c34deab094a7c473a5472"}, 222 | {file = "grpcio-1.35.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:44aaa6148d18a8e836f99dadcdec17b27bc7ec0995b2cc12c94e61826040ec90"}, 223 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:18ad7644e23757420ea839ac476ef861e4f4841c8566269b7c91c100ca1943b3"}, 224 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = 
"sha256:859a0ceb23d7189362cc06fe7e906e9ed5c7a8f3ac960cc04ce13fe5847d0b62"}, 225 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:3e7d4428ed752fdfe2dddf2a404c93d3a2f62bf4b9109c0c10a850c698948891"}, 226 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:a36151c335280b09afd5123f3b25085027ae2b10682087a4342fb6f635b928fb"}, 227 | {file = "grpcio-1.35.0-cp37-cp37m-win32.whl", hash = "sha256:dfecb2acd3acb8bb50e9aa31472c6e57171d97c1098ee67cd283a6fe7d56a926"}, 228 | {file = "grpcio-1.35.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e87e55fba98ebd7b4c614dcef9940dc2a7e057ad8bba5f91554934d47319a35b"}, 229 | {file = "grpcio-1.35.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:da44bf613eed5d9e8df0785463e502a416de1be6e4ac31edbe99c9111abaed5f"}, 230 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:9e503eaf853199804a954dc628c5207e67d6c7848dcba42a997fbe718618a2b1"}, 231 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:6ba3d7acf70acde9ce27e22921db921b84a71be578b32739536c32377b65041a"}, 232 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:048c01d1eb5c2ae7cba2254b98938d2fc81f6dc10d172d9261d65266adb0fdb3"}, 233 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:efd896e8ca7adb2654cf014479a5e1f74e4f776b6b2c0fbf95a6c92787a6631a"}, 234 | {file = "grpcio-1.35.0-cp38-cp38-win32.whl", hash = "sha256:8a29a26b9f39701ce15aa1d5aa5e96e0b5f7028efe94f95341a4ed8dbe4bed78"}, 235 | {file = "grpcio-1.35.0-cp38-cp38-win_amd64.whl", hash = "sha256:aea3d592a7ece84739b92d212cd16037c51d84a259414f64b51c14e946611f3d"}, 236 | {file = "grpcio-1.35.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2f8e8d35d4799aa1627a212dbe8546594abf4064056415c31bd1b3b8f2a62027"}, 237 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:9f0da13b215068e7434b161a35d0b4e92140ffcfa33ddda9c458199ea1d7ce45"}, 238 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:7ae408780b79c9b9b91a2592abd1d7abecd05675d988ea75038580f420966b59"}, 239 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:0f714e261e1d63615476cda4ee808a79cca62f8f09e2943c136c2f87ec5347b1"}, 240 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:7ee7d54da9d176d3c9a0f47c04d7ff6fdc6ee1c17643caff8c33d6c8a70678a4"}, 241 | {file = "grpcio-1.35.0-cp39-cp39-win32.whl", hash = "sha256:94c3b81089a86d3c5877d22b07ebc66b5ed1d84771e24b001844e29a5b6178dd"}, 242 | {file = "grpcio-1.35.0-cp39-cp39-win_amd64.whl", hash = "sha256:399ee377b312ac652b07ef4365bbbba009da361fa7708c4d3d4ce383a1534ea7"}, 243 | {file = "grpcio-1.35.0.tar.gz", hash = "sha256:7bd0ebbb14dde78bf66a1162efd29d3393e4e943952e2f339757aa48a184645c"}, 244 | ] 245 | importlib-metadata = [ 246 | {file = "importlib_metadata-3.4.0-py3-none-any.whl", hash = "sha256:ace61d5fc652dc280e7b6b4ff732a9c2d40db2c0f92bc6cb74e07b73d53a1771"}, 247 | {file = "importlib_metadata-3.4.0.tar.gz", hash = "sha256:fa5daa4477a7414ae34e95942e4dd07f62adf589143c875c133c1e53c4eff38d"}, 248 | ] 249 | iniconfig = [ 250 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 251 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 252 | ] 253 | packaging = [ 254 | {file = "packaging-20.9-py2.py3-none-any.whl", hash = 
"sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, 255 | {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, 256 | ] 257 | pluggy = [ 258 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 259 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 260 | ] 261 | protobuf = [ 262 | {file = "protobuf-3.14.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:629b03fd3caae7f815b0c66b41273f6b1900a579e2ccb41ef4493a4f5fb84f3a"}, 263 | {file = "protobuf-3.14.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:5b7a637212cc9b2bcf85dd828b1178d19efdf74dbfe1ddf8cd1b8e01fdaaa7f5"}, 264 | {file = "protobuf-3.14.0-cp35-cp35m-macosx_10_9_intel.whl", hash = "sha256:43b554b9e73a07ba84ed6cf25db0ff88b1e06be610b37656e292e3cbb5437472"}, 265 | {file = "protobuf-3.14.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:5e9806a43232a1fa0c9cf5da8dc06f6910d53e4390be1fa06f06454d888a9142"}, 266 | {file = "protobuf-3.14.0-cp35-cp35m-win32.whl", hash = "sha256:1c51fda1bbc9634246e7be6016d860be01747354ed7015ebe38acf4452f470d2"}, 267 | {file = "protobuf-3.14.0-cp35-cp35m-win_amd64.whl", hash = "sha256:4b74301b30513b1a7494d3055d95c714b560fbb630d8fb9956b6f27992c9f980"}, 268 | {file = "protobuf-3.14.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:86a75477addde4918e9a1904e5c6af8d7b691f2a3f65587d73b16100fbe4c3b2"}, 269 | {file = "protobuf-3.14.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ecc33531a213eee22ad60e0e2aaea6c8ba0021f0cce35dbf0ab03dee6e2a23a1"}, 270 | {file = "protobuf-3.14.0-cp36-cp36m-win32.whl", hash = "sha256:72230ed56f026dd664c21d73c5db73ebba50d924d7ba6b7c0d81a121e390406e"}, 271 | {file = "protobuf-3.14.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0fc96785262042e4863b3f3b5c429d4636f10d90061e1840fce1baaf59b1a836"}, 272 | {file = "protobuf-3.14.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4e75105c9dfe13719b7293f75bd53033108f4ba03d44e71db0ec2a0e8401eafd"}, 273 | {file = "protobuf-3.14.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:2a7e2fe101a7ace75e9327b9c946d247749e564a267b0515cf41dfe450b69bac"}, 274 | {file = "protobuf-3.14.0-cp37-cp37m-win32.whl", hash = "sha256:b0d5d35faeb07e22a1ddf8dce620860c8fe145426c02d1a0ae2688c6e8ede36d"}, 275 | {file = "protobuf-3.14.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8971c421dbd7aad930c9bd2694122f332350b6ccb5202a8b7b06f3f1a5c41ed5"}, 276 | {file = "protobuf-3.14.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9616f0b65a30851e62f1713336c931fcd32c057202b7ff2cfbfca0fc7d5e3043"}, 277 | {file = "protobuf-3.14.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:22bcd2e284b3b1d969c12e84dc9b9a71701ec82d8ce975fdda19712e1cfd4e00"}, 278 | {file = "protobuf-3.14.0-py2.py3-none-any.whl", hash = "sha256:0e247612fadda953047f53301a7b0407cb0c3cb4ae25a6fde661597a04039b3c"}, 279 | {file = "protobuf-3.14.0.tar.gz", hash = "sha256:1d63eb389347293d8915fb47bee0951c7b5dab522a4a60118b9a18f33e21f8ce"}, 280 | ] 281 | py = [ 282 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 283 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 284 | ] 285 | pyparsing = [ 286 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 287 | {file = 
"pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 288 | ] 289 | pytest = [ 290 | {file = "pytest-6.2.2-py3-none-any.whl", hash = "sha256:b574b57423e818210672e07ca1fa90aaf194a4f63f3ab909a2c67ebb22913839"}, 291 | {file = "pytest-6.2.2.tar.gz", hash = "sha256:9d1edf9e7d0b84d72ea3dbcdfd22b35fb543a5e8f2a60092dd578936bf63d7f9"}, 292 | ] 293 | six = [ 294 | {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, 295 | {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, 296 | ] 297 | toml = [ 298 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 299 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 300 | ] 301 | typing-extensions = [ 302 | {file = "typing_extensions-3.7.4.3-py2-none-any.whl", hash = "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"}, 303 | {file = "typing_extensions-3.7.4.3-py3-none-any.whl", hash = "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918"}, 304 | {file = "typing_extensions-3.7.4.3.tar.gz", hash = "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c"}, 305 | ] 306 | zipp = [ 307 | {file = "zipp-3.4.0-py3-none-any.whl", hash = "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108"}, 308 | {file = "zipp-3.4.0.tar.gz", hash = "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"}, 309 | ] 310 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "bareunpy" 3 | version = "1.7.1" 4 | description = "The bareun python library using grpc" 5 | authors = ["Gihyun YUN "] 6 | license = "BSD-3-Clause" 7 | readme = "README.md" 8 | homepage = "https://bareun.ai/" 9 | repository = "https://github.com/bareun-nlp/bareunpy" 10 | keywords = [ "NLP", "Korean", "Deep Learning", "POS tagger", "bareun"] 11 | classifiers = [ 12 | "Development Status :: 5 - Production/Stable", 13 | "Intended Audience :: Science/Research", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Information Technology", 16 | "Intended Audience :: Education", 17 | "Natural Language :: Korean", 18 | "Operating System :: OS Independent", 19 | "Typing :: Typed", 20 | "Topic :: Software Development", 21 | "Topic :: Software Development :: Libraries", 22 | "Topic :: Scientific/Engineering", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | "Topic :: Scientific/Engineering :: Information Analysis", 25 | "Operating System :: Microsoft :: Windows", 26 | "Operating System :: POSIX", 27 | "Operating System :: Unix", 28 | "Operating System :: MacOS" 29 | ] 30 | 31 | [tool.poetry.dependencies] 32 | python = "^3.6" 33 | grpcio = "^1.53.2" 34 | protobuf = "^3.19.6" 35 | googleapis-common-protos = "^1.56.0" 36 | bareun-apis = "^0.15.2" 37 | 38 | [tool.poetry.group.dev.dependencies] 39 | pytest = "^6.2.2" 40 | 41 | [build-system] 42 | requires = ["poetry-core>=1.0.0"] 43 | build-backend = "poetry.core.masonry.api" 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio==1.53.2 
2 | googleapis-common-protos==1.56.0 3 | protobuf>=3.19.6 4 | bareun-apis==0.15.2 5 | setuptools~=60.5.0 # 6 | pytest>=7.2.1 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import setuptools 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | CLASSIFIERS = """\ 8 | Development Status :: 5 - Production/Stable 9 | Intended Audience :: Science/Research 10 | Intended Audience :: Developers 11 | License :: OSI Approved :: BSD License 12 | Programming Language :: Python :: 3 13 | Programming Language :: Python :: 3.6 14 | Programming Language :: Python :: 3.7 15 | Programming Language :: Python :: 3.8 16 | Programming Language :: Python :: 3.9 17 | Programming Language :: Python :: 3.10 18 | Programming Language :: Python :: 3.11 19 | Programming Language :: Python :: 3.12 20 | Programming Language :: Python :: 3.13 21 | Programming Language :: Python :: 3 :: Only 22 | Natural Language :: Korean 23 | Development Status :: 5 - Production/Stable 24 | Operating System :: OS Independent 25 | Typing :: Typed 26 | Topic :: Software Development 27 | Topic :: Scientific/Engineering 28 | Topic :: Scientific/Engineering :: Artificial Intelligence 29 | Topic :: Scientific/Engineering :: Information Analysis 30 | Operating System :: Microsoft :: Windows 31 | Operating System :: POSIX 32 | Operating System :: Unix 33 | Operating System :: MacOS 34 | """ 35 | 36 | # import grpc_tools 37 | # 38 | # setuptools.setup( 39 | # cmdclass={ 40 | # 'build_proto_modules': grpc_tools.command.BuildPackageProtos, 41 | # } 42 | # ) 43 | 44 | setuptools.setup( 45 | name="bareunpy", 46 | version="1.7.1", 47 | author="Gihyun YUN", 48 | author_email="gih2yun@baikal.ai", 49 | description="The bareun python API library", 50 | long_description=long_description, 51 | long_description_content_type="text/markdown", 52 | url="https://bareun.ai/", 53 | download_url="https://pypi.python.org/pypi/bareunpy", 54 | project_urls={ 55 | "Bug Tracker": "https://github.com/bareun-nlp/bareunpy/issues", 56 | # "Documentation": get_docs_url(), 57 | "Source Code": "https://github.com/bareun-nlp/bareunpy", 58 | }, 59 | license='BSD', 60 | platform='Independent', 61 | packages=setuptools.find_packages(), 62 | classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f], 63 | python_requires='>=3.6', 64 | ) 65 | -------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | #!env python3 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def tagger_instance(): 8 | import bareunpy 9 | ## FIXME change it nlp.bareun.ai 10 | t = bareunpy.Tagger(apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 11 | host="10.3.8.44", 12 | port=5656) 13 | return t 14 | 15 | @pytest.fixture 16 | def tagger_error_host_instance(): 17 | import bareunpy 18 | t = bareunpy.Tagger(apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 19 | host="10.3.8.44:5656", 20 | port=5656) 21 | return t 22 | 23 | @pytest.fixture 24 | def tagger_error_apikey_instance(): 25 | import bareunpy 26 | t = bareunpy.Tagger(apikey="koba-42CXULQ-SDPU6ZA", 27 
| host="10.3.8.44", 28 | port=5656) 29 | return t 30 | 31 | @pytest.fixture 32 | def sample1(): 33 | return '오늘은 정말 추운 날이네요.' 34 | 35 | 36 | def test_tagger_pos(tagger_instance, sample1): 37 | assert tagger_instance.pos(sample1) == \ 38 | [('오늘', 'NNG'), 39 | ('은', 'JX'), 40 | ('정말', 'MAG'), 41 | ('춥', 'VA'), 42 | ('ㄴ', 'ETM'), 43 | ('날', 'NNG'), 44 | ('이', 'VCP'), 45 | ('네', 'EF'), 46 | ('요', 'JX'), 47 | ('.', 'SF') 48 | ] 49 | 50 | 51 | def test_tagger_pos_join(tagger_instance, sample1): 52 | assert tagger_instance.pos(sample1, join=True) == \ 53 | ['오늘/NNG', 54 | '은/JX', 55 | '정말/MAG', 56 | '춥/VA', 57 | 'ㄴ/ETM', 58 | '날/NNG', 59 | '이/VCP', 60 | '네/EF', 61 | '요/JX', 62 | './SF' 63 | ] 64 | 65 | 66 | def test_tagger_pos_detail(tagger_instance, sample1): 67 | temp = tagger_instance.pos(sample1, detail=True) 68 | temp2 = [(t[0], t[1], t[2]) for t in temp] 69 | assert (temp2 == [('오늘', 'NNG', 'IN_WORD_EMBEDDING'), 70 | ('은', 'JX', 'IN_WORD_EMBEDDING'), 71 | ('정말', 'MAG', 'IN_WORD_EMBEDDING'), 72 | ('춥', 'VA', 'IN_WORD_EMBEDDING'), 73 | ('ㄴ', 'ETM', 'IN_WORD_EMBEDDING'), 74 | ('날', 'NNG', 'IN_WORD_EMBEDDING'), 75 | ('이', 'VCP', 'IN_WORD_EMBEDDING'), 76 | ('네', 'EF', 'IN_WORD_EMBEDDING'), 77 | ('요', 'JX', 'IN_WORD_EMBEDDING'), 78 | ('.', 'SF', 'IN_WORD_EMBEDDING')]) 79 | 80 | 81 | def test_tagger_morphs(tagger_instance, sample1): 82 | assert tagger_instance.morphs(sample1) == \ 83 | ['오늘', 84 | '은', 85 | '정말', 86 | '춥', 87 | 'ㄴ', 88 | '날', 89 | '이', 90 | '네', 91 | '요', 92 | '.'] 93 | 94 | 95 | def test_tagger_nouns(tagger_instance, sample1): 96 | assert tagger_instance.nouns(sample1) == \ 97 | ['오늘', '날'] 98 | 99 | 100 | def test_tagger_tag_as_json_str(tagger_instance, sample1): 101 | j = tagger_instance.tag(sample1).as_json() 102 | assert len(j['sentences']) == 1 103 | assert len(j['sentences'][0]['tokens']) == 4 104 | assert len(j['sentences'][0]['tokens'][0]['morphemes']) == 2 105 | assert len(j['sentences'][0]['tokens'][1]['morphemes']) == 1 106 | assert len(j['sentences'][0]['tokens'][2]['morphemes']) == 2 107 | assert len(j['sentences'][0]['tokens'][3]['morphemes']) == 5 108 | assert len(j['sentences'][0]['tokens'][3]['morphemes']) == 5 109 | 110 | 111 | def test_tagger_tag_as_msg(tagger_instance, sample1): 112 | m = tagger_instance.tag(sample1).msg() 113 | assert m.sentences[0].tokens[3].tagged == '날/NNG+이/VCP+네/EF+요/JX+./SF' 114 | 115 | 116 | def test_tagger_tag_print_as_json(tagger_instance, sample1): 117 | import tempfile 118 | with tempfile.TemporaryFile('w+') as f: 119 | tagger_instance.tag(sample1).print_as_json(out=f) 120 | assert f.tell() > 0 121 | 122 | 123 | def test_tagger_create_custom_dict(tagger_instance): 124 | try: 125 | cd = tagger_instance.custom_dict('my') 126 | assert cd is not None 127 | except TypeError as e: 128 | assert False 129 | 130 | 131 | def test_tagger_update_custom_dict(tagger_instance): 132 | try: 133 | cd = tagger_instance.custom_dict('my') 134 | cd.copy_np_set({'유리왕', '근초고왕', '누루하치', '베링거인겔하임'}) 135 | cd.copy_cp_set({'코로나19'}) 136 | cd.copy_cp_caret_set({'인공지능^데이터^학습', '자연어^처리^엔진'}) 137 | cd.update() 138 | assert True 139 | except TypeError as e: 140 | assert False 141 | 142 | 143 | def test_tagger_get_custom_dict_np_set(tagger_instance): 144 | try: 145 | cd = tagger_instance.custom_dict('my') 146 | dic = cd.get() 147 | assert len(dic.np_set.items) == 4 148 | assert '유리왕' in dic.np_set.items 149 | assert '근초고왕' in dic.np_set.items 150 | assert '누루하치' in dic.np_set.items 151 | assert '베링거인겔하임' in dic.np_set.items 152 | except TypeError as e: 153 | 
assert False 154 | 155 | 156 | def test_tagger_get_custom_dict_cp_set(tagger_instance): 157 | try: 158 | cd = tagger_instance.custom_dict('my') 159 | dic = cd.get() 160 | assert len(dic.cp_set.items) == 1 161 | assert '코로나19' in dic.cp_set.items 162 | except TypeError as e: 163 | assert False 164 | 165 | 166 | def test_tagger_get_custom_dict_cp_caret_set(tagger_instance): 167 | try: 168 | cd = tagger_instance.custom_dict('my') 169 | dic = cd.get() 170 | assert len(dic.cp_caret_set.items) == 2 171 | assert '인공지능^데이터^학습' in dic.cp_caret_set.items 172 | assert '자연어^처리^엔진' in dic.cp_caret_set.items 173 | except TypeError as e: 174 | assert False 175 | 176 | def test_exception_apikey(tagger_error_apikey_instance, sample1): 177 | try: 178 | tagger_error_apikey_instance.pos(sample1) 179 | except Exception as e: 180 | assert e.args[0][:27] == '\n입력한 API KEY가 정확한지 확인해 주세요.' 181 | 182 | def test_exception_host(tagger_error_host_instance, sample1): 183 | try: 184 | tagger_error_host_instance.pos(sample1) 185 | except Exception as e: 186 | assert e.args[0][:16] == '\n서버에 연결할 수 없습니다.' 187 | 188 | -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!env python3 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | from bareunpy import Tokenized 5 | 6 | @pytest.fixture 7 | def tokenizer_instance(): 8 | import bareunpy 9 | ## FIXME change it nlp.bareun.ai 10 | t = bareunpy.Tokenizer( 11 | apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 12 | host="10.3.8.44", 13 | port=5757) 14 | return t 15 | 16 | @pytest.fixture 17 | def tokenizer_error_host_instance(): 18 | import bareunpy 19 | ## FIXME change it nlp.bareun.ai 20 | t = bareunpy.Tokenizer( 21 | apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 22 | host="10.3.8.44:5757", 23 | port=5757) 24 | return t 25 | 26 | @pytest.fixture 27 | def tokenizer_error_apikey_instance(): 28 | import bareunpy 29 | ## FIXME change it nlp.bareun.ai 30 | t = bareunpy.Tokenizer( 31 | apikey="koba-42CXULQ-SDPU6ZA", 32 | host="10.3.8.44", 33 | port=5757) 34 | return t 35 | 36 | TEST_STR='오늘은 정말 추운 날이네요.' 
37 | @pytest.fixture 38 | def sample1(): 39 | return TEST_STR 40 | 41 | 42 | def test_tokenizer_seg_not_flatten(tokenizer_instance, sample1): 43 | assert tokenizer_instance.seg(sample1, flatten=False) == \ 44 | [[('오늘'), ('은')], 45 | [('정말')], 46 | [('춥'), ('ㄴ')], 47 | [('날'), ('이'), ('네'), ('요'), ('.')] 48 | ] 49 | 50 | 51 | def test_tokenizer_seg_join(tokenizer_instance, sample1): 52 | assert tokenizer_instance.seg(sample1, join=True, detail=True) == \ 53 | ['오늘/N', 54 | '은/J', 55 | '정말/A', 56 | '춥/V', 57 | 'ㄴ/E', 58 | '날/N', 59 | '이/V', 60 | '네/E', 61 | '요/J', 62 | './S' 63 | ] 64 | 65 | 66 | def test_tokenizer_seg_detail(tokenizer_instance, sample1): 67 | temp = tokenizer_instance.seg(sample1, detail=True) 68 | temp2 = [(t[0], t[1]) for t in temp] 69 | assert (temp2 == [('오늘', 'N'), 70 | ('은', 'J'), 71 | ('정말', 'A'), 72 | ('춥', 'V'), 73 | ('ㄴ', 'E'), 74 | ('날', 'N'), 75 | ('이', 'V'), 76 | ('네', 'E'), 77 | ('요', 'J'), 78 | ('.', 'S')]) 79 | 80 | 81 | def test_tokenizer_seg(tokenizer_instance, sample1): 82 | assert tokenizer_instance.seg(sample1) == \ 83 | ['오늘', 84 | '은', 85 | '정말', 86 | '춥', 87 | 'ㄴ', 88 | '날', 89 | '이', 90 | '네', 91 | '요', 92 | '.'] 93 | 94 | 95 | def test_tokenizer_nouns(tokenizer_instance, sample1): 96 | assert tokenizer_instance.nouns(sample1) == \ 97 | ['오늘', '날'] 98 | 99 | 100 | def test_tokenizer_tokenize_as_json_str(tokenizer_instance, sample1): 101 | j = tokenizer_instance.tokenize(sample1).as_json() 102 | assert len(j['sentences']) == 1 103 | assert len(j['sentences'][0]['tokens']) == 4 104 | assert len(j['sentences'][0]['tokens'][0]['segments']) == 2 105 | assert len(j['sentences'][0]['tokens'][1]['segments']) == 1 106 | assert len(j['sentences'][0]['tokens'][2]['segments']) == 2 107 | assert len(j['sentences'][0]['tokens'][3]['segments']) == 5 108 | assert len(j['sentences'][0]['tokens'][3]['segments']) == 5 109 | 110 | 111 | def test_tokenizer_tokenize_as_msg(tokenizer_instance, sample1): 112 | m = tokenizer_instance.tokenize(sample1).msg() 113 | assert m.sentences[0].tokens[3].tagged == '날/N+이/V+네/E+요/J+./S' 114 | 115 | 116 | def test_tokenizer_tokenize_print_as_json(tokenizer_instance, sample1): 117 | import tempfile 118 | with tempfile.TemporaryFile('w+') as f: 119 | tokenizer_instance.tokenize(sample1).print_as_json(out=f) 120 | assert f.tell() > 0 121 | 122 | def test_tokenized_nouns(tokenized: Tokenized): 123 | assert tokenized.nouns() == \ 124 | ['오늘', 125 | '날'] 126 | 127 | def test_exception_apikey_tokenizer(tokenizer_error_apikey_instance, sample1): 128 | try: 129 | m = tokenizer_error_apikey_instance.tokenize(sample1).msg() 130 | except Exception as e: 131 | assert e.args[0][:27] == '\n입력한 API KEY가 정확한지 확인해 주세요.' 132 | 133 | def test_exception_host_tokenizer(tokenizer_error_host_instance, sample1): 134 | try: 135 | m = tokenizer_error_host_instance.tokenize(sample1).msg() 136 | except Exception as e: 137 | assert e.args[0][:16] == '\n서버에 연결할 수 없습니다.' 
138 | 139 | @pytest.fixture 140 | def tokenized() -> Tokenized: 141 | import bareunpy 142 | # FIXME change it nlp.bareun.ai 143 | t = bareunpy.Tokenizer( 144 | apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 145 | host="10.3.8.44", 146 | port=5757) 147 | 148 | return t.tokenize(TEST_STR) 149 | 150 | def test_tokenized_verbs(tokenized: Tokenized): 151 | assert tokenized.predicates() == \ 152 | ['춥', 153 | '이'] 154 | 155 | def test_tokenized_symbols(tokenized: Tokenized): 156 | assert tokenized.symbols() == \ 157 | ['.'] 158 | 159 | def test_tokenized_adverbs(tokenized: Tokenized): 160 | assert tokenized.adverbs() == \ 161 | ['정말'] 162 | 163 | def test_tokenized_endings(tokenized: Tokenized): 164 | assert tokenized.endings() == \ 165 | ['ㄴ','네'] 166 | 167 | def test_tokenized_postpositions(tokenized: Tokenized): 168 | assert tokenized.postpositions() == \ 169 | ['은','요',] 170 | --------------------------------------------------------------------------------
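A minimal end-to-end usage sketch of the tokenizer API above (not part of the repository). It assumes a reachable Bareun analysis server and a valid API key; "YOUR_API_KEY" and the sample sentences below are placeholders, and the host/port are the defaults from Tokenizer.__init__.

from bareunpy import Tokenizer

# Connect to a Bareun server; host and port fall back to nlp.bareun.ai:5656 when omitted.
tokenizer = Tokenizer(apikey="YOUR_API_KEY", host="nlp.bareun.ai", port=5656)

# tokenize() returns a Tokenized object wrapping the gRPC TokenizeResponse.
tokenized = tokenizer.tokenize("오늘은 정말 추운 날이네요.")
print(tokenized.segments())        # every segment as a plain string
print(tokenized.nouns())           # segments whose hint is 'N' (체언)
print(tokenized.predicates())      # segments whose hint is 'V' (용언)
print(tokenized.postpositions())   # segments whose hint is 'J' (조사)

# The Tokenizer shortcuts call tokenize() internally.
print(tokenizer.seg("오늘은 정말 추운 날이네요."))                          # flat list of segments
print(tokenizer.seg("오늘은 정말 추운 날이네요.", detail=True))             # (segment, hint, ...) tuples
print(tokenizer.seg("오늘은 정말 추운 날이네요.", join=True, detail=True))  # 'segment/hint' strings

# Several sentences can be analyzed in one round trip.
tokenizer.tokenize_list(["첫 번째 문장입니다.", "두 번째 문장입니다."])

# A wrong API key or an unreachable server raises an Exception whose message
# begins with the Korean guidance strings asserted in the tests above.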