├── .gitignore ├── .vscode ├── env ├── launch.json └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── bareunpy ├── __init__.py ├── _corrector.py ├── _custom_dict.py ├── _custom_dict_client.py ├── _lang_service_client.py ├── _revision_service_client.py ├── _tagger.py └── _tokenizer.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── test_tagger.py └── test_tokenizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.vscode/env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=bareunpy:. -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // IntelliSense를 사용하여 가능한 특성에 대해 알아보세요. 3 | // 기존 특성에 대한 설명을 보려면 가리킵니다. 4 | // 자세한 내용을 보려면 https://go.microsoft.com/fwlink/?linkid=830387을(를) 방문하세요. 
5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: pytest", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "module": "pytest", 12 | "justMyCode": true, 13 | "args": ["tests/"] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.defaultInterpreterPath": "${workspaceFolder}/venv/bin/python", 3 | "python.envFile": "${workspaceFolder}/.vscode/.env", 4 | "python.autoComplete.extraPaths": [ 5 | "bareunpy" 6 | ], 7 | "python.testing.pytestArgs": [ 8 | "tests" 9 | ], 10 | "python.testing.unittestEnabled": true, 11 | "python.testing.pytestEnabled": true 12 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020-2023, BAIKAL AI Inc. and Korea Press Foundation. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: venv-local venv-pip-upgrade clean \ 2 | venv-upgrade-build venv-pytest test build \ 3 | venv-poetry poetry venv 4 | 5 | PIP3=venv/bin/pip3 6 | PY3=venv/bin/python3 7 | POETRY=venv/bin/poetry 8 | 9 | all: build 10 | 11 | venv-local: 12 | @test -d venv || python3 -m venv venv 13 | 14 | venv-pip-upgrade: venv-local 15 | @$(PIP3) install --upgrade pip 2> /dev/null 16 | 17 | venv/req-install.log: requirements.txt venv-pip-upgrade 18 | @$(PY3) -c "import pkg_resources; pkg_resources.require(open('requirements.txt',mode='r'))" 2> /dev/null \ 19 | || (echo "installing"; $(PIP3) install -r $< --log $@) 20 | 21 | venv: venv/req-install.log 22 | @. venv/bin/activate 23 | 24 | 25 | clean: 26 | rm -rf venv 27 | find . 
-iname "*.pyc" -delete 28 | 29 | venv-poetry: venv 30 | @$(PY3) -m pip install --upgrade poetry 2> /dev/null 31 | 32 | build: venv-poetry 33 | @$(POETRY) build 34 | 35 | publish: build 36 | @$(POETRY) publish 37 | 38 | venv-pytest: venv 39 | @$(PY3) -m pip install --upgrade pytest &> /dev/null 40 | 41 | testall: venv 42 | @$(PY3) -m pytest tests/ 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is this? 2 | 3 | `bareunpy` is the python 3 library for bareun. 4 | 5 | Bareun is a Korean NLP, 6 | which provides tokenizing, POS tagging for Korean. 7 | 8 | ## How to install 9 | 10 | ```shell 11 | pip3 install bareunpy 12 | ``` 13 | 14 | ## How to get bareun 15 | - Go to https://bareun.ai/. 16 | - With registration, for the first time, you can get a API-KEY to use it freely. 17 | - With API-KEY, you can install the `bareun1` server. 18 | - Or you can make a call to use this `bareunpy` library to any servers. 19 | - Or use docker image. See https://hub.docker.com/r/bareunai/bareun 20 | ```shell 21 | docker pull bareunai/bareun:latest 22 | ``` 23 | 24 | ## How to use, tagger 25 | 26 | ```python 27 | import sys 28 | import google.protobuf.text_format as tf 29 | from bareunpy import Tagger 30 | 31 | # You can get an API-KEY from https://bareun.ai/ 32 | # Please note that you need to sign up and verify your email. 33 | # 아래에 "https://bareun.ai/"에서 이메일 인증 후 발급받은 API KEY("koba-...")를 입력해주세요. "로그인-내정보 확인" 34 | API_KEY = "koba-ABCDEFG-1234567-LMNOPQR-7654321" # <- 본인의 API KEY로 교체(Replace this with your own API KEY) 35 | 36 | # If you have your own localhost bareun. 37 | my_tagger = Tagger(API_KEY, 'localhost') 38 | # or if you have your own bareun which is running on 10.8.3.211:15656. 39 | my_tagger = Tagger(API_KEY, '10.8.3.211', 15656) 40 | 41 | 42 | # print results. 43 | res = tagger.tags(["안녕하세요.", "반가워요!"]) 44 | 45 | # get protobuf message. 46 | m = res.msg() 47 | tf.PrintMessage(m, out=sys.stdout, as_utf8=True) 48 | print(tf.MessageToString(m, as_utf8=True)) 49 | print(f'length of sentences is {len(m.sentences)}') 50 | ## output : 2 51 | print(f'length of tokens in sentences[0] is {len(m.sentences[0].tokens)}') 52 | print(f'length of morphemes of first token in sentences[0] is {len(m.sentences[0].tokens[0].morphemes)}') 53 | print(f'lemma of first token in sentences[0] is {m.sentences[0].tokens[0].lemma}') 54 | print(f'first morph of first token in sentences[0] is {m.sentences[0].tokens[0].morphemes[0]}') 55 | print(f'tag of first morph of first token in sentences[0] is {m.sentences[0].tokens[0].morphemes[0].tag}') 56 | 57 | ## Advanced usage. 58 | for sent in m.sentences: 59 | for token in sent.tokens: 60 | for m in token.morphemes: 61 | print(f'{m.text.content}/{m.tag}:{m.probability}:{m.out_of_vocab}) 62 | 63 | # get json object 64 | jo = res.as_json() 65 | print(jo) 66 | 67 | # get tuple of pos tagging. 
67 | # get tuple of pos tagging.
68 | pa = res.pos()
69 | print(pa)
70 | # other methods
71 | ma = res.morphs()
72 | print(ma)
73 | na = res.nouns()
74 | print(na)
75 | va = res.verbs()
76 | print(va)
77 |
78 | # custom dictionary
79 | cust_dic = my_tagger.custom_dict("my")
80 | cust_dic.copy_np_set({'내고유명사', '우리집고유명사'})
81 | cust_dic.copy_cp_set({'코로나19'})
82 | cust_dic.copy_cp_caret_set({'코로나^백신', '독감^백신'})
83 | cust_dic.update()
84 |
85 | # load prev custom dict
86 | cust_dict2 = my_tagger.custom_dict("my")
87 | cust_dict2.load()
88 |
89 | my_tagger.set_domain('my')
90 | my_tagger.pos('코로나19는 언제 끝날까요?')
91 | ```
92 |
93 |
94 | ## How to use, tokenizer
95 |
96 | ```python
97 | import sys
98 | import google.protobuf.text_format as tf
99 | from bareunpy import Tokenizer
100 |
101 | # You can get an API-KEY from https://bareun.ai/
102 | # Please note that you need to sign up and verify your email.
103 | # Enter the API KEY ("koba-...") issued at https://bareun.ai/ after e-mail verification ("Login > My Info").
104 | API_KEY = "koba-ABCDEFG-1234567-LMNOPQR-7654321"  # <- Replace this with your own API KEY
105 |
106 | # If you have your own localhost bareun.
107 | my_tokenizer = Tokenizer(API_KEY, 'localhost')
108 | # or if you have your own bareun which is running on 10.8.3.211:15656.
109 | my_tokenizer = Tokenizer(API_KEY, '10.8.3.211', 15656)
110 |
111 |
112 | # print results.
113 | tokenized = my_tokenizer.tokenize_list(["안녕하세요.", "반가워요!"])
114 |
115 | # get protobuf message.
116 | m = tokenized.msg()
117 | tf.PrintMessage(m, out=sys.stdout, as_utf8=True)
118 | print(tf.MessageToString(m, as_utf8=True))
119 | print(f'length of sentences is {len(m.sentences)}')
120 | ## output : 2
121 | print(f'length of tokens in sentences[0] is {len(m.sentences[0].tokens)}')
122 | print(f'length of segments of first token in sentences[0] is {len(m.sentences[0].tokens[0].segments)}')
123 | print(f'tagged of first token in sentences[0] is {m.sentences[0].tokens[0].tagged}')
124 | print(f'first segment of first token in sentences[0] is {m.sentences[0].tokens[0].segments[0]}')
125 | print(f'hint of first morph of first token in sentences[0] is {m.sentences[0].tokens[0].segments[0].hint}')
126 |
127 | ## Advanced usage.
128 | for sent in m.sentences:
129 |     for token in sent.tokens:
130 |         for m in token.segments:
131 |             print(f'{m.text.content}/{m.hint}')
132 |
133 | # get json object
134 | jo = tokenized.as_json()
135 | print(jo)
136 |
137 | # get tuple of segments
138 | ss = tokenized.segments()
139 | print(ss)
140 | ns = tokenized.nouns()
141 | print(ns)
142 | vs = tokenized.verbs()
143 | print(vs)
144 | # postpositions: 조사
145 | ps = tokenized.postpositions()
146 | print(ps)
147 | # Adverbs, 부사
148 | ass = tokenized.adverbs()
149 | print(ass)
150 | ss = tokenized.symbols()
151 | print(ss)
152 |
153 | ```
154 |
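The `Tokenized` result also exposes a few more extractors defined in `bareunpy/_tokenizer.py`. The short sketch below reuses the `tokenized` object from the example above; the variable names are only illustrative.

```python
# endings (어미), prenouns/determiners (관형사), interjections (감탄사)
es = tokenized.endings()
print(es)
mm = tokenized.prenouns()
print(mm)
ij = tokenized.interjections()
print(ij)
```
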
"로그인-내정보 확인" 162 | API_KEY = "koba-ABCDEFG-1234567-LMNOPQR-7654321" # <- 본인의 API KEY로 교체(Replace this with your own API KEY) 163 | 164 | # Initialize Corrector 165 | corrector = Corrector(API_KEY) 166 | 167 | # Single sentence correction 168 | response = corrector.correct_error("영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었다.") 169 | print(f"Original: {response.origin}") 170 | print(f"Corrected: {response.revised}") 171 | corrector.print_results(response) 172 | 173 | # Multiple sentences correction 174 | responses = corrector.correct_error_list([ 175 | "어머니 께서 만들어주신김치찌게가너무맵다며동생이울어버렸다.", 176 | "영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었다." 177 | ]) 178 | for res in responses: 179 | print(f"Original: {res.origin}") 180 | print(f"Corrected: {res.revised}") 181 | 182 | corrector.print_results(responses) 183 | 184 | # JSON output 185 | corrector.print_as_json(response) 186 | 187 | ``` -------------------------------------------------------------------------------- /bareunpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | bareunpy 3 | ===== 4 | Provides 5 | 1. a Korean Part-Of-Speech Tagger as bareun client 6 | 2. Multiple custom dictionaries which is kept in the your bareun server. 7 | 8 | 9 | How to use the documentation 10 | ---------------------------- 11 | Full documentation for bareun is available in 12 | installable tarball or docker images. 13 | - see `docs/intro.html` at installable tarball. 14 | - or `http://localhost:5757/intro.html` after running docker. 15 | 16 | The docstring examples assume that `bareunpy` has been imported as `brn`:: 17 | >>> import bareunpy as brn 18 | 19 | Use the built-in ``help`` function to view a class's docstring:: 20 | >>> help(brn.Tagger) 21 | ... 22 | 23 | Classes 24 | ------- 25 | Tagger 26 | the bareun POS tagger for Korean 27 | `from bareunpy import Tagger` 28 | Tagged 29 | Wrapper for tagged output 30 | `from bareunpy import Tagged` 31 | CustomDict 32 | Custom dictionary for Korean. 33 | `from bareunpy import CustomDict` 34 | 35 | Version 36 | ------- 37 | ``` 38 | import bareunpy as brn 39 | print(brn.version) 40 | print(brn.bareun_version) 41 | ``` 42 | 43 | Get bareun 44 | ---------------------------- 45 | - Use docker, https://hub.docker.com/r/bareunai/bareun 46 | - Or visit https://bareun.ai/ 47 | """ 48 | 49 | import sys 50 | import os 51 | 52 | from bareunpy._tagger import Tagger, Tagged 53 | from bareunpy._tokenizer import Tokenizer, Tokenized 54 | from bareunpy._corrector import Corrector 55 | from bareunpy._custom_dict import CustomDict 56 | from bareunpy._custom_dict_client import CustomDictionaryServiceClient 57 | from bareunpy._lang_service_client import BareunLanguageServiceClient 58 | 59 | version = "1.7.0" 60 | bareun_version = "3.0.0" 61 | -------------------------------------------------------------------------------- /bareunpy/_corrector.py: -------------------------------------------------------------------------------- 1 | import grpc 2 | import json 3 | from sys import stdout 4 | from typing import IO, List, Union 5 | from google.protobuf.json_format import MessageToDict 6 | 7 | import bareun.revision_service_pb2 as pb 8 | import bareun.lang_common_pb2 as lpb 9 | from ._revision_service_client import BareunRevisionServiceClient 10 | 11 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 12 | 13 | 14 | class Corrector: 15 | """ 16 | Corrector는 맞춤법 교정 서비스를 제공하는 클래스입니다. 17 | 18 | .. 
code-block:: python 19 | :emphasize-lines: 1 20 | >>> from bareunpy import Corrector 21 | >>> corrector = Corrector(apikey="koba-YOURKEY") 22 | 23 | >>> response = corrector.correct_error("영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었더니 고은 꽃이 피었다.") 24 | >>> corrector.print_results(response) 25 | === 맞춤법 검사 결과 1=== 26 | 원문: 영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었더니 고은 꽃이 피었다. 27 | 교정문: 영수도 줄기가 얇아서 시들 것 같은 꽃에 물을 주었더니 고운 꽃이 피었다. 28 | 29 | === 교정된 문장들 === 30 | [1] 원문: 영수 도 줄기가 얇어서 시들을 것 같은 꽃에물을 주었더니 고은 꽃이 피었다. 31 | 교정문: 영수도 줄기가 얇아서 시들 것 같은 꽃에 물을 주었더니 고운 꽃이 피었다. 32 | === 수정 블록 === 33 | 1-1 원문: 영수 도 34 | 교정문: 영수도 35 | 수정 세부사항: 36 | - 조사는 그 앞말에 붙여 쓴다. (일반) 37 | 1-2 ... 38 | 39 | 40 | :param apikey: str. Bareun API 키 41 | :param host: str. gRPC 서버 호스트, 로컬에 바른 서버 설치시 'localhost' 입력 (기본값: nlp.bareun.ai) 42 | :param port: int. gRPC 서버 포트 (기본값: 5656) 43 | """ 44 | 45 | def __init__(self, apikey: str, host: str = "", port: int = 5656): 46 | """ 47 | Corrector 초기화 48 | 49 | Args: 50 | apikey (str): API 키 51 | host (str): gRPC 서버 호스트 52 | port (int): gRPC 서버 포트 53 | """ 54 | if host: 55 | host = host.strip() 56 | if apikey: 57 | apikey = apikey.strip() 58 | if host == "" or host is None: 59 | self.host = 'nlp.bareun.ai' 60 | else: 61 | self.host = host 62 | 63 | if port is not None: 64 | self.port = port 65 | else: 66 | self.port = 5656 67 | 68 | self.channel = grpc.insecure_channel( 69 | f"{self.host}:{self.port}", 70 | options=[ 71 | ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 72 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), 73 | ] 74 | ) 75 | self.client = BareunRevisionServiceClient(self.channel, apikey, self.host, self.port) 76 | 77 | def correct_error(self, content: str, 78 | custom_dicts: List[str] = [], 79 | config: Union[pb.RevisionConfig, None] = None) -> pb.CorrectErrorResponse: 80 | """ 81 | 맞춤법 교정 요청 82 | 83 | Args: 84 | content (str): 교정을 요청할 문장 85 | custom_dicts (List[str]): 커스텀 도메인 정보 86 | config Union[pb.RevisionConfig, None] : 요청 설정 87 | 88 | Returns: 89 | pb.CorrectErrorResponse: 교정 결과 90 | """ 91 | request = pb.CorrectErrorRequest( 92 | document=lpb.Document(content=content, language="ko_KR"), 93 | encoding_type=lpb.EncodingType.UTF32, 94 | ) 95 | if len(custom_dicts): 96 | request.custom_dict_names.extend(custom_dicts) 97 | if config != None: 98 | request.config.CopyFrom(config) 99 | 100 | return self.client.correct_error(request) 101 | 102 | def print_results(self, res: pb.CorrectErrorResponse, out: IO = stdout) -> None: 103 | """ 104 | 교정 결과를 출력 105 | 106 | Args: 107 | response pb.CorrectErrorResponse: 교정 결과 또는 교정 결과의 리스트 108 | out (IO): 출력 대상 (기본값: stdout) 109 | """ 110 | print(f'원문: {res.origin}', file=out) 111 | print(f'교정: {res.revised}', file=out) 112 | 113 | print("\n=== 교정된 문장들 ===", file=out) 114 | 115 | for sent in res.revised_sentences: 116 | print(f" 원문: {sent.origin}", file=out) 117 | print(f"교정문: {sent.revised}", file=out) 118 | 119 | for block in res.revised_blocks: 120 | print(f'원문:{block.origin.content} offset:{block.origin.begin_offset}, length:{block.origin.length}', file=out) 121 | print(f'대표 교정: {block.revised}', file=out) 122 | for rev in block.revisions: 123 | print(f' 교정: {rev.revised}, 카테고리:{rev.category}, 도움말 {res.helps[rev.help_id].comment}') 124 | 125 | for cleanup in res.whitespace_cleanup_ranges: 126 | print(f'공백제거: offset:{cleanup.offset} length:{cleanup.length} position: {cleanup.position}') 127 | 128 | 129 | def as_json(self, response: pb.CorrectErrorResponse) -> dict: 130 | """ 131 | 교정 결과를 JSON 형식으로 변환 132 | 133 | Args: 134 | response 
(Union[pb.CorrectErrorResponse, List[pb.CorrectErrorResponse]]): 교정 결과 또는 교정 결과의 리스트 135 | 136 | Returns: 137 | Union[dict, List[dict]]: JSON 형식으로 변환된 결과 138 | """ 139 | return MessageToDict(response, True) 140 | 141 | def as_json_str(self, response: pb.CorrectErrorResponse) -> str: 142 | """ 143 | 교정 결과를 JSON 문자열로 변환 144 | 145 | Args: 146 | response (Union[pb.CorrectErrorResponse, List[pb.CorrectErrorResponse]]): 교정 결과 또는 교정 결과의 리스트 147 | 148 | Returns: 149 | str: JSON 문자열로 변환된 결과 150 | """ 151 | json_data = self.as_json(response) 152 | return json.dumps(json_data, ensure_ascii=False, indent=2) 153 | 154 | def print_as_json(self, response: pb.CorrectErrorResponse, out: IO = stdout) -> None: 155 | """ 156 | 교정 결과를 JSON 형식으로 출력 157 | 158 | Args: 159 | response (Union[pb.CorrectErrorResponse, List[pb.CorrectErrorResponse]]): 교정 결과 또는 교정 결과의 리스트 160 | out (IO): 출력 대상 (기본값: stdout) 161 | """ 162 | json_data = self.as_json(response) 163 | json.dump(json_data, out, ensure_ascii=False, indent=2) 164 | -------------------------------------------------------------------------------- /bareunpy/_custom_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import List 4 | import grpc 5 | from ._custom_dict_client import CustomDictionaryServiceClient 6 | from bareun.custom_dict_pb2 import CustomDictionary 7 | from bareun.dict_common_pb2 import DictSet 8 | 9 | 10 | def read_dic_file(fn :str) -> set: 11 | """ 12 | 사용자 사전의 파일을 읽어들입니다. 13 | 14 | Args: 15 | fn (str): 사용자 사전 파일 이름 16 | 17 | Returns: 18 | set: 사용자 사전을 set 형식으로 만들어서 돌려줍니다. 19 | """ 20 | dict_set = set() 21 | with open(fn, 'r') as r: 22 | while True: 23 | w = r.readline() 24 | if not w: 25 | break 26 | if w[0] != '#': 27 | w2 = w.strip() 28 | if len(w2) > 0: 29 | dict_set.add(w2) 30 | return dict_set 31 | 32 | 33 | def pb_map_to_set(ds: DictSet) -> set: 34 | """ 35 | DictSet을 사전으로 변환합니다. 36 | 37 | Args: 38 | ds (DictSet): DictSet 객체 39 | 40 | Returns: 41 | set: 중복이 없는 사전 객체 42 | """ 43 | ret = set() 44 | for k in ds.items.keys(): 45 | ret.add(k) 46 | return ret 47 | 48 | class CustomDict(): 49 | """ 50 | 사용자 사전을 쉽게 사용하도록 해주는 래퍼(wrapper). 51 | 52 | 'CustomDict' . 53 | :ref:`optional-installations`. 54 | .. code-block:: python 55 | :emphasize-lines: 1 56 | >>> import bareunpy as brn 57 | >>> tagger = brn.Tagger() 58 | >>> cd = tagger.custom_dict("law") 59 | >>> # or 60 | >>> cd = brn.CustomDict("law", "localhost", 5656) 61 | >>> cd.read_cp_set_from_file("my_np_set.txt") 62 | >>> cd.copy_cp_set(set(['새단어', '코로나19', 'K방역'])) 63 | >>> cd.read_cp_caret_set_from_file('my_cp_caret.txt') 64 | >>> cd.copy_vv_set(set(['카톡하', '신박하다'])) 65 | >>> cd.copy_va_set(set(['드라마틱하', '판타스틱하'])) 66 | >>> cd.update() 67 | >>> ## copy data from server 68 | >>> cd2 = tagger.custom_dict("law") 69 | >>> custom_dict = cd2.get() 70 | >>> # cd2.save(dir="my_dir") 71 | """ 72 | 73 | def __init__(self, apikey:str, domain: str, channel: grpc.Channel): 74 | """ 75 | 사용자 사전 래퍼(wrapper)의 생성자 76 | 77 | Args: 78 | domain (str): 사용자 사전의 이름, 반드시 지정되어야 합니다. 79 | channel(grpc.Channel): 원격에 연결할 정보 80 | Raises: 81 | ValueError: 사용자 사전의 이름이 없으면 에러를 발생시킵니다. 
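
        Example:
            A minimal sketch of constructing the wrapper directly; the server address
            below is an assumption, replace it with your own bareun server.

            >>> channel = grpc.insecure_channel("nlp.bareun.ai:5656")
            >>> cd = CustomDict("koba-YOURKEY", "my_domain", channel)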
82 | """ 83 | self.domain = domain 84 | if domain is None: 85 | raise ValueError("domain name must be specified.") 86 | 87 | self.stub = CustomDictionaryServiceClient(channel, apikey) 88 | self.cp_set = set() 89 | self.np_set = set() 90 | self.cp_caret_set = set() 91 | self.vv_set = set() 92 | self.va_set = set() 93 | 94 | def read_np_set_from_file(self, fn: str): 95 | """ 96 | 고유명사 사전을 파일에서 읽어들입니다. 97 | 98 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 99 | 100 | Args: 101 | fn (str): 고유명사 파일 이름 102 | """ 103 | self.np_set = read_dic_file(fn) 104 | 105 | def read_cp_set_from_file(self, fn: str): 106 | """ 107 | 복합명사 사전을 파일에서 읽어들입니다. 108 | 109 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 110 | 111 | Args: 112 | fn (str): 복합명사 파일 이름 113 | """ 114 | self.cp_set = read_dic_file(fn) 115 | 116 | def read_cp_caret_set_from_file(self, fn: str): 117 | """ 118 | 복합명사 분리 사전을 파일에서 읽어들입니다. 119 | 120 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 121 | 122 | Args: 123 | fn (str): 복합명사 분리 사전 파일 이름 124 | """ 125 | self.cp_caret_set = read_dic_file(fn) 126 | 127 | def read_vv_set_from_file(self, fn: str): 128 | """ 129 | 동사 사전을 파일에서 읽어들입니다. 130 | 131 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 132 | 133 | Args: 134 | fn (str): 동사 사전 파일 이름 135 | """ 136 | self.vv_set = read_dic_file(fn) 137 | 138 | def read_va_set_from_file(self, fn: str): 139 | """ 140 | 형용사 사전을 파일에서 읽어들입니다. 141 | 142 | 이 파일은 한줄에 하나의 사전입니다. '#'로 시작하는 줄은 무시합니다. 143 | 144 | Args: 145 | fn (str): 형용사 사전 파일 이름 146 | """ 147 | self.va_set = read_dic_file(fn) 148 | 149 | def copy_np_set(self, dict_set: set): 150 | """ 151 | 집합을 고유명사 사전으로 지정합니다. 152 | 153 | Args: 154 | dict_set (set): 고유명사 사전 155 | """ 156 | self.np_set = dict_set 157 | 158 | def copy_cp_set(self, dict_set: set): 159 | """ 160 | 집합을 복합명사 사전으로 지정합니다. 161 | 162 | Args: 163 | dict_set (set): 복합명사 사전 164 | """ 165 | self.cp_set = dict_set 166 | 167 | def copy_cp_caret_set(self, dict_set: set): 168 | """ 169 | 집합을 복합명사 분리 사전으로 지정합니다. 170 | 171 | Args: 172 | dict_set (set): 복합명사 분리 사전 173 | """ 174 | self.cp_caret_set = dict_set 175 | 176 | def copy_vv_set(self, dict_set: set): 177 | """ 178 | 집합을 동사 사전으로 지정합니다. 179 | 180 | Args: 181 | dict_set (set): 동사 사전 182 | """ 183 | self.vv_set = dict_set 184 | 185 | def copy_va_set(self, dict_set: set): 186 | """ 187 | 집합을 형용사 사전으로 지정합니다. 188 | 189 | Args: 190 | dict_set (set): 형용사 사전 191 | """ 192 | self.va_set = dict_set 193 | 194 | def update(self) -> bool: 195 | """ 196 | 복합명사 사전을 바이칼 NLP 서버에 갱신합니다. 197 | 198 | Raises: 199 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 200 | 201 | Returns: 202 | bool: 갱신이 성공하면 참을 돌려줍니다. 203 | """ 204 | return self.stub.update(self.domain, 205 | self.np_set, 206 | self.cp_set, 207 | self.cp_caret_set, 208 | self.vv_set, 209 | self.va_set) 210 | 211 | def get(self) -> CustomDictionary: 212 | """ 213 | 사용자 사전의 내용을 가져옵니다. 214 | 가져온 결과는 현재 설정된 사전의 내용을 반영하지 않습니다. 215 | 216 | Raises: 217 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 218 | 219 | Returns: 220 | pb.CustomDictionary: 사용자 사전 데이터 전체를 담고 있는 protobuf 메시지 221 | """ 222 | return self.stub.get(self.domain) 223 | 224 | 225 | def load(self): 226 | """ 227 | 서버에 저정되어 있는 사용자 사전을 모두 가져옵니다. 228 | """ 229 | try: 230 | d = self.stub.get(self.domain) 231 | self.np_set = pb_map_to_set(d.np_set) 232 | self.cp_caret_set = pb_map_to_set(d.cp_caret_set) 233 | self.cp_set = pb_map_to_set(d.cp_set) 234 | except Exception as e: 235 | pass 236 | 237 | 238 | def clear(self) -> List[str]: 239 | """ 240 | 사용자 사전의 내용을 삭제합니다. 241 | 242 | Raises: 243 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 
244 | 245 | Returns: 246 | List[str]: 삭제한 사용자 사전의 이름 247 | """ 248 | 249 | self.np_set.clear() 250 | self.cp_set.clear() 251 | self.cp_caret_set.clear() 252 | return self.stub.remove([self.domain]) 253 | -------------------------------------------------------------------------------- /bareunpy/_custom_dict_client.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import grpc 4 | from google.protobuf.empty_pb2 import Empty 5 | 6 | import bareun.custom_dict_pb2 as pb 7 | import bareun.custom_dict_pb2_grpc as cds 8 | import bareun.dict_common_pb2 as common 9 | 10 | 11 | 12 | def build_dict_set(domain: str, name: str, dict_set: set) -> common.DictSet: 13 | """ 14 | 주어진 파라미터를 사용하여 사용자 사전의 한 표현 형태인 DictSet protobuf 메시지를 만듭니다. 15 | 16 | Args: 17 | domain (str): 사용자 사전의 이름 18 | name (str): 사용자 사전에 대한 설명 19 | dict_set (set): 사용자 사전에 들어가야 할 단어들의 잡합 20 | 21 | Returns: 22 | common.DictSet: protobuf DictSet 메시지 23 | """ 24 | ret = common.DictSet() 25 | ret.name = domain + "-" + name 26 | ret.type = common.DictType.WORD_LIST 27 | for v in dict_set: 28 | ret.items[v] = 1 29 | return ret 30 | 31 | 32 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 33 | 34 | 35 | class CustomDictionaryServiceClient: 36 | """ 37 | 커스텀 사전을 생성, 조회, 업데이트, 삭제하는 클라이언트 38 | 39 | The custom dictionary client which can create, update, list, delete your own one. 40 | """ 41 | 42 | def __init__(self, channel: grpc.Channel, apikey:str): 43 | """사용자 사전을 관리하는 클라이언트 객체 생성자 44 | 45 | Args: 46 | remote (grpc.Channel): 미리 만들어 놓은 channel 객체 47 | """ 48 | super().__init__() 49 | self.channel = channel 50 | self.apikey = apikey 51 | self.metadata=( 52 | ('api-key', self.apikey), 53 | ) 54 | 55 | self.stub = cds.CustomDictionaryServiceStub(self.channel) 56 | 57 | 58 | def get_list(self) -> List[pb.CustomDictionaryMeta]: 59 | """사전 목록을 가져옵니다. 60 | 61 | Raises: 62 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 63 | 64 | Returns: 65 | List[pb.CustomDictionaryMeta]: 사전에 대한 정보들을 목록을 배열합니다. 66 | """ 67 | req = Empty() 68 | try: 69 | res, c = self.stub.GetCustomDictionaryList.with_call( 70 | request=req, metadata=self.metadata) 71 | return res.domain_dicts 72 | except grpc.RpcError as e: 73 | raise e 74 | 75 | 76 | def get(self, domain: str) -> pb.CustomDictionary: 77 | """ 78 | 정의된 사용사 사전의 내용 전체를 가져온다. 79 | 80 | Args: 81 | domain (str): 사용자 사전이 이름 82 | 83 | Raises: 84 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 85 | 86 | Returns: 87 | pb.CustomDictionary: 사용자 사전 데이터 전체를 담고 있는 protobuf 메시지 88 | """ 89 | req = pb.GetCustomDictionaryRequest() 90 | req.domain_name = domain 91 | try: 92 | res, c = self.stub.GetCustomDictionary.with_call( 93 | request=req, metadata=self.metadata) 94 | return res.dict 95 | except grpc.RpcError as e: 96 | raise e 97 | 98 | 99 | def update(self, domain: str, np: set, cp: set, cp_caret: set, vv: set, va: set) -> bool: 100 | """ 사용자 사전을 갱신합니다. 101 | 102 | Args: 103 | domain (str): 사용자 사전의 이름 104 | np (set): 고유명사 단어 집합 105 | cp (set): 복합명사 단어 집합 106 | cp_caret (set): 복합명사 분리 단어 집합 107 | vv (set): 동사 단어 집합 108 | va (set): 형용사 단어 집합 109 | 110 | Raises: 111 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 112 | 113 | Returns: 114 | bool: 정상적으로 갱신되면 참을 돌려줍니다. 
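
        Example:
            A brief usage sketch; the domain name and word sets below are illustrative
            only, and `channel` is an existing grpc.Channel to your bareun server.

            >>> client = CustomDictionaryServiceClient(channel, "koba-YOURKEY")
            >>> client.update("my_domain", {"내고유명사"}, {"코로나19"}, {"코로나^백신"}, {"카톡하"}, {"드라마틱하"})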
115 | """ 116 | 117 | req = pb.UpdateCustomDictionaryRequest() 118 | req.domain_name = domain 119 | 120 | req.dict.domain_name = domain 121 | 122 | req.dict.np_set.CopyFrom(build_dict_set(domain, 'np-set', np)) 123 | req.dict.cp_set.CopyFrom(build_dict_set(domain, 'cp-set', cp)) 124 | req.dict.vv_set.CopyFrom(build_dict_set(domain, 'vv-set', vv)) 125 | req.dict.va_set.CopyFrom(build_dict_set(domain, 'va-set', va)) 126 | req.dict.cp_caret_set.CopyFrom( 127 | build_dict_set(domain, 'cp-caret-set', cp_caret)) 128 | 129 | try: 130 | res, c = self.stub.UpdateCustomDictionary.with_call( 131 | request=req, metadata=self.metadata) 132 | return res.updated_domain_name == domain 133 | except grpc.RpcError as e: 134 | raise e 135 | 136 | 137 | """ 138 | :return: 삭제된 도메인의 이름들 139 | """ 140 | def remove_all(self) -> List[str]: 141 | """ 142 | 모든 커스텀 사전을 삭제한 다음 삭제한 사전의 이름을 돌려줍니다. 143 | 144 | Raises: 145 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 146 | 147 | Returns: 148 | List[str]: 삭제한 사전의 이름 149 | """ 150 | req = pb.RemoveCustomDictionariesRequest() 151 | req.all = True 152 | 153 | try: 154 | res, c = self.stub.RemoveCustomDictionaries.with_call( 155 | request=req, metadata=self.metadata) 156 | return res.deleted_domain_names.keys() 157 | except grpc.RpcError as e: 158 | raise e 159 | 160 | """ 161 | 지정한 도메인의 커스텀 사전을 삭제한다. 162 | :param domains: 163 | :return: 164 | """ 165 | def remove(self, domains: List[str]) -> List[str]: 166 | """ 지정한 도메인의 사용지 사전을 삭제한 다음 삭제한 사전의 목록을 반환합니다. 167 | 168 | Args: 169 | domains (List[str]): 삭제할 커스텀 사전의 이름들 170 | 171 | Raises: 172 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 173 | 174 | Returns: 175 | List[str]: 정상 삭제된 도메인의 이름 목록을 돌려줍니다. 176 | """ 177 | req = pb.RemoveCustomDictionariesRequest() 178 | req.domain_names.extend(domains) 179 | req.all = False 180 | try: 181 | res, c = self.stub.RemoveCustomDictionaries.with_call( 182 | request=req, metadata=self.metadata) 183 | return res.deleted_domain_names.keys() 184 | except grpc.RpcError as e: 185 | raise e 186 | -------------------------------------------------------------------------------- /bareunpy/_lang_service_client.py: -------------------------------------------------------------------------------- 1 | import grpc 2 | from typing import List 3 | 4 | import bareunpy 5 | import bareun.language_service_pb2 as pb 6 | import bareun.language_service_pb2_grpc as ls 7 | import bareun.lang_common_pb2 as lpb 8 | 9 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 10 | 11 | 12 | class BareunLanguageServiceClient: 13 | """ 14 | 형태소 분석을 처리하는 클라이언트 15 | """ 16 | 17 | def __init__(self, channel:grpc.Channel, apikey:str, host:str, port:int): 18 | """ 19 | 클라이언트 생성자 20 | 21 | Args: 22 | channel (grpc.Channel): 원격 채널 정보 23 | """ 24 | self.channel = channel 25 | self.apikey = apikey 26 | self.metadata=( 27 | ('api-key', self.apikey), 28 | ('user-agent', f'bareunpy/{bareunpy.version}') 29 | ) 30 | self.host = host 31 | self.port = port 32 | self.stub = ls.LanguageServiceStub(self.channel) 33 | 34 | def _handle_grpc_error(self, e: grpc.RpcError): 35 | """gRPC 에러를 처리하는 메서드""" 36 | details = getattr(e, "details", lambda: None)() 37 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 38 | 39 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 40 | if code == grpc.StatusCode.PERMISSION_DENIED: 41 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 42 | elif code == grpc.StatusCode.UNAVAILABLE: 43 | message = f'\n서버에 연결할 수 없습니다. 
입력한 서버주소 [{self.host}:{self.port}]가 정확한지 확인해 주세요.\n서버 메시지: {server_message}' 44 | elif code == grpc.StatusCode.INVALID_ARGUMENT: 45 | message = f'\n잘못된 요청이 서버로 전송되었습니다. 입력 데이터를 확인하세요.\n서버 메시지: {server_message}' 46 | else: 47 | message = f'알 수 없는 오류가 발생했습니다.\n서버 메시지: {server_message}' 48 | raise e 49 | raise Exception(message) from e 50 | 51 | def analyze_syntax(self, content: str, 52 | custom_dicts: List[str] = [], 53 | auto_split=False, 54 | auto_spacing=True, 55 | auto_jointing=True) -> pb.AnalyzeSyntaxResponse: 56 | """ 57 | 형태소 분석을 수행합니다. 58 | 59 | Args: 60 | content (str): 형태소 분석할 원문, 여러 문장일 경우에 개행문자로 줄바꿈을 하면 됩니다. 61 | domain (str, optional): 사용사 사전의 이름. 기본값은 "". 62 | auto_split (bool, optional): 문장 자동 분리 여부, 기본값은 사용하지 않음. 63 | auto_spacing (bool, optional): 띄어쓰기 보정 기능, 기본값은 사용하도록 함. 64 | auto_jointing (bool, optional): 붙여쓰기 보정 기능, 기본값은 사용하지 않음. 65 | 66 | Raises: 67 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 68 | 69 | Returns: 70 | pb.AnalyzeSyntaxResponse: 형태소 분석 결과 71 | """ 72 | req = pb.AnalyzeSyntaxRequest() 73 | # req.document = pb.Document() 74 | req.document.content = content 75 | req.document.language = "ko_KR" 76 | req.encoding_type = lpb.EncodingType.UTF32 77 | req.auto_split_sentence = auto_split 78 | req.auto_spacing = auto_spacing 79 | req.auto_jointing = auto_jointing 80 | req.custom_dict_names.extend(custom_dicts) 81 | 82 | try: 83 | res, c = self.stub.AnalyzeSyntax.with_call( 84 | request=req, metadata=self.metadata) 85 | return res 86 | except grpc.RpcError as e: 87 | self._handle_grpc_error(e) 88 | except Exception as e2: 89 | import traceback 90 | traceback.print_exc() 91 | raise e2 92 | 93 | def analyze_syntax_list(self, content: List[str], 94 | custom_dicts: List[str] = [], 95 | auto_spacing=True, 96 | auto_jointing=True) -> pb.AnalyzeSyntaxListResponse: 97 | """ 98 | 형태소 분석을 수행하되, 입력된 문장 단위가 일치하도록 반환됩니다. 99 | 문장 분할 기능을 사용하지 않습니다. 100 | 101 | Args: 102 | content (List[str]): 형태소 분석할 원문의 리스트 103 | domain (str, optional): 사용사 사전의 이름. 기본값은 "". 104 | auto_spacing (bool, optional): 띄어쓰기 보정 기능, 기본값은 사용하도록 함. 105 | auto_jointing (bool, optional): 붙여쓰기 보정 기능, 기본값은 사용하지 않음. 106 | 107 | Raises: 108 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 109 | 110 | Returns: 111 | pb.AnalyzeSyntaxListResponse: 형태소 분석 결과 112 | """ 113 | req = pb.AnalyzeSyntaxListRequest() 114 | req.sentences.extend(content) 115 | req.language = "ko_KR" 116 | req.encoding_type = lpb.EncodingType.UTF32 117 | req.auto_spacing = auto_spacing 118 | req.auto_jointing = auto_jointing 119 | req.custom_dict_names.extend(custom_dicts) 120 | 121 | try: 122 | res, c = self.stub.AnalyzeSyntaxList.with_call( 123 | request=req, metadata=self.metadata) 124 | return res 125 | except grpc.RpcError as e: 126 | self._handle_grpc_error(e) 127 | except Exception as e2: 128 | import traceback 129 | traceback.print_exc() 130 | raise e2 131 | 132 | 133 | def tokenize(self, content: str, auto_split=False) -> pb.TokenizeResponse: 134 | """ 135 | 형태소 분석을 수행합니다. 136 | 137 | Args: 138 | content (str): 형태소 분석할 원문, 여러 문장일 경우에 개행문자로 줄바꿈을 하면 됩니다. 139 | domain (str, optional): 사용사 사전의 이름. 기본값은 "". 140 | auto_split (bool, optional): 문장 자동 분리 여부, 기본값은 사용하지 않음. 141 | 142 | Raises: 143 | e: grpc.Error, 원격 호출시 예외가 발생할 수 있습니다. 
144 | 145 | Returns: 146 | pb.AnalyzeSyntaxResponse: 형태소 분석 결과 147 | """ 148 | req = pb.TokenizeRequest() 149 | # req.document = pb.Document() 150 | req.document.content = content 151 | req.document.language = "ko_KR" 152 | req.encoding_type = lpb.EncodingType.UTF32 153 | req.auto_split_sentence = auto_split 154 | try: 155 | res, c = self.stub.Tokenize.with_call( 156 | request=req, metadata=self.metadata) 157 | return res 158 | except grpc.RpcError as e: 159 | self._handle_grpc_error(e) 160 | except Exception as e2: 161 | import traceback 162 | traceback.print_exc() 163 | raise e2 164 | -------------------------------------------------------------------------------- /bareunpy/_revision_service_client.py: -------------------------------------------------------------------------------- 1 | import grpc 2 | import bareun.revision_service_pb2 as pb 3 | import bareun.revision_service_pb2_grpc as rs_grpc 4 | 5 | MAX_MESSAGE_LENGTH = 100 * 1024 * 1024 6 | 7 | 8 | class BareunRevisionServiceClient: 9 | """ 10 | 맞춤법 검사를 처리하는 클라이언트 11 | """ 12 | 13 | def __init__(self, channel, apikey: str, host: str, port: int): 14 | """ 15 | RevisionServiceClient 초기화 16 | 17 | Args: 18 | apikey (str): API 키 19 | host (str): gRPC 서버 주소 20 | port (int): gRPC 서버 포트 21 | """ 22 | self.channel = channel 23 | self.apikey = apikey 24 | self.host = host 25 | self.port = port 26 | self.metadata = [ 27 | ('api-key', self.apikey), 28 | ('user-agent', 'bareun-revision-client'), 29 | ] 30 | 31 | 32 | self.stub = rs_grpc.RevisionServiceStub(self.channel) 33 | 34 | def _handle_grpc_error(self, e: grpc.RpcError): 35 | """gRPC 에러를 처리하는 메서드""" 36 | details = getattr(e, "details", lambda: None)() 37 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 38 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 39 | if code == grpc.StatusCode.PERMISSION_DENIED: 40 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 41 | elif code == grpc.StatusCode.UNAVAILABLE: 42 | message = f'\n서버에 연결할 수 없습니다. 입력한 서버주소 [{self.host}:{self.port}]를 확인하세요.\n서버 메시지: {server_message}' 43 | elif code == grpc.StatusCode.INVALID_ARGUMENT: 44 | message = f'\n잘못된 요청이 서버로 전송되었습니다. 입력 데이터를 확인하세요.\n서버 메시지: {server_message}' 45 | else: 46 | message = f'알 수 없는 오류가 발생했습니다.\n서버 메시지: {server_message}' 47 | raise e 48 | raise Exception(message) from e 49 | 50 | def correct_error(self, request: pb.CorrectErrorRequest) -> pb.CorrectErrorResponse: 51 | """ 52 | 맞춤법 교정을 위한 gRPC 호출 53 | 54 | Args: 55 | request (pb.CorrectErrorRequest): gRPC 요청 메시지 56 | 57 | Returns: 58 | pb.CorrectErrorResponse: gRPC 응답 메시지 59 | """ 60 | try: 61 | response, call = self.stub.CorrectError.with_call( 62 | request=request, metadata=self.metadata 63 | ) 64 | return response 65 | except grpc.RpcError as e: 66 | self._handle_grpc_error(e) 67 | -------------------------------------------------------------------------------- /bareunpy/_tagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from sys import stdout 4 | from typing import IO, List, Any, Union 5 | 6 | from google.protobuf.json_format import MessageToDict 7 | import grpc 8 | from bareunpy._custom_dict import CustomDict 9 | from bareunpy._lang_service_client import BareunLanguageServiceClient, MAX_MESSAGE_LENGTH 10 | from bareun.language_service_pb2 import AnalyzeSyntaxResponse, AnalyzeSyntaxListResponse, Morpheme, Sentence, Token 11 | 12 | 13 | class Tagged: 14 | """ 15 | Tagged result. 
16 | It has various output manipulations. 17 | """ 18 | 19 | def __init__(self, phrase: Union[str, List[str]], res: Union[AnalyzeSyntaxResponse, AnalyzeSyntaxListResponse]): 20 | """ 21 | constructor, which is used internally. 22 | :param phrase: requested sentences. 23 | :param res: 24 | """ 25 | super().__init__() 26 | self.phrase = phrase 27 | self.r = res 28 | 29 | # 빈 응답이 있는 경우를 대비해서 값이 없지 않도록 처리한다. 30 | if self.r is None: 31 | self.r = AnalyzeSyntaxResponse() 32 | self.phrase = '' 33 | 34 | def msg(self) -> Union[AnalyzeSyntaxResponse, AnalyzeSyntaxListResponse]: 35 | """ 36 | Protobuf message object containing all of NLP engine. 37 | """ 38 | return self.r 39 | 40 | def sentences(self) -> List[Sentence]: 41 | """ 42 | :return: get sentences from tagged results. 43 | """ 44 | ret = list() 45 | for s in self.r.sentences: 46 | ret.append(s) 47 | return ret 48 | 49 | def as_json(self): 50 | """ 51 | convert the message to a json object. 52 | :return: Json Obejct 53 | """ 54 | return MessageToDict(self.r, True) 55 | 56 | def as_json_str(self) -> str: 57 | """ 58 | a json string representing analyzed sentences. 59 | :return: json string 60 | """ 61 | d = MessageToDict(self.r, True) 62 | return json.dumps(d, ensure_ascii=False, indent=2) 63 | 64 | def print_as_json(self, out: IO = stdout): 65 | """ 66 | print the analysis result 67 | :param out: File, if nothing provided, sys.stdout is used. 68 | :return: None 69 | """ 70 | d = MessageToDict(self.r, True) 71 | json.dump(d, out, ensure_ascii=False, indent=2) 72 | 73 | @staticmethod 74 | def _pos(m: Morpheme, join: bool, detail: bool): 75 | if join: 76 | if detail: 77 | p = f':{m.probability:5.3f}' if m.probability > 0 else '' 78 | oov = f'#{Morpheme.OutOfVocab.Name(m.out_of_vocab)}' if m.out_of_vocab != 0 else '' 79 | return f'{m.text.content}/{Morpheme.Tag.Name(m.tag)}{p}{oov}' 80 | else: 81 | return f'{m.text.content}/{Morpheme.Tag.Name(m.tag)}' 82 | else: 83 | if detail: 84 | return m.text.content,\ 85 | Morpheme.Tag.Name(m.tag),\ 86 | Morpheme.OutOfVocab.Name(m.out_of_vocab),\ 87 | m.probability 88 | else: 89 | return m.text.content, Morpheme.Tag.Name(m.tag) 90 | 91 | def pos(self, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 92 | """ 93 | POS tagger to tuple. 94 | :param flatten : If False, returns original morphs. 95 | :param join : If True, returns joined sets of morph and tag. 96 | :param detail : if True, returns everything of morph result 97 | """ 98 | if flatten: 99 | return [Tagged._pos(m, join, detail) for s in self.r.sentences 100 | for token in s.tokens 101 | for m in token.morphemes] 102 | else: 103 | return [[Tagged._pos(m, join, detail) for m in token.morphemes] 104 | for s in self.r.sentences 105 | for token in s.tokens] 106 | 107 | def morphs(self) -> List: 108 | """Parse phrase to morphemes.""" 109 | return [m.text.content for s in self.r.sentences 110 | for token in s.tokens 111 | for m in token.morphemes] 112 | 113 | def nouns(self) -> List: 114 | """Noun extractor.""" 115 | return [m.text.content for s in self.r.sentences 116 | for token in s.tokens 117 | for m in token.morphemes 118 | if m.tag in {Morpheme.Tag.NNP, Morpheme.Tag.NNG, Morpheme.Tag.NP, Morpheme.Tag.NNB}] 119 | 120 | def verbs(self) -> List: 121 | """Noun extractor.""" 122 | return [m.text.content for s in self.r.sentences 123 | for token in s.tokens 124 | for m in token.morphemes 125 | if m.tag in {Morpheme.Tag.VV}] 126 | 127 | 128 | class Tagger: 129 | """Wrapper for `bareun v1.7.x `_. 
130 | 'bareun' is a morphological analyzer developed by Baikal AI, Inc. and Korea Press Foundation. 131 | 132 | .. code-block:: python 133 | :emphasize-lines: 1 134 | >>> import bareunpy as brn 135 | >>> tagger = brn.Tagger(apikey="kpba-YOURKEY", custom_dicts=["custom", "my"]) 136 | >>> print(tagger.morphs('안녕하세요, 반가워요.')) 137 | ['안녕', '하', '시', '어요', ',', '반갑', '어요', '.'] 138 | >>> print(tagger.nouns('나비 허리에 새파란 초생달이 시리다.')) 139 | ['나비', '허리', '초생달'] 140 | >>> print(tagger.pos('햇빛이 선명하게 나뭇잎을 핥고 있었다.')) 141 | [('햇빛', 'NNG'), ('이', 'JKS'), ('선명', 'NNG'), ('하', 'XSA'), ('게', 'EC'), ('나뭇잎', 'NNG'), 142 | ('을', 'JKO'), ('핥', 'VV'), ('고', 'EC'), ('있', 'VX'), ('었', 'EP'), ('다', 'EF'), ('.', 'SF')] 143 | :param host : str. host name for bareun server 144 | :param port : int. port for bareun server 145 | :param custom_dicts : List[str]. custom dictionary names for analyzing request 146 | """ 147 | 148 | def __init__(self, apikey:str, host: str = "", port: int = 5656, custom_dicts: List[str] = []): 149 | 150 | if host: 151 | host = host.strip() 152 | 153 | if host == "" or host is None: 154 | self.host = 'nlp.bareun.ai' 155 | else: 156 | self.host = host 157 | 158 | if port is not None: 159 | self.port = port 160 | else: 161 | self.port = 5656 162 | 163 | self.channel = grpc.insecure_channel( 164 | f"{self.host}:{self.port}", 165 | options=[ 166 | ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 167 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), 168 | ]) 169 | self.apikey = apikey 170 | 171 | if apikey == None or len(apikey) == 0: 172 | raise ValueError("an apikey must be provided!") 173 | 174 | self.client = BareunLanguageServiceClient(self.channel, apikey, self.host, self.port) 175 | 176 | self.custom_dicts = custom_dicts 177 | self.internal_custom_dicts = {} 178 | 179 | def _handle_grpc_error(self, e: grpc.RpcError): 180 | """gRPC 에러를 처리하는 메서드""" 181 | details = getattr(e, "details", lambda: None)() 182 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 183 | 184 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 185 | if code == grpc.StatusCode.PERMISSION_DENIED: 186 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 187 | elif code == grpc.StatusCode.UNAVAILABLE: 188 | message = f'\n서버에 연결할 수 없습니다. 입력한 서버주소 [{self.host}:{self.port}]가 정확한지 확인해 주세요.\n서버 메시지: {server_message}' 189 | else: 190 | raise e 191 | raise Exception(message) from e 192 | 193 | @DeprecationWarning 194 | def set_domain(self, domain: str): 195 | """ 196 | Set domain of custom dict. 197 | :param domain: domain name of custom dict 198 | """ 199 | if len(self.custom_dicts) == 0: 200 | self.custom_dicts = [] 201 | self.custom_dicts.append(domain) 202 | 203 | def set_custom_dicts(self, custom_dicts: List[str]): 204 | """ 205 | Set domain of custom dict. 
206 | :param domain: domain name of custom dict 207 | """ 208 | if len(custom_dicts) > 0: 209 | self.custom_dicts = custom_dicts 210 | else: 211 | self.custom_dicts = [] 212 | 213 | def custom_dict(self, name: str) -> CustomDict: 214 | # self.domain = domain 215 | if name == "" or name is None: 216 | raise ValueError("invalid name for custom dict") 217 | 218 | if name in self.internal_custom_dicts: 219 | return self.internal_custom_dicts[name] 220 | else: 221 | self.internal_custom_dicts[name] = CustomDict(self.apikey, name, self.channel) 222 | return self.internal_custom_dicts[name] 223 | 224 | def tag(self, phrase: str, auto_split: bool = False, auto_spacing: bool = True, auto_jointing: bool = True) -> Tagged: 225 | if len(phrase) == 0: 226 | print("OOPS, no sentences.") 227 | return Tagged('', AnalyzeSyntaxResponse()) 228 | try: 229 | res = self.client.analyze_syntax(phrase, self.custom_dicts, auto_split=auto_split, auto_spacing=auto_spacing, auto_jointing=auto_jointing) 230 | return Tagged(phrase, res) 231 | except grpc.RpcError as e: 232 | self._handle_grpc_error(e) 233 | 234 | def tags(self, phrase: List[str], auto_split: bool = False, auto_spacing: bool = True, auto_jointing: bool = True) -> Tagged: 235 | """ 236 | tag string array. 237 | :param phrase: array of string 238 | :param auto_split(bool, optional): Whether to automatically perform sentence split 239 | :param auto_spacing(bool, optional): Whether to automatically perform space insertion for typo correction 240 | :param auto_jointing(bool, optional): Whether to automatically perform word joining for typo correction 241 | :return: Tagged result instance 242 | """ 243 | if len(phrase) == 0: 244 | print("OOPS, no sentences.") 245 | return Tagged('', AnalyzeSyntaxResponse()) 246 | p = '\n'.join(phrase) 247 | try: 248 | res = self.client.analyze_syntax(p, self.custom_dicts, auto_split=auto_split, auto_spacing=auto_spacing, auto_jointing=auto_jointing) 249 | return Tagged(p, res) 250 | except grpc.RpcError as e: 251 | self._handle_grpc_error(e) 252 | 253 | def taglist(self, phrase: List[str], auto_spacing: bool = True, auto_jointing: bool = True) -> Tagged: 254 | """ 255 | the array is not being split and the input value is being returned as-is. 256 | :param phrase: array of string 257 | :param auto_split(bool, optional): Whether to automatically perform sentence split 258 | :param auto_spacing(bool, optional): Whether to automatically perform space insertion for typo correction 259 | :param auto_jointing(bool, optional): Whether to automatically perform word joining for typo correction 260 | :return: Tagged result instance 261 | """ 262 | if len(phrase) == 0: 263 | print("OOPS, no sentences.") 264 | return Tagged('', AnalyzeSyntaxListResponse()) 265 | try: 266 | res = self.client.analyze_syntax_list(phrase, self.custom_dicts, auto_spacing=auto_spacing, auto_jointing=auto_jointing) 267 | return Tagged(phrase, res) 268 | except grpc.RpcError as e: 269 | self._handle_grpc_error(e) 270 | 271 | def pos(self, phrase: str, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 272 | """ 273 | POS tagger. 274 | :param phrase : string to analyse 275 | :param flatten : If False, returns original morphs. 276 | :param join : If True, returns joined sets of morph and tag. 
277 | :param detail : if True, returns every things of morph result 278 | """ 279 | return self.tag(phrase).pos(flatten, join, detail) 280 | 281 | def morphs(self, phrase: str) -> List: 282 | """Parse phrase to morphemes.""" 283 | return self.tag(phrase).morphs() 284 | 285 | def nouns(self, phrase: str) -> List: 286 | """Noun extractor.""" 287 | return self.tag(phrase).nouns() 288 | 289 | def verbs(self, phrase: str) -> List: 290 | """Verbs extractor.""" 291 | return self.tag(phrase).verbs() 292 | -------------------------------------------------------------------------------- /bareunpy/_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from sys import stdout 4 | from typing import IO, List, Any 5 | 6 | from google.protobuf.json_format import MessageToDict 7 | import grpc 8 | from bareunpy._lang_service_client import BareunLanguageServiceClient, MAX_MESSAGE_LENGTH 9 | from bareun.language_service_pb2 import TokenizeResponse, Segment, SegmentSentence, SegmentToken 10 | 11 | 12 | class Tokenized: 13 | """ 14 | Tokenized result. 15 | It has various output manipulations. 16 | """ 17 | 18 | def __init__(self, phrase: str, res: TokenizeResponse): 19 | """ 20 | constructor, which is used internally. 21 | :param phrase: requested sentences. 22 | :param res: 23 | """ 24 | super().__init__() 25 | self.phrase = phrase 26 | self.r = res 27 | 28 | # 빈 응답이 있는 경우를 대비해서 값이 없지 않도록 처리한다. 29 | if self.r is None: 30 | self.r = TokenizeResponse() 31 | self.phrase = '' 32 | 33 | def msg(self) -> TokenizeResponse: 34 | """ 35 | Protobuf message object containing all of NLP engine. 36 | """ 37 | return self.r 38 | 39 | def sentences(self) -> List[SegmentSentence]: 40 | """ 41 | :return: get sentences from tagged results. 42 | """ 43 | ret = list() 44 | for s in self.r.sentences: 45 | ret.append(s) 46 | return ret 47 | 48 | def as_json(self): 49 | """ 50 | convert the message to a json object. 51 | :return: Json Obejct 52 | """ 53 | return MessageToDict(self.r, True) 54 | 55 | def as_json_str(self) -> str: 56 | """ 57 | a json string representing analyzed sentences. 58 | :return: json string 59 | """ 60 | d = MessageToDict(self.r, True) 61 | return json.dumps(d, ensure_ascii=False, indent=2) 62 | 63 | def print_as_json(self, out: IO = stdout): 64 | """ 65 | print the analysis result 66 | :param out: File, if nothing provided, sys.stdout is used. 67 | :return: None 68 | """ 69 | d = MessageToDict(self.r, True) 70 | json.dump(d, out, ensure_ascii=False, indent=2) 71 | 72 | @staticmethod 73 | def _segment(m: Segment, join: bool, detail: bool): 74 | if join: 75 | if detail: 76 | return f'{m.text.content}/{m.hint}' 77 | else: 78 | return f'{m.text.content}' 79 | else: 80 | if detail: 81 | return m.text.content, m.hint 82 | else: 83 | return m.text.content 84 | 85 | def seg(self, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 86 | """ 87 | 분절의 결과를 튜플 형태로 반환한다. 88 | :param flatten : If False, returns original morphs. 89 | :param join : If True, returns joined sets of morph and tag. 
90 | :param detail : if True, returns everything of morph result 91 | """ 92 | if flatten: 93 | return [Tokenized._segment(m, join, detail) for s in self.r.sentences 94 | for token in s.tokens 95 | for m in token.segments] 96 | else: 97 | return [[Tokenized._segment(m, join, detail) for m in token.segments] 98 | for s in self.r.sentences 99 | for token in s.tokens] 100 | 101 | def segments(self) -> List: 102 | """문장의 모든 segment들을 반환한다. """ 103 | return [m.text.content for s in self.r.sentences 104 | for token in s.tokens 105 | for m in token.segments] 106 | 107 | def nouns(self) -> List: 108 | """체언을 추출한다.""" 109 | return [m.text.content for s in self.r.sentences 110 | for token in s.tokens 111 | for m in token.segments 112 | if m.hint == 'N'] 113 | 114 | def verbs(self) -> List: 115 | """동사 또는 형용사, 즉, 용언을 추출한다.""" 116 | return [m.text.content for s in self.r.sentences 117 | for token in s.tokens 118 | for m in token.segments 119 | if m.hint == 'V'] 120 | 121 | def predicates(self) -> List: 122 | """용언을 추출한다.""" 123 | return [m.text.content for s in self.r.sentences 124 | for token in s.tokens 125 | for m in token.segments 126 | if m.hint == 'V'] 127 | 128 | def substantives(self) -> List: 129 | """체언을 추출한다.""" 130 | return [m.text.content for s in self.r.sentences 131 | for token in s.tokens 132 | for m in token.segments 133 | if m.hint == 'N'] 134 | 135 | def symbols(self) -> List: 136 | """기호를 추출한다.""" 137 | return [m.text.content for s in self.r.sentences 138 | for token in s.tokens 139 | for m in token.segments 140 | if m.hint == 'S'] 141 | 142 | def adverbs(self) -> List: 143 | """부사를 추출한다..""" 144 | return [m.text.content for s in self.r.sentences 145 | for token in s.tokens 146 | for m in token.segments 147 | if m.hint == 'A'] 148 | 149 | def prenouns(self) -> List: 150 | """관형사를 추출한다.""" 151 | return [m.text.content for s in self.r.sentences 152 | for token in s.tokens 153 | for m in token.segments 154 | if m.hint == 'M'] 155 | 156 | def postpositions(self) -> List: 157 | """감탄사를 추출한다.""" 158 | return [m.text.content for s in self.r.sentences 159 | for token in s.tokens 160 | for m in token.segments 161 | if m.hint == 'J'] 162 | 163 | def interjections(self) -> List: 164 | """감탄사를 추출한다.""" 165 | return [m.text.content for s in self.r.sentences 166 | for token in s.tokens 167 | for m in token.segments 168 | if m.hint == 'I'] 169 | 170 | def endings(self) -> List: 171 | """어미를 반환한다.""" 172 | return [m.text.content for s in self.r.sentences 173 | for token in s.tokens 174 | for m in token.segments 175 | if m.hint == 'E'] 176 | 177 | class Tokenizer: 178 | """Wrapper for `bareun v1.7.x `_. 179 | 'bareun' is a morphological analyzer developed by Baikal AI, Inc. and Korea Press Foundation. 180 | 181 | .. code-block:: python 182 | :emphasize-lines: 1 183 | >>> import bareunpy as brn 184 | >>> tokenizer = brn.Tokenizer() 185 | >>> print(tokenizer.segments('안녕하세요, 반가워요.')) 186 | ['안녕', '하', '시', '어요', ',', '반갑', '어요', '.'] 187 | >>> print(tokenizer.nouns('나비 허리에 새파란 초생달이 시리다.')) 188 | ['나비', '허리', '초생달'] 189 | >>> print(tokenizer.seg('햇빛이 선명하게 나뭇잎을 핥고 있었다.')) 190 | [('햇빛', 'NNG'), ('이', 'JKS'), ('선명', 'NNG'), ('하', 'XSA'), ('게', 'EC'), ('나뭇잎', 'NNG'), 191 | ('을', 'JKO'), ('핥', 'VV'), ('고', 'EC'), ('있', 'VX'), ('었', 'EP'), ('다', 'EF'), ('.', 'SF')] 192 | :param host : str. host name for bareun server 193 | :param port : int. 
port for bareun server 194 | """ 195 | 196 | def __init__(self, apikey:str, host: str = "", port: int = 5656): 197 | 198 | if host: 199 | host = host.strip() 200 | if host == "" or host is None: 201 | self.host = 'nlp.bareun.ai' 202 | else: 203 | self.host = host 204 | 205 | if port is not None: 206 | self.port = port 207 | else: 208 | self.port = 5656 209 | 210 | if apikey == None or len(apikey) == 0: 211 | raise ValueError("a apikey must be provided!") 212 | self.apikey = apikey 213 | self.channel = grpc.insecure_channel( 214 | f"{self.host}:{self.port}", 215 | options=[ 216 | ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH), 217 | ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH), 218 | ]) 219 | self.client = BareunLanguageServiceClient(self.channel, apikey, host, port) 220 | 221 | def _handle_grpc_error(self, e: grpc.RpcError): 222 | """gRPC 에러를 처리하는 메서드""" 223 | details = getattr(e, "details", lambda: None)() 224 | code = getattr(e, "code", lambda: grpc.StatusCode.OK)() 225 | server_message = details if details else "서버에서 추가 메시지를 제공하지 않았습니다." 226 | if code == grpc.StatusCode.PERMISSION_DENIED: 227 | message = f'\n입력한 API KEY가 정확한지 확인해 주세요.\n > APIKEY: {self.apikey}\n서버 메시지: {server_message}' 228 | elif code == grpc.StatusCode.UNAVAILABLE: 229 | message = f'\n서버에 연결할 수 없습니다. 입력한 서버주소 [{self.host}:{self.port}]가 정확한지 확인해 주세요.\n서버 메시지: {server_message}' 230 | else: 231 | raise e 232 | raise Exception(message) from e 233 | 234 | def tokenize(self, phrase: str, auto_split: bool = False) -> Tokenized: 235 | if len(phrase) == 0: 236 | print("OOPS, no sentences.") 237 | return Tokenized('', TokenizeResponse()) 238 | try: 239 | res = Tokenized(phrase, 240 | self.client.tokenize(phrase, auto_split)) 241 | return res 242 | except grpc.RpcError as e: 243 | self._handle_grpc_error(e) 244 | 245 | def tokenize_list(self, phrase: List[str]) -> Tokenized: 246 | """ 247 | tag string array. 248 | :param phrase: array of string 249 | :return: Tagged result instance 250 | """ 251 | if len(phrase) == 0: 252 | print("OOPS, no sentences.") 253 | return Tokenized('', TokenizeResponse()) 254 | p = '\n'.join(phrase) 255 | try: 256 | res = Tokenized(p, 257 | self.client.tokenize(p, auto_split=False)) 258 | return res 259 | except grpc.RpcError as e: 260 | self._handle_grpc_error(e) 261 | 262 | def seg(self, phrase: str, flatten: bool = True, join: bool = False, detail: bool = False) -> List: 263 | """ 264 | 분절 하기, 265 | :param phrase : string to analyse 266 | :param flatten : If False, returns original morphs. 267 | :param join : If True, returns joined sets of morph and tag. 268 | :param detail : if True, returns every things of morph result 269 | """ 270 | return self.tokenize(phrase).seg(flatten, join, detail) 271 | 272 | def segments(self, phrase: str) -> List: 273 | """문장을 분절하여 어절 내부의 기본 단위로 만들어 낸다.""" 274 | return self.tokenize(phrase).segments() 275 | 276 | def nouns(self, phrase: str) -> List: 277 | """문장을 분절하여 어절 내부의 기본 단위로 만들어 내고 체언을 뽑아낸다.""" 278 | return self.tokenize(phrase).nouns() 279 | 280 | def verbs(self, phrase: str) -> List: 281 | """문장을 분절하여 어절 내부의 기본 단위로 만들어 내고 용언을 뽑아낸다.""" 282 | return self.tokenize(phrase).verbs() 283 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "atomicwrites" 3 | version = "1.4.0" 4 | description = "Atomic file writes." 
5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 8 | 9 | [[package]] 10 | name = "attrs" 11 | version = "20.3.0" 12 | description = "Classes Without Boilerplate" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 16 | 17 | [package.extras] 18 | dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"] 19 | docs = ["furo", "sphinx", "zope.interface"] 20 | tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] 21 | tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"] 22 | 23 | [[package]] 24 | name = "colorama" 25 | version = "0.4.4" 26 | description = "Cross-platform colored terminal text." 27 | category = "dev" 28 | optional = false 29 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 30 | 31 | [[package]] 32 | name = "grpcio" 33 | version = "1.35.0" 34 | description = "HTTP/2-based RPC framework" 35 | category = "main" 36 | optional = false 37 | python-versions = "*" 38 | 39 | [package.dependencies] 40 | six = ">=1.5.2" 41 | 42 | [package.extras] 43 | protobuf = ["grpcio-tools (>=1.35.0)"] 44 | 45 | [[package]] 46 | name = "importlib-metadata" 47 | version = "3.4.0" 48 | description = "Read metadata from Python packages" 49 | category = "dev" 50 | optional = false 51 | python-versions = ">=3.6" 52 | 53 | [package.dependencies] 54 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 55 | zipp = ">=0.5" 56 | 57 | [package.extras] 58 | docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] 59 | testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] 60 | 61 | [[package]] 62 | name = "iniconfig" 63 | version = "1.1.1" 64 | description = "iniconfig: brain-dead simple config-ini parsing" 65 | category = "dev" 66 | optional = false 67 | python-versions = "*" 68 | 69 | [[package]] 70 | name = "packaging" 71 | version = "20.9" 72 | description = "Core utilities for Python packages" 73 | category = "dev" 74 | optional = false 75 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 76 | 77 | [package.dependencies] 78 | pyparsing = ">=2.0.2" 79 | 80 | [[package]] 81 | name = "pluggy" 82 | version = "0.13.1" 83 | description = "plugin and hook calling mechanisms for python" 84 | category = "dev" 85 | optional = false 86 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 87 | 88 | [package.dependencies] 89 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 90 | 91 | [package.extras] 92 | dev = ["pre-commit", "tox"] 93 | 94 | [[package]] 95 | name = "protobuf" 96 | version = "3.14.0" 97 | description = "Protocol Buffers" 98 | category = "main" 99 | optional = false 100 | python-versions = "*" 101 | 102 | [package.dependencies] 103 | six = ">=1.9" 104 | 105 | [[package]] 106 | name = "py" 107 | version = "1.10.0" 108 | description = "library with cross-python path, ini-parsing, io, code, log facilities" 109 | category = "dev" 110 | optional = false 111 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 112 | 113 | [[package]] 114 | name = "pyparsing" 115 | version = "2.4.7" 116 | description = "Python 
parsing module" 117 | category = "dev" 118 | optional = false 119 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 120 | 121 | [[package]] 122 | name = "pytest" 123 | version = "6.2.2" 124 | description = "pytest: simple powerful testing with Python" 125 | category = "dev" 126 | optional = false 127 | python-versions = ">=3.6" 128 | 129 | [package.dependencies] 130 | atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} 131 | attrs = ">=19.2.0" 132 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 133 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 134 | iniconfig = "*" 135 | packaging = "*" 136 | pluggy = ">=0.12,<1.0.0a1" 137 | py = ">=1.8.2" 138 | toml = "*" 139 | 140 | [package.extras] 141 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] 142 | 143 | [[package]] 144 | name = "six" 145 | version = "1.15.0" 146 | description = "Python 2 and 3 compatibility utilities" 147 | category = "main" 148 | optional = false 149 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 150 | 151 | [[package]] 152 | name = "toml" 153 | version = "0.10.2" 154 | description = "Python Library for Tom's Obvious, Minimal Language" 155 | category = "dev" 156 | optional = false 157 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 158 | 159 | [[package]] 160 | name = "typing-extensions" 161 | version = "3.7.4.3" 162 | description = "Backported and Experimental Type Hints for Python 3.5+" 163 | category = "dev" 164 | optional = false 165 | python-versions = "*" 166 | 167 | [[package]] 168 | name = "zipp" 169 | version = "3.4.0" 170 | description = "Backport of pathlib-compatible object wrapper for zip files" 171 | category = "dev" 172 | optional = false 173 | python-versions = ">=3.6" 174 | 175 | [package.extras] 176 | docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] 177 | testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "jaraco.test (>=3.2.0)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] 178 | 179 | [metadata] 180 | lock-version = "1.1" 181 | python-versions = "^3.6" 182 | content-hash = "2189ca77911e79e25a7de2f41b8625e10e836021cbc76cd1431d6da6e3e65339" 183 | 184 | [metadata.files] 185 | atomicwrites = [ 186 | {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, 187 | {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, 188 | ] 189 | attrs = [ 190 | {file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"}, 191 | {file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"}, 192 | ] 193 | colorama = [ 194 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 195 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 196 | ] 197 | grpcio = [ 198 | {file = "grpcio-1.35.0-cp27-cp27m-macosx_10_10_x86_64.whl", hash = "sha256:95cc4d2067deced18dc807442cf8062a93389a86abf8d40741120054389d3f29"}, 199 | {file = "grpcio-1.35.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:d186a0ce291f4386e28a7042ec31c85250b0c2e25d2794b87fa3c15ff473c46c"}, 200 | {file = 
"grpcio-1.35.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:c8d0a6a58a42275c6cb616e7cb9f9fcf5eba1e809996546e561cd818b8f7cff7"}, 201 | {file = "grpcio-1.35.0-cp27-cp27m-win32.whl", hash = "sha256:8d08f90d72a8e8d9af087476337da76d26749617b0a092caff4e684ce267af21"}, 202 | {file = "grpcio-1.35.0-cp27-cp27m-win_amd64.whl", hash = "sha256:0072ec4563ab4268c4c32e936955085c2d41ea175b662363496daedd2273372c"}, 203 | {file = "grpcio-1.35.0-cp27-cp27mu-linux_armv7l.whl", hash = "sha256:aca45d2ccb693c9227fbf21144891422a42dc4b76b52af8dd1d4e43afebe321d"}, 204 | {file = "grpcio-1.35.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:87147b1b306c88fe7dca7e3dff8aefd1e63d6aed86e224f9374ddf283f17d7f1"}, 205 | {file = "grpcio-1.35.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:22edfc278070d54f3ab7f741904e09155a272fe934e842babbf84476868a50de"}, 206 | {file = "grpcio-1.35.0-cp35-cp35m-linux_armv7l.whl", hash = "sha256:f3654a52f72ba28953dbe2e93208099f4903f4b3c07dc7ff4db671c92968111d"}, 207 | {file = "grpcio-1.35.0-cp35-cp35m-macosx_10_10_intel.whl", hash = "sha256:dc2589370ef84eb1cc53530070d658a7011d2ee65f18806581809c11cd016136"}, 208 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:f0c27fd16582a303e5baf6cffd9345c9ac5f855d69a51232664a0b888a77ba80"}, 209 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:b2985f73611b637271b00d9c4f177e65cc3193269bc9760f16262b1a12757265"}, 210 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2014_i686.whl", hash = "sha256:acb489b7aafdcf960f1a0000a1f22b45e5b6ccdf8dba48f97617d627f4133195"}, 211 | {file = "grpcio-1.35.0-cp35-cp35m-manylinux2014_x86_64.whl", hash = "sha256:16fd33030944672e49e0530dec2c60cd4089659ccdf327e99569b3b29246a0b6"}, 212 | {file = "grpcio-1.35.0-cp35-cp35m-win32.whl", hash = "sha256:1757e81c09132851e85495b802fe4d4fbef3547e77fa422a62fb4f7d51785be0"}, 213 | {file = "grpcio-1.35.0-cp35-cp35m-win_amd64.whl", hash = "sha256:35b72884e09cbc46c564091f4545a39fa66d132c5676d1a6e827517fff47f2c1"}, 214 | {file = "grpcio-1.35.0-cp36-cp36m-linux_armv7l.whl", hash = "sha256:17940a7dc461066f28816df48be44f24d3b9f150db344308ee2aeae033e1af0b"}, 215 | {file = "grpcio-1.35.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:75ea903edc42a8c6ec61dbc5f453febd79d8bdec0e1bad6df7088c34282e8c42"}, 216 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:b180a3ec4a5d6f96d3840c83e5f8ab49afac9fa942921e361b451d7a024efb00"}, 217 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:e163c27d2062cd3eb07057f23f8d1330925beaba16802312b51b4bad33d74098"}, 218 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:764b50ba1a15a2074cdd1a841238f2dead0a06529c495a46821fae84cb9c7342"}, 219 | {file = "grpcio-1.35.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:088c8bea0f6b596937fefacf2c8df97712e7a3dd49496975049cc95dbf02af1a"}, 220 | {file = "grpcio-1.35.0-cp36-cp36m-win32.whl", hash = "sha256:1aa53f82362c7f2791fe0cdd9a3b3aec325c11d8f0dfde600f91907dfaa8546b"}, 221 | {file = "grpcio-1.35.0-cp36-cp36m-win_amd64.whl", hash = "sha256:efb3d67405eb8030db6f27920b4be023fabfb5d4e09c34deab094a7c473a5472"}, 222 | {file = "grpcio-1.35.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:44aaa6148d18a8e836f99dadcdec17b27bc7ec0995b2cc12c94e61826040ec90"}, 223 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:18ad7644e23757420ea839ac476ef861e4f4841c8566269b7c91c100ca1943b3"}, 224 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = 
"sha256:859a0ceb23d7189362cc06fe7e906e9ed5c7a8f3ac960cc04ce13fe5847d0b62"}, 225 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:3e7d4428ed752fdfe2dddf2a404c93d3a2f62bf4b9109c0c10a850c698948891"}, 226 | {file = "grpcio-1.35.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:a36151c335280b09afd5123f3b25085027ae2b10682087a4342fb6f635b928fb"}, 227 | {file = "grpcio-1.35.0-cp37-cp37m-win32.whl", hash = "sha256:dfecb2acd3acb8bb50e9aa31472c6e57171d97c1098ee67cd283a6fe7d56a926"}, 228 | {file = "grpcio-1.35.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e87e55fba98ebd7b4c614dcef9940dc2a7e057ad8bba5f91554934d47319a35b"}, 229 | {file = "grpcio-1.35.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:da44bf613eed5d9e8df0785463e502a416de1be6e4ac31edbe99c9111abaed5f"}, 230 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:9e503eaf853199804a954dc628c5207e67d6c7848dcba42a997fbe718618a2b1"}, 231 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:6ba3d7acf70acde9ce27e22921db921b84a71be578b32739536c32377b65041a"}, 232 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:048c01d1eb5c2ae7cba2254b98938d2fc81f6dc10d172d9261d65266adb0fdb3"}, 233 | {file = "grpcio-1.35.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:efd896e8ca7adb2654cf014479a5e1f74e4f776b6b2c0fbf95a6c92787a6631a"}, 234 | {file = "grpcio-1.35.0-cp38-cp38-win32.whl", hash = "sha256:8a29a26b9f39701ce15aa1d5aa5e96e0b5f7028efe94f95341a4ed8dbe4bed78"}, 235 | {file = "grpcio-1.35.0-cp38-cp38-win_amd64.whl", hash = "sha256:aea3d592a7ece84739b92d212cd16037c51d84a259414f64b51c14e946611f3d"}, 236 | {file = "grpcio-1.35.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2f8e8d35d4799aa1627a212dbe8546594abf4064056415c31bd1b3b8f2a62027"}, 237 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:9f0da13b215068e7434b161a35d0b4e92140ffcfa33ddda9c458199ea1d7ce45"}, 238 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:7ae408780b79c9b9b91a2592abd1d7abecd05675d988ea75038580f420966b59"}, 239 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:0f714e261e1d63615476cda4ee808a79cca62f8f09e2943c136c2f87ec5347b1"}, 240 | {file = "grpcio-1.35.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:7ee7d54da9d176d3c9a0f47c04d7ff6fdc6ee1c17643caff8c33d6c8a70678a4"}, 241 | {file = "grpcio-1.35.0-cp39-cp39-win32.whl", hash = "sha256:94c3b81089a86d3c5877d22b07ebc66b5ed1d84771e24b001844e29a5b6178dd"}, 242 | {file = "grpcio-1.35.0-cp39-cp39-win_amd64.whl", hash = "sha256:399ee377b312ac652b07ef4365bbbba009da361fa7708c4d3d4ce383a1534ea7"}, 243 | {file = "grpcio-1.35.0.tar.gz", hash = "sha256:7bd0ebbb14dde78bf66a1162efd29d3393e4e943952e2f339757aa48a184645c"}, 244 | ] 245 | importlib-metadata = [ 246 | {file = "importlib_metadata-3.4.0-py3-none-any.whl", hash = "sha256:ace61d5fc652dc280e7b6b4ff732a9c2d40db2c0f92bc6cb74e07b73d53a1771"}, 247 | {file = "importlib_metadata-3.4.0.tar.gz", hash = "sha256:fa5daa4477a7414ae34e95942e4dd07f62adf589143c875c133c1e53c4eff38d"}, 248 | ] 249 | iniconfig = [ 250 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 251 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 252 | ] 253 | packaging = [ 254 | {file = "packaging-20.9-py2.py3-none-any.whl", hash = 
"sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, 255 | {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, 256 | ] 257 | pluggy = [ 258 | {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, 259 | {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, 260 | ] 261 | protobuf = [ 262 | {file = "protobuf-3.14.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:629b03fd3caae7f815b0c66b41273f6b1900a579e2ccb41ef4493a4f5fb84f3a"}, 263 | {file = "protobuf-3.14.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:5b7a637212cc9b2bcf85dd828b1178d19efdf74dbfe1ddf8cd1b8e01fdaaa7f5"}, 264 | {file = "protobuf-3.14.0-cp35-cp35m-macosx_10_9_intel.whl", hash = "sha256:43b554b9e73a07ba84ed6cf25db0ff88b1e06be610b37656e292e3cbb5437472"}, 265 | {file = "protobuf-3.14.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:5e9806a43232a1fa0c9cf5da8dc06f6910d53e4390be1fa06f06454d888a9142"}, 266 | {file = "protobuf-3.14.0-cp35-cp35m-win32.whl", hash = "sha256:1c51fda1bbc9634246e7be6016d860be01747354ed7015ebe38acf4452f470d2"}, 267 | {file = "protobuf-3.14.0-cp35-cp35m-win_amd64.whl", hash = "sha256:4b74301b30513b1a7494d3055d95c714b560fbb630d8fb9956b6f27992c9f980"}, 268 | {file = "protobuf-3.14.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:86a75477addde4918e9a1904e5c6af8d7b691f2a3f65587d73b16100fbe4c3b2"}, 269 | {file = "protobuf-3.14.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ecc33531a213eee22ad60e0e2aaea6c8ba0021f0cce35dbf0ab03dee6e2a23a1"}, 270 | {file = "protobuf-3.14.0-cp36-cp36m-win32.whl", hash = "sha256:72230ed56f026dd664c21d73c5db73ebba50d924d7ba6b7c0d81a121e390406e"}, 271 | {file = "protobuf-3.14.0-cp36-cp36m-win_amd64.whl", hash = "sha256:0fc96785262042e4863b3f3b5c429d4636f10d90061e1840fce1baaf59b1a836"}, 272 | {file = "protobuf-3.14.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4e75105c9dfe13719b7293f75bd53033108f4ba03d44e71db0ec2a0e8401eafd"}, 273 | {file = "protobuf-3.14.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:2a7e2fe101a7ace75e9327b9c946d247749e564a267b0515cf41dfe450b69bac"}, 274 | {file = "protobuf-3.14.0-cp37-cp37m-win32.whl", hash = "sha256:b0d5d35faeb07e22a1ddf8dce620860c8fe145426c02d1a0ae2688c6e8ede36d"}, 275 | {file = "protobuf-3.14.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8971c421dbd7aad930c9bd2694122f332350b6ccb5202a8b7b06f3f1a5c41ed5"}, 276 | {file = "protobuf-3.14.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9616f0b65a30851e62f1713336c931fcd32c057202b7ff2cfbfca0fc7d5e3043"}, 277 | {file = "protobuf-3.14.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:22bcd2e284b3b1d969c12e84dc9b9a71701ec82d8ce975fdda19712e1cfd4e00"}, 278 | {file = "protobuf-3.14.0-py2.py3-none-any.whl", hash = "sha256:0e247612fadda953047f53301a7b0407cb0c3cb4ae25a6fde661597a04039b3c"}, 279 | {file = "protobuf-3.14.0.tar.gz", hash = "sha256:1d63eb389347293d8915fb47bee0951c7b5dab522a4a60118b9a18f33e21f8ce"}, 280 | ] 281 | py = [ 282 | {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, 283 | {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, 284 | ] 285 | pyparsing = [ 286 | {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, 287 | {file = 
"pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, 288 | ] 289 | pytest = [ 290 | {file = "pytest-6.2.2-py3-none-any.whl", hash = "sha256:b574b57423e818210672e07ca1fa90aaf194a4f63f3ab909a2c67ebb22913839"}, 291 | {file = "pytest-6.2.2.tar.gz", hash = "sha256:9d1edf9e7d0b84d72ea3dbcdfd22b35fb543a5e8f2a60092dd578936bf63d7f9"}, 292 | ] 293 | six = [ 294 | {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, 295 | {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, 296 | ] 297 | toml = [ 298 | {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, 299 | {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, 300 | ] 301 | typing-extensions = [ 302 | {file = "typing_extensions-3.7.4.3-py2-none-any.whl", hash = "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"}, 303 | {file = "typing_extensions-3.7.4.3-py3-none-any.whl", hash = "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918"}, 304 | {file = "typing_extensions-3.7.4.3.tar.gz", hash = "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c"}, 305 | ] 306 | zipp = [ 307 | {file = "zipp-3.4.0-py3-none-any.whl", hash = "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108"}, 308 | {file = "zipp-3.4.0.tar.gz", hash = "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"}, 309 | ] 310 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "bareunpy" 3 | version = "1.7.1" 4 | description = "The bareun python library using grpc" 5 | authors = ["Gihyun YUN "] 6 | license = "BSD-3-Clause" 7 | readme = "README.md" 8 | homepage = "https://bareun.ai/" 9 | repository = "https://github.com/bareun-nlp/bareunpy" 10 | keywords = [ "NLP", "Korean", "Deep Learning", "POS tagger", "bareun"] 11 | classifiers = [ 12 | "Development Status :: 5 - Production/Stable", 13 | "Intended Audience :: Science/Research", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Information Technology", 16 | "Intended Audience :: Education", 17 | "Natural Language :: Korean", 18 | "Operating System :: OS Independent", 19 | "Typing :: Typed", 20 | "Topic :: Software Development", 21 | "Topic :: Software Development :: Libraries", 22 | "Topic :: Scientific/Engineering", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | "Topic :: Scientific/Engineering :: Information Analysis", 25 | "Operating System :: Microsoft :: Windows", 26 | "Operating System :: POSIX", 27 | "Operating System :: Unix", 28 | "Operating System :: MacOS" 29 | ] 30 | 31 | [tool.poetry.dependencies] 32 | python = "^3.6" 33 | grpcio = "^1.53.2" 34 | protobuf = "^3.19.6" 35 | googleapis-common-protos = "^1.56.0" 36 | bareun-apis = "^0.15.2" 37 | 38 | [tool.poetry.group.dev.dependencies] 39 | pytest = "^6.2.2" 40 | 41 | [build-system] 42 | requires = ["poetry-core>=1.0.0"] 43 | build-backend = "poetry.core.masonry.api" 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio==1.53.2 
2 | googleapis-common-protos==1.56.0 3 | protobuf>=3.19.6 4 | bareun-apis==0.15.2 5 | setuptools~=60.5.0 # 6 | pytest>=7.2.1 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import setuptools 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | CLASSIFIERS = """\ 8 | Development Status :: 5 - Production/Stable 9 | Intended Audience :: Science/Research 10 | Intended Audience :: Developers 11 | License :: OSI Approved :: BSD License 12 | Programming Language :: Python :: 3 13 | Programming Language :: Python :: 3.6 14 | Programming Language :: Python :: 3.7 15 | Programming Language :: Python :: 3.8 16 | Programming Language :: Python :: 3.9 17 | Programming Language :: Python :: 3.10 18 | Programming Language :: Python :: 3.11 19 | Programming Language :: Python :: 3.12 20 | Programming Language :: Python :: 3.13 21 | Programming Language :: Python :: 3 :: Only 22 | Natural Language :: Korean 23 | Development Status :: 5 - Production/Stable 24 | Operating System :: OS Independent 25 | Typing :: Typed 26 | Topic :: Software Development 27 | Topic :: Scientific/Engineering 28 | Topic :: Scientific/Engineering :: Artificial Intelligence 29 | Topic :: Scientific/Engineering :: Information Analysis 30 | Operating System :: Microsoft :: Windows 31 | Operating System :: POSIX 32 | Operating System :: Unix 33 | Operating System :: MacOS 34 | """ 35 | 36 | # import grpc_tools 37 | # 38 | # setuptools.setup( 39 | # cmdclass={ 40 | # 'build_proto_modules': grpc_tools.command.BuildPackageProtos, 41 | # } 42 | # ) 43 | 44 | setuptools.setup( 45 | name="bareunpy", 46 | version="1.7.1", 47 | author="Gihyun YUN", 48 | author_email="gih2yun@baikal.ai", 49 | description="The bareun python API library", 50 | long_description=long_description, 51 | long_description_content_type="text/markdown", 52 | url="https://bareun.ai/", 53 | download_url="https://pypi.python.org/pypi/bareunpy", 54 | project_urls={ 55 | "Bug Tracker": "https://github.com/bareun-nlp/bareunpy/issues", 56 | # "Documentation": get_docs_url(), 57 | "Source Code": "https://github.com/bareun-nlp/bareunpy", 58 | }, 59 | license='BSD', 60 | platform='Independent', 61 | packages=setuptools.find_packages(), 62 | classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f], 63 | python_requires='>=3.6', 64 | ) 65 | -------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | #!env python3 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def tagger_instance(): 8 | import bareunpy 9 | ## FIXME change it nlp.bareun.ai 10 | t = bareunpy.Tagger(apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 11 | host="10.3.8.44", 12 | port=5656) 13 | return t 14 | 15 | @pytest.fixture 16 | def tagger_error_host_instance(): 17 | import bareunpy 18 | t = bareunpy.Tagger(apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 19 | host="10.3.8.44:5656", 20 | port=5656) 21 | return t 22 | 23 | @pytest.fixture 24 | def tagger_error_apikey_instance(): 25 | import bareunpy 26 | t = bareunpy.Tagger(apikey="koba-42CXULQ-SDPU6ZA", 27 
| host="10.3.8.44", 28 | port=5656) 29 | return t 30 | 31 | @pytest.fixture 32 | def sample1(): 33 | return '오늘은 정말 추운 날이네요.' 34 | 35 | 36 | def test_tagger_pos(tagger_instance, sample1): 37 | assert tagger_instance.pos(sample1) == \ 38 | [('오늘', 'NNG'), 39 | ('은', 'JX'), 40 | ('정말', 'MAG'), 41 | ('춥', 'VA'), 42 | ('ㄴ', 'ETM'), 43 | ('날', 'NNG'), 44 | ('이', 'VCP'), 45 | ('네', 'EF'), 46 | ('요', 'JX'), 47 | ('.', 'SF') 48 | ] 49 | 50 | 51 | def test_tagger_pos_join(tagger_instance, sample1): 52 | assert tagger_instance.pos(sample1, join=True) == \ 53 | ['오늘/NNG', 54 | '은/JX', 55 | '정말/MAG', 56 | '춥/VA', 57 | 'ㄴ/ETM', 58 | '날/NNG', 59 | '이/VCP', 60 | '네/EF', 61 | '요/JX', 62 | './SF' 63 | ] 64 | 65 | 66 | def test_tagger_pos_detail(tagger_instance, sample1): 67 | temp = tagger_instance.pos(sample1, detail=True) 68 | temp2 = [(t[0], t[1], t[2]) for t in temp] 69 | assert (temp2 == [('오늘', 'NNG', 'IN_WORD_EMBEDDING'), 70 | ('은', 'JX', 'IN_WORD_EMBEDDING'), 71 | ('정말', 'MAG', 'IN_WORD_EMBEDDING'), 72 | ('춥', 'VA', 'IN_WORD_EMBEDDING'), 73 | ('ㄴ', 'ETM', 'IN_WORD_EMBEDDING'), 74 | ('날', 'NNG', 'IN_WORD_EMBEDDING'), 75 | ('이', 'VCP', 'IN_WORD_EMBEDDING'), 76 | ('네', 'EF', 'IN_WORD_EMBEDDING'), 77 | ('요', 'JX', 'IN_WORD_EMBEDDING'), 78 | ('.', 'SF', 'IN_WORD_EMBEDDING')]) 79 | 80 | 81 | def test_tagger_morphs(tagger_instance, sample1): 82 | assert tagger_instance.morphs(sample1) == \ 83 | ['오늘', 84 | '은', 85 | '정말', 86 | '춥', 87 | 'ㄴ', 88 | '날', 89 | '이', 90 | '네', 91 | '요', 92 | '.'] 93 | 94 | 95 | def test_tagger_nouns(tagger_instance, sample1): 96 | assert tagger_instance.nouns(sample1) == \ 97 | ['오늘', '날'] 98 | 99 | 100 | def test_tagger_tag_as_json_str(tagger_instance, sample1): 101 | j = tagger_instance.tag(sample1).as_json() 102 | assert len(j['sentences']) == 1 103 | assert len(j['sentences'][0]['tokens']) == 4 104 | assert len(j['sentences'][0]['tokens'][0]['morphemes']) == 2 105 | assert len(j['sentences'][0]['tokens'][1]['morphemes']) == 1 106 | assert len(j['sentences'][0]['tokens'][2]['morphemes']) == 2 107 | assert len(j['sentences'][0]['tokens'][3]['morphemes']) == 5 108 | assert len(j['sentences'][0]['tokens'][3]['morphemes']) == 5 109 | 110 | 111 | def test_tagger_tag_as_msg(tagger_instance, sample1): 112 | m = tagger_instance.tag(sample1).msg() 113 | assert m.sentences[0].tokens[3].tagged == '날/NNG+이/VCP+네/EF+요/JX+./SF' 114 | 115 | 116 | def test_tagger_tag_print_as_json(tagger_instance, sample1): 117 | import tempfile 118 | with tempfile.TemporaryFile('w+') as f: 119 | tagger_instance.tag(sample1).print_as_json(out=f) 120 | assert f.tell() > 0 121 | 122 | 123 | def test_tagger_create_custom_dict(tagger_instance): 124 | try: 125 | cd = tagger_instance.custom_dict('my') 126 | assert cd is not None 127 | except TypeError as e: 128 | assert False 129 | 130 | 131 | def test_tagger_update_custom_dict(tagger_instance): 132 | try: 133 | cd = tagger_instance.custom_dict('my') 134 | cd.copy_np_set({'유리왕', '근초고왕', '누루하치', '베링거인겔하임'}) 135 | cd.copy_cp_set({'코로나19'}) 136 | cd.copy_cp_caret_set({'인공지능^데이터^학습', '자연어^처리^엔진'}) 137 | cd.update() 138 | assert True 139 | except TypeError as e: 140 | assert False 141 | 142 | 143 | def test_tagger_get_custom_dict_np_set(tagger_instance): 144 | try: 145 | cd = tagger_instance.custom_dict('my') 146 | dic = cd.get() 147 | assert len(dic.np_set.items) == 4 148 | assert '유리왕' in dic.np_set.items 149 | assert '근초고왕' in dic.np_set.items 150 | assert '누루하치' in dic.np_set.items 151 | assert '베링거인겔하임' in dic.np_set.items 152 | except TypeError as e: 153 | 
assert False 154 | 155 | 156 | def test_tagger_get_custom_dict_cp_set(tagger_instance): 157 | try: 158 | cd = tagger_instance.custom_dict('my') 159 | dic = cd.get() 160 | assert len(dic.cp_set.items) == 1 161 | assert '코로나19' in dic.cp_set.items 162 | except TypeError as e: 163 | assert False 164 | 165 | 166 | def test_tagger_get_custom_dict_cp_caret_set(tagger_instance): 167 | try: 168 | cd = tagger_instance.custom_dict('my') 169 | dic = cd.get() 170 | assert len(dic.cp_caret_set.items) == 2 171 | assert '인공지능^데이터^학습' in dic.cp_caret_set.items 172 | assert '자연어^처리^엔진' in dic.cp_caret_set.items 173 | except TypeError as e: 174 | assert False 175 | 176 | def test_exception_apikey(tagger_error_apikey_instance, sample1): 177 | try: 178 | tagger_error_apikey_instance.pos(sample1) 179 | except Exception as e: 180 | assert e.args[0][:27] == '\n입력한 API KEY가 정확한지 확인해 주세요.' 181 | 182 | def test_exception_host(tagger_error_host_instance, sample1): 183 | try: 184 | tagger_error_host_instance.pos(sample1) 185 | except Exception as e: 186 | assert e.args[0][:16] == '\n서버에 연결할 수 없습니다.' 187 | 188 | -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | #!env python3 2 | # -*- coding: utf-8 -*- 3 | import pytest 4 | from bareunpy import Tokenized 5 | 6 | @pytest.fixture 7 | def tokenizer_instance(): 8 | import bareunpy 9 | ## FIXME change it nlp.bareun.ai 10 | t = bareunpy.Tokenizer( 11 | apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 12 | host="10.3.8.44", 13 | port=5757) 14 | return t 15 | 16 | @pytest.fixture 17 | def tokenizer_error_host_instance(): 18 | import bareunpy 19 | ## FIXME change it nlp.bareun.ai 20 | t = bareunpy.Tokenizer( 21 | apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 22 | host="10.3.8.44:5757", 23 | port=5757) 24 | return t 25 | 26 | @pytest.fixture 27 | def tokenizer_error_apikey_instance(): 28 | import bareunpy 29 | ## FIXME change it nlp.bareun.ai 30 | t = bareunpy.Tokenizer( 31 | apikey="koba-42CXULQ-SDPU6ZA", 32 | host="10.3.8.44", 33 | port=5757) 34 | return t 35 | 36 | TEST_STR='오늘은 정말 추운 날이네요.' 
37 | @pytest.fixture 38 | def sample1(): 39 | return TEST_STR 40 | 41 | 42 | def test_tokenizer_seg_not_flatten(tokenizer_instance, sample1): 43 | assert tokenizer_instance.seg(sample1, flatten=False) == \ 44 | [[('오늘'), ('은')], 45 | [('정말')], 46 | [('춥'), ('ㄴ')], 47 | [('날'), ('이'), ('네'), ('요'), ('.')] 48 | ] 49 | 50 | 51 | def test_tokenizer_seg_join(tokenizer_instance, sample1): 52 | assert tokenizer_instance.seg(sample1, join=True, detail=True) == \ 53 | ['오늘/N', 54 | '은/J', 55 | '정말/A', 56 | '춥/V', 57 | 'ㄴ/E', 58 | '날/N', 59 | '이/V', 60 | '네/E', 61 | '요/J', 62 | './S' 63 | ] 64 | 65 | 66 | def test_tokenizer_seg_detail(tokenizer_instance, sample1): 67 | temp = tokenizer_instance.seg(sample1, detail=True) 68 | temp2 = [(t[0], t[1]) for t in temp] 69 | assert (temp2 == [('오늘', 'N'), 70 | ('은', 'J'), 71 | ('정말', 'A'), 72 | ('춥', 'V'), 73 | ('ㄴ', 'E'), 74 | ('날', 'N'), 75 | ('이', 'V'), 76 | ('네', 'E'), 77 | ('요', 'J'), 78 | ('.', 'S')]) 79 | 80 | 81 | def test_tokenizer_seg(tokenizer_instance, sample1): 82 | assert tokenizer_instance.seg(sample1) == \ 83 | ['오늘', 84 | '은', 85 | '정말', 86 | '춥', 87 | 'ㄴ', 88 | '날', 89 | '이', 90 | '네', 91 | '요', 92 | '.'] 93 | 94 | 95 | def test_tokenizer_nouns(tokenizer_instance, sample1): 96 | assert tokenizer_instance.nouns(sample1) == \ 97 | ['오늘', '날'] 98 | 99 | 100 | def test_tokenizer_tokenize_as_json_str(tokenizer_instance, sample1): 101 | j = tokenizer_instance.tokenize(sample1).as_json() 102 | assert len(j['sentences']) == 1 103 | assert len(j['sentences'][0]['tokens']) == 4 104 | assert len(j['sentences'][0]['tokens'][0]['segments']) == 2 105 | assert len(j['sentences'][0]['tokens'][1]['segments']) == 1 106 | assert len(j['sentences'][0]['tokens'][2]['segments']) == 2 107 | assert len(j['sentences'][0]['tokens'][3]['segments']) == 5 108 | assert len(j['sentences'][0]['tokens'][3]['segments']) == 5 109 | 110 | 111 | def test_tokenizer_tokenize_as_msg(tokenizer_instance, sample1): 112 | m = tokenizer_instance.tokenize(sample1).msg() 113 | assert m.sentences[0].tokens[3].tagged == '날/N+이/V+네/E+요/J+./S' 114 | 115 | 116 | def test_tokenizer_tokenize_print_as_json(tokenizer_instance, sample1): 117 | import tempfile 118 | with tempfile.TemporaryFile('w+') as f: 119 | tokenizer_instance.tokenize(sample1).print_as_json(out=f) 120 | assert f.tell() > 0 121 | 122 | def test_tokenized_nouns(tokenized: Tokenized): 123 | assert tokenized.nouns() == \ 124 | ['오늘', 125 | '날'] 126 | 127 | def test_exception_apikey_tokenizer(tokenizer_error_apikey_instance, sample1): 128 | try: 129 | m = tokenizer_error_apikey_instance.tokenize(sample1).msg() 130 | except Exception as e: 131 | assert e.args[0][:27] == '\n입력한 API KEY가 정확한지 확인해 주세요.' 132 | 133 | def test_exception_host_tokenizer(tokenizer_error_host_instance, sample1): 134 | try: 135 | m = tokenizer_error_host_instance.tokenize(sample1).msg() 136 | except Exception as e: 137 | assert e.args[0][:16] == '\n서버에 연결할 수 없습니다.' 
138 | 139 | @pytest.fixture 140 | def tokenized() -> Tokenized: 141 | import bareunpy 142 | # FIXME change it nlp.bareun.ai 143 | t = bareunpy.Tokenizer( 144 | apikey="koba-42CXULQ-SDPU6ZA-RQ6QPBQ-4BMZCOA", 145 | host="10.3.8.44", 146 | port=5757) 147 | 148 | return t.tokenize(TEST_STR) 149 | 150 | def test_tokenized_verbs(tokenized: Tokenized): 151 | assert tokenized.predicates() == \ 152 | ['춥', 153 | '이'] 154 | 155 | def test_tokenized_symbols(tokenized: Tokenized): 156 | assert tokenized.symbols() == \ 157 | ['.'] 158 | 159 | def test_tokenized_adverbs(tokenized: Tokenized): 160 | assert tokenized.adverbs() == \ 161 | ['정말'] 162 | 163 | def test_tokenized_endings(tokenized: Tokenized): 164 | assert tokenized.endings() == \ 165 | ['ㄴ','네'] 166 | 167 | def test_tokenized_postpositions(tokenized: Tokenized): 168 | assert tokenized.postpositions() == \ 169 | ['은','요',] 170 | --------------------------------------------------------------------------------
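A minimal end-to-end usage sketch of the tokenizer API above (not part of the repository). It assumes a reachable Bareun analysis server and a valid API key; "YOUR_API_KEY" and the sample sentences below are placeholders, and the host/port are the defaults from Tokenizer.__init__.

from bareunpy import Tokenizer

# Connect to a Bareun server; host and port fall back to nlp.bareun.ai:5656 when omitted.
tokenizer = Tokenizer(apikey="YOUR_API_KEY", host="nlp.bareun.ai", port=5656)

# tokenize() returns a Tokenized object wrapping the gRPC TokenizeResponse.
tokenized = tokenizer.tokenize("오늘은 정말 추운 날이네요.")
print(tokenized.segments())        # every segment as a plain string
print(tokenized.nouns())           # segments whose hint is 'N' (체언)
print(tokenized.predicates())      # segments whose hint is 'V' (용언)
print(tokenized.postpositions())   # segments whose hint is 'J' (조사)

# The Tokenizer shortcuts call tokenize() internally.
print(tokenizer.seg("오늘은 정말 추운 날이네요."))                          # flat list of segments
print(tokenizer.seg("오늘은 정말 추운 날이네요.", detail=True))             # (segment, hint, ...) tuples
print(tokenizer.seg("오늘은 정말 추운 날이네요.", join=True, detail=True))  # 'segment/hint' strings

# Several sentences can be analyzed in one round trip.
tokenizer.tokenize_list(["첫 번째 문장입니다.", "두 번째 문장입니다."])

# A wrong API key or an unreachable server raises an Exception whose message
# begins with the Korean guidance strings asserted in the tests above.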