├── .github ├── doc │ ├── khaiii_for_space_error.pptx │ └── network.pptx ├── img │ ├── multi-task-learning.png │ ├── network.png │ ├── pull-request-to-develop.png │ └── win_emb_f.png └── pull_request_template.md ├── .gitignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE.md ├── README.md ├── cmake ├── CodeCoverage.cmake ├── FindGperftools.cmake ├── FusedMultiplyAdd.cmake ├── Hunter │ └── config.cmake └── HunterGate.cmake ├── docker └── Dockerfile ├── include └── khaiii │ ├── KhaiiiApi.hpp │ ├── khaiii_api.h │ └── khaiii_dev.h ├── munjong ├── apply_patch.py ├── convert_jamo_to_compat.py ├── detect_sejong_period_error.py ├── fix_final_symbol_error.py ├── make_patch.py ├── recover_english_case.py ├── recover_raw_morph_mismatch.py ├── recover_wide_quotation.py └── remove_sejong_period_error.py ├── requirements.txt ├── rsc ├── Makefile ├── bin │ ├── compile_errpatch.py │ ├── compile_model.py │ ├── compile_preanal.py │ └── compile_restore.py └── src │ ├── base.config.json │ ├── base.errpatch.auto │ ├── base.errpatch.manual │ ├── base.model.pickle │ ├── char_align.map │ ├── large.config.json │ ├── large.errpatch.auto │ ├── large.errpatch.manual │ ├── large.model.pickle │ ├── preanal.auto │ ├── preanal.manual │ ├── restore.dic │ ├── vocab.in │ ├── vocab.out │ └── vocab.out.more ├── src ├── main │ ├── cpp │ │ ├── khaiii │ │ │ ├── Config.cpp │ │ │ ├── Config.hpp │ │ │ ├── Embed.cpp │ │ │ ├── Embed.hpp │ │ │ ├── ErrPatch.cpp │ │ │ ├── ErrPatch.hpp │ │ │ ├── KhaiiiImpl.cpp │ │ │ ├── KhaiiiImpl.hpp │ │ │ ├── MemMapFile.hpp │ │ │ ├── Morph.cpp │ │ │ ├── Morph.hpp │ │ │ ├── Preanal.cpp │ │ │ ├── Preanal.hpp │ │ │ ├── Resource.cpp │ │ │ ├── Resource.hpp │ │ │ ├── Restore.cpp │ │ │ ├── Restore.hpp │ │ │ ├── Sentence.cpp │ │ │ ├── Sentence.hpp │ │ │ ├── Tagger.cpp │ │ │ ├── Tagger.hpp │ │ │ ├── Trie.cpp │ │ │ ├── Trie.hpp │ │ │ ├── Word.cpp │ │ │ ├── Word.hpp │ │ │ ├── khaiii_api.cpp │ │ │ ├── khaiii_dev.cpp │ │ │ ├── nn │ │ │ │ ├── Conv1d.cpp │ │ │ │ ├── Conv1d.hpp │ │ │ │ ├── Linear.cpp │ │ │ │ ├── Linear.hpp │ │ │ │ ├── tensor.cpp │ │ │ │ └── tensor.hpp │ │ │ └── util.hpp │ │ └── main.cpp │ └── python │ │ ├── MANIFEST.in.in │ │ ├── khaiii │ │ ├── __init__.py │ │ ├── __init__.py.in │ │ ├── khaiii.py │ │ ├── munjong │ │ │ ├── __init__.py │ │ │ ├── libpatch.py │ │ │ └── sejong_corpus.py │ │ ├── resource │ │ │ ├── __init__.py │ │ │ ├── char_align.py │ │ │ ├── jaso.py │ │ │ ├── morphs.py │ │ │ ├── resource.py │ │ │ ├── trie.py │ │ │ └── vocabulary.py │ │ └── train │ │ │ ├── dataset.py │ │ │ ├── embedder.py │ │ │ ├── evaluator.py │ │ │ ├── models.py │ │ │ ├── sentence.py │ │ │ ├── tagger.py │ │ │ └── trainer.py │ │ └── setup.py.in └── test │ ├── cpp │ ├── khaiii │ │ ├── ErrPatchTest.cpp │ │ ├── KhaiiiApiTest.cpp │ │ ├── KhaiiiApiTest.hpp │ │ ├── KhaiiiDevTest.cpp │ │ └── PreanalTest.cpp │ └── test_main.cpp │ └── python │ └── test_khaiii │ ├── __init__.py │ └── test_khaiii.py └── train ├── eval.py ├── extract_errpatch.py ├── extract_preanal.py ├── hd_validate_errpatch.bash ├── make_vocab.py ├── map_char_to_tag.py ├── pickle_model.py ├── requirements.txt ├── split_corpus.py ├── tag.py ├── train.py ├── transform_corpus.py └── validate_errpatch.py /.github/doc/khaiii_for_space_error.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/doc/khaiii_for_space_error.pptx -------------------------------------------------------------------------------- /.github/doc/network.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/doc/network.pptx -------------------------------------------------------------------------------- /.github/img/multi-task-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/multi-task-learning.png -------------------------------------------------------------------------------- /.github/img/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/network.png -------------------------------------------------------------------------------- /.github/img/pull-request-to-develop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/pull-request-to-develop.png -------------------------------------------------------------------------------- /.github/img/win_emb_f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/win_emb_f.png -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 설명 (Description) 2 | ---- 3 | _이 문구를 지우고 여기에 내용을 적어주세요. (Remove this sentence and describe here.)_ 4 | 5 | ~~_겁내지 말아요, 저희는 한글을 사랑합니다._~~ 6 | 7 | 8 | 개발자를 위한 가이드 (Developer's Guide) 9 | ---- 10 | 만약 khaiii에 pull request가 처음이라면 [개발자를 위한 가이드](https://github.com/kakao/khaiii/wiki#%EA%B0%9C%EB%B0%9C%EC%9E%90%EB%A5%BC-%EC%9C%84%ED%95%9C-%EA%B0%80%EC%9D%B4%EB%93%9C) 문서들을 한번 읽어보시길 권고드립니다. 11 | 12 | If this is your first pull request for khaiii, please see the [Developer's Guide](https://github.com/kakao/khaiii/wiki#%EA%B0%9C%EB%B0%9C%EC%9E%90%EB%A5%BC-%EC%9C%84%ED%95%9C-%EA%B0%80%EC%9D%B4%EB%93%9C). 13 | 14 | 15 | 체크 리스트 (Checklist) 16 | ---- 17 | pull request 전에 아래 체크 리스트들을 만족하는 지 확인한 후 체크('x') 표시를 해주시기 바랍니다. 18 | 19 | Before you submit pull requests, please check(set 'x') to the checklist below. 20 | 21 | - [ ] master 브랜치가 아니라 **develop** 브랜치에 머지하도록 pull request를 작성 중이신가요? (Did you merge into **develop** branch not master?) 22 | - [ ] `build/test/khaiii` 프로그램을 실행하여 **테스트**가 성공했나요? (Did all **tests** are passed when you ran as `build/test/khaiii`) 23 | - [ ] **PyLint** 툴을 실행하여 발생한 에러를 모두 수정하셨나요? (Did you fix all errors after running **PyLint**?) 24 | - [ ] **CppLint** 툴을 실행하여 발생한 에러를 모두 수정하셨나요? (Did you fix all errors after running **CppLint**?) 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Submitting Pull Requests 2 | 3 | When you are sending a pull request, please sign the [CLA](https://cla-assistant.io/kakao/khaiii)(Contributor Licensing Agreement) for Individual. 4 | If you need a Contributor Licensing Agreement for Corporate, please [contact us](mailto:oss@kakaocorp.com). 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | khaiii 2 | ==== 3 | khaiii는 "Kakao Hangul Analyzer III"의 첫 글자들만 모아 만든 이름으로 카카오에서 개발한 세 번째 형태소분석기입니다. 두 번째 버전의 형태소분석기 이름인 dha2 (Daumkakao Hangul Analyzer 2)를 계승한 이름이기도 합니다. 4 | 5 | 형태소는 언어학에서 일정한 의미가 있는 가장 작은 말의 단위로 발화체 내에서 따로 떼어낼 수 있는 것을 말합니다. 즉, 더 분석하면 뜻이 없어지는 말의 단위입니다. 형태소분석기는 단어를 보고 형태소 단위로 분리해내는 소프트웨어를 말합니다. 이러한 형태소분석은 자연어 처리의 가장 기초적인 절차로 이후 구문 분석이나 의미 분석으로 나아가기 위해 가장 먼저 이루어져야 하는 과정으로 볼 수 있습니다. (한국어 위키피디아에서 인용) 6 | 7 | 8 | 데이터 기반 9 | ---- 10 | 기존 버전이 사전과 규칙에 기반해 분석을 하는 데 반해 khaiii는 데이터(혹은 기계학습) 기반의 알고리즘을 이용하여 분석을 합니다. 학습에 사용한 코퍼스는 국립국어원에서 배포한 [21세기 세종계획 최종 성과물](https://ithub.korean.go.kr/user/noticeView.do?boardSeq=1&articleSeq=16)을 저희 카카오에서 오류를 수정하고 내용을 일부 추가하기도 한 것입니다. 11 | 12 | 전처리 과정에서 오류가 발생하는 문장을 제외하고 약 85만 문장, 천만 어절의 코퍼스를 사용하여 학습을 했습니다. 코퍼스와 품사 체계에 대한 자세한 내용은 [코퍼스](https://github.com/kakao/khaiii/wiki/%EC%BD%94%ED%8D%BC%EC%8A%A4) 문서를 참고하시기 바랍니다. 13 | 14 | 15 | 알고리즘 16 | ---- 17 | 기계학습에 사용한 알고리즘은 신경망 알고리즘들 중에서 Convolutional Neural Network(CNN)을 사용하였습니다. 한국어에서 형태소분석은 자연어처리를 위한 가장 기본적인 전처리 과정이므로 속도가 매우 중요한 요소라고 생각합니다. 따라서 자연어처리에 많이 사용하는 Long-Short Term Memory(LSTM)와 같은 Recurrent Neural Network(RNN) 알고리즘은 속도 면에서 활용도가 떨어질 것으로 예상하여 고려 대상에서 제외하였습니다. 18 | 19 | CNN 모델에 대한 상세한 내용은 [CNN 모델](https://github.com/kakao/khaiii/wiki/CNN-%EB%AA%A8%EB%8D%B8) 문서를 참고하시기 바랍니다. 20 | 21 | 22 | 성능 23 | ---- 24 | ### 정확도 25 | 26 | #### v0.3 27 | CNN 모델의 주요 하이퍼 파라미터는 분류하려는 음절의 좌/우 문맥의 크기를 나타내는 win 값과, 음절 임베딩의 차원을 나타내는 emb 값입니다. win 값은 {2, 3, 4, 5, 7, 10}의 값을 가지며, emb 값은 {20, 30, 40, 50, 70, 100, 150, 200, 300, 500}의 값을 가집니다. 따라서 이 두 가지 값의 조합은 6 x 10으로 총 60가지를 실험하였고 아래와 같은 성능을 보였습니다. 성능 지표는 정확률과 재현율의 조화 평균값인 F-Score입니다. 28 | 29 | ![](.github/img/win_emb_f.png) 30 | 31 | win 파라미터의 경우 3 혹은 4에서 가장 좋은 성능을 보이며 그 이상에서는 오히려 성능이 떨어집니다. emb 파라미터의 경우 150까지는 성능도 같이 높아지다가 그 이상에서는 별 차이가 없습니다. 최 상위 5위 중 비교적 작은 모델은 win=3, emb=150으로 F-Score 값은 97.11입니다. 이 모델을 large 모델이라 명명합니다. 32 | 33 | #### v0.4 34 | [띄어쓰기 오류에 강건한 모델을 위한 실험](https://github.com/kakao/khaiii/wiki/%EB%9D%84%EC%96%B4%EC%93%B0%EA%B8%B0-%EC%98%A4%EB%A5%98%EC%97%90-%EA%B0%95%EA%B1%B4%ED%95%9C-%EB%AA%A8%EB%8D%B8%EC%9D%84-%EC%9C%84%ED%95%9C-%EC%8B%A4%ED%97%98)을 통해 모델을 개선하였습니다. v0.4 모델은 띄어쓰기가 잘 되어있지 않은 입력에 대해 보다 좋은 성능을 보이는데 반해 세종 코퍼스에서는 다소 정확도가 떨어집니다. 이러한 점을 보완하기 위해 base 및 large 모델의 파라미터를 아래와 같이 조금 변경했습니다. 35 | 36 | * base 모델: win=4, emb=35, F-Score: 94.96 37 | * large 모델: win=4, emb=180, F-Score: 96.71 38 | 39 | 40 | ### 속도 41 | 42 | #### v0.3 43 | 모델의 크기가 커지면 정확도가 높아지긴 하지만 그만큼 계산량 또한 많아져 속도가 떨어집니다. 그래서 적당한 정확도를 갖는 모델 중에서 크기가 작아 속도가 빠른 모델을 base 모델로 선정하였습니다. F-Score 값이 95 이상이면서 모델의 크기가 작은 모델은 win=3, emb=30이며 F-Score는 95.30입니다. 44 | 45 | 속도를 비교하기 위해 1만 문장(총 903KB, 문장 평균 91)의 텍스트를 분석해 비교했습니다. base 모델의 경우 약 10.5초, large 모델의 경우 약 78.8초가 걸립니다. 46 | 47 | #### v0.4 48 | 모델의 크기가 커짐에 따라 아래와 같이 base, large 모델의 속도를 다시 측정했으며 v0.4 버전에서 다소 느려졌습니다. 49 | 50 | * base 모델: 10.8 -> 14.4 51 | * large 모델: 87.3 -> 165 52 | 53 | 54 | 사용자 사전 55 | ---- 56 | 신경망 알고리즘은 소위 말하는 블랙박스 알고리즘으로 결과를 유추하는 과정을 사람이 따라가기가 쉽지 않습니다. 그래서 오분석이 발생할 경우 모델의 파라미터를 수정하여 바른 결과를 내도록 하는 것이 매우 어렵습니다. 이를 위해 khaiii에서는 신경망 알고리즘의 앞단에 기분석 사전을 뒷단에 오분석 패치라는 두 가지 사용자 사전 장치를 마련해 두었습니다. 57 | 58 | ### 기분석 사전 59 | 기분석 사전은 단일 어절에 대해 문맥에 상관없이 일괄적인 분석 결과를 갖는 경우에 사용합니다. 예를 들어 아래와 같은 엔트리가 있다면, 60 | 61 | 입력 어절 | 분석 결과 62 | --------|-------- 63 | 이더리움* | 이더리움/NNP 64 | 65 | 문장에서 `이더리움`으로 시작하는 모든 어절은 신경망 알고리즘을 사용하지 않고 `이더리움/NNP`로 동일하게 분석합니다. 66 | 67 | 세종 코퍼스에서 분석 모호성이 없는 어절들로부터 자동으로 기분석 사전을 추출할 경우 약 8만 개의 엔트리가 생성됩니다. 이를 적용할 경우 약간의 속도 향상도 있어서 base 모델에 적용하면 약 9.2초로 10% 정도 속도 향상이 있었습니다. 68 | 69 | 기분석 사전의 기술 방법 및 자세한 내용은 [기분석 사전 문서](https://github.com/kakao/khaiii/wiki/%EA%B8%B0%EB%B6%84%EC%84%9D-%EC%82%AC%EC%A0%84)를 참고하시기 바랍니다. 70 | 71 | 72 | ### 오분석 패치 73 | 오분석 패치는 여러 어절에 걸쳐서 충분한 문맥과 함께 오분석을 바로잡아야 할 경우에 사용합니다. 예를 들어 아래와 같은 엔트리가 있다면, 74 | 75 | 입력 텍스트 | 오분석 결과 | 정분석 결과 76 | ---------|-----------|--------- 77 | 이 다른 것 | 이/JKS + _ + 다/VA + 른/MM + _ + 것/NNB | 이/JKS + _ + 다르/VA + ㄴ/ETM + _ + 것/NNB 78 | 79 | 만약 khaiii가 위 "오분석 결과"와 같이 오분석을 발생한 경우에 한해 바른 분석 결과인 "정분석 결과"로 수정합니다. 여기서 "\_"는 어절 간 경계, 즉 공백을 의미합니다. 80 | 81 | 오분석 패치의 기술 방법 및 자세한 내용은 [오분석 패치 문서](https://github.com/kakao/khaiii/wiki/%EC%98%A4%EB%B6%84%EC%84%9D-%ED%8C%A8%EC%B9%98)를 참고하시기 바랍니다. 82 | 83 | 84 | 빌드 및 설치 85 | ---- 86 | khaiii의 빌드 및 설치에 관해서는 [빌드 및 설치 문서](https://github.com/kakao/khaiii/wiki/%EB%B9%8C%EB%93%9C-%EB%B0%8F-%EC%84%A4%EC%B9%98)를 참고하시기 바랍니다. 87 | 88 | 89 | Contributing 90 | ---- 91 | khaiii에 기여하실 분들은 [CONTRIBUTING](CONTRIBUTING.md) 및 [개발자를 위한 가이드](https://github.com/kakao/khaiii/wiki#%EA%B0%9C%EB%B0%9C%EC%9E%90%EB%A5%BC-%EC%9C%84%ED%95%9C-%EA%B0%80%EC%9D%B4%EB%93%9C) 문서를 참고하시기 바랍니다. 92 | 93 | 94 | License 95 | ---- 96 | This software is licensed under the [Apache 2 license](LICENSE), quoted below. 97 | 98 | Copyright 2018 Kakao Corp. 99 | 100 | Licensed under the Apache License, Version 2.0 (the "License"); you may not 101 | use this project except in compliance with the License. You may obtain a copy 102 | of the License at http://www.apache.org/licenses/LICENSE-2.0. 103 | 104 | Unless required by applicable law or agreed to in writing, software 105 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 106 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 107 | License for the specific language governing permissions and limitations under 108 | the License. 109 | -------------------------------------------------------------------------------- /cmake/FindGperftools.cmake: -------------------------------------------------------------------------------- 1 | # Tries to find Gperftools. 2 | # 3 | # Usage of this module as follows: 4 | # 5 | # find_package(Gperftools) 6 | # 7 | # Variables used by this module, they can change the default behaviour and need 8 | # to be set before calling find_package: 9 | # 10 | # Gperftools_ROOT_DIR Set this variable to the root installation of 11 | # Gperftools if the module has problems finding 12 | # the proper installation path. 13 | # 14 | # Variables defined by this module: 15 | # 16 | # GPERFTOOLS_FOUND System has Gperftools libs/headers 17 | # GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) 18 | # GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers 19 | 20 | find_library(GPERFTOOLS_TCMALLOC 21 | NAMES tcmalloc 22 | HINTS ${Gperftools_ROOT_DIR}/lib) 23 | 24 | find_library(GPERFTOOLS_PROFILER 25 | NAMES profiler 26 | HINTS ${Gperftools_ROOT_DIR}/lib) 27 | 28 | find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER 29 | NAMES tcmalloc_and_profiler 30 | HINTS ${Gperftools_ROOT_DIR}/lib) 31 | 32 | find_path(GPERFTOOLS_INCLUDE_DIR 33 | NAMES gperftools/heap-profiler.h 34 | HINTS ${Gperftools_ROOT_DIR}/include) 35 | 36 | set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) 37 | 38 | include(FindPackageHandleStandardArgs) 39 | find_package_handle_standard_args( 40 | Gperftools 41 | DEFAULT_MSG 42 | GPERFTOOLS_LIBRARIES 43 | GPERFTOOLS_INCLUDE_DIR) 44 | 45 | mark_as_advanced( 46 | Gperftools_ROOT_DIR 47 | GPERFTOOLS_TCMALLOC 48 | GPERFTOOLS_PROFILER 49 | GPERFTOOLS_TCMALLOC_AND_PROFILER 50 | GPERFTOOLS_LIBRARIES 51 | GPERFTOOLS_INCLUDE_DIR) 52 | -------------------------------------------------------------------------------- /cmake/FusedMultiplyAdd.cmake: -------------------------------------------------------------------------------- 1 | include(CheckCXXCompilerFlag) 2 | check_cxx_compiler_flag(-mfma fma_compiles) 3 | if(fma_compiles) 4 | include(CheckCXXSourceRuns) 5 | set(test_src 6 | "#include 7 | double fma_wrap(double x, double y, double z) { return fma(x, y, z); } 8 | int main() { double a = fma_wrap(1.2, 3.4, 5.6); return 0; }") 9 | set(CMAKE_REQUIRED_FLAGS -mfma) 10 | check_cxx_source_runs("${test_src}" fma_runs) 11 | if(fma_runs) 12 | message(STATUS "[khaiii] fused multiply add option enabled") 13 | add_definitions(-mfma) 14 | else() 15 | message(WARNING "[khaiii] cpu does not have fused multiply add instruction") 16 | endif() 17 | else() 18 | message(WARNING "[khaiii] compiler does not support fused multiply add option") 19 | endif() 20 | -------------------------------------------------------------------------------- /cmake/Hunter/config.cmake: -------------------------------------------------------------------------------- 1 | hunter_config(Boost VERSION 1.68.0-p1) 2 | hunter_config(cxxopts VERSION 2.1.1-pre) 3 | hunter_config(Eigen VERSION 3.3.5) 4 | hunter_config(fmt VERSION 4.1.0) 5 | hunter_config(GTest VERSION 1.8.0-hunter-p11) 6 | hunter_config(nlohmann_json VERSION 3.3.0) 7 | hunter_config(spdlog VERSION 0.16.3-p1) 8 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | MAINTAINER nako.sung@navercorp.com 3 | 4 | RUN git clone https://github.com/kakao/khaiii.git 5 | WORKDIR /workspace/khaiii 6 | 7 | RUN pip install cython 8 | RUN pip install --upgrade pip 9 | RUN pip install -r requirements.txt 10 | 11 | RUN mkdir build 12 | WORKDIR /workspace/khaiii/build 13 | 14 | RUN cmake .. 15 | RUN make all 16 | RUN make resource 17 | 18 | RUN apt-get update -y 19 | RUN apt-get install -y language-pack-ko 20 | RUN locale-gen en_US.UTF-8 21 | RUN update-locale LANG=en_US.UTF-8 22 | -------------------------------------------------------------------------------- /include/khaiii/KhaiiiApi.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef INCLUDE_KHAIII_KHAIIIAPI_HPP_ 8 | #define INCLUDE_KHAIII_KHAIIIAPI_HPP_ 9 | 10 | 11 | 12 | ////////////// 13 | // includes // 14 | ////////////// 15 | #include 16 | #include 17 | #include // NOLINT 18 | #include 19 | 20 | #include "khaiii/khaiii_api.h" 21 | 22 | 23 | namespace khaiii { 24 | 25 | 26 | class KhaiiiApi { 27 | public: 28 | /** 29 | * create khaiii api object 30 | * @return shared pointer of khaiii api object 31 | */ 32 | static std::shared_ptr create(); 33 | 34 | /** 35 | * open resources 36 | * @param rsc_dir resource directory 37 | * @param opt_str option string (JSON format) 38 | */ 39 | virtual void open(std::string rsc_dir = "", std::string opt_str = "") = 0; 40 | 41 | /** 42 | * analyze input text 43 | * @param input input text 44 | * @param opt_str runtime option (JSON format) 45 | * @return results 46 | */ 47 | virtual const khaiii_word_t* analyze(const char* input, const char* opt_str) = 0; 48 | 49 | /** 50 | * free memories of analyzed results 51 | * @param results results got from analyze() function 52 | */ 53 | virtual void free_results(const khaiii_word_t* results) = 0; 54 | 55 | virtual void close() = 0; ///< close resources 56 | }; 57 | 58 | 59 | /** 60 | * standard exception thrown by khaiii api 61 | */ 62 | class Except: public std::exception { 63 | public: 64 | /** 65 | * @param msg error message 66 | * @param file source file (for debug) 67 | * @param line line number in source file (for debug) 68 | * @param func function name (for debug) 69 | */ 70 | explicit Except(std::string msg, const char* file = nullptr, const int line = 0, 71 | const char* func = nullptr); 72 | 73 | virtual const char* what() const noexcept; 74 | 75 | std::string debug(); ///< message with some debug information 76 | 77 | private: 78 | std::string _msg; ///< error message 79 | const char* _file = nullptr; ///< source file 80 | const int _line = 0; ///< line number in source file 81 | const char* _func = nullptr; ///< function name 82 | }; 83 | 84 | 85 | } // namespace khaiii 86 | 87 | 88 | #endif // INCLUDE_KHAIII_KHAIIIAPI_HPP_ 89 | -------------------------------------------------------------------------------- /include/khaiii/khaiii_api.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef INCLUDE_KHAIII_KHAIII_API_H_ 8 | #define INCLUDE_KHAIII_KHAIII_API_H_ 9 | 10 | 11 | /////////////// 12 | // constants // 13 | /////////////// 14 | #define KHAIII_VERSION_MAJOR 0 15 | #define KHAIII_VERSION_MINOR 4 16 | #define _MAC2STR(m) #m 17 | #define _JOIN_VER(x,y) _MAC2STR(x) "." _MAC2STR(y) // NOLINT 18 | #define KHAIII_VERSION _JOIN_VER(KHAIII_VERSION_MAJOR,KHAIII_VERSION_MINOR) // NOLINT 19 | 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | 26 | /** 27 | * morpheme data structure 28 | */ 29 | typedef struct khaiii_morph_t_ { 30 | const char* lex; ///< lexical 31 | const char* tag; ///< part-of-speech tag 32 | int begin; ///< morpheme begin position 33 | int length; ///< morpheme length 34 | char reserved[8]; ///< reserved 35 | const struct khaiii_morph_t_* next; ///< next pointer 36 | } khaiii_morph_t; 37 | 38 | 39 | /** 40 | * word data structure 41 | */ 42 | typedef struct khaiii_word_t_ { 43 | int begin; ///< word begin position 44 | int length; ///< word length 45 | char reserved[8]; ///< reserved 46 | const khaiii_morph_t* morphs; ///< morpheme list 47 | const struct khaiii_word_t_* next; ///< next pointer 48 | } khaiii_word_t; 49 | 50 | 51 | /** 52 | * get version string 53 | * @return version string like "2.1" 54 | */ 55 | const char* khaiii_version(); 56 | 57 | 58 | /** 59 | * open resources 60 | * @param rsc_dir resource directory 61 | * @param opt_str option string (JSON format) 62 | * @return handle. -1 if failed 63 | */ 64 | int khaiii_open(const char* rsc_dir, const char* opt_str); 65 | 66 | 67 | /** 68 | * analyze input text 69 | * @param handle handle got from open() function 70 | * @param input input text 71 | * @param opt_str runtime option (JSON format) 72 | * @return results. NULL if failed 73 | */ 74 | const khaiii_word_t* khaiii_analyze(int handle, const char* input, const char* opt_str); 75 | 76 | 77 | /** 78 | * free memories of analyzed results 79 | * @param handle handle got from open() function 80 | * @param results results got from analyze() function 81 | */ 82 | void khaiii_free_results(int handle, const khaiii_word_t* results); 83 | 84 | 85 | /** 86 | * close resources 87 | * @param handle handle got from open() function 88 | */ 89 | void khaiii_close(int handle); 90 | 91 | 92 | /** 93 | * get last error 94 | * @param handle handle got from open() function 95 | * @return message 96 | */ 97 | const char* khaiii_last_error(int handle); 98 | 99 | 100 | #ifdef __cplusplus 101 | } 102 | #endif 103 | 104 | 105 | #endif // INCLUDE_KHAIII_KHAIII_API_H_ 106 | -------------------------------------------------------------------------------- /include/khaiii/khaiii_dev.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef INCLUDE_KHAIII_KHAIII_DEV_H_ 8 | #define INCLUDE_KHAIII_KHAIII_DEV_H_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | 22 | /** 23 | * 분석을 수행하고 오분석 패치를 실행하기 직전에 멈춘 다음 그 결과를 리턴한다. 24 | * @param handle handle got from open() function 25 | * @param input input text 26 | * @param opt_str runtime option (JSON format) 27 | * @param output output value for each character 28 | * @return output length. -1 if failed 29 | */ 30 | int khaiii_analyze_bfr_errpatch(int handle, const char* input, const char* opt_str, 31 | int16_t* output); 32 | 33 | /** 34 | * 로그 레벨을 지정한다. 35 | * @param name 로거 이름. "all"인 경우 모든 로거 36 | * @param level 로거 레벨. trace, debug, info, warn, err, critical 37 | * @return 0 if success. -1 if failed 38 | */ 39 | int khaiii_set_log_level(const char* name, const char* level); 40 | 41 | 42 | /** 43 | * 여러 로그 레벨을 한꺼번에 지정한다. 44 | * @param name_level_pairs 로거 (이름, 레벨) 쌍의 리스트. 45 | * "all:warn,console:info,Tagger:debug"와 같은 형식 46 | * @return 0 if success. -1 if failed 47 | */ 48 | int khaiii_set_log_levels(const char* name_level_pairs); 49 | 50 | 51 | #ifdef __cplusplus 52 | } 53 | #endif 54 | 55 | 56 | #endif // INCLUDE_KHAIII_KHAIII_DEV_H_ 57 | -------------------------------------------------------------------------------- /munjong/apply_patch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | apply patch to original Sejong corpus 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | import logging 17 | import os 18 | import shutil 19 | 20 | from khaiii.munjong import libpatch 21 | 22 | 23 | ############# 24 | # functions # 25 | ############# 26 | def run(args: Namespace): 27 | """ 28 | run function which is the start point of program 29 | Args: 30 | args: program arguments 31 | """ 32 | if not os.path.exists(args.modified): 33 | logging.info('creating modified corpus dir: %s', args.modified) 34 | os.mkdir(args.modified) 35 | 36 | for name in sorted(os.listdir(args.original)): 37 | if not name.endswith('.txt'): 38 | continue 39 | org_path = '%s/%s' % (args.original, name) 40 | mod_path = '%s/%s' % (args.modified, name) 41 | patch_path = '%s/%s.patch' % (args.patch, name[:-len('.txt')]) 42 | if os.path.exists(patch_path): 43 | logging.info('[%s] + [%s] = [%s]', org_path, patch_path, mod_path) 44 | libpatch.apply(org_path, args.org_enc, patch_path, mod_path, args.mod_enc) 45 | else: 46 | logging.info('[%s] = [%s]', org_path, mod_path) 47 | shutil.copyfile(org_path, mod_path) 48 | 49 | 50 | ######## 51 | # main # 52 | ######## 53 | def main(): 54 | """ 55 | main function processes only argument parsing 56 | """ 57 | parser = ArgumentParser(description='apply patch to original Sejong corpus') 58 | parser.add_argument('-o', '--original', help='original corpus dir', metavar='DIR', 59 | required=True) 60 | parser.add_argument('-p', '--patch', help='patch dir', metavar='DIR', required=True) 61 | parser.add_argument('-m', '--modified', help='modified corpus output dir', metavar='DIR', 62 | required=True) 63 | parser.add_argument('--org-enc', help='original corpus encoding ', 64 | metavar='ENCODING', default='UTF-16') 65 | parser.add_argument('--mod-enc', help='modified corpus encoding ', 66 | metavar='ENCODING', default='UTF-8') 67 | parser.add_argument('--debug', help='enable debug', action='store_true') 68 | args = parser.parse_args() 69 | 70 | if args.debug: 71 | logging.basicConfig(level=logging.DEBUG) 72 | else: 73 | logging.basicConfig(level=logging.INFO) 74 | 75 | run(args) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /munjong/convert_jamo_to_compat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | 한글 자모 영역의 코드를 호환 영역으로 변환 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import logging 17 | import sys 18 | 19 | from khaiii.munjong.sejong_corpus import WORD_ID_PTN 20 | from khaiii.resource.jaso import norm_compat 21 | 22 | 23 | ############# 24 | # functions # 25 | ############# 26 | def _norm(text: str) -> str: 27 | """ 28 | 정규화를 수행하는 함수 29 | Args: 30 | text: 입력 텍스트 31 | Returns: 32 | 정규화된 텍스트 33 | """ 34 | normalized = norm_compat(text) 35 | normalized = normalized.replace('ᆞ', 'ㆍ') # 0x119e -> 0x318d 36 | normalized = normalized.replace('ᄝ', 'ㅱ') # 0x111d -> 0x3171 37 | return normalized 38 | 39 | 40 | def run(): 41 | """ 42 | run function which is the start point of program 43 | """ 44 | for line in sys.stdin: 45 | line = line.rstrip('\r\n') 46 | if not WORD_ID_PTN.match(line): 47 | print(line) 48 | continue 49 | wid, word, morph = line.split('\t') 50 | print('{}\t{}\t{}'.format(wid, _norm(word), _norm(morph))) 51 | 52 | 53 | ######## 54 | # main # 55 | ######## 56 | def main(): 57 | """ 58 | main function processes only argument parsing 59 | """ 60 | parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환') 61 | parser.add_argument('--input', help='input file ', metavar='FILE') 62 | parser.add_argument('--output', help='output file ', metavar='FILE') 63 | parser.add_argument('--debug', help='enable debug', action='store_true') 64 | args = parser.parse_args() 65 | 66 | if args.input: 67 | sys.stdin = open(args.input, 'r', encoding='UTF-8') 68 | if args.output: 69 | sys.stdout = open(args.output, 'w', encoding='UTF-8') 70 | if args.debug: 71 | logging.basicConfig(level=logging.DEBUG) 72 | else: 73 | logging.basicConfig(level=logging.INFO) 74 | 75 | run() 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /munjong/detect_sejong_period_error.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | detect period error of Sejong corpus 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import logging 17 | import os 18 | import re 19 | import sys 20 | from typing import Iterator, TextIO, Tuple 21 | 22 | from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN 23 | 24 | 25 | ############# 26 | # functions # 27 | ############# 28 | def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]: 29 | """ 30 | get two lines tuple from file (generator) 31 | Args: 32 | fin: input file 33 | Yields: 34 | current line 35 | next line 36 | """ 37 | curr_line = fin.readline().rstrip('\r\n') 38 | for next_line in fin: 39 | next_line = next_line.rstrip('\r\n') 40 | yield curr_line, next_line 41 | curr_line = next_line 42 | 43 | 44 | def _is_correct_eos(line: str) -> bool: 45 | """ 46 | whether correct end of sentence or not 47 | Args: 48 | line: line (word) 49 | Returns: 50 | whether correct or not 51 | """ 52 | _, _, morphs_str = line.split('\t') 53 | if re.match(r'.+/EF \+ ./SF$', morphs_str): 54 | return True 55 | if re.match(r'.+/SF \+ [\'"’”」\]]/SS$', morphs_str): 56 | return True 57 | morphs = [Morph.parse(_) for _ in morphs_str.split(' + ')] 58 | tags_str = '+'.join([_.tag for _ in morphs]) 59 | if tags_str.endswith('+SF+SS+JKQ') or tags_str.endswith('+SF+SS+VCP+ETM'): 60 | return True 61 | return False 62 | 63 | 64 | def run(): 65 | """ 66 | run function which is the start point of program 67 | """ 68 | file_name = os.path.basename(sys.stdin.name) 69 | for line_num, (curr_line, next_line) in enumerate(_get_two_lines(sys.stdin), start=1): 70 | cols = curr_line.split('\t') 71 | if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]): 72 | continue 73 | if '/SF + ' not in cols[2] or not next_line.startswith('', metavar='FILE') 89 | parser.add_argument('--output', help='output file ', metavar='FILE') 90 | parser.add_argument('--debug', help='enable debug', action='store_true') 91 | args = parser.parse_args() 92 | 93 | if args.input: 94 | sys.stdin = open(args.input, 'rt') 95 | if args.output: 96 | sys.stdout = open(args.output, 'wt') 97 | if args.debug: 98 | logging.basicConfig(level=logging.DEBUG) 99 | else: 100 | logging.basicConfig(level=logging.INFO) 101 | 102 | run() 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /munjong/fix_final_symbol_error.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | fix final symbol errors on Sejong corpus 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import logging 17 | import os 18 | import sys 19 | 20 | from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN 21 | 22 | 23 | ############# 24 | # functions # 25 | ############# 26 | def _attach_missing_symbol(word: Word): 27 | """ 28 | attach missing symbol 29 | Args: 30 | word: Word object 31 | """ 32 | raw_word = word.raw 33 | raw_morph = ''.join([_.lex for _ in word.morphs]) 34 | if not raw_word.startswith(raw_morph) or len(raw_word) != len(raw_morph)+1: 35 | return 36 | last_symbol = raw_word[-1] 37 | if last_symbol == '.' and word.morphs[-1].tag == 'EC': 38 | word.morphs.append(Morph('.', 'SF')) 39 | elif last_symbol == ',': 40 | word.morphs.append(Morph(',', 'SP')) 41 | elif last_symbol == '"': 42 | word.morphs.append(Morph('"', 'SS')) 43 | 44 | 45 | def run(): 46 | """ 47 | run function which is the start point of program 48 | """ 49 | file_name = os.path.basename(sys.stdin.name) 50 | for line_num, line in enumerate(sys.stdin, start=1): 51 | line = line.rstrip('\r\n') 52 | if not WORD_ID_PTN.match(line): 53 | print(line) 54 | continue 55 | word = Word.parse(line, file_name, line_num) 56 | _attach_missing_symbol(word) 57 | print(word) 58 | 59 | 60 | ######## 61 | # main # 62 | ######## 63 | def main(): 64 | """ 65 | main function processes only argument parsing 66 | """ 67 | parser = ArgumentParser(description='fix final symbol errors on Sejong corpus') 68 | parser.add_argument('--input', help='input file ', metavar='FILE') 69 | parser.add_argument('--output', help='output file ', metavar='FILE') 70 | parser.add_argument('--debug', help='enable debug', action='store_true') 71 | args = parser.parse_args() 72 | 73 | if args.input: 74 | sys.stdin = open(args.input, 'rt') 75 | if args.output: 76 | sys.stdout = open(args.output, 'wt') 77 | if args.debug: 78 | logging.basicConfig(level=logging.DEBUG) 79 | else: 80 | logging.basicConfig(level=logging.INFO) 81 | 82 | run() 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /munjong/make_patch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | make patch from two Sejong corpora 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | import logging 17 | import os 18 | 19 | from khaiii.munjong import libpatch 20 | 21 | 22 | ############# 23 | # functions # 24 | ############# 25 | def run(args: Namespace): 26 | """ 27 | run function which is the start point of program 28 | Args: 29 | args: program arguments 30 | """ 31 | if not os.path.exists(args.patch): 32 | logging.info('creating patch dir: %s', args.patch) 33 | os.mkdir(args.patch) 34 | 35 | for name in sorted(os.listdir(args.original)): 36 | if not name.endswith('.txt'): 37 | continue 38 | org_path = '%s/%s' % (args.original, name) 39 | mod_path = '%s/%s' % (args.modified, name) 40 | patch_path = '%s/%s.patch' % (args.patch, name[:-len('.txt')]) 41 | logging.info('[%s] - [%s] = [%s]', org_path, mod_path, patch_path) 42 | patches = libpatch.make(org_path, args.org_enc, mod_path, args.mod_enc) 43 | if patches: 44 | logging.info('creating patch file: %s', patch_path) 45 | with open(patch_path, 'w', encoding='UTF-8') as fout: 46 | for patch in patches: 47 | print(patch, file=fout) 48 | elif os.path.exists(patch_path): 49 | logging.info('removing existing patch file: %s', patch_path) 50 | os.remove(patch_path) 51 | 52 | 53 | ######## 54 | # main # 55 | ######## 56 | def main(): 57 | """ 58 | main function processes only argument parsing 59 | """ 60 | parser = ArgumentParser(description='make patch from two Sejong corpora') 61 | parser.add_argument('-o', '--original', help='original corpus dir', metavar='DIR', 62 | required=True) 63 | parser.add_argument('-m', '--modified', help='modified corpus dir', metavar='DIR', 64 | required=True) 65 | parser.add_argument('-p', '--patch', help='patch output dir', metavar='DIR', required=True) 66 | parser.add_argument('--org-enc', help='original corpus encoding ', 67 | metavar='ENCODING', default='UTF-16') 68 | parser.add_argument('--mod-enc', help='modified corpus encoding ', 69 | metavar='ENCODING', default='UTF-8') 70 | parser.add_argument('--debug', help='enable debug', action='store_true') 71 | args = parser.parse_args() 72 | 73 | if args.debug: 74 | logging.basicConfig(level=logging.DEBUG) 75 | else: 76 | logging.basicConfig(level=logging.INFO) 77 | 78 | run(args) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /munjong/recover_english_case.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | recover cases of English letters in Sejong corpus 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import copy 17 | import logging 18 | import os 19 | import re 20 | import sys 21 | 22 | from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN 23 | 24 | 25 | ############# 26 | # functions # 27 | ############# 28 | def _recover(word: Word): 29 | """ 30 | recover cases 31 | Args: 32 | word: Word object 33 | """ 34 | word_letters = [_ for _ in word.raw if re.match(r'[a-zA-Z]', _)] 35 | letter_idx = -1 36 | is_recovered = False 37 | word_copy = copy.deepcopy(word) 38 | for morph in word_copy.morphs: 39 | for idx, char in enumerate(morph.lex): 40 | if not re.match(r'[a-zA-Z]', char): 41 | continue 42 | letter_idx += 1 43 | if word_letters[letter_idx] == char: 44 | continue 45 | morph.lex = morph.lex[:idx] + word_letters[letter_idx] + morph.lex[idx+1:] 46 | is_recovered = True 47 | if is_recovered: 48 | logging.info('%s => %s', str(word), word_copy.morph_str()) 49 | word.morphs = word_copy.morphs 50 | 51 | 52 | def run(): 53 | """ 54 | run function which is the start point of program 55 | """ 56 | file_name = os.path.basename(sys.stdin.name) 57 | for line_num, line in enumerate(sys.stdin, start=1): 58 | line = line.rstrip('\r\n') 59 | if not WORD_ID_PTN.match(line): 60 | print(line) 61 | continue 62 | word = Word.parse(line, file_name, line_num) 63 | try: 64 | _recover(word) 65 | except IndexError as idx_err: 66 | logging.error('%s(%d): %s: %s', file_name, line_num, idx_err, word) 67 | print(word) 68 | 69 | 70 | ######## 71 | # main # 72 | ######## 73 | def main(): 74 | """ 75 | main function processes only argument parsing 76 | """ 77 | parser = ArgumentParser(description='recover cases of English letters in Sejong corpus') 78 | parser.add_argument('--input', help='input file ', metavar='FILE') 79 | parser.add_argument('--output', help='output file ', metavar='FILE') 80 | parser.add_argument('--debug', help='enable debug', action='store_true') 81 | args = parser.parse_args() 82 | 83 | if args.input: 84 | sys.stdin = open(args.input, 'r', encoding='UTF-8') 85 | if args.output: 86 | sys.stdout = open(args.output, 'w', encoding='UTF-8') 87 | if args.debug: 88 | logging.basicConfig(level=logging.DEBUG) 89 | else: 90 | logging.basicConfig(level=logging.INFO) 91 | 92 | run() 93 | 94 | 95 | if __name__ == '__main__': 96 | main() 97 | -------------------------------------------------------------------------------- /munjong/recover_raw_morph_mismatch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | 어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우 원문의 문자로 복원 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import logging 17 | import os 18 | import sys 19 | 20 | from khaiii.munjong.sejong_corpus import Morph, ParseError, Word, WORD_ID_PTN 21 | 22 | 23 | ############# 24 | # functions # 25 | ############# 26 | def _recover(line: str) -> str: 27 | """ 28 | 문자를 복원한다. 29 | Args: 30 | line: 어절 라인 31 | Returns: 32 | 복원된 라인 33 | """ 34 | wid, raw, morphs_str = line.split('\t') 35 | raw_idx = 0 36 | morphs = [] 37 | for token_str in morphs_str.split(' + '): 38 | morph = Morph.parse(token_str) 39 | lex = [] 40 | for _ in range(len(morph.lex)): 41 | try: 42 | lex.append(raw[raw_idx]) 43 | raw_idx += 1 44 | except IndexError as idx_err: 45 | logging.error(line) 46 | raise idx_err 47 | morph.lex = ''.join(lex) 48 | morphs.append(morph) 49 | morphs_new = ' + '.join([str(m) for m in morphs]) 50 | logging.debug('%s\t%s\t%s => %s', wid, raw, morphs_str, morphs_new) 51 | return '{}\t{}\t{}'.format(wid, raw, morphs_new) 52 | 53 | 54 | def run(): 55 | """ 56 | run function which is the start point of program 57 | """ 58 | file_name = os.path.basename(sys.stdin.name) 59 | for line_num, line in enumerate(sys.stdin, start=1): 60 | line = line.rstrip('\r\n') 61 | if not WORD_ID_PTN.match(line): 62 | print(line) 63 | continue 64 | try: 65 | Word.parse(line, file_name, line_num) 66 | except ParseError as par_err: 67 | if 'raw-morph mismatch' in str(par_err): 68 | line = _recover(line) 69 | else: 70 | raise par_err 71 | print(line) 72 | 73 | 74 | ######## 75 | # main # 76 | ######## 77 | def main(): 78 | """ 79 | main function processes only argument parsing 80 | """ 81 | parser = ArgumentParser(description='어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우' 82 | ' 원문의 문자로 복원') 83 | parser.add_argument('--input', help='input file ', metavar='FILE') 84 | parser.add_argument('--output', help='output file ', metavar='FILE') 85 | parser.add_argument('--debug', help='enable debug', action='store_true') 86 | args = parser.parse_args() 87 | 88 | if args.input: 89 | sys.stdin = open(args.input, 'r', encoding='UTF-8') 90 | if args.output: 91 | sys.stdout = open(args.output, 'w', encoding='UTF-8') 92 | if args.debug: 93 | logging.basicConfig(level=logging.DEBUG) 94 | else: 95 | logging.basicConfig(level=logging.INFO) 96 | 97 | run() 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /munjong/recover_wide_quotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | recover wide char quotations in Sejong corpus 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import logging 17 | import os 18 | import sys 19 | 20 | from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN 21 | 22 | 23 | ############# 24 | # constants # 25 | ############# 26 | _QUOT_NORM = { 27 | '"': '"', 28 | '“': '"', 29 | '”': '"', 30 | "'": "'", 31 | "‘": "'", 32 | "’": "'", 33 | "`": "'", 34 | } 35 | 36 | 37 | ############# 38 | # functions # 39 | ############# 40 | def _recover(word: Word): 41 | """ 42 | recover wide char quotations 43 | Args: 44 | word: Word object 45 | """ 46 | word_quots = [_ for _ in word.raw if _ in _QUOT_NORM] 47 | morph_quots = [] 48 | for idx, morph in enumerate(word.morphs): 49 | if morph.tag != 'SS' or morph.lex not in _QUOT_NORM: 50 | continue 51 | morph_quots.append((idx, morph)) 52 | quot_idx = len(morph_quots)-1 53 | if len(word_quots) <= quot_idx or _QUOT_NORM[word_quots[quot_idx]] != _QUOT_NORM[morph.lex]: 54 | logging.error('%d-th quots are different: %s', quot_idx+1, word) 55 | return 56 | if len(word_quots) != len(morph_quots): 57 | morph_quots = [_ for _ in word.morph_str() if _ in _QUOT_NORM] 58 | if word_quots != morph_quots: 59 | logging.error('number of quots are different: %s', word) 60 | return 61 | for word_char, (idx, morph) in zip(word_quots, morph_quots): 62 | if word_char == morph.lex: 63 | continue 64 | morph.lex = word_char 65 | 66 | 67 | def run(): 68 | """ 69 | run function which is the start point of program 70 | """ 71 | file_name = os.path.basename(sys.stdin.name) 72 | for line_num, line in enumerate(sys.stdin, start=1): 73 | line = line.rstrip('\r\n') 74 | if not WORD_ID_PTN.match(line): 75 | print(line) 76 | continue 77 | word = Word.parse(line, file_name, line_num) 78 | _recover(word) 79 | print(word) 80 | 81 | 82 | ######## 83 | # main # 84 | ######## 85 | def main(): 86 | """ 87 | main function processes only argument parsing 88 | """ 89 | parser = ArgumentParser(description='recover wide char quotations in Sejong corpus') 90 | parser.add_argument('--input', help='input file ', metavar='FILE') 91 | parser.add_argument('--output', help='output file ', metavar='FILE') 92 | parser.add_argument('--debug', help='enable debug', action='store_true') 93 | args = parser.parse_args() 94 | 95 | if args.input: 96 | sys.stdin = open(args.input, 'rt') 97 | if args.output: 98 | sys.stdout = open(args.output, 'wt') 99 | if args.debug: 100 | logging.basicConfig(level=logging.DEBUG) 101 | else: 102 | logging.basicConfig(level=logging.INFO) 103 | 104 | run() 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /munjong/remove_sejong_period_error.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | remove wrong sentence breaking marks after period error eojeol 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2017-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser 16 | import logging 17 | import os 18 | import re 19 | import sys 20 | from typing import TextIO, Tuple 21 | 22 | from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN 23 | 24 | 25 | ############# 26 | # functions # 27 | ############# 28 | def _get_three_lines(fin: TextIO) -> Tuple[str, str, str]: 29 | """ 30 | get three lines tuple from file (generator) 31 | Args: 32 | fin: input file 33 | Yields: 34 | prev. prev. line 35 | prev. line 36 | curr. line 37 | """ 38 | prev_prev_line = fin.readline().rstrip('\r\n') 39 | prev_line = fin.readline().rstrip('\r\n') 40 | # print first two lines 41 | print(prev_prev_line) 42 | print(prev_line) 43 | for curr_line in fin: 44 | curr_line = curr_line.rstrip('\r\n') 45 | yield prev_prev_line, prev_line, curr_line 46 | prev_prev_line = prev_line 47 | prev_line = curr_line 48 | 49 | 50 | def _is_known_period_error_eojeol(line: str) -> bool: 51 | """ 52 | 알려진 특정 문장분리 오류를 포함하는 어절인 지 여부 53 | Args: 54 | line: line (eojeol) 55 | Returns: 56 | whether has error or not 57 | """ 58 | cols = line.split('\t') 59 | if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]): 60 | return False 61 | if '/SF + ' not in cols[2] or re.match(r'.+/EF \+ ./SF$', cols[2]): 62 | return False 63 | if re.match(r'.+/SF \+ [\'"’”]/SS$', cols[2]): 64 | return False 65 | morphs = [Morph.parse(_) for _ in cols[2].split(' + ')] 66 | tags_str = '+'.join([_.tag for _ in morphs]) 67 | if 'SN+SF+SN' in tags_str and not tags_str.endswith('+SF'): 68 | # 4.6판: 4/SN + ./SF + 6/SN + 판/NNB 69 | if 'XSN+SF+SN' not in tags_str: 70 | return True 71 | elif 'SL+SF+SL' in tags_str and not tags_str.endswith('+SF'): 72 | # S.M.오너: S/SL + ./SF + M/SL + ./SF + 오너/NNG 73 | return True 74 | return False 75 | 76 | 77 | def run(): 78 | """ 79 | run function which is the start point of program 80 | """ 81 | file_name = os.path.basename(sys.stdin.name) 82 | for line_num, (prev_prev_line, prev_line, curr_line) in enumerate(_get_three_lines(sys.stdin), 83 | start=1): 84 | if curr_line == '

' and _is_known_period_error_eojeol(prev_line): 85 | continue 86 | elif prev_line == '

' and curr_line == '

' and \ 87 | _is_known_period_error_eojeol(prev_prev_line): 88 | logging.info('%s:%d\t%s', file_name, line_num, prev_prev_line) 89 | continue 90 | print(curr_line) 91 | 92 | 93 | ######## 94 | # main # 95 | ######## 96 | def main(): 97 | """ 98 | main function processes only argument parsing 99 | """ 100 | parser = ArgumentParser(description='remove wrong sentence breaking marks after' 101 | ' period error eojeol') 102 | parser.add_argument('--input', help='input file ', metavar='FILE') 103 | parser.add_argument('--output', help='output file ', metavar='FILE') 104 | parser.add_argument('--debug', help='enable debug', action='store_true') 105 | args = parser.parse_args() 106 | 107 | if args.input: 108 | sys.stdin = open(args.input, 'rt') 109 | if args.output: 110 | sys.stdout = open(args.output, 'wt') 111 | if args.debug: 112 | logging.basicConfig(level=logging.DEBUG) 113 | else: 114 | logging.basicConfig(level=logging.INFO) 115 | 116 | run() 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cmake>=3.10 2 | -------------------------------------------------------------------------------- /rsc/Makefile: -------------------------------------------------------------------------------- 1 | HOME_DIR = . 2 | BIN_DIR = $(HOME_DIR)/bin 3 | SRC_PYTHON = $(HOME_DIR)/../src/main/python 4 | RSC_SRC = $(HOME_DIR)/src 5 | PREFIX = /usr/local 6 | RSC_DIR = $(PREFIX)/share/khaiii 7 | MODEL_SIZE = base 8 | 9 | MODEL = \ 10 | $(RSC_DIR)/config.json \ 11 | $(RSC_DIR)/embed.bin \ 12 | $(RSC_DIR)/conv.2.fil \ 13 | $(RSC_DIR)/conv.3.fil \ 14 | $(RSC_DIR)/conv.4.fil \ 15 | $(RSC_DIR)/conv.5.fil \ 16 | $(RSC_DIR)/cnv2hdn.lin \ 17 | $(RSC_DIR)/hdn2tag.lin 18 | 19 | RESTORE = \ 20 | $(RSC_DIR)/restore.key \ 21 | $(RSC_DIR)/restore.val \ 22 | $(RSC_DIR)/restore.one 23 | 24 | PREANAL = \ 25 | $(RSC_DIR)/preanal.tri \ 26 | $(RSC_DIR)/preanal.val 27 | 28 | ERRPATCH = \ 29 | $(RSC_DIR)/errpatch.tri \ 30 | $(RSC_DIR)/errpatch.val \ 31 | $(RSC_DIR)/errpatch.len 32 | 33 | all: $(MODEL) $(PREANAL) $(RESTORE) $(ERRPATCH) 34 | 35 | $(wordlist 2,100,$(MODEL)): $(firstword $(MODEL)) 36 | $(firstword $(MODEL)): $(RSC_SRC)/$(MODEL_SIZE).config.json $(RSC_SRC)/$(MODEL_SIZE).model.pickle 37 | mkdir -p $(RSC_DIR) 38 | PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_model.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) 39 | 40 | $(wordlist 2,100,$(PREANAL)): $(firstword $(PREANAL)) 41 | $(firstword $(PREANAL)): $(RSC_SRC)/preanal.auto $(RSC_SRC)/preanal.manual 42 | mkdir -p $(RSC_DIR) 43 | PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_preanal.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) 44 | 45 | $(wordlist 2,100,$(RESTORE)): $(firstword $(RESTORE)) 46 | $(firstword $(RESTORE)): $(RSC_SRC)/restore.dic $(RSC_SRC)/vocab.out $(RSC_SRC)/vocab.out.more 47 | mkdir -p $(RSC_DIR) 48 | PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_restore.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) 49 | 50 | $(wordlist 2,100,$(ERRPATCH)): $(firstword $(ERRPATCH)) 51 | $(firstword $(ERRPATCH)): $(RSC_SRC)/$(MODEL_SIZE).errpatch.auto $(RSC_SRC)/$(MODEL_SIZE).errpatch.manual 52 | mkdir -p $(RSC_DIR) 53 | PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_errpatch.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR) 54 | 55 | clean: 56 | rm -rf $(RSC_DIR) 57 | -------------------------------------------------------------------------------- /rsc/src/base.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cutoff": 1, 3 | "embed_dim": 35, 4 | "hidden_dim": 320, 5 | "model_id": "munjong.cut1.win4.sdo0.1.emb35.lr0.001.lrd0.9.bs500", 6 | "rsc_src": "../rsc/src", 7 | "window": 4 8 | } -------------------------------------------------------------------------------- /rsc/src/base.errpatch.manual: -------------------------------------------------------------------------------- 1 | # 아래 엔트리는 단위테스트에 사용되는 것으로 삭제하지 마시기 바랍니다. 2 | 지저스크라이스트 지저스크라이스/NNP + 트/NNG 지저스/NNP + 크라이스트/NNP 3 | 지저스 크라이스트 지저스/NNP + _ + 크라이스/NNP + 트/NNG 지저스/NNP + _ + 크라이스트/NNP 4 | 고타마싯다르타 | + 고타마싯다르타/NNP | + 고타마/NNP + 싯다르타/NNP 5 | 무함마드압둘라 무함마드압/NNP + 둘/NR + 라/NNP + | 무함마드/NNP + 압둘라/NNP + | 6 | -------------------------------------------------------------------------------- /rsc/src/base.model.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/rsc/src/base.model.pickle -------------------------------------------------------------------------------- /rsc/src/large.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cutoff": 1, 3 | "embed_dim": 180, 4 | "hidden_dim": 610, 5 | "model_id": "munjong.cut1.win4.sdo0.1.emb180.lr0.001.lrd0.9.bs500", 6 | "rsc_src": "../rsc/src", 7 | "window": 4 8 | } -------------------------------------------------------------------------------- /rsc/src/large.errpatch.auto: -------------------------------------------------------------------------------- 1 | 이름 석자 이름/NNG + _ + 석자/NNG 이름/NNG + _ + 석/MM + 자/NNG 2 | 채 썬다 채/MAG + _ + 썰/VV + ㄴ다/EF 채/NNG + _ + 썰/VV + ㄴ다/EF 3 | 중증급성호흡기증후군 중증/NNG + 급성호흡기/NNG + 증후군/NNG 중증/NNG + 급성/NNG + 호흡기/NNG + 증후군/NNG 4 | 한국교사휴양원 _ + 한국교사휴양/NNP + 원/NNG _ + 한국교사휴양원/NNP 5 | 모여 들 모이/VV + 어/EC + _ + 들/VV 모이/VV + 어/EC + _ + 들/VX 6 | 연탄가스 _ + 연탄/NNG + 가스/NNG _ + 연탄가스/NNG 7 | 실수요자 _ + 실수/NNG + 요자/NNG _ + 실수요자/NNG 8 | 너랑 나 너/NP + 랑/JKB + _ + 나/NP 너/NP + 랑/JC + _ + 나/NP 9 | 시아누크공 시아누크공/NNP 시아누크/NNP + 공/NNG 10 | 하리만치 하/XSA + 리만/EC + 치/MAG 하/XSA + 리만치/EC 11 | 그래선지 | + 그렇/VA + 어선지/EC | + 그렇/VA + 어서/EC + 이/VCP + ㄴ지/EC 12 | 대대적인 대대/NNG + 적/XSN + 이/VCP + ㄴ/ETM 대대적/NNG + 이/VCP + ㄴ/ETM 13 | 이른바 ` 이른/MAJ + 바/MAG + _ + `/SS 이른바/MAJ + _ + `/SS 14 | 산간벽지 산간/NNG + 벽지/NNG 산간벽지/NNG 15 | 미스 민을 미스/NNG + _ + 민/NNG + 을/JKO 미스/NNG + _ + 민/NNP + 을/JKO 16 | 무임승차 무임/NNG + 승차/NNG 무임승차/NNG 17 | 습니다그려. 습니다/EC + 그/JX + 려/IC + ./SF 습니다/EC + 그려/JX + ./SF 18 | 진두지휘 _ + 진두/NNG + 지휘/NNG _ + 진두지휘/NNG 19 | 한편 1997 한/MAG + 편/NNG + _ + 1997/SN 한편/NNG + _ + 1997/SN 20 | 한편 1997 | + 한/MAG + 편/NNG + _ + 1997/SN | + 한편/NNG + _ + 1997/SN 21 | 지식인이란 지식인/NNG + 이/VCP + 란/JX + _ 지식인/NNG + 이란/JX + _ 22 | 시험공부 시험/NNG + 공부/NNG 시험공부/NNG 23 | 중증급성호흡기 중증/NNG + 급성호흡기/NNG 중증/NNG + 급성/NNG + 호흡기/NNG 24 | 기념행사 기념/NNG + 행사/NNG 기념행사/NNG 25 | 그래선지 그렇/VA + 어선지/EC + _ 그렇/VA + 어서/EC + 이/VCP + ㄴ지/EC + _ 26 | 사 가지고 사/VV + 아/EC + _ + 가/VV + 지/VX + 고/EC 사/VV + 아/EC + _ + 가지/VX + 고/EC 27 | 한국교사휴양원 한국교사휴양/NNP + 원/NNG 한국교사휴양원/NNP 28 | 언어문화 언어/NNG + 문화/NNG 언어문화/NNG 29 | 간 쇠고기 가/VV + ㄴ/ETM + _ + 쇠고기/NNG 갈/VV + ㄴ/ETM + _ + 쇠고기/NNG 30 | 달래 주 달러/VV + 어/EC + _ + 주/VX 달래/VV + 어/EC + _ + 주/VX 31 | 기 일원론 기/NNG + _ + 일원/NNG + 론/XSN 기/NNG + _ + 일원론/NNG 32 | 돼지머리 돼지머리/NNG 돼지/NNG + 머리/NNG 33 | 제자리걸음 _ + 제자리/NNG + 걸음/NNG _ + 제자리걸음/NNG 34 | 전지훈련 전지/NNG + 훈련/NNG 전지훈련/NNG 35 | 진우 씬 진우/NNP + _ + 씬/NNG 진우/NNP + _ + 씨/NNB + ㄴ/JX 36 | 이 바람에 이/JKS + _ + 바/NNG + 람/NNB + 에/JKB 이/JKS + _ + 바람/NNG + 에/JKB 37 | 대대적인 _ + 대대/NNG + 적/XSN + 이/VCP + ㄴ/ETM _ + 대대적/NNG + 이/VCP + ㄴ/ETM 38 | 돼지머리 _ + 돼지머리/NNG _ + 돼지/NNG + 머리/NNG 39 | 반벌거숭이 _ + 반벌거숭이/NNG _ + 반/NNG + 벌거숭이/NNG 40 | 이나 있 이나/JX + _ + 있/VX 이나/JX + _ + 있/VV 41 | 도시가스 _ + 도시/NNG + 가스/NNG _ + 도시가스/NNG 42 | 그 반벌거숭이 그/MM + _ + 반벌거숭이/NNG 그/MM + _ + 반/NNG + 벌거숭이/NNG 43 | 제자리걸음 제자리/NNG + 걸음/NNG 제자리걸음/NNG 44 | 만나 보 만나/VV + 아/EC + _ + 보/VV 만나/VV + 아/EC + _ + 보/VX 45 | 세계정세 _ + 세계/NNG + 정세/NNG _ + 세계정세/NNG 46 | 가상공간 가상/NNG + 공간/NNG 가상공간/NNG 47 | 만병통치약 만병/NNG + 통치약/NNG 만병통치약/NNG 48 | 조선말기 _ + 조/NNP + 선말기/NNG _ + 조선/NNP + 말기/NNG 49 | 그래선지 그렇/VA + 어선지/EC 그렇/VA + 어서/EC + 이/VCP + ㄴ지/EC 50 | 해임건의안 해임/NNG + 건의/NNG + 안/NNG 해임/NNG + 건의안/NNG 51 | 생맥주집 생/XPN + 맥주집/NNG 생/XPN + 맥주/NNG + 집/NNG 52 | 다문화주의 _ + 다문화주의/NNG _ + 다문화/NNG + 주의/NNG 53 | 가족계획 가족/NNG + 계획/NNG 가족계획/NNG 54 | 세대교체 세대/NNG + 교체/NNG 세대교체/NNG 55 | 물항아리 물항아리/NNG 물/NNG + 항아리/NNG 56 | 비평용어 _ + 비평용어/NNG _ + 비평/NNG + 용어/NNG 57 | 반벌거숭이 반벌거숭이/NNG 반/NNG + 벌거숭이/NNG 58 | 수사본부 수사/NNG + 본부/NNG 수사본부/NNG 59 | 전기난로 전기난로/NNG 전기/NNG + 난로/NNG 60 | 원상회복 원상/NNG + 회복/NNG 원상회복/NNG 61 | 베이지색 _ + 베이지색/NNG + _ _ + 베이지/NNG + 색/NNG + _ 62 | 이 바람 이/JKS + _ + 바/NNG + 람/NNB 이/JKS + _ + 바람/NNG 63 | 시기상조 _ + 시기/NNG + 상조/NNG _ + 시기상조/NNG 64 | 하리만치 하/XSA + 리만/EC + 치/MAG + _ 하/XSA + 리만치/EC + _ 65 | 원상회복 _ + 원상/NNG + 회복/NNG _ + 원상회복/NNG 66 | 수공예품 수공/NNG + 예품/NNG 수공예품/NNG 67 | 베이지색 베이지색/NNG 베이지/NNG + 색/NNG 68 | 신용보증기금 신/NNG + 용보증기금/NNP 신용보증기금/NNP 69 | 도시가스 도시/NNG + 가스/NNG 도시가스/NNG 70 | 가상공간 _ + 가상/NNG + 공간/NNG _ + 가상공간/NNG 71 | 학력고사 학력/NNG + 고사/NNG 학력고사/NNG 72 | 사 가지 사/VV + 아/EC + _ + 가/VV + 지/VX 사/VV + 아/EC + _ + 가지/VX 73 | 시기상조 시기/NNG + 상조/NNG 시기상조/NNG 74 | 슬기슬기 슬기슬기/NNG 슬기/NNG + 슬기/NNG 75 | 전기난로 _ + 전기난로/NNG _ + 전기/NNG + 난로/NNG 76 | 동물학자 _ + 동물/NNG + 학자/NNG _ + 동물학자/NNG 77 | 오리고기 오리고기/NNG 오리/NNG + 고기/NNG 78 | 슬기슬기 _ + 슬기슬기/NNG _ + 슬기/NNG + 슬기/NNG 79 | 가족계획 _ + 가족/NNG + 계획/NNG _ + 가족계획/NNG 80 | 위기관리 _ + 위기/NNG + 관리/NNG _ + 위기관리/NNG 81 | 전지훈련 _ + 전지/NNG + 훈련/NNG _ + 전지훈련/NNG 82 | 습니다그려 습니다/EC + 그/JX + 려/IC 습니다/EC + 그려/JX 83 | 비평용어 비평용어/NNG 비평/NNG + 용어/NNG 84 | 지식인이란 지식인/NNG + 이/VCP + 란/JX 지식인/NNG + 이란/JX 85 | 동물학자 동물/NNG + 학자/NNG 동물학자/NNG 86 | 예술가촌 예술가촌/NNG 예술가/NNG + 촌/NNG 87 | 베이지색 베이지색/NNG + _ 베이지/NNG + 색/NNG + _ 88 | 가 주는 가/JKS + _ + 주/VX + 는/ETM 가/JKS + _ + 주/VV + 는/ETM 89 | 담임교사 _ + 담임/NNG + 교사/NNG _ + 담임교사/NNG 90 | 네덜란드인 네/NNP + 덜란드인/NNG 네덜란드인/NNG 91 | 선불카드 선불/NNG + 카드/NNG 선불카드/NNG 92 | 다문화주의 다문화주의/NNG 다문화/NNG + 주의/NNG 93 | 어인 일 어/NNG + 이/VV + ㄴ/ETM + _ + 일/NNG 어인/MM + _ + 일/NNG 94 | 조선말기 조/NNP + 선말기/NNG 조선/NNP + 말기/NNG 95 | 진두지휘 진두/NNG + 지휘/NNG 진두지휘/NNG 96 | 베이지색 _ + 베이지색/NNG _ + 베이지/NNG + 색/NNG 97 | 개인연금 개인/NNG + 연금/NNG 개인연금/NNG 98 | 위기관리 위기/NNG + 관리/NNG 위기관리/NNG 99 | , 대파 ,/SP + _ + 대파/NNG ,/SP + _ + 대/XPN + 파/NNG 100 | 연탄가스 연탄/NNG + 가스/NNG 연탄가스/NNG 101 | 50퍼센트 50/SN + 퍼센/NNG + 트/NNB 50/SN + 퍼센트/NNG 102 | 담임교사 담임/NNG + 교사/NNG 담임교사/NNG 103 | 개인연금 _ + 개인/NNG + 연금/NNG _ + 개인연금/NNG 104 | 전문학교 전문/NNG + 학교/NNG 전문학교/NNG 105 | 기념행사 _ + 기념/NNG + 행사/NNG _ + 기념행사/NNG 106 | 실수요자 실수/NNG + 요자/NNG 실수요자/NNG 107 | 세계정세 세계/NNG + 정세/NNG 세계정세/NNG 108 | 아씨마님 아씨마님/NNG 아씨/NNG + 마님/NNG 109 | 미스 민 미스/NNG + _ + 민/NNG 미스/NNG + _ + 민/NNP 110 | 통신업체 통신/NNG + 업체/NNG 통신업체/NNG 111 | 소강상태 소강/NNG + 상태/NNG 소강상태/NNG 112 | -------------------------------------------------------------------------------- /rsc/src/large.errpatch.manual: -------------------------------------------------------------------------------- 1 | # 아래 엔트리는 단위테스트에 사용되는 것으로 삭제하지 마시기 바랍니다. 2 | 지저스크라이스트 지/NNG + 저스크라이스/NNP + 트/NNG 지저스/NNP + 크라이스트/NNP 3 | 지저스 크라이스트 지저스/NNP + _ + 크라이스/NNP + 트/NNG 지저스/NNP + _ + 크라이스트/NNP 4 | 고타마싯다르타 | + 고타마싯다르타/NNP | + 고타마/NNP + 싯다르타/NNP 5 | 무함마드압둘라 무함마드압둘라/NNP + | 무함마드/NNP + 압둘라/NNP + | 6 | -------------------------------------------------------------------------------- /rsc/src/large.model.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/rsc/src/large.model.pickle -------------------------------------------------------------------------------- /rsc/src/preanal.manual: -------------------------------------------------------------------------------- 1 | # 아래 두 엔트리는 단위테스트에 사용되는 것으로 삭제하지 마시기 바랍니다. 2 | 이더리움 이더리움/NNG 3 | 가즈아* 가/VV + 즈아/EC 4 | -------------------------------------------------------------------------------- /rsc/src/vocab.out.more: -------------------------------------------------------------------------------- 1 | I-SS:I-MAG:0 2 | I-SS:I-VCP:0 3 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Config.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Config.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | 15 | #include "fmt/format.h" 16 | #include "nlohmann/json.hpp" 17 | 18 | #include "khaiii/KhaiiiApi.hpp" 19 | 20 | 21 | namespace khaiii { 22 | 23 | 24 | using std::exception; 25 | using std::ifstream; 26 | using std::make_shared; 27 | using std::shared_ptr; 28 | using std::string; 29 | 30 | 31 | ///////////// 32 | // methods // 33 | ///////////// 34 | void Config::read_from_file(string path) { 35 | try { 36 | ifstream ifs(path); 37 | nlohmann::json jsn; 38 | ifs >> jsn; 39 | set_members(jsn); 40 | } catch (const exception& exc) { 41 | throw Except(fmt::format("fail to parse config: {}", exc.what())); 42 | } 43 | } 44 | 45 | 46 | void Config::override_from_str(const char* opt_str) { 47 | if (opt_str == nullptr || opt_str[0] == '\0') return; 48 | 49 | try { 50 | auto jsn = nlohmann::json::parse(opt_str); 51 | override_members(jsn); 52 | } catch (const exception& exc) { 53 | throw Except(fmt::format("fail to parse option: {}\n{}", exc.what(), opt_str)); 54 | } 55 | } 56 | 57 | 58 | Config* Config::copy_and_override(const char* opt_str) { 59 | if (opt_str == nullptr || opt_str[0] == '\0') return this; 60 | 61 | auto found = _cfg_cache.find(opt_str); 62 | if (found != _cfg_cache.end()) return found->second.get(); 63 | 64 | auto cfg = copy(); 65 | try { 66 | auto jsn = nlohmann::json::parse(opt_str); 67 | cfg->override_members(jsn); 68 | _cfg_cache[opt_str] = cfg; 69 | } catch (const exception& exc) { 70 | throw Except(fmt::format("fail to parse option: {}\n{}", exc.what(), opt_str)); 71 | } 72 | 73 | return cfg.get(); 74 | } 75 | 76 | 77 | void Config::set_members(const nlohmann::json& jsn) { 78 | class_num = jsn.value("class_num", class_num); 79 | if (class_num <= 0) throw Except(fmt::format("invalid 'class_num' value: {}", class_num)); 80 | 81 | embed_dim = jsn.value("embed_dim", embed_dim); 82 | if (embed_dim <= 0) throw Except(fmt::format("invalid 'embed_dim' value: {}", embed_dim)); 83 | 84 | hidden_dim = jsn.value("hidden_dim", hidden_dim); 85 | if (hidden_dim <= 0) throw Except(fmt::format("invalid 'hidden_dim' value: {}", hidden_dim)); 86 | 87 | vocab_size = jsn.value("vocab_size", vocab_size); 88 | if (vocab_size <= 0) throw Except(fmt::format("invalid 'vocab_size' value: {}", vocab_size)); 89 | 90 | window = jsn.value("window", window); 91 | if (window <= 0) throw Except(fmt::format("invalid 'window' value: {}", window)); 92 | 93 | override_members(jsn); 94 | } 95 | 96 | void Config::override_members(const nlohmann::json& jsn) { 97 | preanal = jsn.value("preanal", preanal); 98 | errpatch = jsn.value("errpatch", errpatch); 99 | restore = jsn.value("restore", restore); 100 | } 101 | 102 | shared_ptr Config::copy() { 103 | auto that = make_shared(); 104 | that->class_num = class_num; 105 | that->embed_dim = embed_dim; 106 | that->hidden_dim = hidden_dim; 107 | that->vocab_size = vocab_size; 108 | that->window = window; 109 | that->preanal = preanal; 110 | that->errpatch = errpatch; 111 | that->restore = restore; 112 | return that; 113 | } 114 | 115 | 116 | } // namespace khaiii 117 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Config.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_CONFIG_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_CONFIG_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "nlohmann/json.hpp" 19 | 20 | 21 | namespace khaiii { 22 | 23 | 24 | /** 25 | * JSON format configuration file 26 | */ 27 | class Config { 28 | public: 29 | int class_num = -1; ///< number of classes 30 | int embed_dim = -1; ///< embedding dimension 31 | int hidden_dim = -1; ///< hidden dimension 32 | int vocab_size = -1; ///< vocabulary size 33 | int window = -1; ///< context window size 34 | 35 | bool preanal = true; ///< whether apply preanal or not 36 | bool errpatch = true; ///< whether apply error patch or not 37 | bool restore = true; ///< whether restore morphemes or not 38 | 39 | Config() = default; 40 | Config(const Config&) = delete; ///< delete copy constructor 41 | Config& operator=(const Config&) = delete; ///< delete assignment operator 42 | 43 | /** 44 | * 파일로부터 설정을 읽어들인다. 45 | * @param path file path 46 | */ 47 | void read_from_file(std::string path); 48 | 49 | /** 50 | * JSON 옵션을 이용해 설정을 override 한다. 51 | * @param opt_str option string (JSON format) 52 | */ 53 | void override_from_str(const char* opt_str); 54 | 55 | /** 56 | * 객체를 복사하고 설정을 override 한다. 57 | * @param opt_str option string (JSON format) 58 | * @return 존재할 경우 그 옵션 객체 59 | */ 60 | Config* copy_and_override(const char* opt_str); 61 | 62 | /** 63 | * 파싱된 JSON 객체를 이용해서 멤버를 세팅한다. 64 | * @param jsn JSON 객체 65 | */ 66 | void set_members(const nlohmann::json& jsn); 67 | 68 | /** 69 | * 파싱된 JSON 객체를 이용해서 오버라이딩할 멤버만 세팅한다. 70 | * @param jsn JSON 객체 71 | */ 72 | void override_members(const nlohmann::json& jsn); 73 | 74 | /** 75 | * 자기 자신을 복사한 객체를 생성한다. 76 | * @return 복사된 객체 77 | */ 78 | std::shared_ptr copy(); 79 | 80 | private: 81 | /** 82 | * 오버라이딩된 객체의 캐시 83 | */ 84 | std::unordered_map> _cfg_cache; 85 | }; 86 | 87 | 88 | } // namespace khaiii 89 | 90 | 91 | #endif // SRC_MAIN_CPP_KHAIII_CONFIG_HPP_ 92 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Embed.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Embed.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | 16 | #include "khaiii/Config.hpp" 17 | #ifndef NDEBUG 18 | #include "khaiii/util.hpp" 19 | #endif 20 | 21 | 22 | namespace khaiii { 23 | 24 | 25 | using std::make_shared; 26 | using std::shared_ptr; 27 | using std::string; 28 | 29 | 30 | //////////////////// 31 | // static members // 32 | //////////////////// 33 | shared_ptr Embed::_log = spdlog::stderr_color_mt("Embed"); 34 | 35 | 36 | ///////////// 37 | // methods // 38 | ///////////// 39 | void Embed::open(const Config& cfg, string dir) { 40 | _embed_mmf.open(fmt::format("{}/embed.bin", dir)); 41 | _keys = reinterpret_cast(_embed_mmf.data()); 42 | const float* val_start = reinterpret_cast(_keys + cfg.vocab_size); 43 | for (int i = 0; i < cfg.vocab_size; ++i) { 44 | const float* embed_start = val_start + i * cfg.embed_dim; 45 | _vals.emplace_back(embedding_t(const_cast(embed_start), cfg.embed_dim)); 46 | SPDLOG_TRACE(_log, "[{}] {}", i, _vals[i]); 47 | } 48 | } 49 | 50 | 51 | void Embed::close() { 52 | _embed_mmf.close(); 53 | } 54 | 55 | 56 | const embedding_t& Embed::operator[](wchar_t chr) const { 57 | const wchar_t* found = reinterpret_cast( 58 | bsearch(&chr, _keys, _vals.size(), sizeof(wchar_t), Embed::_key_cmp)); 59 | int idx = 1; // unknown character index is 1 60 | if (found != nullptr) idx = found - _keys; 61 | #ifndef NDEBUG 62 | wchar_t wstr[2] = {chr, 0}; 63 | SPDLOG_TRACE(_log, "'{}'({}) {}", wstr_to_utf8(wstr), idx, _vals.at(idx)); 64 | #endif 65 | return _vals.at(idx); 66 | } 67 | 68 | 69 | const embedding_t& Embed::left_word_bound() const { 70 | return _vals.at(2); 71 | } 72 | 73 | 74 | const embedding_t& Embed::right_word_bound() const { 75 | return _vals.at(3); 76 | } 77 | 78 | 79 | const embedding_t& Embed::left_padding() const { 80 | return _vals.at(0); // padding index is 0 which is zero vector 81 | } 82 | 83 | 84 | const embedding_t& Embed::right_padding() const { 85 | return _vals.at(0); // padding index is 0 which is zero vector 86 | } 87 | 88 | 89 | int Embed::_key_cmp(const void* left, const void* right) { 90 | const wchar_t* left_ = reinterpret_cast(left); 91 | const wchar_t* right_ = reinterpret_cast(right); 92 | return *left_ - *right_; 93 | } 94 | 95 | 96 | } // namespace khaiii 97 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Embed.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_EMBED_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_EMBED_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "Eigen/Dense" 19 | #include "spdlog/spdlog.h" 20 | 21 | #include "khaiii/MemMapFile.hpp" 22 | #include "khaiii/nn/tensor.hpp" 23 | 24 | 25 | namespace khaiii { 26 | 27 | 28 | using embedding_t = nn::vector_map_t; 29 | class Config; 30 | 31 | 32 | class Embed { 33 | public: 34 | /** 35 | * open resource with memory data 36 | * @param cfg config 37 | * @param dir base directory 38 | */ 39 | void open(const Config& cfg, std::string dir); 40 | 41 | void close(); ///< 리소스를 닫는다. 42 | 43 | /** 44 | * get embedding vector with character 45 | * @param chr character 46 | * @return embedding vector 47 | */ 48 | const embedding_t& operator[](wchar_t chr) const; 49 | 50 | const embedding_t& left_word_bound() const; ///< left word bound 51 | const embedding_t& right_word_bound() const; ///< right word bound 52 | const embedding_t& left_padding() const; ///< left padding 53 | const embedding_t& right_padding() const; ///< right padding 54 | 55 | private: 56 | static std::shared_ptr _log; ///< logger 57 | 58 | const wchar_t* _keys = nullptr; ///< keys (characters) 59 | std::vector _vals; ///< values (embedding vectors) 60 | 61 | static int _key_cmp(const void* left, const void* right); ///< key comparator for bsearch 62 | 63 | MemMapFile _embed_mmf; ///< model embedding memory mapping 64 | }; 65 | 66 | 67 | } // namespace khaiii 68 | 69 | 70 | #endif // SRC_MAIN_CPP_KHAIII_EMBED_HPP_ 71 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/ErrPatch.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/ErrPatch.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | #include 16 | 17 | #include "khaiii/KhaiiiApi.hpp" 18 | #include "khaiii/Sentence.hpp" 19 | #include "khaiii/Word.hpp" 20 | 21 | 22 | namespace khaiii { 23 | 24 | 25 | using std::dynamic_pointer_cast; 26 | using std::exception; 27 | using std::shared_ptr; 28 | using std::string; 29 | using std::vector; 30 | 31 | 32 | //////////////////// 33 | // static members // 34 | //////////////////// 35 | const wchar_t ErrPatch::WORD_DELIM_NUM = -1; 36 | const wchar_t ErrPatch::SENT_DELIM_NUM = -2; 37 | 38 | shared_ptr ErrPatch::_log = spdlog::stderr_color_mt("ErrPatch"); 39 | 40 | 41 | //////////////////// 42 | // ctors and dtor // 43 | //////////////////// 44 | ErrPatch::~ErrPatch() { 45 | close(); 46 | } 47 | 48 | 49 | ///////////// 50 | // methods // 51 | ///////////// 52 | void ErrPatch::open(string dir) { 53 | _trie.open(dir + "/errpatch.tri"); 54 | _val_mmf.open(dir + "/errpatch.val"); 55 | MemMapFile len_mmf; 56 | len_mmf.open(dir + "/errpatch.len"); // 각 value들의 길이 정보 57 | _vals.reserve(len_mmf.size()); 58 | const uint8_t* lens = len_mmf.data(); 59 | const int16_t* val_ptr = _val_mmf.data(); 60 | for (int i = 0; i < len_mmf.size(); ++i) { 61 | // 길이 정보를 이용하여 int16_t 가변길이 배열인 값(_vals)을 세팅한다. 62 | _vals.emplace_back(val_ptr); 63 | val_ptr += lens[i] + 1; // 길이 + 마지막 0 64 | } 65 | assert(_vals.size() == len_mmf.size()); 66 | assert(val_ptr - _val_mmf.data() == _val_mmf.size()); 67 | _log->info("errpatch dictionary opened"); 68 | } 69 | 70 | 71 | void ErrPatch::close() { 72 | _trie.close(); 73 | _val_mmf.close(); 74 | _log->debug("errpatch dictionary closed"); 75 | } 76 | 77 | 78 | void ErrPatch::apply(shared_ptr sent) const { 79 | vector outputs; // 매칭된 패치의 정분석 결과 태그 값을 덮어쓸 출력 위치 80 | vector chars = _get_char_tag_mixes(sent, &outputs); 81 | for (int i = 0; i < chars.size(); ++i) { 82 | auto found = _trie.search_longest_prefix_match(&chars[i]); 83 | if (found == boost::none) continue; 84 | auto val = _vals[found->val]; 85 | for (int j = 0; j < found->len; ++j) { 86 | if (outputs[i + j] == nullptr) { 87 | assert(val[j] == WORD_DELIM_NUM || val[j] == SENT_DELIM_NUM); 88 | continue; 89 | } 90 | *outputs[i + j] = val[j]; 91 | } 92 | i += found->len - 1; 93 | } 94 | } 95 | 96 | 97 | vector ErrPatch::_get_char_tag_mixes(shared_ptr sent, 98 | vector* outputs) { 99 | vector chars; 100 | chars.reserve(2 + 2 * sent->words.size()); 101 | outputs->reserve(2 + 2 * sent->words.size()); 102 | chars.emplace_back(SENT_DELIM_NUM); // 문장 경계 103 | outputs->emplace_back(nullptr); 104 | for (auto& word : sent->words) { 105 | if (chars.size() > 1) { 106 | chars.emplace_back(WORD_DELIM_NUM); // 어절 경계 107 | outputs->emplace_back(nullptr); 108 | } 109 | for (int i = 0; i < word->wlength; ++i) { 110 | wchar_t char_tag_mix = (word->wbegin[i] << 12) | word->char_tags[i]; 111 | _log->debug("{:5x}|{:3x} -> {:08x}", static_cast(word->wbegin[i]), 112 | word->char_tags[i], static_cast(char_tag_mix)); 113 | chars.emplace_back(char_tag_mix); 114 | outputs->emplace_back(&word->char_tags[i]); 115 | } 116 | } 117 | chars.emplace_back(SENT_DELIM_NUM); // 문장 경계 118 | outputs->emplace_back(nullptr); 119 | chars.emplace_back(0); // 마지막 string termination 120 | outputs->emplace_back(nullptr); 121 | return chars; 122 | } 123 | 124 | 125 | } // namespace khaiii 126 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/ErrPatch.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_ERRPATCH_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_ERRPATCH_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "spdlog/spdlog.h" 19 | 20 | #include "khaiii/MemMapFile.hpp" 21 | #include "khaiii/Trie.hpp" 22 | 23 | 24 | namespace khaiii { 25 | 26 | 27 | class Sentence; 28 | 29 | 30 | class ErrPatch { 31 | public: 32 | static const wchar_t WORD_DELIM_NUM; ///< 어절 경계를 나타내는 가상 음절 33 | static const wchar_t SENT_DELIM_NUM; ///< 문장 경계를 나타내는 가상 음절 34 | 35 | virtual ~ErrPatch(); ///< dtor 36 | 37 | /** 38 | * 리소스를 연다. 39 | * @param dir 리소스 디렉토리 40 | */ 41 | void open(std::string dir); 42 | 43 | void close(); ///< 리소스를 닫는다. 44 | 45 | /** 46 | * 오분석 패치를 적용한다. 47 | * @param sent 문장 48 | */ 49 | void apply(std::shared_ptr sent) const; 50 | 51 | private: 52 | static std::shared_ptr _log; ///< logger 53 | 54 | Trie _trie; 55 | MemMapFile _val_mmf; ///< value memory mapping 56 | std::vector _vals; ///< actual values 57 | 58 | /** 59 | * 문장을 Trie 입력에 맞도록 음절과 태그의 비트 조합의 열로 만들고, 출력 위치를 기록한다. 60 | * @param sent 문장 61 | * @param outputs 출력 위치 62 | * @return 음절과 태그의 비트 조합한 열 63 | */ 64 | static std::vector _get_char_tag_mixes(std::shared_ptr sent, 65 | std::vector* outputs); 66 | }; 67 | 68 | 69 | } // namespace khaiii 70 | 71 | 72 | #endif // SRC_MAIN_CPP_KHAIII_ERRPATCH_HPP_ 73 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/KhaiiiImpl.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_KHAIIIIMPL_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_KHAIIIIMPL_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | #include // NOLINT 18 | #include 19 | #include 20 | 21 | #include "spdlog/spdlog.h" 22 | 23 | #include "khaiii/Config.hpp" 24 | #include "khaiii/KhaiiiApi.hpp" 25 | #include "khaiii/Resource.hpp" 26 | 27 | 28 | namespace khaiii { 29 | 30 | 31 | class Sentence; 32 | 33 | 34 | /** 35 | * implementation of khaiii API 36 | */ 37 | class KhaiiiImpl: public KhaiiiApi { 38 | public: 39 | virtual ~KhaiiiImpl(); ///< dtor 40 | 41 | void open(std::string rsc_dir = "", std::string opt_str = "") override; 42 | 43 | const khaiii_word_t* analyze(const char* input, const char* opt_str) override; 44 | 45 | /** 46 | * 분석을 수행하고 오분석 패치를 실행하기 직전에 멈춘 다음 그 결과를 리턴한다. 47 | * @param input input text 48 | * @param output output value for each character 49 | * @param opt_str runtime option (JSON format) 50 | * @return output length. -1 if failed 51 | */ 52 | int analyze_bfr_errpatch(const char* input, const char* opt_str, int16_t* output); 53 | 54 | void free_results(const khaiii_word_t* results) override; 55 | 56 | void close() override; 57 | 58 | /** 59 | * get mutex for this api object 60 | * @return mutex 61 | */ 62 | std::recursive_mutex& get_mutex(); 63 | 64 | /** 65 | * set error message 66 | * @param message 67 | */ 68 | void set_err_msg(std::string msg); 69 | 70 | /** 71 | * get error message 72 | * @return message 73 | */ 74 | const char* get_err_msg() const; 75 | 76 | /** 77 | * 로그 레벨을 지정한다. 78 | * @param name 로거 이름. "all"인 경우 모든 로거 79 | * @param level 로거 레벨. trace, debug, info, warn, err, critical 80 | */ 81 | static void set_log_level(std::string name, std::string level); 82 | 83 | /** 84 | * 여러 로그 레벨을 한꺼번에 지정한다. 85 | * @param name_level_pairs 로거 (이름, 레벨) 쌍의 리스트. 86 | * "all:warn,console:info,Tagger:debug"와 같은 형식 87 | */ 88 | static void set_log_levels(std::string name_level_pairs); 89 | 90 | 91 | private: 92 | static std::shared_ptr _log; ///< logger 93 | 94 | std::recursive_mutex _mutex; ///< mutex to access exclusively 95 | bool _is_opened = false; ///< handle is opened 96 | std::string _err_msg; ///< last error message 97 | 98 | Config _cfg; ///< config 99 | Resource _rsc; ///< resource 100 | 101 | // 분석 결과를 C API에 넘겨주고 참조 카운트가 0이 되어 메모리에서 해제되는 것을 방지하기 위해, 102 | // 헤드 어절을 키로 하여 문장 객체 전체를 임시로 넣어두는 보관소 103 | std::map> _result_cloakroom; 104 | 105 | /** 106 | * 보관소에 결과를 맡긴다. 107 | * @param sent 문장 108 | * @return 첫번째 어절의 포인터 109 | */ 110 | const khaiii_word_t* _deposit_sent(std::shared_ptr sent); 111 | 112 | /** 113 | * 보관하던 결과를 삭제한다. 114 | * @param head_word 첫번째 어절의 포인터 115 | */ 116 | void _withdraw_sent(const khaiii_word_t* head_word); 117 | 118 | /** 119 | * 리소스 디렉토리를 점검한다. 120 | * @param rsc_dir resource directory 121 | * @return 존재하는 디렉토리 경로 122 | */ 123 | std::string _check_rsc_dir(std::string rsc_dir); 124 | }; 125 | 126 | 127 | } // namespace khaiii 128 | 129 | 130 | #endif // SRC_MAIN_CPP_KHAIII_KHAIIIIMPL_HPP_ 131 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/MemMapFile.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_MEMMAPFILE_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_MEMMAPFILE_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "fmt/format.h" 21 | 22 | #include "khaiii/KhaiiiApi.hpp" 23 | 24 | 25 | namespace khaiii { 26 | 27 | 28 | /** 29 | * memory mapped file 30 | */ 31 | template 32 | class MemMapFile { 33 | public: 34 | /** 35 | * dtor 36 | */ 37 | virtual ~MemMapFile() { 38 | close(); 39 | } 40 | 41 | /** 42 | * open memory mapped file 43 | * @param path path 44 | */ 45 | void open(std::string path) { 46 | close(); 47 | int fd = ::open(path.c_str(), O_RDONLY, 0660); 48 | if (fd == -1) throw Except(fmt::format("fail to open file: {}", path)); 49 | std::ifstream fin(path, std::ifstream::ate | std::ifstream::binary); 50 | _byte_len = fin.tellg(); 51 | if (_byte_len == -1) throw Except(fmt::format("fail to get size of file: {}", path)); 52 | assert(_byte_len % sizeof(T) == 0); 53 | _data = reinterpret_cast(::mmap(0, _byte_len, PROT_READ, MAP_SHARED, fd, 0)); 54 | ::close(fd); 55 | if (_data == MAP_FAILED) { 56 | throw Except(fmt::format("fail to map file to memory: {}", path)); 57 | } 58 | _path = path; 59 | } 60 | 61 | /** 62 | * close memory mapped file 63 | */ 64 | void close() { 65 | if (_data) { 66 | if (::munmap(const_cast(_data), _byte_len) == -1) { 67 | throw Except(fmt::format("fail to close memory mapped file: {}", _path)); 68 | } 69 | } 70 | _path = ""; 71 | _data = nullptr; 72 | _byte_len = -1; 73 | } 74 | 75 | /** 76 | * get pointer of data 77 | * @return start address of data 78 | */ 79 | const T* data() const { 80 | assert(_data != nullptr && _byte_len >= sizeof(T)); 81 | return _data; 82 | } 83 | 84 | /** 85 | * get data size 86 | * @return number of data elements (not byte length) 87 | */ 88 | int size() const { 89 | assert(_data != nullptr && _byte_len >= sizeof(T)); 90 | return _byte_len / sizeof(T); 91 | } 92 | 93 | private: 94 | std::string _path; ///< file path 95 | const T* _data = nullptr; ///< memory data 96 | int _byte_len = -1; ///< byte length 97 | }; 98 | 99 | 100 | } // namespace khaiii 101 | 102 | 103 | #endif // SRC_MAIN_CPP_KHAIII_MEMMAPFILE_HPP_ 104 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Morph.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Morph.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | #include 16 | 17 | #include "khaiii/util.hpp" 18 | 19 | 20 | namespace khaiii { 21 | 22 | 23 | using std::string; 24 | using std::vector; 25 | using std::wstring; 26 | using std::wstringstream; 27 | 28 | 29 | /////////////// 30 | // variables // 31 | /////////////// 32 | static const char* _TAG_SET[POS_TAG_SIZE] = { 33 | "EC", "EF", "EP", "ETM", "ETN", "IC", "JC", "JKB", "JKC", "JKG", 34 | "JKO", "JKQ", "JKS", "JKV", "JX", "MAG", "MAJ", "MM", "NNB", "NNG", 35 | "NNP", "NP", "NR", "SE", "SF", "SH", "SL", "SN", "SO", "SP", 36 | "SS", "SW", "SWK", "VA", "VCN", "VCP", "VV", "VX", "XPN", "XR", 37 | "XSA", "XSN", "XSV", "ZN", "ZV", "ZZ", 38 | }; 39 | 40 | 41 | //////////////////// 42 | // ctors and dtor // 43 | //////////////////// 44 | Morph::Morph(wstring wlex, pos_tag_t tag, const wchar_t* wbegin, int wlength) 45 | : wlex(wlex), wbegin(wbegin), wlength(wlength), _lex(wstr_to_utf8(wlex)) { 46 | lex = _lex.c_str(); 47 | this->tag = pos_str(tag); 48 | begin = -1; 49 | length = -1; 50 | next = nullptr; 51 | } 52 | 53 | 54 | ///////////// 55 | // methods // 56 | ///////////// 57 | const char* Morph::pos_str(pos_tag_t num) { 58 | assert(0 < num && num <= POS_TAG_SIZE); 59 | return _TAG_SET[num-1]; 60 | } 61 | 62 | void Morph::organize(const wstring& wraw, const vector& wbegins, const vector& wends) { 63 | int begin_idx = wbegin - wraw.c_str(); 64 | int end_idx = begin_idx + wlength - 1; 65 | begin = wbegins[begin_idx]; 66 | length = wends[end_idx] - begin; 67 | } 68 | 69 | 70 | 71 | string Morph::str() { 72 | return wstr_to_utf8(wstr()); 73 | } 74 | 75 | 76 | wstring Morph::wstr() { 77 | wstringstream wss; 78 | wss << wlex << L"/" << tag << L":" << begin << L"," << length; 79 | return wss.str(); 80 | } 81 | 82 | 83 | } // namespace khaiii 84 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Morph.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_MORPH_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_MORPH_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | 17 | #include "khaiii/khaiii_api.h" 18 | 19 | 20 | namespace khaiii { 21 | 22 | 23 | /** 품사 태그 */ 24 | typedef enum { 25 | EC, EF, EP, ETM, ETN, IC, JC, JKB, JKC, JKG, 26 | JKO, JKQ, JKS, JKV, JX, MAG, MAJ, MM, NNB, NNG, 27 | NNP, NP, NR, SE, SF, SH, SL, SN, SO, SP, 28 | SS, SW, SWK, VA, VCN, VCP, VV, VX, XPN, XR, 29 | XSA, XSN, XSV, ZN, ZV, ZZ, 30 | POS_TAG_SIZE 31 | } pos_tag_t; 32 | 33 | 34 | /** 35 | * 형태소 자료구조 36 | */ 37 | class Morph: public khaiii_morph_t { 38 | public: 39 | std::wstring wlex; ///< unicode lexical 40 | const wchar_t* wbegin = nullptr; ///< unicode string begin address 41 | int wlength = 0; ///< unicode string length 42 | 43 | Morph(std::wstring wlex, pos_tag_t tag, const wchar_t* wbegin, int wlength); ///< ctor 44 | 45 | /** 46 | * API 결과 구조체의 내용을 채운다. 47 | * @param wraw 유니코드 원문 48 | * @param wbegins 각 음절별 시작 byte 위치 49 | * @param wends 각 음절별 끝 byte 위치 50 | */ 51 | void organize(const std::wstring& wraw, const std::vector& wbegins, 52 | const std::vector& wends); 53 | 54 | /** 55 | * pos_tag_t 타입의 숫자 태그에 대응하는 문자열 태그를 리턴한다. 56 | * @param num 숫자 품사 태그 57 | * @return 문자열 품사 태그 58 | */ 59 | static const char* pos_str(pos_tag_t num); 60 | 61 | /** 62 | * 개체명 태그 스트링의 포인터를 전달해서, API 구조체 내 변수에 설정합니다. 63 | * @param tag 개체명 태그 64 | * @return void 65 | */ 66 | void set_ne_str(const char* tag); 67 | 68 | std::string str(); ///< UTF-8 문자열로 표현합니다. 69 | std::wstring wstr(); ///< 유니코드 문자열로 표현합니다. (거의) 디버그용 70 | 71 | private: 72 | std::string _lex; ///< cache of UTF-8 lexical 73 | }; 74 | 75 | 76 | } // namespace khaiii 77 | 78 | 79 | #endif // SRC_MAIN_CPP_KHAIII_MORPH_HPP_ 80 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Preanal.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Preanal.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | 16 | #include "khaiii/KhaiiiApi.hpp" 17 | #include "khaiii/Word.hpp" 18 | 19 | 20 | namespace khaiii { 21 | 22 | 23 | using std::exception; 24 | using std::shared_ptr; 25 | using std::string; 26 | 27 | 28 | //////////////////// 29 | // static members // 30 | //////////////////// 31 | shared_ptr Preanal::_log = spdlog::stderr_color_mt("Preanal"); 32 | 33 | 34 | //////////////////// 35 | // ctors and dtor // 36 | //////////////////// 37 | Preanal::~Preanal() { 38 | close(); 39 | } 40 | 41 | 42 | ///////////// 43 | // methods // 44 | ///////////// 45 | void Preanal::open(string dir) { 46 | _trie.open(dir + "/preanal.tri"); 47 | _val_mmf.open(dir + "/preanal.val"); 48 | _log->info("preanal dictionary opened"); 49 | } 50 | 51 | 52 | void Preanal::close() { 53 | _trie.close(); 54 | _val_mmf.close(); 55 | _log->debug("preanal dictionary closed"); 56 | } 57 | 58 | 59 | void Preanal::apply(shared_ptr word) const { 60 | auto matches = _trie.search_common_prefix_matches(word->wbegin, word->wlength); 61 | int len = 0; 62 | int idx = -1; 63 | for (auto match = matches.rbegin(); match != matches.rend(); ++match) { 64 | bool is_exact = match->val % 2 == 0; 65 | if (is_exact && match->len == word->wlength) { 66 | len = match->len; 67 | idx = match->val / 2; 68 | } else if (!is_exact) { 69 | len = match->len; 70 | idx = (match->val - 1) / 2; 71 | } 72 | if (len > 1 && idx >= 0) break; 73 | } 74 | if (len <= 0 || idx < 0) return; 75 | const uint16_t* tag_out_start = &_val_mmf.data()[idx]; 76 | for (int i = 0; i < len; ++i) { 77 | word->char_tags[i] = tag_out_start[i]; 78 | } 79 | } 80 | 81 | 82 | } // namespace khaiii 83 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Preanal.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_PREANAL_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_PREANAL_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | 17 | #include "spdlog/spdlog.h" 18 | 19 | #include "khaiii/MemMapFile.hpp" 20 | #include "khaiii/Trie.hpp" 21 | 22 | 23 | namespace khaiii { 24 | 25 | 26 | class Word; 27 | 28 | 29 | class Preanal { 30 | public: 31 | virtual ~Preanal(); ///< dtor 32 | 33 | /** 34 | * 리소스를 연다. 35 | * @param dir 리소스 디렉토리 36 | */ 37 | void open(std::string dir); 38 | 39 | void close(); ///< 리소스를 닫는다. 40 | 41 | /** 42 | * 기분석 사전을 적용하여 음절 별로 태깅한다. 43 | * @param word 어절 44 | */ 45 | void apply(std::shared_ptr word) const; 46 | 47 | private: 48 | static std::shared_ptr _log; ///< logger 49 | 50 | Trie _trie; 51 | MemMapFile _val_mmf; ///< value memory mapping 52 | }; 53 | 54 | 55 | } // namespace khaiii 56 | 57 | 58 | #endif // SRC_MAIN_CPP_KHAIII_PREANAL_HPP_ 59 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Resource.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Resource.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | 16 | #include "khaiii/Config.hpp" 17 | #include "khaiii/KhaiiiApi.hpp" 18 | #include "khaiii/nn/tensor.hpp" 19 | 20 | 21 | namespace khaiii { 22 | 23 | 24 | using std::exception; 25 | using std::shared_ptr; 26 | using std::string; 27 | 28 | 29 | //////////////////// 30 | // static members // 31 | //////////////////// 32 | shared_ptr Resource::_log = spdlog::stderr_color_mt("Resource"); 33 | 34 | 35 | //////////////////// 36 | // ctors and dtor // 37 | //////////////////// 38 | Resource::~Resource() { 39 | close(); 40 | } 41 | 42 | 43 | ///////////// 44 | // methods // 45 | ///////////// 46 | void Resource::open(const Config& cfg, std::string dir) { 47 | embed.open(cfg, dir); 48 | for (int kernel_size = 2; kernel_size < 6; ++kernel_size) { 49 | string path = fmt::format("{}/conv.{}.fil", dir, kernel_size); 50 | convs[kernel_size].open(path, cfg.embed_dim, cfg.embed_dim, kernel_size, &nn::RELU); 51 | } 52 | cnv2hdn.open(dir + "/cnv2hdn.lin", 4 * cfg.embed_dim, cfg.hidden_dim, true, &nn::RELU); 53 | string path = fmt::format("{}/hdn2tag.lin", dir); 54 | hdn2tag.open(path, cfg.hidden_dim, cfg.class_num, true); 55 | _log->info("NN model loaded"); 56 | preanal.open(dir); 57 | errpatch.open(dir); 58 | restore.open(dir); 59 | _log->info("PoS tagger opened"); 60 | } 61 | 62 | 63 | void Resource::close() { 64 | embed.close(); 65 | hdn2tag.close(); 66 | preanal.close(); 67 | errpatch.close(); 68 | restore.close(); 69 | _log->debug("PoS tagger closed"); 70 | } 71 | 72 | 73 | } // namespace khaiii 74 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Resource.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_RESOURCE_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_RESOURCE_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | 17 | #include "spdlog/spdlog.h" 18 | 19 | #include "khaiii/Embed.hpp" 20 | #include "khaiii/ErrPatch.hpp" 21 | #include "khaiii/Preanal.hpp" 22 | #include "khaiii/Resource.hpp" 23 | #include "khaiii/Restore.hpp" 24 | #include "khaiii/nn/Conv1d.hpp" 25 | #include "khaiii/nn/Linear.hpp" 26 | 27 | 28 | namespace khaiii { 29 | 30 | 31 | class Config; 32 | 33 | 34 | /** 35 | * resources for part-of-speech tagger 36 | */ 37 | class Resource { 38 | public: 39 | virtual ~Resource(); ///< dtor 40 | 41 | Embed embed; ///< character embedding 42 | nn::Linear cnv2hdn; ///< convs -> hidden layer 43 | nn::Linear hdn2tag; ///< hidden -> tag(output) layer 44 | nn::Conv1d convs[6]; ///< convolution layers (0, 1 are dummy) 45 | Preanal preanal; ///< 기분석 사전 46 | ErrPatch errpatch; ///< 오분석 패치 47 | Restore restore; ///< 원형복원 사전 48 | 49 | void open(const Config& cfg, std::string dir); 50 | void close(); 51 | 52 | private: 53 | static std::shared_ptr _log; ///< logger 54 | }; 55 | 56 | 57 | } // namespace khaiii 58 | 59 | 60 | #endif // SRC_MAIN_CPP_KHAIII_RESOURCE_HPP_ 61 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Restore.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Restore.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | #include 16 | 17 | #include "khaiii/KhaiiiApi.hpp" 18 | #include "khaiii/Morph.hpp" 19 | #include "khaiii/util.hpp" 20 | 21 | 22 | namespace khaiii { 23 | 24 | 25 | using std::exception; 26 | using std::shared_ptr; 27 | using std::string; 28 | using std::vector; 29 | 30 | 31 | /////////////// 32 | // constants // 33 | /////////////// 34 | static const char* _B_STRS[POS_TAG_SIZE] = { 35 | "B-EC", "B-EF", "B-EP", "B-ETM", "B-ETN", "B-IC", "B-JC", "B-JKB", "B-JKC", "B-JKG", 36 | "B-JKO", "B-JKQ", "B-JKS", "B-JKV", "B-JX", "B-MAG", "B-MAJ", "B-MM", "B-NNB", "B-NNG", 37 | "B-NNP", "B-NP", "B-NR", "B-SE", "B-SF", "B-SH", "B-SL", "B-SN", "B-SO", "B-SP", 38 | "B-SS", "B-SW", "B-SWK", "B-VA", "B-VCN", "B-VCP", "B-VV", "B-VX", "B-XPN", "B-XR", 39 | "B-XSA", "B-XSN", "B-XSV", "B-ZN", "B-ZV", "B-ZZ", 40 | }; 41 | 42 | static const char* _I_STRS[POS_TAG_SIZE] = { 43 | "I-EC", "I-EF", "I-EP", "I-ETM", "I-ETN", "I-IC", "I-JC", "I-JKB", "I-JKC", "I-JKG", 44 | "I-JKO", "I-JKQ", "I-JKS", "I-JKV", "I-JX", "I-MAG", "I-MAJ", "I-MM", "I-NNB", "I-NNG", 45 | "I-NNP", "I-NP", "I-NR", "I-SE", "I-SF", "I-SH", "I-SL", "I-SN", "I-SO", "I-SP", 46 | "I-SS", "I-SW", "I-SWK", "I-VA", "I-VCN", "I-VCP", "I-VV", "I-VX", "I-XPN", "I-XR", 47 | "I-XSA", "I-XSN", "I-XSV", "I-ZN", "I-ZV", "I-ZZ", 48 | }; 49 | 50 | 51 | //////////////////// 52 | // static members // 53 | //////////////////// 54 | shared_ptr Restore::_log = spdlog::stderr_color_mt("Restore"); 55 | 56 | 57 | //////////////////// 58 | // ctors and dtor // 59 | //////////////////// 60 | Restore::~Restore() { 61 | close(); 62 | } 63 | 64 | 65 | ///////////// 66 | // methods // 67 | ///////////// 68 | std::string chr_tag_t::str() { 69 | assert(0 < tag && tag <= POS_TAG_SIZE); 70 | wchar_t wstr[2] = {chr, 0}; 71 | const char** table = _B_STRS; 72 | if (bi == chr_tag_t::I) table = _I_STRS; 73 | return wstr_to_utf8(wstr) + "/" + table[tag-1]; 74 | } 75 | 76 | 77 | void Restore::open(string dir) { 78 | _key_mmf.open(dir + "/restore.key"); 79 | _val_mmf.open(dir + "/restore.val"); 80 | assert(_key_mmf.size() * _MAX_VAL_LEN == _val_mmf.size()); 81 | _one_mmf.open(dir + "/restore.one"); 82 | #ifndef NDEBUG 83 | for (int i = 0; i < _one_mmf.size(); ++i) { 84 | SPDLOG_TRACE(_log, "{}: {}, ", i, _one_mmf.data()[i]); 85 | } 86 | #endif 87 | _log->info("restore dictionary opened"); 88 | } 89 | 90 | 91 | void Restore::close() { 92 | _key_mmf.close(); 93 | _val_mmf.close(); 94 | _one_mmf.close(); 95 | _log->debug("restore dictionary closed"); 96 | } 97 | 98 | 99 | vector Restore::restore(wchar_t chr, uint16_t tag_out, bool use_dic) const { 100 | assert(tag_out > 0); 101 | vector restored; 102 | if (!is_need_restore(tag_out)) { 103 | // 원형 복원이 필요없는 경우 104 | restored.emplace_back(chr_tag_t()); 105 | restored.back().chr = chr; 106 | restored.back().set_tag(tag_out); 107 | return restored; 108 | } 109 | 110 | if (!use_dic) { 111 | // 원형 복원 사전을 사용하지 않고 첫번째 태그로 바로 부여한다. 112 | restored.emplace_back(chr_tag_t()); 113 | restored.back().chr = chr; 114 | restored.back().set_tag(get_one(tag_out)); 115 | return restored; 116 | } 117 | 118 | int idx = find(chr, tag_out); 119 | if (idx == -1) { 120 | // 키가 발견되지 않는 경우 태그 조합 중 첫번째 태그로 부여한다. 121 | uint16_t tag_one = get_one(tag_out); 122 | #ifndef NDEBUG 123 | wchar_t wstr[2] = {chr, 0}; 124 | _log->info("restore key not found: {}/{} => {}", wstr_to_utf8(wstr), tag_out, tag_one); 125 | #endif 126 | restored.emplace_back(chr_tag_t()); 127 | restored.back().chr = chr; 128 | restored.back().set_tag(tag_one); 129 | } else { 130 | const uint32_t* val = _val_mmf.data() + (idx * _MAX_VAL_LEN); 131 | for (int i = 0; *val && i < _MAX_VAL_LEN; ++val, ++i) { 132 | restored.emplace_back(chr_tag_t()); 133 | restored.back().from_val(*val); 134 | } 135 | } 136 | return restored; 137 | } 138 | 139 | 140 | bool Restore::is_need_restore(uint16_t tag_out) { 141 | return tag_out > 2 * POS_TAG_SIZE; 142 | } 143 | 144 | 145 | int Restore::find(wchar_t chr, uint16_t tag_out) const { 146 | assert(is_need_restore(tag_out)); 147 | uint32_t key = chr << 12 | tag_out; // key의 경우 12비트를 shift하고 output tag를 합친다. 148 | const uint32_t* found = reinterpret_cast( 149 | bsearch(&key, _key_mmf.data(), _key_mmf.size(), sizeof(uint32_t), Restore::key_cmp)); 150 | if (found == nullptr) return -1; 151 | return found - _key_mmf.data(); 152 | } 153 | 154 | 155 | uint8_t Restore::get_one(uint16_t tag_out) const { 156 | assert(is_need_restore(tag_out)); 157 | assert(tag_out < _one_mmf.size()); 158 | return _one_mmf.data()[tag_out]; 159 | } 160 | 161 | 162 | int Restore::key_cmp(const void* left, const void* right) { 163 | uint32_t left_val = *reinterpret_cast(left); 164 | uint32_t right_val = *reinterpret_cast(right); 165 | if (left_val < right_val) { 166 | return -1; 167 | } else if (left_val > right_val) { 168 | return 1; 169 | } else { 170 | return 0; 171 | } 172 | } 173 | 174 | 175 | } // namespace khaiii 176 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Restore.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_RESTORE_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_RESTORE_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "spdlog/spdlog.h" 19 | 20 | #include "khaiii/MemMapFile.hpp" 21 | #include "khaiii/Morph.hpp" 22 | 23 | 24 | namespace khaiii { 25 | 26 | 27 | /** 28 | * 원형 복원이 이뤄진 후 음절과 음절별 태그 정보 29 | */ 30 | struct chr_tag_t { 31 | enum BI { B = 0, I = 1, }; ///< enumeration type for B-, I- notation 32 | 33 | wchar_t chr; 34 | uint8_t tag; 35 | BI bi; ///< B-, I- notation 36 | 37 | inline void set_tag(uint16_t tag_out) { 38 | assert(0 < tag_out && tag_out <= 2 * POS_TAG_SIZE); 39 | tag = tag_out; 40 | if (tag > POS_TAG_SIZE) { 41 | tag -= POS_TAG_SIZE; 42 | bi = I; 43 | } 44 | } 45 | 46 | inline void from_val(uint32_t val) { 47 | chr = val >> 8; // value의 경우 8비트를 shift하여 음절을 만든다. 48 | if (val & 0x80) { 49 | bi = I; 50 | } else { 51 | bi = B; 52 | } 53 | set_tag(val & 0x7F); 54 | } 55 | 56 | std::string str(); 57 | }; 58 | 59 | 60 | class Restore { 61 | public: 62 | virtual ~Restore(); ///< dtor 63 | 64 | /* 65 | * 리소스를 연다. 66 | * @param dir 리소스 디렉토리 67 | */ 68 | void open(std::string dir); 69 | 70 | void close(); ///< 리소스를 닫는다. 71 | 72 | /** 73 | * 음절과 그 음절의 태그 번호를 이용해 원형 복원이 필요한 경우 복원한다. 74 | * @param chr 음절 75 | * @param tag_out 태그 번호 76 | * @param use_dic 원형복원 사전을 사용할 지 여부 77 | * @return 복원한 음절 만큼의 태그 리스트 78 | */ 79 | std::vector restore(wchar_t chr, uint16_t tag_out, bool use_dic) const; 80 | 81 | /** 82 | * 원형 복원이 필요한 복합 태그 여부 83 | * @param tag_out 태그 번호 84 | * @return 복합 태그 여부 85 | */ 86 | static bool is_need_restore(uint16_t tag_out); 87 | 88 | /** 89 | * 복합 태그가 원형 복원 사전에 존재하는 지 찾는다. 90 | * @param chr 음절 91 | * @param tag_out 태그 번호 92 | * @return 인덱스. 찾지 못할 경우 -1 93 | */ 94 | int find(wchar_t chr, uint16_t tag_out) const; 95 | 96 | /** 97 | * 원형 복원 사전에 존재하지 않는 복합 태그 번호일 경우 맨 앞에 하나의 태그를 얻는다. 98 | * @param tag_out 태그 번호 99 | * @return 맨 앞에 하나의 태그 100 | */ 101 | uint8_t get_one(uint16_t tag_out) const; 102 | 103 | private: 104 | static const int _MAX_VAL_LEN = 4; ///< maximum array length of value 105 | 106 | static std::shared_ptr _log; ///< logger 107 | 108 | MemMapFile _key_mmf; ///< key memory mapping 109 | MemMapFile _val_mmf; ///< value memory mapping 110 | MemMapFile _one_mmf; ///< one memory mapping 111 | 112 | static int key_cmp(const void* left, const void* right); ///< key comparator for bsearch 113 | }; 114 | 115 | 116 | } // namespace khaiii 117 | 118 | 119 | #endif // SRC_MAIN_CPP_KHAIII_RESTORE_HPP_ 120 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Sentence.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Sentence.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "khaiii/Word.hpp" 20 | #include "khaiii/util.hpp" 21 | 22 | 23 | namespace khaiii { 24 | 25 | 26 | using std::codecvt; 27 | using std::codecvt_base; 28 | using std::dec; 29 | using std::hex; 30 | using std::locale; 31 | using std::make_shared; 32 | using std::mbstate_t; 33 | using std::setfill; 34 | using std::setw; 35 | using std::shared_ptr; 36 | using std::string; 37 | using std::use_facet; 38 | using std::wstringstream; 39 | 40 | 41 | //////////////////// 42 | // static members // 43 | //////////////////// 44 | shared_ptr Sentence::_log = spdlog::stderr_color_mt("Sentence"); 45 | 46 | 47 | //////////////////// 48 | // ctors and dtor // 49 | //////////////////// 50 | Sentence::Sentence(const char* raw): _raw(raw), _morph_cnt(0) { 51 | _characterize(); 52 | _tokenize(); 53 | } 54 | 55 | 56 | ///////////// 57 | // methods // 58 | ///////////// 59 | void Sentence::organize() { 60 | for (int i = 0; i < words.size(); ++i) { 61 | if (i > 0) words[i-1]->next = words[i].get(); 62 | words[i]->organize(_wraw, _wbegins, _wends); 63 | #ifndef NDEBUG 64 | _log->debug("[{}] word: {}", i, words[i]->str()); 65 | for (int j = 0; j < words[i]->morph_vec.size(); ++j) { 66 | _log->debug("\t[{}] morph: {}", j, words[i]->morph_vec[j]->str()); 67 | } 68 | #endif 69 | _morph_cnt += words[i]->morph_vec.size(); 70 | } 71 | } 72 | 73 | 74 | int Sentence::get_lwb_delta(int wrd_idx, int chr_idx) { 75 | assert(0 <= chr_idx && chr_idx < words[wrd_idx]->wlength); 76 | return -chr_idx; 77 | } 78 | 79 | 80 | int Sentence::get_rwb_delta(int wrd_idx, int chr_idx) { 81 | assert(0 <= chr_idx && chr_idx < words[wrd_idx]->wlength); 82 | return words[wrd_idx]->wlength - chr_idx - 1; 83 | } 84 | 85 | 86 | void Sentence::_tokenize() { 87 | bool is_in_space = true; 88 | for (int idx = 0; idx < _wraw.size(); ++idx) { 89 | if (is_space(_wraw[idx])) { 90 | is_in_space = true; 91 | } else { 92 | if (is_in_space) { 93 | // first character => start of word 94 | words.emplace_back(make_shared(&_wraw[idx], 1)); 95 | } else { 96 | words.back()->wlength += 1; 97 | } 98 | is_in_space = false; 99 | } 100 | } 101 | 102 | for (auto& word : words) { 103 | word->set_begin_length(_wraw, _wbegins, _wends); 104 | _log->debug("'{}'{}~{}|{},{}", word->str(), word->begin, word->length, 105 | (word->wbegin - &_wraw[0]), word->wlength); 106 | } 107 | } 108 | 109 | 110 | void Sentence::_characterize() { 111 | assert(_raw != nullptr); 112 | auto en_US_utf8 = locale("en_US.UTF-8"); 113 | auto& facet = use_facet>(en_US_utf8); 114 | auto mbst = mbstate_t(); 115 | const char* from_next = nullptr; 116 | wstringstream wss; 117 | for (const char* from_curr = _raw; *from_curr != '\0'; from_curr = from_next) { 118 | wchar_t wchar[2] = L""; 119 | wchar_t* to_next = nullptr; 120 | auto result = facet.in(mbst, from_curr, from_curr + 6, from_next, wchar, wchar + 1, 121 | to_next); 122 | assert(result == codecvt_base::partial || result == codecvt_base::ok); 123 | wss << wchar[0]; 124 | _wbegins.emplace_back(from_curr - _raw); 125 | _wends.emplace_back(from_next - _raw); 126 | // _log->debug("'{}'({}){}~{}|{}~{}", string(from_curr, from_next - from_curr), hex, 127 | // static_cast(wchar[0]), dec, (from_curr - _raw), (from_next - _raw)); 128 | } 129 | _wraw = wss.str(); 130 | assert(_wraw.length() == _wbegins.size()); 131 | assert(_wraw.length() == _wends.size()); 132 | } 133 | 134 | 135 | } // namespace khaiii 136 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Sentence.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_SENTENCE_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_SENTENCE_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "spdlog/spdlog.h" 19 | 20 | 21 | namespace khaiii { 22 | 23 | 24 | class Word; 25 | 26 | 27 | /** 28 | * sentence data structure 29 | */ 30 | class Sentence { 31 | public: 32 | std::vector> words; ///< vector of words 33 | 34 | /** 35 | * ctor 36 | * @param raw raw sentence (UTF-8) 37 | */ 38 | explicit Sentence(const char* raw = ""); 39 | 40 | void organize(); ///< 결과를 구조화합니다. 41 | 42 | inline int morph_cnt() const { 43 | return _morph_cnt; 44 | } 45 | 46 | inline const char* raw_str() const { 47 | return _raw; 48 | } 49 | 50 | /** 51 | * get delta from left word boundary to this character 52 | * @param wrd_idx word index 53 | * @param chr_idx character index 54 | * @return delta (always less or equal to 0) 55 | */ 56 | int get_lwb_delta(int wrd_idx, int chr_idx); 57 | 58 | /** 59 | * get delta from right word boundary to this character 60 | * @param wrd_idx word index 61 | * @param chr_idx character index 62 | * @return delta (always more or equal to 0) 63 | */ 64 | int get_rwb_delta(int wrd_idx, int chr_idx); 65 | 66 | private: 67 | static std::shared_ptr _log; ///< logger 68 | 69 | const char* _raw = ""; ///< raw sentence (UTF-8) 70 | int _morph_cnt; ///< total morpheme count 71 | std::wstring _wraw; ///< unicode characters 72 | std::vector _wbegins; ///< character begin positions for each unicode characters 73 | std::vector _wends; ///< character end positions for each unicode characters 74 | 75 | void _tokenize(); ///< tokenize by spaces 76 | void _characterize(); ///< convert to unicode characters 77 | }; 78 | 79 | 80 | } // namespace khaiii 81 | 82 | 83 | #endif // SRC_MAIN_CPP_KHAIII_SENTENCE_HPP_ 84 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Tagger.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_TAGGER_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_TAGGER_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "spdlog/spdlog.h" 19 | 20 | #include "khaiii/Embed.hpp" 21 | 22 | 23 | namespace khaiii { 24 | 25 | 26 | class Config; 27 | class Resource; 28 | class Sentence; 29 | 30 | 31 | class Tagger { 32 | public: 33 | /** 34 | * ctor 35 | * @param cfg config 36 | * @param rsc resource 37 | * @param sent Sentence object 38 | */ 39 | Tagger(const Config& cfg, const Resource& rsc, std::shared_ptr sent); 40 | 41 | void tag(); ///< part-of-speech tag 42 | 43 | private: 44 | static std::shared_ptr _log; ///< logger 45 | 46 | const Config& _cfg; ///< config 47 | const Resource& _rsc; ///< resource 48 | std::shared_ptr _sent; ///< Sentence object 49 | 50 | /** 51 | * add left/right word boundary embedding to batch 52 | * @param data data start point 53 | * @param wrd_idx word index 54 | * @param chr_idx character index 55 | */ 56 | void _add_lwb_rwb(float* data, int wrd_idx, int chr_idx); 57 | 58 | /** 59 | * tag characters with CNN method 60 | * @param data data start point 61 | * @param batch_size batch size 62 | * @param col_dim column dimension for each batch 63 | */ 64 | void _tag_cnn(float* data, int batch_size, int col_dim, 65 | const std::vector>& index); 66 | 67 | /** 68 | * 오분석 패치를 적용하기 전에 예측한 태그를 보정한다. 69 | * 음절과 태그 조합이 원형복원 사전에 없을 경우 1음절용 태그로 벼환한 다음, 70 | * B- 위치에 I- 로 잘못 태깅된 태그를 보정한다. 71 | */ 72 | void _revise_tags(); 73 | 74 | /** 75 | * 이전 태그와 현재 태그가 B-, I- 만 다르고 같은 카테고리인지 여부. 76 | * 이전 태그가 복합 태그일 경우 마지막 태그와 비교한다. 77 | * 현재 태그는 단순 태그이며 B- 태그인 경우에 한해 동작한다. 78 | * @param prev_chr 이전 음절 79 | * @param prev_tag 이전 태그 80 | * @param curr 현재 태그 81 | * @return 태그 카테고리가 동일한지 여부 82 | */ 83 | bool _is_same_tag_cat(wchar_t prev_chr, int prev_tag, int curr); 84 | 85 | void _restore(); ///< restore morphemes 86 | 87 | /** 88 | * get context embeddings 89 | */ 90 | std::vector _get_context(int wrd_idx, int chr_idx); 91 | 92 | /** 93 | * get left context embeddings 94 | */ 95 | std::vector _get_left_context(int wrd_idx, int chr_idx); 96 | 97 | /** 98 | * get right context embeddings 99 | */ 100 | std::vector _get_right_context(int wrd_idx, int chr_idx); 101 | }; 102 | 103 | 104 | } // namespace khaiii 105 | 106 | 107 | #endif // SRC_MAIN_CPP_KHAIII_TAGGER_HPP_ 108 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Trie.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Trie.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "boost/lexical_cast.hpp" 21 | 22 | #include "khaiii/KhaiiiApi.hpp" 23 | #include "khaiii/util.hpp" 24 | 25 | 26 | namespace khaiii { 27 | 28 | 29 | using std::exception; 30 | using std::find_if; 31 | using std::list; 32 | using std::shared_ptr; 33 | using std::string; 34 | using std::wstring; 35 | 36 | using boost::optional; 37 | 38 | 39 | /////////////// 40 | // constants // 41 | /////////////// 42 | static const size_t _LIN_BIN_NUM = 32; // linear/binary search 경계가 되는 element 갯수 43 | 44 | 45 | //////////////////// 46 | // static members // 47 | //////////////////// 48 | shared_ptr Trie::_log = spdlog::stderr_color_mt("Trie"); 49 | 50 | 51 | //////////////////// 52 | // ctors and dtor // 53 | //////////////////// 54 | Trie::~Trie() { 55 | close(); 56 | } 57 | 58 | 59 | ///////////// 60 | // methods // 61 | ///////////// 62 | void Trie::open(string path) { 63 | _mmf.open(path); 64 | 65 | #ifndef NDEBUG 66 | const _node_t* root_node = _mmf.data(); 67 | for (int i = 0; i < sizeof(root_node) / sizeof(_node_t); ++i) { 68 | SPDLOG_TRACE(_log, root_node[i].str(root_node)); 69 | } 70 | #endif 71 | } 72 | 73 | 74 | void Trie::close() { 75 | _mmf.close(); 76 | } 77 | 78 | 79 | /* 80 | optional Trie::find(const wstring& key) const { 81 | return find(key.c_str()); 82 | } 83 | 84 | 85 | optional Trie::find(const wchar_t* key) const { 86 | assert(key != nullptr); 87 | if (*key == L'\0') return boost::none; 88 | return _find(key, _mmf.data()); 89 | } 90 | */ 91 | 92 | 93 | list Trie::search_common_prefix_matches(const wstring& text, int max_len) const { 94 | return search_common_prefix_matches(text.c_str(), max_len); 95 | } 96 | 97 | 98 | list Trie::search_common_prefix_matches(const wchar_t* text, int max_len) const { 99 | assert(text != nullptr); 100 | list found; 101 | _search(text, _mmf.data(), &found, 0, max_len); 102 | return found; 103 | } 104 | 105 | 106 | optional Trie::search_longest_prefix_match(const wchar_t* text, int max_len) const { 107 | list found = search_common_prefix_matches(text, max_len); 108 | if (found.empty()) return boost::none; 109 | return optional(found.back()); 110 | } 111 | 112 | 113 | /* 114 | boost::optional Trie::_find(const wchar_t* key, const _node_t* node) const { 115 | SPDLOG_TRACE(_log, "key: [{}], {}", key, node->str(_data())); 116 | if (node->child_start <= 0 || node->child_num <= 0) return boost::none; 117 | auto begin = node + node->child_start; 118 | auto end = begin + node->child_num; 119 | auto found_node = end; 120 | if (node->child_num < _LIN_BIN_NUM) { 121 | // linear search 122 | auto pred = [&key] (const _node_t& _node) { return _node.chr == *key; }; 123 | found_node = find_if(begin, end, pred); 124 | } else { 125 | // binary search 126 | _node_t key_node; 127 | key_node.chr = *key; 128 | void* found_ptr = ::bsearch(&key_node, begin, end - begin, sizeof(_node_t), _node_t::cmp); 129 | if (found_ptr) found_node = static_cast(found_ptr); 130 | } 131 | if (found_node == end) { 132 | SPDLOG_TRACE(_log, " not found"); 133 | return boost::none; 134 | } else { 135 | SPDLOG_TRACE(_log, " found: {}", found_node->str(_data())); 136 | key += 1; 137 | if (*key == L'\0') { 138 | if (found_node->val > 0) { 139 | return optional(found_node->val); 140 | } else { 141 | return boost::none; 142 | } 143 | } else { 144 | return _find(key, found_node); 145 | } 146 | } 147 | } 148 | */ 149 | 150 | 151 | void Trie::_search(const wchar_t* text, const _node_t* node, list* matches, 152 | int len, int max_len) const { 153 | SPDLOG_TRACE(_log, "text({}): [{}], {}", len, wstr_to_utf8(text), node->str(_data())); 154 | if (*text == '\0' || len > max_len || node->child_start <= 0 || node->child_num <= 0) return; 155 | auto begin = node + node->child_start; 156 | auto end = begin + node->child_num; 157 | auto found_node = end; 158 | if (node->child_num < _LIN_BIN_NUM) { 159 | // linear search 160 | auto pred = [&text] (const _node_t& _node) { return _node.chr == *text; }; 161 | found_node = find_if(begin, end, pred); 162 | } else { 163 | // binary search 164 | _node_t key_node; 165 | key_node.chr = *text; 166 | void* found_ptr = ::bsearch(&key_node, begin, end - begin, sizeof(_node_t), _node_t::cmp); 167 | if (found_ptr) found_node = static_cast(found_ptr); 168 | } 169 | if (found_node == end) { 170 | SPDLOG_TRACE(_log, " not matched"); 171 | return; 172 | } else { 173 | SPDLOG_TRACE(_log, " matched: {}", found_node->str(_data())); 174 | if (found_node->val > 0) matches->emplace_back(len + 1, found_node->val); 175 | _search(text + 1, found_node, matches, len + 1, max_len); 176 | } 177 | } 178 | 179 | 180 | } // namespace khaiii 181 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Trie.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_TRIE_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_TRIE_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "boost/optional.hpp" 21 | #include "spdlog/spdlog.h" 22 | 23 | #include "khaiii/MemMapFile.hpp" 24 | 25 | 26 | namespace khaiii { 27 | 28 | 29 | /** 30 | * 유니코드 TRIE 31 | */ 32 | class Trie { 33 | public: 34 | struct match_t { ///< 접두사 매칭 정보를 담기 위한 구조체. common prefix match 35 | int len; ///< 매칭된 길이 36 | uint32_t val; ///< 값 (양수) 37 | explicit match_t(int len = -1, uint32_t val = 0): len(len), val(val) {} ///< ctor 38 | }; 39 | 40 | virtual ~Trie(); ///< dtor 41 | 42 | /** 43 | * 리소스를 연다. 44 | * @param path 파일 경로 45 | */ 46 | void open(std::string path); 47 | 48 | void close(); ///< 리소스를 닫는다. 49 | 50 | /* 51 | * 키를 이용해 값을 찾는다. 52 | * @param key 키 문자열 53 | * @return 값. 키가 없을 경우 boost::none 54 | */ 55 | // boost::optional find(const std::wstring& key) const; 56 | 57 | /* 58 | * 키를 이용해 값을 찾는다. 59 | * @param key 키 문자열 60 | * @return 값. 키가 없을 경우 boost::none 61 | */ 62 | // boost::optional find(const wchar_t* key) const; 63 | 64 | /* 65 | * 접두사가 같은 모든 매칭 결과를 검색한다. 66 | * @param text 검색할 문자열 67 | * @return 매칭 결과 리스트 68 | */ 69 | std::list search_common_prefix_matches(const std::wstring& text, 70 | int max_len = INT_MAX) const; 71 | 72 | /* 73 | * 접두사가 같은 모든 매칭 결과를 검색한다. 74 | * @param text 검색할 문자열 75 | * @return 매칭 결과 리스트 76 | */ 77 | std::list search_common_prefix_matches(const wchar_t* text, 78 | int max_len = INT_MAX) const; 79 | 80 | boost::optional search_longest_prefix_match(const wchar_t* text, 81 | int max_len = INT_MAX) const; 82 | 83 | private: 84 | static std::shared_ptr _log; ///< logger 85 | 86 | struct _node_t { ///< TRIE의 노드 구조체 87 | wchar_t chr = 0; ///< 유니코드 문자 88 | uint32_t val = 0; ///< 값 (양수). (0인 경우 값이 아님. 즉, 단말 노드가 아님) 89 | int32_t child_start = -1; ///< 현재 노드로부터 자식 노드가 시작되는 위치 90 | int32_t child_num = -1; ///< 자식 노드의 갯수 91 | 92 | /** 93 | * 두 노드를 비교하는 함수 94 | * @param left left hand side 95 | * @param right right hand side 96 | * @return -1: left < right, 0: left == right, 1: left > right 97 | */ 98 | static int cmp(const void* left, const void* right) { 99 | const _node_t* left0 = static_cast(left); 100 | const _node_t* right0 = static_cast(right); 101 | return left0->chr - right0->chr; 102 | } 103 | 104 | inline std::string str(const _node_t* root_node) const { ///< 디버그용 문자열 변환 105 | std::ostringstream oss; 106 | oss << "node[" << (this - root_node) << "]{'"; 107 | if (chr == 0) { 108 | oss << "ROOT"; 109 | } else { 110 | oss << static_cast(chr); 111 | } 112 | oss << "', " << val << ", (" << child_start << ", " << child_num << ")}"; 113 | return oss.str(); 114 | } 115 | }; 116 | 117 | MemMapFile<_node_t> _mmf; ///< memory mapped file 118 | 119 | /* 120 | * 현재 노드로부터 자식 노드로 내려가며 키 값을 찾는다. 121 | * @param key 키 문자열 122 | * @param node 노드 시작 위치 123 | * @return 값. 키가 없을 경우 boost::none 124 | */ 125 | boost::optional _find(const wchar_t* key, const _node_t* node) const; 126 | 127 | /* 128 | * 현재 노드로부터 더이상 매칭되는 키가 없을 때까지 검색한다. 129 | * @param text 찾을 텍스트 130 | * @param node 노드 시작 위치 131 | * @param matches 매칭 결과 리스트 132 | * @param len 현재까지 검색을 진행한 길이(자식 노드의 깊이) 133 | */ 134 | void _search(const wchar_t* text, const _node_t* node, std::list* matches, 135 | int len, int max_len) const; 136 | }; 137 | 138 | 139 | } // namespace khaiii 140 | 141 | 142 | #endif // SRC_MAIN_CPP_KHAIII_TRIE_HPP_ 143 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Word.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/Word.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | 16 | #include "khaiii/Morph.hpp" 17 | #include "khaiii/util.hpp" 18 | 19 | 20 | namespace khaiii { 21 | 22 | 23 | using std::make_shared; 24 | using std::string; 25 | using std::vector; 26 | using std::wstring; 27 | using std::wstringstream; 28 | 29 | 30 | //////////////////// 31 | // ctors and dtor // 32 | //////////////////// 33 | Word::Word(const wchar_t* wbegin, int wlength): wbegin(wbegin), wlength(wlength), 34 | char_tags(wlength) { 35 | begin = -1; 36 | length = -1; 37 | morphs = nullptr; 38 | next = nullptr; 39 | } 40 | 41 | 42 | ///////////// 43 | // methods // 44 | ///////////// 45 | void Word::set_begin_length(const wstring &wchars, const vector &wbegins, 46 | const vector &wends) { 47 | int wbegin_idx = wbegin - wchars.c_str(); 48 | begin = wbegins.at(wbegin_idx); 49 | length = wends.at(wbegin_idx + wlength - 1) - begin; 50 | char_tags.resize(wlength); 51 | } 52 | 53 | 54 | void Word::set_embeds(const Resource& rsc) { 55 | embeds.reserve(wlength); 56 | for (int i = 0; i < wlength; ++i) embeds.emplace_back(rsc.embed[*(wbegin + i)]); 57 | } 58 | 59 | 60 | void Word::add_morph(const wstringstream& wlex, uint8_t tag, int begin_idx, int end_idx) { 61 | const wchar_t* morph_wbegin = wbegin + begin_idx; 62 | int morph_wlength = end_idx - begin_idx + 1; 63 | morph_vec.emplace_back(make_shared(wlex.str(), static_cast(tag), morph_wbegin, 64 | morph_wlength)); 65 | } 66 | 67 | 68 | void Word::organize(const wstring& wraw, const vector& wbegins, const vector& wends) { 69 | for (int i = 0; i < morph_vec.size(); ++i) { 70 | if (i > 0) morph_vec[i-1]->next = morph_vec[i].get(); 71 | morph_vec[i]->organize(wraw, wbegins, wends); 72 | } 73 | } 74 | 75 | 76 | void Word::make_morphs() { 77 | wstringstream wlex; 78 | uint8_t tag = 0; 79 | int begin_idx = -1; 80 | int end_idx = -1; 81 | for (int i = 0; i < restored.size(); ++i) { 82 | for (auto chr : restored[i]) { 83 | if (chr.bi == chr_tag_t::I && chr.tag == tag) { 84 | // 이전 형태소의 연속이므로 새로 생성하지 않고 추가해준다. 85 | wlex << chr.chr; 86 | end_idx = i; 87 | } else { 88 | if (wlex.str().length() > 0) add_morph(wlex, tag, begin_idx, end_idx); 89 | wlex.str(L""); 90 | wlex << chr.chr; 91 | tag = chr.tag; 92 | begin_idx = i; 93 | end_idx = i; 94 | } 95 | } 96 | } 97 | if (wlex.str().length() > 0) add_morph(wlex, tag, begin_idx, end_idx); 98 | 99 | // linked-list 포인터들을 연결해준다. 100 | morphs = morph_vec[0].get(); 101 | for (int i = 0; i < morph_vec.size() - 1; ++i) { 102 | morph_vec[i]->next = morph_vec[i+1].get(); 103 | } 104 | } 105 | 106 | 107 | string Word::str() const { 108 | return wstr_to_utf8(wstr()); 109 | } 110 | 111 | 112 | wstring Word::wstr() const { 113 | wstringstream wss; 114 | wss << wstring(wbegin, wlength) << L":" << begin << L"," << length; 115 | return wss.str(); 116 | } 117 | 118 | 119 | } // namespace khaiii 120 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/Word.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_WORD_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_WORD_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "khaiii/khaiii_api.h" 19 | #include "khaiii/Resource.hpp" 20 | #include "khaiii/Restore.hpp" 21 | 22 | 23 | namespace khaiii { 24 | 25 | 26 | class Morph; 27 | 28 | 29 | /** 30 | * 어절 자료구조 31 | */ 32 | class Word: public khaiii_word_t { 33 | public: 34 | const wchar_t* wbegin = nullptr; ///< unicode string begin address 35 | int wlength = 0; ///< unicode string length 36 | std::vector> morph_vec; ///< 어절에 포함된 형태소 배열 (분석 결과) 37 | 38 | std::vector embeds; ///< embeddings for each character 39 | std::vector char_tags; ///< tag outs for each character 40 | std::vector> restored; ///< restored characters and their tags 41 | 42 | /** 43 | * ctor 44 | * @param wbegin unicode string begin address 45 | * @param length unicode string length 46 | */ 47 | explicit Word(const wchar_t* wbegin = nullptr, int wlength = 0); 48 | 49 | /** 50 | * set begin position and length in raw string for this word 51 | * @param wchars unicode characters 52 | * @param wbegins begin positions for each unicode characters 53 | * @param wends end positions for each unicode characters 54 | */ 55 | void set_begin_length(const std::wstring &wchars, const std::vector &wbegins, 56 | const std::vector &wends); 57 | 58 | /** 59 | * set embedding for decoding 60 | * @param rsc resource 61 | */ 62 | void set_embeds(const Resource& rsc); 63 | 64 | /** 65 | * 하나의 형태소를 추가한다. 66 | * @param wlex 유니코드 형태소 문자열 67 | * @param tag 품사 태그 번호 (1부터 시작. 0은 오류) 68 | * @param begin_idx 시작 인덱스 (유니코드 음절 인덱스) 69 | * @param end_idx 끝 인덱스 (유니코드 음절 인덷스) 70 | */ 71 | void add_morph(const std::wstringstream& wlex, uint8_t tag, int begin_idx, int end_idx); 72 | 73 | /** 74 | * API 결과 구조체의 내용을 채운다. 75 | * @param wraw 유니코드 원문 76 | * @param wbegins 각 음절별 시작 byte 위치 77 | * @param wends 각 음절별 끝 byte 위치 78 | */ 79 | void organize(const std::wstring& wraw, const std::vector& wbegins, 80 | const std::vector& wends); 81 | 82 | /** 83 | * 원형복원된 음절들을 바탕으로 형태소를 생성한다. 84 | */ 85 | void make_morphs(); 86 | 87 | std::string str() const; ///< to string (UTF-8) 88 | std::wstring wstr() const; ///< to unicode string 89 | }; 90 | 91 | 92 | } // namespace khaiii 93 | 94 | 95 | #endif // SRC_MAIN_CPP_KHAIII_WORD_HPP_ 96 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/khaiii_api.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/khaiii_api.h" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include // NOLINT 14 | #include 15 | 16 | #include "khaiii/KhaiiiImpl.hpp" 17 | 18 | 19 | using std::make_shared; 20 | using std::recursive_mutex; 21 | using std::string; 22 | using std::shared_ptr; 23 | using std::unique_lock; 24 | using std::vector; 25 | using khaiii::Except; 26 | using khaiii::KhaiiiApi; 27 | using khaiii::KhaiiiImpl; 28 | 29 | 30 | /////////////// 31 | // variables // 32 | /////////////// 33 | /** 34 | * container for handles. the first (index 0) handle is for special use 35 | */ 36 | vector> KHAIII_HANDLES{ make_shared() }; 37 | 38 | 39 | /////////////// 40 | // functions // 41 | /////////////// 42 | const char* khaiii_version() { 43 | return KHAIII_VERSION; 44 | } 45 | 46 | 47 | int khaiii_open(const char* rsc_dir, const char* opt_str) { 48 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 49 | if (rsc_dir == nullptr) { 50 | KHAIII_HANDLES[0]->set_err_msg("resource directory is null"); 51 | return -1; 52 | } 53 | auto khaiii_impl = make_shared(); 54 | try { 55 | khaiii_impl->open(rsc_dir, opt_str); 56 | KHAIII_HANDLES.emplace_back(khaiii_impl); 57 | } catch (const Except& exc) { 58 | KHAIII_HANDLES[0]->set_err_msg(exc.what()); 59 | return -1; 60 | } 61 | return static_cast(KHAIII_HANDLES.size() - 1); 62 | } 63 | 64 | 65 | const khaiii_word_t* khaiii_analyze(int handle, const char* input, const char* opt_str) { 66 | if (handle <= 0 || handle >= KHAIII_HANDLES.size()) { 67 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 68 | KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle)); 69 | return nullptr; 70 | } 71 | auto khaiii_impl = KHAIII_HANDLES[handle]; 72 | if (input == nullptr) { 73 | khaiii_impl->set_err_msg("input is null"); 74 | return nullptr; 75 | } 76 | try { 77 | return khaiii_impl->analyze(input, opt_str); 78 | } catch (const Except& exc) { 79 | khaiii_impl->set_err_msg(exc.what()); 80 | return nullptr; 81 | } 82 | } 83 | 84 | 85 | void khaiii_free_results(int handle, const khaiii_word_t* results) { 86 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 87 | if (handle <= 0 || handle >= KHAIII_HANDLES.size()) { 88 | KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle)); 89 | return; 90 | } 91 | auto khaiii_impl = KHAIII_HANDLES[handle]; 92 | try { 93 | khaiii_impl->free_results(results); 94 | } catch (const Except& exc) { 95 | khaiii_impl->set_err_msg(exc.what()); 96 | } 97 | } 98 | 99 | 100 | void khaiii_close(int handle) { 101 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 102 | if (handle <= 0 || handle >= KHAIII_HANDLES.size()) { 103 | KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle)); 104 | return; 105 | } 106 | auto khaiii_impl = KHAIII_HANDLES[handle]; 107 | try { 108 | khaiii_impl->close(); 109 | } catch (const Except& exc) { 110 | khaiii_impl->set_err_msg(exc.what()); 111 | } 112 | } 113 | 114 | 115 | const char* khaiii_last_error(int handle) { 116 | if (handle <= 0 || handle >= KHAIII_HANDLES.size()) handle = 0; 117 | return KHAIII_HANDLES[handle]->get_err_msg(); 118 | } 119 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/khaiii_dev.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/khaiii_dev.h" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | 16 | #include "khaiii/KhaiiiImpl.hpp" 17 | 18 | 19 | using std::map; 20 | using std::recursive_mutex; 21 | using std::shared_ptr; 22 | using std::string; 23 | using std::unique_lock; 24 | using std::vector; 25 | using khaiii::Except; 26 | using khaiii::KhaiiiImpl; 27 | 28 | 29 | /////////////// 30 | // variables // 31 | /////////////// 32 | extern vector> KHAIII_HANDLES; 33 | 34 | 35 | /////////////// 36 | // functions // 37 | /////////////// 38 | int khaiii_analyze_bfr_errpatch(int handle, const char* input, const char* opt_str, 39 | int16_t* output) { 40 | if (handle <= 0 || handle >= KHAIII_HANDLES.size()) { 41 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 42 | KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle)); 43 | return -1; 44 | } 45 | auto khaiii_impl = KHAIII_HANDLES[handle]; 46 | try { 47 | return khaiii_impl->analyze_bfr_errpatch(input, opt_str, output); 48 | } catch (const Except& exc) { 49 | khaiii_impl->set_err_msg(exc.what()); 50 | return -1; 51 | } 52 | } 53 | 54 | 55 | int khaiii_set_log_level(const char* name, const char* level) { 56 | if (name == nullptr || level == nullptr) { 57 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 58 | KHAIII_HANDLES[0]->set_err_msg("log name or level is null"); 59 | return -1; 60 | } 61 | 62 | try { 63 | KhaiiiImpl::set_log_level(name, level); 64 | } catch (const Except& exc) { 65 | KHAIII_HANDLES[0]->set_err_msg(exc.what()); 66 | return -1; 67 | } 68 | return 0; 69 | } 70 | 71 | 72 | int khaiii_set_log_levels(const char* name_level_pairs) { 73 | if (name_level_pairs == nullptr) { 74 | unique_lock lock(KHAIII_HANDLES[0]->get_mutex()); 75 | KHAIII_HANDLES[0]->set_err_msg("log name/level pair is null"); 76 | return -1; 77 | } 78 | 79 | try { 80 | KhaiiiImpl::set_log_levels(name_level_pairs); 81 | } catch (const Except& exc) { 82 | KHAIII_HANDLES[0]->set_err_msg(exc.what()); 83 | return -1; 84 | } 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/nn/Conv1d.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/nn/Conv1d.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | 15 | #include "khaiii/util.hpp" 16 | 17 | 18 | namespace khaiii { 19 | namespace nn { 20 | 21 | 22 | using std::make_unique; 23 | using std::string; 24 | 25 | 26 | //////////////////// 27 | // ctors and dtor // 28 | //////////////////// 29 | Conv1d::~Conv1d() { 30 | close(); 31 | } 32 | 33 | 34 | ///////////// 35 | // methods // 36 | ///////////// 37 | void Conv1d::open(string path, int in_ch, int out_ch, int kernel_size, 38 | const activation_t* activation) { 39 | _param_mmf.open(path); 40 | assert(_param_mmf.size() == (in_ch * out_ch * kernel_size + out_ch)); 41 | // [output channel * [kernel * input channel]] ==> transposed 42 | // ==> [[kernel * input channel] * output channel] 43 | // 즉, 저장은 [row, col]으로 했지만 사용은 [col, row]로 접근해야 합니다. 44 | _weight = make_unique(const_cast(_param_mmf.data()), kernel_size * in_ch, 45 | out_ch); 46 | _bias = make_unique(const_cast(_param_mmf.data()) + \ 47 | (in_ch * out_ch * kernel_size), out_ch); 48 | _in_ch = in_ch; 49 | _out_ch = out_ch; 50 | _kernel_size = kernel_size; 51 | _activation = activation; 52 | } 53 | 54 | 55 | vector_t Conv1d::forward_max_pool_vec(const vector_map_t& input) const { 56 | int out_row_size = (input.size() / _in_ch) - (_kernel_size - 1); 57 | int in_col_size = _in_ch * _kernel_size; 58 | matrix_t output(out_row_size, _out_ch); 59 | for (int row = 0; row < out_row_size; ++row) { 60 | output.row(row) = _weight->transpose() * input.segment(row * _in_ch, in_col_size) + *_bias; 61 | } 62 | auto pooled = output.colwise().maxCoeff(); 63 | if (_activation) return pooled.unaryExpr(*_activation); 64 | return pooled; 65 | } 66 | 67 | 68 | matrix_t Conv1d::forward_max_pool_mat(const float* data, int batch_size, int col_dim) const { 69 | matrix_t outputs(batch_size, _out_ch); 70 | for (int batch = 0; batch < batch_size; ++batch) { 71 | vector_map_t vec(const_cast(data + batch * col_dim), col_dim); 72 | outputs.row(batch) = forward_max_pool_vec(vec); 73 | } 74 | return outputs; 75 | } 76 | 77 | 78 | void Conv1d::close() { 79 | _weight.release(); 80 | _bias.release(); 81 | _param_mmf.close(); 82 | } 83 | 84 | 85 | } // namespace nn 86 | } // namespace khaiii 87 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/nn/Conv1d.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_NN_CONV1D_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_NN_CONV1D_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | 18 | #include "khaiii/MemMapFile.hpp" 19 | #include "khaiii/nn/tensor.hpp" 20 | 21 | 22 | namespace khaiii { 23 | namespace nn { 24 | 25 | 26 | /** 27 | * 1D convolution layer 28 | */ 29 | class Conv1d { 30 | public: 31 | virtual ~Conv1d(); ///< dtor 32 | 33 | /** 34 | * open layer parameters 35 | * @param path file path 36 | * @param in_ch input channel 37 | * @param out_ch output channel 38 | * @param kernel_size kernel size 39 | * @param activation activation function 40 | */ 41 | void open(std::string path, int in_ch, int out_ch, int kernel_size, 42 | const activation_t* activation = nullptr); 43 | 44 | /** 45 | * apply forward calculation and also apply max pooling for vector input 46 | * @param input input vector 47 | * @return result vector 48 | */ 49 | vector_t forward_max_pool_vec(const vector_map_t& input) const; 50 | 51 | /** 52 | * apply forward calculation and also apply max pooling for matrix input 53 | * @param input input matrix. size: [batch size, imput dim] 54 | * @param batch_size batch size 55 | * @param col_dim column dim (for each batch) 56 | * @return result matrix 57 | */ 58 | matrix_t forward_max_pool_mat(const float* data, int batch_size, int col_dim) const; 59 | 60 | void close(); ///< 리소스를 닫는다. 61 | 62 | private: 63 | std::unique_ptr _weight; ///< weights [out_ch x (in_ch x kernel)] 64 | std::unique_ptr _bias; ///< bias [out_ch x 1] 65 | int _in_ch = 0; ///< input channel dimension 66 | int _out_ch = 0; ///< output chennel dimension 67 | int _kernel_size = 0; ///< kernel size 68 | const activation_t* _activation = nullptr; ///< activation function 69 | 70 | MemMapFile _param_mmf; ///< model parameters map file 71 | }; 72 | 73 | 74 | } // namespace nn 75 | } // namespace khaiii 76 | 77 | 78 | #endif // SRC_MAIN_CPP_KHAIII_NN_CONV1D_HPP_ 79 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/nn/Linear.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/nn/Linear.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | 16 | #include "khaiii/util.hpp" 17 | 18 | 19 | namespace khaiii { 20 | namespace nn { 21 | 22 | 23 | using std::cout; 24 | using std::endl; 25 | using std::make_unique; 26 | using std::string; 27 | using std::vector; 28 | 29 | 30 | //////////////////// 31 | // ctors and dtor // 32 | //////////////////// 33 | Linear::~Linear() { 34 | close(); 35 | } 36 | 37 | 38 | ///////////// 39 | // methods // 40 | ///////////// 41 | void Linear::open(string path, int in_dim, int out_dim, bool has_bias, 42 | const activation_t* activation) { 43 | // Eigen은 column 우선으로 저장합니다. 44 | // 따라서 matrix map의 경우 row, col을 거꾸로 해서 생성한 다음, 45 | // 사용할 때에는 transpose()를 해서 사용해야 합니다. 46 | _param_mmf.open(path); 47 | int size = in_dim * out_dim; 48 | if (has_bias) size += out_dim; 49 | assert(_param_mmf.size() == size); 50 | _weight = make_unique(const_cast(_param_mmf.data()), in_dim, out_dim); 51 | if (has_bias) { 52 | _bias = make_unique(const_cast(_param_mmf.data()) + in_dim * out_dim, 53 | out_dim); 54 | } 55 | _activation = activation; 56 | } 57 | 58 | 59 | /* 60 | #ifndef NDEBUG 61 | void Linear::print_weight() const { 62 | int row = _weight->rows(); 63 | int col = _weight->cols(); 64 | fmt::print("============ weight =============\n"); 65 | fmt::print("Size = ({}, {})\n", row, col); 66 | if (row >= 10 && col >= 10) { 67 | cout << "first [5 * 5] contents" << endl; 68 | cout << _weight->block<5, 5>(0, 0) << endl; 69 | cout << "last [5 * 5] contents" << endl; 70 | cout << _weight->block<5, 5>(row-5, col-5) << endl; 71 | } else { 72 | cout << "contnets" << endl; 73 | cout << *_weight << endl; 74 | } 75 | fmt::print("============ bias =============\n"); 76 | cout << "contnets" << endl; 77 | cout << _bias->head(5) << endl; 78 | cout << "..." << endl; 79 | cout << _bias->tail(5) << endl; 80 | } 81 | #endif 82 | */ 83 | 84 | 85 | void Linear::close() { 86 | _weight.reset(); 87 | _bias.reset(); 88 | _param_mmf.close(); 89 | } 90 | 91 | 92 | } // namespace nn 93 | } // namespace khaiii 94 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/nn/Linear.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_NN_LINEAR_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_NN_LINEAR_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "khaiii/MemMapFile.hpp" 20 | #include "khaiii/nn/tensor.hpp" 21 | #include "spdlog/spdlog.h" 22 | #include "fmt/format.h" 23 | 24 | 25 | namespace khaiii { 26 | namespace nn { 27 | 28 | 29 | /** 30 | * fully connected layer 31 | */ 32 | class Linear { 33 | public: 34 | virtual ~Linear(); 35 | /** 36 | * open layer parameters 37 | * @param path file path 38 | * @param in_dim input dimension 39 | * @param out_dim output dimension 40 | * @param has_bias whether has bias or not 41 | * @param activation activation function 42 | */ 43 | void open(std::string path, int in_dim, int out_dim, bool has_bias, 44 | const activation_t* activation = nullptr); 45 | 46 | void close(); ///< 리소스를 닫는다. 47 | 48 | /** 49 | * apply forward calculation for vector input 50 | * @param input input vector 51 | * @return result vector 52 | */ 53 | template 54 | inline vector_t forward_vec(const T &input) const { 55 | auto without_bias = _weight->transpose() * input; 56 | if (_bias.get() == nullptr) { 57 | if (_activation) return without_bias.unaryExpr(*_activation); 58 | return without_bias; 59 | } 60 | auto with_bias = without_bias + *_bias; 61 | if (_activation) return with_bias.unaryExpr(*_activation); 62 | return with_bias; 63 | } 64 | 65 | /** 66 | * apply forward calculation for matrix input 67 | * @param input input matrix. size: [batch size, input dim] 68 | * @return result matrix 69 | */ 70 | template 71 | inline matrix_t forward_mat(const T& input) const { 72 | auto without_bias = input * *_weight; 73 | if (_bias.get() == nullptr) { 74 | if (_activation) return without_bias.unaryExpr(*_activation); 75 | return without_bias; 76 | } 77 | auto with_bias = without_bias.transpose().colwise() + *_bias; 78 | if (_activation) return with_bias.unaryExpr(*_activation).transpose(); 79 | return with_bias.transpose(); 80 | } 81 | 82 | /* 83 | #ifndef NDEBUG 84 | void print_weight() const; ///< print weights for debugging 85 | #endif 86 | */ 87 | 88 | private: 89 | std::unique_ptr _weight; ///< weights [out x in] 90 | std::unique_ptr _bias; ///< bias [out x 1] 91 | const activation_t* _activation = nullptr; ///< activation function 92 | 93 | MemMapFile _param_mmf; ///< model parameters map file 94 | }; 95 | 96 | 97 | } // namespace nn 98 | } // namespace khaiii 99 | 100 | 101 | #endif // SRC_MAIN_CPP_KHAIII_NN_LINEAR_HPP_ 102 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/nn/tensor.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #include "khaiii/nn/tensor.hpp" 8 | 9 | 10 | ////////////// 11 | // includes // 12 | ////////////// 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | namespace khaiii { 19 | namespace nn { 20 | 21 | 22 | using std::vector; 23 | 24 | 25 | ////////////////////////// 26 | // activation functions // 27 | ////////////////////////// 28 | float relu(float x) { 29 | return std::max(x, 0.0f); 30 | } 31 | activation_t RELU = std::ptr_fun(relu); ///< ReLU function pointer 32 | 33 | 34 | /////////////// 35 | // functions // 36 | /////////////// 37 | void add_positional_enc(float* data, int len, int dim) { 38 | for (int pos = 1; pos <= len; ++pos) { 39 | float pos_ = pos; 40 | for (int i = 1; i <= dim; ++i) { 41 | *data++ += (1.0f - pos_ / len - 42 | ((static_cast(i) / dim) * (1.0f - 2.0f * pos_ / len))); 43 | } 44 | } 45 | } 46 | 47 | 48 | } // namespace nn 49 | } // namespace khaiii 50 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/nn/tensor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_NN_TENSOR_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_NN_TENSOR_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | 16 | #include "Eigen/Dense" 17 | 18 | 19 | namespace khaiii { 20 | namespace nn { 21 | 22 | 23 | /////////// 24 | // types // 25 | /////////// 26 | using matrix_map_t = Eigen::Map; 27 | using vector_map_t = Eigen::Map; 28 | using matrix_t = Eigen::MatrixXf; 29 | using vector_t = Eigen::VectorXf; 30 | 31 | 32 | ////////////////////////// 33 | // activation functions // 34 | ////////////////////////// 35 | typedef std::pointer_to_unary_function activation_t; 36 | extern activation_t RELU; 37 | 38 | 39 | /////////////// 40 | // functions // 41 | /////////////// 42 | /** 43 | * add positional encoding to data(array of floats) 44 | * @param data input data. size: [length x dimension] 45 | * @param len position length 46 | * @param dim embedding dimension 47 | */ 48 | void add_positional_enc(float* data, int len, int dim); 49 | 50 | /** 51 | * add two vector in-place (update left vector) 52 | * @param left vector (will be updated) 53 | * @param right vector 54 | */ 55 | inline void add_vec(float* left, const float* right, int dim) { 56 | assert(dim > 0); 57 | for (; dim > 0; --dim) *left++ += *right++; 58 | } 59 | 60 | 61 | } // namespace nn 62 | } // namespace khaiii 63 | 64 | 65 | #endif // SRC_MAIN_CPP_KHAIII_NN_TENSOR_HPP_ 66 | -------------------------------------------------------------------------------- /src/main/cpp/khaiii/util.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_MAIN_CPP_KHAIII_UTIL_HPP_ 8 | #define SRC_MAIN_CPP_KHAIII_UTIL_HPP_ 9 | 10 | 11 | ////////////// 12 | // includes // 13 | ////////////// 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "boost/locale/encoding_utf.hpp" 23 | 24 | 25 | namespace khaiii { 26 | 27 | 28 | 29 | /////////////// 30 | // functions // 31 | /////////////// 32 | /** 33 | * whether is space or not 34 | * @param chr character 35 | * @return true if character is space 36 | */ 37 | inline bool is_space(wchar_t chr) { 38 | static std::wstring space(L" \t\v\r\n\u3000"); 39 | return space.find(chr) != std::wstring::npos; 40 | } 41 | 42 | 43 | /** 44 | * convert UTF-8 string to wstring 45 | * @param str UTF-8 string 46 | * @return wstring 47 | */ 48 | inline std::wstring utf8_to_wstr(const std::string& str) { 49 | return boost::locale::conv::utf_to_utf(str.c_str(), str.c_str() + str.length()); 50 | } 51 | 52 | 53 | /** 54 | * convert wstring to UTF-8 string 55 | * @param wstr wstring 56 | * @return UTF-8 string 57 | */ 58 | inline std::string wstr_to_utf8(const std::wstring& wstr) { 59 | return boost::locale::conv::utf_to_utf(wstr.c_str(), wstr.c_str() + wstr.length()); 60 | } 61 | 62 | 63 | /** 64 | * string splitter 65 | * @param str string to split 66 | * @param deilm delimiter char 67 | * @return list of splitted strings 68 | */ 69 | inline std::vector split(const std::string& str, char delim) { 70 | std::stringstream sss(str); 71 | std::vector elems; 72 | for (std::string item; std::getline(sss, item, delim); ) { 73 | elems.emplace_back(std::move(item)); 74 | } 75 | return elems; 76 | } 77 | 78 | 79 | /** 80 | * whether file (or directory) exists or not 81 | * @param path path 82 | * @return true if exists 83 | */ 84 | inline bool file_exists(std::string path) { 85 | struct stat st; 86 | return stat(path.c_str(), &st) == 0; 87 | } 88 | 89 | 90 | } // namespace khaiii 91 | 92 | 93 | #endif // SRC_MAIN_CPP_KHAIII_UTIL_HPP_ 94 | -------------------------------------------------------------------------------- /src/main/cpp/main.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | ////////////// 8 | // includes // 9 | ////////////// 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "cxxopts.hpp" 16 | #include "fmt/printf.h" 17 | #ifdef PROFILER 18 | #include "gperftools/profiler.h" 19 | #endif 20 | #include "spdlog/spdlog.h" 21 | 22 | #include "khaiii/KhaiiiApi.hpp" 23 | #include "khaiii/khaiii_dev.h" 24 | 25 | 26 | using std::cerr; 27 | using std::cin; 28 | using std::endl; 29 | using std::ifstream; 30 | using std::ofstream; 31 | using std::string; 32 | 33 | using khaiii::KhaiiiApi; 34 | 35 | 36 | /////////////// 37 | // functions // 38 | /////////////// 39 | int run(const cxxopts::ParseResult& opts) { 40 | auto _log = spdlog::get("console"); 41 | khaiii_set_log_levels(opts["set-log"].as().c_str()); 42 | 43 | auto khaiii_api = KhaiiiApi::create(); 44 | try { 45 | khaiii_api->open(opts["rsc-dir"].as(), opts["opt-str"].as()); 46 | } catch (const khaiii::Except& exc) { 47 | _log->error("fail to open dir: '{}', opt: '{}'", opts["rsc-dir"].as(), 48 | opts["opt-str"].as()); 49 | _log->error(exc.what()); 50 | return 1; 51 | } 52 | 53 | for (string line; getline(cin, line); ) { 54 | _log->debug("sent: {}", line); 55 | const khaiii_word_t* results = nullptr; 56 | try { 57 | results = khaiii_api->analyze(line.c_str(), ""); 58 | } catch (const khaiii::Except& exc) { 59 | _log->warn("{}: {}", exc.what(), line); 60 | continue; 61 | } 62 | for (auto word = results; word != nullptr; word = word->next) { 63 | fmt::print("{}\t", line.substr(word->begin, word->length)); 64 | const khaiii_morph_t* morphs = word->morphs; 65 | for (auto morph = morphs; morph != nullptr; morph = morph->next) { 66 | if (morph != morphs) fmt::print(" + "); 67 | fmt::print("{}/{}", morph->lex, morph->tag); 68 | } 69 | fmt::print("\n"); 70 | } 71 | fmt::print("\n"); 72 | khaiii_api->free_results(results); 73 | } 74 | 75 | return 0; 76 | } 77 | 78 | 79 | ////////// 80 | // main // 81 | ////////// 82 | int main(int argc, char** argv) { 83 | auto _log = spdlog::stderr_color_mt("console"); 84 | spdlog::set_level(spdlog::level::warn); 85 | 86 | cxxopts::Options options("khaiii", "analyze with khaiii"); 87 | options.add_options() 88 | ("h,help", "print this help") 89 | ("rsc-dir", "resource directory", cxxopts::value()->default_value("")) 90 | ("opt-str", "option (JSON format)", cxxopts::value()->default_value("")) 91 | ("input", "input file (default: stdin)", cxxopts::value()) 92 | ("output", "output file (default: stdout)", cxxopts::value()) 93 | ("set-log", "set log level", cxxopts::value()->default_value("all:info")); 94 | auto opts = options.parse(argc, argv); 95 | 96 | if (opts.count("help")) { 97 | fmt::fprintf(cerr, "%s\n", options.help()); 98 | return 0; 99 | } 100 | if (opts.count("input")) { 101 | string path = opts["input"].as(); 102 | ifstream fin(path); 103 | if (!fin.good()) { 104 | _log->error("input file not found: {}", path); 105 | return 1; 106 | } 107 | if (freopen(path.c_str(), "r", stdin) == nullptr) { 108 | _log->error("fail to open input file: {}", path); 109 | return 2; 110 | } 111 | } 112 | if (opts.count("output")) { 113 | string path = opts["output"].as(); 114 | if (freopen(path.c_str(), "w", stdout) == nullptr) { 115 | _log->error("fail to open output file: {}", path); 116 | return 3; 117 | } 118 | } 119 | 120 | #ifdef PROFILER 121 | ProfilerStart("/tmp/bin_khaiii.prof"); 122 | #endif 123 | 124 | int ret = run(opts); 125 | 126 | #ifdef PROFILER 127 | ProfilerStop(); 128 | #endif 129 | 130 | return ret; 131 | } 132 | -------------------------------------------------------------------------------- /src/main/python/MANIFEST.in.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | recursive-include khaiii * 4 | include @CPACK_SOURCE_PACKAGE_FILE_NAME@.tar.gz 5 | -------------------------------------------------------------------------------- /src/main/python/khaiii/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/main/python/khaiii/__init__.py -------------------------------------------------------------------------------- /src/main/python/khaiii/__init__.py.in: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | @CPACK_PACKAGE_DESCRIPTION_SUMMARY@ 6 | 7 | __version__ = '@KHAIII_VERSION@' 8 | __author__ = '@CPACK_PACKAGE_VENDOR@' 9 | __copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' 10 | __license__ = 'Apache 2.0' 11 | __maintainer__ = 'Jamie' 12 | __email__ = 'jamie.lim@kakaocorp.com' 13 | """ 14 | 15 | 16 | from .khaiii import KhaiiiApi, KhaiiiExcept, KhaiiiMorph, KhaiiiWord 17 | -------------------------------------------------------------------------------- /src/main/python/khaiii/munjong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/main/python/khaiii/munjong/__init__.py -------------------------------------------------------------------------------- /src/main/python/khaiii/resource/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/main/python/khaiii/resource/__init__.py -------------------------------------------------------------------------------- /src/main/python/khaiii/resource/morphs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | 형태소 분석 결과를 기술한 문자열을 파싱하는 모듈. 6 | TODO(jamie): sejong_corpus 모듈의 Morph 클래스와 중복되므로 정리 필요 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from typing import List 16 | 17 | 18 | ############# 19 | # constants # 20 | ############# 21 | # 전체 태그셋. 숫자 -> 태그 매핑 22 | TAGS = sorted(['EC', 'EF', 'EP', 'ETM', 'ETN', 'IC', 'JC', 'JKB', 'JKC', 'JKG', 23 | 'JKO', 'JKQ', 'JKS', 'JKV', 'JX', 'MAG', 'MAJ', 'MM', 'NNB', 'NNG', 24 | 'NNP', 'NP', 'NR', 'SE', 'SF', 'SH', 'SL', 'SN', 'SO', 'SP', 25 | 'SS', 'SW', 'SWK', 'VA', 'VCN', 'VCP', 'VV', 'VX', 'XPN', 'XR', 26 | 'XSA', 'XSN', 'XSV', 'ZN', 'ZV', 'ZZ', ]) 27 | # B- 태그가 가능한 태그 목록 28 | B_TAGS = sorted(['EP', 'IC', 'JKB', 'JX', 'MAG', 'MM', 'NNB', 'NNG', 'NNP', 'NP', 29 | 'NR', 'SE', 'SF', 'SN', 'SO', 'SP', 'SS', 'SW', 'SWK', 'XPN', 30 | 'XR', 'XSN', ]) 31 | TAG_SET = {tag: num for num, tag in enumerate(TAGS, start=1)} # 태그 -> 숫자 매핑 32 | 33 | WORD_DELIM_STR = '_' # 어절 경계(공백)를 나타내는 가상 형태소 34 | SENT_DELIM_STR = '|' # 문장 경계를 나타내는 가상 형태소 35 | WORD_DELIM_NUM = -1 # 어절 경계 가상 태그 번호 36 | SENT_DELIM_NUM = -2 # 문장 경계 가상 태그 번호 37 | 38 | 39 | ######### 40 | # types # 41 | ######### 42 | class ParseError(Exception): 43 | """ 44 | 형태소 분석 결과 문자열을 파싱하면서 발생하는 오류 45 | """ 46 | 47 | 48 | class Morph: 49 | """ 50 | 형태소 51 | """ 52 | def __init__(self, lex: str, tag: str): 53 | """ 54 | Arguments: 55 | lex: 형태소(어휘) 56 | tag: 품사 태그 57 | """ 58 | self.lex = lex 59 | self.tag = tag 60 | 61 | def __str__(self): 62 | if not self.tag: 63 | return self.lex 64 | return '{}/{}'.format(self.lex, self.tag) 65 | 66 | def is_word_delim(self) -> bool: 67 | """ 68 | 어절의 경계를 나타태는 지 여부 69 | Returns: 70 | 어절의 경계 여부 71 | """ 72 | return not self.tag and self.lex == WORD_DELIM_STR 73 | 74 | def is_sent_delim(self) -> bool: 75 | """ 76 | 문장의 경계를 나타태는 지 여부 77 | Returns: 78 | 문장의 경계 여부 79 | """ 80 | return not self.tag and self.lex == SENT_DELIM_STR 81 | 82 | @classmethod 83 | def to_str(cls, morphs: List['Morph']) -> str: 84 | """ 85 | Morph 객체 리스트를 문자열로 변환한다. 86 | Arguments: 87 | morphs: Morph 객체 리스트 88 | Returns: 89 | 변환된 문자열 90 | """ 91 | return ' + '.join([str(m) for m in morphs]) 92 | 93 | @classmethod 94 | def parse(cls, morphs_str: str) -> List['Morph']: 95 | """ 96 | 형태소 분석 결과 형태의 문자열을 파싱하여 Morph 객체 리스트를 반환하는 파싱 함수 97 | Arguments: 98 | morphs_str: 형태소 분석 결과 문자열. 예: "제이미/NNP + 는/JKS" 99 | Returns: 100 | Morph 객체 리스트 101 | """ 102 | if not morphs_str: 103 | raise ParseError('empty to parse') 104 | return [cls._parse_one(m) for m in morphs_str.split(' + ')] 105 | 106 | @classmethod 107 | def _parse_one(cls, morph_str: str) -> 'Morph': 108 | """ 109 | 하나의 형태소 객체를 기술한 문자열을 파싱한다. 110 | Arguments: 111 | morph_str: 형태소 문자열 112 | Returns: 113 | Morph 객체 114 | """ 115 | if ' ' in morph_str: 116 | raise ParseError('space in morph') 117 | try: 118 | if morph_str in [WORD_DELIM_STR, SENT_DELIM_STR]: 119 | return Morph(morph_str, '') 120 | lex, tag = morph_str.rsplit('/', 1) 121 | except ValueError: 122 | raise ParseError('invalid morpheme string format') 123 | if not lex: 124 | raise ParseError('no lexical in morpheme string') 125 | if not tag: 126 | raise ParseError('no pos tag in morpheme string') 127 | if tag not in TAG_SET: 128 | raise ParseError('invalid pos tag: {}'.format(tag)) 129 | return Morph(lex, tag) 130 | 131 | 132 | ############# 133 | # functions # 134 | ############# 135 | def mix_char_tag(chars: str, tags: List[int]) -> List[int]: 136 | """ 137 | 음절과 출력 태그를 비트 연산으로 합쳐서 하나의 (32비트) 숫자로 표현한다. 138 | Args: 139 | chars: 음절 (유니코드) 리스트 (문자열) 140 | tags: 출력 태그 번호의 리스트 141 | Returns: 142 | 합쳐진 숫자의 리스트 143 | """ 144 | char_nums = [ord(c) for c in chars] 145 | if tags[0] == SENT_DELIM_NUM: 146 | char_nums.insert(0, SENT_DELIM_NUM) 147 | if tags[-1] == SENT_DELIM_NUM: 148 | char_nums.append(SENT_DELIM_NUM) 149 | for idx, char_num in enumerate(char_nums): 150 | if char_num == ord(' '): 151 | char_nums[idx] = WORD_DELIM_NUM 152 | continue 153 | elif tags[idx] == SENT_DELIM_NUM: 154 | continue 155 | char_nums[idx] = char_num << 12 | tags[idx] 156 | return char_nums 157 | -------------------------------------------------------------------------------- /src/main/python/khaiii/resource/resource.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | resources for training and tagging 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 8 | """ 9 | 10 | 11 | ########### 12 | # imports # 13 | ########### 14 | from argparse import Namespace 15 | from collections import defaultdict 16 | import logging 17 | import os 18 | from typing import Dict, Tuple 19 | 20 | from khaiii.resource.vocabulary import Vocabulary 21 | 22 | 23 | ############# 24 | # constants # 25 | ############# 26 | UNK_CHR = '@@UNKNOWN@@' 27 | SPECIAL_CHARS = ['', ''] # begin/end of word 28 | 29 | 30 | ######### 31 | # types # 32 | ######### 33 | class Resource: 34 | """ 35 | resources 36 | """ 37 | def __init__(self, cfg: Namespace): 38 | """ 39 | Args: 40 | cfg: config 41 | """ 42 | vocab_in_path = '{}/vocab.in'.format(cfg.rsc_src) 43 | self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, UNK_CHR, SPECIAL_CHARS) 44 | vocab_out_path = '{}/vocab.out'.format(cfg.rsc_src) 45 | self.vocab_out = Vocabulary(vocab_out_path) # no unknown, no special 46 | restore_dic_path = '{}/restore.dic'.format(cfg.rsc_src) 47 | self.restore_dic = self.load_restore_dic(restore_dic_path) 48 | 49 | @classmethod 50 | def load_restore_dic(cls, path: str) -> Dict[str, str]: 51 | """ 52 | load character to output tag mapping 53 | Args: 54 | path: file path 55 | Returns: 56 | dictionary 57 | """ 58 | dic = {} 59 | for line in open(path, 'r', encoding='UTF-8'): 60 | line = line.rstrip('\r\n') 61 | if not line: 62 | continue 63 | key, val = line.split('\t') 64 | dic[key] = val 65 | logging.info('%s: %d entries', os.path.basename(path), len(dic)) 66 | return dic 67 | 68 | 69 | ############# 70 | # functions # 71 | ############# 72 | def parse_restore_dic(file_path: str) -> Dict[Tuple[str, str], Dict[int, str]]: 73 | """ 74 | 원형복원 사전을 로드한다. 75 | Args: 76 | file_path: 파일 경로 77 | Returns: 78 | 사전 79 | """ 80 | file_name = os.path.basename(file_path) 81 | restore_dic = defaultdict(dict) 82 | for line_num, line in enumerate(open(file_path, 'r', encoding='UTF-8'), start=1): 83 | line = line.rstrip() 84 | if not line or line[0] == '#': 85 | continue 86 | char_tag_num, mrp_chr_str = line.split('\t') 87 | char, tag_num = char_tag_num.rsplit('/', 1) 88 | tag, num = tag_num.rsplit(':', 1) 89 | num = int(num) 90 | if (char, tag) in restore_dic: 91 | num_mrp_chrs_dic = restore_dic[char, tag] 92 | if num in num_mrp_chrs_dic: 93 | logging.error('%s:%d: duplicated with %s: %s', file_name, line_num, 94 | num_mrp_chrs_dic[num], line) 95 | return {} 96 | restore_dic[char, tag][num] = mrp_chr_str 97 | return restore_dic 98 | 99 | 100 | def load_vocab_out(rsc_src: str) -> Dict[str, int]: 101 | """ 102 | 출력 태그 vocabulary를 로드한다. 103 | Args: 104 | rsc_src: 리소스 디렉토리 105 | Returns: 106 | 출력 태그 vocabulary 107 | """ 108 | file_path = '{}/vocab.out'.format(rsc_src) 109 | vocab_out = [line.strip() for line in open(file_path, 'r', encoding='UTF-8') 110 | if line.strip()] 111 | vocab_out_more = [] 112 | file_path = '{}/vocab.out.more'.format(rsc_src) 113 | if os.path.exists(file_path): 114 | vocab_out_more = [line.strip() for line in open(file_path, 'r', encoding='UTF-8') 115 | if line.strip()] 116 | return {tag: idx for idx, tag in enumerate(vocab_out + vocab_out_more, start=1)} 117 | -------------------------------------------------------------------------------- /src/main/python/khaiii/resource/vocabulary.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | vocabulary library 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 8 | """ 9 | 10 | 11 | ########### 12 | # imports # 13 | ########### 14 | import logging 15 | import os 16 | from typing import List 17 | 18 | 19 | ######### 20 | # types # 21 | ######### 22 | class Vocabulary: 23 | """ 24 | vocabulary class 25 | """ 26 | def __init__(self, path: str, cutoff: int = 1, unk: str = '', special: List[str] = None): 27 | """ 28 | padding index is always 0. None and '' get padding index. 29 | if `unk` is given (such as input vocab), its index is always 1. 30 | if `unk` is not given (such as output vocab), an exception will be thrown for unknown entry 31 | Args: 32 | path: file path 33 | cutoff: cutoff frequency 34 | unk: unknown(OOV) entry 35 | special: special entries located at the first 36 | """ 37 | self.dic = {} # {entry: number} dictionary 38 | self.unk = unk 39 | self.rev = ['', unk] if unk else [] # reverse dictionary 40 | if special: 41 | self.rev.extend(special) 42 | for num, entry in enumerate(self.rev): 43 | self.dic[entry] = num 44 | self._load(path, cutoff) 45 | assert len(self.dic) == len(self.rev) 46 | 47 | def __getitem__(self, key): 48 | """ 49 | Args: 50 | key: key 51 | Returns: 52 | word number for string key, word for int key 53 | """ 54 | if isinstance(key, int): 55 | return self.rev[key] 56 | try: 57 | return self.dic[key] 58 | except KeyError as key_err: 59 | if self.unk: 60 | return self.dic[self.unk] 61 | raise key_err 62 | 63 | def __len__(self): 64 | return len(self.dic) 65 | 66 | def _load(self, path: str, cutoff: int = 1): 67 | """ 68 | load vocabulary from file 69 | Args: 70 | path: file path 71 | cutoff: cutoff frequency 72 | """ 73 | append_num = 0 74 | cutoff_num = 0 75 | for line in open(path, 'r', encoding='UTF-8'): 76 | line = line.rstrip('\r\n') 77 | if not line: 78 | continue 79 | try: 80 | entry, freq = line.split('\t') 81 | if int(freq) <= cutoff: 82 | cutoff_num += 1 83 | continue 84 | except ValueError: 85 | entry = line 86 | if entry in self.dic: 87 | cutoff_num += 1 88 | continue 89 | self.dic[entry] = len(self.dic) 90 | self.rev.append(entry) 91 | append_num += 1 92 | logging.info('%s: %d entries, %d cutoff', os.path.basename(path), append_num, cutoff_num) 93 | -------------------------------------------------------------------------------- /src/main/python/khaiii/train/embedder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | making embedding models 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 8 | """ 9 | 10 | 11 | ########### 12 | # imports # 13 | ########### 14 | from argparse import Namespace 15 | import math 16 | 17 | import torch 18 | from torch import nn, Tensor 19 | 20 | from khaiii.resource.resource import Resource 21 | 22 | 23 | class Embedder(nn.Module): 24 | """ 25 | embedder class 26 | """ 27 | def __init__(self, cfg: Namespace, rsc: Resource): 28 | """ 29 | Args: 30 | cfg: config 31 | rsc: Resource object 32 | """ 33 | super().__init__() 34 | self.cfg = cfg 35 | self.rsc = rsc 36 | self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0) 37 | 38 | def forward(self, *inputs): # pylint: disable=arguments-differ 39 | """ 40 | 임베딩을 생성하는 메소드 41 | Args: 42 | inputs: batch size list of (context, left space mask, right space mask) 43 | Returns: 44 | embedding 45 | """ 46 | contexts, left_spc_masks, right_spc_masks = inputs 47 | embeds = self.embedding(contexts) 48 | if left_spc_masks is not None: 49 | embeds += self.embedding(left_spc_masks) 50 | if right_spc_masks is not None: 51 | embeds += self.embedding(right_spc_masks) 52 | # 왼쪽과 오른쪽 패딩에는 zero 벡터인데 아래 positional encoding이 더해짐 53 | # 사소하지만 아래도 패딩 영역에 대해 마스킹 후 더해줘야 하지 않을까? 54 | embeds += positional_encoding(self.cfg.context_len, self.cfg.context_len, 55 | self.cfg.embed_dim, 1, self.cfg.gpu_num) 56 | return embeds 57 | 58 | 59 | ############# 60 | # functions # 61 | ############# 62 | def memoize(func): 63 | """ 64 | memoize decorator 65 | """ 66 | class Memodict(dict): 67 | """ 68 | Memoization decorator for a function taking one or more arguments. 69 | """ 70 | def __getitem__(self, *key): 71 | return dict.__getitem__(self, key) 72 | 73 | def __missing__(self, key): 74 | ret = self[key] = func(*key) 75 | return ret 76 | 77 | return Memodict().__getitem__ 78 | 79 | 80 | @memoize 81 | def positional_encoding(sent_len: int, max_dim: int, embed_dim: int, method: int = 1, 82 | gpu_num: int = -1) -> Tensor: 83 | """ 84 | positional encoding Tensor 출력. 85 | embeds [batch_size, context_len, embed_dim]에 Broadcasting 으로 더해짐 86 | Args: 87 | sent_len: actual sentence length 88 | max_dim: maximum dimension 89 | embed_dim: embedding dimension 90 | method: method number (1. end-to-end memory networks or 2. attention is all you need) 91 | gpu_num: GPU device number. default: -1 for CPU 92 | Returns: 93 | pe [context_len, embed_dim] 94 | """ 95 | device = gpu_num if gpu_num >= 0 else None 96 | pe_tensor = torch.zeros([max_dim, embed_dim], device=device) # pylint: disable=no-member 97 | for pos in range(1, sent_len + 1): 98 | for i in range(1, embed_dim+1): 99 | if method == 1: 100 | # end-to-end memory networks 101 | pe_tensor[pos-1, i-1] = 1 - pos / sent_len - ((i / embed_dim) * 102 | (1 - 2 * pos / sent_len)) 103 | elif method == 2: 104 | # attention is all you need 105 | if i % 2 == 0: 106 | pe_tensor[pos-1, i-1] = math.sin(pos / 10000 ** (2*i / embed_dim)) 107 | else: 108 | pe_tensor[pos-1, i-1] = math.cos(pos / 10000 ** (2*i / embed_dim)) 109 | pe_tensor.detach() 110 | return pe_tensor 111 | -------------------------------------------------------------------------------- /src/main/python/khaiii/train/evaluator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | evaluation related module 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 8 | """ 9 | 10 | 11 | ########### 12 | # imports # 13 | ########### 14 | from collections import Counter 15 | import logging 16 | from typing import List, TextIO, Tuple 17 | 18 | from khaiii.train.sentence import PosMorph, PosSentence, PosWord 19 | 20 | 21 | ######### 22 | # types # 23 | ######### 24 | class Evaluator: 25 | """ 26 | evauator 27 | """ 28 | def __init__(self): 29 | self.cnt = Counter() 30 | 31 | def evaluate(self) -> Tuple[float, float, float]: 32 | """ 33 | char/word accuracy, f-score(recall/precision)를 측정한다. 34 | Returns: 35 | character accuracy 36 | word accuracy 37 | f-score 38 | """ 39 | char_acc = self.cnt['match_chars'] / self.cnt['total_chars'] 40 | word_acc = self.cnt['match_words'] / self.cnt['total_words'] 41 | if self.cnt['match_morphs'] == 0: 42 | recall = precision = f_score = 0.0 43 | else: 44 | recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs'] 45 | precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs'] 46 | f_score = 2.0 * recall * precision / (recall + precision) 47 | self.cnt.clear() 48 | return char_acc, word_acc, f_score 49 | 50 | def count(self, correct_sent: PosSentence, predict_sent: PosSentence): 51 | """ 52 | 정답 문장과 비교하여 맞춘 갯수를 샌다. 53 | Args: 54 | correct_sent: 정답 문장 55 | predict_sent: 예측한 문장 56 | """ 57 | assert len(correct_sent.words) == len(predict_sent.words) 58 | for gold, pred in zip(correct_sent.pos_tagged_words, predict_sent.pos_tagged_words): 59 | self.cnt['total_chars'] += len(gold.res_tags) 60 | self.cnt['match_chars'] += len([1 for x, y in zip(gold.res_tags, pred.res_tags) 61 | if x == y]) 62 | self._count_word(gold, pred) 63 | 64 | def _count_word(self, gold: PosWord, pred: PosWord): 65 | """ 66 | count with gold standard and predicted (will update counter) 67 | Args: 68 | gold: gold standard word 69 | pred: predicted word 70 | """ 71 | self.cnt['total_words'] += 1 72 | gold_morphs = gold.pos_tagged_morphs 73 | pred_morphs = pred.pos_tagged_morphs 74 | if gold == pred: 75 | self.cnt['match_words'] += 1 76 | num_match = len(gold_morphs) 77 | self.cnt['total_gold_morphs'] += num_match 78 | self.cnt['total_pred_morphs'] += num_match 79 | self.cnt['match_morphs'] += num_match 80 | return 81 | logging.debug('gold: %s', ' '.join([str(_) for _ in gold_morphs])) 82 | logging.debug('pred: %s', ' '.join([str(_) for _ in pred_morphs])) 83 | self.cnt['total_gold_morphs'] += len(gold_morphs) 84 | self.cnt['total_pred_morphs'] += len(pred_morphs) 85 | gold_set = self.morphs_to_set(gold_morphs) 86 | pred_set = self.morphs_to_set(pred_morphs) 87 | self.cnt['match_morphs'] += len(gold_set & pred_set) 88 | 89 | @classmethod 90 | def morphs_to_set(cls, morphs: List[PosMorph]) -> set: 91 | """ 92 | make set from morpheme list 93 | Args: 94 | morphs: morpheme list 95 | Returns: 96 | morphemes set 97 | """ 98 | morph_cnt = Counter([(morph.morph, morph.pos_tag) for morph in morphs]) 99 | morph_set = set() 100 | for (lex, tag), freq in morph_cnt.items(): 101 | if freq == 1: 102 | morph_set.add((lex, tag)) 103 | else: 104 | morph_set.update([(lex, tag, _) for _ in range(1, freq+1)]) 105 | return morph_set 106 | 107 | def report(self, fout: TextIO): 108 | """ 109 | report recall/precision to file 110 | Args: 111 | fout: output file 112 | """ 113 | print('word accuracy: %d / %d = %.4f' % (self.cnt['match_words'], self.cnt['total_words'], 114 | self.cnt['match_words'] / self.cnt['total_words']), 115 | file=fout) 116 | if self.cnt['match_morphs'] == 0: 117 | recall = precision = f_score = 0.0 118 | else: 119 | recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs'] 120 | precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs'] 121 | f_score = 2.0 * recall * precision / (recall + precision) 122 | print('f-score / (recall, precision): %.4f / (%.4f, %.4f)' % (f_score, recall, precision), 123 | file=fout) 124 | -------------------------------------------------------------------------------- /src/main/python/khaiii/train/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | Pytorch models 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 8 | """ 9 | 10 | 11 | ########### 12 | # imports # 13 | ########### 14 | from argparse import Namespace 15 | 16 | import torch 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | 20 | from khaiii.resource.resource import Resource 21 | from khaiii.train.embedder import Embedder 22 | 23 | 24 | ######### 25 | # types # 26 | ######### 27 | class ConvLayer(nn.Module): 28 | """ 29 | 형태소 태깅 모델과 띄어쓰기 모델이 공유하는 컨볼루션 레이어 30 | """ 31 | def __init__(self, cfg: Namespace, rsc: Resource): 32 | """ 33 | Args: 34 | cfg: config 35 | rsc: Resource object 36 | """ 37 | super().__init__() 38 | self.embedder = Embedder(cfg, rsc) 39 | ngram = min(5, cfg.window * 2 + 1) 40 | self.convs = nn.ModuleList([nn.Conv1d(cfg.embed_dim, cfg.embed_dim, kernel_size) 41 | for kernel_size in range(2, ngram+1)]) 42 | 43 | def forward(self, *inputs): 44 | embeds = self.embedder(*inputs) 45 | embeds_t = embeds.transpose(1, 2) 46 | pool_outs = [] 47 | for conv in self.convs: 48 | conv_out = F.relu(conv(embeds_t)) 49 | pool_outs.append(F.max_pool1d(conv_out, conv_out.size(2))) 50 | features = torch.cat([p.view(embeds.size(0), -1) for p in pool_outs], dim=1) # pylint: disable=no-member 51 | return features 52 | 53 | 54 | class HiddenLayer(nn.Module): 55 | """ 56 | 형태소 태깅 모델과 띄어쓰기 모델이 각각 학습하는 히든 레이어 57 | """ 58 | def __init__(self, cfg: Namespace, rsc: Resource, conv_layer_len: int, is_spc: bool): 59 | """ 60 | Args: 61 | cfg: config 62 | rsc: Resource object 63 | conv_layer_len: convolution 레이어의 n-gram 타입 갯수 64 | is_spc: 띄어쓰기 모델 여부 65 | """ 66 | super().__init__() 67 | setattr(cfg, 'hidden_dim', 68 | (cfg.embed_dim * conv_layer_len + len(rsc.vocab_out)) // 2) 69 | feature_dim = cfg.embed_dim * conv_layer_len 70 | tag_dim = 2 if is_spc else len(rsc.vocab_out) 71 | self.layers = nn.ModuleList([nn.Linear(feature_dim, cfg.hidden_dim), 72 | nn.Linear(cfg.hidden_dim, tag_dim)]) 73 | 74 | def forward(self, features): # pylint: disable=arguments-differ 75 | # feature => hidden 76 | features_drop = F.dropout(features) 77 | hidden_out = F.relu(self.layers[0](features_drop)) 78 | # hidden => tag 79 | hidden_out_drop = F.dropout(hidden_out) 80 | tag_out = self.layers[1](hidden_out_drop) 81 | return tag_out 82 | 83 | 84 | class Model(nn.Module): 85 | """ 86 | 형태소 태깅 모델, 띄어쓰기 모델 87 | """ 88 | def __init__(self, cfg: Namespace, rsc: Resource): 89 | """ 90 | Args: 91 | cfg: config 92 | rsc: Resource object 93 | """ 94 | super().__init__() 95 | self.cfg = cfg 96 | self.rsc = rsc 97 | self.conv_layer = ConvLayer(cfg, rsc) 98 | self.hidden_layer_pos = HiddenLayer(cfg, rsc, len(self.conv_layer.convs), is_spc=False) 99 | self.hidden_layer_spc = HiddenLayer(cfg, rsc, len(self.conv_layer.convs), is_spc=True) 100 | 101 | def forward(self, *inputs): 102 | contexts, left_spc_masks, right_spc_masks = inputs 103 | features_pos = self.conv_layer(contexts, left_spc_masks, right_spc_masks) 104 | features_spc = self.conv_layer(contexts, None, None) 105 | logits_pos = self.hidden_layer_pos(features_pos) 106 | logits_spc = self.hidden_layer_spc(features_spc) 107 | return logits_pos, logits_spc 108 | 109 | def save(self, path: str): 110 | """ 111 | 모델을 저장하는 메소드 112 | Args: 113 | path: 경로 114 | """ 115 | torch.save(self.state_dict(), path) 116 | 117 | def load(self, path: str): 118 | """ 119 | 저장된 모델을 로드하는 메소드 120 | Args: 121 | path: 경로 122 | conv_layer: convolution layer 123 | """ 124 | state_dict = torch.load(path, map_location=lambda storage, loc: storage) 125 | self.load_state_dict(state_dict) 126 | if torch.cuda.is_available() and self.cfg.gpu_num >= 0: 127 | self.cuda(device=self.cfg.gpu_num) 128 | -------------------------------------------------------------------------------- /src/main/python/khaiii/train/tagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | part-of-speech tagger 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 8 | """ 9 | 10 | 11 | ########### 12 | # imports # 13 | ########### 14 | from argparse import Namespace 15 | import json 16 | import logging 17 | import re 18 | 19 | import torch.nn.functional as F 20 | 21 | from khaiii.resource.resource import Resource 22 | from khaiii.train.dataset import PosSentTensor 23 | from khaiii.train.models import Model 24 | 25 | 26 | ######### 27 | # types # 28 | ######### 29 | class PosTagger: 30 | """ 31 | part-of-speech tagger 32 | """ 33 | def __init__(self, model_dir: str, gpu_num: int = -1): 34 | """ 35 | Args: 36 | model_dir: model dir 37 | gpu_num: GPU number to override 38 | """ 39 | cfg_dict = json.load(open('{}/config.json'.format(model_dir), 'r', encoding='UTF-8')) 40 | self.cfg = Namespace() 41 | for key, val in cfg_dict.items(): 42 | setattr(self.cfg, key, val) 43 | setattr(self.cfg, 'gpu_num', gpu_num) 44 | self.rsc = Resource(self.cfg) 45 | self.model = Model(self.cfg, self.rsc) 46 | self.model.load('{}/model.state'.format(model_dir)) 47 | self.model.eval() 48 | 49 | def tag_raw(self, raw_sent: str, enable_restore: bool = True) -> PosSentTensor: 50 | """ 51 | part-of-speech tagging at raw sentence 52 | Args: 53 | raw_sent: raw input sentence 54 | Returns: 55 | PosSentTensor object 56 | """ 57 | pos_sent = PosSentTensor(raw_sent) 58 | contexts = pos_sent.get_contexts(self.cfg, self.rsc) 59 | left_spc_masks, right_spc_masks = pos_sent.get_spc_masks(self.cfg, self.rsc, False) 60 | outputs, _ = self.model(PosSentTensor.to_tensor(contexts, self.cfg.gpu_num), # pylint: disable=no-member 61 | PosSentTensor.to_tensor(left_spc_masks, self.cfg.gpu_num), # pylint: disable=no-member 62 | PosSentTensor.to_tensor(right_spc_masks, self.cfg.gpu_num)) # pylint: disable=no-member 63 | _, predicts = F.softmax(outputs, dim=1).max(1) 64 | tags = [self.rsc.vocab_out[t.item()] for t in predicts] 65 | pos_sent.set_pos_result(tags, self.rsc.restore_dic if enable_restore else None) 66 | 67 | if logging.getLogger().isEnabledFor(logging.DEBUG): 68 | raw_nospc = re.sub(r'\s+', '', raw_sent) 69 | for idx, (tag, pred) in enumerate(zip(tags, predicts)): 70 | logging.debug('[%2d]%s: %5s(%d)', idx, raw_nospc[idx], tag, pred.data[0]) 71 | 72 | return pos_sent 73 | -------------------------------------------------------------------------------- /src/main/python/setup.py.in: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | @CPACK_PACKAGE_DESCRIPTION_SUMMARY@ 6 | 7 | __version__ = '@KHAIII_VERSION@' 8 | __author__ = '@CPACK_PACKAGE_VENDOR@' 9 | __copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' 10 | __license__ = 'Apache 2.0' 11 | __maintainer__ = 'Jamie' 12 | __email__ = 'jamie.lim@kakaocorp.com' 13 | """ 14 | 15 | 16 | ########### 17 | # imports # 18 | ########### 19 | from distutils.command.build import build 20 | import os 21 | import shutil 22 | import subprocess 23 | import zipfile 24 | 25 | from setuptools import setup 26 | 27 | 28 | ############# 29 | # constants # 30 | ############# 31 | _SRC_NAME = '@CPACK_SOURCE_PACKAGE_FILE_NAME@' 32 | 33 | 34 | ######### 35 | # types # 36 | ######### 37 | class CustomBuild(build): 38 | """ 39 | custom handler for 'build' command 40 | """ 41 | def run(self): 42 | """ 43 | run build command 44 | """ 45 | with zipfile.ZipFile('{}.zip'.format(_SRC_NAME), 'r') as src_zip: 46 | src_zip.extractall() 47 | build_dir = '{}/build'.format(_SRC_NAME) 48 | os.makedirs(build_dir, exist_ok=True) 49 | subprocess.check_call('cmake ..', cwd=build_dir, shell=True) 50 | subprocess.check_call('make all resource', cwd=build_dir, shell=True) 51 | shutil.rmtree('khaiii/lib', ignore_errors=True) 52 | shutil.copytree('{}/lib'.format(build_dir), 'khaiii/lib') 53 | shutil.rmtree('khaiii/share', ignore_errors=True) 54 | shutil.copytree('{}/share'.format(build_dir), 'khaiii/share') 55 | shutil.rmtree(_SRC_NAME) 56 | build.run(self) 57 | 58 | 59 | ############# 60 | # functions # 61 | ############# 62 | def readme(): 63 | """ 64 | read content from README.md file 65 | Returns: 66 | long description (content of README.md) 67 | """ 68 | return open('@CMAKE_SOURCE_DIR@/README.md', 'r', encoding='UTF-8').read() 69 | 70 | 71 | ######### 72 | # setup # 73 | ######### 74 | setup( 75 | name='khaiii', 76 | version='@KHAIII_VERSION@', 77 | description='@CPACK_PACKAGE_DESCRIPTION_SUMMARY@', 78 | long_description=readme(), 79 | url='https://github.com/kakao/khaiii', 80 | author='@CPACK_PACKAGE_VENDOR@', 81 | author_email='jamie.lim@kakaocorp.com', 82 | classifiers=[ 83 | 'Development Status :: 5 - Stable', 84 | 'License :: OSI Approved :: Apache 2.0', 85 | 'Programming Language :: Python :: 3', 86 | ], 87 | license='Apache 2.0', 88 | packages=['khaiii', ], 89 | include_package_data=True, 90 | install_requires=[], 91 | setup_requires=['pytest-runner', ], 92 | tests_require=['pytest', ], 93 | zip_safe=False, 94 | cmdclass={'build': CustomBuild} 95 | ) 96 | -------------------------------------------------------------------------------- /src/test/cpp/khaiii/ErrPatchTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | 8 | ////////////// 9 | // includes // 10 | ////////////// 11 | #include 12 | #include 13 | #include 14 | 15 | #include "cxxopts.hpp" 16 | #include "gtest/gtest.h" 17 | #include "spdlog/spdlog.h" 18 | 19 | #include "khaiii/ErrPatch.hpp" 20 | #include "khaiii/KhaiiiApi.hpp" 21 | #include "khaiii/Word.hpp" 22 | #include "khaiii/util.hpp" 23 | 24 | 25 | /////////////// 26 | // variables // 27 | /////////////// 28 | extern cxxopts::ParseResult* prog_args; // arguments passed to main program 29 | 30 | 31 | namespace khaiii { 32 | 33 | 34 | using std::make_shared; 35 | using std::ostringstream; 36 | using std::pair; 37 | using std::shared_ptr; 38 | using std::string; 39 | using std::vector; 40 | using std::wstring; 41 | 42 | 43 | ////////////////// 44 | // test fixture // 45 | ////////////////// 46 | class ErrPatchTest: public testing::Test { 47 | public: 48 | virtual void SetUp() { 49 | std::string rsc_dir = (*prog_args)["rsc-dir"].as(); 50 | ASSERT_NO_THROW(_khaiii_api->open(rsc_dir, "{\"errpatch\": false}")); 51 | } 52 | 53 | virtual void TearDown() { 54 | ASSERT_NO_THROW(_khaiii_api->close()); 55 | } 56 | 57 | protected: 58 | static shared_ptr _log; ///< logger 59 | 60 | shared_ptr _khaiii_api = KhaiiiApi::create(); 61 | 62 | void _check(string raw, string left, string right) { 63 | auto bfr = _khaiii_api->analyze(raw.c_str(), "{\"errpatch\": false}"); 64 | string bfr_str = _to_str(bfr); 65 | if (left != bfr_str) { 66 | _log->warn("error not found: '{}' => E:'{}' vs A:'{}'", raw, left, bfr_str); 67 | return; 68 | } 69 | auto aft = _khaiii_api->analyze(raw.c_str(), "{\"errpatch\": true}"); 70 | EXPECT_STREQ(right.c_str(), _to_str(aft).c_str()); 71 | } 72 | 73 | string _to_str(const khaiii_word_t* results) { 74 | ostringstream oss; 75 | for (auto word = results; word != nullptr; word = word->next) { 76 | if (word != results) oss << " + _ + "; 77 | const khaiii_morph_t* morphs = word->morphs; 78 | for (auto morph = morphs; morph != nullptr; morph = morph->next) { 79 | if (morph != morphs) oss << " + "; 80 | oss << morph->lex << "/" << morph->tag; 81 | } 82 | } 83 | return oss.str(); 84 | } 85 | }; 86 | 87 | 88 | shared_ptr ErrPatchTest::_log = spdlog::stderr_color_mt("ErrPatchTest"); 89 | 90 | 91 | //////////////// 92 | // test cases // 93 | //////////////// 94 | TEST_F(ErrPatchTest, apply) { 95 | // for base model 96 | _check("지저스크라이스트", "지저스크라이스/NNP + 트/NNG", "지저스/NNP + 크라이스트/NNP"); 97 | _check("지저스 크라이스트", "지저스/NNP + _ + 크라이스/NNP + 트/NNG", 98 | "지저스/NNP + _ + 크라이스트/NNP"); 99 | _check("고타마싯다르타", "고타마싯다르타/NNP", "고타마/NNP + 싯다르타/NNP"); 100 | _check("무함마드압둘라", "무함마드압/NNP + 둘/NR + 라/NNP", "무함마드/NNP + 압둘라/NNP"); 101 | } 102 | 103 | 104 | } // namespace khaiii 105 | -------------------------------------------------------------------------------- /src/test/cpp/khaiii/KhaiiiApiTest.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | #ifndef SRC_TEST_CPP_KHAIII_KHAIIIAPITEST_HPP_ 8 | #define SRC_TEST_CPP_KHAIII_KHAIIIAPITEST_HPP_ 9 | 10 | 11 | 12 | ////////////// 13 | // includes // 14 | ////////////// 15 | #include 16 | 17 | #include "gtest/gtest.h" 18 | 19 | #include "khaiii/khaiii_api.h" 20 | 21 | 22 | ////////////////// 23 | // test fixture // 24 | ////////////////// 25 | class KhaiiiApiTest: public testing::Test { 26 | public: 27 | virtual void SetUp(); ///< set up 28 | virtual void TearDown(); ///< tear down 29 | 30 | protected: 31 | int _handle = -1; ///< 핸들 32 | 33 | /** 34 | * 어절의 분석 결과를 비교하기위한 함수 (포지션 정보 포함) 35 | * @param expected 기대하는 결과 문자열. 예: "[1:7]\t안녕/IC[1:6] + ?/SF[7:1]" 36 | * @param actual 실제 어절 결과 37 | */ 38 | void _expect_eq_word(std::string expected, const khaiii_word_t& actual) const; 39 | 40 | /** 41 | * 어절의 분석 결과 중 형태소 부분만을 비교하기 위한 함수 42 | * @param expected 기대하는 결과 문자열. 예: "안녕/IC + ?/SF" 43 | * @param actual 실제 어절 결과 44 | */ 45 | void _expect_eq_morphs(std::string expected, const khaiii_word_t& actual) const; 46 | }; 47 | 48 | 49 | #endif // SRC_TEST_CPP_KHAIII_KHAIIIAPITEST_HPP_ 50 | -------------------------------------------------------------------------------- /src/test/cpp/khaiii/KhaiiiDevTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | 8 | ////////////// 9 | // includes // 10 | ////////////// 11 | #include 12 | 13 | #include "khaiii/khaiii_dev.h" 14 | 15 | #include "khaiii/ErrPatch.hpp" 16 | #include "khaiii/KhaiiiApiTest.hpp" 17 | 18 | 19 | using std::array; 20 | using std::string; 21 | 22 | 23 | ////////////////// 24 | // test fixture // 25 | ////////////////// 26 | class KhaiiiDevTest: public KhaiiiApiTest {}; 27 | 28 | 29 | //////////////// 30 | // test cases // 31 | //////////////// 32 | TEST_F(KhaiiiDevTest, analyze_bfr_errorpatch) { 33 | array output; 34 | EXPECT_EQ(13, khaiii_analyze_bfr_errpatch(_handle, u8"진정한 테스트입니다.", "", &output[0])); 35 | EXPECT_EQ(khaiii::ErrPatch::SENT_DELIM_NUM, output[0]); // bos/eos 36 | EXPECT_EQ(khaiii::ErrPatch::WORD_DELIM_NUM, output[4]); // bow/eow 37 | EXPECT_EQ(khaiii::ErrPatch::SENT_DELIM_NUM, output[12]); // bos/eos 38 | 39 | EXPECT_GT(0, khaiii_analyze_bfr_errpatch(-1, u8"", "", &output[0])); // invalid handle 40 | EXPECT_GT(0, khaiii_analyze_bfr_errpatch(_handle, nullptr, "", &output[0])); // null input 41 | EXPECT_GT(0, khaiii_analyze_bfr_errpatch(_handle, u8"", "", nullptr)); // null output 42 | } 43 | 44 | 45 | TEST_F(KhaiiiDevTest, set_log_level) { 46 | EXPECT_EQ(0, khaiii_set_log_level("all", "trace")); 47 | EXPECT_EQ(0, khaiii_set_log_level("all", "debug")); 48 | EXPECT_EQ(0, khaiii_set_log_level("all", "info")); 49 | EXPECT_EQ(0, khaiii_set_log_level("all", "warn")); 50 | EXPECT_EQ(0, khaiii_set_log_level("all", "err")); 51 | EXPECT_EQ(0, khaiii_set_log_level("all", "critical")); 52 | 53 | EXPECT_GT(0, khaiii_set_log_level(nullptr, "debug")); // null logger 54 | EXPECT_GT(0, khaiii_set_log_level("", "debug")); // zero string logger 55 | EXPECT_GT(0, khaiii_set_log_level("__invalid_logger__", "debug")); 56 | EXPECT_GT(0, khaiii_set_log_level("Tagger", nullptr)); // null level 57 | EXPECT_GT(0, khaiii_set_log_level("Tagger", "")); // zero string level 58 | EXPECT_GT(0, khaiii_set_log_level("Tagger", "__invalid_level__")); 59 | } 60 | 61 | 62 | TEST_F(KhaiiiDevTest, set_log_levels) { 63 | EXPECT_EQ(0, khaiii_set_log_levels("all:warn,Tagger:info")); 64 | EXPECT_EQ(0, khaiii_set_log_levels("")); // zero name/level pair 65 | 66 | EXPECT_GT(0, khaiii_set_log_levels(nullptr)); // null name/level pair 67 | EXPECT_GT(0, khaiii_set_log_levels("all,Tagger:info")); // invalid format 68 | } 69 | -------------------------------------------------------------------------------- /src/test/cpp/khaiii/PreanalTest.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2018-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | 8 | ////////////// 9 | // includes // 10 | ////////////// 11 | #include 12 | #include 13 | 14 | #include "cxxopts.hpp" 15 | #include "gtest/gtest.h" 16 | 17 | #include "khaiii/Preanal.hpp" 18 | #include "khaiii/Word.hpp" 19 | 20 | 21 | /////////////// 22 | // variables // 23 | /////////////// 24 | extern cxxopts::ParseResult *prog_args; // arguments passed to main program 25 | 26 | 27 | namespace khaiii { 28 | 29 | 30 | using std::make_shared; 31 | using std::shared_ptr; 32 | using std::string; 33 | using std::wstring; 34 | 35 | 36 | ////////////////// 37 | // test fixture // 38 | ////////////////// 39 | class PreanalTest: public testing::Test { 40 | public: 41 | virtual void SetUp() { 42 | std::string rsc_dir = (*prog_args)["rsc-dir"].as(); 43 | ASSERT_NO_THROW(_preanal.open(rsc_dir)); 44 | } 45 | 46 | virtual void TearDown() { 47 | ASSERT_NO_THROW(_preanal.close()); 48 | } 49 | 50 | protected: 51 | Preanal _preanal; 52 | 53 | inline shared_ptr _apply(wstring raw) { 54 | auto word = make_shared(raw.c_str(), raw.length()); 55 | _preanal.apply(word); 56 | return word; 57 | } 58 | }; 59 | 60 | 61 | //////////////// 62 | // test cases // 63 | //////////////// 64 | TEST_F(PreanalTest, apply_exact) { 65 | // 어절 완전일치 엔트리 "이더리움"에 대해 66 | 67 | auto word1 = _apply(L"이더리움"); // 매칭 68 | EXPECT_LT(0, word1->char_tags[0]); 69 | EXPECT_LT(0, word1->char_tags[1]); 70 | EXPECT_LT(0, word1->char_tags[2]); 71 | EXPECT_LT(0, word1->char_tags[3]); 72 | 73 | auto word2 = _apply(L"이더리움을"); // 매칭 안됨 74 | EXPECT_EQ(0, word2->char_tags[0]); 75 | EXPECT_EQ(0, word2->char_tags[1]); 76 | EXPECT_EQ(0, word2->char_tags[2]); 77 | EXPECT_EQ(0, word2->char_tags[3]); 78 | EXPECT_EQ(0, word2->char_tags[4]); 79 | 80 | auto word3 = _apply(L"이더륨"); // 매칭 안됨 81 | EXPECT_EQ(0, word3->char_tags[0]); 82 | EXPECT_EQ(0, word3->char_tags[1]); 83 | EXPECT_EQ(0, word3->char_tags[2]); 84 | 85 | EXPECT_NO_THROW(_apply(L"")); 86 | } 87 | 88 | 89 | TEST_F(PreanalTest, apply_prefix) { 90 | // 전망매칭 패턴 "가즈아*"에 대해 91 | 92 | auto word1 = _apply(L"가즈아~"); // 매칭 93 | EXPECT_LT(0, word1->char_tags[0]); 94 | EXPECT_LT(0, word1->char_tags[1]); 95 | EXPECT_LT(0, word1->char_tags[2]); 96 | EXPECT_EQ(0, word1->char_tags[3]); 97 | 98 | auto word2 = _apply(L"가즈아"); // 매칭 99 | EXPECT_LT(0, word2->char_tags[0]); 100 | EXPECT_LT(0, word2->char_tags[1]); 101 | EXPECT_LT(0, word2->char_tags[2]); 102 | 103 | auto word3 = _apply(L"가자"); // 매칭 안됨 104 | EXPECT_EQ(0, word3->char_tags[0]); 105 | EXPECT_EQ(0, word3->char_tags[1]); 106 | } 107 | 108 | 109 | } // namespace khaiii 110 | -------------------------------------------------------------------------------- /src/test/cpp/test_main.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jamie (jamie.lim@kakaocorp.com) 3 | * @copyright Copyright (C) 2017-, Kakao Corp. All rights reserved. 4 | */ 5 | 6 | 7 | ////////////// 8 | // includes // 9 | ////////////// 10 | #include 11 | 12 | #include "cxxopts.hpp" 13 | #include "fmt/printf.h" 14 | #ifdef PROFILER 15 | #include "gperftools/profiler.h" 16 | #endif 17 | #include "gtest/gtest.h" 18 | #include "spdlog/spdlog.h" 19 | 20 | #include "khaiii/khaiii_dev.h" 21 | 22 | 23 | using std::cerr; 24 | using std::string; 25 | 26 | 27 | /////////////// 28 | // variables // 29 | /////////////// 30 | // global variable for program arguments 31 | cxxopts::ParseResult* prog_args; 32 | 33 | 34 | ////////// 35 | // main // 36 | ////////// 37 | int main(int argc, char** argv) { 38 | cxxopts::Options options(argv[0], argv[0]); 39 | testing::InitGoogleTest(&argc, argv); 40 | auto _log = spdlog::stderr_color_mt("console"); 41 | spdlog::set_level(spdlog::level::warn); 42 | 43 | options.add_options() 44 | ("h,help", "print this help") 45 | ("rsc-dir", "resource directory", cxxopts::value()->default_value("./share/khaiii")) 46 | ("set-log", "set log level", cxxopts::value()->default_value("all:warn")); 47 | auto args = options.parse(argc, argv); 48 | 49 | if (args.count("help")) { 50 | fmt::fprintf(cerr, "%s\n", options.help()); 51 | return 0; 52 | } 53 | prog_args = &args; 54 | khaiii_set_log_levels(args["set-log"].as().c_str()); 55 | 56 | #ifdef PROFILER 57 | ProfilerStart("/tmp/test_khaiii.prof"); 58 | #endif 59 | 60 | int ret = RUN_ALL_TESTS(); 61 | 62 | #ifdef PROFILER 63 | ProfilerStop(); 64 | #endif 65 | 66 | return ret; 67 | } 68 | -------------------------------------------------------------------------------- /src/test/python/test_khaiii/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/test/python/test_khaiii/__init__.py -------------------------------------------------------------------------------- /src/test/python/test_khaiii/test_khaiii.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | khaiii tests 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | import unittest 16 | 17 | import khaiii # pylint: disable=import-error 18 | from khaiii import KhaiiiExcept # pylint: disable=import-error 19 | 20 | 21 | ######### 22 | # tests # 23 | ######### 24 | class TestKhaiii(unittest.TestCase): 25 | """ 26 | khaiii tests 27 | """ 28 | def setUp(self): 29 | self._api = khaiii.KhaiiiApi() 30 | self._api.set_log_level('all', 'warn') 31 | 32 | def tearDown(self): 33 | self._api.close() 34 | 35 | def test_version(self): 36 | """ 37 | test version() api 38 | """ 39 | self.assertRegex(self._api.version(), r'^\d+\.\d+(\.\d+)?$') 40 | 41 | def test_open(self): 42 | """ 43 | test open() api 44 | """ 45 | try: 46 | self._api.open() 47 | except KhaiiiExcept as khaiii_exc: 48 | self.fail(khaiii_exc) 49 | with self.assertRaises(KhaiiiExcept): 50 | self._api.open('/not/existing/dir') 51 | with self.assertRaises(KhaiiiExcept): 52 | self._api.open('', 'invalid option') 53 | 54 | def test_analyze(self): 55 | """ 56 | test analyze() api 57 | """ 58 | try: 59 | words = self._api.analyze('안녕? 반가워!') 60 | self.assertEqual(len(words), 2) 61 | self.assertEqual(len(words[0].morphs), 2) 62 | self.assertEqual(words[0].morphs[0].lex, '안녕') 63 | self.assertEqual(words[0].morphs[0].tag, 'IC') 64 | self.assertEqual(words[0].morphs[1].lex, '?') 65 | self.assertEqual(words[0].morphs[1].tag, 'SF') 66 | self.assertEqual(len(words[1].morphs), 3) 67 | self.assertEqual(words[1].morphs[0].lex, '반갑') 68 | self.assertEqual(words[1].morphs[0].tag, 'VA') 69 | self.assertEqual(words[1].morphs[1].lex, '어') 70 | self.assertEqual(words[1].morphs[1].tag, 'EF') 71 | self.assertEqual(words[1].morphs[2].lex, '!') 72 | self.assertEqual(words[1].morphs[2].tag, 'SF') 73 | except KhaiiiExcept as khaiii_exc: 74 | self.fail(khaiii_exc) 75 | 76 | def test_analyze_bfr_errpatch(self): 77 | """ 78 | test analyze_bfr_errpatch() api 79 | """ 80 | try: 81 | results = self._api.analyze_bfr_errpatch('테스트') 82 | self.assertEqual(len(results), len('테스트') + 2) 83 | except KhaiiiExcept as khaiii_exc: 84 | self.fail(khaiii_exc) 85 | 86 | def test_set_log_level(self): 87 | """ 88 | test set_log_level() api 89 | """ 90 | try: 91 | self._api.set_log_level('all', 'info') 92 | except KhaiiiExcept as khaiii_exc: 93 | self.fail(khaiii_exc) 94 | with self.assertRaises(KhaiiiExcept): 95 | self._api.set_log_level('all', 'not_existing_level') 96 | 97 | 98 | ######## 99 | # main # 100 | ######## 101 | if __name__ == '__main__': 102 | unittest.main() 103 | -------------------------------------------------------------------------------- /train/eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | khaiii 출력 형태의 두 파일을 읽어들여 f-score를 측정 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | from collections import Counter 17 | import logging 18 | import sys 19 | from typing import Iterator, Set, Tuple 20 | 21 | 22 | ############# 23 | # functions # 24 | ############# 25 | def _load(path: str) -> Iterator[Tuple[str, str]]: 26 | """ 27 | 파일을 읽어들여 (어절, 형태소)를 리턴하는 제너레이터 28 | Args: 29 | path: file path 30 | Yields: 31 | word 32 | morphs 33 | """ 34 | for line in open(path, 'r', encoding='UTF-8'): 35 | line = line.rstrip('\r\n') 36 | if not line: 37 | yield '', '' 38 | continue 39 | word, morphs = line.split('\t') 40 | yield word, morphs 41 | 42 | 43 | def _morphs_to_set(morphs: str) -> Set[Tuple[str, int]]: 44 | """ 45 | make set from morpheme string 46 | Args: 47 | morphs: morpheme string 48 | Returns: 49 | morphemes set 50 | """ 51 | morph_cnt = Counter([m for m in morphs.split(' + ')]) 52 | morph_set = set() 53 | for morph, freq in morph_cnt.items(): 54 | if freq == 1: 55 | morph_set.add(morph) 56 | else: 57 | morph_set.update([(morph, i) for i in range(freq)]) 58 | return morph_set 59 | 60 | 61 | def _count(cnt: Counter, gold: str, pred: str): 62 | """ 63 | count gold and pred morphemes 64 | Args: 65 | cnt: Counter object 66 | gold: gold standard morphemes 67 | pred: prediction morphemes 68 | """ 69 | gold_set = _morphs_to_set(gold) 70 | pred_set = _morphs_to_set(pred) 71 | cnt['gold'] += len(gold_set) 72 | cnt['pred'] += len(pred_set) 73 | cnt['match'] += len(gold_set & pred_set) 74 | 75 | 76 | def _report(cnt: Counter): 77 | """ 78 | report metric 79 | Args: 80 | cnt: Counter object 81 | """ 82 | precision = 100 * cnt['match'] / cnt['pred'] 83 | recall = 100 * cnt['match'] / cnt['gold'] 84 | f_score = 2 * precision * recall / (precision + recall) 85 | print(f'precision: {precision:.2f}') 86 | print(f'recall: {recall:.2f}') 87 | print(f'f-score: {f_score:.2f}') 88 | 89 | 90 | def run(args: Namespace): 91 | """ 92 | run function which is the start point of program 93 | Args: 94 | args: program arguments 95 | """ 96 | cnt = Counter() 97 | for line_num, (gold, pred) in enumerate(zip(_load(args.gold), _load(args.pred)), start=1): 98 | word_gold, morphs_gold = gold 99 | word_pred, morphs_pred = pred 100 | if word_gold != word_pred: 101 | raise ValueError(f'invalid align at {line_num}: {word_gold} vs {word_pred}') 102 | if not word_gold or not word_pred: 103 | continue 104 | _count(cnt, morphs_gold, morphs_pred) 105 | _report(cnt) 106 | 107 | 108 | ######## 109 | # main # 110 | ######## 111 | def main(): 112 | """ 113 | main function processes only argument parsing 114 | """ 115 | parser = ArgumentParser(description='command line part-of-speech tagger demo') 116 | parser.add_argument('-g', '--gold', help='gold standard file', metavar='FILE', required=True) 117 | parser.add_argument('-p', '--pred', help='prediction file', metavar='FILE', required=True) 118 | parser.add_argument('--output', help='output file ', metavar='FILE') 119 | parser.add_argument('--debug', help='enable debug', action='store_true') 120 | args = parser.parse_args() 121 | 122 | if args.output: 123 | sys.stdout = open(args.output, 'w', encoding='UTF-8') 124 | if args.debug: 125 | logging.basicConfig(level=logging.DEBUG) 126 | else: 127 | logging.basicConfig(level=logging.INFO) 128 | 129 | run(args) 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /train/hd_validate_errpatch.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -u 3 | 4 | 5 | ############# 6 | # functions # 7 | ############# 8 | function print_usage() { 9 | local msg=$1 10 | echo "Usage: $(basename $0) [options]" 11 | echo "Options:" 12 | echo " -h, --help show this help message and exit" 13 | echo " -i FILE input file" 14 | echo " -c DIR corpus dir" 15 | echo " --rsc-src=DIR " 16 | echo " --lib-path=FILE " 17 | echo " --rsc-dir=DIR " 18 | echo " --num-mapper=NUM " 19 | if [ -z "${msg}" ]; then 20 | exit 0 21 | else 22 | echo 23 | echo "${msg}" 24 | exit 1 25 | fi 26 | } 27 | 28 | 29 | function abspath() { 30 | python3 -c "import os, sys; print(os.path.abspath(sys.argv[1]))" $1 31 | } 32 | 33 | 34 | function parse_args() { 35 | INPUT_FILE="" 36 | CORPUS_DIR="" 37 | LIB_PATH="" 38 | RSC_DIR="" 39 | RSC_SRC="" 40 | NUM_MAPPER="" 41 | 42 | while [[ $# -ge 1 ]]; do 43 | case $1 in 44 | -h|--help) 45 | print_usage "" 46 | ;; 47 | -i) 48 | INPUT_FILE="$2" 49 | shift 50 | ;; 51 | -c) 52 | CORPUS_DIR="$2" 53 | shift 54 | ;; 55 | --rsc-src) 56 | RSC_SRC="$2" 57 | shift 58 | ;; 59 | --lib-path) 60 | LIB_PATH="$2" 61 | shift 62 | ;; 63 | --rsc-dir) 64 | RSC_DIR="$2" 65 | shift 66 | ;; 67 | --num-mapper) 68 | NUM_MAPPER="$2" 69 | shift 70 | ;; 71 | --) break ;; 72 | esac 73 | shift 74 | done 75 | 76 | # input file 검사 77 | if [ -z "${INPUT_FILE}" ]; then 78 | print_usage "no input file" 79 | fi 80 | 81 | # corpus dir 검사 82 | if [ -z "${CORPUS_DIR}" ]; then 83 | print_usage "no corpus dir" 84 | fi 85 | 86 | if [ -z "${RSC_SRC}" ]; then 87 | RSC_SRC=../rsc/src 88 | fi 89 | if [ -z "${LIB_PATH}" ]; then 90 | LIB_PATH=../build/lib/libkhaiii.so 91 | fi 92 | if [ -z "${RSC_DIR}" ]; then 93 | RSC_DIR=../build/share/khaiii 94 | fi 95 | if [ -z "${NUM_MAPPER}" ]; then 96 | NUM_MAPPER=1000 97 | fi 98 | 99 | INPUT_FILE=$(abspath ${INPUT_FILE}) 100 | LIB_PATH=$(abspath ${LIB_PATH}) 101 | RSC_DIR=$(abspath ${RSC_DIR}) 102 | RSC_SRC=$(abspath ${RSC_SRC}) 103 | CORPUS_DIR=$(abspath ${CORPUS_DIR}) 104 | } 105 | 106 | 107 | function init_envs() { 108 | # global variables 109 | INPUT_DIR=errpatch.in 110 | OUTPUT_DIR=errpatch.out 111 | CACHE_DIR=errpatch.cache 112 | } 113 | 114 | 115 | function split_input() { 116 | >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{" 117 | 118 | local total_line 119 | total_line=$(wc -l < ${INPUT_FILE}) 120 | local line_per_split=$((total_line / NUM_MAPPER)) 121 | rm -rf ${INPUT_DIR} 122 | mkdir -p ${INPUT_DIR} 123 | shuf ${INPUT_FILE} | split -d -a 5 -l ${line_per_split} - ${INPUT_DIR}/part- 124 | 125 | hadoop fs -test -e ${INPUT_DIR} && hadoop fs -rm -skipTrash -r ${INPUT_DIR} 126 | hadoop fs -put ${INPUT_DIR} 127 | 128 | >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}" 129 | } 130 | 131 | 132 | function cache_files() { 133 | >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{" 134 | 135 | hadoop fs -test -e ${CACHE_DIR} && hadoop fs -rm -skipTrash -r ${CACHE_DIR} 136 | hadoop fs -mkdir -p ${CACHE_DIR} 137 | 138 | hadoop fs -put ../src/main/python/khaiii ${CACHE_DIR} 139 | hadoop fs -mkdir -p ${CACHE_DIR}/khaiii/lib 140 | hadoop fs -put ${LIB_PATH} ${CACHE_DIR}/khaiii/lib 141 | 142 | hadoop fs -mkdir -p ${CACHE_DIR}/khaiii/share 143 | hadoop fs -put ${RSC_DIR} ${CACHE_DIR}/khaiii/share/khaiii 144 | 145 | hadoop fs -put ${RSC_SRC} ${CACHE_DIR}/rsc_src 146 | 147 | hadoop fs -mkdir -p ${CACHE_DIR}/corpus 148 | hadoop fs -put ${CORPUS_DIR}/*.txt ${CACHE_DIR}/corpus 149 | 150 | >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}" 151 | } 152 | 153 | 154 | function run_hadoop() { 155 | >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{" 156 | 157 | hadoop fs -test -e ${OUTPUT_DIR} && hadoop fs -rm -skipTrash -r ${OUTPUT_DIR} 158 | yarn jar ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-*.jar \ 159 | -D mapred.job.name=validate_errpatch \ 160 | -D mapred.reduce.tasks=0 \ 161 | -cmdenv PYTHONPATH="./${CACHE_DIR}" \ 162 | -file ./validate_errpatch.py \ 163 | -input "${INPUT_DIR}" \ 164 | -output "${OUTPUT_DIR}" \ 165 | -cacheFile "${CACHE_DIR}#${CACHE_DIR}" \ 166 | -mapper "./validate_errpatch.py -c ./${CACHE_DIR}/corpus --rsc-src ./${CACHE_DIR}/rsc_src" 167 | 168 | hadoop fs -text ${OUTPUT_DIR}/part-* > "$(dirname ${INPUT_FILE})/errpatch.valid" 169 | 170 | >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}" 171 | } 172 | 173 | 174 | function del_temp() { 175 | >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{" 176 | 177 | hadoop fs -rm -skipTrash -r ${INPUT_DIR} ${OUTPUT_DIR} ${CACHE_DIR} 178 | rm -rf ${INPUT_DIR} 179 | 180 | >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}" 181 | } 182 | 183 | 184 | ######## 185 | # main # 186 | ######## 187 | parse_args $@ 188 | cd "$(dirname $0)" 189 | init_envs 190 | 191 | split_input 192 | cache_files 193 | run_hadoop 194 | del_temp 195 | -------------------------------------------------------------------------------- /train/make_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | 입력(음절) 및 출력(태그) vocabulary를 생성한다. 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | from collections import Counter 17 | import logging 18 | import os 19 | import sys 20 | from typing import TextIO 21 | 22 | from khaiii.resource.morphs import TAGS 23 | 24 | 25 | ############# 26 | # functions # 27 | ############# 28 | def _print(cnt: Counter, fout: TextIO, is_with_freq: bool = True): 29 | """ 30 | vocabulary 사전을 출력한다. 31 | Args: 32 | cnt: Counter object 33 | fout: 출력 파일 34 | is_with_freq: 빈도를 함께 출력할 지 여부 35 | """ 36 | for char, freq in sorted(cnt.items(), key=lambda x: x[0]): 37 | if is_with_freq and freq < 2: 38 | continue 39 | if is_with_freq: 40 | print('{}\t{}'.format(char, freq), file=fout) 41 | else: 42 | print(char, file=fout) 43 | 44 | 45 | def run(args: Namespace): 46 | """ 47 | run function which is the start point of program 48 | Args: 49 | args: program arguments 50 | """ 51 | in_cnt = Counter() 52 | out_cnt = Counter() 53 | for line_num, line in enumerate(sys.stdin, start=1): 54 | if line_num % 1000000 == 0: 55 | logging.info('%dm-th line', line_num // 1000000) 56 | line = line.rstrip('\r\n') 57 | if not line: 58 | continue 59 | raw, tagged = line.split('\t') 60 | in_cnt.update(list(raw)) 61 | out_cnt.update([tag for tag in tagged.split() if tag[2:] not in TAGS]) 62 | os.makedirs(args.rsc_src, exist_ok=True) 63 | with open('{}/vocab.in'.format(args.rsc_src), 'w', encoding='UTF-8') as fout: 64 | _print(in_cnt, fout) 65 | with open('{}/vocab.out'.format(args.rsc_src), 'w', encoding='UTF-8') as fout: 66 | print('\n'.join(['B-{}'.format(tag) for tag in TAGS]), file=fout) 67 | print('\n'.join(['I-{}'.format(tag) for tag in TAGS]), file=fout) 68 | _print(out_cnt, fout, is_with_freq=False) 69 | 70 | 71 | ######## 72 | # main # 73 | ######## 74 | def main(): 75 | """ 76 | main function processes only argument parsing 77 | """ 78 | parser = ArgumentParser(description='입력(음절) 및 출력(태그) vocabulary를 생성한다.') 79 | parser.add_argument('--rsc-src', help='resource source dir ', 80 | metavar='DIR', default='../rsc/src') 81 | parser.add_argument('--input', help='input file ', metavar='FILE') 82 | parser.add_argument('--debug', help='enable debug', action='store_true') 83 | args = parser.parse_args() 84 | 85 | if args.input: 86 | sys.stdin = open(args.input, 'r', encoding='UTF-8') 87 | if args.debug: 88 | logging.basicConfig(level=logging.DEBUG) 89 | else: 90 | logging.basicConfig(level=logging.INFO) 91 | 92 | run(args) 93 | 94 | 95 | if __name__ == '__main__': 96 | main() 97 | -------------------------------------------------------------------------------- /train/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | torch==0.4.1 3 | tqdm 4 | -------------------------------------------------------------------------------- /train/split_corpus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | 코퍼스를 train/dev/test로 분할한다. 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | import logging 17 | import random 18 | import sys 19 | from typing import Iterator, List, TextIO 20 | 21 | 22 | ############# 23 | # functions # 24 | ############# 25 | def _sents(fin: TextIO) -> Iterator[List[str]]: 26 | """ 27 | read from file and yield a sentence (generator) 28 | Args: 29 | fin: input file 30 | Yields: 31 | sentence (list of lines) 32 | """ 33 | sent = [] 34 | for line in fin: 35 | line = line.rstrip('\r\n') 36 | if not line: 37 | if sent: 38 | yield sent 39 | sent = [] 40 | continue 41 | sent.append(line) 42 | if sent: 43 | yield sent 44 | 45 | 46 | def _write_to_file(path: str, sents: List[List[str]]): 47 | """ 48 | 파일에 쓴다. 49 | Args: 50 | path: path 51 | sents: sentences 52 | """ 53 | with open(path, 'w', encoding='UTF-8') as fout: 54 | for sent in sents: 55 | print('\n'.join(sent), file=fout) 56 | print(file=fout) 57 | 58 | 59 | def run(args: Namespace): 60 | """ 61 | run function which is the start point of program 62 | Args: 63 | args: program arguments 64 | """ 65 | sents = [] 66 | for num, sent in enumerate(_sents(sys.stdin), start=1): 67 | if num % 100000 == 0: 68 | logging.info('%d00k-th sent..', num // 100000) 69 | sents.append(sent) 70 | random.shuffle(sents) 71 | _write_to_file('{}.dev'.format(args.out_pfx), sents[:args.dev]) 72 | _write_to_file('{}.test'.format(args.out_pfx), sents[args.dev:args.dev+args.test]) 73 | _write_to_file('{}.train'.format(args.out_pfx), sents[args.dev+args.test:]) 74 | logging.info('dev / test / train: %d / %d / %d', args.dev, args.test, 75 | len(sents[args.dev+args.test:])) 76 | 77 | 78 | ######## 79 | # main # 80 | ######## 81 | def main(): 82 | """ 83 | main function processes only argument parsing 84 | """ 85 | parser = ArgumentParser(description='코퍼스를 train/dev/test로 분할한다.') 86 | parser.add_argument('-o', '--out-pfx', help='output file prefix', metavar='NAME', required=True) 87 | parser.add_argument('--input', help='input file ', metavar='FILE') 88 | parser.add_argument('--dev', help='number of sentence in dev set', metavar='NUM', type=int, 89 | default=5000) 90 | parser.add_argument('--test', help='number of sentence in test set', metavar='NUM', type=int, 91 | default=5000) 92 | parser.add_argument('--debug', help='enable debug', action='store_true') 93 | args = parser.parse_args() 94 | 95 | if args.input: 96 | sys.stdin = open(args.input, 'r', encoding='UTF-8') 97 | if args.debug: 98 | logging.basicConfig(level=logging.DEBUG) 99 | else: 100 | logging.basicConfig(level=logging.INFO) 101 | 102 | run(args) 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /train/tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | command line part-of-speech tagger demo 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | import logging 17 | import sys 18 | 19 | from khaiii.train.tagger import PosTagger 20 | 21 | 22 | ############# 23 | # functions # 24 | ############# 25 | def run(args: Namespace): 26 | """ 27 | run function which is the start point of program 28 | Args: 29 | args: program arguments 30 | """ 31 | tgr = PosTagger(args.model_dir, args.gpu_num) 32 | for line_num, line in enumerate(sys.stdin, start=1): 33 | if line_num % 100000 == 0: 34 | logging.info('%d00k-th line..', (line_num // 100000)) 35 | line = line.rstrip('\r\n') 36 | if not line: 37 | print() 38 | continue 39 | pos_sent = tgr.tag_raw(line) 40 | for pos_word in pos_sent.pos_tagged_words: 41 | print(pos_word.raw, end='\t') 42 | print(' + '.join([str(m) for m in pos_word.pos_tagged_morphs])) 43 | print() 44 | 45 | 46 | ######## 47 | # main # 48 | ######## 49 | def main(): 50 | """ 51 | main function processes only argument parsing 52 | """ 53 | parser = ArgumentParser(description='command line part-of-speech tagger demo') 54 | parser.add_argument('-m', '--model-dir', help='model dir', metavar='DIR', required=True) 55 | parser.add_argument('--input', help='input file ', metavar='FILE') 56 | parser.add_argument('--output', help='output file ', metavar='FILE') 57 | parser.add_argument('--gpu-num', help='GPU number to use ', metavar='INT', 58 | type=int, default=-1) 59 | parser.add_argument('--debug', help='enable debug', action='store_true') 60 | args = parser.parse_args() 61 | 62 | if args.input: 63 | sys.stdin = open(args.input, 'r', encoding='UTF-8') 64 | if args.output: 65 | sys.stdout = open(args.output, 'w', encoding='UTF-8') 66 | if args.debug: 67 | logging.basicConfig(level=logging.DEBUG) 68 | else: 69 | logging.basicConfig(level=logging.INFO) 70 | 71 | run(args) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /train/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | train part-of-speech model from data set 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)' 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.' 9 | """ 10 | 11 | 12 | ########### 13 | # imports # 14 | ########### 15 | from argparse import ArgumentParser, Namespace 16 | import logging 17 | 18 | from khaiii.train.trainer import Trainer 19 | 20 | 21 | ############# 22 | # functions # 23 | ############# 24 | def run(args: Namespace): 25 | """ 26 | run function which is the start point of program 27 | Args: 28 | args: program arguments (config) 29 | """ 30 | Trainer(args).train() 31 | 32 | 33 | ######## 34 | # main # 35 | ######## 36 | def main(): 37 | """ 38 | main function processes only argument parsing 39 | """ 40 | parser = ArgumentParser(description='train model from data') 41 | parser.add_argument('-i', '--in-pfx', help='input data path prefix', metavar='NAME', 42 | required=True) 43 | parser.add_argument('--rsc-src', help='resource source dir ', 44 | metavar='DIR', default='../rsc/src') 45 | parser.add_argument('--logdir', help='tensorboard log dir ', metavar='DIR', 46 | default='./logdir') 47 | parser.add_argument('--window', help='left/right character window length ', 48 | metavar='INT', type=int, default=4) 49 | parser.add_argument('--spc-dropout', help='space(word delimiter) dropout rate ', 50 | metavar='REAL', type=float, default=0.1) 51 | parser.add_argument('--cutoff', help='cutoff ', metavar='INT', type=int, default=1) 52 | parser.add_argument('--embed-dim', help='embedding dimension ', metavar='INT', 53 | type=int, default=35) 54 | parser.add_argument('--learning-rate', help='learning rate ', metavar='REAL', 55 | type=float, default=0.001) 56 | parser.add_argument('--lr-decay', help='learning rate decay ', metavar='REAL', 57 | type=float, default=0.9) 58 | parser.add_argument('--batch-size', help='batch size ', metavar='INT', type=int, 59 | default=500) 60 | parser.add_argument('--patience', help='maximum patience count to revert model ', 61 | metavar='INT', type=int, default=10) 62 | parser.add_argument('--gpu-num', help='GPU number to use ', metavar='INT', 63 | type=int, default=-1) 64 | parser.add_argument('--debug', help='enable debug', action='store_true') 65 | args = parser.parse_args() 66 | 67 | if args.debug: 68 | logging.basicConfig(level=logging.DEBUG) 69 | else: 70 | logging.basicConfig(level=logging.INFO) 71 | 72 | run(args) 73 | 74 | 75 | if __name__ == '__main__': 76 | main() 77 | --------------------------------------------------------------------------------