├── .github
    ├── doc
    │   ├── khaiii_for_space_error.pptx
    │   └── network.pptx
    ├── img
    │   ├── multi-task-learning.png
    │   ├── network.png
    │   ├── pull-request-to-develop.png
    │   └── win_emb_f.png
    └── pull_request_template.md
├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── NOTICE.md
├── README.md
├── cmake
    ├── CodeCoverage.cmake
    ├── FindGperftools.cmake
    ├── FusedMultiplyAdd.cmake
    ├── Hunter
    │   └── config.cmake
    └── HunterGate.cmake
├── docker
    └── Dockerfile
├── include
    └── khaiii
    │   ├── KhaiiiApi.hpp
    │   ├── khaiii_api.h
    │   └── khaiii_dev.h
├── munjong
    ├── apply_patch.py
    ├── convert_jamo_to_compat.py
    ├── detect_sejong_period_error.py
    ├── fix_final_symbol_error.py
    ├── make_patch.py
    ├── recover_english_case.py
    ├── recover_raw_morph_mismatch.py
    ├── recover_wide_quotation.py
    └── remove_sejong_period_error.py
├── requirements.txt
├── rsc
    ├── Makefile
    ├── bin
    │   ├── compile_errpatch.py
    │   ├── compile_model.py
    │   ├── compile_preanal.py
    │   └── compile_restore.py
    └── src
    │   ├── base.config.json
    │   ├── base.errpatch.auto
    │   ├── base.errpatch.manual
    │   ├── base.model.pickle
    │   ├── char_align.map
    │   ├── large.config.json
    │   ├── large.errpatch.auto
    │   ├── large.errpatch.manual
    │   ├── large.model.pickle
    │   ├── preanal.auto
    │   ├── preanal.manual
    │   ├── restore.dic
    │   ├── vocab.in
    │   ├── vocab.out
    │   └── vocab.out.more
├── src
    ├── main
    │   ├── cpp
    │   │   ├── khaiii
    │   │   │   ├── Config.cpp
    │   │   │   ├── Config.hpp
    │   │   │   ├── Embed.cpp
    │   │   │   ├── Embed.hpp
    │   │   │   ├── ErrPatch.cpp
    │   │   │   ├── ErrPatch.hpp
    │   │   │   ├── KhaiiiImpl.cpp
    │   │   │   ├── KhaiiiImpl.hpp
    │   │   │   ├── MemMapFile.hpp
    │   │   │   ├── Morph.cpp
    │   │   │   ├── Morph.hpp
    │   │   │   ├── Preanal.cpp
    │   │   │   ├── Preanal.hpp
    │   │   │   ├── Resource.cpp
    │   │   │   ├── Resource.hpp
    │   │   │   ├── Restore.cpp
    │   │   │   ├── Restore.hpp
    │   │   │   ├── Sentence.cpp
    │   │   │   ├── Sentence.hpp
    │   │   │   ├── Tagger.cpp
    │   │   │   ├── Tagger.hpp
    │   │   │   ├── Trie.cpp
    │   │   │   ├── Trie.hpp
    │   │   │   ├── Word.cpp
    │   │   │   ├── Word.hpp
    │   │   │   ├── khaiii_api.cpp
    │   │   │   ├── khaiii_dev.cpp
    │   │   │   ├── nn
    │   │   │   │   ├── Conv1d.cpp
    │   │   │   │   ├── Conv1d.hpp
    │   │   │   │   ├── Linear.cpp
    │   │   │   │   ├── Linear.hpp
    │   │   │   │   ├── tensor.cpp
    │   │   │   │   └── tensor.hpp
    │   │   │   └── util.hpp
    │   │   └── main.cpp
    │   └── python
    │   │   ├── MANIFEST.in.in
    │   │   ├── khaiii
    │   │       ├── __init__.py
    │   │       ├── __init__.py.in
    │   │       ├── khaiii.py
    │   │       ├── munjong
    │   │       │   ├── __init__.py
    │   │       │   ├── libpatch.py
    │   │       │   └── sejong_corpus.py
    │   │       ├── resource
    │   │       │   ├── __init__.py
    │   │       │   ├── char_align.py
    │   │       │   ├── jaso.py
    │   │       │   ├── morphs.py
    │   │       │   ├── resource.py
    │   │       │   ├── trie.py
    │   │       │   └── vocabulary.py
    │   │       └── train
    │   │       │   ├── dataset.py
    │   │       │   ├── embedder.py
    │   │       │   ├── evaluator.py
    │   │       │   ├── models.py
    │   │       │   ├── sentence.py
    │   │       │   ├── tagger.py
    │   │       │   └── trainer.py
    │   │   └── setup.py.in
    └── test
    │   ├── cpp
    │       ├── khaiii
    │       │   ├── ErrPatchTest.cpp
    │       │   ├── KhaiiiApiTest.cpp
    │       │   ├── KhaiiiApiTest.hpp
    │       │   ├── KhaiiiDevTest.cpp
    │       │   └── PreanalTest.cpp
    │       └── test_main.cpp
    │   └── python
    │       └── test_khaiii
    │           ├── __init__.py
    │           └── test_khaiii.py
└── train
    ├── eval.py
    ├── extract_errpatch.py
    ├── extract_preanal.py
    ├── hd_validate_errpatch.bash
    ├── make_vocab.py
    ├── map_char_to_tag.py
    ├── pickle_model.py
    ├── requirements.txt
    ├── split_corpus.py
    ├── tag.py
    ├── train.py
    ├── transform_corpus.py
    └── validate_errpatch.py


/.github/doc/khaiii_for_space_error.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/doc/khaiii_for_space_error.pptx


--------------------------------------------------------------------------------
/.github/doc/network.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/doc/network.pptx


--------------------------------------------------------------------------------
/.github/img/multi-task-learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/multi-task-learning.png


--------------------------------------------------------------------------------
/.github/img/network.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/network.png


--------------------------------------------------------------------------------
/.github/img/pull-request-to-develop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/pull-request-to-develop.png


--------------------------------------------------------------------------------
/.github/img/win_emb_f.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/.github/img/win_emb_f.png


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | 설명 (Description)
 2 | ----
 3 | _이 문구를 지우고 여기에 내용을 적어주세요. (Remove this sentence and describe here.)_
 4 | 
 5 | ~~_겁내지 말아요, 저희는 한글을 사랑합니다._~~
 6 | 
 7 | 
 8 | 개발자를 위한 가이드 (Developer's Guide)
 9 | ----
10 | 만약 khaiii에 pull request가 처음이라면 [개발자를 위한 가이드](https://github.com/kakao/khaiii/wiki#%EA%B0%9C%EB%B0%9C%EC%9E%90%EB%A5%BC-%EC%9C%84%ED%95%9C-%EA%B0%80%EC%9D%B4%EB%93%9C) 문서들을 한번 읽어보시길 권고드립니다.
11 | 
12 | If this is your first pull request for khaiii, please see the [Developer's Guide](https://github.com/kakao/khaiii/wiki#%EA%B0%9C%EB%B0%9C%EC%9E%90%EB%A5%BC-%EC%9C%84%ED%95%9C-%EA%B0%80%EC%9D%B4%EB%93%9C).
13 | 
14 | 
15 | 체크 리스트 (Checklist)
16 | ----
17 | pull request 전에 아래 체크 리스트들을 만족하는 지 확인한 후 체크('x') 표시를 해주시기 바랍니다.
18 | 
19 | Before you submit a pull request, please go through the checklist below and mark each item you have completed with an 'x'.
20 | 
21 | - [ ] master 브랜치가 아니라 **develop** 브랜치에 머지하도록 pull request를 작성 중이신가요? (Does this pull request target the **develop** branch rather than master?)
22 | - [ ] `build/test/khaiii` 프로그램을 실행하여 **테스트**가 성공했나요? (Did all **tests** pass when you ran `build/test/khaiii`?)
23 | - [ ] **PyLint** 툴을 실행하여 발생한 에러를 모두 수정하셨나요? (Did you fix all errors after running **PyLint**?)
24 | - [ ] **CppLint** 툴을 실행하여 발생한 에러를 모두 수정하셨나요? (Did you fix all errors after running **CppLint**?)
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | __pycache__/
3 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## Submitting Pull Requests
2 | 
3 | When you send a pull request, please sign the individual [CLA](https://cla-assistant.io/kakao/khaiii) (Contributor License Agreement).  
4 | If you need a corporate Contributor License Agreement, please [contact us](mailto:oss@kakaocorp.com).
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | khaiii
  2 | ====
  3 | khaiii는 "Kakao Hangul Analyzer III"의 첫 글자들만 모아 만든 이름으로 카카오에서 개발한 세 번째 형태소분석기입니다. 두 번째 버전의 형태소분석기 이름인 dha2 (Daumkakao Hangul Analyzer 2)를 계승한 이름이기도 합니다.
  4 | 
  5 | 형태소는 언어학에서 일정한 의미가 있는 가장 작은 말의 단위로 발화체 내에서 따로 떼어낼 수 있는 것을 말합니다. 즉, 더 분석하면 뜻이 없어지는 말의 단위입니다. 형태소분석기는 단어를 보고 형태소 단위로 분리해내는 소프트웨어를 말합니다. 이러한 형태소분석은 자연어 처리의 가장 기초적인 절차로 이후 구문 분석이나 의미 분석으로 나아가기 위해 가장 먼저 이루어져야 하는 과정으로 볼 수 있습니다. (한국어 위키피디아에서 인용)
  6 | 
  7 | 
  8 | 데이터 기반
  9 | ----
 10 | 기존 버전이 사전과 규칙에 기반해 분석을 하는 데 반해 khaiii는 데이터(혹은 기계학습) 기반의 알고리즘을 이용하여 분석을 합니다. 학습에 사용한 코퍼스는 국립국어원에서 배포한 [21세기 세종계획 최종 성과물](https://ithub.korean.go.kr/user/noticeView.do?boardSeq=1&articleSeq=16)을 저희 카카오에서 오류를 수정하고 내용을 일부 추가하기도 한 것입니다.
 11 | 
 12 | 전처리 과정에서 오류가 발생하는 문장을 제외하고 약 85만 문장, 천만 어절의 코퍼스를 사용하여 학습을 했습니다. 코퍼스와 품사 체계에 대한 자세한 내용은 [코퍼스](https://github.com/kakao/khaiii/wiki/%EC%BD%94%ED%8D%BC%EC%8A%A4) 문서를 참고하시기 바랍니다.
 13 | 
 14 | 
 15 | 알고리즘
 16 | ----
 17 | 기계학습에는 신경망 알고리즘들 중에서 Convolutional Neural Network(CNN)을 사용하였습니다. 한국어에서 형태소분석은 자연어처리를 위한 가장 기본적인 전처리 과정이므로 속도가 매우 중요한 요소라고 생각합니다. 따라서 자연어처리에 많이 사용하는 Long Short-Term Memory(LSTM)와 같은 Recurrent Neural Network(RNN) 알고리즘은 속도 면에서 활용도가 떨어질 것으로 예상하여 고려 대상에서 제외하였습니다.
 18 | 
 19 | CNN 모델에 대한 상세한 내용은 [CNN 모델](https://github.com/kakao/khaiii/wiki/CNN-%EB%AA%A8%EB%8D%B8) 문서를 참고하시기 바랍니다.
 20 | 
 21 | 
 22 | 성능
 23 | ----
 24 | ### 정확도
 25 | 
 26 | #### v0.3
 27 | CNN 모델의 주요 하이퍼 파라미터는 분류하려는 음절의 좌/우 문맥의 크기를 나타내는 win 값과, 음절 임베딩의 차원을 나타내는 emb 값입니다. win 값은 {2, 3, 4, 5, 7, 10}의 값을 가지며, emb 값은 {20, 30, 40, 50, 70, 100, 150, 200, 300, 500}의 값을 가집니다. 따라서 이 두 가지 값의 조합은 6 x 10으로 총 60가지를 실험하였고 아래와 같은 성능을 보였습니다. 성능 지표는 정확률과 재현율의 조화 평균값인 F-Score입니다.
 28 | 
 29 | ![](.github/img/win_emb_f.png)
 30 | 
 31 | win 파라미터의 경우 3 혹은 4에서 가장 좋은 성능을 보이며 그 이상에서는 오히려 성능이 떨어집니다. emb 파라미터의 경우 150까지는 성능도 같이 높아지다가 그 이상에서는 별 차이가 없습니다. 최상위 5위 중 비교적 작은 모델은 win=3, emb=150으로 F-Score 값은 97.11입니다. 이 모델을 large 모델이라 명명합니다.
 32 | 
 33 | #### v0.4
 34 | [띄어쓰기 오류에 강건한 모델을 위한 실험](https://github.com/kakao/khaiii/wiki/%EB%9D%84%EC%96%B4%EC%93%B0%EA%B8%B0-%EC%98%A4%EB%A5%98%EC%97%90-%EA%B0%95%EA%B1%B4%ED%95%9C-%EB%AA%A8%EB%8D%B8%EC%9D%84-%EC%9C%84%ED%95%9C-%EC%8B%A4%ED%97%98)을 통해 모델을 개선하였습니다. v0.4 모델은 띄어쓰기가 잘 되어있지 않은 입력에 대해 보다 좋은 성능을 보이는데 반해 세종 코퍼스에서는 다소 정확도가 떨어집니다. 이러한 점을 보완하기 위해 base 및 large 모델의 파라미터를 아래와 같이 조금 변경했습니다.
 35 | 
 36 | * base 모델: win=4, emb=35, F-Score: 94.96
 37 | * large 모델: win=4, emb=180, F-Score: 96.71
 38 | 
 39 | 
 40 | ### 속도
 41 | 
 42 | #### v0.3
 43 | 모델의 크기가 커지면 정확도가 높아지긴 하지만 그만큼 계산량 또한 많아져 속도가 떨어집니다. 그래서 적당한 정확도를 갖는 모델 중에서 크기가 작아 속도가 빠른 모델을 base 모델로 선정하였습니다. F-Score 값이 95 이상이면서 모델의 크기가 작은 모델은 win=3, emb=30이며 F-Score는 95.30입니다.
 44 | 
 45 | 속도를 비교하기 위해 1만 문장(총 903KB, 문장 평균 91)의 텍스트를 분석해 비교했습니다. base 모델의 경우 약 10.5초, large 모델의 경우 약 78.8초가 걸립니다.
 46 | 
 47 | #### v0.4
 48 | 모델의 크기가 커짐에 따라 아래와 같이 base, large 모델의 속도를 다시 측정했으며 v0.4 버전에서 다소 느려졌습니다.
 49 | 
 50 | * base 모델: 10.8초 -> 14.4초
 51 | * large 모델: 87.3초 -> 165초
 52 | 
 53 | 
 54 | 사용자 사전
 55 | ----
 56 | 신경망 알고리즘은 소위 말하는 블랙박스 알고리즘으로 결과를 유추하는 과정을 사람이 따라가기가 쉽지 않습니다. 그래서 오분석이 발생할 경우 모델의 파라미터를 수정하여 바른 결과를 내도록 하는 것이 매우 어렵습니다. 이를 위해 khaiii에서는 신경망 알고리즘의 앞단에 기분석 사전을 뒷단에 오분석 패치라는 두 가지 사용자 사전 장치를 마련해 두었습니다.
 57 | 
 58 | ### 기분석 사전
 59 | 기분석 사전은 단일 어절에 대해 문맥에 상관없이 일괄적인 분석 결과를 갖는 경우에 사용합니다. 예를 들어 아래와 같은 엔트리가 있다면,
 60 | 
 61 | 입력 어절 | 분석 결과
 62 | --------|--------
 63 | 이더리움* | 이더리움/NNP
 64 | 
 65 | 문장에서 `이더리움`으로 시작하는 모든 어절은 신경망 알고리즘을 사용하지 않고 `이더리움/NNP`로 동일하게 분석합니다.
 66 | 
 67 | 세종 코퍼스에서 분석 모호성이 없는 어절들로부터 자동으로 기분석 사전을 추출할 경우 약 8만 개의 엔트리가 생성됩니다. 이를 적용할 경우 약간의 속도 향상도 있어서 base 모델에 적용하면 약 9.2초로 10% 정도 속도 향상이 있었습니다.
 68 | 
 69 | 기분석 사전의 기술 방법 및 자세한 내용은 [기분석 사전 문서](https://github.com/kakao/khaiii/wiki/%EA%B8%B0%EB%B6%84%EC%84%9D-%EC%82%AC%EC%A0%84)를 참고하시기 바랍니다.
 70 | 
 71 | 
 72 | ### 오분석 패치
 73 | 오분석 패치는 여러 어절에 걸쳐서 충분한 문맥과 함께 오분석을 바로잡아야 할 경우에 사용합니다. 예를 들어 아래와 같은 엔트리가 있다면,
 74 | 
 75 | 입력 텍스트 | 오분석 결과 | 정분석 결과
 76 | ---------|-----------|---------
 77 | 이 다른 것 | 이/JKS + _ + 다/VA + 른/MM + _ + 것/NNB | 이/JKS + _ + 다르/VA + ㄴ/ETM + _ + 것/NNB
 78 | 
 79 | khaiii가 위 "오분석 결과"와 같은 오분석을 낸 경우에 한해 바른 분석 결과인 "정분석 결과"로 수정합니다. 여기서 "\_"는 어절 간 경계, 즉 공백을 의미합니다.
 80 | 
 81 | 오분석 패치의 기술 방법 및 자세한 내용은 [오분석 패치 문서](https://github.com/kakao/khaiii/wiki/%EC%98%A4%EB%B6%84%EC%84%9D-%ED%8C%A8%EC%B9%98)를 참고하시기 바랍니다.
 82 | 
 83 | 
 84 | 빌드 및 설치
 85 | ----
 86 | khaiii의 빌드 및 설치에 관해서는 [빌드 및 설치 문서](https://github.com/kakao/khaiii/wiki/%EB%B9%8C%EB%93%9C-%EB%B0%8F-%EC%84%A4%EC%B9%98)를 참고하시기 바랍니다.
 87 | 
 88 | 
 89 | Contributing
 90 | ----
 91 | khaiii에 기여하실 분들은 [CONTRIBUTING](CONTRIBUTING.md) 및 [개발자를 위한 가이드](https://github.com/kakao/khaiii/wiki#%EA%B0%9C%EB%B0%9C%EC%9E%90%EB%A5%BC-%EC%9C%84%ED%95%9C-%EA%B0%80%EC%9D%B4%EB%93%9C) 문서를 참고하시기 바랍니다.
 92 | 
 93 | 
 94 | License
 95 | ----
 96 | This software is licensed under the [Apache 2 license](LICENSE), quoted below.
 97 | 
 98 | Copyright 2018 Kakao Corp. <http://www.kakaocorp.com>
 99 | 
100 | Licensed under the Apache License, Version 2.0 (the "License"); you may not
101 | use this project except in compliance with the License. You may obtain a copy
102 | of the License at http://www.apache.org/licenses/LICENSE-2.0.
103 | 
104 | Unless required by applicable law or agreed to in writing, software
105 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
106 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
107 | License for the specific language governing permissions and limitations under
108 | the License.
109 | 


--------------------------------------------------------------------------------
/cmake/FindGperftools.cmake:
--------------------------------------------------------------------------------
 1 | # Tries to find Gperftools.
 2 | #
 3 | # Usage of this module as follows:
 4 | #
 5 | #     find_package(Gperftools)
 6 | #
 7 | # Variables used by this module, they can change the default behaviour and need
 8 | # to be set before calling find_package:
 9 | #
10 | #  Gperftools_ROOT_DIR  Set this variable to the root installation of
11 | #                       Gperftools if the module has problems finding
12 | #                       the proper installation path.
13 | #
14 | # Variables defined by this module:
15 | #
16 | #  GPERFTOOLS_FOUND              System has Gperftools libs/headers
17 | #  GPERFTOOLS_LIBRARIES          The Gperftools libraries (tcmalloc & profiler)
18 | #  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
19 | 
# Look up each gperftools library; Gperftools_ROOT_DIR (when set) is passed
# as a search hint.
find_library(GPERFTOOLS_TCMALLOC NAMES tcmalloc HINTS ${Gperftools_ROOT_DIR}/lib)
find_library(GPERFTOOLS_PROFILER NAMES profiler HINTS ${Gperftools_ROOT_DIR}/lib)
find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
  NAMES tcmalloc_and_profiler HINTS ${Gperftools_ROOT_DIR}/lib)

# The include directory is the one that contains gperftools/heap-profiler.h.
find_path(GPERFTOOLS_INCLUDE_DIR
  NAMES gperftools/heap-profiler.h HINTS ${Gperftools_ROOT_DIR}/include)

# Only the combined tcmalloc_and_profiler library is exported to callers.
set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})

# Report the result; the package counts as found only when both the library
# list and the include directory are set.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
  Gperftools DEFAULT_MSG GPERFTOOLS_LIBRARIES GPERFTOOLS_INCLUDE_DIR)

# Mark the cache entries created above as advanced.
mark_as_advanced(
  Gperftools_ROOT_DIR
  GPERFTOOLS_TCMALLOC GPERFTOOLS_PROFILER GPERFTOOLS_TCMALLOC_AND_PROFILER
  GPERFTOOLS_LIBRARIES GPERFTOOLS_INCLUDE_DIR)
52 | 


--------------------------------------------------------------------------------
/cmake/FusedMultiplyAdd.cmake:
--------------------------------------------------------------------------------
 1 | include(CheckCXXCompilerFlag)
 2 | check_cxx_compiler_flag(-mfma fma_compiles)
 3 | if(fma_compiles)
 4 |     include(CheckCXXSourceRuns)
 5 |     set(test_src
 6 |     	"#include <cmath>
 7 |     	double fma_wrap(double x, double y, double z) { return fma(x, y, z); }
 8 |     	int main() { double a = fma_wrap(1.2, 3.4, 5.6); return 0; }")
 9 |     set(CMAKE_REQUIRED_FLAGS -mfma)
10 |     check_cxx_source_runs("${test_src}" fma_runs)
11 |     if(fma_runs)
12 |         message(STATUS "[khaiii] fused multiply add option enabled")
13 |         add_definitions(-mfma)
14 |     else()
15 |         message(WARNING "[khaiii] cpu does not have fused multiply add instruction")
16 |     endif()
17 | else()
18 |     message(WARNING "[khaiii] compiler does not support fused multiply add option")
19 | endif()
20 | 


--------------------------------------------------------------------------------
/cmake/Hunter/config.cmake:
--------------------------------------------------------------------------------
# Version pins for the third-party packages configured through Hunter.
# Each hunter_config() call fixes one package to the given VERSION.
hunter_config(Boost VERSION 1.68.0-p1)
hunter_config(cxxopts VERSION 2.1.1-pre)
hunter_config(Eigen VERSION 3.3.5)
hunter_config(fmt VERSION 4.1.0)
hunter_config(GTest VERSION 1.8.0-hunter-p11)
hunter_config(nlohmann_json VERSION 3.3.0)
hunter_config(spdlog VERSION 0.16.3-p1)
8 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM pytorch/pytorch:latest
 2 | MAINTAINER nako.sung@navercorp.com
 3 | 
 4 | RUN git clone https://github.com/kakao/khaiii.git
 5 | WORKDIR /workspace/khaiii
 6 | 
 7 | RUN pip install cython
 8 | RUN pip install --upgrade pip
 9 | RUN pip install -r requirements.txt
10 | 
11 | RUN mkdir build
12 | WORKDIR /workspace/khaiii/build
13 | 
14 | RUN cmake ..
15 | RUN make all
16 | RUN make resource
17 | 
18 | RUN apt-get update -y
19 | RUN apt-get install -y language-pack-ko
20 | RUN locale-gen en_US.UTF-8
21 | RUN update-locale LANG=en_US.UTF-8
22 | 


--------------------------------------------------------------------------------
/include/khaiii/KhaiiiApi.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef INCLUDE_KHAIII_KHAIIIAPI_HPP_
 8 | #define INCLUDE_KHAIII_KHAIIIAPI_HPP_
 9 | 
10 | 
11 | 
12 | //////////////
13 | // includes //
14 | //////////////
15 | #include <exception>
16 | #include <memory>
17 | #include <mutex>    // NOLINT
18 | #include <string>
19 | 
20 | #include "khaiii/khaiii_api.h"
21 | 
22 | 
23 | namespace khaiii {
24 | 
25 | 
26 | class KhaiiiApi {
27 |  public:
28 |     /**
29 |      * create khaiii api object
30 |      * @return  shared pointer of khaiii api object
31 |      */
32 |     static std::shared_ptr<KhaiiiApi> create();
33 | 
34 |     /**
35 |      * open resources
36 |      * @param  rsc_dir  resource directory
37 |      * @param  opt_str  option string (JSON format)
38 |      */
39 |     virtual void open(std::string rsc_dir = "", std::string opt_str = "") = 0;
40 | 
41 |     /**
42 |      * analyze input text
43 |      * @param  input  input text
44 |      * @param  opt_str  runtime option (JSON format)
45 |      * @return  results
46 |      */
47 |     virtual const khaiii_word_t* analyze(const char* input, const char* opt_str) = 0;
48 | 
49 |     /**
50 |      * free memories of analyzed results
51 |      * @param  results  results got from analyze() function
52 |      */
53 |     virtual void free_results(const khaiii_word_t* results) = 0;
54 | 
55 |     virtual void close() = 0;    ///< close resources
56 | };
57 | 
58 | 
59 | /**
60 |  * standard exception thrown by khaiii api
61 |  */
class Except: public std::exception {
 public:
    /**
     * @param  msg  error message
     * @param  file  source file (for debug)
     * @param  line  line number in source file (for debug)
     * @param  func  function name (for debug)
     */
    explicit Except(std::string msg, const char* file = nullptr, const int line = 0,
                    const char* func = nullptr);

    /**
     * overrides std::exception::what(); `override` lets the compiler reject
     * any accidental signature mismatch with the base class
     */
    const char* what() const noexcept override;

    std::string debug();    ///< message with some debug information

 private:
    std::string _msg;    ///< error message
    const char* _file = nullptr;    ///< source file
    const int _line = 0;    ///< line number in source file
    const char* _func = nullptr;    ///< function name
};
83 | 
84 | 
85 | }    // namespace khaiii
86 | 
87 | 
88 | #endif    // INCLUDE_KHAIII_KHAIIIAPI_HPP_
89 | 


--------------------------------------------------------------------------------
/include/khaiii/khaiii_api.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef INCLUDE_KHAIII_KHAIII_API_H_
  8 | #define INCLUDE_KHAIII_KHAIII_API_H_
  9 | 
 10 | 
 11 | ///////////////
 12 | // constants //
 13 | ///////////////
// numeric components of the library version
#define KHAIII_VERSION_MAJOR 0
#define KHAIII_VERSION_MINOR 4
// _MAC2STR turns its argument into a string literal exactly as written.
// _JOIN_VER forwards x and y through one more macro level, so macro arguments
// such as KHAIII_VERSION_MAJOR are expanded to their values before being
// stringified; the adjacent literals are then concatenated by the compiler.
#define _MAC2STR(m) #m
#define _JOIN_VER(x,y) _MAC2STR(x) "." _MAC2STR(y)    // NOLINT
// version string "<major>.<minor>" built from the two components above
#define KHAIII_VERSION _JOIN_VER(KHAIII_VERSION_MAJOR,KHAIII_VERSION_MINOR)    // NOLINT
 19 | 
 20 | 
 21 | #ifdef __cplusplus
 22 | extern "C" {
 23 | #endif
 24 | 
 25 | 
/**
 * morpheme data structure
 * morphemes are chained to each other through the `next` pointer
 */
typedef struct khaiii_morph_t_ {
    const char* lex;    ///< lexical form of the morpheme
    const char* tag;    ///< part-of-speech tag
    int begin;    ///< morpheme begin position (unit not stated here; TODO confirm bytes vs. characters)
    int length;    ///< morpheme length (same unit as begin, presumably; confirm)
    char reserved[8];    ///< reserved
    const struct khaiii_morph_t_* next;    ///< next morpheme (presumably NULL after the last one; confirm)
} khaiii_morph_t;
 37 | 
 38 | 
/**
 * word data structure
 * words are chained to each other through the `next` pointer and each word
 * refers to its own list of morphemes
 */
typedef struct khaiii_word_t_ {
    int begin;    ///< word begin position (unit not stated here; TODO confirm bytes vs. characters)
    int length;    ///< word length (same unit as begin, presumably; confirm)
    char reserved[8];    ///< reserved
    const khaiii_morph_t* morphs;    ///< morpheme list (first khaiii_morph_t of this word)
    const struct khaiii_word_t_* next;    ///< next word (presumably NULL after the last one; confirm)
} khaiii_word_t;
 49 | 
 50 | 
 51 | /**
 52 |  * get version string
 53 |  * @return   version string like "2.1"
 54 |  */
 55 | const char* khaiii_version();
 56 | 
 57 | 
 58 | /**
 59 |  * open resources
 60 |  * @param  rsc_dir  resource directory
 61 |  * @param  opt_str  option string (JSON format)
 62 |  * @return   handle. -1 if failed
 63 |  */
 64 | int khaiii_open(const char* rsc_dir, const char* opt_str);
 65 | 
 66 | 
 67 | /**
 68 |  * analyze input text
 69 |  * @param  handle  handle got from open() function
 70 |  * @param  input  input text
 71 |  * @param  opt_str  runtime option (JSON format)
 72 |  * @return  results. NULL if failed
 73 |  */
 74 | const khaiii_word_t* khaiii_analyze(int handle, const char* input, const char* opt_str);
 75 | 
 76 | 
 77 | /**
 78 |  * free memories of analyzed results
 79 |  * @param  handle  handle got from open() function
 80 |  * @param  results  results got from analyze() function
 81 |  */
 82 | void khaiii_free_results(int handle, const khaiii_word_t* results);
 83 | 
 84 | 
 85 | /**
 86 |  * close resources
 87 |  * @param  handle  handle got from open() function
 88 |  */
 89 | void khaiii_close(int handle);
 90 | 
 91 | 
 92 | /**
 93 |  * get last error
 94 |  * @param  handle  handle got from open() function
 95 |  * @return  message
 96 |  */
 97 | const char* khaiii_last_error(int handle);
 98 | 
 99 | 
100 | #ifdef __cplusplus
101 | }
102 | #endif
103 | 
104 | 
105 | #endif    // INCLUDE_KHAIII_KHAIII_API_H_
106 | 


--------------------------------------------------------------------------------
/include/khaiii/khaiii_dev.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef INCLUDE_KHAIII_KHAIII_DEV_H_
 8 | #define INCLUDE_KHAIII_KHAIII_DEV_H_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <stdint.h>
15 | 
16 | 
17 | #ifdef __cplusplus
18 | extern "C" {
19 | #endif
20 | 
21 | 
/**
 * run the analysis, stop right before the error patch is applied, and return
 * the result at that point.
 * @param  handle  handle got from open() function
 * @param  input  input text
 * @param  opt_str  runtime option (JSON format)
 * @param  output  output value for each character
 * @return  output length. -1 if failed
 */
int khaiii_analyze_bfr_errpatch(int handle, const char* input, const char* opt_str,
                                int16_t* output);

/**
 * set the log level of a logger.
 * @param  name  logger name. "all" means every logger
 * @param  level  log level. trace, debug, info, warn, err, critical
 * @return  0 if success. -1 if failed
 */
int khaiii_set_log_level(const char* name, const char* level);


/**
 * set several log levels at once.
 * @param  name_level_pairs  list of logger (name, level) pairs,
 *                           in a form like "all:warn,console:info,Tagger:debug"
 * @return  0 if success. -1 if failed
 */
int khaiii_set_log_levels(const char* name_level_pairs);
49 | 
50 | 
51 | #ifdef __cplusplus
52 | }
53 | #endif
54 | 
55 | 
56 | #endif    // INCLUDE_KHAIII_KHAIII_DEV_H_
57 | 


--------------------------------------------------------------------------------
/munjong/apply_patch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | apply patch to original Sejong corpus
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser, Namespace
16 | import logging
17 | import os
18 | import shutil
19 | 
20 | from khaiii.munjong import libpatch
21 | 
22 | 
23 | #############
24 | # functions #
25 | #############
def run(args: Namespace):
    """
    apply the patches to every '.txt' file of the original corpus directory
    and write the results into the modified corpus directory.
    a file that has no '<name>.patch' in the patch directory is copied as is.
    Args:
        args:  program arguments (original, patch, modified, org_enc, mod_enc)
    """
    if not os.path.exists(args.modified):
        logging.info('creating modified corpus dir: %s', args.modified)
        # makedirs, unlike mkdir, also creates missing parent directories;
        # exist_ok tolerates the directory appearing in the meantime
        os.makedirs(args.modified, exist_ok=True)

    # sorted, so that files are processed (and logged) in a stable order
    for name in sorted(os.listdir(args.original)):
        if not name.endswith('.txt'):
            continue
        org_path = os.path.join(args.original, name)
        mod_path = os.path.join(args.modified, name)
        # the patch file shares the stem of the corpus file: abc.txt -> abc.patch
        patch_path = os.path.join(args.patch, name[:-len('.txt')] + '.patch')
        if os.path.exists(patch_path):
            logging.info('[%s] + [%s] = [%s]', org_path, patch_path, mod_path)
            libpatch.apply(org_path, args.org_enc, patch_path, mod_path, args.mod_enc)
        else:
            logging.info('[%s] = [%s]', org_path, mod_path)
            shutil.copyfile(org_path, mod_path)
48 | 
49 | 
50 | ########
51 | # main #
52 | ########
def main():
    """
    parse the command line, configure logging and hand the arguments to run()
    """
    parser = ArgumentParser(description='apply patch to original Sejong corpus')

    # the three mandatory directory options share metavar and required flag
    for short_opt, long_opt, help_text in (
            ('-o', '--original', 'original corpus dir'),
            ('-p', '--patch', 'patch dir'),
            ('-m', '--modified', 'modified corpus output dir')):
        parser.add_argument(short_opt, long_opt, help=help_text, metavar='DIR', required=True)

    # the two encoding options differ only in name, help text and default
    for long_opt, help_text, default_enc in (
            ('--org-enc', 'original corpus encoding <default: UTF-16>', 'UTF-16'),
            ('--mod-enc', 'modified corpus encoding <default: UTF-8>', 'UTF-8')):
        parser.add_argument(long_opt, help=help_text, metavar='ENCODING', default=default_enc)

    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    run(args)
76 | 
77 | 
78 | if __name__ == '__main__':
79 |     main()
80 | 


--------------------------------------------------------------------------------
/munjong/convert_jamo_to_compat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | 한글 자모 영역의 코드를 호환 영역으로 변환
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser
16 | import logging
17 | import sys
18 | 
19 | from khaiii.munjong.sejong_corpus import WORD_ID_PTN
20 | from khaiii.resource.jaso import norm_compat
21 | 
22 | 
23 | #############
24 | # functions #
25 | #############
def _norm(text: str) -> str:
    """
    normalize a text: apply norm_compat() and then replace two remaining
    characters with their counterparts
    Args:
        text:  input text
    Returns:
        normalized text
    """
    # first replace: 0x119e -> 0x318d, second replace: 0x111d -> 0x3171
    return norm_compat(text).replace('ᆞ', 'ㆍ').replace('ᄝ', 'ㅱ')
38 | 
39 | 
def run():
    """
    read the corpus from stdin and write it to stdout; on lines matching
    WORD_ID_PTN the word and morpheme columns are normalized, every other
    line is passed through unchanged
    """
    for raw_line in sys.stdin:
        record = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(record):
            wid, word, morph = record.split('\t')
            record = '{}\t{}\t{}'.format(wid, _norm(word), _norm(morph))
        print(record)
51 | 
52 | 
53 | ########
54 | # main #
55 | ########
def main():
    """
    parse the command line, redirect stdin/stdout to the given files if any,
    configure logging and call run()
    """
    parser = ArgumentParser(description='한글 자모 영역의 코드를 호환 영역으로 변환')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    opts = parser.parse_args()

    # run() works on sys.stdin / sys.stdout, so files are plugged in there
    if opts.input:
        sys.stdin = open(opts.input, 'r', encoding='UTF-8')
    if opts.output:
        sys.stdout = open(opts.output, 'w', encoding='UTF-8')

    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.INFO)

    run()
76 | 
77 | 
78 | if __name__ == '__main__':
79 |     main()
80 | 


--------------------------------------------------------------------------------
/munjong/detect_sejong_period_error.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | detect period error of Sejong corpus
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from argparse import ArgumentParser
 16 | import logging
 17 | import os
 18 | import re
 19 | import sys
 20 | from typing import Iterator, TextIO, Tuple
 21 | 
 22 | from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
 23 | 
 24 | 
 25 | #############
 26 | # functions #
 27 | #############
 28 | def _get_two_lines(fin: TextIO) -> Iterator[Tuple[str, str]]:
 29 |     """
 30 |     get two lines tuple from file (generator)
 31 |     Args:
 32 |         fin:  input file
 33 |     Yields:
 34 |         current line
 35 |         next line
 36 |     """
 37 |     curr_line = fin.readline().rstrip('\r\n')
 38 |     for next_line in fin:
 39 |         next_line = next_line.rstrip('\r\n')
 40 |         yield curr_line, next_line
 41 |         curr_line = next_line
 42 | 
 43 | 
 44 | def _is_correct_eos(line: str) -> bool:
 45 |     """
 46 |     whether correct end of sentence or not
 47 |     Args:
 48 |         line:  line (word)
 49 |     Returns:
 50 |         whether correct or not
 51 |     """
 52 |     _, _, morphs_str = line.split('\t')
 53 |     if re.match(r'.+/EF \+ ./SF$', morphs_str):
 54 |         return True
 55 |     if re.match(r'.+/SF \+ [\'"’”」\]]/SS$', morphs_str):
 56 |         return True
 57 |     morphs = [Morph.parse(_) for _ in morphs_str.split(' + ')]
 58 |     tags_str = '+'.join([_.tag for _ in morphs])
 59 |     if tags_str.endswith('+SF+SS+JKQ') or tags_str.endswith('+SF+SS+VCP+ETM'):
 60 |         return True
 61 |     return False
 62 | 
 63 | 
 64 | def run():
 65 |     """
 66 |     run function which is the start point of program
 67 |     """
 68 |     file_name = os.path.basename(sys.stdin.name)
 69 |     for line_num, (curr_line, next_line) in enumerate(_get_two_lines(sys.stdin), start=1):
 70 |         cols = curr_line.split('\t')
 71 |         if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]):
 72 |             continue
 73 |         if '/SF + ' not in cols[2] or not next_line.startswith('</'):
 74 |             continue
 75 |         if _is_correct_eos(curr_line):
 76 |             continue
 77 |         print('{}:{}\t{}'.format(file_name, line_num, curr_line))
 78 | 
 79 | 
 80 | ########
 81 | # main #
 82 | ########
 83 | def main():
 84 |     """
 85 |     main function processes only argument parsing
 86 |     """
 87 |     parser = ArgumentParser(description='detect period error of Sejong corpus')
 88 |     parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
 89 |     parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
 90 |     parser.add_argument('--debug', help='enable debug', action='store_true')
 91 |     args = parser.parse_args()
 92 | 
 93 |     if args.input:
 94 |         sys.stdin = open(args.input, 'rt')
 95 |     if args.output:
 96 |         sys.stdout = open(args.output, 'wt')
 97 |     if args.debug:
 98 |         logging.basicConfig(level=logging.DEBUG)
 99 |     else:
100 |         logging.basicConfig(level=logging.INFO)
101 | 
102 |     run()
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     main()
107 | 


--------------------------------------------------------------------------------
/munjong/fix_final_symbol_error.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | fix final symbol errors on Sejong corpus
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser
16 | import logging
17 | import os
18 | import sys
19 | 
20 | from khaiii.munjong.sejong_corpus import Morph, Word, WORD_ID_PTN
21 | 
22 | 
23 | #############
24 | # functions #
25 | #############
def _attach_missing_symbol(word: Word):
    """
    Append a morpheme for a trailing symbol that the analysis left out.

    Nothing happens unless the raw word equals the concatenated morpheme
    lexicons plus exactly one extra trailing character. A trailing '.' is added
    as SF only when the last morpheme is tagged EC; ',' is added as SP and '"'
    as SS.
    Args:
        word:  Word object (modified in place)
    """
    surface = word.raw
    analyzed = ''.join(morph.lex for morph in word.morphs)
    has_one_extra_char = surface.startswith(analyzed) and len(surface) == len(analyzed) + 1
    if not has_one_extra_char:
        return
    trailing = surface[-1]
    if trailing == '.':
        # a period is only restored after a connective ending
        tag = 'SF' if word.morphs[-1].tag == 'EC' else None
    else:
        tag = {',': 'SP', '"': 'SS'}.get(trailing)
    if tag is not None:
        word.morphs.append(Morph(trailing, tag))
43 | 
44 | 
def run():
    """
    Copy the corpus from standard input to standard output, parsing every line
    that matches WORD_ID_PTN into a Word, passing it to _attach_missing_symbol()
    and printing the resulting Word. Other lines are echoed unchanged.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_num, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, file_name, line_num)
            _attach_missing_symbol(word)
            print(word)
        else:
            print(text)
58 | 
59 | 
60 | ########
61 | # main #
62 | ########
def main():
    """
    main function processes only argument parsing

    Redirects stdin/stdout to the files given by --input/--output, configures
    the logging level and calls run().
    """
    parser = ArgumentParser(description='fix final symbol errors on Sejong corpus')
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    # The corpus holds Korean text; pin the encoding instead of relying on the
    # locale default, as the sibling scripts in this directory already do.
    if args.input:
        sys.stdin = open(args.input, 'r', encoding='UTF-8')
    if args.output:
        sys.stdout = open(args.output, 'w', encoding='UTF-8')
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    run()
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     main()
87 | 


--------------------------------------------------------------------------------
/munjong/make_patch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | make patch from two Sejong corpora
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser, Namespace
16 | import logging
17 | import os
18 | 
19 | from khaiii.munjong import libpatch
20 | 
21 | 
22 | #############
23 | # functions #
24 | #############
def run(args: Namespace):
    """
    Create one patch file per '*.txt' corpus file.

    For every '<name>.txt' in args.original, the file of the same name in
    args.modified is compared through libpatch.make(). A non-empty result is
    written to '<args.patch>/<name>.patch'; when the result is empty, an
    existing patch file of that name is removed.
    Args:
        args:  program arguments (original, modified, patch, org_enc, mod_enc)
    """
    if not os.path.exists(args.patch):
        logging.info('creating patch dir: %s', args.patch)
        # makedirs also creates missing parent directories and does not fail
        # when another process creates the directory in the meantime
        os.makedirs(args.patch, exist_ok=True)

    for name in sorted(os.listdir(args.original)):
        if not name.endswith('.txt'):
            continue
        org_path = os.path.join(args.original, name)
        mod_path = os.path.join(args.modified, name)
        patch_path = os.path.join(args.patch, name[:-len('.txt')] + '.patch')
        logging.info('[%s] - [%s] = [%s]', org_path, mod_path, patch_path)
        patches = libpatch.make(org_path, args.org_enc, mod_path, args.mod_enc)
        if patches:
            logging.info('creating patch file: %s', patch_path)
            with open(patch_path, 'w', encoding='UTF-8') as fout:
                for patch in patches:
                    print(patch, file=fout)
        elif os.path.exists(patch_path):
            # no differences any more: drop the stale patch from an earlier run
            logging.info('removing existing patch file: %s', patch_path)
            os.remove(patch_path)
51 | 
52 | 
53 | ########
54 | # main #
55 | ########
def main():
    """
    Parse the command line (three required directories, two optional
    encodings and a debug switch), set the logging level and call run().
    """
    parser = ArgumentParser(description='make patch from two Sejong corpora')
    dir_options = (
        ('-o', '--original', 'original corpus dir'),
        ('-m', '--modified', 'modified corpus dir'),
        ('-p', '--patch', 'patch output dir'),
    )
    for short_flag, long_flag, help_text in dir_options:
        parser.add_argument(short_flag, long_flag, help=help_text, metavar='DIR',
                            required=True)
    enc_options = (
        ('--org-enc', 'original corpus encoding <default: UTF-16>', 'UTF-16'),
        ('--mod-enc', 'modified corpus encoding <default: UTF-8>', 'UTF-8'),
    )
    for flag, help_text, default in enc_options:
        parser.add_argument(flag, help=help_text, metavar='ENCODING', default=default)
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    run(args)
79 | 
80 | 
81 | if __name__ == '__main__':
82 |     main()
83 | 


--------------------------------------------------------------------------------
/munjong/recover_english_case.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | recover cases of English letters in Sejong corpus
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser
16 | import copy
17 | import logging
18 | import os
19 | import re
20 | import sys
21 | 
22 | from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
23 | 
24 | 
25 | #############
26 | # functions #
27 | #############
def _recover(word: Word):
    """
    Restore the letter case of ASCII letters in the morphemes from the raw word.

    The n-th ASCII letter found in the morpheme lexicons is replaced by the n-th
    ASCII letter of the raw word. The work is done on a deep copy, so the word
    is left untouched when an IndexError is raised because the morphemes hold
    more letters than the raw word.
    Args:
        word:  Word object (its morphs are replaced when anything changed)
    """
    raw_letters = [char for char in word.raw if re.match(r'[a-zA-Z]', char)]
    restored = copy.deepcopy(word)
    cursor = 0    # index of the next raw letter to consume
    changed = False
    for morph in restored.morphs:
        rebuilt = []
        for char in morph.lex:
            if re.match(r'[a-zA-Z]', char):
                rebuilt.append(raw_letters[cursor])
                cursor += 1
            else:
                rebuilt.append(char)
        new_lex = ''.join(rebuilt)
        if new_lex != morph.lex:
            morph.lex = new_lex
            changed = True
    if changed:
        logging.info('%s  =>  %s', str(word), restored.morph_str())
        word.morphs = restored.morphs
50 | 
51 | 
def run():
    """
    Copy the corpus from standard input to standard output, applying _recover()
    to every line that matches WORD_ID_PTN. An IndexError raised by _recover()
    is logged and the word is printed anyway; other lines are echoed unchanged.
    """
    file_name = os.path.basename(sys.stdin.name)
    for line_num, raw_line in enumerate(sys.stdin, start=1):
        text = raw_line.rstrip('\r\n')
        if WORD_ID_PTN.match(text):
            word = Word.parse(text, file_name, line_num)
            try:
                _recover(word)
            except IndexError as idx_err:
                logging.error('%s(%d): %s: %s', file_name, line_num, idx_err, word)
            print(word)
        else:
            print(text)
68 | 
69 | 
70 | ########
71 | # main #
72 | ########
def main():
    """
    Parse the command line, redirect stdin/stdout to the requested UTF-8 files,
    set the logging level and then call run().
    """
    parser = ArgumentParser(description='recover cases of English letters in Sejong corpus')
    file_options = (
        ('--input', 'input file <default: stdin>'),
        ('--output', 'output file <default: stdout>'),
    )
    for flag, help_text in file_options:
        parser.add_argument(flag, help=help_text, metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    if args.input:
        sys.stdin = open(args.input, 'r', encoding='UTF-8')
    if args.output:
        sys.stdout = open(args.output, 'w', encoding='UTF-8')
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    run()
93 | 
94 | 
95 | if __name__ == '__main__':
96 |     main()
97 | 


--------------------------------------------------------------------------------
/munjong/recover_raw_morph_mismatch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | 어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우 원문의 문자로 복원
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from argparse import ArgumentParser
 16 | import logging
 17 | import os
 18 | import sys
 19 | 
 20 | from khaiii.munjong.sejong_corpus import Morph, ParseError, Word, WORD_ID_PTN
 21 | 
 22 | 
 23 | #############
 24 | # functions #
 25 | #############
 26 | def _recover(line: str) -> str:
 27 |     """
 28 |     문자를 복원한다.
 29 |     Args:
 30 |         line:  어절 라인
 31 |     Returns:
 32 |         복원된 라인
 33 |     """
 34 |     wid, raw, morphs_str = line.split('\t')
 35 |     raw_idx = 0
 36 |     morphs = []
 37 |     for token_str in morphs_str.split(' + '):
 38 |         morph = Morph.parse(token_str)
 39 |         lex = []
 40 |         for _ in range(len(morph.lex)):
 41 |             try:
 42 |                 lex.append(raw[raw_idx])
 43 |                 raw_idx += 1
 44 |             except IndexError as idx_err:
 45 |                 logging.error(line)
 46 |                 raise idx_err
 47 |         morph.lex = ''.join(lex)
 48 |         morphs.append(morph)
 49 |     morphs_new = ' + '.join([str(m) for m in morphs])
 50 |     logging.debug('%s\t%s\t%s  =>  %s', wid, raw, morphs_str, morphs_new)
 51 |     return '{}\t{}\t{}'.format(wid, raw, morphs_new)
 52 | 
 53 | 
 54 | def run():
 55 |     """
 56 |     run function which is the start point of program
 57 |     """
 58 |     file_name = os.path.basename(sys.stdin.name)
 59 |     for line_num, line in enumerate(sys.stdin, start=1):
 60 |         line = line.rstrip('\r\n')
 61 |         if not WORD_ID_PTN.match(line):
 62 |             print(line)
 63 |             continue
 64 |         try:
 65 |             Word.parse(line, file_name, line_num)
 66 |         except ParseError as par_err:
 67 |             if 'raw-morph mismatch' in str(par_err):
 68 |                 line = _recover(line)
 69 |             else:
 70 |                 raise par_err
 71 |         print(line)
 72 | 
 73 | 
 74 | ########
 75 | # main #
 76 | ########
 77 | def main():
 78 |     """
 79 |     main function processes only argument parsing
 80 |     """
 81 |     parser = ArgumentParser(description='어절의 원문과 형태소 분석 결과의 문자가 정규화하면 같지만 코드가 다른 경우'
 82 |                                         ' 원문의 문자로 복원')
 83 |     parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
 84 |     parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
 85 |     parser.add_argument('--debug', help='enable debug', action='store_true')
 86 |     args = parser.parse_args()
 87 | 
 88 |     if args.input:
 89 |         sys.stdin = open(args.input, 'r', encoding='UTF-8')
 90 |     if args.output:
 91 |         sys.stdout = open(args.output, 'w', encoding='UTF-8')
 92 |     if args.debug:
 93 |         logging.basicConfig(level=logging.DEBUG)
 94 |     else:
 95 |         logging.basicConfig(level=logging.INFO)
 96 | 
 97 |     run()
 98 | 
 99 | 
100 | if __name__ == '__main__':
101 |     main()
102 | 


--------------------------------------------------------------------------------
/munjong/recover_wide_quotation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | recover wide char quotations in Sejong corpus
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from argparse import ArgumentParser
 16 | import logging
 17 | import os
 18 | import sys
 19 | 
 20 | from khaiii.munjong.sejong_corpus import Word, WORD_ID_PTN
 21 | 
 22 | 
 23 | #############
 24 | # constants #
 25 | #############
# Maps each quotation mark handled by this script to a normalized ASCII form:
# straight and curly double quotes map to '"'; straight and curly single quotes
# and the backtick map to "'". Membership in this dict is also used to decide
# whether a character counts as a quotation mark at all.
_QUOT_NORM = {
    '"': '"',
    '“': '"',
    '”': '"',
    "'": "'",
    "‘": "'",
    "’": "'",
    "`": "'",
}
 35 | 
 36 | 
 37 | #############
 38 | # functions #
 39 | #############
 40 | def _recover(word: Word):
 41 |     """
 42 |     recover wide char quotations
 43 |     Args:
 44 |         word:  Word object
 45 |     """
 46 |     word_quots = [_ for _ in word.raw if _ in _QUOT_NORM]
 47 |     morph_quots = []
 48 |     for idx, morph in enumerate(word.morphs):
 49 |         if morph.tag != 'SS' or morph.lex not in _QUOT_NORM:
 50 |             continue
 51 |         morph_quots.append((idx, morph))
 52 |         quot_idx = len(morph_quots)-1
 53 |         if len(word_quots) <= quot_idx or _QUOT_NORM[word_quots[quot_idx]] != _QUOT_NORM[morph.lex]:
 54 |             logging.error('%d-th quots are different: %s', quot_idx+1, word)
 55 |             return
 56 |     if len(word_quots) != len(morph_quots):
 57 |         morph_quots = [_ for _ in word.morph_str() if _ in _QUOT_NORM]
 58 |         if word_quots != morph_quots:
 59 |             logging.error('number of quots are different: %s', word)
 60 |         return
 61 |     for word_char, (idx, morph) in zip(word_quots, morph_quots):
 62 |         if word_char == morph.lex:
 63 |             continue
 64 |         morph.lex = word_char
 65 | 
 66 | 
 67 | def run():
 68 |     """
 69 |     run function which is the start point of program
 70 |     """
 71 |     file_name = os.path.basename(sys.stdin.name)
 72 |     for line_num, line in enumerate(sys.stdin, start=1):
 73 |         line = line.rstrip('\r\n')
 74 |         if not WORD_ID_PTN.match(line):
 75 |             print(line)
 76 |             continue
 77 |         word = Word.parse(line, file_name, line_num)
 78 |         _recover(word)
 79 |         print(word)
 80 | 
 81 | 
 82 | ########
 83 | # main #
 84 | ########
 85 | def main():
 86 |     """
 87 |     main function processes only argument parsing
 88 |     """
 89 |     parser = ArgumentParser(description='recover wide char quotations in Sejong corpus')
 90 |     parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
 91 |     parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
 92 |     parser.add_argument('--debug', help='enable debug', action='store_true')
 93 |     args = parser.parse_args()
 94 | 
 95 |     if args.input:
 96 |         sys.stdin = open(args.input, 'rt')
 97 |     if args.output:
 98 |         sys.stdout = open(args.output, 'wt')
 99 |     if args.debug:
100 |         logging.basicConfig(level=logging.DEBUG)
101 |     else:
102 |         logging.basicConfig(level=logging.INFO)
103 | 
104 |     run()
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     main()
109 | 


--------------------------------------------------------------------------------
/munjong/remove_sejong_period_error.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | remove wrong sentence breaking marks after period error eojeol
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2017-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from argparse import ArgumentParser
 16 | import logging
 17 | import os
 18 | import re
 19 | import sys
 20 | from typing import TextIO, Tuple
 21 | 
 22 | from khaiii.munjong.sejong_corpus import Morph, WORD_ID_PTN
 23 | 
 24 | 
 25 | #############
 26 | # functions #
 27 | #############
 28 | def _get_three_lines(fin: TextIO) -> Tuple[str, str, str]:
 29 |     """
 30 |     get three lines tuple from file (generator)
 31 |     Args:
 32 |         fin:  input file
 33 |     Yields:
 34 |         prev. prev. line
 35 |         prev. line
 36 |         curr. line
 37 |     """
 38 |     prev_prev_line = fin.readline().rstrip('\r\n')
 39 |     prev_line = fin.readline().rstrip('\r\n')
 40 |     # print first two lines
 41 |     print(prev_prev_line)
 42 |     print(prev_line)
 43 |     for curr_line in fin:
 44 |         curr_line = curr_line.rstrip('\r\n')
 45 |         yield prev_prev_line, prev_line, curr_line
 46 |         prev_prev_line = prev_line
 47 |         prev_line = curr_line
 48 | 
 49 | 
 50 | def _is_known_period_error_eojeol(line: str) -> bool:
 51 |     """
 52 |     알려진 특정 문장분리 오류를 포함하는 어절인 지 여부
 53 |     Args:
 54 |         line:  line (eojeol)
 55 |     Returns:
 56 |         whether has error or not
 57 |     """
 58 |     cols = line.split('\t')
 59 |     if len(cols) != 3 or not WORD_ID_PTN.match(cols[0]):
 60 |         return False
 61 |     if '/SF + ' not in cols[2] or re.match(r'.+/EF \+ ./SF$', cols[2]):
 62 |         return False
 63 |     if re.match(r'.+/SF \+ [\'"’”]/SS$', cols[2]):
 64 |         return False
 65 |     morphs = [Morph.parse(_) for _ in cols[2].split(' + ')]
 66 |     tags_str = '+'.join([_.tag for _ in morphs])
 67 |     if 'SN+SF+SN' in tags_str and not tags_str.endswith('+SF'):
 68 |         # 4.6판: 4/SN + ./SF + 6/SN + 판/NNB
 69 |         if 'XSN+SF+SN' not in tags_str:
 70 |             return True
 71 |     elif 'SL+SF+SL' in tags_str and not tags_str.endswith('+SF'):
 72 |         # S.M.오너: S/SL + ./SF + M/SL + ./SF + 오너/NNG
 73 |         return True
 74 |     return False
 75 | 
 76 | 
 77 | def run():
 78 |     """
 79 |     run function which is the start point of program
 80 |     """
 81 |     file_name = os.path.basename(sys.stdin.name)
 82 |     for line_num, (prev_prev_line, prev_line, curr_line) in enumerate(_get_three_lines(sys.stdin),
 83 |                                                                       start=1):
 84 |         if curr_line == '</p>' and _is_known_period_error_eojeol(prev_line):
 85 |             continue
 86 |         elif prev_line == '</p>' and curr_line == '<p>' and \
 87 |                 _is_known_period_error_eojeol(prev_prev_line):
 88 |             logging.info('%s:%d\t%s', file_name, line_num, prev_prev_line)
 89 |             continue
 90 |         print(curr_line)
 91 | 
 92 | 
 93 | ########
 94 | # main #
 95 | ########
 96 | def main():
 97 |     """
 98 |     main function processes only argument parsing
 99 |     """
100 |     parser = ArgumentParser(description='remove wrong sentence breaking marks after'
101 |                                         ' period error eojeol')
102 |     parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
103 |     parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
104 |     parser.add_argument('--debug', help='enable debug', action='store_true')
105 |     args = parser.parse_args()
106 | 
107 |     if args.input:
108 |         sys.stdin = open(args.input, 'rt')
109 |     if args.output:
110 |         sys.stdout = open(args.output, 'wt')
111 |     if args.debug:
112 |         logging.basicConfig(level=logging.DEBUG)
113 |     else:
114 |         logging.basicConfig(level=logging.INFO)
115 | 
116 |     run()
117 | 
118 | 
119 | if __name__ == '__main__':
120 |     main()
121 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cmake>=3.10
2 | 


--------------------------------------------------------------------------------
/rsc/Makefile:
--------------------------------------------------------------------------------
 1 | HOME_DIR = .
 2 | BIN_DIR = $(HOME_DIR)/bin
 3 | SRC_PYTHON = $(HOME_DIR)/../src/main/python
 4 | RSC_SRC = $(HOME_DIR)/src
 5 | PREFIX = /usr/local
 6 | RSC_DIR = $(PREFIX)/share/khaiii
 7 | MODEL_SIZE = base
 8 | 
 9 | MODEL = \
10 |     $(RSC_DIR)/config.json \
11 |     $(RSC_DIR)/embed.bin \
12 |     $(RSC_DIR)/conv.2.fil \
13 |     $(RSC_DIR)/conv.3.fil \
14 |     $(RSC_DIR)/conv.4.fil \
15 |     $(RSC_DIR)/conv.5.fil \
16 |     $(RSC_DIR)/cnv2hdn.lin \
17 |     $(RSC_DIR)/hdn2tag.lin
18 | 
19 | RESTORE = \
20 |     $(RSC_DIR)/restore.key \
21 |     $(RSC_DIR)/restore.val \
22 |     $(RSC_DIR)/restore.one
23 | 
24 | PREANAL = \
25 |     $(RSC_DIR)/preanal.tri \
26 |     $(RSC_DIR)/preanal.val
27 | 
28 | ERRPATCH = \
29 |     $(RSC_DIR)/errpatch.tri \
30 |     $(RSC_DIR)/errpatch.val \
31 |     $(RSC_DIR)/errpatch.len
32 | 
33 | all: $(MODEL) $(PREANAL) $(RESTORE) $(ERRPATCH)
34 | 
35 | $(wordlist 2,100,$(MODEL)): $(firstword $(MODEL))
36 | $(firstword $(MODEL)): $(RSC_SRC)/$(MODEL_SIZE).config.json $(RSC_SRC)/$(MODEL_SIZE).model.pickle
37 | 	mkdir -p $(RSC_DIR)
38 | 	PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_model.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
39 | 
40 | $(wordlist 2,100,$(PREANAL)): $(firstword $(PREANAL))
41 | $(firstword $(PREANAL)): $(RSC_SRC)/preanal.auto $(RSC_SRC)/preanal.manual
42 | 	mkdir -p $(RSC_DIR)
43 | 	PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_preanal.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
44 | 
45 | $(wordlist 2,100,$(RESTORE)): $(firstword $(RESTORE))
46 | $(firstword $(RESTORE)): $(RSC_SRC)/restore.dic $(RSC_SRC)/vocab.out $(RSC_SRC)/vocab.out.more
47 | 	mkdir -p $(RSC_DIR)
48 | 	PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_restore.py --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
49 | 
50 | $(wordlist 2,100,$(ERRPATCH)): $(firstword $(ERRPATCH))
51 | $(firstword $(ERRPATCH)): $(RSC_SRC)/$(MODEL_SIZE).errpatch.auto $(RSC_SRC)/$(MODEL_SIZE).errpatch.manual
52 | 	mkdir -p $(RSC_DIR)
53 | 	PYTHONPATH=$(SRC_PYTHON) python3 $(BIN_DIR)/compile_errpatch.py --model-size $(MODEL_SIZE) --rsc-src $(RSC_SRC) --rsc-dir $(RSC_DIR)
54 | 
55 | clean:
56 | 	rm -rf $(RSC_DIR)
57 | 


--------------------------------------------------------------------------------
/rsc/src/base.config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "cutoff": 1,
3 |     "embed_dim": 35,
4 |     "hidden_dim": 320,
5 |     "model_id": "munjong.cut1.win4.sdo0.1.emb35.lr0.001.lrd0.9.bs500",
6 |     "rsc_src": "../rsc/src",
7 |     "window": 4
8 | }


--------------------------------------------------------------------------------
/rsc/src/base.errpatch.manual:
--------------------------------------------------------------------------------
1 | # 아래 엔트리는 단위테스트에 사용되는 것으로 삭제하지 마시기 바랍니다.
2 | 지저스크라이스트	지저스크라이스/NNP + 트/NNG	지저스/NNP + 크라이스트/NNP
3 | 지저스 크라이스트	지저스/NNP + _ + 크라이스/NNP + 트/NNG	지저스/NNP + _ + 크라이스트/NNP
4 | 고타마싯다르타	| + 고타마싯다르타/NNP	| + 고타마/NNP + 싯다르타/NNP
5 | 무함마드압둘라	무함마드압/NNP + 둘/NR + 라/NNP + |	무함마드/NNP + 압둘라/NNP + |
6 | 


--------------------------------------------------------------------------------
/rsc/src/base.model.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/rsc/src/base.model.pickle


--------------------------------------------------------------------------------
/rsc/src/large.config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "cutoff": 1,
3 |     "embed_dim": 180,
4 |     "hidden_dim": 610,
5 |     "model_id": "munjong.cut1.win4.sdo0.1.emb180.lr0.001.lrd0.9.bs500",
6 |     "rsc_src": "../rsc/src",
7 |     "window": 4
8 | }


--------------------------------------------------------------------------------
/rsc/src/large.errpatch.auto:
--------------------------------------------------------------------------------
  1 | 이름 석자	이름/NNG + _ + 석자/NNG	이름/NNG + _ + 석/MM + 자/NNG
  2 | 채 썬다	채/MAG + _ + 썰/VV + ㄴ다/EF	채/NNG + _ + 썰/VV + ㄴ다/EF
  3 | 중증급성호흡기증후군	중증/NNG + 급성호흡기/NNG + 증후군/NNG	중증/NNG + 급성/NNG + 호흡기/NNG + 증후군/NNG
  4 |  한국교사휴양원	_ + 한국교사휴양/NNP + 원/NNG	_ + 한국교사휴양원/NNP
  5 | 모여 들	모이/VV + 어/EC + _ + 들/VV	모이/VV + 어/EC + _ + 들/VX
  6 |  연탄가스	_ + 연탄/NNG + 가스/NNG	_ + 연탄가스/NNG
  7 |  실수요자	_ + 실수/NNG + 요자/NNG	_ + 실수요자/NNG
  8 | 너랑 나	너/NP + 랑/JKB + _ + 나/NP	너/NP + 랑/JC + _ + 나/NP
  9 | 시아누크공	시아누크공/NNP	시아누크/NNP + 공/NNG
 10 | 하리만치	하/XSA + 리만/EC + 치/MAG	하/XSA + 리만치/EC
 11 | 그래선지	| + 그렇/VA + 어선지/EC	| + 그렇/VA + 어서/EC + 이/VCP + ㄴ지/EC
 12 | 대대적인	대대/NNG + 적/XSN + 이/VCP + ㄴ/ETM	대대적/NNG + 이/VCP + ㄴ/ETM
 13 | 이른바 `	이른/MAJ + 바/MAG + _ + `/SS	이른바/MAJ + _ + `/SS
 14 | 산간벽지	산간/NNG + 벽지/NNG	산간벽지/NNG
 15 | 미스 민을	미스/NNG + _ + 민/NNG + 을/JKO	미스/NNG + _ + 민/NNP + 을/JKO
 16 | 무임승차	무임/NNG + 승차/NNG	무임승차/NNG
 17 | 습니다그려.	습니다/EC + 그/JX + 려/IC + ./SF	습니다/EC + 그려/JX + ./SF
 18 |  진두지휘	_ + 진두/NNG + 지휘/NNG	_ + 진두지휘/NNG
 19 | 한편 1997	한/MAG + 편/NNG + _ + 1997/SN	한편/NNG + _ + 1997/SN
 20 | 한편 1997	| + 한/MAG + 편/NNG + _ + 1997/SN	| + 한편/NNG + _ + 1997/SN
 21 | 지식인이란 	지식인/NNG + 이/VCP + 란/JX + _	지식인/NNG + 이란/JX + _
 22 | 시험공부	시험/NNG + 공부/NNG	시험공부/NNG
 23 | 중증급성호흡기	중증/NNG + 급성호흡기/NNG	중증/NNG + 급성/NNG + 호흡기/NNG
 24 | 기념행사	기념/NNG + 행사/NNG	기념행사/NNG
 25 | 그래선지 	그렇/VA + 어선지/EC + _	그렇/VA + 어서/EC + 이/VCP + ㄴ지/EC + _
 26 | 사 가지고	사/VV + 아/EC + _ + 가/VV + 지/VX + 고/EC	사/VV + 아/EC + _ + 가지/VX + 고/EC
 27 | 한국교사휴양원	한국교사휴양/NNP + 원/NNG	한국교사휴양원/NNP
 28 | 언어문화	언어/NNG + 문화/NNG	언어문화/NNG
 29 | 간 쇠고기	가/VV + ㄴ/ETM + _ + 쇠고기/NNG	갈/VV + ㄴ/ETM + _ + 쇠고기/NNG
 30 | 달래 주	달러/VV + 어/EC + _ + 주/VX	달래/VV + 어/EC + _ + 주/VX
 31 | 기 일원론	기/NNG + _ + 일원/NNG + 론/XSN	기/NNG + _ + 일원론/NNG
 32 | 돼지머리	돼지머리/NNG	돼지/NNG + 머리/NNG
 33 |  제자리걸음	_ + 제자리/NNG + 걸음/NNG	_ + 제자리걸음/NNG
 34 | 전지훈련	전지/NNG + 훈련/NNG	전지훈련/NNG
 35 | 진우 씬	진우/NNP + _ + 씬/NNG	진우/NNP + _ + 씨/NNB + ㄴ/JX
 36 | 이 바람에	이/JKS + _ + 바/NNG + 람/NNB + 에/JKB	이/JKS + _ + 바람/NNG + 에/JKB
 37 |  대대적인	_ + 대대/NNG + 적/XSN + 이/VCP + ㄴ/ETM	_ + 대대적/NNG + 이/VCP + ㄴ/ETM
 38 |  돼지머리	_ + 돼지머리/NNG	_ + 돼지/NNG + 머리/NNG
 39 |  반벌거숭이	_ + 반벌거숭이/NNG	_ + 반/NNG + 벌거숭이/NNG
 40 | 이나 있	이나/JX + _ + 있/VX	이나/JX + _ + 있/VV
 41 |  도시가스	_ + 도시/NNG + 가스/NNG	_ + 도시가스/NNG
 42 | 그 반벌거숭이	그/MM + _ + 반벌거숭이/NNG	그/MM + _ + 반/NNG + 벌거숭이/NNG
 43 | 제자리걸음	제자리/NNG + 걸음/NNG	제자리걸음/NNG
 44 | 만나 보	만나/VV + 아/EC + _ + 보/VV	만나/VV + 아/EC + _ + 보/VX
 45 |  세계정세	_ + 세계/NNG + 정세/NNG	_ + 세계정세/NNG
 46 | 가상공간	가상/NNG + 공간/NNG	가상공간/NNG
 47 | 만병통치약	만병/NNG + 통치약/NNG	만병통치약/NNG
 48 |  조선말기	_ + 조/NNP + 선말기/NNG	_ + 조선/NNP + 말기/NNG
 49 | 그래선지	그렇/VA + 어선지/EC	그렇/VA + 어서/EC + 이/VCP + ㄴ지/EC
 50 | 해임건의안	해임/NNG + 건의/NNG + 안/NNG	해임/NNG + 건의안/NNG
 51 | 생맥주집	생/XPN + 맥주집/NNG	생/XPN + 맥주/NNG + 집/NNG
 52 |  다문화주의	_ + 다문화주의/NNG	_ + 다문화/NNG + 주의/NNG
 53 | 가족계획	가족/NNG + 계획/NNG	가족계획/NNG
 54 | 세대교체	세대/NNG + 교체/NNG	세대교체/NNG
 55 | 물항아리	물항아리/NNG	물/NNG + 항아리/NNG
 56 |  비평용어	_ + 비평용어/NNG	_ + 비평/NNG + 용어/NNG
 57 | 반벌거숭이	반벌거숭이/NNG	반/NNG + 벌거숭이/NNG
 58 | 수사본부	수사/NNG + 본부/NNG	수사본부/NNG
 59 | 전기난로	전기난로/NNG	전기/NNG + 난로/NNG
 60 | 원상회복	원상/NNG + 회복/NNG	원상회복/NNG
 61 |  베이지색 	_ + 베이지색/NNG + _	_ + 베이지/NNG + 색/NNG + _
 62 | 이 바람	이/JKS + _ + 바/NNG + 람/NNB	이/JKS + _ + 바람/NNG
 63 |  시기상조	_ + 시기/NNG + 상조/NNG	_ + 시기상조/NNG
 64 | 하리만치 	하/XSA + 리만/EC + 치/MAG + _	하/XSA + 리만치/EC + _
 65 |  원상회복	_ + 원상/NNG + 회복/NNG	_ + 원상회복/NNG
 66 | 수공예품	수공/NNG + 예품/NNG	수공예품/NNG
 67 | 베이지색	베이지색/NNG	베이지/NNG + 색/NNG
 68 | 신용보증기금	신/NNG + 용보증기금/NNP	신용보증기금/NNP
 69 | 도시가스	도시/NNG + 가스/NNG	도시가스/NNG
 70 |  가상공간	_ + 가상/NNG + 공간/NNG	_ + 가상공간/NNG
 71 | 학력고사	학력/NNG + 고사/NNG	학력고사/NNG
 72 | 사 가지	사/VV + 아/EC + _ + 가/VV + 지/VX	사/VV + 아/EC + _ + 가지/VX
 73 | 시기상조	시기/NNG + 상조/NNG	시기상조/NNG
 74 | 슬기슬기	슬기슬기/NNG	슬기/NNG + 슬기/NNG
 75 |  전기난로	_ + 전기난로/NNG	_ + 전기/NNG + 난로/NNG
 76 |  동물학자	_ + 동물/NNG + 학자/NNG	_ + 동물학자/NNG
 77 | 오리고기	오리고기/NNG	오리/NNG + 고기/NNG
 78 |  슬기슬기	_ + 슬기슬기/NNG	_ + 슬기/NNG + 슬기/NNG
 79 |  가족계획	_ + 가족/NNG + 계획/NNG	_ + 가족계획/NNG
 80 |  위기관리	_ + 위기/NNG + 관리/NNG	_ + 위기관리/NNG
 81 |  전지훈련	_ + 전지/NNG + 훈련/NNG	_ + 전지훈련/NNG
 82 | 습니다그려	습니다/EC + 그/JX + 려/IC	습니다/EC + 그려/JX
 83 | 비평용어	비평용어/NNG	비평/NNG + 용어/NNG
 84 | 지식인이란	지식인/NNG + 이/VCP + 란/JX	지식인/NNG + 이란/JX
 85 | 동물학자	동물/NNG + 학자/NNG	동물학자/NNG
 86 | 예술가촌	예술가촌/NNG	예술가/NNG + 촌/NNG
 87 | 베이지색 	베이지색/NNG + _	베이지/NNG + 색/NNG + _
 88 | 가 주는	가/JKS + _ + 주/VX + 는/ETM	가/JKS + _ + 주/VV + 는/ETM
 89 |  담임교사	_ + 담임/NNG + 교사/NNG	_ + 담임교사/NNG
 90 | 네덜란드인	네/NNP + 덜란드인/NNG	네덜란드인/NNG
 91 | 선불카드	선불/NNG + 카드/NNG	선불카드/NNG
 92 | 다문화주의	다문화주의/NNG	다문화/NNG + 주의/NNG
 93 | 어인 일	어/NNG + 이/VV + ㄴ/ETM + _ + 일/NNG	어인/MM + _ + 일/NNG
 94 | 조선말기	조/NNP + 선말기/NNG	조선/NNP + 말기/NNG
 95 | 진두지휘	진두/NNG + 지휘/NNG	진두지휘/NNG
 96 |  베이지색	_ + 베이지색/NNG	_ + 베이지/NNG + 색/NNG
 97 | 개인연금	개인/NNG + 연금/NNG	개인연금/NNG
 98 | 위기관리	위기/NNG + 관리/NNG	위기관리/NNG
 99 | , 대파	,/SP + _ + 대파/NNG	,/SP + _ + 대/XPN + 파/NNG
100 | 연탄가스	연탄/NNG + 가스/NNG	연탄가스/NNG
101 | 50퍼센트	50/SN + 퍼센/NNG + 트/NNB	50/SN + 퍼센트/NNG
102 | 담임교사	담임/NNG + 교사/NNG	담임교사/NNG
103 |  개인연금	_ + 개인/NNG + 연금/NNG	_ + 개인연금/NNG
104 | 전문학교	전문/NNG + 학교/NNG	전문학교/NNG
105 |  기념행사	_ + 기념/NNG + 행사/NNG	_ + 기념행사/NNG
106 | 실수요자	실수/NNG + 요자/NNG	실수요자/NNG
107 | 세계정세	세계/NNG + 정세/NNG	세계정세/NNG
108 | 아씨마님	아씨마님/NNG	아씨/NNG + 마님/NNG
109 | 미스 민	미스/NNG + _ + 민/NNG	미스/NNG + _ + 민/NNP
110 | 통신업체	통신/NNG + 업체/NNG	통신업체/NNG
111 | 소강상태	소강/NNG + 상태/NNG	소강상태/NNG
112 | 


--------------------------------------------------------------------------------
/rsc/src/large.errpatch.manual:
--------------------------------------------------------------------------------
1 | # 아래 엔트리는 단위테스트에 사용되는 것으로 삭제하지 마시기 바랍니다.
2 | 지저스크라이스트	지/NNG + 저스크라이스/NNP + 트/NNG	지저스/NNP + 크라이스트/NNP
3 | 지저스 크라이스트	지저스/NNP + _ + 크라이스/NNP + 트/NNG	지저스/NNP + _ + 크라이스트/NNP
4 | 고타마싯다르타	| + 고타마싯다르타/NNP	| + 고타마/NNP + 싯다르타/NNP
5 | 무함마드압둘라	무함마드압둘라/NNP + |	무함마드/NNP + 압둘라/NNP + |
6 | 


--------------------------------------------------------------------------------
/rsc/src/large.model.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/rsc/src/large.model.pickle


--------------------------------------------------------------------------------
/rsc/src/preanal.manual:
--------------------------------------------------------------------------------
1 | # 아래 두 엔트리는 단위테스트에 사용되는 것으로 삭제하지 마시기 바랍니다.
2 | 이더리움	이더리움/NNG
3 | 가즈아*	가/VV + 즈아/EC
4 | 


--------------------------------------------------------------------------------
/rsc/src/vocab.out.more:
--------------------------------------------------------------------------------
1 | I-SS:I-MAG:0
2 | I-SS:I-VCP:0
3 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Config.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/Config.hpp"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <fstream>
 14 | 
 15 | #include "fmt/format.h"
 16 | #include "nlohmann/json.hpp"
 17 | 
 18 | #include "khaiii/KhaiiiApi.hpp"
 19 | 
 20 | 
 21 | namespace khaiii {
 22 | 
 23 | 
 24 | using std::exception;
 25 | using std::ifstream;
 26 | using std::make_shared;
 27 | using std::shared_ptr;
 28 | using std::string;
 29 | 
 30 | 
 31 | /////////////
 32 | // methods //
 33 | /////////////
 34 | // Loads the configuration from the JSON file at `path`.
 35 | // Throws Except when the file cannot be opened or when reading its content fails.
 36 | void Config::read_from_file(string path) {
 37 |     // Fail early with a precise message; otherwise a missing file shows up as a parse error.
 38 |     ifstream ifs(path);
 39 |     if (!ifs.is_open()) throw Except(fmt::format("fail to open config: {}", path));
 40 |     try {
 41 |         nlohmann::json jsn;
 42 |         ifs >> jsn;
 43 |         set_members(jsn);
 44 |     } catch (const exception& exc) {
 45 |         throw Except(fmt::format("fail to parse config: {}", exc.what()));
 46 |     }
 47 | }
 44 | 
 45 | 
 46 | void Config::override_from_str(const char* opt_str) {
 47 |     // A missing or empty option string leaves the configuration untouched.
 48 |     const bool has_option = (opt_str != nullptr) && (*opt_str != '\0');
 49 |     if (!has_option) return;
 50 |     try {
 51 |         override_members(nlohmann::json::parse(opt_str));
 52 |     } catch (const exception& err) {
 53 |         throw Except(fmt::format("fail to parse option: {}\n{}", err.what(), opt_str));
 54 |     }
 55 | }
 56 | 
 57 | 
 58 | Config* Config::copy_and_override(const char* opt_str) {
 59 |     // No option given: the receiver itself is the effective configuration.
 60 |     if (opt_str == nullptr || *opt_str == '\0') return this;
 61 | 
 62 |     // Reuse the object built earlier for the very same option string.
 63 |     const auto cached = _cfg_cache.find(opt_str);
 64 |     if (cached != _cfg_cache.end()) return cached->second.get();
 65 | 
 66 |     shared_ptr<Config> derived = copy();
 67 |     try {
 68 |         derived->override_members(nlohmann::json::parse(opt_str));
 69 |         _cfg_cache[opt_str] = derived;    // cached only after the override succeeded
 70 |     } catch (const exception& err) {
 71 |         throw Except(fmt::format("fail to parse option: {}\n{}", err.what(), opt_str));
 72 |     }
 73 |     return derived.get();
 74 | }
 75 | 
 76 | 
 77 | void Config::set_members(const nlohmann::json& jsn) {
 78 |     // Model dimensions: a key missing from jsn keeps the current value, and each
 79 |     // dimension has to be strictly positive once it has been read.
 80 |     class_num = jsn.value("class_num", class_num);
 81 |     if (class_num < 1) throw Except(fmt::format("invalid 'class_num' value: {}", class_num));
 82 |     embed_dim = jsn.value("embed_dim", embed_dim);
 83 |     if (embed_dim < 1) throw Except(fmt::format("invalid 'embed_dim' value: {}", embed_dim));
 84 |     hidden_dim = jsn.value("hidden_dim", hidden_dim);
 85 |     if (hidden_dim < 1) throw Except(fmt::format("invalid 'hidden_dim' value: {}", hidden_dim));
 86 |     vocab_size = jsn.value("vocab_size", vocab_size);
 87 |     if (vocab_size < 1) throw Except(fmt::format("invalid 'vocab_size' value: {}", vocab_size));
 88 |     window = jsn.value("window", window);
 89 |     if (window < 1) throw Except(fmt::format("invalid 'window' value: {}", window));
 90 | 
 91 |     // The remaining switches are the ones override_members() handles, so the
 92 |     // same routine reads them here as well.
 93 |     override_members(jsn);
 94 | }
 95 | 
 96 | void Config::override_members(const nlohmann::json& jsn) {    // the switches an option string may change
 97 |     preanal = jsn.value("preanal", preanal);    // a key missing from jsn keeps the current value
 98 |     errpatch = jsn.value("errpatch", errpatch);
 99 |     restore = jsn.value("restore", restore);
100 | }
101 | 
102 | shared_ptr<Config> Config::copy() {
103 |     shared_ptr<Config> clone = make_shared<Config>();    // its _cfg_cache starts out empty
104 |     clone->class_num = this->class_num;
105 |     clone->embed_dim = this->embed_dim;
106 |     clone->hidden_dim = this->hidden_dim;
107 |     clone->vocab_size = this->vocab_size;
108 |     clone->window = this->window;
109 |     clone->preanal = this->preanal;
110 |     clone->errpatch = this->errpatch;
111 |     clone->restore = this->restore;
112 |     return clone;
113 | }
114 | 
115 | 
116 | }    // namespace khaiii
117 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Config.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_CONFIG_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_CONFIG_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | #include <unordered_map>
17 | 
18 | #include "nlohmann/json.hpp"
19 | 
20 | 
21 | namespace khaiii {
22 | 
23 | 
24 | /**
25 |  * JSON format configuration file
26 |  */
27 | class Config {
28 |  public:
29 |     int class_num = -1;    ///< number of classes
30 |     int embed_dim = -1;    ///< embedding dimension
31 |     int hidden_dim = -1;    ///< hidden dimension
32 |     int vocab_size = -1;    ///< vocabulary size
33 |     int window = -1;    ///< context window size
34 | 
35 |     bool preanal = true;    ///< whether apply preanal or not
36 |     bool errpatch = true;    ///< whether apply error patch or not
37 |     bool restore = true;    ///< whether restore morphemes or not
38 | 
39 |     Config() = default;
40 |     Config(const Config&) = delete;    ///< delete copy constructor
41 |     Config& operator=(const Config&) = delete;    ///< delete assignment operator
42 | 
43 |     /**
44 |      * Read the configuration from a file.
45 |      * @param  path  file path
46 |      */
47 |     void read_from_file(std::string path);
48 | 
49 |     /**
50 |      * Override the configuration of this object with a JSON option string.
51 |      * @param  opt_str  option string (JSON format)
52 |      */
53 |     void override_from_str(const char* opt_str);
54 | 
55 |     /**
56 |      * Copy this object and override the settings of the copy.
57 |      * @param  opt_str option string (JSON format)
58 |      * @return  this object if opt_str is null or empty, otherwise the (cached) overridden copy
59 |      */
60 |     Config* copy_and_override(const char* opt_str);
61 | 
62 |     /**
63 |      * Set the members from a parsed JSON object.
64 |      * @param  jsn  JSON object
65 |      */
66 |     void set_members(const nlohmann::json& jsn);
67 | 
68 |     /**
69 |      * Set only the overridable members from a parsed JSON object.
70 |      * @param  jsn  JSON object
71 |      */
72 |     void override_members(const nlohmann::json& jsn);
73 | 
74 |     /**
75 |      * Create an object that is a copy of this one.
76 |      * @return  the copied object
77 |      */
78 |     std::shared_ptr<Config> copy();
79 | 
80 |  private:
81 |     /**
82 |      * cache of overridden objects, keyed by option string
83 |      */
84 |     std::unordered_map<std::string, std::shared_ptr<Config>> _cfg_cache;
85 | };
86 | 
87 | 
88 | }    // namespace khaiii
89 | 
90 | 
91 | #endif  // SRC_MAIN_CPP_KHAIII_CONFIG_HPP_
92 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Embed.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/Embed.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <cstdlib>
14 | #include <string>
15 | 
16 | #include "khaiii/Config.hpp"
17 | #ifndef NDEBUG
18 | #include "khaiii/util.hpp"
19 | #endif
20 | 
21 | 
22 | namespace khaiii {
23 | 
24 | 
25 | using std::make_shared;
26 | using std::shared_ptr;
27 | using std::string;
28 | 
29 | 
30 | ////////////////////
31 | // static members //
32 | ////////////////////
33 | shared_ptr<spdlog::logger> Embed::_log = spdlog::stderr_color_mt("Embed");
34 | 
35 | 
36 | /////////////
37 | // methods //
38 | /////////////
39 | // Maps "<dir>/embed.bin" and creates one embedding view per vocabulary entry.
40 | // Layout read here: cfg.vocab_size wchar_t keys followed by cfg.vocab_size rows of
41 | // cfg.embed_dim floats. Throws Except if the file is too small for that layout.
42 | void Embed::open(const Config& cfg, string dir) {
43 |     _embed_mmf.open(fmt::format("{}/embed.bin", dir));
44 |     // Reject a truncated file up front instead of reading past the end of the mapping.
45 |     const size_t key_bytes = sizeof(wchar_t) * cfg.vocab_size;
46 |     const size_t val_bytes = sizeof(float) * cfg.vocab_size * cfg.embed_dim;
47 |     if (static_cast<size_t>(_embed_mmf.size()) < key_bytes + val_bytes) {
48 |         _embed_mmf.close();
49 |         throw Except(fmt::format("invalid size of embedding file: {}/embed.bin", dir));
50 |     }
51 |     _keys = reinterpret_cast<const wchar_t*>(_embed_mmf.data());
52 |     const float* val_start = reinterpret_cast<const float*>(_keys + cfg.vocab_size);
53 |     _vals.clear();    // a repeated open() must not keep views into an older mapping
54 |     _vals.reserve(cfg.vocab_size);
55 |     for (int i = 0; i < cfg.vocab_size; ++i) {
56 |         const float* embed_start = val_start + i * cfg.embed_dim;
57 |         _vals.emplace_back(embedding_t(const_cast<float*>(embed_start), cfg.embed_dim));
58 |         SPDLOG_TRACE(_log, "[{}] {}", i, _vals[i]);
59 |     }
60 | }
49 | 
50 | 
51 | // Releases the mapping together with everything that points into it.
52 | void Embed::close() {
53 |     _vals.clear();    // the embedding views would dangle once the file is unmapped
54 |     _keys = nullptr;
55 |     _embed_mmf.close();
56 | }
54 | 
55 | 
56 | const embedding_t& Embed::operator[](wchar_t chr) const {
57 |     // Binary search in the key table; a miss falls back to the unknown character (index 1).
58 |     const void* hit = bsearch(&chr, _keys, _vals.size(), sizeof(wchar_t), Embed::_key_cmp);
59 |     const wchar_t* match = static_cast<const wchar_t*>(hit);
60 |     const int idx = (match == nullptr) ? 1 : static_cast<int>(match - _keys);
61 | #ifndef NDEBUG
62 |     wchar_t wstr[2] = {chr, 0};
63 |     SPDLOG_TRACE(_log, "'{}'({}) {}", wstr_to_utf8(wstr), idx, _vals.at(idx));
64 | #endif
65 |     return _vals.at(idx);
66 | }
67 | 
68 | 
69 | const embedding_t& Embed::left_word_bound() const {
70 |     return _vals.at(2);    // the left word bound is the entry at index 2
71 | }
72 | 
73 | 
74 | const embedding_t& Embed::right_word_bound() const {
75 |     return _vals.at(3);    // the right word bound is the entry at index 3
76 | }
77 | 
78 | 
79 | const embedding_t& Embed::left_padding() const {
80 |     return _vals.at(0);    // padding index is 0 which is zero vector
81 | }
82 | 
83 | 
84 | const embedding_t& Embed::right_padding() const {
85 |     return _vals.at(0);    // padding index is 0 which is zero vector
86 | }
87 | 
88 | 
89 | int Embed::_key_cmp(const void* left, const void* right) {
90 |     // The sign of the difference tells bsearch() how the two characters are ordered.
91 |     const wchar_t lhs = *static_cast<const wchar_t*>(left);
92 |     return lhs - *static_cast<const wchar_t*>(right);
93 | }
94 | 
95 | 
96 | }    // namespace khaiii
97 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Embed.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_EMBED_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_EMBED_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | #include <vector>
17 | 
18 | #include "Eigen/Dense"
19 | #include "spdlog/spdlog.h"
20 | 
21 | #include "khaiii/MemMapFile.hpp"
22 | #include "khaiii/nn/tensor.hpp"
23 | 
24 | 
25 | namespace khaiii {
26 | 
27 | 
28 | using embedding_t = nn::vector_map_t;
29 | class Config;
30 | 
31 | 
32 | class Embed {    // character-to-embedding lookup backed by a memory-mapped file
33 |  public:
34 |     /**
35 |      * open resource with memory data
36 |      * @param  cfg  config
37 |      * @param  dir  base directory
38 |      */
39 |     void open(const Config& cfg, std::string dir);
40 | 
41 |     void close();    ///< close the resource
42 | 
43 |     /**
44 |      * get embedding vector with character
45 |      * @param  chr  character
46 |      * @return  embedding vector
47 |      */
48 |     const embedding_t& operator[](wchar_t chr) const;
49 | 
50 |     const embedding_t& left_word_bound() const;    ///< left word bound
51 |     const embedding_t& right_word_bound() const;    ///< right word bound
52 |     const embedding_t& left_padding() const;    ///< left padding
53 |     const embedding_t& right_padding() const;    ///< right padding
54 | 
55 |  private:
56 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
57 | 
58 |     const wchar_t* _keys = nullptr;    ///< keys (characters)
59 |     std::vector<embedding_t> _vals;    ///< values (embedding vectors)
60 | 
61 |     static int _key_cmp(const void* left, const void* right);    ///< key comparator for bsearch
62 | 
63 |     MemMapFile<char> _embed_mmf;    ///< model embedding memory mapping
64 | };
65 | 
66 | 
67 | }    // namespace khaiii
68 | 
69 | 
70 | #endif    // SRC_MAIN_CPP_KHAIII_EMBED_HPP_
71 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/ErrPatch.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/ErrPatch.hpp"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <exception>
 14 | #include <memory>
 15 | #include <vector>
 16 | 
 17 | #include "khaiii/KhaiiiApi.hpp"
 18 | #include "khaiii/Sentence.hpp"
 19 | #include "khaiii/Word.hpp"
 20 | 
 21 | 
 22 | namespace khaiii {
 23 | 
 24 | 
 25 | using std::dynamic_pointer_cast;
 26 | using std::exception;
 27 | using std::shared_ptr;
 28 | using std::string;
 29 | using std::vector;
 30 | 
 31 | 
 32 | ////////////////////
 33 | // static members //
 34 | ////////////////////
 35 | const wchar_t ErrPatch::WORD_DELIM_NUM = -1;    // virtual syllable that marks a word boundary
 36 | const wchar_t ErrPatch::SENT_DELIM_NUM = -2;    // virtual syllable that marks a sentence boundary
 37 | 
 38 | shared_ptr<spdlog::logger> ErrPatch::_log = spdlog::stderr_color_mt("ErrPatch");
 39 | 
 40 | 
 41 | ////////////////////
 42 | // ctors and dtor //
 43 | ////////////////////
 44 | ErrPatch::~ErrPatch() {
 45 |     close();    // closes the trie and the value mapping
 46 | }
 47 | 
 48 | 
 49 | /////////////
 50 | // methods //
 51 | /////////////
 52 | void ErrPatch::open(string dir) {
 53 |     _trie.open(dir + "/errpatch.tri");
 54 |     _val_mmf.open(dir + "/errpatch.val");
 55 |     MemMapFile<uint8_t> len_mmf;
 56 |     len_mmf.open(dir + "/errpatch.len");    // length of each value
 57 |     _vals.reserve(len_mmf.size());
 58 |     const uint8_t* lens = len_mmf.data();
 59 |     const int16_t* val_ptr = _val_mmf.data();
 60 |     for (int i = 0; i < len_mmf.size(); ++i) {
 61 |         // use the lengths to locate each value (_vals), a variable-length int16_t array
 62 |         _vals.emplace_back(val_ptr);
 63 |         val_ptr += lens[i] + 1;    // length + the trailing 0
 64 |     }
 65 |     assert(_vals.size() == len_mmf.size());
 66 |     assert(val_ptr - _val_mmf.data() == _val_mmf.size());
 67 |     _log->info("errpatch dictionary opened");
 68 | }
 69 | 
 70 | 
 71 | void ErrPatch::close() {
 72 |     // The entries of _vals point into the _val_mmf mapping, so they must not outlive it.
 73 |     _vals.clear();
 74 |     _trie.close();
 75 |     _val_mmf.close();
 76 |     _log->debug("errpatch dictionary closed");
 77 | }
 76 | 
 77 | 
 78 | void ErrPatch::apply(shared_ptr<Sentence> sent) const {
 79 |     vector<uint16_t*> outputs;    // where the corrected tag values of a matched patch are written
 80 |     vector<wchar_t> chars = _get_char_tag_mixes(sent, &outputs);
 81 |     for (int i = 0; i < chars.size(); ++i) {
 82 |         auto found = _trie.search_longest_prefix_match(&chars[i]);
 83 |         if (found == boost::none) continue;    // no patch starts at this position
 84 |         auto val = _vals[found->val];
 85 |         for (int j = 0; j < found->len; ++j) {
 86 |             if (outputs[i + j] == nullptr) {    // delimiter slot: there is no tag to overwrite
 87 |                 assert(val[j] == WORD_DELIM_NUM || val[j] == SENT_DELIM_NUM);
 88 |                 continue;
 89 |             }
 90 |             *outputs[i + j] = val[j];
 91 |         }
 92 |         i += found->len - 1;    // resume right after the matched span (the loop adds the last 1)
 93 |     }
 94 | }
 95 | 
 96 | 
 97 | vector<wchar_t> ErrPatch::_get_char_tag_mixes(shared_ptr<Sentence> sent,
 98 |                                               vector<uint16_t*>* outputs) {
 99 |     vector<wchar_t> chars;    // chars and *outputs are filled in lockstep, one slot per element
100 |     chars.reserve(2 + 2 * sent->words.size());
101 |     outputs->reserve(2 + 2 * sent->words.size());
102 |     chars.emplace_back(SENT_DELIM_NUM);    // sentence boundary
103 |     outputs->emplace_back(nullptr);
104 |     for (auto& word : sent->words) {
105 |         if (chars.size() > 1) {    // true for every word but the first one
106 |             chars.emplace_back(WORD_DELIM_NUM);    // word boundary
107 |             outputs->emplace_back(nullptr);
108 |         }
109 |         for (int i = 0; i < word->wlength; ++i) {
110 |             wchar_t char_tag_mix = (word->wbegin[i] << 12) | word->char_tags[i];    // char above bit 12
111 |             _log->debug("{:5x}|{:3x} -> {:08x}", static_cast<int>(word->wbegin[i]),
112 |                         word->char_tags[i], static_cast<int>(char_tag_mix));
113 |             chars.emplace_back(char_tag_mix);
114 |             outputs->emplace_back(&word->char_tags[i]);    // the tag a patch may overwrite
115 |         }
116 |     }
117 |     chars.emplace_back(SENT_DELIM_NUM);    // sentence boundary
118 |     outputs->emplace_back(nullptr);
119 |     chars.emplace_back(0);    // final string termination
120 |     outputs->emplace_back(nullptr);
121 |     return chars;
122 | }
123 | 
124 | 
125 | }    // namespace khaiii
126 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/ErrPatch.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_ERRPATCH_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_ERRPATCH_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | #include <vector>
17 | 
18 | #include "spdlog/spdlog.h"
19 | 
20 | #include "khaiii/MemMapFile.hpp"
21 | #include "khaiii/Trie.hpp"
22 | 
23 | 
24 | namespace khaiii {
25 | 
26 | 
27 | class Sentence;
28 | 
29 | 
30 | class ErrPatch {
31 |  public:
32 |     static const wchar_t WORD_DELIM_NUM;    ///< virtual syllable that marks a word boundary
33 |     static const wchar_t SENT_DELIM_NUM;    ///< virtual syllable that marks a sentence boundary
34 | 
35 |     virtual ~ErrPatch();    ///< dtor
36 | 
37 |     /**
38 |      * Open the resource.
39 |      * @param  dir  resource directory
40 |      */
41 |     void open(std::string dir);
42 | 
43 |     void close();    ///< close the resource
44 | 
45 |     /**
46 |      * Apply the error patches.
47 |      * @param  sent  sentence
48 |      */
49 |     void apply(std::shared_ptr<Sentence> sent) const;
50 | 
51 |  private:
52 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
53 | 
54 |     Trie _trie;
55 |     MemMapFile<int16_t> _val_mmf;    ///< value memory mapping
56 |     std::vector<const int16_t*> _vals;    ///< actual values
57 | 
58 |     /**
59 |      * Turn the sentence into a sequence of syllable/tag bit combinations suited as Trie input,
60 |      * and record the output positions.
61 |      * @param  sent  sentence
62 |      * @param  outputs  output positions
63 |      * @return  sequence of syllable/tag bit combinations
64 |      */
65 |     static std::vector<wchar_t> _get_char_tag_mixes(std::shared_ptr<Sentence> sent,
66 |                                                     std::vector<uint16_t*>* outputs);
67 | };
67 | 
68 | 
69 | }    // namespace khaiii
70 | 
71 | 
72 | #endif    // SRC_MAIN_CPP_KHAIII_ERRPATCH_HPP_
73 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/KhaiiiImpl.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef SRC_MAIN_CPP_KHAIII_KHAIIIIMPL_HPP_
  8 | #define SRC_MAIN_CPP_KHAIII_KHAIIIIMPL_HPP_
  9 | 
 10 | 
 11 | //////////////
 12 | // includes //
 13 | //////////////
 14 | #include <list>
 15 | #include <map>
 16 | #include <memory>
 17 | #include <mutex>    // NOLINT
 18 | #include <string>
 19 | #include <vector>
 20 | 
 21 | #include "spdlog/spdlog.h"
 22 | 
 23 | #include "khaiii/Config.hpp"
 24 | #include "khaiii/KhaiiiApi.hpp"
 25 | #include "khaiii/Resource.hpp"
 26 | 
 27 | 
 28 | namespace khaiii {
 29 | 
 30 | 
 31 | class Sentence;
 32 | 
 33 | 
 34 | /**
 35 |  * implementation of khaiii API
 36 |  */
 37 | class KhaiiiImpl: public KhaiiiApi {
 38 |  public:
 39 |     virtual ~KhaiiiImpl();    ///< dtor
 40 | 
 41 |     void open(std::string rsc_dir = "", std::string opt_str = "") override;
 42 | 
 43 |     const khaiii_word_t* analyze(const char* input, const char* opt_str) override;
 44 | 
 45 |     /**
 46 |      * Run the analysis, stop right before the error patch is applied, and return that result.
 47 |      * @param  input  input text
 48 |      * @param  opt_str  runtime option (JSON format)
 49 |      * @param  output  output value for each character
 50 |      * @return  output length. -1 if failed
 51 |      */
 52 |     int analyze_bfr_errpatch(const char* input, const char* opt_str, int16_t* output);
 53 | 
 54 |     void free_results(const khaiii_word_t* results) override;
 55 | 
 56 |     void close() override;
 57 | 
 58 |     /**
 59 |      * get mutex for this api object
 60 |      * @return  mutex
 61 |      */
 62 |     std::recursive_mutex& get_mutex();
 63 | 
 64 |     /**
 65 |      * set error message
 66 |      * @param  message
 67 |      */
 68 |     void set_err_msg(std::string msg);
 69 | 
 70 |     /**
 71 |      * get error message
 72 |      * @return  message
 73 |      */
 74 |     const char* get_err_msg() const;
 75 | 
 76 |     /**
 77 |      * Set the log level.
 78 |      * @param  name  logger name; "all" addresses every logger
 79 |      * @param  level logger level: trace, debug, info, warn, err, critical
 80 |      */
 81 |     static void set_log_level(std::string name, std::string level);
 82 | 
 83 |     /**
 84 |      * Set several log levels at once.
 85 |      * @param  name_level_pairs  list of logger (name, level) pairs,
 86 |      *                           formatted like "all:warn,console:info,Tagger:debug"
 87 |      */
 88 |     static void set_log_levels(std::string name_level_pairs);
 89 | 
 90 | 
 91 |  private:
 92 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
 93 | 
 94 |     std::recursive_mutex _mutex;    ///< mutex to access exclusively
 95 |     bool _is_opened = false;    ///< handle is opened
 96 |     std::string _err_msg;    ///< last error message
 97 | 
 98 |     Config _cfg;    ///< config
 99 |     Resource _rsc;    ///< resource
100 | 
101 |     // Cloakroom that temporarily keeps whole sentence objects, keyed by their head word, so that
102 |     // results handed to the C API are not freed when the reference count would drop to zero.
103 |     std::map<const khaiii_word_t*, std::shared_ptr<Sentence>> _result_cloakroom;
104 | 
105 |     /**
106 |      * Deposit a result in the cloakroom.
107 |      * @param  sent  sentence
108 |      * @return  pointer to the first word
109 |      */
110 |     const khaiii_word_t* _deposit_sent(std::shared_ptr<Sentence> sent);
111 | 
112 |     /**
113 |      * Remove a deposited result.
114 |      * @param  head_word  pointer to the first word
115 |      */
116 |     void _withdraw_sent(const khaiii_word_t* head_word);
117 | 
118 |     /**
119 |      * Check the resource directory.
120 |      * @param  rsc_dir  resource directory
121 |      * @return  path of an existing directory
122 |      */
123 |      std::string _check_rsc_dir(std::string rsc_dir);
124 | };
125 | 
126 | 
127 | }    // namespace khaiii
128 | 
129 | 
130 | #endif  // SRC_MAIN_CPP_KHAIII_KHAIIIIMPL_HPP_
131 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/MemMapFile.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef SRC_MAIN_CPP_KHAIII_MEMMAPFILE_HPP_
  8 | #define SRC_MAIN_CPP_KHAIII_MEMMAPFILE_HPP_
  9 | 
 10 | 
 11 | //////////////
 12 | // includes //
 13 | //////////////
 14 | #include <fcntl.h>
 15 | #include <sys/mman.h>
 16 | #include <unistd.h>
 17 | #include <fstream>
 18 | #include <string>
 19 | 
 20 | #include "fmt/format.h"
 21 | 
 22 | #include "khaiii/KhaiiiApi.hpp"
 23 | 
 24 | 
 25 | namespace khaiii {
 26 | 
 27 | 
 28 | /**
 29 |  * read-only memory mapping of a whole file, viewed as an array of T
 30 |  */
 31 | template<typename T>
 32 | class MemMapFile {
 33 |  public:
 34 |     /** dtor */
 35 |     virtual ~MemMapFile() {
 36 |         try {
 37 |             close();
 38 |         } catch (...) {    // a destructor must not let an exception escape
 39 |         }
 40 |     }
 41 | 
 42 |     /**
 43 |      * open memory mapped file (a mapping held before is closed first)
 44 |      * @param  path  path
 45 |      * @throws  Except  if the file cannot be opened, measured or mapped; the object stays closed
 46 |      */
 47 |     void open(std::string path) {
 48 |         close();
 49 |         int fd = ::open(path.c_str(), O_RDONLY, 0660);
 50 |         if (fd == -1) throw Except(fmt::format("fail to open file: {}", path));
 51 |         std::ifstream fin(path, std::ifstream::ate | std::ifstream::binary);
 52 |         int byte_len = fin.tellg();    // NOTE: an int length limits mappable files to < 2 GiB
 53 |         if (byte_len == -1) {
 54 |             ::close(fd);    // do not leak the descriptor on the error path
 55 |             throw Except(fmt::format("fail to get size of file: {}", path));
 56 |         }
 57 |         assert(byte_len % sizeof(T) == 0);
 58 |         void* addr = ::mmap(0, byte_len, PROT_READ, MAP_SHARED, fd, 0);
 59 |         ::close(fd);    // the mapping stays valid after the descriptor is closed
 60 |         if (addr == MAP_FAILED) throw Except(fmt::format("fail to map file to memory: {}", path));
 61 |         _data = reinterpret_cast<const T*>(addr);    // set on success only, never MAP_FAILED
 62 |         _byte_len = byte_len;
 63 |         _path = path;
 64 |     }
 65 | 
 66 |     /**
 67 |      * close memory mapped file; members are reset first, then Except is thrown if munmap() fails
 68 |      */
 69 |     void close() {
 70 |         std::string mapped_path;
 71 |         mapped_path.swap(_path);    // leaves _path empty
 72 |         const T* mapped = _data;
 73 |         const int mapped_len = _byte_len;
 74 |         _data = nullptr;
 75 |         _byte_len = -1;
 76 |         if (mapped != nullptr && ::munmap(const_cast<T*>(mapped), mapped_len) == -1) {
 77 |             throw Except(fmt::format("fail to close memory mapped file: {}", mapped_path));
 78 |         }
 79 |     }
 80 | 
 81 |     /** @return  start address of data */
 82 |     const T* data() const {
 83 |         assert(_data != nullptr && _byte_len >= sizeof(T));
 84 |         return _data;
 85 |     }
 86 | 
 87 |     /** @return  number of data elements (not byte length) */
 88 |     int size() const {
 89 |         assert(_data != nullptr && _byte_len >= sizeof(T));
 90 |         return _byte_len / sizeof(T);
 91 |     }
 92 | 
 93 |  private:
 94 |     std::string _path;    ///< file path
 95 |     const T* _data = nullptr;    ///< memory data
 96 |     int _byte_len = -1;    ///< byte length
 97 | };
 98 | 
 99 | 
100 | }    // namespace khaiii
101 | 
102 | 
103 | #endif    // SRC_MAIN_CPP_KHAIII_MEMMAPFILE_HPP_
104 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Morph.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/Morph.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <cassert>
14 | #include <string>
15 | #include <vector>
16 | 
17 | #include "khaiii/util.hpp"
18 | 
19 | 
20 | namespace khaiii {
21 | 
22 | 
23 | using std::string;
24 | using std::vector;
25 | using std::wstring;
26 | using std::wstringstream;
27 | 
28 | 
29 | ///////////////
30 | // variables //
31 | ///////////////
32 | static const char* _TAG_SET[POS_TAG_SIZE] = {    // tag names; pos_str() indexes it with num-1
33 |      "EC",  "EF",  "EP", "ETM", "ETN",  "IC",  "JC", "JKB", "JKC", "JKG",
34 |     "JKO", "JKQ", "JKS", "JKV",  "JX", "MAG", "MAJ",  "MM", "NNB", "NNG",
35 |     "NNP",  "NP",  "NR",  "SE",  "SF",  "SH",  "SL",  "SN",  "SO",  "SP",
36 |      "SS",  "SW", "SWK",  "VA", "VCN", "VCP",  "VV",  "VX", "XPN",  "XR",
37 |     "XSA", "XSN", "XSV",  "ZN",  "ZV",  "ZZ",
38 | };
39 | 
40 | 
41 | ////////////////////
42 | // ctors and dtor //
43 | ////////////////////
44 | Morph::Morph(wstring wlex, pos_tag_t tag, const wchar_t* wbegin, int wlength)
45 |         : wlex(wlex), wbegin(wbegin), wlength(wlength), _lex(wstr_to_utf8(wlex)) {
46 |     lex = _lex.c_str();    // points into the _lex member, which holds the UTF-8 copy
47 |     this->tag = pos_str(tag);
48 |     begin = -1;    // begin and length stay -1 until organize() assigns them
49 |     length = -1;
50 |     next = nullptr;
51 | }
52 | 
53 | 
54 | /////////////
55 | // methods //
56 | /////////////
57 | const char* Morph::pos_str(pos_tag_t num) {
58 |     assert(0 < num && num <= POS_TAG_SIZE);    // accepted numbers are 1..POS_TAG_SIZE
59 |     return _TAG_SET[num-1];    // hence the shift by one into the zero-based table
60 | }
61 | 
62 | void Morph::organize(const wstring& wraw, const vector<int>& wbegins, const vector<int>& wends) {
63 |     const int first_idx = wbegin - wraw.c_str();    // offset of the first character in wraw
64 |     const int last_idx = first_idx + wlength - 1;    // offset of the last character
65 |     begin = wbegins[first_idx];
66 |     length = wends[last_idx] - begin;
67 | }
68 | 
69 | 
70 | 
71 | string Morph::str() {
72 |     return wstr_to_utf8(wstr());    // UTF-8 form of the text built by wstr()
73 | }
74 | 
75 | 
76 | wstring Morph::wstr() {
77 |     wstringstream wss;    // rendered as <wlex>/<tag>:<begin>,<length>
78 |     wss << wlex << L"/" << tag << L":" << begin << L"," << length;
79 |     return wss.str();
80 | }
81 | 
82 | 
83 | }    // namespace khaiii
84 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Morph.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_MORPH_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_MORPH_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <string>
15 | #include <vector>
16 | 
17 | #include "khaiii/khaiii_api.h"
18 | 
19 | 
20 | namespace khaiii {
21 | 
22 | 
23 | /** part-of-speech tags */
24 | typedef enum {
25 |      EC,  EF,  EP, ETM, ETN,  IC,  JC, JKB, JKC, JKG,
26 |     JKO, JKQ, JKS, JKV,  JX, MAG, MAJ,  MM, NNB, NNG,
27 |     NNP,  NP,  NR,  SE,  SF,  SH,  SL,  SN,  SO,  SP,
28 |      SS,  SW, SWK,  VA, VCN, VCP,  VV,  VX, XPN,  XR,
29 |     XSA, XSN, XSV,  ZN,  ZV,  ZZ,
30 |     POS_TAG_SIZE
31 | } pos_tag_t;
32 | 
33 | 
34 | /**
35 |  * morpheme data structure
36 |  */
37 | class Morph: public khaiii_morph_t {
38 |  public:
39 |     std::wstring wlex;    ///< unicode lexical
40 |     const wchar_t* wbegin = nullptr;    ///< unicode string begin address
41 |     int wlength = 0;    ///< unicode string length
42 | 
43 |     Morph(std::wstring wlex, pos_tag_t tag, const wchar_t* wbegin, int wlength);    ///< ctor
44 | 
45 |     /**
46 |      * Fill in the fields of the API result structure.
47 |      * @param  wraw  original text in Unicode
48 |      * @param  wbegins  begin byte position of each syllable
49 |      * @param  wends  end byte position of each syllable
50 |      */
51 |     void organize(const std::wstring& wraw, const std::vector<int>& wbegins,
52 |                   const std::vector<int>& wends);
53 | 
54 |     /**
55 |      * Return the string tag that corresponds to a numeric tag of type pos_tag_t.
56 |      * @param  num  numeric part-of-speech tag
57 |      * @return  part-of-speech tag string
58 |      */
59 |     static const char* pos_str(pos_tag_t num);
60 | 
61 |     /**
62 |      * Take a pointer to a named-entity tag string and store it in a variable of the API structure.
63 |      * @param  tag  named-entity tag
64 |      * @return void
65 |      */
66 |     void set_ne_str(const char* tag);
67 | 
68 |     std::string str();    ///< string representation in UTF-8
69 |     std::wstring wstr();    ///< string representation in Unicode, (mostly) for debugging
70 | 
71 |  private:
72 |     std::string _lex;    ///< cache of UTF-8 lexical
73 | };
74 | 
75 | 
76 | }    // namespace khaiii
77 | 
78 | 
79 | #endif    // SRC_MAIN_CPP_KHAIII_MORPH_HPP_
80 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Preanal.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/Preanal.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <exception>
14 | #include <memory>
15 | 
16 | #include "khaiii/KhaiiiApi.hpp"
17 | #include "khaiii/Word.hpp"
18 | 
19 | 
20 | namespace khaiii {
21 | 
22 | 
23 | using std::exception;
24 | using std::shared_ptr;
25 | using std::string;
26 | 
27 | 
28 | ////////////////////
29 | // static members //
30 | ////////////////////
31 | shared_ptr<spdlog::logger> Preanal::_log = spdlog::stderr_color_mt("Preanal");    // logger shared by all Preanal objects
32 | 
33 | 
34 | ////////////////////
35 | // ctors and dtor //
36 | ////////////////////
37 | Preanal::~Preanal() {    // closes the dictionary files on destruction
38 |     this->close();
39 | }
40 | 
41 | 
42 | /////////////
43 | // methods //
44 | /////////////
45 | void Preanal::open(string dir) {    // dir: resource directory
46 |     _trie.open(string(dir).append("/preanal.tri"));    // trie that is searched in apply()
47 |     _val_mmf.open(string(dir).append("/preanal.val"));    // tag values indexed via trie matches
48 |     _log->info("preanal dictionary opened");
49 | }
50 | 
51 | 
52 | void Preanal::close() {    // closes both files in the order they were opened
53 |     this->_trie.close();
54 |     this->_val_mmf.close();
55 |     _log->debug("preanal dictionary closed");
56 | }
57 | 
58 | 
59 | void Preanal::apply(shared_ptr<Word> word) const {    // pre-tags characters of word from the dictionary
60 |     auto matches = _trie.search_common_prefix_matches(word->wbegin, word->wlength);
61 |     int len = 0;    // number of characters covered by the chosen entry
62 |     int idx = -1;    // position of the chosen entry's tags inside _val_mmf
63 |     for (auto match = matches.rbegin(); match != matches.rend(); ++match) {    // walk from the back
64 |         bool is_exact = match->val % 2 == 0;    // even value: usable only when it spans the whole word
65 |         if (is_exact && match->len == word->wlength) {
66 |             len = match->len;
67 |             idx = match->val / 2;
68 |         } else if (!is_exact) {    // odd value: usable as a prefix of the word
69 |             len = match->len;
70 |             idx = (match->val - 1) / 2;
71 |         }
72 |         if (len > 0 && idx >= 0) break;    // same test as the guard below, so 1-character hits stop too
73 |     }
74 |     if (len <= 0 || idx < 0) return;    // no applicable entry
75 |     const uint16_t* tag_out_start = &_val_mmf.data()[idx];
76 |     for (int i = 0; i < len; ++i) {
77 |         word->char_tags[i] = tag_out_start[i];    // copy one stored tag per matched character
78 |     }
79 | }
80 | 
81 | 
82 | }    // namespace khaiii
83 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Preanal.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_PREANAL_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_PREANAL_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | 
17 | #include "spdlog/spdlog.h"
18 | 
19 | #include "khaiii/MemMapFile.hpp"
20 | #include "khaiii/Trie.hpp"
21 | 
22 | 
23 | namespace khaiii {
24 | 
25 | 
26 | class Word;
27 | 
28 | 
29 | class Preanal {
30 |  public:
31 |     virtual ~Preanal();    ///< dtor
32 | 
33 |     /**
34 |      * Opens the resources.
35 |      * @param  dir  resource directory
36 |      */
37 |     void open(std::string dir);
38 | 
39 |     void close();    ///< Closes the resources.
40 | 
41 |     /**
42 |      * Applies the pre-analyzed dictionary and tags the word syllable by syllable.
43 |      * @param  word  word (eojeol)
44 |      */
45 |     void apply(std::shared_ptr<Word> word) const;
46 | 
47 |  private:
48 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
49 | 
50 |     Trie _trie;    ///< trie of dictionary keys
51 |     MemMapFile<uint16_t> _val_mmf;    ///< value memory mapping
52 | };
53 | 
54 | 
55 | }    // namespace khaiii
56 | 
57 | 
58 | #endif    // SRC_MAIN_CPP_KHAIII_PREANAL_HPP_
59 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Resource.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/Resource.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <exception>
14 | #include <memory>
15 | 
16 | #include "khaiii/Config.hpp"
17 | #include "khaiii/KhaiiiApi.hpp"
18 | #include "khaiii/nn/tensor.hpp"
19 | 
20 | 
21 | namespace khaiii {
22 | 
23 | 
24 | using std::exception;
25 | using std::shared_ptr;
26 | using std::string;
27 | 
28 | 
29 | ////////////////////
30 | // static members //
31 | ////////////////////
32 | shared_ptr<spdlog::logger> Resource::_log = spdlog::stderr_color_mt("Resource");    // logger shared by all Resource objects
33 | 
34 | 
35 | ////////////////////
36 | // ctors and dtor //
37 | ////////////////////
38 | Resource::~Resource() {    // closes the resources on destruction
39 |     this->close();
40 | }
41 | 
42 | 
43 | /////////////
44 | // methods //
45 | /////////////
46 | void Resource::open(const Config& cfg, std::string dir) {    // loads model layers, then dictionaries
47 |     embed.open(cfg, dir);
48 |     for (int width = 2; width <= 5; ++width) {    // kernel sizes 2..5; convs[0] and convs[1] stay unopened
49 |         string filter_path = fmt::format("{}/conv.{}.fil", dir, width);
50 |         convs[width].open(filter_path, cfg.embed_dim, cfg.embed_dim, width, &nn::RELU);
51 |     }
52 |     const string hidden_path = dir + "/cnv2hdn.lin";
53 |     cnv2hdn.open(hidden_path, 4 * cfg.embed_dim, cfg.hidden_dim, true, &nn::RELU);
54 |     hdn2tag.open(fmt::format("{}/hdn2tag.lin", dir), cfg.hidden_dim, cfg.class_num, true);
55 |     _log->info("NN model loaded");
56 |     preanal.open(dir);
57 |     errpatch.open(dir);
58 |     restore.open(dir);
59 |     _log->info("PoS tagger opened");
60 | }
61 | 
62 | 
63 | void Resource::close() {    // closes every layer and dictionary that open() opens
64 |     embed.close();
65 |     for (int k = 2; k < 6; ++k) convs[k].close();    // NOTE(review): assumes nn::Conv1d has close() like nn::Linear
66 |     cnv2hdn.close();    // was opened in open() but previously never closed here
67 |     hdn2tag.close();
68 |     preanal.close();
69 |     errpatch.close();
70 |     restore.close();
71 |     _log->debug("PoS tagger closed");
72 | }
71 | 
72 | 
73 | }    // namespace khaiii
74 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Resource.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_RESOURCE_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_RESOURCE_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | 
17 | #include "spdlog/spdlog.h"
18 | 
19 | #include "khaiii/Embed.hpp"
20 | #include "khaiii/ErrPatch.hpp"
21 | #include "khaiii/Preanal.hpp"
22 | #include "khaiii/Resource.hpp"
23 | #include "khaiii/Restore.hpp"
24 | #include "khaiii/nn/Conv1d.hpp"
25 | #include "khaiii/nn/Linear.hpp"
26 | 
27 | 
28 | namespace khaiii {
29 | 
30 | 
31 | class Config;
32 | 
33 | 
34 | /**
35 |  * resources for part-of-speech tagger
36 |  */
37 | class Resource {
38 |  public:
39 |     virtual ~Resource();    ///< dtor
40 | 
41 |     Embed embed;    ///< character embedding
42 |     nn::Linear cnv2hdn;    ///< convs -> hidden layer
43 |     nn::Linear hdn2tag;    ///< hidden -> tag(output) layer
44 |     nn::Conv1d convs[6];    ///< convolution layers (0, 1 are dummy)
45 |     Preanal preanal;    ///< pre-analyzed dictionary
46 |     ErrPatch errpatch;    ///< error (mis-analysis) patch
47 |     Restore restore;    ///< base-form restoration dictionary
48 | 
49 |     void open(const Config& cfg, std::string dir);    ///< opens all members from directory dir
50 |     void close();    ///< closes the members
51 | 
52 |  private:
53 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
54 | };
55 | 
56 | 
57 | }    // namespace khaiii
58 | 
59 | 
60 | #endif    // SRC_MAIN_CPP_KHAIII_RESOURCE_HPP_
61 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Restore.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/Restore.hpp"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <exception>
 14 | #include <memory>
 15 | #include <vector>
 16 | 
 17 | #include "khaiii/KhaiiiApi.hpp"
 18 | #include "khaiii/Morph.hpp"
 19 | #include "khaiii/util.hpp"
 20 | 
 21 | 
 22 | namespace khaiii {
 23 | 
 24 | 
 25 | using std::exception;
 26 | using std::shared_ptr;
 27 | using std::string;
 28 | using std::vector;
 29 | 
 30 | 
 31 | ///////////////
 32 | // constants //
 33 | ///////////////
 34 | static const char* _B_STRS[POS_TAG_SIZE] = {    // "B-" tag strings, indexed by tag number - 1
 35 |      "B-EC",  "B-EF",  "B-EP", "B-ETM", "B-ETN",  "B-IC",  "B-JC", "B-JKB", "B-JKC", "B-JKG",
 36 |     "B-JKO", "B-JKQ", "B-JKS", "B-JKV",  "B-JX", "B-MAG", "B-MAJ",  "B-MM", "B-NNB", "B-NNG",
 37 |     "B-NNP",  "B-NP",  "B-NR",  "B-SE",  "B-SF",  "B-SH",  "B-SL",  "B-SN",  "B-SO",  "B-SP",
 38 |      "B-SS",  "B-SW", "B-SWK",  "B-VA", "B-VCN", "B-VCP",  "B-VV",  "B-VX", "B-XPN",  "B-XR",
 39 |     "B-XSA", "B-XSN", "B-XSV",  "B-ZN",  "B-ZV",  "B-ZZ",
 40 | };
 41 | 
 42 | static const char* _I_STRS[POS_TAG_SIZE] = {    // "I-" tag strings, indexed by tag number - 1
 43 |      "I-EC",  "I-EF",  "I-EP", "I-ETM", "I-ETN",  "I-IC",  "I-JC", "I-JKB", "I-JKC", "I-JKG",
 44 |     "I-JKO", "I-JKQ", "I-JKS", "I-JKV",  "I-JX", "I-MAG", "I-MAJ",  "I-MM", "I-NNB", "I-NNG",
 45 |     "I-NNP",  "I-NP",  "I-NR",  "I-SE",  "I-SF",  "I-SH",  "I-SL",  "I-SN",  "I-SO",  "I-SP",
 46 |      "I-SS",  "I-SW", "I-SWK",  "I-VA", "I-VCN", "I-VCP",  "I-VV",  "I-VX", "I-XPN",  "I-XR",
 47 |     "I-XSA", "I-XSN", "I-XSV",  "I-ZN",  "I-ZV",  "I-ZZ",
 48 | };
 49 | 
 50 | 
 51 | ////////////////////
 52 | // static members //
 53 | ////////////////////
 54 | shared_ptr<spdlog::logger> Restore::_log = spdlog::stderr_color_mt("Restore");    // logger shared by all Restore objects
 55 | 
 56 | 
 57 | ////////////////////
 58 | // ctors and dtor //
 59 | ////////////////////
 60 | Restore::~Restore() {    // closes the dictionary files on destruction
 61 |     this->close();
 62 | }
 63 | 
 64 | 
 65 | /////////////
 66 | // methods //
 67 | /////////////
 68 | std::string chr_tag_t::str() {    // renders "<character>/<B- or I- tag string>"
 69 |     assert(tag > 0 && !(tag > POS_TAG_SIZE));
 70 |     const char** labels = (bi == chr_tag_t::I) ? _I_STRS : _B_STRS;
 71 |     wchar_t one_char[2] = {chr, 0};    // NUL-terminated single-character string
 72 |     std::string text = wstr_to_utf8(one_char);
 73 |     return text + "/" + labels[tag - 1];
 74 | }
 75 | 
 76 | 
 77 | void Restore::open(string dir) {    // dir: resource directory
 78 |     _key_mmf.open(string(dir).append("/restore.key"));
 79 |     _val_mmf.open(string(dir).append("/restore.val"));
 80 |     assert(_val_mmf.size() == _key_mmf.size() * _MAX_VAL_LEN);    // _MAX_VAL_LEN value slots per key
 81 |     _one_mmf.open(string(dir).append("/restore.one"));
 82 | #ifndef NDEBUG
 83 |     for (int tag_no = 0; tag_no < _one_mmf.size(); ++tag_no) {
 84 |         SPDLOG_TRACE(_log, "{}: {}, ", tag_no, _one_mmf.data()[tag_no]);
 85 |     }
 86 | #endif
 87 |     _log->info("restore dictionary opened");
 88 | }
 89 | 
 90 | 
 91 | void Restore::close() {    // closes the three mappings in the order key, value, one
 92 |     this->_key_mmf.close();
 93 |     this->_val_mmf.close();
 94 |     this->_one_mmf.close();
 95 |     _log->debug("restore dictionary closed");
 96 | }
 97 | 
 98 | 
 99 | vector<chr_tag_t> Restore::restore(wchar_t chr, uint16_t tag_out, bool use_dic) const {
100 |     assert(tag_out > 0);
101 |     vector<chr_tag_t> restored;    // result: one element per output character
102 |     if (!is_need_restore(tag_out)) {
103 |         // no restoration needed: the character keeps the given tag
104 |         restored.emplace_back(chr_tag_t());
105 |         restored.back().chr = chr;
106 |         restored.back().set_tag(tag_out);
107 |         return restored;
108 |     }
109 | 
110 |     if (!use_dic) {
111 |         // skip the restoration dictionary and assign the first tag directly
112 |         restored.emplace_back(chr_tag_t());
113 |         restored.back().chr = chr;
114 |         restored.back().set_tag(get_one(tag_out));
115 |         return restored;
116 |     }
117 | 
118 |     int idx = find(chr, tag_out);
119 |     if (idx == -1) {
120 |         // key not found: fall back to the first tag of the tag combination
121 |         uint16_t tag_one = get_one(tag_out);
122 | #ifndef NDEBUG
123 |         wchar_t wstr[2] = {chr, 0};
124 |         _log->info("restore key not found: {}/{} => {}", wstr_to_utf8(wstr), tag_out, tag_one);
125 | #endif
126 |         restored.emplace_back(chr_tag_t());
127 |         restored.back().chr = chr;
128 |         restored.back().set_tag(tag_one);
129 |     } else {
130 |         const uint32_t* val = _val_mmf.data() + (idx * _MAX_VAL_LEN);    // first value slot of this key
131 |         for (int i = 0; i < _MAX_VAL_LEN && *val; ++val, ++i) {    // bound first, then read; 0 ends early
132 |             restored.emplace_back(chr_tag_t());
133 |             restored.back().from_val(*val);
134 |         }
135 |     }
136 |     return restored;
137 | }
138 | 
139 | 
140 | bool Restore::is_need_restore(uint16_t tag_out) {
141 |     return (2 * POS_TAG_SIZE) < tag_out;    // numbers up to 2 * POS_TAG_SIZE are accepted by set_tag as is
142 | }
143 | 
144 | 
145 | int Restore::find(wchar_t chr, uint16_t tag_out) const {    // index in the key table, or -1
146 |     assert(is_need_restore(tag_out));
147 |     // key layout: character shifted left by 12 bits, OR-ed with the output tag;
148 |     // shifting as uint32_t keeps the shift well defined when wchar_t is a signed type
149 |     uint32_t key = (static_cast<uint32_t>(chr) << 12) | tag_out;
150 |     const uint32_t* found = static_cast<const uint32_t*>(
151 |             bsearch(&key, _key_mmf.data(), _key_mmf.size(), sizeof(uint32_t), Restore::key_cmp));
152 |     if (found == nullptr) return -1;
153 |     return static_cast<int>(found - _key_mmf.data());    // element offset of the hit
154 | }
153 | 
154 | 
155 | uint8_t Restore::get_one(uint16_t tag_out) const {    // single fallback tag stored for tag_out
156 |     assert(is_need_restore(tag_out));
157 |     assert(tag_out < _one_mmf.size());    // tag_out is used directly as the index
158 |     return *(_one_mmf.data() + tag_out);
159 | }
160 | 
161 | 
162 | int Restore::key_cmp(const void* left, const void* right) {    // three-way comparison of two uint32_t keys
163 |     const uint32_t lhs = *static_cast<const uint32_t*>(left);
164 |     const uint32_t rhs = *static_cast<const uint32_t*>(right);
165 |     if (lhs == rhs) {
166 |         return 0;
167 |     } else if (lhs < rhs) {
168 |         return -1;
169 |     } else {
170 |         return 1;
171 |     }
172 | }
173 | 
174 | 
175 | }    // namespace khaiii
176 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Restore.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef SRC_MAIN_CPP_KHAIII_RESTORE_HPP_
  8 | #define SRC_MAIN_CPP_KHAIII_RESTORE_HPP_
  9 | 
 10 | 
 11 | //////////////
 12 | // includes //
 13 | //////////////
 14 | #include <memory>
 15 | #include <string>
 16 | #include <vector>
 17 | 
 18 | #include "spdlog/spdlog.h"
 19 | 
 20 | #include "khaiii/MemMapFile.hpp"
 21 | #include "khaiii/Morph.hpp"
 22 | 
 23 | 
 24 | namespace khaiii {
 25 | 
 26 | 
 27 | /**
 28 |  * a syllable and its per-syllable tag information after base-form restoration
 29 |  */
 30 | struct chr_tag_t {
 31 |     enum BI { B = 0, I = 1, };    ///< enumeration type for B-, I- notation
 32 | 
 33 |     wchar_t chr;    ///< syllable (character)
 34 |     uint8_t tag;    ///< tag number; set_tag() stores it without the I- offset
 35 |     BI bi;    ///< B-, I- notation
 36 | 
 37 |     inline void set_tag(uint16_t tag_out) {    // note: bi is only switched to I here, never reset to B
 38 |         assert(0 < tag_out && tag_out <= 2 * POS_TAG_SIZE);
 39 |         tag = tag_out;
 40 |         if (tag > POS_TAG_SIZE) {    // numbers above POS_TAG_SIZE are the I- variants
 41 |             tag -= POS_TAG_SIZE;
 42 |             bi = I;
 43 |         }
 44 |     }
 45 | 
 46 |     inline void from_val(uint32_t val) {
 47 |         chr = val >> 8;    // for a value, the syllable is obtained by shifting right by 8 bits
 48 |         if (val & 0x80) {    // bit 0x80 selects I- over B-
 49 |             bi = I;
 50 |         } else {
 51 |             bi = B;
 52 |         }
 53 |         set_tag(val & 0x7F);    // the low 7 bits are passed on as the tag number
 54 |     }
 55 | 
 56 |     std::string str();    ///< string form: syllable, "/", then the B-/I- tag string
 57 | };
 58 | 
 59 | 
 60 | class Restore {
 61 |  public:
 62 |     virtual ~Restore();    ///< dtor
 63 | 
 64 |     /**
 65 |      * Opens the resources.
 66 |      * @param  dir  resource directory
 67 |      */
 68 |     void open(std::string dir);
 69 | 
 70 |     void close();    ///< Closes the resources.
 71 | 
 72 |     /**
 73 |      * Using a syllable and its tag number, restores the base form when restoration is needed.
 74 |      * @param  chr  syllable
 75 |      * @param  tag_out  tag number
 76 |      * @param  use_dic  whether to use the restoration dictionary
 77 |      * @return   list of tags, one per restored syllable
 78 |      */
 79 |     std::vector<chr_tag_t> restore(wchar_t chr, uint16_t tag_out, bool use_dic) const;
 80 | 
 81 |     /**
 82 |      * Whether the tag is a compound tag that needs base-form restoration.
 83 |      * @param  tag_out  tag number
 84 |      * @return  whether it is a compound tag
 85 |      */
 86 |      static bool is_need_restore(uint16_t tag_out);
 87 | 
 88 |     /**
 89 |      * Looks up whether the compound tag exists in the restoration dictionary.
 90 |      * @param  chr  syllable
 91 |      * @param  tag_out  tag number
 92 |      * @return  index, or -1 when not found
 93 |      */
 94 |      int find(wchar_t chr, uint16_t tag_out) const;
 95 | 
 96 |      /**
 97 |       * For a compound tag number that is not in the restoration dictionary, gets the single leading tag.
 98 |       * @param  tag_out  tag number
 99 |       * @return  the single leading tag
100 |       */
101 |      uint8_t get_one(uint16_t tag_out) const;
102 | 
103 |  private:
104 |     static const int _MAX_VAL_LEN = 4;    ///< maximum array length of value
105 | 
106 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
107 | 
108 |     MemMapFile<uint32_t> _key_mmf;    ///< key memory mapping
109 |     MemMapFile<uint32_t> _val_mmf;    ///< value memory mapping
110 |     MemMapFile<uint8_t> _one_mmf;    ///< one memory mapping
111 | 
112 |     static int key_cmp(const void* left, const void* right);    ///< key comparator for bsearch
113 | };
114 | 
115 | 
116 | }    // namespace khaiii
117 | 
118 | 
119 | #endif    // SRC_MAIN_CPP_KHAIII_RESTORE_HPP_
120 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Sentence.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/Sentence.hpp"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <cassert>
 14 | #include <cctype>
 15 | #include <iomanip>
 16 | #include <locale>
 17 | #include <sstream>
 18 | 
 19 | #include "khaiii/Word.hpp"
 20 | #include "khaiii/util.hpp"
 21 | 
 22 | 
 23 | namespace khaiii {
 24 | 
 25 | 
 26 | using std::codecvt;
 27 | using std::codecvt_base;
 28 | using std::dec;
 29 | using std::hex;
 30 | using std::locale;
 31 | using std::make_shared;
 32 | using std::mbstate_t;
 33 | using std::setfill;
 34 | using std::setw;
 35 | using std::shared_ptr;
 36 | using std::string;
 37 | using std::use_facet;
 38 | using std::wstringstream;
 39 | 
 40 | 
 41 | ////////////////////
 42 | // static members //
 43 | ////////////////////
 44 | shared_ptr<spdlog::logger> Sentence::_log = spdlog::stderr_color_mt("Sentence");    // logger shared by all Sentence objects
 45 | 
 46 | 
 47 | ////////////////////
 48 | // ctors and dtor //
 49 | ////////////////////
 50 | Sentence::Sentence(const char* raw): _raw(raw), _morph_cnt(0) {
 51 |     this->_characterize();    // fills _wraw, _wbegins and _wends from _raw
 52 |     this->_tokenize();    // then builds words from the whitespace-separated runs of _wraw
 53 | }
 54 | 
 55 | 
 56 | /////////////
 57 | // methods //
 58 | /////////////
 59 | void Sentence::organize() {    // links the words together and organizes each one
 60 |     for (int w = 0; w < words.size(); ++w) {
 61 |         if (w != 0) words[w - 1]->next = words[w].get();    // previous word points at this one
 62 |         words[w]->organize(_wraw, _wbegins, _wends);
 63 | #ifndef NDEBUG
 64 |         _log->debug("[{}] word: {}", w, words[w]->str());
 65 |         for (int m = 0; m < words[w]->morph_vec.size(); ++m) {
 66 |             _log->debug("\t[{}] morph: {}", m, words[w]->morph_vec[m]->str());
 67 |         }
 68 | #endif
 69 |         _morph_cnt += words[w]->morph_vec.size();    // running total over all words
 70 |     }
 71 | }
 72 | 
 73 | 
 74 | int Sentence::get_lwb_delta(int wrd_idx, int chr_idx) {    // offset back to the word's first character
 75 |     assert(chr_idx >= 0 && chr_idx < words[wrd_idx]->wlength);
 76 |     return 0 - chr_idx;
 77 | }
 78 | 
 79 | 
 80 | int Sentence::get_rwb_delta(int wrd_idx, int chr_idx) {    // offset forward to the word's last character
 81 |     assert(chr_idx >= 0 && chr_idx < words[wrd_idx]->wlength);
 82 |     return (words[wrd_idx]->wlength - 1) - chr_idx;
 83 | }
 84 | 
 85 | 
 86 | void Sentence::_tokenize() {
 87 |     // Pass 1: cut _wraw into words at whitespace.
 88 |     bool prev_was_space = true;    // the start of the text counts as "after whitespace"
 89 |     for (int pos = 0; pos < _wraw.size(); ++pos) {
 90 |         const bool now_space = is_space(_wraw[pos]);
 91 |         if (!now_space && prev_was_space) {
 92 |             // first non-space character after whitespace => start of word
 93 |             words.emplace_back(make_shared<Word>(&_wraw[pos], 1));
 94 |         } else if (!now_space) {
 95 |             // still inside the current word => one more character
 96 |             words.back()->wlength += 1;
 97 |         }
 98 |         prev_was_space = now_space;
 99 |     }
100 | 
101 |     // Pass 2: let every word compute its begin/length and log it.
102 |     for (auto& token : words) {
103 |         token->set_begin_length(_wraw, _wbegins, _wends);
104 |         _log->debug("'{}'{}~{}|{},{}", token->str(), token->begin, token->length,
105 |                     (token->wbegin - &_wraw[0]), token->wlength);
106 |     }
107 | }
108 | 
109 | 
110 | void Sentence::_characterize() {
111 |     // Converts _raw (UTF-8) into _wraw one character at a time and records, for every
112 |     // character, the byte offsets where it begins (_wbegins) and ends (_wends) in _raw.
113 |     assert(_raw != nullptr);
114 |     auto en_US_utf8 = locale("en_US.UTF-8");
115 |     auto& facet = use_facet<codecvt<wchar_t, char, mbstate_t>>(en_US_utf8);
116 |     auto mbst = mbstate_t();
117 |     const char* raw_end = _raw;
118 |     while (*raw_end != '\0') ++raw_end;    // the converter must never see bytes past the terminator
119 |     const char* from_next = nullptr;
120 |     wstringstream wss;
121 |     for (const char* from_curr = _raw; from_curr < raw_end; from_curr = from_next) {
122 |         wchar_t wchar[2] = L"";
123 |         wchar_t* to_next = nullptr;
124 |         // offer at most 6 bytes, as before, but clamp the window to the end of the string
125 |         const char* from_end = (raw_end - from_curr > 6) ? from_curr + 6 : raw_end;
126 |         auto result = facet.in(mbst, from_curr, from_end, from_next, wchar, wchar + 1, to_next);
127 |         if (result == codecvt_base::error || to_next != wchar + 1) {
128 |             // invalid or truncated sequence: from_next is not guaranteed to advance here, so a
129 |             // build without asserts could loop forever; emit U+FFFD and skip one byte instead
130 |             _log->warn("invalid UTF-8 sequence at byte offset {}", (from_curr - _raw));
131 |             wchar[0] = static_cast<wchar_t>(0xFFFD);
132 |             from_next = from_curr + 1;
133 |             mbst = mbstate_t();    // drop any partial state left by the failed conversion
134 |         }
135 |         wss << wchar[0];
136 |         _wbegins.emplace_back(from_curr - _raw);
137 |         _wends.emplace_back(from_next - _raw);
138 |     }
139 |     _wraw = wss.str();
140 |     assert(_wraw.length() == _wbegins.size());
141 |     assert(_wraw.length() == _wends.size());
142 | }
133 | 
134 | 
135 | }    // namespace khaiii
136 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Sentence.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_SENTENCE_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_SENTENCE_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | #include <vector>
17 | 
18 | #include "spdlog/spdlog.h"
19 | 
20 | 
21 | namespace khaiii {
22 | 
23 | 
24 | class Word;
25 | 
26 | 
27 | /**
28 |  * sentence data structure
29 |  */
30 | class Sentence {
31 |  public:
32 |     std::vector<std::shared_ptr<Word>> words;    ///< vector of words
33 | 
34 |     /**
35 |      * ctor
36 |      * @param  raw  raw sentence (UTF-8)
37 |      */
38 |     explicit Sentence(const char* raw = "");
39 | 
40 |     void organize();    ///< Organizes the results into their final structure.
41 | 
42 |     inline int morph_cnt() const {    // total morpheme count
43 |         return _morph_cnt;
44 |     }
45 | 
46 |     inline const char* raw_str() const {    // the raw sentence pointer given to the ctor
47 |         return _raw;
48 |     }
49 | 
50 |     /**
51 |      * get delta from left word boundary to this character
52 |      * @param  wrd_idx  word index
53 |      * @param  chr_idx  character index
54 |      * @return  delta (always less or equal to 0)
55 |      */
56 |     int get_lwb_delta(int wrd_idx, int chr_idx);
57 | 
58 |     /**
59 |      * get delta from right word boundary to this character
60 |      * @param  wrd_idx  word index
61 |      * @param  chr_idx  character index
62 |      * @return  delta (always more or equal to 0)
63 |      */
64 |     int get_rwb_delta(int wrd_idx, int chr_idx);
65 | 
66 |  private:
67 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
68 | 
69 |     const char* _raw = "";    ///< raw sentence (UTF-8)
70 |     int _morph_cnt;    ///< total morpheme count
71 |     std::wstring _wraw;    ///< unicode characters
72 |     std::vector<int> _wbegins;    ///< character begin positions for each unicode characters
73 |     std::vector<int> _wends;    ///< character end positions for each unicode characters
74 | 
75 |     void _tokenize();    ///< tokenize by spaces
76 |     void _characterize();    ///< convert to unicode characters
77 | };
78 | 
79 | 
80 | }    // namespace khaiii
81 | 
82 | 
83 | #endif    // SRC_MAIN_CPP_KHAIII_SENTENCE_HPP_
84 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Tagger.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef SRC_MAIN_CPP_KHAIII_TAGGER_HPP_
  8 | #define SRC_MAIN_CPP_KHAIII_TAGGER_HPP_
  9 | 
 10 | 
 11 | //////////////
 12 | // includes //
 13 | //////////////
 14 | #include <memory>
 15 | #include <vector>
 16 | #include <utility>
 17 | 
 18 | #include "spdlog/spdlog.h"
 19 | 
 20 | #include "khaiii/Embed.hpp"
 21 | 
 22 | 
 23 | namespace khaiii {
 24 | 
 25 | 
 26 | class Config;
 27 | class Resource;
 28 | class Sentence;
 29 | 
 30 | 
 31 | class Tagger {
 32 |  public:
 33 |     /**
 34 |      * ctor
 35 |      * @param  cfg  config
 36 |      * @param  rsc  resource
 37 |      * @param  sent  Sentence object
 38 |      */
 39 |     Tagger(const Config& cfg, const Resource& rsc, std::shared_ptr<Sentence> sent);
 40 | 
 41 |     void tag();    ///< part-of-speech tag
 42 | 
 43 |  private:
 44 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
 45 | 
 46 |     const Config& _cfg;    ///< config
 47 |     const Resource& _rsc;    ///< resource
 48 |     std::shared_ptr<Sentence> _sent;    ///< Sentence object
 49 | 
 50 |    /**
 51 |     * add left/right word boundary embedding to batch
 52 |     * @param  data  data start point
 53 |     * @param  wrd_idx  word index
 54 |     * @param  chr_idx  character index
 55 |     */
 56 |     void _add_lwb_rwb(float* data, int wrd_idx, int chr_idx);
 57 | 
 58 |     /**
 59 |      * tag characters with CNN method
 60 |      * @param  data  data start point
 61 |      * @param  batch_size  batch size
 62 |      * @param  col_dim  column dimension for each batch
 63 |      */
 64 |     void _tag_cnn(float* data, int batch_size, int col_dim,
 65 |                   const std::vector<std::pair<int, int>>& index);
 66 | 
 67 |     /**
 68 |      * Revises the predicted tags before the error patch is applied.
 69 |      * When a syllable/tag combination is not in the restoration dictionary it is converted to a
 70 |      * single-syllable tag, and then tags wrongly given as I- at a B- position are corrected.
 71 |      */
 72 |     void _revise_tags();
 73 | 
 74 |    /**
 75 |     * Whether the previous and current tags differ only in B-/I- and share the same category.
 76 |     * When the previous tag is a compound tag, its last tag is the one compared.
 77 |     * Works only when the current tag is a simple tag and a B- tag.
 78 |     * @param  prev_chr  previous syllable
 79 |     * @param  prev_tag  previous tag
 80 |     * @param  curr  current tag
 81 |     * @return  whether the tag categories are the same
 82 |     */
 83 |     bool _is_same_tag_cat(wchar_t prev_chr, int prev_tag, int curr);
 84 | 
 85 |     void _restore();    ///< restore morphemes
 86 | 
 87 |     /**
 88 |      * get context embeddings
 89 |      */
 90 |     std::vector<const embedding_t*> _get_context(int wrd_idx, int chr_idx);
 91 | 
 92 |     /**
 93 |      * get left context embeddings
 94 |      */
 95 |     std::vector<const embedding_t*> _get_left_context(int wrd_idx, int chr_idx);
 96 | 
 97 |     /**
 98 |      * get right context embeddings
 99 |      */
100 |     std::vector<const embedding_t*> _get_right_context(int wrd_idx, int chr_idx);
101 | };
102 | 
103 | 
104 | }    // namespace khaiii
105 | 
106 | 
107 | #endif    // SRC_MAIN_CPP_KHAIII_TAGGER_HPP_
108 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Trie.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/Trie.hpp"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <algorithm>
 14 | #include <cstdlib>
 15 | #include <exception>
 16 | #include <list>
 17 | #include <memory>
 18 | #include <string>
 19 | 
 20 | #include "boost/lexical_cast.hpp"
 21 | 
 22 | #include "khaiii/KhaiiiApi.hpp"
 23 | #include "khaiii/util.hpp"
 24 | 
 25 | 
 26 | namespace khaiii {
 27 | 
 28 | 
 29 | using std::exception;
 30 | using std::find_if;
 31 | using std::list;
 32 | using std::shared_ptr;
 33 | using std::string;
 34 | using std::wstring;
 35 | 
 36 | using boost::optional;
 37 | 
 38 | 
 39 | ///////////////
 40 | // constants //
 41 | ///////////////
 42 | static const size_t _LIN_BIN_NUM = 32;    // element count at the boundary between linear and binary search
 43 | 
 44 | 
 45 | ////////////////////
 46 | // static members //
 47 | ////////////////////
 48 | shared_ptr<spdlog::logger> Trie::_log = spdlog::stderr_color_mt("Trie");    // logger shared by all Trie objects
 49 | 
 50 | 
 51 | ////////////////////
 52 | // ctors and dtor //
 53 | ////////////////////
 54 | Trie::~Trie() {    // closes the mapping on destruction
 55 |     this->close();
 56 | }
 57 | 
 58 | 
 59 | /////////////
 60 | // methods //
 61 | /////////////
 62 | void Trie::open(string path) {    // maps the trie file at path
 63 |     _mmf.open(path);
 64 | 
 65 | #ifndef NDEBUG
 66 |     const _node_t* root_node = _mmf.data();    // first node of the mapped array
 67 |     for (int i = 0; i < _mmf.size(); ++i) {    // was sizeof(pointer) / sizeof(_node_t), not the node count
 68 |         SPDLOG_TRACE(_log, root_node[i].str(root_node));
 69 |     }
 70 | #endif
 71 | }
 72 | 
 73 | 
 74 | void Trie::close() {    // closes the memory-mapped trie file
 75 |     this->_mmf.close();
 76 | }
 77 | 
 78 | 
 79 | /*
 80 | optional<uint32_t> Trie::find(const wstring& key) const {
 81 |     return find(key.c_str());
 82 | }
 83 | 
 84 | 
 85 | optional<uint32_t> Trie::find(const wchar_t* key) const {
 86 |     assert(key != nullptr);
 87 |     if (*key == L'\0') return boost::none;
 88 |     return _find(key, _mmf.data());
 89 | }
 90 | */
 91 | 
 92 | 
 93 | list<Trie::match_t> Trie::search_common_prefix_matches(const wstring& text, int max_len) const {
 94 |     return this->search_common_prefix_matches(text.c_str(), max_len);    // delegate to the C-string overload
 95 | }
 96 | 
 97 | 
 98 | list<Trie::match_t> Trie::search_common_prefix_matches(const wchar_t* text, int max_len) const {
 99 |     assert(text != nullptr);
100 |     list<match_t> matches;    // filled in by _search
101 |     _search(text, _mmf.data(), &matches, 0, max_len);    // start at the first node with length 0
102 |     return matches;
103 | }
104 | 
105 | 
106 | optional<Trie::match_t> Trie::search_longest_prefix_match(const wchar_t* text, int max_len) const {
107 |     const list<match_t> candidates = search_common_prefix_matches(text, max_len);
108 |     if (!candidates.empty()) return optional<match_t>(candidates.back());    // the last match is the one returned
109 |     return boost::none;
110 | }
111 | 
112 | 
113 | /*
114 | boost::optional<uint32_t> Trie::_find(const wchar_t* key, const _node_t* node) const {
115 |     SPDLOG_TRACE(_log, "key: [{}], {}", key, node->str(_data()));
116 |     if (node->child_start <= 0 || node->child_num <= 0) return boost::none;
117 |     auto begin = node + node->child_start;
118 |     auto end = begin + node->child_num;
119 |     auto found_node = end;
120 |     if (node->child_num < _LIN_BIN_NUM) {
121 |         // linear search
122 |         auto pred = [&key] (const _node_t& _node) { return _node.chr == *key; };
123 |         found_node = find_if(begin, end, pred);
124 |     } else {
125 |         // binary search
126 |         _node_t key_node;
127 |         key_node.chr = *key;
128 |         void* found_ptr = ::bsearch(&key_node, begin, end - begin, sizeof(_node_t), _node_t::cmp);
129 |         if (found_ptr) found_node = static_cast<const _node_t*>(found_ptr);
130 |     }
131 |     if (found_node == end) {
132 |         SPDLOG_TRACE(_log, "  not found");
133 |         return boost::none;
134 |     } else {
135 |         SPDLOG_TRACE(_log, "  found: {}", found_node->str(_data()));
136 |         key += 1;
137 |         if (*key == L'\0') {
138 |             if (found_node->val > 0) {
139 |                 return optional<uint32_t>(found_node->val);
140 |             } else {
141 |                 return boost::none;
142 |             }
143 |         } else {
144 |             return _find(key, found_node);
145 |         }
146 |     }
147 | }
148 | */
149 | 
150 | 
151 | void Trie::_search(const wchar_t* text, const _node_t* node, list<Trie::match_t>* matches,
152 |                    int len, int max_len) const {
153 |     SPDLOG_TRACE(_log, "text({}): [{}], {}", len, wstr_to_utf8(text), node->str(_data()));
154 |     if (*text == '\0' || len > max_len || node->child_start <= 0 || node->child_num <= 0) return;
155 |     auto begin = node + node->child_start;
156 |     auto end = begin + node->child_num;
157 |     auto found_node = end;
158 |     if (node->child_num < _LIN_BIN_NUM) {
159 |         // linear search
160 |         auto pred = [&text] (const _node_t& _node) { return _node.chr == *text; };
161 |         found_node = find_if(begin, end, pred);
162 |     } else {
163 |         // binary search
164 |         _node_t key_node;
165 |         key_node.chr = *text;
166 |         void* found_ptr = ::bsearch(&key_node, begin, end - begin, sizeof(_node_t), _node_t::cmp);
167 |         if (found_ptr) found_node = static_cast<const _node_t*>(found_ptr);
168 |     }
169 |     if (found_node == end) {
170 |         SPDLOG_TRACE(_log, "  not matched");
171 |         return;
172 |     } else {
173 |         SPDLOG_TRACE(_log, "  matched: {}", found_node->str(_data()));
174 |         if (found_node->val > 0) matches->emplace_back(len + 1, found_node->val);
175 |         _search(text + 1, found_node, matches, len + 1, max_len);
176 |     }
177 | }
178 | 
179 | 
180 | }    // namespace khaiii
181 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Trie.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef SRC_MAIN_CPP_KHAIII_TRIE_HPP_
  8 | #define SRC_MAIN_CPP_KHAIII_TRIE_HPP_
  9 | 
 10 | 
 11 | //////////////
 12 | // includes //
 13 | //////////////
 14 | #include <functional>
 15 | #include <list>
 16 | #include <memory>
 17 | #include <sstream>
 18 | #include <string>
 19 | 
 20 | #include "boost/optional.hpp"
 21 | #include "spdlog/spdlog.h"
 22 | 
 23 | #include "khaiii/MemMapFile.hpp"
 24 | 
 25 | 
 26 | namespace khaiii {
 27 | 
 28 | 
 29 | /**
 30 |  * 유니코드 TRIE
 31 |  */
 32 | class Trie {
 33 |  public:
 34 |     struct match_t {    ///< 접두사 매칭 정보를 담기 위한 구조체. common prefix match
 35 |         int len;    ///< 매칭된 길이
 36 |         uint32_t val;    ///< 값 (양수)
 37 |         explicit match_t(int len = -1, uint32_t val = 0): len(len), val(val) {}    ///< ctor
 38 |     };
 39 | 
 40 |     virtual ~Trie();    ///< dtor
 41 | 
 42 |     /**
 43 |      * 리소스를 연다.
 44 |      * @param  path  파일 경로
 45 |      */
 46 |     void open(std::string path);
 47 | 
 48 |     void close();    ///< 리소스를 닫는다.
 49 | 
 50 |     /*
 51 |      * 키를 이용해 값을 찾는다.
 52 |      * @param  key  키 문자열
 53 |      * @return  값. 키가 없을 경우 boost::none
 54 |      */
 55 |     // boost::optional<uint32_t> find(const std::wstring& key) const;
 56 | 
 57 |     /*
 58 |      * 키를 이용해 값을 찾는다.
 59 |      * @param  key  키 문자열
 60 |      * @return  값. 키가 없을 경우 boost::none
 61 |      */
 62 |     // boost::optional<uint32_t> find(const wchar_t* key) const;
 63 | 
 64 |     /*
 65 |      * 접두사가 같은 모든 매칭 결과를 검색한다.
 66 |      * @param  text  검색할 문자열
 67 |      * @return  매칭 결과 리스트
 68 |      */
 69 |     std::list<match_t> search_common_prefix_matches(const std::wstring& text,
 70 |                                                     int max_len = INT_MAX) const;
 71 | 
 72 |     /*
 73 |      * 접두사가 같은 모든 매칭 결과를 검색한다.
 74 |      * @param  text  검색할 문자열
 75 |      * @return  매칭 결과 리스트
 76 |      */
 77 |     std::list<match_t> search_common_prefix_matches(const wchar_t* text,
 78 |                                                     int max_len = INT_MAX) const;
 79 | 
 80 |     boost::optional<match_t> search_longest_prefix_match(const wchar_t* text,
 81 |                                                          int max_len = INT_MAX) const;
 82 | 
 83 |  private:
 84 |     static std::shared_ptr<spdlog::logger> _log;    ///< logger
 85 | 
 86 |     struct _node_t {    ///< TRIE의 노드 구조체
 87 |         wchar_t chr = 0;    ///< 유니코드 문자
 88 |         uint32_t val = 0;    ///< 값 (양수). (0인 경우 값이 아님. 즉, 단말 노드가 아님)
 89 |         int32_t child_start = -1;    ///< 현재 노드로부터 자식 노드가 시작되는 위치
 90 |         int32_t child_num = -1;    ///< 자식 노드의 갯수
 91 | 
 92 |         /**
 93 |          * 두 노드를 비교하는 함수
 94 |          * @param  left  left hand side
 95 |          * @param  right  right hand side
 96 |          * @return  -1: left < right, 0: left == right, 1: left > right
 97 |          */
 98 |         static int cmp(const void* left, const void* right) {
 99 |             const _node_t* left0 = static_cast<const _node_t*>(left);
100 |             const _node_t* right0 = static_cast<const _node_t*>(right);
101 |             return left0->chr - right0->chr;
102 |         }
103 | 
104 |         inline std::string str(const _node_t* root_node) const {    ///< 디버그용 문자열 변환
105 |             std::ostringstream oss;
106 |             oss << "node[" << (this - root_node) << "]{'";
107 |             if (chr == 0) {
108 |                 oss << "ROOT";
109 |             } else {
110 |                 oss << static_cast<char>(chr);
111 |             }
112 |             oss << "', " << val << ", (" << child_start << ", " << child_num << ")}";
113 |             return oss.str();
114 |         }
115 |     };
116 | 
117 |     MemMapFile<_node_t> _mmf;    ///< memory mapped file
118 | 
119 |     /*
120 |      * 현재 노드로부터 자식 노드로 내려가며 키 값을 찾는다.
121 |      * @param  key  키 문자열
122 |      * @param  node  노드 시작 위치
123 |      * @return  값. 키가 없을 경우 boost::none
124 |      */
125 |     boost::optional<uint32_t> _find(const wchar_t* key, const _node_t* node) const;
126 | 
127 |     /*
128 |      * 현재 노드로부터 더이상 매칭되는 키가 없을 때까지 검색한다.
129 |      * @param  text  찾을 텍스트
130 |      * @param  node  노드 시작 위치
131 |      * @param  matches  매칭 결과 리스트
132 |      * @param  len  현재까지 검색을 진행한 길이(자식 노드의 깊이)
133 |      */
134 |     void _search(const wchar_t* text, const _node_t* node, std::list<match_t>* matches,
135 |                  int len, int max_len) const;
136 | };
137 | 
138 | 
139 | }    // namespace khaiii
140 | 
141 | 
142 | #endif  // SRC_MAIN_CPP_KHAIII_TRIE_HPP_
143 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Word.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/Word.hpp"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <string>
 14 | #include <vector>
 15 | 
 16 | #include "khaiii/Morph.hpp"
 17 | #include "khaiii/util.hpp"
 18 | 
 19 | 
 20 | namespace khaiii {
 21 | 
 22 | 
 23 | using std::make_shared;
 24 | using std::string;
 25 | using std::vector;
 26 | using std::wstring;
 27 | using std::wstringstream;
 28 | 
 29 | 
 30 | ////////////////////
 31 | // ctors and dtor //
 32 | ////////////////////
 33 | Word::Word(const wchar_t* wbegin, int wlength): wbegin(wbegin), wlength(wlength),
 34 |                                                 char_tags(wlength) {
 35 |     begin = -1;
 36 |     length = -1;
 37 |     morphs = nullptr;
 38 |     next = nullptr;
 39 | }
 40 | 
 41 | 
 42 | /////////////
 43 | // methods //
 44 | /////////////
 45 | void Word::set_begin_length(const wstring &wchars, const vector<int> &wbegins,
 46 |                             const vector<int> &wends) {
 47 |     int wbegin_idx = wbegin - wchars.c_str();
 48 |     begin = wbegins.at(wbegin_idx);
 49 |     length = wends.at(wbegin_idx + wlength - 1) - begin;
 50 |     char_tags.resize(wlength);
 51 | }
 52 | 
 53 | 
 54 | void Word::set_embeds(const Resource& rsc) {
 55 |     embeds.reserve(wlength);
 56 |     for (int i = 0; i < wlength; ++i) embeds.emplace_back(rsc.embed[*(wbegin + i)]);
 57 | }
 58 | 
 59 | 
 60 | void Word::add_morph(const wstringstream& wlex, uint8_t tag, int begin_idx, int end_idx) {
 61 |     const wchar_t* morph_wbegin = wbegin + begin_idx;
 62 |     int morph_wlength = end_idx - begin_idx + 1;
 63 |     morph_vec.emplace_back(make_shared<Morph>(wlex.str(), static_cast<pos_tag_t>(tag), morph_wbegin,
 64 |                            morph_wlength));
 65 | }
 66 | 
 67 | 
 68 | void Word::organize(const wstring& wraw, const vector<int>& wbegins, const vector<int>& wends) {
 69 |     for (int i = 0; i < morph_vec.size(); ++i) {
 70 |         if (i > 0) morph_vec[i-1]->next = morph_vec[i].get();
 71 |         morph_vec[i]->organize(wraw, wbegins, wends);
 72 |     }
 73 | }
 74 | 
 75 | 
 76 | void Word::make_morphs() {
 77 |     wstringstream wlex;
 78 |     uint8_t tag = 0;
 79 |     int begin_idx = -1;
 80 |     int end_idx = -1;
 81 |     for (int i = 0; i < restored.size(); ++i) {
 82 |         for (auto chr : restored[i]) {
 83 |             if (chr.bi == chr_tag_t::I && chr.tag == tag) {
 84 |                 // 이전 형태소의 연속이므로 새로 생성하지 않고 추가해준다.
 85 |                 wlex << chr.chr;
 86 |                 end_idx = i;
 87 |             } else {
 88 |                 if (wlex.str().length() > 0) add_morph(wlex, tag, begin_idx, end_idx);
 89 |                 wlex.str(L"");
 90 |                 wlex << chr.chr;
 91 |                 tag = chr.tag;
 92 |                 begin_idx = i;
 93 |                 end_idx = i;
 94 |             }
 95 |         }
 96 |     }
 97 |     if (wlex.str().length() > 0) add_morph(wlex, tag, begin_idx, end_idx);
 98 | 
 99 |     // linked-list 포인터들을 연결해준다.
100 |     morphs = morph_vec[0].get();
101 |     for (int i = 0; i < morph_vec.size() - 1; ++i) {
102 |         morph_vec[i]->next = morph_vec[i+1].get();
103 |     }
104 | }
105 | 
106 | 
107 | string Word::str() const {
108 |     return wstr_to_utf8(wstr());
109 | }
110 | 
111 | 
112 | wstring Word::wstr() const {
113 |     wstringstream wss;
114 |     wss << wstring(wbegin, wlength) << L":" << begin << L"," << length;
115 |     return wss.str();
116 | }
117 | 
118 | 
119 | }    // namespace khaiii
120 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/Word.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_WORD_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_WORD_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <memory>
15 | #include <string>
16 | #include <vector>
17 | 
18 | #include "khaiii/khaiii_api.h"
19 | #include "khaiii/Resource.hpp"
20 | #include "khaiii/Restore.hpp"
21 | 
22 | 
23 | namespace khaiii {
24 | 
25 | 
26 | class Morph;
27 | 
28 | 
/**
 * Data structure for a word (a whitespace-delimited unit of the input).
 * Extends the C API struct khaiii_word_t.
 */
class Word: public khaiii_word_t {
 public:
    const wchar_t* wbegin = nullptr;    ///< unicode string begin address
    int wlength = 0;    ///< unicode string length
    std::vector<std::shared_ptr<Morph>> morph_vec;   ///< morphemes of this word (analysis result)

    std::vector<embedding_t> embeds;    ///< embeddings for each character
    std::vector<uint16_t> char_tags;    ///< tag outs for each character
    std::vector<std::vector<chr_tag_t>> restored;    ///< restored characters and their tags

    /**
     * ctor
     * @param  wbegin  unicode string begin address
     * @param  wlength  unicode string length
     */
    explicit Word(const wchar_t* wbegin = nullptr, int wlength = 0);

    /**
     * set begin position and length in raw string for this word
     * @param  wchars  unicode characters
     * @param  wbegins  begin positions for each unicode characters
     * @param  wends  end positions for each unicode characters
     */
    void set_begin_length(const std::wstring &wchars, const std::vector<int> &wbegins,
                          const std::vector<int> &wends);

    /**
     * set embedding for decoding
     * @param  rsc  resource
     */
    void set_embeds(const Resource& rsc);

    /**
     * Adds one morpheme.
     * @param  wlex  unicode morpheme string
     * @param  tag  part-of-speech tag number (starts from 1. 0 is an error)
     * @param  begin_idx  begin index (unicode character index)
     * @param  end_idx  end index (unicode character index, inclusive)
     */
    void add_morph(const std::wstringstream& wlex, uint8_t tag, int begin_idx, int end_idx);

    /**
     * Fills in the contents of the API result structure.
     * @param  wraw  unicode raw text
     * @param  wbegins  begin byte position of each character
     * @param  wends  end byte position of each character
     */
    void organize(const std::wstring& wraw, const std::vector<int>& wbegins,
                  const std::vector<int>& wends);

    /**
     * Creates the morphemes from the restored characters.
     */
    void make_morphs();

    std::string str() const;    ///< to string (UTF-8)
    std::wstring wstr() const;    ///< to unicode string
};
90 | 
91 | 
92 | }    // namespace khaiii
93 | 
94 | 
95 | #endif    // SRC_MAIN_CPP_KHAIII_WORD_HPP_
96 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/khaiii_api.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #include "khaiii/khaiii_api.h"
  8 | 
  9 | 
 10 | //////////////
 11 | // includes //
 12 | //////////////
 13 | #include <mutex>    // NOLINT
 14 | #include <vector>
 15 | 
 16 | #include "khaiii/KhaiiiImpl.hpp"
 17 | 
 18 | 
 19 | using std::make_shared;
 20 | using std::recursive_mutex;
 21 | using std::string;
 22 | using std::shared_ptr;
 23 | using std::unique_lock;
 24 | using std::vector;
 25 | using khaiii::Except;
 26 | using khaiii::KhaiiiApi;
 27 | using khaiii::KhaiiiImpl;
 28 | 
 29 | 
 30 | ///////////////
 31 | // variables //
 32 | ///////////////
/**
 * container for handles. the first (index 0) handle is for special use: it is never returned
 * to callers, its mutex guards changes to this container, and it stores the error message for
 * failures that are not tied to a valid handle.
 */
vector<shared_ptr<KhaiiiImpl>> KHAIII_HANDLES{ make_shared<KhaiiiImpl>() };
 37 | 
 38 | 
 39 | ///////////////
 40 | // functions //
 41 | ///////////////
/**
 * Returns the library version string (the KHAIII_VERSION macro).
 */
const char* khaiii_version() {
    return KHAIII_VERSION;
}
 45 | 
 46 | 
 47 | int khaiii_open(const char* rsc_dir, const char* opt_str) {
 48 |     unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
 49 |     if (rsc_dir == nullptr) {
 50 |         KHAIII_HANDLES[0]->set_err_msg("resource directory is null");
 51 |         return -1;
 52 |     }
 53 |     auto khaiii_impl = make_shared<KhaiiiImpl>();
 54 |     try {
 55 |         khaiii_impl->open(rsc_dir, opt_str);
 56 |         KHAIII_HANDLES.emplace_back(khaiii_impl);
 57 |     } catch (const Except& exc) {
 58 |         KHAIII_HANDLES[0]->set_err_msg(exc.what());
 59 |         return -1;
 60 |     }
 61 |     return static_cast<int>(KHAIII_HANDLES.size() - 1);
 62 | }
 63 | 
 64 | 
 65 | const khaiii_word_t* khaiii_analyze(int handle, const char* input, const char* opt_str) {
 66 |     if (handle <= 0 || handle >= KHAIII_HANDLES.size()) {
 67 |         unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
 68 |         KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle));
 69 |         return nullptr;
 70 |     }
 71 |     auto khaiii_impl = KHAIII_HANDLES[handle];
 72 |     if (input == nullptr) {
 73 |         khaiii_impl->set_err_msg("input is null");
 74 |         return nullptr;
 75 |     }
 76 |     try {
 77 |         return khaiii_impl->analyze(input, opt_str);
 78 |     } catch (const Except& exc) {
 79 |         khaiii_impl->set_err_msg(exc.what());
 80 |         return nullptr;
 81 |     }
 82 | }
 83 | 
 84 | 
 85 | void khaiii_free_results(int handle, const khaiii_word_t* results) {
 86 |     unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
 87 |     if (handle <= 0 || handle >= KHAIII_HANDLES.size()) {
 88 |         KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle));
 89 |         return;
 90 |     }
 91 |     auto khaiii_impl = KHAIII_HANDLES[handle];
 92 |     try {
 93 |         khaiii_impl->free_results(results);
 94 |     } catch (const Except& exc) {
 95 |         khaiii_impl->set_err_msg(exc.what());
 96 |     }
 97 | }
 98 | 
 99 | 
100 | void khaiii_close(int handle) {
101 |     unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
102 |     if (handle <= 0 || handle >= KHAIII_HANDLES.size()) {
103 |         KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle));
104 |         return;
105 |     }
106 |     auto khaiii_impl = KHAIII_HANDLES[handle];
107 |     try {
108 |         khaiii_impl->close();
109 |     } catch (const Except& exc) {
110 |         khaiii_impl->set_err_msg(exc.what());
111 |     }
112 | }
113 | 
114 | 
115 | const char* khaiii_last_error(int handle) {
116 |     if (handle <= 0 || handle >= KHAIII_HANDLES.size()) handle = 0;
117 |     return KHAIII_HANDLES[handle]->get_err_msg();
118 | }
119 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/khaiii_dev.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/khaiii_dev.h"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <vector>
14 | #include <memory>
15 | 
16 | #include "khaiii/KhaiiiImpl.hpp"
17 | 
18 | 
19 | using std::map;
20 | using std::recursive_mutex;
21 | using std::shared_ptr;
22 | using std::string;
23 | using std::unique_lock;
24 | using std::vector;
25 | using khaiii::Except;
26 | using khaiii::KhaiiiImpl;
27 | 
28 | 
29 | ///////////////
30 | // variables //
31 | ///////////////
32 | extern vector<shared_ptr<KhaiiiImpl>> KHAIII_HANDLES;
33 | 
34 | 
35 | ///////////////
36 | // functions //
37 | ///////////////
38 | int khaiii_analyze_bfr_errpatch(int handle, const char* input, const char* opt_str,
39 |                                 int16_t* output) {
40 |     if (handle <= 0 || handle >= KHAIII_HANDLES.size()) {
41 |         unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
42 |         KHAIII_HANDLES[0]->set_err_msg(fmt::format("invalid handle: {}", handle));
43 |         return -1;
44 |     }
45 |     auto khaiii_impl = KHAIII_HANDLES[handle];
46 |     try {
47 |         return khaiii_impl->analyze_bfr_errpatch(input, opt_str, output);
48 |     } catch (const Except& exc) {
49 |         khaiii_impl->set_err_msg(exc.what());
50 |         return -1;
51 |     }
52 | }
53 | 
54 | 
55 | int khaiii_set_log_level(const char* name, const char* level) {
56 |     if (name == nullptr || level == nullptr) {
57 |         unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
58 |         KHAIII_HANDLES[0]->set_err_msg("log name or level is null");
59 |         return -1;
60 |     }
61 | 
62 |     try {
63 |         KhaiiiImpl::set_log_level(name, level);
64 |     } catch (const Except& exc) {
65 |         KHAIII_HANDLES[0]->set_err_msg(exc.what());
66 |         return -1;
67 |     }
68 |     return 0;
69 | }
70 | 
71 | 
72 | int khaiii_set_log_levels(const char* name_level_pairs) {
73 |     if (name_level_pairs == nullptr) {
74 |         unique_lock<recursive_mutex> lock(KHAIII_HANDLES[0]->get_mutex());
75 |         KHAIII_HANDLES[0]->set_err_msg("log name/level pair is null");
76 |         return -1;
77 |     }
78 | 
79 |     try {
80 |         KhaiiiImpl::set_log_levels(name_level_pairs);
81 |     } catch (const Except& exc) {
82 |         KHAIII_HANDLES[0]->set_err_msg(exc.what());
83 |         return -1;
84 |     }
85 |     return 0;
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/nn/Conv1d.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/nn/Conv1d.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <string>
14 | 
15 | #include "khaiii/util.hpp"
16 | 
17 | 
18 | namespace khaiii {
19 | namespace nn {
20 | 
21 | 
22 | using std::make_unique;
23 | using std::string;
24 | 
25 | 
26 | ////////////////////
27 | // ctors and dtor //
28 | ////////////////////
29 | Conv1d::~Conv1d() {
30 |     close();
31 | }
32 | 
33 | 
34 | /////////////
35 | // methods //
36 | /////////////
37 | void Conv1d::open(string path, int in_ch, int out_ch, int kernel_size,
38 |                   const activation_t* activation) {
39 |     _param_mmf.open(path);
40 |     assert(_param_mmf.size() == (in_ch * out_ch * kernel_size + out_ch));
41 |     // [output channel * [kernel * input channel]] ==> transposed
42 |     // ==> [[kernel * input channel] * output channel]
43 |     // 즉, 저장은 [row, col]으로 했지만 사용은 [col, row]로 접근해야 합니다.
44 |     _weight = make_unique<matrix_map_t>(const_cast<float*>(_param_mmf.data()), kernel_size * in_ch,
45 |                                         out_ch);
46 |     _bias = make_unique<vector_map_t>(const_cast<float*>(_param_mmf.data()) + \
47 |                                       (in_ch * out_ch * kernel_size), out_ch);
48 |     _in_ch = in_ch;
49 |     _out_ch = out_ch;
50 |     _kernel_size = kernel_size;
51 |     _activation = activation;
52 | }
53 | 
54 | 
55 | vector_t Conv1d::forward_max_pool_vec(const vector_map_t& input) const {
56 |     int out_row_size = (input.size() / _in_ch) - (_kernel_size - 1);
57 |     int in_col_size = _in_ch * _kernel_size;
58 |     matrix_t output(out_row_size, _out_ch);
59 |     for (int row = 0; row < out_row_size; ++row) {
60 |         output.row(row) = _weight->transpose() * input.segment(row * _in_ch, in_col_size) + *_bias;
61 |     }
62 |     auto pooled = output.colwise().maxCoeff();
63 |     if (_activation) return pooled.unaryExpr(*_activation);
64 |     return pooled;
65 | }
66 | 
67 | 
68 | matrix_t Conv1d::forward_max_pool_mat(const float* data, int batch_size, int col_dim) const {
69 |     matrix_t outputs(batch_size, _out_ch);
70 |     for (int batch = 0; batch < batch_size; ++batch) {
71 |         vector_map_t vec(const_cast<float*>(data + batch * col_dim), col_dim);
72 |         outputs.row(batch) = forward_max_pool_vec(vec);
73 |     }
74 |     return outputs;
75 | }
76 | 
77 | 
78 | void Conv1d::close() {
79 |     _weight.release();
80 |     _bias.release();
81 |     _param_mmf.close();
82 | }
83 | 
84 | 
85 | }    // namespace nn
86 | }    // namespace khaiii
87 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/nn/Conv1d.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_NN_CONV1D_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_NN_CONV1D_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <algorithm>
15 | #include <memory>
16 | #include <string>
17 | 
18 | #include "khaiii/MemMapFile.hpp"
19 | #include "khaiii/nn/tensor.hpp"
20 | 
21 | 
22 | namespace khaiii {
23 | namespace nn {
24 | 
25 | 
/**
 * 1D convolution layer
 */
class Conv1d {
 public:
    virtual ~Conv1d();    ///< dtor

    /**
     * open layer parameters
     * @param  path  file path
     * @param  in_ch  input channel
     * @param  out_ch  output channel
     * @param  kernel_size  kernel size
     * @param  activation  activation function
     */
    void open(std::string path, int in_ch, int out_ch, int kernel_size,
              const activation_t* activation = nullptr);

    /**
     * apply forward calculation and also apply max pooling for vector input
     * @param  input  input vector
     * @return  result vector
     */
    vector_t forward_max_pool_vec(const vector_map_t& input) const;

    /**
     * apply forward calculation and also apply max pooling for matrix input
     * @param  data  input matrix data. size: [batch size, input dim]
     * @param  batch_size  batch size
     * @param  col_dim  column dim (for each batch)
     * @return  result matrix
     */
    matrix_t forward_max_pool_mat(const float* data, int batch_size, int col_dim) const;

    void close();    ///< Closes the resource.

 private:
    std::unique_ptr<matrix_map_t> _weight;    ///< weights, mapped as [(kernel x in_ch) x out_ch]
    std::unique_ptr<vector_map_t> _bias;    ///< bias [out_ch x 1]
    int _in_ch = 0;    ///< input channel dimension
    int _out_ch = 0;    ///< output channel dimension
    int _kernel_size = 0;    ///< kernel size
    const activation_t* _activation = nullptr;    ///< activation function

    MemMapFile<float> _param_mmf;    ///< model parameters map file
};
72 | 
73 | 
74 | }    // namespace nn
75 | }    // namespace khaiii
76 | 
77 | 
78 | #endif    // SRC_MAIN_CPP_KHAIII_NN_CONV1D_HPP_
79 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/nn/Linear.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/nn/Linear.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <string>
14 | #include <vector>
15 | 
16 | #include "khaiii/util.hpp"
17 | 
18 | 
19 | namespace khaiii {
20 | namespace nn {
21 | 
22 | 
23 | using std::cout;
24 | using std::endl;
25 | using std::make_unique;
26 | using std::string;
27 | using std::vector;
28 | 
29 | 
30 | ////////////////////
31 | // ctors and dtor //
32 | ////////////////////
33 | Linear::~Linear() {
34 |     close();
35 | }
36 | 
37 | 
38 | /////////////
39 | // methods //
40 | /////////////
41 | void Linear::open(string path, int in_dim, int out_dim, bool has_bias,
42 |                   const activation_t* activation) {
43 |     // Eigen은 column 우선으로 저장합니다.
44 |     // 따라서 matrix map의 경우 row, col을 거꾸로 해서 생성한 다음,
45 |     // 사용할 때에는 transpose()를 해서 사용해야 합니다.
46 |     _param_mmf.open(path);
47 |     int size = in_dim * out_dim;
48 |     if (has_bias) size += out_dim;
49 |     assert(_param_mmf.size() == size);
50 |     _weight = make_unique<matrix_map_t>(const_cast<float*>(_param_mmf.data()), in_dim, out_dim);
51 |     if (has_bias) {
52 |         _bias = make_unique<vector_map_t>(const_cast<float*>(_param_mmf.data()) + in_dim * out_dim,
53 |                                           out_dim);
54 |     }
55 |     _activation = activation;
56 | }
57 | 
58 | 
59 | /*
60 | #ifndef NDEBUG
61 |     void Linear::print_weight() const {
62 |         int row = _weight->rows();
63 |         int col = _weight->cols();
64 |         fmt::print("============ weight =============\n");
65 |         fmt::print("Size = ({}, {})\n", row, col);
66 |         if (row >= 10 && col >= 10) {
67 |             cout << "first [5 * 5] contents" << endl;
68 |             cout << _weight->block<5, 5>(0, 0) << endl;
69 |             cout << "last [5 * 5] contents" << endl;
70 |             cout << _weight->block<5, 5>(row-5, col-5) << endl;
71 |         } else {
72 |             cout << "contnets" << endl;
73 |             cout << *_weight << endl;
74 |         }
75 |         fmt::print("============ bias =============\n");
76 |         cout << "contnets" << endl;
77 |         cout << _bias->head(5) << endl;
78 |         cout << "..." << endl;
79 |         cout << _bias->tail(5) << endl;
80 |     }
81 | #endif
82 | */
83 | 
84 | 
85 | void Linear::close() {
86 |     _weight.reset();
87 |     _bias.reset();
88 |     _param_mmf.close();
89 | }
90 | 
91 | 
92 | }    // namespace nn
93 | }    // namespace khaiii
94 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/nn/Linear.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | #ifndef SRC_MAIN_CPP_KHAIII_NN_LINEAR_HPP_
  8 | #define SRC_MAIN_CPP_KHAIII_NN_LINEAR_HPP_
  9 | 
 10 | 
 11 | //////////////
 12 | // includes //
 13 | //////////////
 14 | #include <iostream>
 15 | #include <algorithm>
 16 | #include <memory>
 17 | #include <string>
 18 | 
 19 | #include "khaiii/MemMapFile.hpp"
 20 | #include "khaiii/nn/tensor.hpp"
 21 | #include "spdlog/spdlog.h"
 22 | #include "fmt/format.h"
 23 | 
 24 | 
 25 | namespace khaiii {
 26 | namespace nn {
 27 | 
 28 | 
/**
 * fully connected layer
 */
class Linear {
 public:
    virtual ~Linear();    ///< dtor
    /**
     * open layer parameters
     * @param  path  file path
     * @param  in_dim  input dimension
     * @param  out_dim   output dimension
     * @param  has_bias  whether has bias or not
     * @param  activation  activation function
     */
    void open(std::string path, int in_dim, int out_dim, bool has_bias,
              const activation_t* activation = nullptr);

    void close();    ///< Closes the resource.

    /**
     * apply forward calculation for vector input
     *
     * Computes transpose(weight) * input, adds the bias when the layer has one, and applies
     * the activation when one is set.
     * @param  input  input vector
     * @return  result vector
     */
    template<typename T>
    inline vector_t forward_vec(const T &input) const {
        auto without_bias = _weight->transpose() * input;
        if (_bias.get() == nullptr) {
            if (_activation) return without_bias.unaryExpr(*_activation);
            return without_bias;
        }
        auto with_bias = without_bias + *_bias;
        if (_activation) return with_bias.unaryExpr(*_activation);
        return with_bias;
    }

    /**
     * apply forward calculation for matrix input
     *
     * Computes input * weight, adds the bias to every row when the layer has one, and applies
     * the activation when one is set.
     * @param  input  input matrix. size: [batch size, input dim]
     * @return  result matrix
     */
    template<typename T>
    inline matrix_t forward_mat(const T& input) const {
        auto without_bias = input * *_weight;
        if (_bias.get() == nullptr) {
            if (_activation) return without_bias.unaryExpr(*_activation);
            return without_bias;
        }
        // transpose so the bias can be added column-wise, then transpose back on return
        auto with_bias = without_bias.transpose().colwise() + *_bias;
        if (_activation) return with_bias.unaryExpr(*_activation).transpose();
        return with_bias.transpose();
    }

    /*
    #ifndef NDEBUG
        void print_weight() const;    ///< print weights for debugging
    #endif
    */

 private:
    std::unique_ptr<matrix_map_t> _weight;    ///< weights, mapped as [in_dim x out_dim]
    std::unique_ptr<vector_map_t> _bias;    ///< bias [out x 1]; null when the layer has no bias
    const activation_t* _activation = nullptr;    ///< activation function

    MemMapFile<float> _param_mmf;    ///< model parameters map file
};
 95 | 
 96 | 
 97 | }    // namespace nn
 98 | }    // namespace khaiii
 99 | 
100 | 
101 | #endif    // SRC_MAIN_CPP_KHAIII_NN_LINEAR_HPP_
102 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/nn/tensor.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #include "khaiii/nn/tensor.hpp"
 8 | 
 9 | 
10 | //////////////
11 | // includes //
12 | //////////////
13 | #include <algorithm>
14 | #include <functional>
15 | #include <vector>
16 | 
17 | 
18 | namespace khaiii {
19 | namespace nn {
20 | 
21 | 
22 | using std::vector;
23 | 
24 | 
25 | //////////////////////////
26 | // activation functions //
27 | //////////////////////////
/**
 * rectified linear unit
 * @param  x  input value
 * @return  0 for inputs below zero, otherwise the input itself
 */
float relu(float x) {
    if (x < 0.0f) return 0.0f;
    return x;
}
// NOTE: std::ptr_fun is deprecated since C++11 and removed in C++17
activation_t RELU = std::ptr_fun(relu);    ///< ReLU function object wrapping relu()
32 | 
33 | 
34 | ///////////////
35 | // functions //
36 | ///////////////
void add_positional_enc(float* data, int len, int dim) {
    // walk the [len x dim] buffer in storage order; positions and dimension
    // indices are 1-based inside the encoding formula
    float* cursor = data;
    for (int row = 0; row < len; ++row) {
        const float pos = static_cast<float>(row + 1);
        for (int col = 0; col < dim; ++col, ++cursor) {
            const float idx = static_cast<float>(col + 1);
            *cursor += (1.0f - pos / len -
                        ((idx / dim) * (1.0f - 2.0f * pos / len)));
        }
    }
}
46 | 
47 | 
48 | }    // namespace nn
49 | }    // namespace khaiii
50 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/nn/tensor.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_NN_TENSOR_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_NN_TENSOR_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <functional>
15 | 
16 | #include "Eigen/Dense"
17 | 
18 | 
19 | namespace khaiii {
20 | namespace nn {
21 | 
22 | 
23 | ///////////
24 | // types //
25 | ///////////
// Eigen::Map types wrap an existing float buffer as a matrix/vector
using matrix_map_t = Eigen::Map<Eigen::MatrixXf>;
using vector_map_t = Eigen::Map<Eigen::VectorXf>;
// dynamically sized single precision matrix/vector
using matrix_t = Eigen::MatrixXf;
using vector_t = Eigen::VectorXf;
30 | 
31 | 
32 | //////////////////////////
33 | // activation functions //
34 | //////////////////////////
// function object type for activations, wrapping a plain `float f(float)` pointer
// NOTE: std::pointer_to_unary_function is deprecated since C++11 and removed in C++17
typedef std::pointer_to_unary_function<float, float> activation_t;
extern activation_t RELU;    ///< ReLU activation function object
37 | 
38 | 
39 | ///////////////
40 | // functions //
41 | ///////////////
42 | /**
43 |  * add positional encoding to data(array of floats)
44 |  * @param  data  input data. size: [length x dimension]
45 |  * @param  len  position length
46 |  * @param  dim  embedding dimension
47 |  */
48 | void add_positional_enc(float* data, int len, int dim);
49 | 
/**
 * add two vectors in-place (the left vector is updated)
 * @param  left  vector that receives the element-wise sum
 * @param  right  vector to add
 * @param  dim  number of elements in both vectors (asserted to be positive)
 */
inline void add_vec(float* left, const float* right, int dim) {
    assert(dim > 0);
    for (int idx = 0; idx < dim; ++idx) {
        left[idx] += right[idx];
    }
}
59 | 
60 | 
61 | }    // namespace nn
62 | }    // namespace khaiii
63 | 
64 | 
65 | #endif    // SRC_MAIN_CPP_KHAIII_NN_TENSOR_HPP_
66 | 


--------------------------------------------------------------------------------
/src/main/cpp/khaiii/util.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_MAIN_CPP_KHAIII_UTIL_HPP_
 8 | #define SRC_MAIN_CPP_KHAIII_UTIL_HPP_
 9 | 
10 | 
11 | //////////////
12 | // includes //
13 | //////////////
14 | #include <sys/stat.h>
15 | 
16 | #include <memory>
17 | #include <sstream>
18 | #include <string>
19 | #include <utility>
20 | #include <vector>
21 | 
22 | #include "boost/locale/encoding_utf.hpp"
23 | 
24 | 
25 | namespace khaiii {
26 | 
27 | 
28 | 
29 | ///////////////
30 | // functions //
31 | ///////////////
/**
 * whether is space or not
 * @param  chr  character
 * @return  true if character is one of: space, tab, vertical tab, carriage return,
 *          line feed or U+3000
 */
inline bool is_space(wchar_t chr) {
    switch (chr) {
        case L' ':
        case L'\t':
        case L'\v':
        case L'\r':
        case L'\n':
        case L'\u3000':
            return true;
        default:
            return false;
    }
}
41 | 
42 | 
43 | /**
44 |  * convert UTF-8 string to wstring
45 |  * @param str  UTF-8 string
46 |  * @return  wstring
47 |  */
48 | inline std::wstring utf8_to_wstr(const std::string& str) {
49 |     return boost::locale::conv::utf_to_utf<wchar_t>(str.c_str(), str.c_str() + str.length());
50 | }
51 | 
52 | 
53 | /**
54 |  * convert wstring to UTF-8 string
55 |  * @param  wstr  wstring
56 |  * @return  UTF-8 string
57 |  */
58 | inline std::string wstr_to_utf8(const std::wstring& wstr) {
59 |     return boost::locale::conv::utf_to_utf<char>(wstr.c_str(), wstr.c_str() + wstr.length());
60 | }
61 | 
62 | 
/**
 * string splitter
 * @param  str  string to split
 * @param  delim  delimiter char
 * @return  list of split strings, in order of appearance
 */
inline std::vector<std::string> split(const std::string& str, char delim) {
    std::vector<std::string> pieces;
    std::stringstream reader(str);
    std::string token;
    while (std::getline(reader, token, delim)) {
        pieces.push_back(token);
    }
    return pieces;
}
77 | 
78 | 
/**
 * whether file (or directory) exists or not
 * @param  path  path
 * @return  true if stat() succeeds for the path
 */
inline bool file_exists(std::string path) {
    struct stat info;
    const int status = stat(path.c_str(), &info);
    return status == 0;
}
88 | 
89 | 
90 | }    // namespace khaiii
91 | 
92 | 
93 | #endif    // SRC_MAIN_CPP_KHAIII_UTIL_HPP_
94 | 


--------------------------------------------------------------------------------
/src/main/cpp/main.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | //////////////
  8 | // includes //
  9 | //////////////
#include <cstdio>
#include <exception>
#include <fstream>
#include <iostream>
#include <string>
 14 | 
 15 | #include "cxxopts.hpp"
 16 | #include "fmt/printf.h"
 17 | #ifdef PROFILER
 18 |     #include "gperftools/profiler.h"
 19 | #endif
 20 | #include "spdlog/spdlog.h"
 21 | 
 22 | #include "khaiii/KhaiiiApi.hpp"
 23 | #include "khaiii/khaiii_dev.h"
 24 | 
 25 | 
 26 | using std::cerr;
 27 | using std::cin;
 28 | using std::endl;
 29 | using std::ifstream;
 30 | using std::ofstream;
 31 | using std::string;
 32 | 
 33 | using khaiii::KhaiiiApi;
 34 | 
 35 | 
 36 | ///////////////
 37 | // functions //
 38 | ///////////////
 39 | int run(const cxxopts::ParseResult& opts) {
 40 |     auto _log = spdlog::get("console");
 41 |     khaiii_set_log_levels(opts["set-log"].as<string>().c_str());
 42 | 
 43 |     auto khaiii_api = KhaiiiApi::create();
 44 |     try {
 45 |         khaiii_api->open(opts["rsc-dir"].as<string>(), opts["opt-str"].as<string>());
 46 |     } catch (const khaiii::Except& exc) {
 47 |         _log->error("fail to open dir: '{}', opt: '{}'", opts["rsc-dir"].as<string>(),
 48 |                     opts["opt-str"].as<string>());
 49 |         _log->error(exc.what());
 50 |         return 1;
 51 |     }
 52 | 
 53 |     for (string line; getline(cin, line); ) {
 54 |         _log->debug("sent: {}", line);
 55 |         const khaiii_word_t* results = nullptr;
 56 |         try {
 57 |             results = khaiii_api->analyze(line.c_str(), "");
 58 |         } catch (const khaiii::Except& exc) {
 59 |             _log->warn("{}: {}", exc.what(), line);
 60 |             continue;
 61 |         }
 62 |         for (auto word = results; word != nullptr; word = word->next) {
 63 |             fmt::print("{}\t", line.substr(word->begin, word->length));
 64 |             const khaiii_morph_t* morphs = word->morphs;
 65 |             for (auto morph = morphs; morph != nullptr; morph = morph->next) {
 66 |                 if (morph != morphs) fmt::print(" + ");
 67 |                 fmt::print("{}/{}", morph->lex, morph->tag);
 68 |             }
 69 |             fmt::print("\n");
 70 |         }
 71 |         fmt::print("\n");
 72 |         khaiii_api->free_results(results);
 73 |     }
 74 | 
 75 |     return 0;
 76 | }
 77 | 
 78 | 
 79 | //////////
 80 | // main //
 81 | //////////
 82 | int main(int argc, char** argv) {
 83 |     auto _log = spdlog::stderr_color_mt("console");
 84 |     spdlog::set_level(spdlog::level::warn);
 85 | 
 86 |     cxxopts::Options options("khaiii", "analyze with khaiii");
 87 |     options.add_options()
 88 |         ("h,help", "print this help")
 89 |         ("rsc-dir", "resource directory", cxxopts::value<string>()->default_value(""))
 90 |         ("opt-str", "option (JSON format)", cxxopts::value<string>()->default_value(""))
 91 |         ("input", "input file (default: stdin)", cxxopts::value<string>())
 92 |         ("output", "output file (default: stdout)", cxxopts::value<string>())
 93 |         ("set-log", "set log level", cxxopts::value<string>()->default_value("all:info"));
 94 |     auto opts = options.parse(argc, argv);
 95 | 
 96 |     if (opts.count("help")) {
 97 |         fmt::fprintf(cerr, "%s\n", options.help());
 98 |         return 0;
 99 |     }
100 |     if (opts.count("input")) {
101 |         string path = opts["input"].as<string>();
102 |         ifstream fin(path);
103 |         if (!fin.good()) {
104 |             _log->error("input file not found: {}", path);
105 |             return 1;
106 |         }
107 |         if (freopen(path.c_str(), "r", stdin) == nullptr) {
108 |             _log->error("fail to open input file: {}", path);
109 |             return 2;
110 |         }
111 |     }
112 |     if (opts.count("output")) {
113 |         string path = opts["output"].as<string>();
114 |         if (freopen(path.c_str(), "w", stdout) == nullptr) {
115 |             _log->error("fail to open output file: {}", path);
116 |             return 3;
117 |         }
118 |     }
119 | 
120 | #ifdef PROFILER
121 |     ProfilerStart("/tmp/bin_khaiii.prof");
122 | #endif
123 | 
124 |     int ret = run(opts);
125 | 
126 | #ifdef PROFILER
127 |     ProfilerStop();
128 | #endif
129 | 
130 |     return ret;
131 | }
132 | 


--------------------------------------------------------------------------------
/src/main/python/MANIFEST.in.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | recursive-include khaiii *
4 | include @CPACK_SOURCE_PACKAGE_FILE_NAME@.tar.gz
5 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/main/python/khaiii/__init__.py


--------------------------------------------------------------------------------
/src/main/python/khaiii/__init__.py.in:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | """
 5 | @CPACK_PACKAGE_DESCRIPTION_SUMMARY@
 6 | 
 7 | __version__ = '@KHAIII_VERSION@'
 8 | __author__ = '@CPACK_PACKAGE_VENDOR@'
 9 | __copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.'
10 | __license__ = 'Apache 2.0'
11 | __maintainer__ = 'Jamie'
12 | __email__ = 'jamie.lim@kakaocorp.com'
13 | """
14 | 
15 | 
16 | from .khaiii import KhaiiiApi, KhaiiiExcept, KhaiiiMorph, KhaiiiWord
17 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/munjong/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/main/python/khaiii/munjong/__init__.py


--------------------------------------------------------------------------------
/src/main/python/khaiii/resource/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/main/python/khaiii/resource/__init__.py


--------------------------------------------------------------------------------
/src/main/python/khaiii/resource/morphs.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | """
  5 | 형태소 분석 결과를 기술한 문자열을 파싱하는 모듈.
  6 | TODO(jamie): sejong_corpus 모듈의 Morph 클래스와 중복되므로 정리 필요
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from typing import List
 16 | 
 17 | 
 18 | #############
 19 | # constants #
 20 | #############
# full tag set. number -> tag mapping
TAGS = sorted(['EC', 'EF', 'EP', 'ETM', 'ETN', 'IC', 'JC', 'JKB', 'JKC', 'JKG',
               'JKO', 'JKQ', 'JKS', 'JKV', 'JX', 'MAG', 'MAJ', 'MM', 'NNB', 'NNG',
               'NNP', 'NP', 'NR', 'SE', 'SF', 'SH', 'SL', 'SN', 'SO', 'SP',
               'SS', 'SW', 'SWK', 'VA', 'VCN', 'VCP', 'VV', 'VX', 'XPN', 'XR',
               'XSA', 'XSN', 'XSV', 'ZN', 'ZV', 'ZZ', ])
# list of tags for which a B- tag is possible
B_TAGS = sorted(['EP', 'IC', 'JKB', 'JX', 'MAG', 'MM', 'NNB', 'NNG', 'NNP', 'NP',
                 'NR', 'SE', 'SF', 'SN', 'SO', 'SP', 'SS', 'SW', 'SWK', 'XPN',
                 'XR', 'XSN', ])
TAG_SET = {tag: num for num, tag in enumerate(TAGS, start=1)}    # tag -> number mapping (1-based)

WORD_DELIM_STR = '_'    # virtual morpheme that denotes a word boundary (space)
SENT_DELIM_STR = '|'    # virtual morpheme that denotes a sentence boundary
WORD_DELIM_NUM = -1    # virtual tag number of the word boundary
SENT_DELIM_NUM = -2    # virtual tag number of the sentence boundary
 37 | 
 38 | 
 39 | #########
 40 | # types #
 41 | #########
class ParseError(Exception):
    """
    error raised while parsing a string that describes a morphological analysis result
    """
 46 | 
 47 | 
 48 | class Morph:
 49 |     """
 50 |     형태소
 51 |     """
 52 |     def __init__(self, lex: str, tag: str):
 53 |         """
 54 |         Arguments:
 55 |             lex:  형태소(어휘)
 56 |             tag:  품사 태그
 57 |         """
 58 |         self.lex = lex
 59 |         self.tag = tag
 60 | 
 61 |     def __str__(self):
 62 |         if not self.tag:
 63 |             return self.lex
 64 |         return '{}/{}'.format(self.lex, self.tag)
 65 | 
 66 |     def is_word_delim(self) -> bool:
 67 |         """
 68 |         어절의 경계를 나타태는 지 여부
 69 |         Returns:
 70 |             어절의 경계 여부
 71 |         """
 72 |         return not self.tag and self.lex == WORD_DELIM_STR
 73 | 
 74 |     def is_sent_delim(self) -> bool:
 75 |         """
 76 |         문장의 경계를 나타태는 지 여부
 77 |         Returns:
 78 |             문장의 경계 여부
 79 |         """
 80 |         return not self.tag and self.lex == SENT_DELIM_STR
 81 | 
 82 |     @classmethod
 83 |     def to_str(cls, morphs: List['Morph']) -> str:
 84 |         """
 85 |         Morph 객체 리스트를 문자열로 변환한다.
 86 |         Arguments:
 87 |             morphs:  Morph 객체 리스트
 88 |         Returns:
 89 |             변환된 문자열
 90 |         """
 91 |         return ' + '.join([str(m) for m in morphs])
 92 | 
 93 |     @classmethod
 94 |     def parse(cls, morphs_str: str) -> List['Morph']:
 95 |         """
 96 |         형태소 분석 결과 형태의 문자열을 파싱하여 Morph 객체 리스트를 반환하는 파싱 함수
 97 |         Arguments:
 98 |             morphs_str:  형태소 분석 결과 문자열. 예: "제이미/NNP + 는/JKS"
 99 |         Returns:
100 |             Morph 객체 리스트
101 |         """
102 |         if not morphs_str:
103 |             raise ParseError('empty to parse')
104 |         return [cls._parse_one(m) for m in morphs_str.split(' + ')]
105 | 
106 |     @classmethod
107 |     def _parse_one(cls, morph_str: str) -> 'Morph':
108 |         """
109 |         하나의 형태소 객체를 기술한 문자열을 파싱한다.
110 |         Arguments:
111 |             morph_str:  형태소 문자열
112 |         Returns:
113 |             Morph 객체
114 |         """
115 |         if ' ' in morph_str:
116 |             raise ParseError('space in morph')
117 |         try:
118 |             if morph_str in [WORD_DELIM_STR, SENT_DELIM_STR]:
119 |                 return Morph(morph_str, '')
120 |             lex, tag = morph_str.rsplit('/', 1)
121 |         except ValueError:
122 |             raise ParseError('invalid morpheme string format')
123 |         if not lex:
124 |             raise ParseError('no lexical in morpheme string')
125 |         if not tag:
126 |             raise ParseError('no pos tag in morpheme string')
127 |         if tag not in TAG_SET:
128 |             raise ParseError('invalid pos tag: {}'.format(tag))
129 |         return Morph(lex, tag)
130 | 
131 | 
132 | #############
133 | # functions #
134 | #############
def mix_char_tag(chars: str, tags: List[int]) -> List[int]:
    """
    combine each character with its output tag into a single number using bit operations
    Args:
        chars:  characters (a string of unicode syllables)
        tags:  list of output tag numbers
    Returns:
        list of combined numbers
    """
    mixed = list(map(ord, chars))
    # a sentence delimiter tag at either end gets a matching placeholder entry
    if tags[0] == SENT_DELIM_NUM:
        mixed.insert(0, SENT_DELIM_NUM)
    if tags[-1] == SENT_DELIM_NUM:
        mixed.append(SENT_DELIM_NUM)
    space_code = ord(' ')
    for pos in range(len(mixed)):
        code = mixed[pos]
        if code == space_code:
            mixed[pos] = WORD_DELIM_NUM
        elif tags[pos] != SENT_DELIM_NUM:
            # code point shifted left by 12 bits, tag number in the low bits
            mixed[pos] = code << 12 | tags[pos]
    return mixed
157 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/resource/resource.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | """
  5 | resources for training and tagging
  6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  8 | """
  9 | 
 10 | 
 11 | ###########
 12 | # imports #
 13 | ###########
 14 | from argparse import Namespace
 15 | from collections import defaultdict
 16 | import logging
 17 | import os
 18 | from typing import Dict, Tuple
 19 | 
 20 | from khaiii.resource.vocabulary import Vocabulary
 21 | 
 22 | 
 23 | #############
 24 | # constants #
 25 | #############
 26 | UNK_CHR = '@@UNKNOWN@@'
 27 | SPECIAL_CHARS = ['<w>', '</w>']    # begin/end of word
 28 | 
 29 | 
 30 | #########
 31 | # types #
 32 | #########
 33 | class Resource:
 34 |     """
 35 |     resources
 36 |     """
 37 |     def __init__(self, cfg: Namespace):
 38 |         """
 39 |         Args:
 40 |             cfg:  config
 41 |         """
 42 |         vocab_in_path = '{}/vocab.in'.format(cfg.rsc_src)
 43 |         self.vocab_in = Vocabulary(vocab_in_path, cfg.cutoff, UNK_CHR, SPECIAL_CHARS)
 44 |         vocab_out_path = '{}/vocab.out'.format(cfg.rsc_src)
 45 |         self.vocab_out = Vocabulary(vocab_out_path)    # no unknown, no special
 46 |         restore_dic_path = '{}/restore.dic'.format(cfg.rsc_src)
 47 |         self.restore_dic = self.load_restore_dic(restore_dic_path)
 48 | 
 49 |     @classmethod
 50 |     def load_restore_dic(cls, path: str) -> Dict[str, str]:
 51 |         """
 52 |         load character to output tag mapping
 53 |         Args:
 54 |             path:  file path
 55 |         Returns:
 56 |             dictionary
 57 |         """
 58 |         dic = {}
 59 |         for line in open(path, 'r', encoding='UTF-8'):
 60 |             line = line.rstrip('\r\n')
 61 |             if not line:
 62 |                 continue
 63 |             key, val = line.split('\t')
 64 |             dic[key] = val
 65 |         logging.info('%s: %d entries', os.path.basename(path), len(dic))
 66 |         return dic
 67 | 
 68 | 
 69 | #############
 70 | # functions #
 71 | #############
 72 | def parse_restore_dic(file_path: str) -> Dict[Tuple[str, str], Dict[int, str]]:
 73 |     """
 74 |     원형복원 사전을 로드한다.
 75 |     Args:
 76 |         file_path:  파일 경로
 77 |     Returns:
 78 |         사전
 79 |     """
 80 |     file_name = os.path.basename(file_path)
 81 |     restore_dic = defaultdict(dict)
 82 |     for line_num, line in enumerate(open(file_path, 'r', encoding='UTF-8'), start=1):
 83 |         line = line.rstrip()
 84 |         if not line or line[0] == '#':
 85 |             continue
 86 |         char_tag_num, mrp_chr_str = line.split('\t')
 87 |         char, tag_num = char_tag_num.rsplit('/', 1)
 88 |         tag, num = tag_num.rsplit(':', 1)
 89 |         num = int(num)
 90 |         if (char, tag) in restore_dic:
 91 |             num_mrp_chrs_dic = restore_dic[char, tag]
 92 |             if num in num_mrp_chrs_dic:
 93 |                 logging.error('%s:%d: duplicated with %s: %s', file_name, line_num,
 94 |                               num_mrp_chrs_dic[num], line)
 95 |                 return {}
 96 |         restore_dic[char, tag][num] = mrp_chr_str
 97 |     return restore_dic
 98 | 
 99 | 
def _read_vocab_lines(file_path: str) -> list:
    """
    read the stripped, non-empty lines of a vocabulary file
    Args:
        file_path:  file path
    Returns:
        list of entries in file order
    """
    with open(file_path, 'r', encoding='UTF-8') as fin:
        return [line.strip() for line in fin if line.strip()]


def load_vocab_out(rsc_src: str) -> Dict[str, int]:
    """
    load the output tag vocabulary
    Entries of `vocab.out` come first, followed by those of the optional
    `vocab.out.more`; numbering starts at 1.
    Args:
        rsc_src:  resource directory
    Returns:
        output tag vocabulary (tag -> number)
    """
    vocab_out = _read_vocab_lines('{}/vocab.out'.format(rsc_src))
    vocab_out_more = []
    file_path = '{}/vocab.out.more'.format(rsc_src)
    if os.path.exists(file_path):
        vocab_out_more = _read_vocab_lines(file_path)
    return {tag: idx for idx, tag in enumerate(vocab_out + vocab_out_more, start=1)}
117 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/resource/vocabulary.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | """
 5 | vocabulary library
 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 8 | """
 9 | 
10 | 
11 | ###########
12 | # imports #
13 | ###########
14 | import logging
15 | import os
16 | from typing import List
17 | 
18 | 
19 | #########
20 | # types #
21 | #########
class Vocabulary:
    """
    vocabulary class: a two-way mapping between entries and their numbers
    """
    def __init__(self, path: str, cutoff: int = 1, unk: str = '', special: List[str] = None):
        """
        if `unk` is given (such as input vocab), index 0 is the padding entry '' and index 1
        is `unk`; looking up an unknown key returns the index of `unk`.
        if `unk` is not given (such as output vocab), no padding/unknown entries are reserved
        and a KeyError is raised for an unknown entry.
        Args:
            path:  file path
            cutoff:  cutoff frequency
            unk:  unknown(OOV) entry
            special:  special entries placed right after the reserved ones
        """
        self.dic = {}    # {entry: number} dictionary
        self.unk = unk
        self.rev = ['', unk] if unk else []    # reverse dictionary
        if special:
            self.rev.extend(special)
        for num, entry in enumerate(self.rev):
            self.dic[entry] = num
        self._load(path, cutoff)
        assert len(self.dic) == len(self.rev)

    def __getitem__(self, key):
        """
        Args:
            key:  key
        Returns:
            word number for string key, word for int key
        Raises:
            KeyError:  for an unknown string key when no `unk` entry was configured
            IndexError:  for an int key that is out of range
        """
        if isinstance(key, int):
            return self.rev[key]
        try:
            return self.dic[key]
        except KeyError:
            if self.unk:
                return self.dic[self.unk]
            raise

    def __len__(self):
        return len(self.dic)

    def _load(self, path: str, cutoff: int = 1):
        """
        load vocabulary from file
        Each line is either "<entry><TAB><frequency>" or just "<entry>". Entries whose
        frequency is not above `cutoff` and entries that are already known are skipped.
        Args:
            path:  file path
            cutoff:  cutoff frequency
        """
        append_num = 0
        cutoff_num = 0
        # the context manager closes the file deterministically
        with open(path, 'r', encoding='UTF-8') as fin:
            for line in fin:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                try:
                    entry, freq = line.split('\t')
                    if int(freq) <= cutoff:
                        cutoff_num += 1
                        continue
                except ValueError:
                    # no (valid) frequency column: the whole line is the entry
                    entry = line
                if entry in self.dic:
                    cutoff_num += 1
                    continue
                self.dic[entry] = len(self.dic)
                self.rev.append(entry)
                append_num += 1
        logging.info('%s: %d entries, %d cutoff', os.path.basename(path), append_num, cutoff_num)
93 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/train/embedder.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | """
  5 | making embedding models
  6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  8 | """
  9 | 
 10 | 
 11 | ###########
 12 | # imports #
 13 | ###########
 14 | from argparse import Namespace
 15 | import math
 16 | 
 17 | import torch
 18 | from torch import nn, Tensor
 19 | 
 20 | from khaiii.resource.resource import Resource
 21 | 
 22 | 
 23 | class Embedder(nn.Module):
 24 |     """
 25 |     embedder class
 26 |     """
 27 |     def __init__(self, cfg: Namespace, rsc: Resource):
 28 |         """
 29 |         Args:
 30 |             cfg:  config
 31 |             rsc:  Resource object
 32 |         """
 33 |         super().__init__()
 34 |         self.cfg = cfg
 35 |         self.rsc = rsc
 36 |         self.embedding = nn.Embedding(len(rsc.vocab_in), cfg.embed_dim, 0)
 37 | 
 38 |     def forward(self, *inputs):    # pylint: disable=arguments-differ
 39 |         """
 40 |         임베딩을 생성하는 메소드
 41 |         Args:
 42 |             inputs:  batch size list of (context, left space mask, right space mask)
 43 |         Returns:
 44 |             embedding
 45 |         """
 46 |         contexts, left_spc_masks, right_spc_masks = inputs
 47 |         embeds = self.embedding(contexts)
 48 |         if left_spc_masks is not None:
 49 |             embeds += self.embedding(left_spc_masks)
 50 |         if right_spc_masks is not None:
 51 |             embeds += self.embedding(right_spc_masks)
 52 |         # 왼쪽과 오른쪽 패딩에는 zero 벡터인데 아래 positional encoding이 더해짐
 53 |         # 사소하지만 아래도 패딩 영역에 대해 마스킹 후 더해줘야 하지 않을까?
 54 |         embeds += positional_encoding(self.cfg.context_len, self.cfg.context_len,
 55 |                                       self.cfg.embed_dim, 1, self.cfg.gpu_num)
 56 |         return embeds
 57 | 
 58 | 
 59 | #############
 60 | # functions #
 61 | #############
 62 | def memoize(func):
 63 |     """
 64 |     memoize decorator
 65 |     """
 66 |     class Memodict(dict):
 67 |         """
 68 |         Memoization decorator for a function taking one or more arguments.
 69 |         """
 70 |         def __getitem__(self, *key):
 71 |             return dict.__getitem__(self, key)
 72 | 
 73 |         def __missing__(self, key):
 74 |             ret = self[key] = func(*key)
 75 |             return ret
 76 | 
 77 |     return Memodict().__getitem__
 78 | 
 79 | 
 80 | @memoize
 81 | def positional_encoding(sent_len: int, max_dim: int, embed_dim: int, method: int = 1,
 82 |                         gpu_num: int = -1) -> Tensor:
 83 |     """
 84 |     positional encoding Tensor 출력.
 85 |     embeds [batch_size, context_len, embed_dim]에 Broadcasting 으로 더해짐
 86 |     Args:
 87 |         sent_len:  actual sentence length
 88 |         max_dim:  maximum dimension
 89 |         embed_dim:  embedding dimension
 90 |         method:  method number (1. end-to-end memory networks or 2. attention is all you need)
 91 |         gpu_num:  GPU device number. default: -1 for CPU
 92 |     Returns:
 93 |         pe [context_len, embed_dim]
 94 |     """
 95 |     device = gpu_num if gpu_num >= 0 else None
 96 |     pe_tensor = torch.zeros([max_dim, embed_dim], device=device)    # pylint: disable=no-member
 97 |     for pos in range(1, sent_len + 1):
 98 |         for i in range(1, embed_dim+1):
 99 |             if method == 1:
100 |                 # end-to-end memory networks
101 |                 pe_tensor[pos-1, i-1] = 1 - pos / sent_len - ((i / embed_dim) *
102 |                                                               (1 - 2 * pos / sent_len))
103 |             elif method == 2:
104 |                 # attention is all you need
105 |                 if i % 2 == 0:
106 |                     pe_tensor[pos-1, i-1] = math.sin(pos / 10000 ** (2*i / embed_dim))
107 |                 else:
108 |                     pe_tensor[pos-1, i-1] = math.cos(pos / 10000 ** (2*i / embed_dim))
109 |     pe_tensor.detach()
110 |     return pe_tensor
111 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/train/evaluator.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | """
  5 | evaluation related module
  6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  8 | """
  9 | 
 10 | 
 11 | ###########
 12 | # imports #
 13 | ###########
 14 | from collections import Counter
 15 | import logging
 16 | from typing import List, TextIO, Tuple
 17 | 
 18 | from khaiii.train.sentence import PosMorph, PosSentence, PosWord
 19 | 
 20 | 
 21 | #########
 22 | # types #
 23 | #########
 24 | class Evaluator:
 25 |     """
 26 |     evauator
 27 |     """
 28 |     def __init__(self):
 29 |         self.cnt = Counter()
 30 | 
 31 |     def evaluate(self) -> Tuple[float, float, float]:
 32 |         """
 33 |         char/word accuracy, f-score(recall/precision)를 측정한다.
 34 |         Returns:
 35 |             character accuracy
 36 |             word accuracy
 37 |             f-score
 38 |         """
 39 |         char_acc = self.cnt['match_chars'] / self.cnt['total_chars']
 40 |         word_acc = self.cnt['match_words'] / self.cnt['total_words']
 41 |         if self.cnt['match_morphs'] == 0:
 42 |             recall = precision = f_score = 0.0
 43 |         else:
 44 |             recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
 45 |             precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
 46 |             f_score = 2.0 * recall * precision / (recall + precision)
 47 |         self.cnt.clear()
 48 |         return char_acc, word_acc, f_score
 49 | 
 50 |     def count(self, correct_sent: PosSentence, predict_sent: PosSentence):
 51 |         """
 52 |         정답 문장과 비교하여 맞춘 갯수를 샌다.
 53 |         Args:
 54 |             correct_sent:  정답 문장
 55 |             predict_sent:  예측한 문장
 56 |         """
 57 |         assert len(correct_sent.words) == len(predict_sent.words)
 58 |         for gold, pred in zip(correct_sent.pos_tagged_words, predict_sent.pos_tagged_words):
 59 |             self.cnt['total_chars'] += len(gold.res_tags)
 60 |             self.cnt['match_chars'] += len([1 for x, y in zip(gold.res_tags, pred.res_tags)
 61 |                                             if x == y])
 62 |             self._count_word(gold, pred)
 63 | 
 64 |     def _count_word(self, gold: PosWord, pred: PosWord):
 65 |         """
 66 |         count with gold standard and predicted (will update counter)
 67 |         Args:
 68 |             gold:  gold standard word
 69 |             pred:  predicted word
 70 |         """
 71 |         self.cnt['total_words'] += 1
 72 |         gold_morphs = gold.pos_tagged_morphs
 73 |         pred_morphs = pred.pos_tagged_morphs
 74 |         if gold == pred:
 75 |             self.cnt['match_words'] += 1
 76 |             num_match = len(gold_morphs)
 77 |             self.cnt['total_gold_morphs'] += num_match
 78 |             self.cnt['total_pred_morphs'] += num_match
 79 |             self.cnt['match_morphs'] += num_match
 80 |             return
 81 |         logging.debug('gold: %s', ' '.join([str(_) for _ in gold_morphs]))
 82 |         logging.debug('pred: %s', ' '.join([str(_) for _ in pred_morphs]))
 83 |         self.cnt['total_gold_morphs'] += len(gold_morphs)
 84 |         self.cnt['total_pred_morphs'] += len(pred_morphs)
 85 |         gold_set = self.morphs_to_set(gold_morphs)
 86 |         pred_set = self.morphs_to_set(pred_morphs)
 87 |         self.cnt['match_morphs'] += len(gold_set & pred_set)
 88 | 
 89 |     @classmethod
 90 |     def morphs_to_set(cls, morphs: List[PosMorph]) -> set:
 91 |         """
 92 |         make set from morpheme list
 93 |         Args:
 94 |             morphs:  morpheme list
 95 |         Returns:
 96 |             morphemes set
 97 |         """
 98 |         morph_cnt = Counter([(morph.morph, morph.pos_tag) for morph in morphs])
 99 |         morph_set = set()
100 |         for (lex, tag), freq in morph_cnt.items():
101 |             if freq == 1:
102 |                 morph_set.add((lex, tag))
103 |             else:
104 |                 morph_set.update([(lex, tag, _) for _ in range(1, freq+1)])
105 |         return morph_set
106 | 
107 |     def report(self, fout: TextIO):
108 |         """
109 |         report recall/precision to file
110 |         Args:
111 |             fout:  output file
112 |         """
113 |         print('word accuracy: %d / %d = %.4f' % (self.cnt['match_words'], self.cnt['total_words'],
114 |                                                  self.cnt['match_words'] / self.cnt['total_words']),
115 |               file=fout)
116 |         if self.cnt['match_morphs'] == 0:
117 |             recall = precision = f_score = 0.0
118 |         else:
119 |             recall = self.cnt['match_morphs'] / self.cnt['total_gold_morphs']
120 |             precision = self.cnt['match_morphs'] / self.cnt['total_pred_morphs']
121 |             f_score = 2.0 * recall * precision / (recall + precision)
122 |         print('f-score / (recall, precision): %.4f / (%.4f, %.4f)' % (f_score, recall, precision),
123 |               file=fout)
124 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/train/models.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | """
  5 | Pytorch models
  6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  8 | """
  9 | 
 10 | 
 11 | ###########
 12 | # imports #
 13 | ###########
 14 | from argparse import Namespace
 15 | 
 16 | import torch
 17 | import torch.nn as nn
 18 | import torch.nn.functional as F
 19 | 
 20 | from khaiii.resource.resource import Resource
 21 | from khaiii.train.embedder import Embedder
 22 | 
 23 | 
 24 | #########
 25 | # types #
 26 | #########
 27 | class ConvLayer(nn.Module):
 28 |     """
 29 |     형태소 태깅 모델과 띄어쓰기 모델이 공유하는 컨볼루션 레이어
 30 |     """
 31 |     def __init__(self, cfg: Namespace, rsc: Resource):
 32 |         """
 33 |         Args:
 34 |             cfg:  config
 35 |             rsc:  Resource object
 36 |         """
 37 |         super().__init__()
 38 |         self.embedder = Embedder(cfg, rsc)
 39 |         ngram = min(5, cfg.window * 2 + 1)
 40 |         self.convs = nn.ModuleList([nn.Conv1d(cfg.embed_dim, cfg.embed_dim, kernel_size)
 41 |                                     for kernel_size in range(2, ngram+1)])
 42 | 
 43 |     def forward(self, *inputs):
 44 |         embeds = self.embedder(*inputs)
 45 |         embeds_t = embeds.transpose(1, 2)
 46 |         pool_outs = []
 47 |         for conv in self.convs:
 48 |             conv_out = F.relu(conv(embeds_t))
 49 |             pool_outs.append(F.max_pool1d(conv_out, conv_out.size(2)))
 50 |         features = torch.cat([p.view(embeds.size(0), -1) for p in pool_outs], dim=1)    # pylint: disable=no-member
 51 |         return features
 52 | 
 53 | 
 54 | class HiddenLayer(nn.Module):
 55 |     """
 56 |     형태소 태깅 모델과 띄어쓰기 모델이 각각 학습하는 히든 레이어
 57 |     """
 58 |     def __init__(self, cfg: Namespace, rsc: Resource, conv_layer_len: int, is_spc: bool):
 59 |         """
 60 |         Args:
 61 |             cfg:  config
 62 |             rsc:  Resource object
 63 |             conv_layer_len:  convolution 레이어의 n-gram 타입 갯수
 64 |             is_spc:  띄어쓰기 모델 여부
 65 |         """
 66 |         super().__init__()
 67 |         setattr(cfg, 'hidden_dim',
 68 |                 (cfg.embed_dim * conv_layer_len + len(rsc.vocab_out)) // 2)
 69 |         feature_dim = cfg.embed_dim * conv_layer_len
 70 |         tag_dim = 2 if is_spc else len(rsc.vocab_out)
 71 |         self.layers = nn.ModuleList([nn.Linear(feature_dim, cfg.hidden_dim),
 72 |                                      nn.Linear(cfg.hidden_dim, tag_dim)])
 73 | 
 74 |     def forward(self, features):    # pylint: disable=arguments-differ
 75 |         # feature => hidden
 76 |         features_drop = F.dropout(features)
 77 |         hidden_out = F.relu(self.layers[0](features_drop))
 78 |         # hidden => tag
 79 |         hidden_out_drop = F.dropout(hidden_out)
 80 |         tag_out = self.layers[1](hidden_out_drop)
 81 |         return tag_out
 82 | 
 83 | 
 84 | class Model(nn.Module):
 85 |     """
 86 |     형태소 태깅 모델, 띄어쓰기 모델
 87 |     """
 88 |     def __init__(self, cfg: Namespace, rsc: Resource):
 89 |         """
 90 |         Args:
 91 |             cfg:  config
 92 |             rsc:  Resource object
 93 |         """
 94 |         super().__init__()
 95 |         self.cfg = cfg
 96 |         self.rsc = rsc
 97 |         self.conv_layer = ConvLayer(cfg, rsc)
 98 |         self.hidden_layer_pos = HiddenLayer(cfg, rsc, len(self.conv_layer.convs), is_spc=False)
 99 |         self.hidden_layer_spc = HiddenLayer(cfg, rsc, len(self.conv_layer.convs), is_spc=True)
100 | 
101 |     def forward(self, *inputs):
102 |         contexts, left_spc_masks, right_spc_masks = inputs
103 |         features_pos = self.conv_layer(contexts, left_spc_masks, right_spc_masks)
104 |         features_spc = self.conv_layer(contexts, None, None)
105 |         logits_pos = self.hidden_layer_pos(features_pos)
106 |         logits_spc = self.hidden_layer_spc(features_spc)
107 |         return logits_pos, logits_spc
108 | 
109 |     def save(self, path: str):
110 |         """
111 |         모델을 저장하는 메소드
112 |         Args:
113 |             path:  경로
114 |         """
115 |         torch.save(self.state_dict(), path)
116 | 
117 |     def load(self, path: str):
118 |         """
119 |         저장된 모델을 로드하는 메소드
120 |         Args:
121 |             path:  경로
122 |             conv_layer:  convolution layer
123 |         """
124 |         state_dict = torch.load(path, map_location=lambda storage, loc: storage)
125 |         self.load_state_dict(state_dict)
126 |         if torch.cuda.is_available() and self.cfg.gpu_num >= 0:
127 |             self.cuda(device=self.cfg.gpu_num)
128 | 


--------------------------------------------------------------------------------
/src/main/python/khaiii/train/tagger.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | """
 5 | part-of-speech tagger
 6 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 7 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 8 | """
 9 | 
10 | 
11 | ###########
12 | # imports #
13 | ###########
14 | from argparse import Namespace
15 | import json
16 | import logging
17 | import re
18 | 
19 | import torch.nn.functional as F
20 | 
21 | from khaiii.resource.resource import Resource
22 | from khaiii.train.dataset import PosSentTensor
23 | from khaiii.train.models import Model
24 | 
25 | 
26 | #########
27 | # types #
28 | #########
class PosTagger:
    """
    part-of-speech tagger
    """
    def __init__(self, model_dir: str, gpu_num: int = -1):
        """
        Args:
            model_dir:  model dir (must contain `config.json` and `model.state`)
            gpu_num:  GPU number to override
        """
        # use a context manager so the config file handle is closed deterministically
        with open('{}/config.json'.format(model_dir), 'r', encoding='UTF-8') as fin:
            cfg_dict = json.load(fin)
        self.cfg = Namespace()
        for key, val in cfg_dict.items():
            setattr(self.cfg, key, val)
        # the GPU number given by the caller always wins over the one stored in the config
        setattr(self.cfg, 'gpu_num', gpu_num)
        self.rsc = Resource(self.cfg)
        self.model = Model(self.cfg, self.rsc)
        self.model.load('{}/model.state'.format(model_dir))
        self.model.eval()

    def tag_raw(self, raw_sent: str, enable_restore: bool = True) -> PosSentTensor:
        """
        part-of-speech tagging at raw sentence
        Args:
            raw_sent:  raw input sentence
            enable_restore:  when False, None is passed to set_pos_result() instead of the
                             restore dictionary
        Returns:
            PosSentTensor object
        """
        pos_sent = PosSentTensor(raw_sent)
        contexts = pos_sent.get_contexts(self.cfg, self.rsc)
        left_spc_masks, right_spc_masks = pos_sent.get_spc_masks(self.cfg, self.rsc, False)
        # the second output of the model (spacing scores) is not used for tagging
        outputs, _ = self.model(PosSentTensor.to_tensor(contexts, self.cfg.gpu_num),    # pylint: disable=no-member
                                PosSentTensor.to_tensor(left_spc_masks, self.cfg.gpu_num),    # pylint: disable=no-member
                                PosSentTensor.to_tensor(right_spc_masks, self.cfg.gpu_num))    # pylint: disable=no-member
        _, predicts = F.softmax(outputs, dim=1).max(1)
        tags = [self.rsc.vocab_out[t.item()] for t in predicts]
        pos_sent.set_pos_result(tags, self.rsc.restore_dic if enable_restore else None)

        if logging.getLogger().isEnabledFor(logging.DEBUG):
            raw_nospc = re.sub(r'\s+', '', raw_sent)
            for idx, (tag, pred) in enumerate(zip(tags, predicts)):
                # each `pred` is a 0-dim tensor: read it with .item() (as done above for
                # `tags`), indexing it with [0] is not valid for 0-dim tensors
                logging.debug('[%2d]%s: %5s(%d)', idx, raw_nospc[idx], tag, pred.item())

        return pos_sent
73 | 


--------------------------------------------------------------------------------
/src/main/python/setup.py.in:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | """
 5 | @CPACK_PACKAGE_DESCRIPTION_SUMMARY@
 6 | 
 7 | __version__ = '@KHAIII_VERSION@'
 8 | __author__ = '@CPACK_PACKAGE_VENDOR@'
 9 | __copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.'
10 | __license__ = 'Apache 2.0'
11 | __maintainer__ = 'Jamie'
12 | __email__ = 'jamie.lim@kakaocorp.com'
13 | """
14 | 
15 | 
16 | ###########
17 | # imports #
18 | ###########
19 | from distutils.command.build import build
20 | import os
21 | import shutil
22 | import subprocess
23 | import zipfile
24 | 
25 | from setuptools import setup
26 | 
27 | 
28 | #############
29 | # constants #
30 | #############
31 | _SRC_NAME = '@CPACK_SOURCE_PACKAGE_FILE_NAME@'
32 | 
33 | 
34 | #########
35 | # types #
36 | #########
class CustomBuild(build):
    """
    custom handler for 'build' command
    """
    def run(self):
        """
        Extract the bundled source archive, build it with cmake/make, copy the resulting
        `lib` and `share` directories into the `khaiii` package, remove the extracted
        sources and finally run the regular build command.
        """
        archive_name = '{}.zip'.format(_SRC_NAME)
        with zipfile.ZipFile(archive_name, 'r') as src_zip:
            src_zip.extractall()

        build_dir = '{}/build'.format(_SRC_NAME)
        os.makedirs(build_dir, exist_ok=True)
        for command in ('cmake ..', 'make all resource'):
            subprocess.check_call(command, cwd=build_dir, shell=True)

        # (built directory, destination inside the python package)
        copy_plan = (
            ('{}/lib'.format(build_dir), 'khaiii/lib'),
            ('{}/share'.format(build_dir), 'khaiii/share'),
        )
        for built, target in copy_plan:
            # drop whatever a previous build left behind before copying the fresh output
            shutil.rmtree(target, ignore_errors=True)
            shutil.copytree(built, target)

        shutil.rmtree(_SRC_NAME)
        build.run(self)
57 | 
58 | 
59 | #############
60 | # functions #
61 | #############
def readme():
    """
    read content from README.md file
    Returns:
        long description (content of README.md)
    """
    # the context manager closes the file handle instead of leaving it to the garbage collector
    with open('@CMAKE_SOURCE_DIR@/README.md', 'r', encoding='UTF-8') as fin:
        return fin.read()
69 | 
70 | 
71 | #########
72 | # setup #
73 | #########
# NOTE: classifiers must be taken verbatim from the official trove classifier list;
# 'Development Status :: 5 - Stable' and 'License :: OSI Approved :: Apache 2.0' are not
# on that list, so their official spellings are used below.
setup(
    name='khaiii',
    version='@KHAIII_VERSION@',
    description='@CPACK_PACKAGE_DESCRIPTION_SUMMARY@',
    long_description=readme(),
    url='https://github.com/kakao/khaiii',
    author='@CPACK_PACKAGE_VENDOR@',
    author_email='jamie.lim@kakaocorp.com',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3',
    ],
    license='Apache 2.0',
    packages=['khaiii', ],
    include_package_data=True,
    install_requires=[],
    setup_requires=['pytest-runner', ],
    tests_require=['pytest', ],
    zip_safe=False,
    # the custom build command compiles the native library before the regular build runs
    cmdclass={'build': CustomBuild}
)
96 | 


--------------------------------------------------------------------------------
/src/test/cpp/khaiii/ErrPatchTest.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | 
  8 | //////////////
  9 | // includes //
 10 | //////////////
 11 | #include <memory>
 12 | #include <string>
 13 | #include <vector>
 14 | 
 15 | #include "cxxopts.hpp"
 16 | #include "gtest/gtest.h"
 17 | #include "spdlog/spdlog.h"
 18 | 
 19 | #include "khaiii/ErrPatch.hpp"
 20 | #include "khaiii/KhaiiiApi.hpp"
 21 | #include "khaiii/Word.hpp"
 22 | #include "khaiii/util.hpp"
 23 | 
 24 | 
 25 | ///////////////
 26 | // variables //
 27 | ///////////////
 28 | extern cxxopts::ParseResult* prog_args;    // arguments passed to main program
 29 | 
 30 | 
 31 | namespace khaiii {
 32 | 
 33 | 
 34 | using std::make_shared;
 35 | using std::ostringstream;
 36 | using std::pair;
 37 | using std::shared_ptr;
 38 | using std::string;
 39 | using std::vector;
 40 | using std::wstring;
 41 | 
 42 | 
 43 | //////////////////
 44 | // test fixture //
 45 | //////////////////
 46 | class ErrPatchTest: public testing::Test {
 47 |  public:
 48 |     virtual void SetUp() {
 49 |         std::string rsc_dir = (*prog_args)["rsc-dir"].as<string>();
 50 |         ASSERT_NO_THROW(_khaiii_api->open(rsc_dir, "{\"errpatch\": false}"));
 51 |     }
 52 | 
 53 |     virtual void TearDown() {
 54 |         ASSERT_NO_THROW(_khaiii_api->close());
 55 |     }
 56 | 
 57 |  protected:
 58 |     static shared_ptr<spdlog::logger> _log;    ///< logger
 59 | 
 60 |     shared_ptr<KhaiiiApi> _khaiii_api = KhaiiiApi::create();
 61 | 
 62 |     void _check(string raw, string left, string right) {
 63 |         auto bfr = _khaiii_api->analyze(raw.c_str(), "{\"errpatch\": false}");
 64 |         string bfr_str = _to_str(bfr);
 65 |         if (left != bfr_str) {
 66 |             _log->warn("error not found: '{}' => E:'{}' vs A:'{}'", raw, left, bfr_str);
 67 |             return;
 68 |         }
 69 |         auto aft = _khaiii_api->analyze(raw.c_str(), "{\"errpatch\": true}");
 70 |         EXPECT_STREQ(right.c_str(), _to_str(aft).c_str());
 71 |     }
 72 | 
 73 |     string _to_str(const khaiii_word_t* results) {
 74 |         ostringstream oss;
 75 |         for (auto word = results; word != nullptr; word = word->next) {
 76 |             if (word != results) oss << " + _ + ";
 77 |             const khaiii_morph_t* morphs = word->morphs;
 78 |             for (auto morph = morphs; morph != nullptr; morph = morph->next) {
 79 |                 if (morph != morphs) oss << " + ";
 80 |                 oss << morph->lex << "/" << morph->tag;
 81 |             }
 82 |         }
 83 |         return oss.str();
 84 |     }
 85 | };
 86 | 
 87 | 
 88 | shared_ptr<spdlog::logger> ErrPatchTest::_log = spdlog::stderr_color_mt("ErrPatchTest");
 89 | 
 90 | 
 91 | ////////////////
 92 | // test cases //
 93 | ////////////////
 94 | TEST_F(ErrPatchTest, apply) {
 95 |     // for base model
 96 |     _check("지저스크라이스트", "지저스크라이스/NNP + 트/NNG", "지저스/NNP + 크라이스트/NNP");
 97 |     _check("지저스 크라이스트", "지저스/NNP + _ + 크라이스/NNP + 트/NNG",
 98 |            "지저스/NNP + _ + 크라이스트/NNP");
 99 |     _check("고타마싯다르타", "고타마싯다르타/NNP", "고타마/NNP + 싯다르타/NNP");
100 |     _check("무함마드압둘라", "무함마드압/NNP + 둘/NR + 라/NNP", "무함마드/NNP + 압둘라/NNP");
101 | }
102 | 
103 | 
104 | }    // namespace khaiii
105 | 


--------------------------------------------------------------------------------
/src/test/cpp/khaiii/KhaiiiApiTest.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | #ifndef SRC_TEST_CPP_KHAIII_KHAIIIAPITEST_HPP_
 8 | #define SRC_TEST_CPP_KHAIII_KHAIIIAPITEST_HPP_
 9 | 
10 | 
11 | 
12 | //////////////
13 | // includes //
14 | //////////////
15 | #include <string>
16 | 
17 | #include "gtest/gtest.h"
18 | 
19 | #include "khaiii/khaiii_api.h"
20 | 
21 | 
22 | //////////////////
23 | // test fixture //
24 | //////////////////
25 | class KhaiiiApiTest: public testing::Test {
26 |  public:
27 |     virtual void SetUp();    ///< set up
28 |     virtual void TearDown();    ///< tear down
29 | 
30 |  protected:
31 |     int _handle = -1;    ///< 핸들
32 | 
33 |     /**
34 |      * 어절의 분석 결과를 비교하기위한 함수 (포지션 정보 포함)
35 |      * @param  expected  기대하는 결과 문자열. 예: "[1:7]\t안녕/IC[1:6] + ?/SF[7:1]"
36 |      * @param  actual  실제 어절 결과
37 |      */
38 |     void _expect_eq_word(std::string expected, const khaiii_word_t& actual) const;
39 | 
40 |     /**
41 |      * 어절의 분석 결과 중 형태소 부분만을 비교하기 위한 함수
42 |      * @param  expected  기대하는 결과 문자열. 예: "안녕/IC + ?/SF"
43 |      * @param  actual  실제 어절 결과
44 |      */
45 |     void _expect_eq_morphs(std::string expected, const khaiii_word_t& actual) const;
46 | };
47 | 
48 | 
49 | #endif    // SRC_TEST_CPP_KHAIII_KHAIIIAPITEST_HPP_
50 | 


--------------------------------------------------------------------------------
/src/test/cpp/khaiii/KhaiiiDevTest.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | 
 8 | //////////////
 9 | // includes //
10 | //////////////
11 | #include <array>
12 | 
13 | #include "khaiii/khaiii_dev.h"
14 | 
15 | #include "khaiii/ErrPatch.hpp"
16 | #include "khaiii/KhaiiiApiTest.hpp"
17 | 
18 | 
19 | using std::array;
20 | using std::string;
21 | 
22 | 
23 | //////////////////
24 | // test fixture //
25 | //////////////////
26 | class KhaiiiDevTest: public KhaiiiApiTest {};
27 | 
28 | 
29 | ////////////////
30 | // test cases //
31 | ////////////////
32 | TEST_F(KhaiiiDevTest, analyze_bfr_errorpatch) {
33 |     array<int16_t, 13> output;
34 |     EXPECT_EQ(13, khaiii_analyze_bfr_errpatch(_handle, u8"진정한 테스트입니다.", "", &output[0]));
35 |     EXPECT_EQ(khaiii::ErrPatch::SENT_DELIM_NUM, output[0]);    // bos/eos
36 |     EXPECT_EQ(khaiii::ErrPatch::WORD_DELIM_NUM, output[4]);    // bow/eow
37 |     EXPECT_EQ(khaiii::ErrPatch::SENT_DELIM_NUM, output[12]);    // bos/eos
38 | 
39 |     EXPECT_GT(0, khaiii_analyze_bfr_errpatch(-1, u8"", "", &output[0]));    // invalid handle
40 |     EXPECT_GT(0, khaiii_analyze_bfr_errpatch(_handle, nullptr, "", &output[0]));    // null input
41 |     EXPECT_GT(0, khaiii_analyze_bfr_errpatch(_handle, u8"", "", nullptr));    // null output
42 | }
43 | 
44 | 
45 | TEST_F(KhaiiiDevTest, set_log_level) {
46 |     EXPECT_EQ(0, khaiii_set_log_level("all", "trace"));
47 |     EXPECT_EQ(0, khaiii_set_log_level("all", "debug"));
48 |     EXPECT_EQ(0, khaiii_set_log_level("all", "info"));
49 |     EXPECT_EQ(0, khaiii_set_log_level("all", "warn"));
50 |     EXPECT_EQ(0, khaiii_set_log_level("all", "err"));
51 |     EXPECT_EQ(0, khaiii_set_log_level("all", "critical"));
52 | 
53 |     EXPECT_GT(0, khaiii_set_log_level(nullptr, "debug"));    // null logger
54 |     EXPECT_GT(0, khaiii_set_log_level("", "debug"));    // zero string logger
55 |     EXPECT_GT(0, khaiii_set_log_level("__invalid_logger__", "debug"));
56 |     EXPECT_GT(0, khaiii_set_log_level("Tagger", nullptr));    // null level
57 |     EXPECT_GT(0, khaiii_set_log_level("Tagger", ""));    // zero string level
58 |     EXPECT_GT(0, khaiii_set_log_level("Tagger", "__invalid_level__"));
59 | }
60 | 
61 | 
62 | TEST_F(KhaiiiDevTest, set_log_levels) {
63 |     EXPECT_EQ(0, khaiii_set_log_levels("all:warn,Tagger:info"));
64 |     EXPECT_EQ(0, khaiii_set_log_levels(""));    // zero name/level pair
65 | 
66 |     EXPECT_GT(0, khaiii_set_log_levels(nullptr));    // null name/level pair
67 |     EXPECT_GT(0, khaiii_set_log_levels("all,Tagger:info"));    // invalid format
68 | }
69 | 


--------------------------------------------------------------------------------
/src/test/cpp/khaiii/PreanalTest.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
  3 |  * @copyright  Copyright (C) 2018-, Kakao Corp. All rights reserved.
  4 |  */
  5 | 
  6 | 
  7 | 
  8 | //////////////
  9 | // includes //
 10 | //////////////
 11 | #include <memory>
 12 | #include <string>
 13 | 
 14 | #include "cxxopts.hpp"
 15 | #include "gtest/gtest.h"
 16 | 
 17 | #include "khaiii/Preanal.hpp"
 18 | #include "khaiii/Word.hpp"
 19 | 
 20 | 
 21 | ///////////////
 22 | // variables //
 23 | ///////////////
 24 | extern cxxopts::ParseResult *prog_args;    // arguments passed to main program
 25 | 
 26 | 
 27 | namespace khaiii {
 28 | 
 29 | 
 30 | using std::make_shared;
 31 | using std::shared_ptr;
 32 | using std::string;
 33 | using std::wstring;
 34 | 
 35 | 
 36 | //////////////////
 37 | // test fixture //
 38 | //////////////////
 39 | class PreanalTest: public testing::Test {
 40 |  public:
 41 |     virtual void SetUp() {
 42 |         std::string rsc_dir = (*prog_args)["rsc-dir"].as<string>();
 43 |         ASSERT_NO_THROW(_preanal.open(rsc_dir));
 44 |     }
 45 | 
 46 |     virtual void TearDown() {
 47 |         ASSERT_NO_THROW(_preanal.close());
 48 |     }
 49 | 
 50 |  protected:
 51 |     Preanal _preanal;
 52 | 
 53 |     inline shared_ptr<Word> _apply(wstring raw) {
 54 |         auto word = make_shared<Word>(raw.c_str(), raw.length());
 55 |         _preanal.apply(word);
 56 |         return word;
 57 |     }
 58 | };
 59 | 
 60 | 
 61 | ////////////////
 62 | // test cases //
 63 | ////////////////
 64 | TEST_F(PreanalTest, apply_exact) {
 65 |     // 어절 완전일치 엔트리 "이더리움"에 대해
 66 | 
 67 |     auto word1 = _apply(L"이더리움");    // 매칭
 68 |     EXPECT_LT(0, word1->char_tags[0]);
 69 |     EXPECT_LT(0, word1->char_tags[1]);
 70 |     EXPECT_LT(0, word1->char_tags[2]);
 71 |     EXPECT_LT(0, word1->char_tags[3]);
 72 | 
 73 |     auto word2 = _apply(L"이더리움을");    // 매칭 안됨
 74 |     EXPECT_EQ(0, word2->char_tags[0]);
 75 |     EXPECT_EQ(0, word2->char_tags[1]);
 76 |     EXPECT_EQ(0, word2->char_tags[2]);
 77 |     EXPECT_EQ(0, word2->char_tags[3]);
 78 |     EXPECT_EQ(0, word2->char_tags[4]);
 79 | 
 80 |     auto word3 = _apply(L"이더륨");    // 매칭 안됨
 81 |     EXPECT_EQ(0, word3->char_tags[0]);
 82 |     EXPECT_EQ(0, word3->char_tags[1]);
 83 |     EXPECT_EQ(0, word3->char_tags[2]);
 84 | 
 85 |     EXPECT_NO_THROW(_apply(L""));
 86 | }
 87 | 
 88 | 
 89 | TEST_F(PreanalTest, apply_prefix) {
 90 |     // 전망매칭 패턴 "가즈아*"에 대해
 91 | 
 92 |     auto word1 = _apply(L"가즈아~");    // 매칭
 93 |     EXPECT_LT(0, word1->char_tags[0]);
 94 |     EXPECT_LT(0, word1->char_tags[1]);
 95 |     EXPECT_LT(0, word1->char_tags[2]);
 96 |     EXPECT_EQ(0, word1->char_tags[3]);
 97 | 
 98 |     auto word2 = _apply(L"가즈아");    // 매칭
 99 |     EXPECT_LT(0, word2->char_tags[0]);
100 |     EXPECT_LT(0, word2->char_tags[1]);
101 |     EXPECT_LT(0, word2->char_tags[2]);
102 | 
103 |     auto word3 = _apply(L"가자");    // 매칭 안됨
104 |     EXPECT_EQ(0, word3->char_tags[0]);
105 |     EXPECT_EQ(0, word3->char_tags[1]);
106 | }
107 | 
108 | 
109 | }    // namespace khaiii
110 | 


--------------------------------------------------------------------------------
/src/test/cpp/test_main.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author  Jamie (jamie.lim@kakaocorp.com)
 3 |  * @copyright  Copyright (C) 2017-, Kakao Corp. All rights reserved.
 4 |  */
 5 | 
 6 | 
 7 | //////////////
 8 | // includes //
 9 | //////////////
10 | #include <iostream>
11 | 
12 | #include "cxxopts.hpp"
13 | #include "fmt/printf.h"
14 | #ifdef PROFILER
15 |     #include "gperftools/profiler.h"
16 | #endif
17 | #include "gtest/gtest.h"
18 | #include "spdlog/spdlog.h"
19 | 
20 | #include "khaiii/khaiii_dev.h"
21 | 
22 | 
23 | using std::cerr;
24 | using std::string;
25 | 
26 | 
27 | ///////////////
28 | // variables //
29 | ///////////////
30 | // global variable for program arguments
31 | cxxopts::ParseResult* prog_args;
32 | 
33 | 
34 | //////////
35 | // main //
36 | //////////
37 | int main(int argc, char** argv) {
38 |     cxxopts::Options options(argv[0], argv[0]);
39 |     testing::InitGoogleTest(&argc, argv);
40 |     auto _log = spdlog::stderr_color_mt("console");
41 |     spdlog::set_level(spdlog::level::warn);
42 | 
43 |     options.add_options()
44 |         ("h,help", "print this help")
45 |         ("rsc-dir", "resource directory", cxxopts::value<string>()->default_value("./share/khaiii"))
46 |         ("set-log", "set log level", cxxopts::value<string>()->default_value("all:warn"));
47 |     auto args = options.parse(argc, argv);
48 | 
49 |     if (args.count("help")) {
50 |         fmt::fprintf(cerr, "%s\n", options.help());
51 |         return 0;
52 |     }
53 |     prog_args = &args;
54 |     khaiii_set_log_levels(args["set-log"].as<string>().c_str());
55 | 
56 | #ifdef PROFILER
57 |     ProfilerStart("/tmp/test_khaiii.prof");
58 | #endif
59 | 
60 |     int ret = RUN_ALL_TESTS();
61 | 
62 | #ifdef PROFILER
63 |     ProfilerStop();
64 | #endif
65 | 
66 |     return ret;
67 | }
68 | 


--------------------------------------------------------------------------------
/src/test/python/test_khaiii/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kakao/khaiii/3d0c8944374163b1107fd6690ccf3a408430a02d/src/test/python/test_khaiii/__init__.py


--------------------------------------------------------------------------------
/src/test/python/test_khaiii/test_khaiii.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | khaiii tests
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2018-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | import unittest
 16 | 
 17 | import khaiii    # pylint: disable=import-error
 18 | from khaiii import KhaiiiExcept    # pylint: disable=import-error
 19 | 
 20 | 
 21 | #########
 22 | # tests #
 23 | #########
 24 | class TestKhaiii(unittest.TestCase):
 25 |     """
 26 |     khaiii tests
 27 |     """
 28 |     def setUp(self):
 29 |         self._api = khaiii.KhaiiiApi()
 30 |         self._api.set_log_level('all', 'warn')
 31 | 
 32 |     def tearDown(self):
 33 |         self._api.close()
 34 | 
 35 |     def test_version(self):
 36 |         """
 37 |         test version() api
 38 |         """
 39 |         self.assertRegex(self._api.version(), r'^\d+\.\d+(\.\d+)?$')
 40 | 
 41 |     def test_open(self):
 42 |         """
 43 |         test open() api
 44 |         """
 45 |         try:
 46 |             self._api.open()
 47 |         except KhaiiiExcept as khaiii_exc:
 48 |             self.fail(khaiii_exc)
 49 |         with self.assertRaises(KhaiiiExcept):
 50 |             self._api.open('/not/existing/dir')
 51 |         with self.assertRaises(KhaiiiExcept):
 52 |             self._api.open('', 'invalid option')
 53 | 
 54 |     def test_analyze(self):
 55 |         """
 56 |         test analyze() api
 57 |         """
 58 |         try:
 59 |             words = self._api.analyze('안녕? 반가워!')
 60 |             self.assertEqual(len(words), 2)
 61 |             self.assertEqual(len(words[0].morphs), 2)
 62 |             self.assertEqual(words[0].morphs[0].lex, '안녕')
 63 |             self.assertEqual(words[0].morphs[0].tag, 'IC')
 64 |             self.assertEqual(words[0].morphs[1].lex, '?')
 65 |             self.assertEqual(words[0].morphs[1].tag, 'SF')
 66 |             self.assertEqual(len(words[1].morphs), 3)
 67 |             self.assertEqual(words[1].morphs[0].lex, '반갑')
 68 |             self.assertEqual(words[1].morphs[0].tag, 'VA')
 69 |             self.assertEqual(words[1].morphs[1].lex, '어')
 70 |             self.assertEqual(words[1].morphs[1].tag, 'EF')
 71 |             self.assertEqual(words[1].morphs[2].lex, '!')
 72 |             self.assertEqual(words[1].morphs[2].tag, 'SF')
 73 |         except KhaiiiExcept as khaiii_exc:
 74 |             self.fail(khaiii_exc)
 75 | 
 76 |     def test_analyze_bfr_errpatch(self):
 77 |         """
 78 |         test analyze_bfr_errpatch() api
 79 |         """
 80 |         try:
 81 |             results = self._api.analyze_bfr_errpatch('테스트')
 82 |             self.assertEqual(len(results), len('테스트') + 2)
 83 |         except KhaiiiExcept as khaiii_exc:
 84 |             self.fail(khaiii_exc)
 85 | 
 86 |     def test_set_log_level(self):
 87 |         """
 88 |         test set_log_level() api
 89 |         """
 90 |         try:
 91 |             self._api.set_log_level('all', 'info')
 92 |         except KhaiiiExcept as khaiii_exc:
 93 |             self.fail(khaiii_exc)
 94 |         with self.assertRaises(KhaiiiExcept):
 95 |             self._api.set_log_level('all', 'not_existing_level')
 96 | 
 97 | 
 98 | ########
 99 | # main #
100 | ########
if __name__ == '__main__':
    # Executed directly (not imported): let unittest discover and run the tests in this module.
    unittest.main()
103 | 


--------------------------------------------------------------------------------
/train/eval.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | khaiii 출력 형태의 두 파일을 읽어들여 f-score를 측정
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from argparse import ArgumentParser, Namespace
 16 | from collections import Counter
 17 | import logging
 18 | import sys
 19 | from typing import Iterator, Set, Tuple
 20 | 
 21 | 
 22 | #############
 23 | # functions #
 24 | #############
 25 | def _load(path: str) -> Iterator[Tuple[str, str]]:
 26 |     """
 27 |     파일을 읽어들여 (어절, 형태소)를 리턴하는 제너레이터
 28 |     Args:
 29 |         path:  file path
 30 |     Yields:
 31 |         word
 32 |         morphs
 33 |     """
 34 |     for line in open(path, 'r', encoding='UTF-8'):
 35 |         line = line.rstrip('\r\n')
 36 |         if not line:
 37 |             yield '', ''
 38 |             continue
 39 |         word, morphs = line.split('\t')
 40 |         yield word, morphs
 41 | 
 42 | 
 43 | def _morphs_to_set(morphs: str) -> Set[Tuple[str, int]]:
 44 |     """
 45 |     make set from morpheme string
 46 |     Args:
 47 |         morphs:  morpheme string
 48 |     Returns:
 49 |         morphemes set
 50 |     """
 51 |     morph_cnt = Counter([m for m in morphs.split(' + ')])
 52 |     morph_set = set()
 53 |     for morph, freq in morph_cnt.items():
 54 |         if freq == 1:
 55 |             morph_set.add(morph)
 56 |         else:
 57 |             morph_set.update([(morph, i) for i in range(freq)])
 58 |     return morph_set
 59 | 
 60 | 
 61 | def _count(cnt: Counter, gold: str, pred: str):
 62 |     """
 63 |     count gold and pred morphemes
 64 |     Args:
 65 |         cnt:  Counter object
 66 |         gold:  gold standard morphemes
 67 |         pred:  prediction morphemes
 68 |     """
 69 |     gold_set = _morphs_to_set(gold)
 70 |     pred_set = _morphs_to_set(pred)
 71 |     cnt['gold'] += len(gold_set)
 72 |     cnt['pred'] += len(pred_set)
 73 |     cnt['match'] += len(gold_set & pred_set)
 74 | 
 75 | 
 76 | def _report(cnt: Counter):
 77 |     """
 78 |     report metric
 79 |     Args:
 80 |         cnt:  Counter object
 81 |     """
 82 |     precision = 100 * cnt['match'] / cnt['pred']
 83 |     recall = 100 * cnt['match'] / cnt['gold']
 84 |     f_score = 2 * precision * recall / (precision + recall)
 85 |     print(f'precision: {precision:.2f}')
 86 |     print(f'recall: {recall:.2f}')
 87 |     print(f'f-score: {f_score:.2f}')
 88 | 
 89 | 
 90 | def run(args: Namespace):
 91 |     """
 92 |     run function which is the start point of program
 93 |     Args:
 94 |         args:  program arguments
 95 |     """
 96 |     cnt = Counter()
 97 |     for line_num, (gold, pred) in enumerate(zip(_load(args.gold), _load(args.pred)), start=1):
 98 |         word_gold, morphs_gold = gold
 99 |         word_pred, morphs_pred = pred
100 |         if word_gold != word_pred:
101 |             raise ValueError(f'invalid align at {line_num}: {word_gold} vs {word_pred}')
102 |         if not word_gold or not word_pred:
103 |             continue
104 |         _count(cnt, morphs_gold, morphs_pred)
105 |     _report(cnt)
106 | 
107 | 
108 | ########
109 | # main #
110 | ########
def main():
    """
    main function processes only argument parsing
    When --output is given, stdout is redirected to that file while run() executes;
    afterwards the file is closed and the previous stdout is restored.
    """
    # the description used to be copied from the tagger demo; describe this tool instead
    parser = ArgumentParser(
        description='measure f-score between gold standard and prediction files')
    parser.add_argument('-g', '--gold', help='gold standard file', metavar='FILE', required=True)
    parser.add_argument('-p', '--pred', help='prediction file', metavar='FILE', required=True)
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if not args.output:
        run(args)
        return

    prev_stdout = sys.stdout
    with open(args.output, 'w', encoding='UTF-8') as fout:
        sys.stdout = fout
        try:
            run(args)
        finally:
            # restore stdout even when run() raises, so later output is not lost
            sys.stdout = prev_stdout


if __name__ == '__main__':
    main()
134 | 


--------------------------------------------------------------------------------
/train/hd_validate_errpatch.bash:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | set -e -u
  3 | 
  4 | 
  5 | #############
  6 | # functions #
  7 | #############
  8 | function print_usage() {
  9 |     local msg=$1
 10 |     echo "Usage: $(basename $0) [options]"
 11 |     echo "Options:"
 12 |     echo "  -h, --help        show this help message and exit"
 13 |     echo "  -i FILE           input file"
 14 |     echo "  -c DIR            corpus dir"
 15 |     echo "  --rsc-src=DIR     <default: ../rsc/src>"
 16 |     echo "  --lib-path=FILE   <default: ../build/lib/libkhaiii.so>"
 17 |     echo "  --rsc-dir=DIR     <default: ../build/share/khaiii>"
 18 |     echo "  --num-mapper=NUM  <default: 1000>"
 19 |     if [ -z "${msg}" ]; then
 20 |         exit 0
 21 |     else
 22 |         echo
 23 |         echo "${msg}"
 24 |         exit 1
 25 |     fi
 26 | }
 27 | 
 28 | 
 29 | function abspath() {
 30 |     python3 -c "import os, sys; print(os.path.abspath(sys.argv[1]))" $1
 31 | }
 32 | 
 33 | 
 34 | function parse_args() {
 35 |     INPUT_FILE=""
 36 |     CORPUS_DIR=""
 37 |     LIB_PATH=""
 38 |     RSC_DIR=""
 39 |     RSC_SRC=""
 40 |     NUM_MAPPER=""
 41 | 
 42 |     while [[ $# -ge 1 ]]; do
 43 |         case $1 in
 44 |             -h|--help)
 45 |                 print_usage ""
 46 |                 ;;
 47 |             -i)
 48 |                 INPUT_FILE="$2"
 49 |                 shift
 50 |                 ;;
 51 |             -c)
 52 |                 CORPUS_DIR="$2"
 53 |                 shift
 54 |                 ;;
 55 |             --rsc-src)
 56 |                 RSC_SRC="$2"
 57 |                 shift
 58 |                 ;;
 59 |             --lib-path)
 60 |                 LIB_PATH="$2"
 61 |                 shift
 62 |                 ;;
 63 |             --rsc-dir)
 64 |                 RSC_DIR="$2"
 65 |                 shift
 66 |                 ;;
 67 |             --num-mapper)
 68 |                 NUM_MAPPER="$2"
 69 |                 shift
 70 |                 ;;
 71 |             --) break ;;
 72 |         esac
 73 |         shift
 74 |     done
 75 | 
 76 |     # input file 검사
 77 |     if [ -z "${INPUT_FILE}" ]; then
 78 |         print_usage "no input file"
 79 |     fi
 80 | 
 81 |     # corpus dir 검사
 82 |     if [ -z "${CORPUS_DIR}" ]; then
 83 |         print_usage "no corpus dir"
 84 |     fi
 85 | 
 86 |     if [ -z "${RSC_SRC}" ]; then
 87 |         RSC_SRC=../rsc/src
 88 |     fi
 89 |     if [ -z "${LIB_PATH}" ]; then
 90 |         LIB_PATH=../build/lib/libkhaiii.so
 91 |     fi
 92 |     if [ -z "${RSC_DIR}" ]; then
 93 |         RSC_DIR=../build/share/khaiii
 94 |     fi
 95 |     if [ -z "${NUM_MAPPER}" ]; then
 96 |         NUM_MAPPER=1000
 97 |     fi
 98 | 
 99 |     INPUT_FILE=$(abspath ${INPUT_FILE})
100 |     LIB_PATH=$(abspath ${LIB_PATH})
101 |     RSC_DIR=$(abspath ${RSC_DIR})
102 |     RSC_SRC=$(abspath ${RSC_SRC})
103 |     CORPUS_DIR=$(abspath ${CORPUS_DIR})
104 | }
105 | 
106 | 
# Define the directory names shared by the remaining steps of this script.
# The same names are used both for local directories and for paths passed to `hadoop fs`.
function init_envs() {
    # global variables
    INPUT_DIR=errpatch.in      # split input parts
    OUTPUT_DIR=errpatch.out    # output of the streaming job
    CACHE_DIR=errpatch.cache   # files uploaded for the mapper (python package, library, resources, corpus)
}
113 | 
114 | 
# Shuffle the input file, split it into parts under ${INPUT_DIR} and upload that
# directory with `hadoop fs -put`, replacing any existing copy.
# Reads the globals INPUT_FILE, NUM_MAPPER and INPUT_DIR.
function split_input() {
    >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{"

    local total_line
    total_line=$(wc -l < "${INPUT_FILE}")
    local line_per_split=$((total_line / NUM_MAPPER))
    # an input with fewer lines than NUM_MAPPER would give 0 here, and `split -l 0` fails;
    # fall back to one line per part
    if [[ ${line_per_split} -lt 1 ]]; then
        line_per_split=1
    fi
    rm -rf "${INPUT_DIR}"
    mkdir -p "${INPUT_DIR}"
    shuf "${INPUT_FILE}" | split -d -a 5 -l "${line_per_split}" - "${INPUT_DIR}/part-"

    # remove a previous upload, if any, before putting the new parts
    hadoop fs -test -e "${INPUT_DIR}" && hadoop fs -rm -skipTrash -r "${INPUT_DIR}"
    hadoop fs -put "${INPUT_DIR}"

    >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}"
}
130 | 
131 | 
# Upload everything the mapper needs into ${CACHE_DIR} with `hadoop fs`:
# the khaiii python package, the shared library, the resource dir, the resource
# sources and the corpus *.txt files. An existing ${CACHE_DIR} is removed first.
# Reads the globals CACHE_DIR, LIB_PATH, RSC_DIR, RSC_SRC and CORPUS_DIR.
# All expansions are quoted so that user-supplied paths containing spaces or glob
# characters are passed through unchanged.
function cache_files() {
    >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{"

    hadoop fs -test -e "${CACHE_DIR}" && hadoop fs -rm -skipTrash -r "${CACHE_DIR}"
    hadoop fs -mkdir -p "${CACHE_DIR}"

    hadoop fs -put ../src/main/python/khaiii "${CACHE_DIR}"
    hadoop fs -mkdir -p "${CACHE_DIR}/khaiii/lib"
    hadoop fs -put "${LIB_PATH}" "${CACHE_DIR}/khaiii/lib"

    hadoop fs -mkdir -p "${CACHE_DIR}/khaiii/share"
    hadoop fs -put "${RSC_DIR}" "${CACHE_DIR}/khaiii/share/khaiii"

    hadoop fs -put "${RSC_SRC}" "${CACHE_DIR}/rsc_src"

    hadoop fs -mkdir -p "${CACHE_DIR}/corpus"
    # the glob stays outside the quotes so that it is still expanded by the shell
    hadoop fs -put "${CORPUS_DIR}"/*.txt "${CACHE_DIR}/corpus"

    >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}"
}
152 | 
153 | 
# Run validate_errpatch.py as the mapper of a hadoop streaming job over ${INPUT_DIR}
# and collect the job output into "errpatch.valid" next to the input file.
# Reads the globals OUTPUT_DIR, INPUT_DIR, CACHE_DIR, INPUT_FILE and the environment
# variable HADOOP_HOME.
function run_hadoop() {
    >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{"

    # remove the output dir of a previous run, if it exists
    hadoop fs -test -e ${OUTPUT_DIR} && hadoop fs -rm -skipTrash -r ${OUTPUT_DIR}
    # the number of reduce tasks is set to 0; the cache dir is passed via -cacheFile under
    # its own name and added to PYTHONPATH for the mapper
    yarn jar ${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-streaming-*.jar \
        -D mapred.job.name=validate_errpatch \
        -D mapred.reduce.tasks=0 \
        -cmdenv PYTHONPATH="./${CACHE_DIR}" \
        -file ./validate_errpatch.py \
        -input "${INPUT_DIR}" \
        -output "${OUTPUT_DIR}" \
        -cacheFile "${CACHE_DIR}#${CACHE_DIR}" \
        -mapper "./validate_errpatch.py -c ./${CACHE_DIR}/corpus --rsc-src ./${CACHE_DIR}/rsc_src"

    # concatenate all output parts into a single local file beside the input file
    hadoop fs -text ${OUTPUT_DIR}/part-* > "$(dirname ${INPUT_FILE})/errpatch.valid"

    >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}"
}
172 | 
173 | 
# Delete the temporary data created by the previous steps: the input, output and
# cache directories via `hadoop fs -rm`, and the local ${INPUT_DIR}.
function del_temp() {
    >&2 echo "{{{{{{{{{{ ${FUNCNAME[0]} {{{{{{{{{{"

    hadoop fs -rm -skipTrash -r ${INPUT_DIR} ${OUTPUT_DIR} ${CACHE_DIR}
    # local copy of the split input parts
    rm -rf ${INPUT_DIR}

    >&2 echo "}}}}}}}}}} ${FUNCNAME[0]} }}}}}}}}}}"
}
182 | 
183 | 
184 | ########
185 | # main #
186 | ########
# "$@" (quoted) forwards every argument unchanged; unquoted $@ would re-split
# arguments that contain spaces.
parse_args "$@"
# the remaining steps use paths relative to the directory of this script
cd "$(dirname "$0")"
init_envs

split_input
cache_files
run_hadoop
del_temp
195 | 


--------------------------------------------------------------------------------
/train/make_vocab.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | 입력(음절) 및 출력(태그) vocabulary를 생성한다.
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser, Namespace
16 | from collections import Counter
17 | import logging
18 | import os
19 | import sys
20 | from typing import TextIO
21 | 
22 | from khaiii.resource.morphs import TAGS
23 | 
24 | 
25 | #############
26 | # functions #
27 | #############
28 | def _print(cnt: Counter, fout: TextIO, is_with_freq: bool = True):
29 |     """
30 |     vocabulary 사전을 출력한다.
31 |     Args:
32 |         cnt:  Counter object
33 |         fout:  출력 파일
34 |         is_with_freq:  빈도를 함께 출력할 지 여부
35 |     """
36 |     for char, freq in sorted(cnt.items(), key=lambda x: x[0]):
37 |         if is_with_freq and freq < 2:
38 |             continue
39 |         if is_with_freq:
40 |             print('{}\t{}'.format(char, freq), file=fout)
41 |         else:
42 |             print(char, file=fout)
43 | 
44 | 
def run(args: Namespace):
    """
    run function which is the start point of program
    Reads 'raw<TAB>tagged' lines from stdin, counts the characters of the raw part and
    the output tags whose suffix (after the first two characters) is not in TAGS, then
    writes vocab.in and vocab.out under args.rsc_src.
    Args:
        args:  program arguments
    """
    char_freqs = Counter()
    extra_tag_freqs = Counter()
    for line_num, raw_line in enumerate(sys.stdin, start=1):
        if line_num % 1000000 == 0:
            logging.info('%dm-th line', line_num // 1000000)
        stripped = raw_line.rstrip('\r\n')
        if stripped:
            raw, tagged = stripped.split('\t')
            char_freqs.update(raw)
            extra_tag_freqs.update(tag for tag in tagged.split() if tag[2:] not in TAGS)

    os.makedirs(args.rsc_src, exist_ok=True)
    vocab_in_path = '{}/vocab.in'.format(args.rsc_src)
    vocab_out_path = '{}/vocab.out'.format(args.rsc_src)
    with open(vocab_in_path, 'w', encoding='UTF-8') as fout:
        _print(char_freqs, fout)
    with open(vocab_out_path, 'w', encoding='UTF-8') as fout:
        # B-/I- variants of every tag in TAGS come first, then the remaining observed tags
        begin_tags = ['B-{}'.format(tag) for tag in TAGS]
        inside_tags = ['I-{}'.format(tag) for tag in TAGS]
        print('\n'.join(begin_tags), file=fout)
        print('\n'.join(inside_tags), file=fout)
        _print(extra_tag_freqs, fout, is_with_freq=False)
69 | 
70 | 
71 | ########
72 | # main #
73 | ########
def main():
    """
    Parse the command line, set up stdin redirection and logging, then call run().
    """
    parser = ArgumentParser(description='입력(음절) 및 출력(태그) vocabulary를 생성한다.')
    parser.add_argument(
        '--rsc-src',
        help='resource source dir <default: ../rsc/src>',
        metavar='DIR',
        default='../rsc/src',
    )
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    if args.input:
        # read from the given file instead of the real stdin
        sys.stdin = open(args.input, 'r', encoding='UTF-8')
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    run(args)


if __name__ == '__main__':
    main()
97 | 


--------------------------------------------------------------------------------
/train/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | torch==0.4.1
3 | tqdm
4 | 


--------------------------------------------------------------------------------
/train/split_corpus.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | """
  6 | 코퍼스를 train/dev/test로 분할한다.
  7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
  8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
  9 | """
 10 | 
 11 | 
 12 | ###########
 13 | # imports #
 14 | ###########
 15 | from argparse import ArgumentParser, Namespace
 16 | import logging
 17 | import random
 18 | import sys
 19 | from typing import Iterator, List, TextIO
 20 | 
 21 | 
 22 | #############
 23 | # functions #
 24 | #############
 25 | def _sents(fin: TextIO) -> Iterator[List[str]]:
 26 |     """
 27 |     read from file and yield a sentence (generator)
 28 |     Args:
 29 |         fin:  input file
 30 |     Yields:
 31 |         sentence (list of lines)
 32 |     """
 33 |     sent = []
 34 |     for line in fin:
 35 |         line = line.rstrip('\r\n')
 36 |         if not line:
 37 |             if sent:
 38 |                 yield sent
 39 |                 sent = []
 40 |             continue
 41 |         sent.append(line)
 42 |     if sent:
 43 |         yield sent
 44 | 
 45 | 
 46 | def _write_to_file(path: str, sents: List[List[str]]):
 47 |     """
 48 |     파일에 쓴다.
 49 |     Args:
 50 |         path:  path
 51 |         sents:  sentences
 52 |     """
 53 |     with open(path, 'w', encoding='UTF-8') as fout:
 54 |         for sent in sents:
 55 |             print('\n'.join(sent), file=fout)
 56 |             print(file=fout)
 57 | 
 58 | 
 59 | def run(args: Namespace):
 60 |     """
 61 |     run function which is the start point of program
 62 |     Args:
 63 |         args:  program arguments
 64 |     """
 65 |     sents = []
 66 |     for num, sent in enumerate(_sents(sys.stdin), start=1):
 67 |         if num % 100000 == 0:
 68 |             logging.info('%d00k-th sent..', num // 100000)
 69 |         sents.append(sent)
 70 |     random.shuffle(sents)
 71 |     _write_to_file('{}.dev'.format(args.out_pfx), sents[:args.dev])
 72 |     _write_to_file('{}.test'.format(args.out_pfx), sents[args.dev:args.dev+args.test])
 73 |     _write_to_file('{}.train'.format(args.out_pfx), sents[args.dev+args.test:])
 74 |     logging.info('dev / test / train: %d / %d / %d', args.dev, args.test,
 75 |                  len(sents[args.dev+args.test:]))
 76 | 
 77 | 
 78 | ########
 79 | # main #
 80 | ########
 81 | def main():
 82 |     """
 83 |     main function processes only argument parsing
 84 |     """
 85 |     parser = ArgumentParser(description='코퍼스를 train/dev/test로 분할한다.')
 86 |     parser.add_argument('-o', '--out-pfx', help='output file prefix', metavar='NAME', required=True)
 87 |     parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
 88 |     parser.add_argument('--dev', help='number of sentence in dev set', metavar='NUM', type=int,
 89 |                         default=5000)
 90 |     parser.add_argument('--test', help='number of sentence in test set', metavar='NUM', type=int,
 91 |                         default=5000)
 92 |     parser.add_argument('--debug', help='enable debug', action='store_true')
 93 |     args = parser.parse_args()
 94 | 
 95 |     if args.input:
 96 |         sys.stdin = open(args.input, 'r', encoding='UTF-8')
 97 |     if args.debug:
 98 |         logging.basicConfig(level=logging.DEBUG)
 99 |     else:
100 |         logging.basicConfig(level=logging.INFO)
101 | 
102 |     run(args)
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     main()
107 | 


--------------------------------------------------------------------------------
/train/tag.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | command line part-of-speech tagger demo
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser, Namespace
16 | import logging
17 | import sys
18 | 
19 | from khaiii.train.tagger import PosTagger
20 | 
21 | 
22 | #############
23 | # functions #
24 | #############
def run(args: Namespace):
    """
    run function which is the start point of program
    Tags every non-empty stdin line and prints one 'word<TAB>morph + morph ...' line per
    word, followed by an empty line. An empty input line produces just an empty line.
    Args:
        args:  program arguments (uses args.model_dir and args.gpu_num)
    """
    tagger = PosTagger(args.model_dir, args.gpu_num)
    for line_num, raw_line in enumerate(sys.stdin, start=1):
        if line_num % 100000 == 0:
            logging.info('%d00k-th line..', (line_num // 100000))
        text = raw_line.rstrip('\r\n')
        if text:
            tagged_sent = tagger.tag_raw(text)
            for tagged_word in tagged_sent.pos_tagged_words:
                print(tagged_word.raw, end='\t')
                print(' + '.join(map(str, tagged_word.pos_tagged_morphs)))
        # blank line: sentence terminator, or echo of an empty input line
        print()
44 | 
45 | 
46 | ########
47 | # main #
48 | ########
def main():
    """
    Parse the command line, set up stdin/stdout redirection and logging, then call run().
    """
    parser = ArgumentParser(description='command line part-of-speech tagger demo')
    parser.add_argument('-m', '--model-dir', help='model dir', metavar='DIR', required=True)
    parser.add_argument('--input', help='input file <default: stdin>', metavar='FILE')
    parser.add_argument('--output', help='output file <default: stdout>', metavar='FILE')
    parser.add_argument(
        '--gpu-num',
        help='GPU number to use <default: -1 for CPU>',
        metavar='INT',
        type=int,
        default=-1,
    )
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    if args.input:
        # read from the given file instead of the real stdin
        sys.stdin = open(args.input, 'r', encoding='UTF-8')
    if args.output:
        # write to the given file instead of the real stdout
        sys.stdout = open(args.output, 'w', encoding='UTF-8')
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    run(args)


if __name__ == '__main__':
    main()
76 | 


--------------------------------------------------------------------------------
/train/train.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | """
 6 | train part-of-speech model from data set
 7 | __author__ = 'Jamie (jamie.lim@kakaocorp.com)'
 8 | __copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
 9 | """
10 | 
11 | 
12 | ###########
13 | # imports #
14 | ###########
15 | from argparse import ArgumentParser, Namespace
16 | import logging
17 | 
18 | from khaiii.train.trainer import Trainer
19 | 
20 | 
21 | #############
22 | # functions #
23 | #############
def run(args: Namespace):
    """
    run function which is the start point of program
    Builds a Trainer from the arguments and starts training.
    Args:
        args:  program arguments (config)
    """
    trainer = Trainer(args)
    trainer.train()
31 | 
32 | 
33 | ########
34 | # main #
35 | ########
def main():
    """
    Parse the command line, configure logging, then call run().
    """
    # (flags, keyword arguments) for every option, in the order they are registered
    options = [
        (('-i', '--in-pfx'),
         dict(help='input data path prefix', metavar='NAME', required=True)),
        (('--rsc-src',),
         dict(help='resource source dir <default: ../rsc/src>', metavar='DIR',
              default='../rsc/src')),
        (('--logdir',),
         dict(help='tensorboard log dir <default: ./logdir>', metavar='DIR',
              default='./logdir')),
        (('--window',),
         dict(help='left/right character window length <default: 4>', metavar='INT',
              type=int, default=4)),
        (('--spc-dropout',),
         dict(help='space(word delimiter) dropout rate <default: 0.1>', metavar='REAL',
              type=float, default=0.1)),
        (('--cutoff',),
         dict(help='cutoff <default: 1>', metavar='INT', type=int, default=1)),
        (('--embed-dim',),
         dict(help='embedding dimension <default: 35>', metavar='INT', type=int,
              default=35)),
        (('--learning-rate',),
         dict(help='learning rate <default: 0.001>', metavar='REAL', type=float,
              default=0.001)),
        (('--lr-decay',),
         dict(help='learning rate decay <default: 0.9>', metavar='REAL', type=float,
              default=0.9)),
        (('--batch-size',),
         dict(help='batch size <default: 500>', metavar='INT', type=int, default=500)),
        (('--patience',),
         dict(help='maximum patience count to revert model <default: 10>', metavar='INT',
              type=int, default=10)),
        (('--gpu-num',),
         dict(help='GPU number to use <default: -1 for CPU>', metavar='INT', type=int,
              default=-1)),
        (('--debug',),
         dict(help='enable debug', action='store_true')),
    ]
    parser = ArgumentParser(description='train model from data')
    for flags, kwargs in options:
        parser.add_argument(*flags, **kwargs)
    args = parser.parse_args()

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level)

    run(args)


if __name__ == '__main__':
    main()
77 | 


--------------------------------------------------------------------------------