├── .gitignore ├── LICENSE ├── README.md ├── get_freq_newword.py ├── get_frequent_word.py ├── get_newword.py └── unsupervised_nlputils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FastText for Korean
2 | 
3 | ## Installation
4 | 
5 | **The manual below was tested on an AWS EC2 r4.large instance.**
6 | 
7 | For the basics of embeddings, [this blog](https://ratsgo.github.io/embedding/) is a good resource. Searching that blog beats googling the entire internet.
8 | 
9 | ### Setting up the development environment
10 | 
11 | ```sh
12 | sudo apt-get update
13 | sudo apt-get upgrade
14 | sudo apt-get install python-setuptools
15 | sudo apt install python3-pip
16 | 
17 | pip3 install soynlp
18 | pip3 install soyspacing
19 | pip3 install sentencepiece
20 | pip3 install bert
21 | pip3 install bert-tensorflow
22 | pip3 install tensorflow
23 | pip3 install hgtk
24 | 
25 | # sudo apt install make
26 | # sudo apt-get install build-essential
27 | 
28 | sudo apt install unzip
29 | 
30 | sudo apt install cmake
31 | ```
32 | 
33 | ### Installing fasttext
34 | 
35 | Compile fasttext.
36 | 
37 | ```sh
38 | cd ~/
39 | git clone https://github.com/facebookresearch/fastText.git
40 | cd fastText
41 | make
42 | ```
43 | 
44 | Install the Python module.
45 | 
46 | ```sh
47 | pip3 install .
48 | ```
49 | 
50 | ```sh
51 | ./fasttext -help
52 | ```
53 | 
54 | ### Data preprocessing
55 | 
56 | #### Downloading the preprocessed data
57 | 
58 | Clone [this repository](https://github.com/ratsgo/embedding).
59 | 
60 | ```sh
61 | cd ~/
62 | git clone https://github.com/ratsgo/embedding.git
63 | cd embedding
64 | ```
65 | 
66 | Download the preprocessed data from [Google Drive](https://drive.google.com/file/d/1kUecR7xO7bsHFmUI6AExtY5u2XXlObOG/view).
67 | 
68 | ```sh
69 | mkdir -p ~/bin
70 | vi ~/bin/gdrive_download
71 | 
72 | -----------------------------------------
73 | #!/usr/bin/env bash
74 | 
75 | # gdrive_download
76 | #
77 | # script to download Google Drive files from command line
78 | # not guaranteed to work indefinitely
79 | # taken from Stack Overflow answer:
80 | # http://stackoverflow.com/a/38937732/7002068
81 | 
82 | gURL=$1
83 | # match more than 26 word characters
84 | ggID=$(echo "$gURL" | egrep -o '(\w|-){26,}')
85 | 
86 | ggURL='https://drive.google.com/uc?export=download'
87 | 
88 | curl -sc /tmp/gcokie "${ggURL}&id=${ggID}" >/dev/null
89 | getcode="$(awk '/_warning_/ {print $NF}' /tmp/gcokie)"
90 | 
91 | cmd='curl --insecure -C - -LOJb /tmp/gcokie "${ggURL}&confirm=${getcode}&id=${ggID}"'
92 | echo -e "Downloading from "$gURL"...\n"
93 | eval $cmd
94 | -----------------------------------------
95 | 
96 | chmod 700 ~/bin/gdrive_download
97 | 
98 | ~/bin/gdrive_download https://drive.google.com/file/d/1kUecR7xO7bsHFmUI6AExtY5u2XXlObOG
99 | ```
100 | 
101 | Train the soynlp model, then tokenize the data.
102 | 
103 | ```sh
104 | mkdir mywork
105 | cd mywork
106 | mv ../processed.zip ./
107 | unzip processed.zip
108 | 
109 | # train
110 | python3 ../preprocess/unsupervised_nlputils.py --preprocess_mode compute_soy_word_score \
111 |     --input_path ./processed/corrected_ratings_corpus.txt \
112 |     --model_path ./soyword.model
113 | 
114 | # tokenize
115 | python3 ../preprocess/unsupervised_nlputils.py --preprocess_mode soy_tokenize \
116 |     --input_path ./processed/corrected_ratings_corpus.txt \
117 |     --model_path ./soyword.model \
118 |     --output_path ./ratings_tokenized_soy.txt
119 | 
120 | head -5 ./ratings_tokenized_soy.txt
121 | 어릴때 보고 지금 다시 봐도 재밌 어ㅋㅋ
122 | 디자인을 배우 는 학생 으로, 외국 디자이너와 그들 이 일군 전통을 통해 발전 해가는 문화 산업이 부러웠는데. 사실 우리나라 에서도 그 어려운 시절에 끝까지 열정 을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다 는 것에 감사합니다.
123 | 폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나 도 없음. . 최고 .
124 | 와.. 연기 가 진짜 개쩔구나.. 지루 할거라고 생각 했는데 몰입 해서 봤다. . 그래 이런 게 진짜 영화 지
125 | ```
126 | 
127 | ### Installing khaiii
128 | 
129 | Install khaiii by following [the documentation](https://github.com/kakao/khaiii/wiki/%EB%B9%8C%EB%93%9C-%EB%B0%8F-%EC%84%A4%EC%B9%98).
130 | 
131 | ```sh
132 | cd ~/
133 | git clone https://github.com/kakao/khaiii.git
134 | cd khaiii
135 | cmake --version
136 | 
137 | mkdir build
138 | cd build/
139 | cmake ..
140 | 
141 | make all
142 | make resource
143 | make large_resource
144 | sudo make install
145 | khaiii --help
146 | ```
147 | 
148 | Test it:
149 | 
150 | ```sh
151 | vi input.txt
152 | -----------------------------------------
153 | 동해물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세
154 | 무궁화 삼천리 화려강산 대한 사람 대한으로 길이 보전하세
155 | -----------------------------------------
156 | 
157 | khaiii --input input.txt
158 | ```
159 | 
160 | The Python binding module can be installed with the commands below.
161 | 
162 | ```sh
163 | make package_python
164 | cd package_python
165 | pip3 install .
166 | ```
167 | 
168 | ### Installing mecab-ko
169 | 
170 | ```sh
171 | # mecab-ko
172 | cd ~/
173 | wget https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
174 | tar xvfz mecab-0.996-ko-0.9.2.tar.gz
175 | cd mecab-0.996-ko-0.9.2
176 | ./configure --prefix=/usr
177 | make
178 | make check
179 | sudo make install
180 | 
181 | # mecab-ko-dic
182 | sudo ldconfig
183 | ldconfig -p | grep /usr/local/lib
184 | cd ~/
185 | wget https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
186 | tar xvfz mecab-ko-dic-2.1.1-20180720.tar.gz
187 | cd mecab-ko-dic-2.1.1-20180720
188 | ./configure --prefix=/usr
189 | make
190 | sudo make install
191 | 
192 | # fix the hard-coded install directory
193 | sed -i -e 's/\/usr\/local/\/usr/g' tools/add-userdic.sh
194 | 
195 | # mecab-python
196 | pip3 install python-mecab-ko
197 | ```
198 | 
199 | ## Usage
200 | 
201 | ### Using fasttext
202 | 
203 | #### Preparing the data
204 | 
205 | ```sh
206 | cd ~/fastText
207 | mkdir mywork
208 | cp ~/embedding/mywork/processed/corrected_ratings_corpus.txt mywork/
209 | ```
210 | 
211 | ```sh
212 | cd mywork
213 | ```
214 | 
215 | #### Training on data without morphological analysis
216 | 
217 | ```sh
218 | # using cbow
219 | ../fasttext cbow -input corrected_ratings_corpus.txt -output model_cbow
220 | 
221 | # using skipgram
222 | ../fasttext skipgram -input corrected_ratings_corpus.txt -output model_skipgram
223 | 
224 | # nearest neighbors
225 | echo "디즈니" | ../fasttext nn model_skipgram.bin
226 | Query word? 디즈니 0.968521
227 | 디즈니는 0.956993
228 | 디즈니와 0.934998
229 | 디즈니의 0.920893
230 | 클레이 0.8961
231 | 디즈니가 0.889897
232 | 함정 0.871816
233 | 레전드. 0.864659
234 | 쌍벽을 0.86439
235 | 걸작중 0.859125
236 | ```
237 | 
238 | Download the training data for text classification.
239 | 
240 | ```sh
241 | wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
242 | tar xvzf cooking.stackexchange.tar.gz
243 | head cooking.stackexchange.txt
244 | __label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
245 | __label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
246 | __label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
247 | __label__restaurant Michelin Three Star Restaurant; but if the chef is not there
248 | __label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?
249 | __label__storage-method __label__equipment __label__bread What\'s the purpose of a bread box?
250 | __label__baking __label__food-safety __label__substitutions __label__peanuts how to seperate peanut oil from roasted peanuts at home?
251 | __label__chocolate American equivalent for British chocolate terms
252 | __label__baking __label__oven __label__convection Fan bake vs bake
253 | __label__sauce __label__storage-lifetime __label__acidity __label__mayonnaise Regulation and balancing of readymade packed mayonnaise and other sauces
254 | 
255 | # split the dataset
256 | head -n 12404 cooking.stackexchange.txt > cooking.train
257 | tail -n 3000 cooking.stackexchange.txt > cooking.test
258 | ```
259 | 
260 | ```sh
261 | ../fasttext supervised -input cooking.train -output model_cooking
262 | ```
263 | 
264 | ```sh
265 | ../fasttext predict model_cooking.bin -
266 | Which baking dish is best to bake a banana bread ?
267 | __label__baking
268 | ^C
269 | 
270 | ../fasttext predict model_cooking.bin - 5
271 | Why not put knives in the dishwasher?
272 | __label__food-safety __label__baking __label__bread __label__equipment __label__substitutions
273 | ^C
274 | ```
275 | 
276 | Parameter summary
277 | 
278 | | parameter         | description                                       | default   |
279 | |-------------------|---------------------------------------------------|-----------|
280 | | input             | training file path                                | mandatory |
281 | | output            | output file path                                  | mandatory |
282 | | verbose           | verbosity level                                   | 2         |
283 | | minCount          | minimal number of word occurrences                | 5         |
284 | | minCountLabel     | minimal number of label occurrences               | 0         |
285 | | wordNgrams        | max length of word ngram                          | 1         |
286 | | bucket            | number of buckets                                 | 2000000   |
287 | | minn              | min length of char ngram                          | 3         |
288 | | maxn              | max length of char ngram                          | 6         |
289 | | t                 | sampling threshold                                | 0.0001    |
290 | | label             | labels prefix                                     | []        |
291 | | lr                | learning rate                                     | 0.05      |
292 | | lrUpdateRate      | change the rate of updates for the learning rate  | 100       |
293 | | dim               | size of word vectors                              | 100       |
294 | | ws                | size of the context window                        | 5         |
295 | | epoch             | number of epochs                                  | 5         |
296 | | neg               | number of negatives sampled                       | 5         |
297 | | loss              | loss function {ns, hs, softmax}                   | ns        |
298 | | thread            | number of threads                                 | 12        |
299 | | pretrainedVectors | pretrained word vectors for supervised learning   | []        |
300 | | saveOutput        | whether output params should be saved             | 0         |
301 | | cutoff            | number of words and ngrams to retain              | 0         |
302 | | retrain           | finetune embeddings if a cutoff is applied        | 0         |
303 | | qnorm             | quantizing the norm separately                    | 0         |
304 | | qout              | quantizing the classifier                         | 0         |
305 | | dsub              | size of each sub-vector                           | 2         |
306 | 
307 | #### Training on morpheme-analyzed data (soy_tokenize)
308 | 
309 | ```sh
310 | cd ~/fastText/mywork/
311 | cp ~/embedding/mywork/ratings_tokenized_soy.txt ./
312 | ```
313 | 
314 | ```sh
315 | cd mywork
316 | 
317 | # train on the morpheme-analyzed data
318 | ../fasttext skipgram -input ratings_tokenized_soy.txt -output model_skipgram
319 | 
320 | # nearest neighbors
321 | echo "디즈니" | ../fasttext nn model_skipgram.bin
322 | Query word?
디즈니 0.994486 323 | 픽사 0.707323 324 | 애니메이션 0.70667 325 | 애니중 0.700826 326 | 애니 0.695701 327 | 애니의 0.689524 328 | 웍스 0.678675 329 | 애니를 0.675855 330 | 에니메이션 0.672339 331 | 2D 0.671045 332 | ``` 333 | 334 | #### 형태소분석 진행한 데이타로 학습(khaiii) 335 | 336 | ```sh 337 | vi unsupervised_nlputils.py 338 | ``` 339 | 340 | ```python 341 | import sys, math, argparse, re 342 | from khaiii import KhaiiiApi 343 | import mecab 344 | 345 | def khaiii_tokenize(corpus_fname, output_fname): 346 | api = KhaiiiApi() 347 | 348 | with open(corpus_fname, 'r', encoding='utf-8') as f1, \ 349 | open(output_fname, 'w', encoding='utf-8') as f2: 350 | for line in f1: 351 | sentence = line.replace('\n', '').strip() 352 | tokens = api.analyze(sentence) 353 | tokenized_sent = '' 354 | for token in tokens: 355 | tokenized_sent += ' '.join([str(m) for m in token.morphs]) + ' ' 356 | f2.writelines(tokenized_sent.strip() + '\n') 357 | 358 | 359 | def mecab_tokenize(corpus_fname, output_fname): 360 | mcab = mecab.MeCab() 361 | 362 | with open(corpus_fname, 'r', encoding='utf-8') as f1, \ 363 | open(output_fname, 'w', encoding='utf-8') as f2: 364 | for line in f1: 365 | sentence = line.replace('\n', '').strip() 366 | tokens = mcab.morphs(sentence) 367 | tokenized_sent = ' '.join(tokens) 368 | f2.writelines(tokenized_sent + '\n') 369 | 370 | 371 | if __name__ == '__main__': 372 | parser = argparse.ArgumentParser() 373 | parser.add_argument('--preprocess_mode', type=str, help='preprocess mode') 374 | parser.add_argument('--input_path', type=str, help='Location of input files') 375 | parser.add_argument('--output_path', type=str, help='Location of output files') 376 | args = parser.parse_args() 377 | 378 | if args.preprocess_mode == "khaiii_tokenize": 379 | khaiii_tokenize(args.input_path, args.output_path) 380 | elif args.preprocess_mode == "mecab_tokenize": 381 | mecab_tokenize(args.input_path, args.output_path) 382 | ``` 383 | 384 | ```sh 385 | python3 ./unsupervised_nlputils.py --preprocess_mode khaiii_tokenize \ 386 | --input_path ./corrected_ratings_corpus.txt \ 387 | --output_path ./ratings_tokenized_khaiii.txt 388 | 389 | head -5 ./ratings_tokenized_khaiii.txt 390 | 어리/VA ㄹ/ETM 때/NNG 보/VV 고/EC 지금/MAG 다/NNG 시/MAG 보/VV 아/EC 도/JX 재미있/VA 어요/EC ㅋㅋ/NNG 391 | 디자인/NNG 을/JKO 배우/VV 는/ETM 학생/NNG 으로/JKB ,/SP 외국/NNG 디자이/NNG 너/NP 와/JKB 그/NP 들/XSN 이/JKS 일/VV 군/NNG 전통/NNG 을/JKO 통하/VV 여/EC 발전/NNG 하/XSV 여/EC 가/VX 는/ETM 문화/NNG 산업/NNG 이/JKS 부럽/VA 었/EP 는데/EC ./SF 사실/MAG 우리나라/NNG 에서/JKB 도/JX 그/MM 어렵/VA ㄴ/ETM 시절/NNG 에/JKB 끝/NNG 까지/JX 열정/NNG 을/JKO 지키/VV ㄴ/ETM 노라노/NNG 같/VA 은/ETM 전통/NNG 이/JKS 있/VV 어/EC 저/NP 와/JKB 같/VA 은/ETM 사람/NNG 들/XSN 이/JKS 꿈/NNG 을/JKO 꾸/VV 고/EC 이루/VV 어/EC 나가/VX ㄹ/ETM 수/NNB 있/VV 다는/ETM 것/NNB 에/JKB 감사/NNG 하/XSV ㅂ니다/EF ./SF 392 | 폴리스스토리/NNG 시리즈/NNG 는/JX 1/SN 부터/JX 뉴/NNG 까지/JX 버리/VV ㄹ께/EC 하나/NR 도/JX 없/VA 음/ETN ../SE 최고/NNG ./SF 393 | 와/IC ./SF ./SE 연기/NNG 가/JKS 진짜/MAG 개쩌/VV ㄹ구나/EF ../SE 지루/XR 하/XSA ㄹ/ETM 거/EC 이/VCP 라고/EC 생각/NNG 하/XSV 였/EP 는데/EC 몰입/NNG 하/XSV 여서/EC 보/VV 았/EP 다/EF ../SE 그래/IC 이런/MM 것/NNB 이/JKS 진짜/NNG 영화지/NNG 394 | 안개/NNG 자욱/XR 하/XSA ㄴ/ETM 밤하늘/NNG 에/JKB 뜨/VV 어/EC 있/VX 는/ETM 초승달/NNG 같/VA 은/ETM 영화/NNG ./SF 395 | 396 | ../fasttext skipgram -input ratings_tokenized_khaiii.txt -output model_skipgram 397 | 398 | echo “디즈니/NNP” | ../fasttext nn model_skipgram.bin 399 | Query word? 
디즈니/NNP 0.991276
400 | 애니/NNP 0.845127
401 | 즈니/NNG 0.832196
402 | 한국애니/NNP 0.809425
403 | 일본애니/NNP 0.806225
404 | 지브리/NNP 0.77295
405 | 드림웍스/NNP 0.756196
406 | 지니/NNP 0.745549
407 | 베니/NNP 0.740033
408 | 쟈니/NNP 0.730029
409 | ```
410 | 
411 | #### Training on morpheme-analyzed data (mecab-ko)
412 | 
413 | ```sh
414 | python3 ./unsupervised_nlputils.py --preprocess_mode mecab_tokenize \
415 |     --input_path ./corrected_ratings_corpus.txt \
416 |     --output_path ./ratings_tokenized_mecab.txt
417 | 
418 | ../fasttext skipgram -input ratings_tokenized_mecab.txt -output model_skipgram
419 | 
420 | echo "디즈니" | ../fasttext nn model_skipgram.bin
421 | Query word? 디즈니 0.996115
422 | 픽사 0.77835
423 | 드림웍스 0.761766
424 | 타잔 0.749565
425 | 애니메 0.719357
426 | 애니메이션 0.694092
427 | 애니 0.691668
428 | 월트 0.690366
429 | 클레이 0.678788
430 | 지브리 0.677055
431 | ```
432 | 
433 | ## Comparing the tokenization results
434 | 
435 | ```sh
436 | # no morphological analysis
437 | echo "디즈니" | ../fasttext nn model_skipgram.bin
438 | Query word? 디즈니 0.968521
439 | 디즈니는 0.956993
440 | 디즈니와 0.934998
441 | 디즈니의 0.920893
442 | 클레이 0.8961
443 | 디즈니가 0.889897
444 | 함정 0.871816
445 | 레전드. 0.864659
446 | 쌍벽을 0.86439
447 | 걸작중 0.859125
448 | ```
449 | 
450 | `No morphological analysis`: without it, `디즈니`, `디즈니는`, and `디즈니와` each end up being treated as separate words.
451 | 
452 | ```sh
453 | # soy_tokenize
454 | echo "디즈니" | ../fasttext nn model_skipgram.bin
455 | Query word? 디즈니 0.994486
456 | 픽사 0.707323
457 | 애니메이션 0.70667
458 | 애니중 0.700826
459 | 애니 0.695701
460 | 애니의 0.689524
461 | 웍스 0.678675
462 | 애니를 0.675855
463 | 에니메이션 0.672339
464 | 2D 0.671045
465 | ```
466 | 
467 | `soy`: `soy` infers word boundaries purely from spacing statistics, yet the results are quite decent.
468 | 
469 | ```sh
470 | # khaiii
471 | echo "디즈니/NNP" | ../fasttext nn model_skipgram.bin
472 | Query word? 디즈니/NNP 0.991276
473 | 애니/NNP 0.845127
474 | 즈니/NNG 0.832196
475 | 한국애니/NNP 0.809425
476 | 일본애니/NNP 0.806225
477 | 지브리/NNP 0.77295
478 | 드림웍스/NNP 0.756196
479 | 지니/NNP 0.745549
480 | 베니/NNP 0.740033
481 | 쟈니/NNP 0.730029
482 | ```
483 | 
484 | `khaiii`: it seems to give proper results only after a lot of entries have been added to the user dictionary.
485 | 
486 | ```sh
487 | # mecab-ko
488 | echo "디즈니" | ../fasttext nn model_skipgram.bin
489 | Query word? 디즈니 0.996115
490 | 픽사 0.77835
491 | 드림웍스 0.761766
492 | 타잔 0.749565
493 | 애니메 0.719357
494 | 애니메이션 0.694092
495 | 애니 0.691668
496 | 월트 0.690366
497 | 클레이 0.678788
498 | 지브리 0.677055
499 | ```
500 | 
501 | `mecab-ko`: perhaps because its dictionary files alone are over 16 MB, the results are quite good. It even surfaces `지브리` (Ghibli).
502 | 
503 | ## Handling mecab-ko mis-segmentation
504 | 
505 | After settling on mecab-ko as the tokenizer and analyzing real data with it, quite a few errors show up.
506 | 
507 | Words like `파우치` (pouch) and `셋트` (set) get split into `파우 치` and `셋 트`, so it is weak on neologisms and loanwords.
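Before scripting anything, the same `python-mecab-ko` package installed earlier can be used for a quick interactive check. This is only a minimal sketch; the example words are the ones mentioned above, and the exact splits depend on your dictionary build:

```python
# Quick check of how mecab-ko segments suspected new words / loanwords.
# Assumes `pip3 install python-mecab-ko` from the install section above;
# the outputs in the comments are examples, not guaranteed results.
import mecab

m = mecab.MeCab()
print(m.morphs("파우치"))  # e.g. ['파우', '치'] -> split apart, so likely missing from the dictionary
print(m.morphs("영화"))    # e.g. ['영화']       -> kept as a single morpheme
```

The script below automates this idea over a whole corpus: it looks for words that mecab-ko splits even though the unsplit form is frequent, and writes user-dictionary entries for them.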
508 | 
509 | ```sh
510 | cd ~/fastText/mywork/
511 | vi get_newword.py
512 | ```
513 | 
514 | ```python
515 | import sys, math, argparse, re
516 | import mecab
517 | import hgtk
518 | 
519 | def word_count(corpus_fname):
520 |     with open(corpus_fname, 'r', encoding='utf-8') as f:
521 |         sentences = f.read()
522 |     words = re.findall("[가-힣]+", sentences)
523 | 
524 |     # print(words)
525 |     d = {}
526 |     for word in words:
527 |         d[word] = d.get(word, 0) + 1
528 | 
529 |     # print(d)
530 |     word_freq = []
531 |     for key, value in d.items():
532 |         word_freq.append((value, key))
533 | 
534 |     # print(word_freq)
535 |     word_freq.sort(reverse=True)
536 |     return word_freq
537 | 
538 | 
539 | def check_morphs(lst, corpus_fname, output_fname, log_fname):
540 |     mcab = mecab.MeCab()
541 | 
542 |     with open(corpus_fname, 'r', encoding='utf-8') as f1, \
543 |          open(output_fname, 'w', encoding='utf-8') as f2, \
544 |          open(log_fname, 'w', encoding='utf-8') as f3:
545 |         sentences = f1.read()
546 | 
547 |         for item in lst:
548 |             cnt, word = item
549 | 
550 |             if cnt < 10:
551 |                 continue
552 |             tokens = mcab.morphs(word)
553 |             if len(tokens) == 1:
554 |                 continue
555 | 
556 |             words = re.findall(' '.join(tokens), sentences)
557 |             if len(words) < (cnt * 0.05):
558 |                 # if the split form occurs less than 5% as often as the unsplit word, treat it as a segmentation error
559 |                 (cho, jung, jong) = hgtk.letter.decompose(word[-1])
560 |                 if 'ㄱ' <= jong <= 'ㅎ':
561 |                     dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
562 |                 else:
563 |                     dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
564 |                 # print("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words), jong))
565 |                 f2.writelines(dic_line + '\n')
566 |                 f3.writelines("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words)) + '\n')
567 | 
568 | 
569 | if __name__ == '__main__':
570 |     parser = argparse.ArgumentParser()
571 |     parser.add_argument('--input_path', type=str, help='Location of input files')
572 |     parser.add_argument('--output_path', type=str, help='Location of output files')
573 |     parser.add_argument('--log_path', type=str, help='Location of log files')
574 |     args = parser.parse_args()
575 | 
576 |     lst = word_count(args.input_path)
577 |     # print(lst)
578 | 
579 |     check_morphs(lst, args.input_path, args.output_path, args.log_path)
580 | ```
581 | 
582 | ```sh
583 | # item_info.txt is a text file of product names; you need to supply it yourself.
584 | python3 get_newword.py \
585 |     --input_path item_info.txt \
586 |     --output_path output.txt \
587 |     --log_path log.txt
588 | 
589 | head output.txt
590 | 파우치,,,,NNP,*,F,파우치,*,*,*,*,*
591 | 에코백,,,,NNP,*,T,에코백,*,*,*,*,*
592 | 크로스백,,,,NNP,*,T,크로스백,*,*,*,*,*
593 | 캔들,,,,NNP,*,T,캔들,*,*,*,*,*
594 | 백팩,,,,NNP,*,T,백팩,*,*,*,*,*
595 | 키링,,,,NNP,*,T,키링,*,*,*,*,*
596 | 린넨,,,,NNP,*,T,린넨,*,*,*,*,*
597 | 카드지갑,,,,NNP,*,T,카드지갑,*,*,*,*,*
598 | 발매트,,,,NNP,*,F,발매트,*,*,*,*,*
599 | 정리보관대,,,,NNP,*,F,정리보관대,*,*,*,*,*
600 | 
601 | cat output.txt >> ~/mecab-ko-dic-2.1.1-20180720/user-dic/nnp.csv
602 | 
603 | cd ~/mecab-ko-dic-2.1.1-20180720/
604 | ./tools/add-userdic.sh
605 | make clean
606 | make
607 | sudo make install
608 | ```
609 | 
610 | **I spent a few hours trying to automate user-dictionary extraction, but the conclusion is that a human still has to review the entries by eye anyway.**
611 | 
612 | I also tried a different approach: using customer search queries as the keyword source seems to work best.
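The pipeline below leans on soynlp's statistical word scores rather than on mecab-ko alone. As a reference, this sketch shows roughly how those scores are loaded and turned into a tokenizer; it mirrors the `WordExtractor`/`LTokenizer` setup in `get_freq_newword.py`, `soyword.model` is the file produced by the `compute_soy_word_score` step below, and the sample product-name line is made up:

```python
# Sketch: turn soynlp word scores into an LTokenizer, as get_freq_newword.py does.
import math
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)
word_extractor.load('soyword.model')          # saved by compute_soy_word_score
scores = word_extractor.word_scores()
# combine forward cohesion and right branching entropy into one score per word
scores = {w: s.cohesion_forward * math.exp(s.right_branching_entropy)
          for w, s in scores.items()}
soy_tokenizer = LTokenizer(scores=scores)
print(soy_tokenizer.tokenize("린넨 파우치 크로스백"))  # hypothetical product-name line
```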
613 | 
614 | ```sh
615 | python3 ~/embedding/preprocess/unsupervised_nlputils.py \
616 |     --preprocess_mode compute_soy_word_score \
617 |     --input_path ./item_info.txt \
618 |     --model_path ./soyword.model
619 | 
620 | python3 ~/embedding/preprocess/unsupervised_nlputils.py \
621 |     --preprocess_mode soy_tokenize \
622 |     --input_path ./item_info.txt \
623 |     --model_path ./soyword.model \
624 |     --output_path ./itemname_tokenized_soy.txt
625 | 
626 | python3 get_frequent_word.py \
627 |     --input_path itemname_tokenized_soy.txt \
628 |     --output_path frequent.txt \
629 |     --log_path log.txt
630 | 
631 | vi frequent.txt
632 | 
633 | cp frequent.txt ~/mecab-ko-dic-2.1.1-20180720/user-dic/frequent.csv
634 | cd ~/mecab-ko-dic-2.1.1-20180720/
635 | ./tools/add-userdic.sh
636 | 
637 | # raise the priority of the new entries
638 | sed -i -E 's/[0-9]+,NNP/1000,NNP/g' user-frequent.csv
639 | 
640 | make clean
641 | make
642 | sudo make install
643 | ```
644 | 
645 | ## Closing thoughts
646 | 
647 | Morphological analysis really matters.
648 | 
649 | fasttext is worth applying to review-rating prediction and customer-support auto-replies, and, since it also supports multi-label classification, to product-attribute extraction as well.
650 | 
651 | More material is available [here](https://github.com/facebookresearch/fastText/tree/master/docs).
652 | 
--------------------------------------------------------------------------------
/get_freq_newword.py:
--------------------------------------------------------------------------------
1 | import sys, math, argparse, re
2 | import mecab
3 | from soynlp.word import WordExtractor
4 | from soynlp.tokenizer import LTokenizer
5 | from soynlp.normalizer import *
6 | from soyspacing.countbase import CountSpace
7 | from soynlp.hangle import decompose, character_is_korean
8 | import hgtk
9 | 
10 | 
11 | def word_count(corpus_fname):
12 |     with open(corpus_fname, 'r', encoding='utf-8') as f:
13 |         sentences = f.read()
14 |     words = re.findall("[가-힣]+", sentences)
15 | 
16 |     # print(words)
17 |     d = {}
18 |     for word in words:
19 |         d[word] = d.get(word, 0) + 1
20 | 
21 |     # print(d)
22 |     word_freq = []
23 |     for key, value in d.items():
24 |         word_freq.append((value, key))
25 | 
26 |     # print(word_freq)
27 |     word_freq.sort(reverse=True)
28 |     return word_freq
29 | 
30 | 
31 | def is_all_nng(words):
32 |     # [('자연주의', 'NNG'), ('쇼핑몰', 'NNG')]
33 |     for item in words:
34 |         (w, p) = item
35 |         if p != 'NNG':
36 |             return False
37 |     return True
38 | 
39 | 
40 | def check_morphs(lst, corpus_fname, output_fname, log_fname):
41 |     mcab = mecab.MeCab()
42 | 
43 |     model_fname = 'soyword.model'
44 |     word_extractor = WordExtractor(
45 |         min_frequency=100,
46 |         min_cohesion_forward=0.05,
47 |         min_right_branching_entropy=0.0
48 |     )
49 |     word_extractor.load(model_fname)
50 |     scores = word_extractor.word_scores()
51 |     scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
52 |     soy_tokenizer = LTokenizer(scores=scores)
53 | 
54 |     with open(corpus_fname, 'r', encoding='utf-8') as f1, \
55 |          open(output_fname, 'w', encoding='utf-8') as f2, \
56 |          open(log_fname, 'w', encoding='utf-8') as f3:
57 |         sentences = f1.read()
58 | 
59 |         for item in lst:
60 |             cnt, word = item
61 | 
62 |             if cnt < 10 or len(word) == 1:
63 |                 continue
64 | 
65 |             tokens = mcab.morphs(word)
66 |             if len(tokens) == 1:
67 |                 continue
68 | 
69 |             soy_tokens = soy_tokenizer.tokenize(word)
70 |             if ' '.join(tokens) == ' '.join(soy_tokens):
71 |                 continue
72 | 
73 |             if is_all_nng(mcab.pos(word)):
74 |                 #print("nouns only : {}".format(word))
75 |                 #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
76 |                 continue
77 | 
78 |             if len(soy_tokens) > 1:
79 |                 continue
80 | 
81 |             
#print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt)) 82 | 83 | words = re.findall(' '.join(tokens), sentences) 84 | if len(words) < (cnt * 0.05): 85 | # 형태소 분리된 단어의 빈도수가 분리안된 단어의 빈수도의 5% 미만이면 형태소 분리오류 86 | (cho, jung, jong) = hgtk.letter.decompose(word[-1]) 87 | if 'ㄱ' <= jong <= 'ㅎ': 88 | dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word) 89 | else: 90 | dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word) 91 | print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong)) 92 | f2.writelines(dic_line + '\n') 93 | f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n') 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument('--input_path', type=str, help='Location of input files') 99 | parser.add_argument('--output_path', type=str, help='Location of output files') 100 | parser.add_argument('--log_path', type=str, help='Location of log files') 101 | args = parser.parse_args() 102 | 103 | lst = word_count(args.input_path) 104 | # print(lst) 105 | 106 | # for item in lst: 107 | # cnt, word = item 108 | 109 | # if cnt >= 100 and len(word) > 1: 110 | # print("{}\t{}".format(word, cnt)) 111 | 112 | check_morphs(lst, args.input_path, args.output_path, args.log_path) 113 | -------------------------------------------------------------------------------- /get_frequent_word.py: -------------------------------------------------------------------------------- 1 | import sys, math, argparse, re 2 | import mecab 3 | from soynlp.word import WordExtractor 4 | from soynlp.tokenizer import LTokenizer 5 | from soynlp.normalizer import * 6 | from soyspacing.countbase import CountSpace 7 | from soynlp.hangle import decompose, character_is_korean 8 | import hgtk 9 | 10 | 11 | def word_count(corpus_fname): 12 | with open(corpus_fname, 'r', encoding='utf-8') as f: 13 | sentences = f.read() 14 | words = re.findall("[가-힣]+", sentences) 15 | 16 | # print(words) 17 | d = {} 18 | for word in words: 19 | d[word] = d.get(word, 0) + 1 20 | 21 | # print(d) 22 | word_freq = [] 23 | for key, value in d.items(): 24 | word_freq.append((value, key)) 25 | 26 | # print(word_freq) 27 | word_freq.sort(reverse=True) 28 | return word_freq 29 | 30 | 31 | def is_all_nng(words): 32 | # [('자연주의', 'NNG'), ('쇼핑몰', 'NNG')] 33 | for item in words: 34 | (w, p) = item 35 | if p != 'NNG': 36 | return False 37 | return True 38 | 39 | 40 | def check_morphs(lst, corpus_fname, output_fname, log_fname): 41 | mcab = mecab.MeCab() 42 | 43 | model_fname = 'soyword.model' 44 | word_extractor = WordExtractor( 45 | min_frequency=100, 46 | min_cohesion_forward=0.05, 47 | min_right_branching_entropy=0.0 48 | ) 49 | word_extractor.load(model_fname) 50 | scores = word_extractor.word_scores() 51 | scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()} 52 | soy_tokenizer = LTokenizer(scores=scores) 53 | 54 | with open(corpus_fname, 'r', encoding='utf-8') as f1, \ 55 | open(output_fname, 'w', encoding='utf-8') as f2, \ 56 | open(log_fname, 'w', encoding='utf-8') as f3: 57 | sentences = f1.read() 58 | 59 | for item in lst: 60 | cnt, word = item 61 | 62 | if cnt < 100 or len(word) == 1: 63 | continue 64 | 65 | tokens = mcab.morphs(word) 66 | if len(tokens) == 1: 67 | continue 68 | 69 | (cho, jung, jong) = hgtk.letter.decompose(word[-1]) 70 | if 'ㄱ' <= jong <= 'ㅎ': 71 | dic_line = 
"{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word) 72 | else: 73 | dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word) 74 | f2.writelines(dic_line + '\n') 75 | f3.writelines("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n') 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('--input_path', type=str, help='Location of input files') 81 | parser.add_argument('--output_path', type=str, help='Location of output files') 82 | parser.add_argument('--log_path', type=str, help='Location of log files') 83 | args = parser.parse_args() 84 | 85 | lst = word_count(args.input_path) 86 | # print(lst) 87 | 88 | # for item in lst: 89 | # cnt, word = item 90 | 91 | # if cnt >= 100 and len(word) > 1: 92 | # print("{}\t{}".format(word, cnt)) 93 | 94 | check_morphs(lst, args.input_path, args.output_path, args.log_path) 95 | -------------------------------------------------------------------------------- /get_newword.py: -------------------------------------------------------------------------------- 1 | import sys, math, argparse, re 2 | import mecab 3 | import hgtk 4 | 5 | def word_count(corpus_fname): 6 | with open(corpus_fname, 'r', encoding='utf-8') as f: 7 | sentences = f.read() 8 | words = re.findall("[가-힣]+", sentences) 9 | 10 | # print(words) 11 | d = {} 12 | for word in words: 13 | d[word] = d.get(word, 0) + 1 14 | 15 | # print(d) 16 | word_freq = [] 17 | for key, value in d.items(): 18 | word_freq.append((value, key)) 19 | 20 | # print(word_freq) 21 | word_freq.sort(reverse=True) 22 | return word_freq 23 | 24 | 25 | def check_morphs(lst, corpus_fname, output_fname, log_fname): 26 | mcab = mecab.MeCab() 27 | 28 | with open(corpus_fname, 'r', encoding='utf-8') as f1, \ 29 | open(output_fname, 'w', encoding='utf-8') as f2, \ 30 | open(log_fname, 'w', encoding='utf-8') as f3: 31 | sentences = f1.read() 32 | 33 | for item in lst: 34 | cnt, word = item 35 | 36 | if cnt < 10: 37 | continue 38 | tokens = mcab.morphs(word) 39 | if len(tokens) == 1: 40 | continue 41 | 42 | words = re.findall(' '.join(tokens), sentences) 43 | if len(words) < (cnt * 0.05): 44 | # 형태소 분리된 단어의 빈도수가 분리안된 단어의 빈수도의 5% 미만이면 형태소 분리오류 45 | (cho, jung, jong) = hgtk.letter.decompose(word[-1]) 46 | if 'ㄱ' <= jong <= 'ㅎ': 47 | dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word) 48 | else: 49 | dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word) 50 | # print("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words), jong)) 51 | f2.writelines(dic_line + '\n') 52 | f3.writelines("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), cnt, len(words)) + '\n') 53 | 54 | 55 | if __name__ == '__main__': 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--input_path', type=str, help='Location of input files') 58 | parser.add_argument('--output_path', type=str, help='Location of output files') 59 | parser.add_argument('--log_path', type=str, help='Location of log files') 60 | args = parser.parse_args() 61 | 62 | lst = word_count(args.input_path) 63 | # print(lst) 64 | 65 | check_morphs(lst, args.input_path, args.output_path, args.log_path) 66 | -------------------------------------------------------------------------------- /unsupervised_nlputils.py: -------------------------------------------------------------------------------- 1 | import sys, math, argparse, re 2 | from khaiii import KhaiiiApi 3 | import mecab 4 | 5 | def khaiii_tokenize(corpus_fname, output_fname): 6 | api = KhaiiiApi() 7 | 8 | with open(corpus_fname, 'r', 
encoding='utf-8') as f1, \ 9 | open(output_fname, 'w', encoding='utf-8') as f2: 10 | for line in f1: 11 | sentence = line.replace('\n', '').strip() 12 | tokens = api.analyze(sentence) 13 | tokenized_sent = '' 14 | for token in tokens: 15 | tokenized_sent += ' '.join([str(m) for m in token.morphs]) + ' ' 16 | f2.writelines(tokenized_sent.strip() + '\n') 17 | 18 | 19 | def mecab_tokenize(corpus_fname, output_fname): 20 | mcab = mecab.MeCab() 21 | 22 | with open(corpus_fname, 'r', encoding='utf-8') as f1, \ 23 | open(output_fname, 'w', encoding='utf-8') as f2: 24 | for line in f1: 25 | sentence = line.replace('\n', '').strip() 26 | tokens = mcab.morphs(sentence) 27 | tokenized_sent = ' '.join(tokens) 28 | f2.writelines(tokenized_sent + '\n') 29 | 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--preprocess_mode', type=str, help='preprocess mode') 34 | parser.add_argument('--input_path', type=str, help='Location of input files') 35 | parser.add_argument('--output_path', type=str, help='Location of output files') 36 | args = parser.parse_args() 37 | 38 | if args.preprocess_mode == "khaiii_tokenize": 39 | khaiii_tokenize(args.input_path, args.output_path) 40 | elif args.preprocess_mode == "mecab_tokenize": 41 | mecab_tokenize(args.input_path, args.output_path) --------------------------------------------------------------------------------
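One loose end from the README: the fastText Python bindings get installed (`pip3 install .` inside `~/fastText`) but are never used afterwards. As a closing sketch, the models trained above can also be queried from Python. This assumes you run it from the directory holding `model_skipgram.bin`; the file name comes from the `-output model_skipgram` flag used in the README:

```python
# Sketch: query a CLI-trained model through the official fastText Python bindings.
import fasttext

model = fasttext.load_model("model_skipgram.bin")

# Python equivalent of `echo "디즈니" | ../fasttext nn model_skipgram.bin`
for score, neighbor in model.get_nearest_neighbors("디즈니", k=10):
    print("{}\t{:.6f}".format(neighbor, score))

# Subword n-grams also provide vectors for words outside the training vocabulary.
print(model.get_word_vector("디즈니랜드")[:5])
```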