├── mecab-as-kkc ├── unk.def ├── char.def ├── rewrite.def └── dicrc ├── .gitmodules ├── .gitignore ├── LICENSE ├── Makefile ├── script └── generate_matrix_def.py └── README.md /mecab-as-kkc/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,10,10,8000,* 2 | SPACE,10,10,8000,* 3 | -------------------------------------------------------------------------------- /mecab-as-kkc/char.def: -------------------------------------------------------------------------------- 1 | DEFAULT 1 0 0 2 | SPACE 0 1 0 3 | 0x0020 SPACE 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mozc"] 2 | path = mozc 3 | url = https://github.com/google/mozc.git 4 | -------------------------------------------------------------------------------- /mecab-as-kkc/rewrite.def: -------------------------------------------------------------------------------- 1 | [unigram rewrite] 2 | * $1 3 | [left rewrite] 4 | * $1 5 | [right rewrite] 6 | * $1 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__/ 3 | *.py[cod] 4 | mecab-as-kkc/matrix.def 5 | mecab-as-kkc/*.bin 6 | mecab-as-kkc/*.dic 7 | mecab-as-kkc/*.csv 8 | -------------------------------------------------------------------------------- /mecab-as-kkc/dicrc: -------------------------------------------------------------------------------- 1 | dictionary-charset = utf-8 2 | cost-factor = 800 3 | bos-feature = BOS/EOS 4 | output-format-type = kkc 5 | 6 | ; default format 7 | node-format-kkc = %H 8 | unk-format-kkc = %M 9 | eos-format-kkc = \n 10 | 11 | ; yet another wakati 12 | node-format-wakachi = %H\s 13 | unk-format-wakachi = %M 14 | eos-format-wakachi = \n 15 | 16 | ; show also context IDs 17 | 
node-format-id = %M,%phl,%phr,%H\n
18 | unk-format-id = %M,%phl,%phr,%M,UNK\n
19 | eos-format-id = EOS\n
20 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
2 | Version 2, December 2004
3 | 
4 | Copyright (C) 2019 Yukino Ikegami
5 | 
6 | Everyone is permitted to copy and distribute verbatim or modified
7 | copies of this license document, and changing it is allowed as long
8 | as the name is changed.
9 | 
10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
12 | 
13 | 0. You just DO WHAT THE FUCK YOU WANT TO.
14 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Builds the mecab-as-kkc MeCab dictionary from the Mozc OSS dictionary data
2 | # (git submodule "mozc") and installs it into MeCab's dictionary directory.
3 | 
4 | # Remove a half-written lex.csv / matrix.def when its recipe fails, so a
5 | # truncated file is never mistaken for an up-to-date one.
6 | .DELETE_ON_ERROR:
7 | 
8 | .PHONY: all update_mozc build install uninstall clean
9 | 
10 | # Queried once at parse time (:=). stderr is discarded so that targets which
11 | # do not need MeCab (e.g. clean) stay quiet when mecab-config is unavailable.
12 | MECAB_LIBEXEC_DIR := $(shell mecab-config --libexecdir 2>/dev/null)
13 | MECAB_DIC_DIR := $(shell mecab-config --dicdir 2>/dev/null)
14 | 
15 | DIC_NAME := mecab-as-kkc
16 | MOZC_DIC_DIR := mozc/src/data/dictionary_oss
17 | 
18 | # The target names are the paths the recipes really write, so make can see
19 | # when the generated files already exist and skip regenerating them.
20 | MATRIX_DEF := $(DIC_NAME)/matrix.def
21 | LEX_CSV := $(DIC_NAME)/lex.csv
22 | 
23 | # $(call require,VAR) aborts with a clear message when VAR is empty, instead
24 | # of letting a recipe operate on a path such as "/mecab-as-kkc".
25 | require = $(if $(strip $($(1))),,$(error $(1) is empty; is mecab-config on PATH?))
26 | 
27 | all: build
28 | 
29 | # Check out (or refresh) the Mozc submodule that provides the source data.
30 | update_mozc:
31 | 	git submodule update --init
32 | 
33 | # update_mozc is an order-only prerequisite: the submodule must be checked
34 | # out first, but a phony prerequisite must not force a rebuild on every run.
35 | $(MATRIX_DEF): script/generate_matrix_def.py | update_mozc
36 | 	./script/generate_matrix_def.py
37 | 
38 | # Mozc dictionary files are tab separated; MeCab expects comma separated
39 | # values. Lines with an empty first field (reading) are dropped.
40 | $(LEX_CSV): | update_mozc
41 | 	cat $(MOZC_DIC_DIR)/dictionary*.txt | tr "\\t" "," | grep -v "^," > $@
42 | 
43 | # Compile the dictionary in place inside $(DIC_NAME).
44 | build: $(MATRIX_DEF) $(LEX_CSV)
45 | 	$(call require,MECAB_LIBEXEC_DIR)
46 | 	$(MECAB_LIBEXEC_DIR)/mecab-dict-index -d $(DIC_NAME) -o $(DIC_NAME)
47 | 
48 | # DESTDIR is honoured for staged installs and is empty by default.
49 | # The dictionary consists of data files only, hence mode 644.
50 | install:
51 | 	$(call require,MECAB_DIC_DIR)
52 | 	mkdir -p "$(DESTDIR)$(MECAB_DIC_DIR)/$(DIC_NAME)"
53 | 	install -m 644 $(DIC_NAME)/* "$(DESTDIR)$(MECAB_DIC_DIR)/$(DIC_NAME)/"
54 | 
55 | uninstall:
56 | 	$(call require,MECAB_DIC_DIR)
57 | 	$(RM) -r "$(DESTDIR)$(MECAB_DIC_DIR)/$(DIC_NAME)"
58 | 
59 | # Remove only the files this Makefile generates.
60 | clean:
61 | 	$(RM) $(DIC_NAME)/*.bin $(DIC_NAME)/*.dic $(LEX_CSV) $(MATRIX_DEF)
62 | 
--------------------------------------------------------------------------------
/script/generate_matrix_def.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Generate mecab-as-kkc/matrix.def from Mozc's connection.deflate file."""
3 | import os
4 | import zlib
5 | 
6 | ROOT_DIR = os.path.join(os.path.dirname(__file__), '../')
7 | CONNECTION_DEFLATE_PATH = os.path.join(ROOT_DIR,
8 |                                        'mozc/src/data/dictionary_oss/'
9 |                                        'connection.deflate')
10 | MATRIX_DEF_PATH = os.path.join(ROOT_DIR, 'mecab-as-kkc/matrix.def')
11 | 
12 | 
13 | def decompress_deflate(path):
14 |     """Return the zlib-decompressed content of the file at `path` as text.
15 | 
16 |     The file is read in binary mode, decompressed with zlib and decoded
17 |     with the default codec (UTF-8).
18 |     """
19 |     # Open the file the caller asked for; `path` used to be ignored in
20 |     # favour of the module-level CONNECTION_DEFLATE_PATH.
21 |     with open(path, 'rb') as f:
22 |         return zlib.decompress(f.read()).decode()
23 | 
24 | 
25 | def to_matrix(connections):
26 |     """Convert a flat connection table into the text of a matrix.def file.
27 | 
28 |     `connections` is a sequence whose first item is the number of context
29 |     classes N, followed by the N * N cost values; the cost for the pair
30 |     (lid, rid) is read from index lid * N + rid + 1.
31 | 
32 |     Returns one string: a header line 'N N' followed by one
33 |     'lid rid cost' line per pair, joined by newlines (no trailing newline).
34 |     """
35 |     num_classes = int(connections[0])
36 |     connection_matrix = ['%s %s' % (num_classes, num_classes)]
37 |     for lid in range(num_classes):
38 |         for rid in range(num_classes):
39 |             line = '%s %s %s' % (lid, rid,
40 |                                  connections[lid * num_classes + rid + 1])
41 |             connection_matrix.append(line)
42 |     return '\n'.join(connection_matrix)
43 | 
44 | 
45 | def main():
46 |     """Read the Mozc connection table and write MATRIX_DEF_PATH."""
47 |     connections = decompress_deflate(CONNECTION_DEFLATE_PATH)
48 |     connections = connections.splitlines()
49 |     connection_matrix = to_matrix(connections)
50 |     with open(MATRIX_DEF_PATH, 'w') as f:
51 |         f.write(connection_matrix)
52 | 
53 | 
54 | if __name__ == '__main__':
55 |     main()
56 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MeCab as KKC
2 | 
3 | For using MeCab as a Kana-Kanji converter (KKC), this repository provides scripts to
convert Mozc dictionary to MeCab dictionary. 4 | 5 | ## Dependencies 6 | 7 | - Git 8 | - MeCab 9 | - mecab, mecab-config and mecab-dict-index 10 | - one dictionary should be installed (Example: ipadic) 11 | - Python 3 12 | 13 | ## Preparation 14 | 15 | ``` 16 | $ git clone --depth 1 https://github.com/ikegami-yukino/mecab-as-kkc.git 17 | ``` 18 | 19 | ## Build dictionary 20 | 21 | ``` 22 | $ make 23 | ``` 24 | 25 | ## Install 26 | 27 | ``` 28 | $ make install 29 | ``` 30 | 31 | or 32 | 33 | ``` 34 | $ cp -r mecab-as-kkc `mecab-config --dicdir`/mecab-as-kkc 35 | ``` 36 | 37 | If you do not plan to add dictionary entries, we recommend executing the following commands. 38 | They save disk space (about 160MB). 39 | 40 | ``` 41 | $ rm `mecab-config --dicdir`/mecab-as-kkc/lex.csv 42 | $ rm `mecab-config --dicdir`/mecab-as-kkc/matrix.def 43 | ``` 44 | 45 | ## Uninstall 46 | 47 | ``` 48 | $ make uninstall 49 | ``` 50 | 51 | or 52 | 53 | ``` 54 | $ rm -r `mecab-config --dicdir`/mecab-as-kkc 55 | ``` 56 | 57 | ## Example of usage 58 | 59 | ``` 60 | $ echo ここではきものをぬぎます | mecab -d `mecab-config --dicdir`/mecab-as-kkc -N 5 61 | ここでは着物を脱ぎます 62 | ここでは着物を脱ぎます 63 | ここではきものを脱ぎます 64 | ここではきものを脱ぎます 65 | ここで履物を脱ぎます 66 | ``` 67 | 68 | ## How to Add new entry to this dictionary 69 | 70 | ### Formatting 71 | In lex.csv, each entry occupies exactly one line. 72 | The line formatting of lex.csv is as follows: 73 | 74 | ``` 75 | めかぶ,670,1250,4000,和布蕪 76 | ``` 77 | 78 | From left to right, the fields are: reading (Hiragana), left-context ID, right-context ID, cost, and word. 79 | In this case, the reading "めかぶ" is converted to the word "和布蕪". 80 | 81 | ### Determine context IDs 82 | The left-context ID and right-context ID are chosen from the `mozc/src/data/dictionary_oss/id.def` file. 
83 | 84 | Usually, the following context IDs are used: 85 | ``` 86 | 1837 名詞,サ変接続,*,*,*,*,* 87 | 1847 名詞,一般,*,*,*,*,* 88 | 1895 名詞,代名詞,一般,*,*,*,* 89 | 1916 名詞,固有名詞,一般,*,*,*,* 90 | 1917 名詞,固有名詞,人名,一般,*,*,* 91 | 1918 名詞,固有名詞,人名,名,*,*,* 92 | 1919 名詞,固有名詞,人名,姓,*,*,* 93 | 1920 名詞,固有名詞,地域,一般,*,*,* 94 | 1921 名詞,固有名詞,地域,一般,*,*,府名 95 | 1922 名詞,固有名詞,地域,一般,*,*,県名 96 | 1923 名詞,固有名詞,地域,一般,*,*,都名 97 | 1924 名詞,固有名詞,地域,国,*,*,* 98 | 1925 名詞,固有名詞,組織,*,*,*,* 99 | ``` 100 | 101 | NOTE that choosing the appropriate context ID requires knowledge of Japanese grammar. 102 | 103 | ### Tuning cost 104 | Tune the cost value as follows: 105 | 1. Give the new entry a cost of 4000 106 | 2. Recompile the dictionary with the following command: 107 | ``` 108 | $ `mecab-config --libexecdir`/mecab-dict-index -d mecab-as-kkc -o mecab-as-kkc 109 | ``` 110 | 3. Check result: 111 | ``` 112 | $ echo めかぶ | mecab -d `mecab-config --dicdir`/mecab-as-kkc 113 | ``` 114 | 4. If the new word "和布蕪" is not the best candidate, then the cost value of the new entry should be decreased gradually 115 | 116 | ## Limitation 117 | 118 | Currently, this repository does not support Kana-Symbol conversion and Kana-Emoji conversion because we do not know how to determine their appropriate costs. 119 | 120 | Contributions are welcome. 121 | 122 | ## License 123 | 124 | WTFPL 125 | 126 | ## Acknowledgments 127 | We thank MeCab and Mozc since this repository relies on them. 128 | 129 | - [MeCab](https://taku910.github.io/mecab/) 130 | - T. Kudo, H. Komatsu, T. Hanaoka, A. Mukai, Y. Tabata, K. Yamamoto, Y. Matsumoto. 2004. Applying Conditional Random Fields to Japanese Morphological Analysis. In Proceedings of the EMNLP 2004, pp 230-237. 131 | - [Mozc](https://github.com/google/mozc) 132 | - T. Kudo, T. Hanaoka, J. Mukai, Y. Tabata, H. Komatsu. 2011. Efficient dictionary and language model compression for input method editors. In Proceedings of the Workshop on Advances in Text Input Methods (WTIM 2011), pp 19-25. 
133 | --------------------------------------------------------------------------------