├── CLPT
│   ├── README.md
│   └── WordSegmentation
│       ├── CRFSegment
│       │   ├── README.md
│       │   ├── crf_data_2_word.py
│       │   ├── crf_segmenter.py
│       │   ├── make_crf_test_data.py
│       │   └── make_crf_train_data.py
│       ├── MaxentSegment
│       │   ├── README.md
│       │   ├── character_2_word.py
│       │   ├── character_split.py
│       │   └── character_tagging.py
│       ├── MeCab
│       │   ├── script
│       │   │   ├── make_mecab_seed_data.py
│       │   │   └── make_mecab_train_data.py
│       │   └── seed
│       │       ├── char.def
│       │       ├── dicrc
│       │       ├── feature.def
│       │       ├── rewrite.def
│       │       └── unk.def
│       └── README.md
└── README.md

/CLPT/README.md:
--------------------------------------------------------------------------------
Chinese Language Processing Tools (CLPT)
====================

1. Chinese Word Segmentation

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/README.md:
--------------------------------------------------------------------------------
Character-based Word Segmentation by Conditional Random Fields (CRF)
====================

This example uses the CRF++ toolkit; the details can be found in 52nlp's article: http://www.52nlp.cn/?p=6339
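
A typical end-to-end run looks like the sketch below (the file names `template`, `train.txt`, `test.txt`, `crf_model`, `test.rst`, and `test_result.txt` are placeholders; the CRF++ binaries `crf_learn` and `crf_test` must be on your PATH):

    python make_crf_train_data.py train.txt train.data
    crf_learn -f 3 -c 4.0 template train.data crf_model
    python make_crf_test_data.py test.txt test.data
    crf_test -m crf_model test.data > test.rst
    python crf_data_2_word.py test.rst test_result.txt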

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/crf_data_2_word.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_2_word(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        if line == "\n":
            output_data.write("\n")
        else:
            # each CRF++ output line is: character, dummy tag, predicted tag
            char_tag_pair = line.strip().split('\t')
            char = char_tag_pair[0]
            tag = char_tag_pair[2]
            if tag == 'B':
                output_data.write(' ' + char)
            elif tag == 'M':
                output_data.write(char)
            elif tag == 'E':
                output_data.write(char + ' ')
            else:  # tag == 'S'
                output_data.write(' ' + char + ' ')
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python crf_data_2_word.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_2_word(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/crf_segmenter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# CRF Segmenter based on character tagging:
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

import CRFPP

def crf_segmenter(input_file, output_file, tagger):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        tagger.clear()
        for word in line.strip():
            word = word.strip()
            if word:
                # "o" and "B" are dummy columns; only the character is used
                tagger.add((word + "\to\tB").encode('utf-8'))
        tagger.parse()
        size = tagger.size()
        xsize = tagger.xsize()
        for i in range(0, size):
            for j in range(0, xsize):
                char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)
                if tag == 'B':
                    output_data.write(' ' + char)
                elif tag == 'M':
                    output_data.write(char)
                elif tag == 'E':
                    output_data.write(char + ' ')
                else:  # tag == 'S'
                    output_data.write(' ' + char + ' ')
        output_data.write('\n')
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Please use: python crf_segmenter.py model input output"
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    tagger = CRFPP.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/make_crf_test_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_split(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        for word in line.strip():
            word = word.strip()
            if word:
                # "B" is a dummy tag column; crf_test appends the real one
                output_data.write(word + "\tB\n")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python make_crf_test_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_split(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/make_crf_train_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "\tS\n")
            else:
                output_data.write(word[0] + "\tB\n")
                for w in word[1:len(word)-1]:
                    output_data.write(w + "\tM\n")
                output_data.write(word[len(word)-1] + "\tE\n")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python make_crf_train_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_tagging(input_file, output_file)
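
# A worked example of the tagging above (illustrative input, not from any
# corpus): the pre-segmented line "我们 是 中国 人" becomes one tab-separated
# character/tag pair per line, with a blank line ending the sentence:
#
#   我	B
#   们	E
#   是	S
#   中	B
#   国	E
#   人	S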

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/README.md:
--------------------------------------------------------------------------------
Character-based Word Segmentation by Maximum Entropy (Maxent) Modeling
====================

This example uses Zhang Le's Maxent toolkit (Maximum Entropy Modeling Toolkit for Python and C++); the details can be found in 52nlp's article: http://www.52nlp.cn/?p=5682
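
A typical run looks like the sketch below (file names are placeholders; the model training and tagging steps in the middle are done with the Maxent toolkit as described in the article, so they appear only as comments):

    python character_tagging.py train.txt tagged_train.data
    # train a maxent model on tagged_train.data with the toolkit
    python character_split.py test.txt split_test.data
    # tag split_test.data with the trained model, producing char/tag pairs (tagged_test.data)
    python character_2_word.py tagged_test.data seg_result.txt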

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/character_2_word.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# Combine characters back into words based on the 4-tag tagging info

import codecs
import sys

def character_2_word(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    # 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
    for line in input_data.readlines():
        char_tag_list = line.strip().split()
        for char_tag in char_tag_list:
            # each token from the maxent tagger is "character/tag"
            char_tag_pair = char_tag.split('/')
            char = char_tag_pair[0]
            tag = char_tag_pair[1]
            if tag == 'B':
                output_data.write(' ' + char)
            elif tag == 'M':
                output_data.write(char)
            elif tag == 'E':
                output_data.write(char + ' ')
            else:  # tag == 'S'
                output_data.write(' ' + char + ' ')
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python character_2_word.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_2_word(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/character_split.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# Split Chinese text into single characters and add a space between them

import codecs
import sys

def character_split(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        for word in line.strip():
            output_data.write(word + " ")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python character_split.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_split(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/character_tagging.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "/S ")
            else:
                output_data.write(word[0] + "/B ")
                for w in word[1:len(word)-1]:
                    output_data.write(w + "/M ")
                output_data.write(word[len(word)-1] + "/E ")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python character_tagging.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_tagging(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/script/make_mecab_seed_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2015 @ YuZhen Technology

import codecs
import sys

def make_mecab_seed_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word = line.strip()
        # MeCab seed dictionary CSV: surface plus placeholder id/cost/feature fields
        output_data.write(word + ",0,0,0,0,0,0\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print "Please use: python make_mecab_seed_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    make_mecab_seed_data(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/script/make_mecab_train_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2015 @ YuZhen Technology

import codecs
import sys

def make_mecab_train_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        if len(word_list) == 0:
            continue
        for word in word_list:
            output_data.write(word + "\t0,0,0,0,0,0\n")
        output_data.write("EOS\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print "Please use: python make_mecab_train_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    make_mecab_train_data(input_file, output_file)
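
# The output is MeCab's training corpus format: one "surface<TAB>features"
# line per token and an "EOS" line after each sentence, e.g. (illustrative):
#
#   我们	0,0,0,0,0,0
#   是	0,0,0,0,0,0
#   EOS
#
# A sketch of the downstream training steps (an assumption, not part of this
# repo: the tools below ship with the MeCab distribution, usually under
# libexec, and exact flags vary by version; check the MeCab documentation):
#
#   mecab-dict-index -d seed -o seed -f utf-8 -t utf-8
#   mecab-cost-train -d seed train.corpus model
#   mecab-dict-gen -d seed -m model -o final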

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/char.def:
--------------------------------------------------------------------------------
DEFAULT 0 1 0  # DEFAULT is a mandatory category!
SPACE   0 1 0
CJK     0 0 2

# SPACE
0x0020 SPACE  # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
0x00D0 SPACE
0x0009 SPACE
0x000B SPACE
0x000A SPACE

#CJK
0x4E00..0x9FCB CJK

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/dicrc:
--------------------------------------------------------------------------------
cost-factor = 800
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
eval-size = 6
unk-eval-size = 4
config-charset = UTF-8

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/feature.def:
--------------------------------------------------------------------------------
UNIGRAM W0:%F[6]
UNIGRAM W1:%F[0]/%F[6]
UNIGRAM W2:%F[0],%F?[1]/%F[6]
UNIGRAM W3:%F[0],%F[1],%F?[2]/%F[6]
UNIGRAM W4:%F[0],%F[1],%F[2],%F?[3]/%F[6]

UNIGRAM T0:%t
UNIGRAM T1:%F[0]/%t
UNIGRAM T2:%F[0],%F?[1]/%t
UNIGRAM T3:%F[0],%F[1],%F?[2]/%t
UNIGRAM T4:%F[0],%F[1],%F[2],%F?[3]/%t

BIGRAM B00:%L[0]/%R[0]
BIGRAM B01:%L[0],%L?[1]/%R[0]
BIGRAM B02:%L[0]/%R[0],%R?[1]
BIGRAM B03:%L[0]/%R[0],%R[1],%R?[2]
BIGRAM B04:%L[0],%L?[1]/%R[0],%R[1],%R?[2]
BIGRAM B05:%L[0]/%R[0],%R[1],%R[2],%R?[3]
BIGRAM B06:%L[0],%L?[1]/%R[0],%R[1],%R[2],%R?[3]

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/rewrite.def:
--------------------------------------------------------------------------------
[unigram rewrite]
*,*,* $1,$2,$3

[left rewrite]
*,*,* $1,$2,$3

[right rewrite]
*,*,* $1,$2,$3

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/unk.def:
--------------------------------------------------------------------------------
DEFAULT,0,0,0,unk,*,*
SPACE,0,0,0,unk,*,*
CJK,0,0,0,unk,*,*

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/README.md:
--------------------------------------------------------------------------------
Chinese Word Segmentation
====================

1. Maxent Segmenter

2. CRF Segmenter

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
YuZhen NLP Education Tools
====================

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)
--------------------------------------------------------------------------------