├── CLPT
│   ├── README.md
│   └── WordSegmentation
│       ├── CRFSegment
│       │   ├── README.md
│       │   ├── crf_data_2_word.py
│       │   ├── crf_segmenter.py
│       │   ├── make_crf_test_data.py
│       │   └── make_crf_train_data.py
│       ├── MaxentSegment
│       │   ├── README.md
│       │   ├── character_2_word.py
│       │   ├── character_split.py
│       │   └── character_tagging.py
│       ├── MeCab
│       │   ├── script
│       │   │   ├── make_mecab_seed_data.py
│       │   │   └── make_mecab_train_data.py
│       │   └── seed
│       │       ├── char.def
│       │       ├── dicrc
│       │       ├── feature.def
│       │       ├── rewrite.def
│       │       └── unk.def
│       └── README.md
└── README.md

/CLPT/README.md:
--------------------------------------------------------------------------------
Chinese Language Processing Tools (CLPT)
====================

1. Chinese Word Segmentation

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/README.md:
--------------------------------------------------------------------------------
Character-based Word Segmentation by Conditional Random Fields (CRF)
====================

This example uses the CRF++ toolkit; the details can be found in 52nlp's article: http://www.52nlp.cn/?p=6339
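
A typical end-to-end run looks like the sketch below (the file names `template`, `train.txt`, `test.txt`, `crf_model`, `test.rst`, and `test_result.txt` are placeholders; the CRF++ binaries `crf_learn` and `crf_test` must be on your PATH):

    python make_crf_train_data.py train.txt train.data
    crf_learn -f 3 -c 4.0 template train.data crf_model
    python make_crf_test_data.py test.txt test.data
    crf_test -m crf_model test.data > test.rst
    python crf_data_2_word.py test.rst test_result.txt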

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/crf_data_2_word.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_2_word(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        if line == "\n":
            output_data.write("\n")
        else:
            # each CRF++ output line is: character, dummy tag, predicted tag
            char_tag_pair = line.strip().split('\t')
            char = char_tag_pair[0]
            tag = char_tag_pair[2]
            if tag == 'B':
                output_data.write(' ' + char)
            elif tag == 'M':
                output_data.write(char)
            elif tag == 'E':
                output_data.write(char + ' ')
            else:  # tag == 'S'
                output_data.write(' ' + char + ' ')
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python crf_data_2_word.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_2_word(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/crf_segmenter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# CRF Segmenter based on character tagging:
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

import CRFPP

def crf_segmenter(input_file, output_file, tagger):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        tagger.clear()
        for word in line.strip():
            word = word.strip()
            if word:
                # "o" and "B" are dummy columns; only the character is used
                tagger.add((word + "\to\tB").encode('utf-8'))
        tagger.parse()
        size = tagger.size()
        xsize = tagger.xsize()
        for i in range(0, size):
            for j in range(0, xsize):
                char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)
                if tag == 'B':
                    output_data.write(' ' + char)
                elif tag == 'M':
                    output_data.write(char)
                elif tag == 'E':
                    output_data.write(char + ' ')
                else:  # tag == 'S'
                    output_data.write(' ' + char + ' ')
        output_data.write('\n')
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Please use: python crf_segmenter.py model input output"
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    tagger = CRFPP.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/make_crf_test_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_split(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        for word in line.strip():
            word = word.strip()
            if word:
                # "B" is a dummy tag column; crf_test appends the real one
                output_data.write(word + "\tB\n")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python make_crf_test_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_split(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/CRFSegment/make_crf_train_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "\tS\n")
            else:
                output_data.write(word[0] + "\tB\n")
                for w in word[1:len(word)-1]:
                    output_data.write(w + "\tM\n")
                output_data.write(word[len(word)-1] + "\tE\n")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python make_crf_train_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_tagging(input_file, output_file)
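
# A worked example of the tagging above (illustrative input, not from any
# corpus): the pre-segmented line "我们 是 中国 人" becomes one tab-separated
# character/tag pair per line, with a blank line ending the sentence:
#
#   我	B
#   们	E
#   是	S
#   中	B
#   国	E
#   人	S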

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/README.md:
--------------------------------------------------------------------------------
Character-based Word Segmentation by Maximum Entropy (Maxent) Modeling
====================

This example uses Zhang Le's Maxent toolkit (Maximum Entropy Modeling Toolkit for Python and C++); the details can be found in 52nlp's article: http://www.52nlp.cn/?p=5682
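
A typical run looks like the sketch below (file names are placeholders; the model training and tagging steps in the middle are done with the Maxent toolkit as described in the article, so they appear only as comments):

    python character_tagging.py train.txt tagged_train.data
    # train a maxent model on tagged_train.data with the toolkit
    python character_split.py test.txt split_test.data
    # tag split_test.data with the trained model, producing char/tag pairs (tagged_test.data)
    python character_2_word.py tagged_test.data seg_result.txt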

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/character_2_word.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# Combine characters back into words based on the 4-tag tagging info

import codecs
import sys

def character_2_word(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    # 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)
    for line in input_data.readlines():
        char_tag_list = line.strip().split()
        for char_tag in char_tag_list:
            # each token from the maxent tagger is "character/tag"
            char_tag_pair = char_tag.split('/')
            char = char_tag_pair[0]
            tag = char_tag_pair[1]
            if tag == 'B':
                output_data.write(' ' + char)
            elif tag == 'M':
                output_data.write(char)
            elif tag == 'E':
                output_data.write(char + ' ')
            else:  # tag == 'S'
                output_data.write(' ' + char + ' ')
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python character_2_word.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_2_word(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/character_split.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# Split Chinese text into single characters and add a space between them

import codecs
import sys

def character_split(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        for word in line.strip():
            output_data.write(word + " ")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python character_split.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_split(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MaxentSegment/character_tagging.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_tagging(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        for word in word_list:
            if len(word) == 1:
                output_data.write(word + "/S ")
            else:
                output_data.write(word[0] + "/B ")
                for w in word[1:len(word)-1]:
                    output_data.write(w + "/M ")
                output_data.write(word[len(word)-1] + "/E ")
        output_data.write("\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Please use: python character_tagging.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    character_tagging(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/script/make_mecab_seed_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2015 @ YuZhen Technology

import codecs
import sys

def make_mecab_seed_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word = line.strip()
        # MeCab seed dictionary CSV: surface plus placeholder id/cost/feature fields
        output_data.write(word + ",0,0,0,0,0,0\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print "Please use: python make_mecab_seed_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    make_mecab_seed_data(input_file, output_file)

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/script/make_mecab_train_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2015 @ YuZhen Technology

import codecs
import sys

def make_mecab_train_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        word_list = line.strip().split()
        if len(word_list) == 0:
            continue
        for word in word_list:
            output_data.write(word + "\t0,0,0,0,0,0\n")
        output_data.write("EOS\n")
    input_data.close()
    output_data.close()

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print "Please use: python make_mecab_train_data.py input output"
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    make_mecab_train_data(input_file, output_file)
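
# The output is MeCab's training corpus format: one "surface<TAB>features"
# line per token and an "EOS" line after each sentence, e.g. (illustrative):
#
#   我们	0,0,0,0,0,0
#   是	0,0,0,0,0,0
#   EOS
#
# A sketch of the downstream training steps (an assumption, not part of this
# repo: the tools below ship with the MeCab distribution, usually under
# libexec, and exact flags vary by version; check the MeCab documentation):
#
#   mecab-dict-index -d seed -o seed -f utf-8 -t utf-8
#   mecab-cost-train -d seed train.corpus model
#   mecab-dict-gen -d seed -m model -o final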

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/char.def:
--------------------------------------------------------------------------------
DEFAULT 0 1 0  # DEFAULT is a mandatory category!
SPACE   0 1 0
CJK     0 0 2

# SPACE
0x0020 SPACE  # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
0x00D0 SPACE
0x0009 SPACE
0x000B SPACE
0x000A SPACE

#CJK
0x4E00..0x9FCB CJK

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/dicrc:
--------------------------------------------------------------------------------
cost-factor = 800
bos-feature = BOS/EOS,*,*,*,*,*,*,*,*
eval-size = 6
unk-eval-size = 4
config-charset = UTF-8

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/feature.def:
--------------------------------------------------------------------------------
UNIGRAM W0:%F[6]
UNIGRAM W1:%F[0]/%F[6]
UNIGRAM W2:%F[0],%F?[1]/%F[6]
UNIGRAM W3:%F[0],%F[1],%F?[2]/%F[6]
UNIGRAM W4:%F[0],%F[1],%F[2],%F?[3]/%F[6]

UNIGRAM T0:%t
UNIGRAM T1:%F[0]/%t
UNIGRAM T2:%F[0],%F?[1]/%t
UNIGRAM T3:%F[0],%F[1],%F?[2]/%t
UNIGRAM T4:%F[0],%F[1],%F[2],%F?[3]/%t

BIGRAM B00:%L[0]/%R[0]
BIGRAM B01:%L[0],%L?[1]/%R[0]
BIGRAM B02:%L[0]/%R[0],%R?[1]
BIGRAM B03:%L[0]/%R[0],%R[1],%R?[2]
BIGRAM B04:%L[0],%L?[1]/%R[0],%R[1],%R?[2]
BIGRAM B05:%L[0]/%R[0],%R[1],%R[2],%R?[3]
BIGRAM B06:%L[0],%L?[1]/%R[0],%R[1],%R[2],%R?[3]

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/rewrite.def:
--------------------------------------------------------------------------------
[unigram rewrite]
*,*,* $1,$2,$3

[left rewrite]
*,*,* $1,$2,$3

[right rewrite]
*,*,* $1,$2,$3

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/MeCab/seed/unk.def:
--------------------------------------------------------------------------------
DEFAULT,0,0,0,unk,*,*
SPACE,0,0,0,unk,*,*
CJK,0,0,0,unk,*,*

--------------------------------------------------------------------------------
/CLPT/WordSegmentation/README.md:
--------------------------------------------------------------------------------
Chinese Word Segmentation
====================

1. Maxent Segmenter

2. CRF Segmenter

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
YuZhen NLP Education Tools
====================

NLP Education Tools by YuZhen Technology Ltd. (www.yuzhenkeji.com)
--------------------------------------------------------------------------------