├── .gitignore
├── README.md
├── compute_candidate_freq.py
├── compute_freedegree.py
└── compute_solidation.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
new-words-discovery
===================

Python scripts that implement new-word discovery (新词发现).

## Usage

### 1. Feed in a corpus and count the frequency of every candidate word of 1 to 5 characters

```bash
python compute_candidate_freq.py [-h] [-r] [-o OUTPUT] corpus_file
```

With the `-r` flag, every sentence in the corpus is reversed first, and the frequencies of the reversed candidate words are counted instead.

### 2. Feed in the candidate-word frequency file and compute the solidification (凝固度) of every candidate word of 2 to 4 characters

```bash
python compute_solidation.py [-h] [-s SEPARATOR] [-f FREQ_LIMIT] [-o OUTPUT] freq_file
```

Use `-s` to set the field separator of the frequency file (default `\t`); use `-f` to compute only the candidates whose frequency is at least the given threshold (default 1).

### 3. Feed in the candidate-word frequency file and compute the right-neighbour character entropy of every candidate word of 2 to 4 characters

```bash
python compute_freedegree.py [-h] [-s SEPARATOR] [-f FREQ_LIMIT] [-r] [-o OUTPUT] freq_file
```

Use `-s` to set the field separator of the frequency file (default `\t`); use `-f` to compute only the candidates whose frequency is at least the given threshold (default 1). With `-r`, the input must be the reversed candidate-word frequency file, and the output is the left-neighbour character entropy of the forward-order candidates.

### 4. Merge the frequency file, the solidification file, and the left/right-neighbour entropy files, import the result into Excel, and select new words by setting thresholds on frequency, solidification, and free degree (neighbour entropy). A merge sketch is shown below.
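The repository does not ship a merge script for step 4. The snippet below is a minimal sketch of that step, assuming the default tab-separated outputs of the three scripts; every file name in it (`freq.txt`, `solidation.txt`, `right_entropy.txt`, `left_entropy.txt`, `merged.tsv`) is a placeholder, not a file produced by this repo.

```python
# -*- encoding: utf-8 -*-
"""Hypothetical merge step: join the per-word scores into one TSV for Excel."""
import io
from collections import defaultdict

# column name -> input file; the file names are assumptions, adjust to your -o outputs
FILES = {
    "freq": "freq.txt",                    # word \t count           (compute_candidate_freq.py)
    "solid": "solidation.txt",             # word \t count \t score  (compute_solidation.py)
    "right_entropy": "right_entropy.txt",  # word \t entropy         (compute_freedegree.py)
    "left_entropy": "left_entropy.txt",    # word \t entropy         (compute_freedegree.py -r)
}

merged = defaultdict(dict)
for column, path in FILES.items():
    with io.open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # the word is always the first field, the score of interest the last one
            merged[fields[0]][column] = fields[-1]

columns = ["freq", "solid", "right_entropy", "left_entropy"]
with io.open("merged.tsv", "w", encoding="utf-8") as out:
    out.write(u"word\t" + u"\t".join(columns) + u"\n")
    for word, scores in merged.items():
        out.write(word + u"\t" + u"\t".join(scores.get(c, u"") for c in columns) + u"\n")
```

Words that appear in only some of the files keep empty cells, so the frequency, solidification and free-degree thresholds can all be applied after importing `merged.tsv` into Excel.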
--------------------------------------------------------------------------------
/compute_candidate_freq.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
import argparse, re
from collections import Counter
"""
Extract candidate word sequences (1 to 5 characters) from a corpus and count their frequencies.
"""
parser = argparse.ArgumentParser()
parser.add_argument("corpus_file", help="corpus file")
parser.add_argument("-r", "--reverse", help="reverse the corpus", action="store_true")
parser.add_argument("-o", "--output", help="candidate sequence frequency file")

args = parser.parse_args()

src_file, des_file = args.corpus_file, args.output

with open(src_file, 'r') as fs:
    freq = Counter()
    freq_update = freq.update
    # keep only letters, digits and common CJK characters
    re_chinese = re.compile(u'[^a-zA-Z0-9\u4e00-\u9fa5]+')
    for line in fs:
        sentence = re_chinese.sub('', line.decode('utf-8').rstrip())
        # Is it reasonable to do this check inside the loop? (it could be hoisted out)
        sentence = sentence if not args.reverse else sentence[::-1]
        sen_len = len(sentence)
        # count every 1- to 5-character subsequence of the sentence
        freq_update(sentence[i:i+1] for i in xrange(sen_len-1, -1, -1))
        freq_update(sentence[i:i+2] for i in xrange(sen_len-2, -1, -1))
        freq_update(sentence[i:i+3] for i in xrange(sen_len-3, -1, -1))
        freq_update(sentence[i:i+4] for i in xrange(sen_len-4, -1, -1))
        freq_update(sentence[i:i+5] for i in xrange(sen_len-5, -1, -1))

with open(des_file, 'w') as fd:
    for key, value in freq.iteritems():
        fd.write("%s\t%d\n" % (key.encode('utf-8'), value))
--------------------------------------------------------------------------------
/compute_freedegree.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
from __future__ import division
import argparse, math
"""
Read the candidate-word frequency file and compute the right- (or left-) neighbour character entropy of each candidate word.
Since this script only computes neighbour entropy, freq_file may contain only words of two or more characters.
"""
parser = argparse.ArgumentParser()
parser.add_argument("freq_file", help="candidate words file")
parser.add_argument("-s", "--separator", help="field separator", default="\t")
parser.add_argument("-f", "--freq_limit", help="word minimum frequency", default=1, type=int)
parser.add_argument("-r", "--reverse", help="when freq_file is reversed", action="store_true")
parser.add_argument("-o", "--output", help="candidate sequence free degree (neighbour entropy) file")

args = parser.parse_args()

src_file, des_file, freq_limit = args.freq_file, args.output, args.freq_limit

def compute_entropy(neighbours):
    if neighbours:
        right_sum = sum(neighbours)
        # TODO: how could the entropy computation below be optimised?
        right_prob = map(lambda x: x/right_sum, neighbours)
        right_entropy = sum(map(lambda x: -x*math.log(x), right_prob))
        return right_entropy
    else:
        return 0

freq = {}
with open(src_file, 'r') as fs:
    # Possible optimisation: read in only the entries that meet the threshold
    # (note that the neighbour distributions below still use the counts of all extensions).
    for line in fs:
        key, count = line.decode('utf-8').rstrip().split(args.separator)
        freq[key] = int(count)

# Only words of 2, 3 or 4 characters are considered.
# If we only keep words whose frequency meets the threshold, the characters that make
# them up must meet it as well, so checking the threshold first is much more efficient.
words = {word for word, count in freq.iteritems() if count >= freq_limit and 2 <= len(word) <= 4}
right_distribution = {}
for key, count in freq.iteritems():
    length = len(key)
    # a sequence of length n contributes its count to the right-neighbour
    # distribution of its length-(n-1) prefix
    if length >= 3 and key[:length-1] in words:
        right_distribution.setdefault(key[:length-1], []).append(count)

entropies = [(word, compute_entropy(right_distribution.get(word))) for word in words]
if not args.reverse:
    result = ("%s\t%.9f\n" % (word.encode('utf-8'), entropy) for (word, entropy) in entropies)
else:
    # the freq_file was reversed, so restore the original word order in the output
    result = ("%s\t%.9f\n" % (word[::-1].encode('utf-8'), entropy) for (word, entropy) in entropies)
with open(des_file, 'w') as fd:
    fd.writelines(result)
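
# Worked example for compute_entropy above (added for illustration, not part of
# the original script): if a candidate word's right neighbours occur with counts
# [2, 1, 1], the probabilities are [0.5, 0.25, 0.25] and the entropy (natural log)
# is -(0.5*log(0.5) + 0.25*log(0.25) + 0.25*log(0.25)) ≈ 1.0397.
# A higher value means the word is followed by more varied characters, i.e. it is
# "freer" on that side and more likely to end at a genuine word boundary.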
--------------------------------------------------------------------------------
/compute_solidation.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
from __future__ import division
import argparse
"""
Read the candidate-word frequency file and compute the solidification (internal cohesion) of each candidate word.
"""
parser = argparse.ArgumentParser()
parser.add_argument("freq_file", help="candidate words file")
parser.add_argument("-s", "--separator", help="field separator", default="\t")
parser.add_argument("-f", "--freq_limit", help="word minimum frequency", default=1, type=int)
parser.add_argument("-o", "--output", help="candidate sequence solidification file")

args = parser.parse_args()

src_file, des_file, freq_limit = args.freq_file, args.output, args.freq_limit

def compute_ninggudu(word, freq, freq_limit):
    """Solidification = count(word) / (count(left part) * count(right part)),
    minimised over every binary split of the word; freq_limit is the fallback
    count for sub-sequences missing from the frequency file."""
    w_freq = freq.get(word, freq_limit)
    length = len(word)
    ninggudu = 0
    if length == 2:
        word1, word2 = word
        ninggudu = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
    elif length == 3:
        word1, word2 = word[:2], word[2:3]
        ninggudu1 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        word1, word2 = word[:1], word[1:3]
        ninggudu2 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        ninggudu = min(ninggudu1, ninggudu2)
    elif length == 4:
        word1, word2 = word[:1], word[1:4]
        ninggudu1 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        word1, word2 = word[:2], word[2:4]
        ninggudu2 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        word1, word2 = word[:3], word[3:4]
        ninggudu3 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        ninggudu = min(ninggudu1, ninggudu2, ninggudu3)
    return ninggudu

with open(src_file, 'r') as fs:
    # Possible optimisation: read in only the words that meet the threshold
    # (the denominators above still need the counts of low-frequency sub-sequences, though).
    freq = {}
    for line in fs:
        key, count = line.decode('utf-8').rstrip().split(args.separator)
        freq[key] = int(count)

with open(des_file, 'w') as fd:
    # Only words of 2, 3 or 4 characters are considered.
    # If we only keep words whose frequency meets the threshold, the characters that make
    # them up must meet it as well, so checking the threshold first is much more efficient.
    words = (word for word, count in freq.iteritems() if count >= freq_limit and 2 <= len(word) <= 4)
    ninggudus = [(word, freq[word], compute_ninggudu(word, freq, freq_limit)) for word in words]
    result = ("%s\t%d\t%.9f\n" % (word.encode('utf-8'), w_freq, ninggudu) for (word, w_freq, ninggudu) in ninggudus)
    fd.writelines(result)

--------------------------------------------------------------------------------