├── .gitignore
├── README.md
├── compute_candidate_freq.py
├── compute_freedegree.py
└── compute_solidation.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
new-words-discovery
===================

Python scripts that implement new-word discovery (新词发现).

## Usage

### 1. Feed in a corpus and count the frequency of every candidate word of 1 to 5 characters

```bash
python compute_candidate_freq.py [-h] [-r] [-o OUTPUT] corpus_file
```

With the `-r` flag, every sentence in the corpus is reversed first, and the frequencies of the reversed candidate words are counted instead.

### 2. Feed in the candidate-word frequency file and compute the solidification (凝固度) of every candidate word of 2 to 4 characters

```bash
python compute_solidation.py [-h] [-s SEPARATOR] [-f FREQ_LIMIT] [-o OUTPUT] freq_file
```

Use `-s` to set the field separator of the frequency file (default `\t`); use `-f` to compute only the candidates whose frequency is at least the given threshold (default 1).

### 3. Feed in the candidate-word frequency file and compute the right-neighbour character entropy of every candidate word of 2 to 4 characters

```bash
python compute_freedegree.py [-h] [-s SEPARATOR] [-f FREQ_LIMIT] [-r] [-o OUTPUT] freq_file
```

Use `-s` to set the field separator of the frequency file (default `\t`); use `-f` to compute only the candidates whose frequency is at least the given threshold (default 1). With `-r`, the input must be the reversed candidate-word frequency file, and the output is the left-neighbour character entropy of the forward-order candidates.

### 4. Merge the frequency file, the solidification file, and the left/right-neighbour entropy files, import the result into Excel, and select new words by setting thresholds on frequency, solidification, and free degree (neighbour entropy). A merge sketch is shown below.
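The repository does not ship a merge script for step 4. The snippet below is a minimal sketch of that step, assuming the default tab-separated outputs of the three scripts; every file name in it (`freq.txt`, `solidation.txt`, `right_entropy.txt`, `left_entropy.txt`, `merged.tsv`) is a placeholder, not a file produced by this repo.

```python
# -*- encoding: utf-8 -*-
"""Hypothetical merge step: join the per-word scores into one TSV for Excel."""
import io
from collections import defaultdict

# column name -> input file; the file names are assumptions, adjust to your -o outputs
FILES = {
    "freq": "freq.txt",                    # word \t count           (compute_candidate_freq.py)
    "solid": "solidation.txt",             # word \t count \t score  (compute_solidation.py)
    "right_entropy": "right_entropy.txt",  # word \t entropy         (compute_freedegree.py)
    "left_entropy": "left_entropy.txt",    # word \t entropy         (compute_freedegree.py -r)
}

merged = defaultdict(dict)
for column, path in FILES.items():
    with io.open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # the word is always the first field, the score of interest the last one
            merged[fields[0]][column] = fields[-1]

columns = ["freq", "solid", "right_entropy", "left_entropy"]
with io.open("merged.tsv", "w", encoding="utf-8") as out:
    out.write(u"word\t" + u"\t".join(columns) + u"\n")
    for word, scores in merged.items():
        out.write(word + u"\t" + u"\t".join(scores.get(c, u"") for c in columns) + u"\n")
```

Words that appear in only some of the files keep empty cells, so the frequency, solidification and free-degree thresholds can all be applied after importing `merged.tsv` into Excel.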
--------------------------------------------------------------------------------
/compute_candidate_freq.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
import argparse, re
from collections import Counter
"""
Extract candidate word sequences (1 to 5 characters) from a corpus and count their frequencies.
"""
parser = argparse.ArgumentParser()
parser.add_argument("corpus_file", help="corpus file")
parser.add_argument("-r", "--reverse", help="reverse the corpus", action="store_true")
parser.add_argument("-o", "--output", help="candidate sequence frequency file")

args = parser.parse_args()

src_file, des_file = args.corpus_file, args.output

with open(src_file, 'r') as fs:
    freq = Counter()
    freq_update = freq.update
    # keep only letters, digits and common CJK characters
    re_chinese = re.compile(u'[^a-zA-Z0-9\u4e00-\u9fa5]+')
    for line in fs:
        sentence = re_chinese.sub('', line.decode('utf-8').rstrip())
        # Is it reasonable to do this check inside the loop? (it could be hoisted out)
        sentence = sentence if not args.reverse else sentence[::-1]
        sen_len = len(sentence)
        # count every 1- to 5-character subsequence of the sentence
        freq_update(sentence[i:i+1] for i in xrange(sen_len-1, -1, -1))
        freq_update(sentence[i:i+2] for i in xrange(sen_len-2, -1, -1))
        freq_update(sentence[i:i+3] for i in xrange(sen_len-3, -1, -1))
        freq_update(sentence[i:i+4] for i in xrange(sen_len-4, -1, -1))
        freq_update(sentence[i:i+5] for i in xrange(sen_len-5, -1, -1))

with open(des_file, 'w') as fd:
    for key, value in freq.iteritems():
        fd.write("%s\t%d\n" % (key.encode('utf-8'), value))
--------------------------------------------------------------------------------
/compute_freedegree.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
from __future__ import division
import argparse, math
"""
Read the candidate-word frequency file and compute the right- (or left-) neighbour character entropy of each candidate word.
Since this script only computes neighbour entropy, freq_file may contain only words of two or more characters.
"""
parser = argparse.ArgumentParser()
parser.add_argument("freq_file", help="candidate words file")
parser.add_argument("-s", "--separator", help="field separator", default="\t")
parser.add_argument("-f", "--freq_limit", help="word minimum frequency", default=1, type=int)
parser.add_argument("-r", "--reverse", help="when freq_file is reversed", action="store_true")
parser.add_argument("-o", "--output", help="candidate sequence free degree (neighbour entropy) file")

args = parser.parse_args()

src_file, des_file, freq_limit = args.freq_file, args.output, args.freq_limit

def compute_entropy(neighbours):
    if neighbours:
        right_sum = sum(neighbours)
        # TODO: how could the entropy computation below be optimised?
        right_prob = map(lambda x: x/right_sum, neighbours)
        right_entropy = sum(map(lambda x: -x*math.log(x), right_prob))
        return right_entropy
    else:
        return 0

freq = {}
with open(src_file, 'r') as fs:
    # Possible optimisation: read in only the entries that meet the threshold
    # (note that the neighbour distributions below still use the counts of all extensions).
    for line in fs:
        key, count = line.decode('utf-8').rstrip().split(args.separator)
        freq[key] = int(count)

# Only words of 2, 3 or 4 characters are considered.
# If we only keep words whose frequency meets the threshold, the characters that make
# them up must meet it as well, so checking the threshold first is much more efficient.
words = {word for word, count in freq.iteritems() if count >= freq_limit and 2 <= len(word) <= 4}
right_distribution = {}
for key, count in freq.iteritems():
    length = len(key)
    # a sequence of length n contributes its count to the right-neighbour
    # distribution of its length-(n-1) prefix
    if length >= 3 and key[:length-1] in words:
        right_distribution.setdefault(key[:length-1], []).append(count)

entropies = [(word, compute_entropy(right_distribution.get(word))) for word in words]
if not args.reverse:
    result = ("%s\t%.9f\n" % (word.encode('utf-8'), entropy) for (word, entropy) in entropies)
else:
    # the freq_file was reversed, so restore the original word order in the output
    result = ("%s\t%.9f\n" % (word[::-1].encode('utf-8'), entropy) for (word, entropy) in entropies)
with open(des_file, 'w') as fd:
    fd.writelines(result)
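
# Worked example for compute_entropy above (added for illustration, not part of
# the original script): if a candidate word's right neighbours occur with counts
# [2, 1, 1], the probabilities are [0.5, 0.25, 0.25] and the entropy (natural log)
# is -(0.5*log(0.5) + 0.25*log(0.25) + 0.25*log(0.25)) ≈ 1.0397.
# A higher value means the word is followed by more varied characters, i.e. it is
# "freer" on that side and more likely to end at a genuine word boundary.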
--------------------------------------------------------------------------------
/compute_solidation.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
from __future__ import division
import argparse
"""
Read the candidate-word frequency file and compute the solidification (internal cohesion) of each candidate word.
"""
parser = argparse.ArgumentParser()
parser.add_argument("freq_file", help="candidate words file")
parser.add_argument("-s", "--separator", help="field separator", default="\t")
parser.add_argument("-f", "--freq_limit", help="word minimum frequency", default=1, type=int)
parser.add_argument("-o", "--output", help="candidate sequence solidification file")

args = parser.parse_args()

src_file, des_file, freq_limit = args.freq_file, args.output, args.freq_limit

def compute_ninggudu(word, freq, freq_limit):
    """Solidification = count(word) / (count(left part) * count(right part)),
    minimised over every binary split of the word; freq_limit is the fallback
    count for sub-sequences missing from the frequency file."""
    w_freq = freq.get(word, freq_limit)
    length = len(word)
    ninggudu = 0
    if length == 2:
        word1, word2 = word
        ninggudu = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
    elif length == 3:
        word1, word2 = word[:2], word[2:3]
        ninggudu1 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        word1, word2 = word[:1], word[1:3]
        ninggudu2 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        ninggudu = min(ninggudu1, ninggudu2)
    elif length == 4:
        word1, word2 = word[:1], word[1:4]
        ninggudu1 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        word1, word2 = word[:2], word[2:4]
        ninggudu2 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        word1, word2 = word[:3], word[3:4]
        ninggudu3 = w_freq / (freq.get(word1, freq_limit)*freq.get(word2, freq_limit))
        ninggudu = min(ninggudu1, ninggudu2, ninggudu3)
    return ninggudu

with open(src_file, 'r') as fs:
    # Possible optimisation: read in only the words that meet the threshold
    # (the denominators above still need the counts of low-frequency sub-sequences, though).
    freq = {}
    for line in fs:
        key, count = line.decode('utf-8').rstrip().split(args.separator)
        freq[key] = int(count)

with open(des_file, 'w') as fd:
    # Only words of 2, 3 or 4 characters are considered.
    # If we only keep words whose frequency meets the threshold, the characters that make
    # them up must meet it as well, so checking the threshold first is much more efficient.
    words = (word for word, count in freq.iteritems() if count >= freq_limit and 2 <= len(word) <= 4)
    ninggudus = [(word, freq[word], compute_ninggudu(word, freq, freq_limit)) for word in words]
    result = ("%s\t%d\t%.9f\n" % (word.encode('utf-8'), w_freq, ninggudu) for (word, w_freq, ninggudu) in ninggudus)
    fd.writelines(result)

--------------------------------------------------------------------------------