├── README.md
├── collection_mi.py
└── data
    ├── data.txt
    └── result.txt

/README.md:
--------------------------------------------------------------------------------
# WordCollocation
A self-implemented word collocation (词语搭配) extractor using the mutual information (MI) method, tested to be effective.
# Principle
Mutual information measures the degree of mutual dependence between two variables. Pointwise mutual information quantifies the association between two events: the higher the MI value, the stronger the association between X and Y, and the more likely it is that X and Y form a phrase; conversely, the lower the MI value, the weaker the association between X and Y, and the more likely it is that a phrase boundary lies between them.
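A minimal worked example of this score (the counts below are made-up toy numbers, not taken from the corpus): with p(x) and p(y) the unigram probabilities and p(x, y) the co-occurrence probability, the collocation strength is log2(p(x, y) / (p(x)p(y))).

```python
import math

def pmi(p_xy, p_x, p_y):
    # pointwise mutual information in bits: log2(p(x,y) / (p(x) * p(y)))
    return math.log2(p_xy) - math.log2(p_x) - math.log2(p_y)

# toy counts: x appears 50 times, y appears 30 times,
# and they co-occur 20 times in a corpus of 10,000 tokens
print(pmi(20 / 10000, 50 / 10000, 30 / 10000))  # ≈ 7.06 bits
```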
# Uses
The mutual dependence between two words can be used to find a word's common collocations. This has several uses:
1. Building a word-collocation knowledge base, which can be used for input phrase recommendation.
2. Supporting the semantic characterization and representation of words: if collocation strength is used as the weight in a word-word matrix, it can be used to compute the similarity between two words.
3. Given a historical corpus, diachronic collocations can be used to monitor changes in word meaning over time.
# Extraction steps
step 1/6: build corpus ..........
step 2/6: compute worddict..........
step 3/6: build cowords..........
step 4/6: compute coinfos..........
step 5/6: compute words mi..........
step 6/6: save words mi..........
# Input and output
1) Input:
1. 10,000 documents, one document per line, stored in './data/data.txt'
2) Output:
1. Format: word TAB collocate1@strength1,collocate2@strength2
2. Results are saved in './data/result.txt'
3) Parameters:
1. window_size: defaults to 5 (5 words on each side), used as the word co-occurrence window
# Results
Training on a corpus of 10,000 documents/sentences gives results such as the following:
word:陷入
word collocations Top 10:
不由得@18.05618124455273
两难@17.83378882321628
林林总总@17.57075441738249
不怎么@17.248826322495123
误区@17.248826322495123
市面上@17.248826322495123
失落@16.386329846245058
困境@15.83378882321628
母亲@15.511860728328918
常@15.471218743831571

word:乐于
word collocations Top 10:
吃苦耐劳@19.57075441738249
奉献@18.57075441738249
事业心@18.57075441738249
作风@18.248826322495123
务实@17.418751323937435
责任感@17.248826322495123
Kevin@17.248826322495123
政治素质@16.248826322495123
热爱@15.959319705300139
客户@15.734253149665365

/collection_mi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8
# File: collection_mi.py
# Author: lhy
# Date: 18-3-26

import collections
import math

import jieba.posseg as pseg


class MI_Train:
    def __init__(self, window_size, filepath, mipath):
        self.window_size = window_size  # words kept on each side of the target word
        self.filepath = filepath        # input corpus, one document per line
        self.mipath = mipath            # output path for the MI results

    # Preprocess the corpus: segment each line with jieba and drop tokens whose
    # POS tag starts with x, w, p, u or c (punctuation and function words).
    def build_corpus(self):
        def cut_words(sent):
            return [word.word for word in pseg.cut(sent) if word.flag[0] not in ['x', 'w', 'p', 'u', 'c']]
        # For an English corpus, use whitespace tokenization instead:
        # sentences = [sent.split(' ') for sent in open(self.filepath).read().split('\n')]
        with open(self.filepath, encoding='utf-8') as f:
            sentences = [cut_words(sent) for sent in f.read().split('\n')]
        return sentences

    # Count term frequencies over the whole corpus.
    def count_words(self, sentences):
        words_all = list()
        for sent in sentences:
            words_all.extend(sent)
        word_dict = dict(collections.Counter(words_all))
        return word_dict, len(words_all)

    # Build the co-occurrence windows: for each token, collect up to
    # window_size words on each side plus the token itself.
    def build_cowords(self, sentences):
        train_data = list()
        for sent in sentences:
            for index, word in enumerate(sent):
                left = sent[max(index - self.window_size, 0): index]
                right = sent[index + 1: index + self.window_size + 1]
                data = [w for w in left + right + [word] if w]
                train_data.append(data)
        return train_data

    # Count co-occurrences: pair every word in a window with every word in the
    # same window (self-pairs are filtered out later in compute_mi).
    def count_cowords(self, train_data):
        co_dict = collections.defaultdict(list)
        print(len(train_data))
        for data in train_data:
            for word_pre in data:
                for word_post in data:
                    co_dict[word_pre].append(word_post)
        return co_dict

    # Compute the mutual information between each word and its co-occurring words.
    def compute_mi(self, word_dict, co_dict, sum_tf):
        def pmi(p1, p2, p12):
            # pointwise mutual information: log2(p12 / (p1 * p2))
            return math.log2(p12) - math.log2(p1) - math.log2(p2)

        mis_dict = dict()
        for word, co_words in co_dict.items():
            co_word_dict = collections.Counter(co_words)
            mi_dict = {}
            for co_word, co_tf in co_word_dict.items():
                if co_word == word:
                    continue
                p1 = word_dict[word] / sum_tf
                p2 = word_dict[co_word] / sum_tf
                p12 = co_tf / sum_tf
                mi_dict[co_word] = pmi(p1, p2, p12)
            mis_dict[word] = sorted(mi_dict.items(), key=lambda item: item[1], reverse=True)
        return mis_dict

    # Save the MI results, one word per line: word TAB co1@mi1,co2@mi2,...
    def save_mi(self, mis_dict):
        with open(self.mipath, 'w', encoding='utf-8') as f:
            for word, co_words in mis_dict.items():
                co_infos = [item[0] + '@' + str(item[1]) for item in co_words]
                f.write(word + '\t' + ','.join(co_infos) + '\n')

    # Main pipeline.
    def mi_main(self):
        print('step 1/6: build corpus ..........')
        sentences = self.build_corpus()
        print('step 2/6: compute worddict..........')
        word_dict, sum_tf = self.count_words(sentences)
        print('step 3/6: build cowords..........')
        train_data = self.build_cowords(sentences)
        print('step 4/6: compute coinfos..........')
        co_dict = self.count_cowords(train_data)
        print('step 5/6: compute words mi..........')
        mi_data = self.compute_mi(word_dict, co_dict, sum_tf)
        print('step 6/6: save words mi..........')
        self.save_mi(mi_data)
        print('done!.......')

# Test entry point.
def test():
    filepath = './data/data.txt'
    mipath = './data/result.txt'
    window_size = 5
    mier = MI_Train(window_size, filepath, mipath)
    mier.mi_main()

if __name__ == '__main__':
    test()
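
# --- Illustrative usage sketch (not part of the original repo) ---
# Given the output format documented in the README (word TAB co1@mi1,co2@mi2,...),
# a result file written by save_mi() could be read back as below. The function
# name load_mi and its top_n parameter are hypothetical, added for illustration.
def load_mi(mipath, top_n=10):
    collocations = dict()
    with open(mipath, encoding='utf-8') as f:
        for line in f:
            word, _, co_infos = line.rstrip('\n').partition('\t')
            if not co_infos:
                continue
            # each entry looks like '困境@15.83378882321628'
            pairs = [info.rsplit('@', 1) for info in co_infos.split(',') if '@' in info]
            collocations[word] = [(co, float(mi)) for co, mi in pairs][:top_n]
    return collocations

# With the README's sample results, load_mi('./data/result.txt').get('陷入') would
# start with [('不由得', 18.05618124455273), ('两难', 17.83378882321628), ...]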