├── README.md
├── Response_words.py
├── ac_ahocorasick.py
├── centrality.py
├── createNetwork.py
├── drawNetwork.py
├── getspeaker.py
└── op_main.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Opinion-leaders-mining

An opinion leader is an important node in social networks and social media, and a key factor in information diffusion. It is difficult to identify opinion leaders among the large number of members who participate in different topics in a QQ group. This project proposes a new approach, based on the response relationships within a QQ group, to mine opinion leaders. First, we build a response-word library; next, we build the user-response relationship network with the Aho-Corasick string-matching algorithm; finally, we identify the opinion leaders by analyzing the node statistics of users in the resulting social network. Experimental results show that this method mines the opinion leaders of QQ group data with high accuracy, and that merging the node-importance features of the user-interaction network with the QQ group information achieves an even better opinion-leader mining effect.
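## Usage

A minimal end-to-end sketch of how the modules fit together (assuming Python 2 with `networkx`, `jieba`, and `matplotlib` installed, and an exported QQ chat log named `2011tx3.txt`, the file name hard-coded in the scripts, next to the code):

```python
from getspeaker import important_speakers
from createNetwork import network_wuquan, network_jiaquan
from centrality import degree_centrality, PageRank

# Predicted opinion leaders: the sponsors of the important topics.
sponsors = important_speakers('2011tx3.txt')

# Unweighted and weighted directed user-response networks.
G0 = network_wuquan()
G1 = network_jiaquan()

# Rank users by node importance in the weighted network.
print degree_centrality(G1)
print PageRank(G1)
```

`op_main.py` runs essentially these steps and reports how well each centrality ranking matches the list of topic sponsors.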
--------------------------------------------------------------------------------
/Response_words.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from ac_ahocorasick import *
import jieba
import jieba.analyse
import re

def contentinbracket(c):  # extract the QQ number inside parentheses
    if c.endswith(')'):
        m = re.search(r'\((.*)\)', c)
        if m:
            return m.group(1)
    return 'null'

def MatchResult(textFile):  # text preprocessing
    key_name1 = []   # per-topic speaker lists
    key_name2 = []   # per-topic {speaker: keyword} dicts
    key_name = []
    value_line = []
    dic = {}
    ac = UnicodeAcAutomation()  # initialize the matcher
    try:
        with open(textFile, 'r') as message:
            for line in message:
                # topic separator
                if line.startswith('-----'):
                    key_name1.append(key_name)
                    key_name2.append(dic)
                    key_name = []
                    value_line = []
                    continue
                # extract the speaker (message lines start with a timestamp)
                if line.startswith('201'):
                    time = re.findall(r'\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2}:\d{2}', line)
                    dele_time = line.replace(time[0], '').strip('\n')
                    dele_time = contentinbracket(dele_time)
                    key_name.append(dele_time)
                # extract the content belonging to that speaker
                elif not line.startswith('\n'):
                    jieba.analyse.set_idf_path('idf.txt.big')
                    hh = jieba.analyse.extract_tags(line, topK=1)
                    rr = ' '.join(hh).encode('utf-8').decode('utf-8')
                    value_line.append(rr)
                    dic = dict(zip(key_name, value_line))
    except EnvironmentError:
        print 'oops'

    # match response words
    for dic in key_name2:
        for key, val in dic.items():
            ac.insert(val)
        ac.build_automation()
        with open('Responsice_words.txt', 'r') as word:
            for i in word:
                keys = jieba.analyse.extract_tags(i, topK=1)
                keywords = ' '.join(keys).encode('utf-8').decode('utf-8')
                tuple_words = ac.matchOne(keywords)
                if tuple_words[1] != None:
                    dic[key] = tuple_words[1]  # store the matched word
    return key_name1

# shape the data into per-topic lists of (responder, sponsor, weight) tuples
def JiaQuan():
    list1 = []
    list3 = []
    data = MatchResult('2011tx3.txt')
    for dic in data:
        list1.append(dic)
    for li in list1:
        list2 = []
        for i in range(len(li)):
            if i <= 1:
                # the sponsor and the first responder get a heavier weight
                list2.append((li[i], li[0], 3.0))
            else:
                list2.append((li[i], li[0], 1.0))
        list3.append(list2)
    # skip the empty segment produced before the first topic separator
    listdata = list3[1:]
    return listdata

# same shaping, without edge weights
def WuQuan():
    list1 = []
    list3 = []
    data = MatchResult('2011tx3.txt')
    for dic in data:
        list1.append(dic)
    for li in list1:
        list2 = [(li[i], li[0]) for i in range(len(li))]
        list3.append(list2)
    return list3[1:]

--------------------------------------------------------------------------------
/ac_ahocorasick.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-

KIND = 16  # alphabet size: input is re-encoded as 4-bit nibbles, so 16 symbols

class Node():
    static = 0  # global node counter

    def __init__(self):
        self.fail = None            # failure link
        self.next = [None] * KIND   # children, indexed by symbol
        self.end = False            # True if a pattern ends at this node
        self.word = None            # the pattern that ends here
        Node.static += 1

class AcAutomation():
    def __init__(self):
        self.root = Node()
        self.queue = []

    def getIndex(self, char):
        return ord(char)  # integer code of the symbol

    def insert(self, string):
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] == None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True
        p.word = string

    def build_automation(self):
        # breadth-first traversal of the trie to set the failure links
        self.root.fail = None
        self.queue.append(self.root)
        while len(self.queue) != 0:
            parent = self.queue[0]
            self.queue.pop(0)
            for i, child in enumerate(parent.next):
                if child == None:
                    continue
                if parent == self.root:
                    child.fail = self.root
                else:
                    failp = parent.fail
                    while failp != None:
                        if failp.next[i] != None:
                            child.fail = failp.next[i]
                            break
                        failp = failp.fail
                    if failp == None:
                        child.fail = self.root
                self.queue.append(child)

    def matchOne(self, string):
        # return (True, pattern) for the first pattern found in string, else (False, None)
        p = self.root
        for char in string:
            index = self.getIndex(char)
            while p.next[index] == None and p != self.root:
                p = p.fail
            if p.next[index] == None:
                p = self.root
            else:
                p = p.next[index]
            if p.end:
                return True, p.word
        return False, None


class UnicodeAcAutomation():
    def __init__(self, encoding='utf-8'):
        self.ac = AcAutomation()
        self.encoding = encoding

    def getAcString(self, string):
        # split each encoded byte into two 4-bit symbols (low nibble first)
        string = bytearray(string.encode(self.encoding))
        ac_string = ''
        for byte in string:
            ac_string += chr(byte % 16)
            ac_string += chr(byte / 16)
        return ac_string

    def insert(self, string):
        if type(string) != unicode:
            raise Exception('UnicodeAcAutomation:: insert type not unicode')
        ac_string = self.getAcString(string)
        self.ac.insert(ac_string)

    def build_automation(self):
        self.ac.build_automation()

    def matchOne(self, string):
        if type(string) != unicode:
            raise Exception('UnicodeAcAutomation:: matchOne type not unicode')
        ac_string = self.getAcString(string)
        retcode, ret = self.ac.matchOne(ac_string)
        if ret != None:
            # reassemble the nibble pairs into bytes and decode
            s = ''
            for i in range(len(ret) / 2):
                s += chr(ord(ret[2 * i]) + ord(ret[2 * i + 1]) * 16)
            ret = s.decode(self.encoding)
        return retcode, ret
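# A minimal self-test sketch of the nibble-encoded automaton (assumed usage;
# the pattern words below are hypothetical examples, not taken from the
# project's response-word list):
if __name__ == '__main__':
    ac = UnicodeAcAutomation()
    for w in [u'谢谢', u'同意', u'支持']:
        ac.insert(w)
    ac.build_automation()
    print ac.matchOne(u'我同意楼上')  # expected: (True, u'同意')
    print ac.matchOne(u'随便说说')    # expected: (False, None)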
--------------------------------------------------------------------------------
/centrality.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import networkx as nx
from operator import itemgetter

def betweenness_centrality(G):
    bc = nx.betweenness_centrality(G, weight='weight')
    bc1 = sorted(bc.iteritems(), key=itemgetter(1), reverse=True)  # sort by score, descending
    nbc = [e[0] for e in bc1]
    return nbc[:10]

def closeness_centrality(G):
    cc = nx.closeness_centrality(G)
    cc1 = sorted(cc.iteritems(), key=itemgetter(1), reverse=True)
    ncc = [e[0] for e in cc1]
    return ncc[:10]

def degree_centrality(G):
    dc = nx.in_degree_centrality(G)  # in-degree: how many users respond to each user
    dc1 = sorted(dc.iteritems(), key=itemgetter(1), reverse=True)
    ndc = [e[0] for e in dc1]
    return ndc[:10]

def PageRank(G):
    # the results barely change with or without edge weights
    pr = nx.pagerank(G, alpha=0.9, weight='weight', max_iter=10000)
    pr1 = sorted(pr.iteritems(), key=itemgetter(1), reverse=True)
    npr = [e[0] for e in pr1]
    return npr[:10]

--------------------------------------------------------------------------------
/createNetwork.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import networkx as nx
from Response_words import *

def network_wuquan():  # per-topic unweighted network
    G = nx.DiGraph()
    listdata = WuQuan()
    for data in listdata:
        G.add_edges_from(data)
    return G

def network_jiaquan():  # per-topic weighted network
    G = nx.DiGraph()
    listdata = JiaQuan()
    for data in listdata:
        G.add_weighted_edges_from(data)
    return G

--------------------------------------------------------------------------------
/drawNetwork.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import networkx as nx

def draw_unweightedG(G):  # unweighted graph
    pos = nx.spring_layout(G)  # spring layout
    nx.draw(G, pos, node_size=100, font_size=10, font_family='sans-serif', with_labels=True)
    plt.show()

def draw_weightedG(G):  # weighted graph: edge width reflects edge weight
    pos = nx.spring_layout(G)  # spring layout
    edgewidth = []
    for n, nbrs in G.adjacency_iter():  # networkx 1.x API
        for nbr, eattr in nbrs.items():
            edgewidth.append(eattr['weight'])
    nx.draw(G, pos, width=edgewidth, font_size=10, font_family='sans-serif', with_labels=True)
    plt.show()

--------------------------------------------------------------------------------
/getspeaker.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Extract the list of speakers.
"""
import re
from operator import itemgetter

def re_data(strline):  # extract the speaker with a regular expression
    reg = re.compile('^(?P<date>[^ ]*) (?P<time>[^ ]*) (?P<speaker>[^ ]*)')
    regMatch = reg.match(strline)
    dict0 = regMatch.groupdict()
    speaker = dict0['speaker']
    return speaker

# extract the content inside parentheses
def contentinbracket(c):
    if c.endswith(')'):
        m = re.search(r'\((.*)\)', c)
        if m:
            return m.group(1)
    return 'null'

def important_speakers(f3):  # extract the sponsors of the important topics
    info_speakers = []
    info_speakers0 = []
    with open(f3) as f:
        for each_line in f:
            if each_line.startswith('---'):
                # keep the topic only if its separator is not marked '不' (unimportant)
                if '不' not in each_line and info_speakers0 != []:
                    info_speakers.append(info_speakers0[0])  # the first speaker is the sponsor
                info_speakers0 = []
            elif each_line.startswith('201'):
                (part1, part2, part3) = each_line.strip('\n').split(' ', 2)
                part3 = contentinbracket(part3)
                each_line = part1 + ' ' + part2 + ' ' + part3
                info_speakers0.append(re_data(each_line))
    return info_speakers[1:]
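# A minimal usage sketch (assumption: the exported QQ log '2011tx3.txt' used by
# op_main.py is present; message lines start with a timestamp and topic blocks
# are separated by dashed lines):
if __name__ == '__main__':
    sponsors = important_speakers('2011tx3.txt')
    print sponsors[:10]  # QQ numbers of the sponsors of the important topics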
--------------------------------------------------------------------------------
/op_main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from operator import itemgetter
from getspeaker import *
from createNetwork import *
from centrality import *
from drawNetwork import *


def zqx(l1, l2):
    # accuracy: fraction of the elements of l1 that also appear in l2
    return float(len([e for e in l1 if e in l2])) / len(l1)

item = '2011tx3.txt'
sponser = important_speakers(item)
# predicted ranking: sponsors ordered by how many important topics they started
yucedict = {e: sponser.count(e) for e in set(sponser)}
paixudict = sorted(yucedict.iteritems(), key=itemgetter(1), reverse=True)
yucepaixu = [e[0] for e in paixudict][:10]   # top 10 predicted leaders
yucepaixu1 = [e[0] for e in paixudict][:5]   # top 5 predicted leaders
print 'yucepaixu'
print yucepaixu

# directed networks
G0 = network_wuquan()   # unweighted
G1 = network_jiaquan()  # weighted

c0_10 = degree_centrality(G0)
c1_10 = degree_centrality(G1)
print 'degree_centrality'
print c0_10, zqx(c0_10[:10], yucepaixu[:10]), zqx(c0_10, yucepaixu)
print c0_10, zqx(c0_10[:5], yucepaixu1[:5]), zqx(c0_10, yucepaixu)
print c1_10, zqx(c1_10[:10], yucepaixu[:10]), zqx(c1_10, yucepaixu)
print c1_10, zqx(c1_10[:5], yucepaixu1[:5]), zqx(c1_10, yucepaixu)

c0_10 = betweenness_centrality(G0)
c1_10 = betweenness_centrality(G1)
print 'betweenness_centrality'
print c0_10, zqx(c0_10[:10], yucepaixu[:10]), zqx(c0_10, yucepaixu)
print c0_10, zqx(c0_10[:5], yucepaixu1[:5]), zqx(c0_10, yucepaixu)
print c1_10, zqx(c1_10[:10], yucepaixu[:10]), zqx(c1_10, yucepaixu)
print c1_10, zqx(c1_10[:5], yucepaixu1[:5]), zqx(c1_10, yucepaixu)

c0_10 = closeness_centrality(G0)
c1_10 = closeness_centrality(G1)
print 'closeness_centrality'
print c0_10, zqx(c0_10[:10], yucepaixu[:10]), zqx(c0_10, yucepaixu)
print c0_10, zqx(c0_10[:5], yucepaixu1[:5]), zqx(c0_10, yucepaixu)
print c1_10, zqx(c1_10[:10], yucepaixu[:10]), zqx(c1_10, yucepaixu)
print c1_10, zqx(c1_10[:5], yucepaixu1[:5]), zqx(c1_10, yucepaixu)

P0_10 = PageRank(G0)
P1_10 = PageRank(G1)
print 'PageRank'
print P0_10, zqx(P0_10[:10], yucepaixu[:10]), zqx(P0_10, yucepaixu)
print P0_10, zqx(P0_10[:5], yucepaixu1[:5]), zqx(P0_10, yucepaixu)
print P1_10, zqx(P1_10[:10], yucepaixu[:10]), zqx(P1_10, yucepaixu)
print P1_10, zqx(P1_10[:5], yucepaixu1[:5]), zqx(P1_10, yucepaixu)
--------------------------------------------------------------------------------