├── README.md
├── Response_words.py
├── ac_ahocorasick.py
├── centrality.py
├── createNetwork.py
├── drawNetwork.py
├── getspeaker.py
└── op_main.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Opinion-leaders-mining

An opinion leader is an important node in social networks and social media, and a key factor in information diffusion. It is difficult to identify opinion leaders among the large number of members who participate in different topics in a QQ group. This project proposes a new approach, based on the response relationships within a QQ group, to mine opinion leaders. First, we build a response-word library; next, we build the user-response relationship network with the Aho-Corasick string-matching algorithm; finally, we identify the opinion leaders by analyzing the node statistics of users in the resulting social network. Experimental results show that this method mines the opinion leaders of QQ group data with high accuracy, and that merging the node-importance features of the user-interaction network with the QQ group information achieves an even better opinion-leader mining effect.
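## Usage

A minimal end-to-end sketch of how the modules fit together (assuming Python 2 with `networkx`, `jieba`, and `matplotlib` installed, and an exported QQ chat log named `2011tx3.txt`, the file name hard-coded in the scripts, next to the code):

```python
from getspeaker import important_speakers
from createNetwork import network_wuquan, network_jiaquan
from centrality import degree_centrality, PageRank

# Predicted opinion leaders: the sponsors of the important topics.
sponsors = important_speakers('2011tx3.txt')

# Unweighted and weighted directed user-response networks.
G0 = network_wuquan()
G1 = network_jiaquan()

# Rank users by node importance in the weighted network.
print degree_centrality(G1)
print PageRank(G1)
```

`op_main.py` runs essentially these steps and reports how well each centrality ranking matches the list of topic sponsors.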
--------------------------------------------------------------------------------
/Response_words.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from ac_ahocorasick import *
import jieba
import jieba.analyse
import re

def contentinbracket(c):  # extract the QQ number inside parentheses
    if c.endswith(')'):
        m = re.search(r'\((.*)\)', c)
        if m:
            return m.group(1)
    return 'null'

def MatchResult(textFile):  # text preprocessing
    key_name1 = []   # per-topic speaker lists
    key_name2 = []   # per-topic {speaker: keyword} dicts
    key_name = []
    value_line = []
    dic = {}
    ac = UnicodeAcAutomation()  # initialize the matcher
    try:
        with open(textFile, 'r') as message:
            for line in message:
                # topic separator
                if line.startswith('-----'):
                    key_name1.append(key_name)
                    key_name2.append(dic)
                    key_name = []
                    value_line = []
                    continue
                # extract the speaker (message lines start with a timestamp)
                if line.startswith('201'):
                    time = re.findall(r'\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2}:\d{2}', line)
                    dele_time = line.replace(time[0], '').strip('\n')
                    dele_time = contentinbracket(dele_time)
                    key_name.append(dele_time)
                # extract the content belonging to that speaker
                elif not line.startswith('\n'):
                    jieba.analyse.set_idf_path('idf.txt.big')
                    hh = jieba.analyse.extract_tags(line, topK=1)
                    rr = ' '.join(hh).encode('utf-8').decode('utf-8')
                    value_line.append(rr)
                    dic = dict(zip(key_name, value_line))
    except EnvironmentError:
        print 'oops'

    # match response words
    for dic in key_name2:
        for key, val in dic.items():
            ac.insert(val)
        ac.build_automation()
        with open('Responsice_words.txt', 'r') as word:
            for i in word:
                keys = jieba.analyse.extract_tags(i, topK=1)
                keywords = ' '.join(keys).encode('utf-8').decode('utf-8')
                tuple_words = ac.matchOne(keywords)
                if tuple_words[1] != None:
                    dic[key] = tuple_words[1]  # store the matched word
    return key_name1

# shape the data into per-topic lists of (responder, sponsor, weight) tuples
def JiaQuan():
    list1 = []
    list3 = []
    data = MatchResult('2011tx3.txt')
    for dic in data:
        list1.append(dic)
    for li in list1:
        list2 = []
        for i in range(len(li)):
            if i <= 1:
                # the sponsor and the first responder get a heavier weight
                list2.append((li[i], li[0], 3.0))
            else:
                list2.append((li[i], li[0], 1.0))
        list3.append(list2)
    # skip the empty segment produced before the first topic separator
    listdata = list3[1:]
    return listdata

# same shaping, without edge weights
def WuQuan():
    list1 = []
    list3 = []
    data = MatchResult('2011tx3.txt')
    for dic in data:
        list1.append(dic)
    for li in list1:
        list2 = [(li[i], li[0]) for i in range(len(li))]
        list3.append(list2)
    return list3[1:]

--------------------------------------------------------------------------------
/ac_ahocorasick.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-

KIND = 16  # alphabet size: input is re-encoded as 4-bit nibbles, so 16 symbols

class Node():
    static = 0  # global node counter

    def __init__(self):
        self.fail = None            # failure link
        self.next = [None] * KIND   # children, indexed by symbol
        self.end = False            # True if a pattern ends at this node
        self.word = None            # the pattern that ends here
        Node.static += 1

class AcAutomation():
    def __init__(self):
        self.root = Node()
        self.queue = []

    def getIndex(self, char):
        return ord(char)  # integer code of the symbol

    def insert(self, string):
        p = self.root
        for char in string:
            index = self.getIndex(char)
            if p.next[index] == None:
                p.next[index] = Node()
            p = p.next[index]
        p.end = True
        p.word = string

    def build_automation(self):
        # breadth-first traversal of the trie to set the failure links
        self.root.fail = None
        self.queue.append(self.root)
        while len(self.queue) != 0:
            parent = self.queue[0]
            self.queue.pop(0)
            for i, child in enumerate(parent.next):
                if child == None:
                    continue
                if parent == self.root:
                    child.fail = self.root
                else:
                    failp = parent.fail
                    while failp != None:
                        if failp.next[i] != None:
                            child.fail = failp.next[i]
                            break
                        failp = failp.fail
                    if failp == None:
                        child.fail = self.root
                self.queue.append(child)

    def matchOne(self, string):
        # return (True, pattern) for the first pattern found in string, else (False, None)
        p = self.root
        for char in string:
            index = self.getIndex(char)
            while p.next[index] == None and p != self.root:
                p = p.fail
            if p.next[index] == None:
                p = self.root
            else:
                p = p.next[index]
            if p.end:
                return True, p.word
        return False, None


class UnicodeAcAutomation():
    def __init__(self, encoding='utf-8'):
        self.ac = AcAutomation()
        self.encoding = encoding

    def getAcString(self, string):
        # split each encoded byte into two 4-bit symbols (low nibble first)
        string = bytearray(string.encode(self.encoding))
        ac_string = ''
        for byte in string:
            ac_string += chr(byte % 16)
            ac_string += chr(byte / 16)
        return ac_string

    def insert(self, string):
        if type(string) != unicode:
            raise Exception('UnicodeAcAutomation:: insert type not unicode')
        ac_string = self.getAcString(string)
        self.ac.insert(ac_string)

    def build_automation(self):
        self.ac.build_automation()

    def matchOne(self, string):
        if type(string) != unicode:
            raise Exception('UnicodeAcAutomation:: matchOne type not unicode')
        ac_string = self.getAcString(string)
        retcode, ret = self.ac.matchOne(ac_string)
        if ret != None:
            # reassemble the nibble pairs into bytes and decode
            s = ''
            for i in range(len(ret) / 2):
                s += chr(ord(ret[2 * i]) + ord(ret[2 * i + 1]) * 16)
            ret = s.decode(self.encoding)
        return retcode, ret
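# A minimal self-test sketch of the nibble-encoded automaton (assumed usage;
# the pattern words below are hypothetical examples, not taken from the
# project's response-word list):
if __name__ == '__main__':
    ac = UnicodeAcAutomation()
    for w in [u'谢谢', u'同意', u'支持']:
        ac.insert(w)
    ac.build_automation()
    print ac.matchOne(u'我同意楼上')  # expected: (True, u'同意')
    print ac.matchOne(u'随便说说')    # expected: (False, None)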
--------------------------------------------------------------------------------
/centrality.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import networkx as nx
from operator import itemgetter

def betweenness_centrality(G):
    bc = nx.betweenness_centrality(G, weight='weight')
    bc1 = sorted(bc.iteritems(), key=itemgetter(1), reverse=True)  # sort by score, descending
    nbc = [e[0] for e in bc1]
    return nbc[:10]

def closeness_centrality(G):
    cc = nx.closeness_centrality(G)
    cc1 = sorted(cc.iteritems(), key=itemgetter(1), reverse=True)
    ncc = [e[0] for e in cc1]
    return ncc[:10]

def degree_centrality(G):
    dc = nx.in_degree_centrality(G)  # in-degree: how many users respond to each user
    dc1 = sorted(dc.iteritems(), key=itemgetter(1), reverse=True)
    ndc = [e[0] for e in dc1]
    return ndc[:10]

def PageRank(G):
    # the results barely change with or without edge weights
    pr = nx.pagerank(G, alpha=0.9, weight='weight', max_iter=10000)
    pr1 = sorted(pr.iteritems(), key=itemgetter(1), reverse=True)
    npr = [e[0] for e in pr1]
    return npr[:10]

--------------------------------------------------------------------------------
/createNetwork.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import networkx as nx
from Response_words import *

def network_wuquan():  # per-topic unweighted network
    G = nx.DiGraph()
    listdata = WuQuan()
    for data in listdata:
        G.add_edges_from(data)
    return G

def network_jiaquan():  # per-topic weighted network
    G = nx.DiGraph()
    listdata = JiaQuan()
    for data in listdata:
        G.add_weighted_edges_from(data)
    return G

--------------------------------------------------------------------------------
/drawNetwork.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import networkx as nx

def draw_unweightedG(G):  # unweighted graph
    pos = nx.spring_layout(G)  # spring layout
    nx.draw(G, pos, node_size=100, font_size=10, font_family='sans-serif', with_labels=True)
    plt.show()

def draw_weightedG(G):  # weighted graph: edge width reflects edge weight
    pos = nx.spring_layout(G)  # spring layout
    edgewidth = []
    for n, nbrs in G.adjacency_iter():  # networkx 1.x API
        for nbr, eattr in nbrs.items():
            edgewidth.append(eattr['weight'])
    nx.draw(G, pos, width=edgewidth, font_size=10, font_family='sans-serif', with_labels=True)
    plt.show()

--------------------------------------------------------------------------------
/getspeaker.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Extract the list of speakers.
"""
import re
from operator import itemgetter

def re_data(strline):  # extract the speaker with a regular expression
    reg = re.compile('^(?P<date>[^ ]*) (?P<time>[^ ]*) (?P<speaker>[^ ]*)')
    regMatch = reg.match(strline)
    dict0 = regMatch.groupdict()
    speaker = dict0['speaker']
    return speaker

# extract the content inside parentheses
def contentinbracket(c):
    if c.endswith(')'):
        m = re.search(r'\((.*)\)', c)
        if m:
            return m.group(1)
    return 'null'

def important_speakers(f3):  # extract the sponsors of the important topics
    info_speakers = []
    info_speakers0 = []
    with open(f3) as f:
        for each_line in f:
            if each_line.startswith('---'):
                # keep the topic only if its separator is not marked '不' (unimportant)
                if '不' not in each_line and info_speakers0 != []:
                    info_speakers.append(info_speakers0[0])  # the first speaker is the sponsor
                info_speakers0 = []
            elif each_line.startswith('201'):
                (part1, part2, part3) = each_line.strip('\n').split(' ', 2)
                part3 = contentinbracket(part3)
                each_line = part1 + ' ' + part2 + ' ' + part3
                info_speakers0.append(re_data(each_line))
    return info_speakers[1:]
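# A minimal usage sketch (assumption: the exported QQ log '2011tx3.txt' used by
# op_main.py is present; message lines start with a timestamp and topic blocks
# are separated by dashed lines):
if __name__ == '__main__':
    sponsors = important_speakers('2011tx3.txt')
    print sponsors[:10]  # QQ numbers of the sponsors of the important topics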
--------------------------------------------------------------------------------
/op_main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from operator import itemgetter
from getspeaker import *
from createNetwork import *
from centrality import *
from drawNetwork import *


def zqx(l1, l2):
    # accuracy: fraction of the elements of l1 that also appear in l2
    return float(len([e for e in l1 if e in l2])) / len(l1)

item = '2011tx3.txt'
sponser = important_speakers(item)
# predicted ranking: sponsors ordered by how many important topics they started
yucedict = {e: sponser.count(e) for e in set(sponser)}
paixudict = sorted(yucedict.iteritems(), key=itemgetter(1), reverse=True)
yucepaixu = [e[0] for e in paixudict][:10]   # top 10 predicted leaders
yucepaixu1 = [e[0] for e in paixudict][:5]   # top 5 predicted leaders
print 'yucepaixu'
print yucepaixu

# directed networks
G0 = network_wuquan()   # unweighted
G1 = network_jiaquan()  # weighted

c0_10 = degree_centrality(G0)
c1_10 = degree_centrality(G1)
print 'degree_centrality'
print c0_10, zqx(c0_10[:10], yucepaixu[:10]), zqx(c0_10, yucepaixu)
print c0_10, zqx(c0_10[:5], yucepaixu1[:5]), zqx(c0_10, yucepaixu)
print c1_10, zqx(c1_10[:10], yucepaixu[:10]), zqx(c1_10, yucepaixu)
print c1_10, zqx(c1_10[:5], yucepaixu1[:5]), zqx(c1_10, yucepaixu)

c0_10 = betweenness_centrality(G0)
c1_10 = betweenness_centrality(G1)
print 'betweenness_centrality'
print c0_10, zqx(c0_10[:10], yucepaixu[:10]), zqx(c0_10, yucepaixu)
print c0_10, zqx(c0_10[:5], yucepaixu1[:5]), zqx(c0_10, yucepaixu)
print c1_10, zqx(c1_10[:10], yucepaixu[:10]), zqx(c1_10, yucepaixu)
print c1_10, zqx(c1_10[:5], yucepaixu1[:5]), zqx(c1_10, yucepaixu)

c0_10 = closeness_centrality(G0)
c1_10 = closeness_centrality(G1)
print 'closeness_centrality'
print c0_10, zqx(c0_10[:10], yucepaixu[:10]), zqx(c0_10, yucepaixu)
print c0_10, zqx(c0_10[:5], yucepaixu1[:5]), zqx(c0_10, yucepaixu)
print c1_10, zqx(c1_10[:10], yucepaixu[:10]), zqx(c1_10, yucepaixu)
print c1_10, zqx(c1_10[:5], yucepaixu1[:5]), zqx(c1_10, yucepaixu)

P0_10 = PageRank(G0)
P1_10 = PageRank(G1)
print 'PageRank'
print P0_10, zqx(P0_10[:10], yucepaixu[:10]), zqx(P0_10, yucepaixu)
print P0_10, zqx(P0_10[:5], yucepaixu1[:5]), zqx(P0_10, yucepaixu)
print P1_10, zqx(P1_10[:10], yucepaixu[:10]), zqx(P1_10, yucepaixu)
print P1_10, zqx(P1_10[:5], yucepaixu1[:5]), zqx(P1_10, yucepaixu)
--------------------------------------------------------------------------------