├── mapper.py
├── reducer.py
├── encode.py
├── README.md
├── view_data.py
├── mapper2.py
├── reducer2.py
├── getAuthors.py
├── viewRelation.py
└── final.py
/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

def read_input(file):
    # Each input line holds the comma-separated author IDs of one paper.
    for line in file:
        yield line.split(',')

def main():
    data = read_input(sys.stdin)
    for names in data:
        for name in names:
            # Skip non-numeric tokens (e.g. the trailing newline);
            # emit <author ID, 1> for each author occurrence.
            if name.isdigit():
                print '%s\t%d' % (name, 1)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/reducer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from itertools import groupby
from operator import itemgetter
import sys

def readMapOutput(file):
    for line in file:
        yield line.strip().split('\t')

def main():
    # Hadoop streaming sorts map output by key, so identical author IDs
    # arrive consecutively and can be summed with groupby.
    data = readMapOutput(sys.stdin)
    for currentName, group in groupby(data, itemgetter(0)):
        try:
            total_count = sum(int(count) for currentName, count in group)
            print "%s\t%d" % (currentName, total_count)
        except ValueError:
            pass  # skip malformed lines whose count is not an integer

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/encode.py:
--------------------------------------------------------------------------------
import codecs

source = codecs.open('authors.txt', 'r', 'utf-8')
result = codecs.open('authors_encoded.txt', 'w', 'utf-8')
index = codecs.open('authors_index.txt', 'w', 'utf-8')
index_dic = {}  # name -> [integer ID, occurrence count]
name_id = 0

for line in source:
    name_list = line.split(',')
    for name in name_list:
        if not (name == '\r\n'):  # skip the empty token after the last comma
            if name in index_dic:
                index_dic[name][1] += 1
            else:
                # First occurrence: assign the next ID and record the name,
                # so a name's 0-based line number in authors_index.txt is its ID.
                index_dic[name] = [name_id, 1]
                index.write(name + u'\r\n')
                name_id += 1
            result.write(str(index_dic[name][0]) + u',')
    result.write('\r\n')

source.close()
result.close()
index.close()
#print sorted(index_dic.iteritems(), key=lambda a: a[1][1])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
DBLP-Coauthor-Mining (mining coauthor relationships from the DBLP data set)
=============
Details
-------------
For a detailed write-up, see [Data Mining in Practice: Coauthor Mining in DBLP (Python + Hadoop)](http://www.tianjun.ml/essays/20)

File descriptions
-------------
### getAuthors.py
Download the DBLP data set ``dblp.xml`` from [http://dblp.uni-trier.de/xml/](http://dblp.uni-trier.de/xml/) into this directory, then run ``getAuthors.py`` to produce ``authors.txt``.
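Each line of ``authors.txt`` then holds the comma-separated authors of one publication, ending with a trailing comma that the later scripts strip, e.g. a (made-up) line like ``Alice A. Author,Bob B. Author,``. Note that ``dblp.xml`` declares its character entities in ``dblp.dtd``, so that file from the same page should sit alongside it or SAX parsing may fail on undefined entities.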
### encode.py
Running this script encodes the ``authors.txt`` file from the previous step (authors are numbered with consecutive integers, starting from 0, in order of first appearance), producing the encoded file ``authors_encoded.txt`` and the name-to-ID file ``authors_index.txt``; a name's ID is its line number minus 1.
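As a quick sanity check of the mapping, the index can be read back so that position ``i`` holds the name with ID ``i`` (a minimal sketch, assuming the file names produced above):

```python
import codecs

# Rebuild ID -> name: a name's ID is its 0-based line number
# in authors_index.txt (as written by encode.py).
with codecs.open('authors_index.txt', 'r', 'utf-8') as f:
    id_to_name = [line.strip() for line in f]

print id_to_name[0]  # the first author encountered in authors.txt
```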
### view_data.py
Reads ``authors_encoded.txt``, counts how many authors fall at each support level, and plots the curve to help pick a rough support threshold.
### final.py
A single-machine FP-growth run, largely adapted from the example in "Machine Learning in Action"; the results are written to the ``result*.txt`` files. Note that the final results are additionally filtered by confidence.
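Concretely, for a frequent pair (A, B) with joint support ``Sup(A,B)``, the script computes ``Con(A) = Sup(A,B)/Sup(A)`` and ``Con(B) = Sup(A,B)/Sup(B)`` (and ``Con(C)`` likewise for triples), then drops any itemset with ``MinCon < 0.4``, ``MaxCon < 0.5``, or ``(MinCon + MaxCon)/2 < 0.5``.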
### mapper.py & reducer.py
Map and Reduce scripts for the first MapReduce round, which is essentially a word count over the author IDs.
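Before submitting to the cluster, the round can be smoke-tested locally by simulating the shuffle with ``sort``, e.g. ``cat authors_encoded.txt | python mapper.py | sort -k1,1 | python reducer.py > sortedList``; the resulting ``sortedList`` file is what ``mapper2.py`` expects to find in its working directory.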
### mapper2.py & reducer2.py
Map and Reduce scripts for the second MapReduce round. Note that the output here is not the complete mining result but the conditional pattern sets; converting them into final frequent itemsets is left as follow-up work. (The purpose of this experiment was only to verify that FP-growth can be implemented in a distributed fashion, so the complete result is not produced.)
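For example, if a paper's surviving author IDs sort by ascending support to ``['12', '7', '3']`` (made-up IDs), ``mapper2.py`` emits the lines ``12\t['7', '3']`` and ``7\t['3']``; ``reducer2.py`` then counts, per leading ID, how often each ID occurs in the suffix lists and keeps those that reach the support threshold.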
### viewRelation.py
Adds visualization of an author and his/her coauthors, using the networkx package.
--------------------------------------------------------------------------------
/view_data.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from matplotlib.font_manager import FontProperties
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
import codecs
import matplotlib.pyplot as plt

data = codecs.open('authors_encoded.txt', 'r', 'utf-8')
word_counts = {}
maxCounts = 0
for line in data:
    line = line.split(',')
    for word in line[0:-1]:  # drop the empty token after the trailing comma
        word_counts[word] = word_counts.get(word, 0) + 1
        if word_counts[word] > maxCounts:  # track the most prolific author
            maxCounts = word_counts[word]
            maxKey = word
data.close()

# bins[v] = number of authors that occur exactly v times (support v)
bins = {}
for k, v in word_counts.iteritems():
    bins[v] = bins.get(v, 0) + 1

# Plot the author count for supports 40..199
y = []
for i in range(40, 200):
    y.append(bins.get(i, 0))
plt.plot(y, '-')
plt.grid()
plt.yticks(range(0, 1000, 100))
plt.xticks(range(0, 160, 20), range(40, 200, 20))
plt.xlabel(u'support', fontproperties=font)
plt.ylabel(u'number of authors at that support', fontproperties=font)
plt.title(u'number of authors versus support', fontproperties=font)
plt.show()
--------------------------------------------------------------------------------
/mapper2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

def createDic():
    # Load the <author ID, count> pairs produced by the first MapReduce round.
    freqDic = {}
    with open('sortedList', 'r') as sortedList:
        for line in sortedList:
            line = line.strip().split('\t')
            freqDic[int(line[0])] = int(line[1])
    return freqDic

def read_input(inFile):
    for line in inFile:
        yield line.split(',')

def main(freqDic, minSup):
    data = read_input(sys.stdin)
    for names in data:
        # Keep only authors whose global support reaches minSup.
        names = {name: freqDic[int(name)] for name in names
                 if name.isdigit()
                 and freqDic.get(int(name), 0) >= minSup}
        length = len(names)
        if length >= 2:
            # Sort ascending by support; for each item, emit the
            # more-frequent items after it as its conditional pattern base.
            conPatItems = [name for name, value in
                           sorted(names.iteritems(), key=lambda p: p[1])]
            for i in range(length - 1):
                print "%s\t%s" % (conPatItems[i], conPatItems[i + 1:])
        else:
            continue

if __name__ == '__main__':
    support = 100
    dic = createDic()
    main(dic, support)
--------------------------------------------------------------------------------
/reducer2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from itertools import groupby
from operator import itemgetter
import sys

def readMapOutput(file):
    for line in file:
        yield line.strip().split('\t')

def main(minSup):
    data = readMapOutput(sys.stdin)
    for currentName, group in groupby(data, itemgetter(0)):
        localDic = {}
        try:
            for currentName, conPatItems in group:
                # The mapper prints a Python list literal; strip the
                # brackets and quotes to recover the plain IDs.
                conPatItems = conPatItems.strip().strip('[').strip(']')
                itemList = conPatItems.split(',')
                for item in itemList:
                    item = item.strip().strip("'")
                    item = int(item)
                    localDic[item] = localDic.get(item, 0) + 1
            # Keep only co-occurring authors that reach the support threshold.
            resultDic = {k: v for k, v in localDic.iteritems()
                         if v >= minSup}
            # Here we just print out 2-coauthors
            if len(resultDic) >= 1:
                print "%s\t%s" % (currentName, resultDic.items())
        except ValueError:
            pass  # skip malformed records

if __name__ == "__main__":
    support = 100
    main(support)
--------------------------------------------------------------------------------
/getAuthors.py:
--------------------------------------------------------------------------------
import codecs
from xml.sax import handler, make_parser

# DBLP record types; each one delimits a single publication.
paper_tag = ('article', 'inproceedings', 'proceedings', 'book',
             'incollection', 'phdthesis', 'mastersthesis', 'www')

class mHandler(handler.ContentHandler):
    def __init__(self, result):
        self.result = result
        self.flag = 0  # are we inside an <author> element?

    def startDocument(self):
        print 'Document Start'

    def endDocument(self):
        print 'Document End'

    def startElement(self, name, attrs):
        if name == 'author':
            self.flag = 1

    def endElement(self, name):
        if name == 'author':
            self.result.write(',')
            self.flag = 0
        if name in paper_tag:
            self.result.write('\r\n')  # one publication per output line

    def characters(self, chrs):
        if self.flag:
            self.result.write(chrs)

def parserDblpXml(source, result):
    handler = mHandler(result)
    parser = make_parser()
    parser.setContentHandler(handler)
    parser.parse(source)

if __name__ == '__main__':
    source = codecs.open('dblp.xml', 'r', 'utf-8')
    result = codecs.open('authors.txt', 'w', 'utf-8')
    parserDblpXml(source, result)
    result.close()
    source.close()
--------------------------------------------------------------------------------
/viewRelation.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import codecs
from matplotlib.font_manager import FontProperties
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc")

def createEdge(nodeX):
    # Yield one (coauthor, nodeX) pair per paper they share.
    with codecs.open('authors.txt', 'r', 'utf-8') as f:
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            if nodeX in line and len(line) > 1:
                line.remove(nodeX)
                for author in line:
                    yield (author, nodeX)

def makeFreqDic():
    # Count how many papers each author appears on.
    print "Creating FreqDic..."
    with codecs.open('authors.txt', 'r', 'utf-8') as f:
        freqDic = {}
        for line in f:
            line = line.strip().split(',')
            if line[-1] == '':
                line.remove('')
            for author in line:
                freqDic[author] = freqDic.get(author, 0) + 1
        return freqDic

def main(freqDic, nodeX):
    G = nx.Graph()
    print "Adding edge..."
    for A, B in createEdge(nodeX):
        # Edge weight = number of papers the two authors share.
        edgeDic = G.get_edge_data(A, B, default={'weight': 0})
        G.add_edge(A, B, weight=edgeDic['weight'] + 1)
    nodes = G.nodes()
    nodes.remove(nodeX)
    shells = [[nodeX], nodes]  # nodeX in the center, coauthors around it
    pos = nx.shell_layout(G, shells)
    print "Drawing nodes..."
    nodeSize = [10 * freqDic[n] for n, dic in G.nodes_iter(data=True)]
    nodeColors = np.random.rand(len(nodeSize))
    nx.draw_networkx_nodes(G, pos, node_size=nodeSize, node_color=nodeColors, alpha=0.7)
    print "Drawing edges..."
    edgeWidth = [edata['weight'] for u, v, edata in G.edges(data=True)]
    edgeColor = np.random.rand(G.number_of_edges())
    nx.draw_networkx_edges(G, pos, width=edgeWidth, edge_color=edgeColor, alpha=0.35)
    print "Adding label..."
    # Label only prolific authors to keep the figure readable.
    select_labels = {n: n for n, d in G.nodes_iter(data=True) if freqDic[n] >= 80}
    select_labels[nodeX] = nodeX
    nx.draw_networkx_labels(G, pos, labels=select_labels, font_size=8, alpha=0.3)
    title = str(nodeX) + u"'s coauthor network"
    plt.title(title, size=15, fontproperties=font)
    plt.text(0.5, 0.94, u"# node size ~ total number of papers by the author",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.text(0.5, 0.97, u"# edge width ~ number of papers coauthored by the two authors",
             horizontalalignment='center',
             size=10, color='r', verticalalignment='center',
             transform=plt.gca().transAxes,
             fontproperties=font)
    plt.axis('off')
    fileName = str(nodeX) + ".png"
    plt.savefig(fileName, transparent=True, dpi=500)
    plt.show()

if __name__ == '__main__':
    freqDic = makeFreqDic()
    nodeX = u'Irith Pomeranz'
    main(freqDic, nodeX)
--------------------------------------------------------------------------------
/final.py:
--------------------------------------------------------------------------------
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None  # links nodes that hold the same item
        self.parent = parentNode
        self.children = {}

    def inc(self, numOccur):
        self.count += numOccur

    def disp(self, ind=1):
        print ' ' * ind, self.name, ' ', self.count
        for child in self.children.values():
            child.disp(ind + 1)

def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    freqDic = {}
    # go over dataSet twice
    for trans in dataSet:  # first pass counts frequency of occurrence
        for item in trans:
            freqDic[item] = freqDic.get(item, 0) + dataSet[trans]

    headerTable = {k: v for (k, v) in freqDic.iteritems() if v >= minSup}
    if len(headerTable) == 0:
        return None, None  # no items meet min support
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # reformat headerTable to use node links
    retTree = treeNode('Null Set', 1, None)  # create tree
    for tranSet, count in dataSet.items():  # second pass builds the tree
        localD = {}
        for item in tranSet:  # keep only frequent items, remember their support
            if headerTable.get(item, 0):
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)  # populate tree with ordered freq itemset
    return retTree, headerTable  # return tree and header table

def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:  # existing child: just increment its count
        inTree.children[items[0]].inc(count)
    else:  # add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None:  # update header table node link
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # recurse on the remaining ordered items
        updateTree(items[1:], inTree.children[items[0]], headerTable, count)

def updateHeader(nodeToTest, targetNode):
    # Walk to the end of the node-link chain iteratively;
    # do not use recursion to traverse a linked list.
    while nodeToTest.nodeLink != None:
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode

def ascendTree(leafNode, prefixPath):  # ascend from leaf node to root
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)

def findPrefixPath(basePat, treeNode):  # treeNode comes from header table
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats

def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    # sort header-table items by support count, ascending
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:  # start from the bottom of the header table
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        if len(newFreqSet) > 1:
            freqItemList[frozenset(newFreqSet)] = headerTable[basePat][0]
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        # construct the conditional FP-tree from the conditional pattern base
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead != None:  # mine the conditional FP-tree
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)

def loadSimpDat(inFile):
    # Treat each line (one paper) as a transaction of author IDs.
    dataSet = {}
    for line in inFile:
        line = line.strip().split(',')
        dataLine = [word for word in line if word.isdigit()]
        dataSet[frozenset(dataLine)] = dataSet.get(frozenset(dataLine), 0) + 1
    return dataSet

if __name__ == "__main__":
    minSup = 100
    print "Reading Source File ... Wait..."
    with open('authors_encoded.txt', 'r') as f:
        dataSet = loadSimpDat(f)

    print "Constructing FP-tree ... Wait..."
    myFPtree, myHeaderTab = createTree(dataSet, minSup)

    print "Mining frequent items ... Wait..."
    myFreqList = {}
    mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    print "Totally %d frequent itemsets found !" % len(myFreqList)
    print "Constructing authors_index... Wait..."

    maxCoauthors = 0
    for freqAuthors in myFreqList.keys():
        if len(freqAuthors) > maxCoauthors:
            maxCoauthors = len(freqAuthors)
    print "the max num of coauthors is %d" % (maxCoauthors)

    with open('authors_index.txt', 'r') as authorsIndex:
        i = 0
        authorsDic = {}
        for name in authorsIndex:
            name = name.strip()
            authorsDic[i] = name
            i = i + 1

    print "Writing results into result3.txt and result4.txt... Wait..."

    header = "%25s\t%25s\t%15s\t%10s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\t%6s\n" \
             % ('authorA', 'authorB', 'authorC', 'Sup(A,B,C)', 'Sup(A)', 'Sup(B)', 'Sup(C)',
                'Con(A)', 'Con(B)', 'Con(C)', 'MinCon', 'MaxCon')
    with open('result4.txt', 'w') as result2:
        with open('result3.txt', 'w') as result:
            result.write(header)
            result2.write(header)
            resultList = sorted(myFreqList.items(), key=lambda p: p[1], reverse=True)
            for itemSet, support in resultList:
                itemList = list(itemSet)
                A = itemList[0]
                authorA = authorsDic.get(int(A), '0')
                B = itemList[1]
                authorB = authorsDic.get(int(B), '0')
                SupAB_C = int(support)
                SupA = int(myHeaderTab.get(A, [0])[0])
                SupB = int(myHeaderTab.get(B, [0])[0])
                ConA = float(SupAB_C) / float(SupA)
                ConB = float(SupAB_C) / float(SupB)
                (C, authorC, SupC, ConC) = ('', '', 0.0, 0.0)
                MinCon = MaxCon = 0.0  # default: itemsets larger than 3 fail the filter below
                if len(itemList) == 3:
                    C = itemList[2]
                    authorC = authorsDic.get(int(C), '0')
                    SupC = int(myHeaderTab.get(C, [0])[0])
                    ConC = float(SupAB_C) / float(SupC)
                    MinCon = min([ConA, ConB, ConC])
                    MaxCon = max([ConA, ConB, ConC])
                elif len(itemList) == 2:
                    MinCon = min([ConA, ConB])
                    MaxCon = max([ConA, ConB])

                # confidence filter on the reported itemsets
                if MinCon < 0.4 or MaxCon < 0.5 or (MinCon + MaxCon) / 2 < 0.5:
                    continue
                result.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n"
                             % (authorA, authorB, authorC, SupAB_C,
                                SupA, SupB, SupC, ConA, ConB, ConC, MinCon, MaxCon))
                result2.write("%25s\t%25s\t%15s\t%10.0f\t%6.0f\t%6.0f\t%6.0f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\n"
                              % (A, B, C, SupAB_C, SupA, SupB, SupC,
                                 ConA, ConB, ConC, MinCon, MaxCon))
    print "Finished !"
--------------------------------------------------------------------------------