├── run_mapper_reducer.sh
├── copy_to_remote.py
├── article_clients_runner.py
├── parse_xml.py
├── parselinks.py
├── zmq_wikiarticle_server.py
├── simpleWiki.py
├── reducer.py
├── README
├── mapper.py
└── zmq_wikiarticle_client.py

/run_mapper_reducer.sh:
--------------------------------------------------------------------------------
#!/bin/sh

python -c "from kyotocabinet import *; db=DB(); db.open('tempcabinet.kch',DB.OREADER); db.dump_snapshot('tempcabinet.snapshot'); db.close()"
rm histogram_*
nice python ../mapper.py > /dev/null
nice python ../reducer.py > output5.dot

--------------------------------------------------------------------------------
/copy_to_remote.py:
--------------------------------------------------------------------------------
import subprocess
import sys

def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])

flen = file_len(sys.argv[1])
print "size of ", sys.argv[1], " is ", str(flen)


for i in range(2000, flen, 2000):
    print "copy {0} to {1}".format(i, i + 2000)
    subprocess.call(["/Users/roysh/wikidb/copy_remote.sh", str(i), sys.argv[1]])

--------------------------------------------------------------------------------
/article_clients_runner.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool
import zmq
from zmq_wikiarticle_client import *

def f(num):
    filenm = 'links_{0}.DOT'.format(num)
    print "start links file ", filenm
    # pass the file *name*: article_client derives its output files from it
    # (passing an open file handle here crashes the string concatenation
    # in article_client.__init__)
    client = article_client(filenm, number=num)
    client.start_client()
    return True

NPROCESSES = 6

if __name__ == '__main__':
    #zmq_wikiarticle_client.verbose = True
    context = zmq.Context()

    # Socket to talk to server
    # print "Connecting to article server..."
    # socket = context.socket(zmq.REQ)
    # socket.connect ("tcp://localhost:5555")

    # print "ordering server to restart"
    # socket.send("START_OVER")

    pool = Pool(processes=NPROCESSES) # start worker processes

    pool.map(f, range(NPROCESSES))

--------------------------------------------------------------------------------
/parse_xml.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import sys,re,os
from cStringIO import StringIO

noncharre = re.compile('\W')

if __name__ == "__main__":
    page = StringIO()
    gotid = False
    while 1:
        next = sys.stdin.readline() # read a one-line string
        if not next: # or an empty string at EOF
            break

        page.write(next)

        if next.strip().find('<title>') > -1:
            title = next[next.find('title>')+6:next.find('</title')]
            title = noncharre.sub('-',title)
            title = title[:200]

        if not gotid and next.strip().find('<id>') > -1:
            title = next[next.find('<id>')+4:next.find('</id')] + '_' + title
            gotid = True

        if next.strip().find('</page') > -1:
            filename = title+'.xml'
            if not os.path.exists(filename):
                open(filename,'w').write(page.getvalue())
            page.close()
            page = StringIO()
            gotid = False

--------------------------------------------------------------------------------
/parselinks.py:
--------------------------------------------------------------------------------
from kyotocabinet import *
import sys

# create the database object
db = DB()

# open the database
if not db.open("tempcabinet.kch", DB.OWRITER | DB.OCREATE):
    print >>sys.stderr, "open error: " + str(db.error())
    sys.exit(1)

if not db.clear():
    print >>sys.stderr, "can't clear: " + str(db.error())
    sys.exit(1)

inc = 0

fh = open(sys.argv[1],'r')
for line in fh.xreadlines():
    # if line == None:
    #     break
    if line.find(" -> ") == -1: continue
    line = line.strip()

    try:
        (k,v) = line.split(" -> ")
    except(ValueError):
        continue
    v = v.strip()
    k = k.strip()
    db[k] = v
    # if not db.get(v):
    #     db[v] = 0

    # def incproc(key,value):
    #     return value + 1
    # db.accept(v,incproc) # attempt using a visitor procedure
    # db[v] = int(db[v]) + 1

print >>sys.stderr, "done reading file"

db.copy("backup.kch") # make a backup

# close the database
if not db.close():
    print >>sys.stderr, "close error: " + str(db.error())

--------------------------------------------------------------------------------
/zmq_wikiarticle_server.py:
--------------------------------------------------------------------------------
import zmq
import time
import sys,re,os
from cStringIO import StringIO

def roll_to_start(fh):
    # skip forward to just before the first <page> element
    last_pos = fh.tell()
    line = fh.readline()
    while line.find('<page>') == -1:
        last_pos = fh.tell()
        line = fh.readline()
    fh.seek(last_pos)


print "opening zmq server"

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5555")

noncharre = re.compile('\W')

fh = open('enwiki-latest-pages-articles.xml','r')
roll_to_start(fh)
print "starting pages at ",fh.tell()

print "serving clients"
serving = True
while serving:
    # Wait for next request from client
    message = socket.recv()
    #print "Received request: ", message

    if message.find("START_OVER") > -1:
        print "starting over..."
        roll_to_start(fh)
        socket.send("OK") # a REP socket must answer every request before the next recv()
        continue

    page = StringIO()
    gotid = False
    filerunning = True
    response = ''
    while filerunning:
        #next = sys.stdin.readline() # read a one-line string
        next = fh.readline()
        if not next: # or an empty string at EOF
            filerunning = False
            serving = False
            response = "ALL_DONE"
        else:
            # if not next.find('<page') > -1:
            #     continue # roll forward until the beginning of a page

            page.write(next)

            if next.strip().find('<title>') > -1:
                title = next[next.find('title>')+6:next.find('</title')]
                # title = noncharre.sub('-',title)
                # title = title[:200]
                #
                # if not gotid and next.strip().find('<id>') > -1:
                #     title = next[next.find('<id>')+4:next.find('</id')] + '_' + title
                #     gotid = True

            # done reading one page
            if next.strip().find('</page') > -1:
                response = page.getvalue()
                page.close()
                page = StringIO()
                gotid = False
                filerunning = False

    #print "serving: ",title
    # Send reply back to client
    socket.send(response)

--------------------------------------------------------------------------------
/simpleWiki.py:
--------------------------------------------------------------------------------
from pyparsing import *
import sys,re

debug = False

def getMediaWikiFirstLink(text):
    links = parseMediaWiki(text)

    if len(links) == 0:
        raise ParseException('no links found')

    links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
    firstlink = links[0]
    if firstlink.find('|') > -1:
        firstlink = firstlink[:firstlink.find('|')]

    return firstlink


def getNthLink(text,N):
    links = parseMediaWiki(text)
    if len(links) == 0:
        raise ParseException('no links found')
    links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
    if len(links) < N:
        Nlink = links[-1] # take the last one...
    else:
        Nlink = links[N-1]
    if Nlink.find('|') > -1:
        Nlink = Nlink[:Nlink.find('|')]

    return Nlink

def getNFirstLinks(text,N):
    links = parseMediaWiki(text)
    if len(links) == 0:
        raise ParseException('no links found')
    links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
    if len(links) < N:
        Nlinks = links
    else:
        Nlinks = links[0:N] # the first N links (the old 0:N-1 slice dropped the Nth)
    returnLinks = []
    for ln in Nlinks:
        if ln.find('|') > -1:
            ln = ln[:ln.find('|')]
        returnLinks.append(ln)
    return returnLinks

def parseMediaWiki(text):

    # ############################# Grammar #################################

    textNoStop = Regex('[^\s\{\}\[\]\(\)]+')
    myHtmlComment = QuotedString("<!--",endQuoteChar="-->",multiline=True)
    regularText = (textNoStop ^ Literal("[") ^ Literal("]") )

    regularBrackets = Forward()
    regularBrackets << Combine(Literal("(") + ZeroOrMore(Regex('[^\(\)]+') ^ regularBrackets) + Literal(")"))

    link = Forward()
    link << Combine( Literal("[[").suppress() + ZeroOrMore(Regex('[^\[\]]+') ^ link) + Literal("]]").suppress())

    curlyShit = Forward()
    curlyShit << Combine( Literal("{{") + ZeroOrMore( Regex('[^\{\}]+') ^ curlyShit ) + Literal("}}") , joinString=" ")

    curlyCurlyBar = QuotedString("{|",endQuoteChar="|}",multiline=True)+Optional(QuotedString("}",endQuoteChar="|}",multiline=True))
    strangeCurlyBar = QuotedString("|",endQuoteChar="|}",multiline=True) #+NotAny(Literal("}")) # strangely it may also appear like this...
    curlyBar = curlyCurlyBar ^ strangeCurlyBar

    strangeBeginRemark = Combine(Literal(":") + QuotedString("''") , joinString=" ")

    if debug:
        wikiMarkup = OneOrMore(regularText ^ strangeBeginRemark ^ curlyBar ^ curlyShit ^ myHtmlComment ^ link ^ regularBrackets)
    else:
        wikiMarkup = Optional(OneOrMore(regularText.suppress() ^ strangeBeginRemark.suppress() ^ curlyBar.suppress() ^ curlyShit.suppress() ^ myHtmlComment.suppress() ^ link ^ regularBrackets.suppress()))

    return wikiMarkup.parseString(text)

if __name__=="__main__":
    if len(sys.argv) > 2 and sys.argv[2] == "debug": debug = True
    text = open(sys.argv[1]).read().decode('utf-8').encode('ascii','ignore')
    print "Original\n\n",text
    print "parse\n\n",parseMediaWiki(text),"\n\n"
    print "first link\n\n",getMediaWikiFirstLink(text),"\n\n"

--------------------------------------------------------------------------------
/reducer.py:
--------------------------------------------------------------------------------
import multiprocessing
import pickle
import glob,pprint,sys,re

def combine_recurse(hist1,hist2):
    hist1_k = list(set(hist1.keys()) - set(['w']))

    if 'w' in hist1.keys(): hist1['w'] += hist2['w']

    for v2 in list(set(hist2.keys()) - set(['w'])):
        if v2 in hist1.keys():
            #print "combine ",v2
            # already exists - combine recursively
            #hist1[v2]['w'] = hist1[v2]['w'] + hist2[v2]['w']
            combine_recurse(hist1[v2],hist2[v2])
        else:
            hist1[v2] = hist2[v2] # just add


def reduce(hist_q):
    # if hist_q.empty(): return

    # get two histograms
    (hist1,hist2) = hist_q.get(True)

    return combine(hist1,hist2)

def combine(hist1,hist2):
    print "combining: {0} and {1}".format(hist1,hist2)

    # combine them
    hist1_o = pickle.Unpickler(open(hist1,'r')).load()
    hist2_o = pickle.Unpickler(open(hist2,'r')).load()
    combine_recurse(hist1_o,hist2_o)

    # pickle the result
    filename = '_'+hist1
    pickle.Pickler(open(filename,'w')).dump(hist1_o)

    return filename

def flatten(hist):
    flat = {}
    if hist is None or hist.keys() is None: return {}
    for i in hist.keys():
        if not isinstance(hist[i],dict) or hist[i] is None: continue
        flat[i] = hist[i]
        subflat = flatten(hist[i])
        if subflat is not None and len(subflat)>0: flat.update(subflat)
    return flat # this return was missing; the function silently produced None

def newcombine(hist1,hist2):
    for v in hist2.keys():
        if v == '_w': continue

        if v in hist1:
            hist1[v]['_w'] = hist1[v]['_w'] + hist2[v]['_w']
            # hist1[v].update(hist2[v])
            for k in hist2[v]:
                if k == '_w': continue # already summed above; adding it in the loop too double-counted the weight
                if k in hist1[v]: hist1[v][k] = hist1[v][k] + hist2[v][k]
                else: hist1[v][k] = hist2[v][k]
            if len(hist1[v]) > 80:
                #pprint.PrettyPrinter(2).pprint(hist1[v])
                histogram_temp = sorted(hist1[v].items(),key=lambda x: x[1])
                #pprint.PrettyPrinter(2).pprint(histogram_temp[-25:])
                hist1[v] = dict(histogram_temp[-25:])
                #print "cull internal ",v, ", biggest:",histogram_temp[-25],"now:",len(hist1[v])
        else:
            hist1[v] = hist2[v]

def textify_title(title):
    title = re.sub("\_"," ",title) # underscores
    title = re.sub("\s+"," ",title) # extra spaces
    return " ".join([w.capitalize() for w in title.split()]) # capitalize

if __name__=="__main__":
    # q = multiprocessing.Queue()
    # q.put(('histogram_1979_ml1.pickle','histogram_1990_great_american_bank_classic.pickle'))
    # reduce(q)
    files = glob.glob("histogram_*")
    # newh = combine(files.pop(),files.pop())
    # for filen in glob.glob("histogram_*"):
    #     newh = combine(newh,filen)

    print >>sys.stderr, "load ",files[-1]
    hist = pickle.Unpickler(open(files.pop(),'r')).load()
    for f in files:
        print >>sys.stderr, "combine with ",f
        newcombine(hist,pickle.Unpickler(open(f,'r')).load())
    #pprint.PrettyPrinter(2).pprint(hist)

    print "digraph {"
    for k in hist:
        print "{0} [weight={1},label=\"{2}\"];".format(k,hist[k]['_w'],textify_title(k))
        for v in hist[k]:
            if v == '_w': continue
            print "{1} -> {0} [weight={2}];".format(k,v,hist[k][v])
    print "}"

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
Tools for parsing the Wikipedia database in the MediaWiki format, and (potentially) distributed tools for getting the linkage scheme of the complete database.

Where To Start
--------------
First you should download a dump of the Wikipedia database: http://en.wikipedia.org/wiki/Wikipedia:Database_download

Then, use the distributed tools: zmq_wikiarticle_server.py and zmq_wikiarticle_client.py
They are based on a ZeroMQ pipe that shuffles the raw articles from the database onto worker processes.

The server looks for the "enwiki-latest-pages-articles.xml" file that is contained in the tar.gz you downloaded.
It opens a ZeroMQ server on port 5555 and waits for the clients to request an article.

To start the clients pool, run 'python article_clients_runner.py'
It will open 6 (configurable...) processes of the 'zmq_wikiarticle_client' module.
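For example, a typical session looks like (server in one terminal, clients pool in another):

python zmq_wikiarticle_server.py
python article_clients_runner.py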
Each client requests a page, parses it using pyparsing (more on this here: http://www.morethantechnical.com/2011/06/16/getting-all-the-links-from-a-mediawiki-format-using-pyparsing/) and saves all the first found links in a text file.

The client can be debugged by running 'python zmq_wikiarticle_client.py raw_article.txt'

The workers take some hours to complete parsing the whole DB. This process can be greatly parallelized and distributed, since you can run the workers on a cluster. The work is CPU-intensive.

The parser code lives in simpleWiki.py.

First-Link Paths
----------------
So much for generic parsing; I moved on to getting the first-link path (http://en.wikipedia.org/wiki/Wikipedia_talk:Get_to_Philosophy#A_plea_to_the_authors_of_the_tools_at_ryanelmquist.com_and_xefer.com).

After the workers are done, you will end up with links_*.DOT files.
They are in the DOT language format, which means GraphViz and others can read them, but they can't really be visualized as the linkage is way too dense...
So the next step is to run:

cat links_* > links_all.DOT
python parselinks.py links_all.DOT

This will build a KyotoCabinet (http://fallabs.com/kyotocabinet/pythondoc/) HashDB of all the linkage: tempcabinet.kch.
It should be ~600MB.

The next step is traversing the first-link paths for all articles, and it is done using mapper.py and reducer.py.
As the names may suggest, this is a kind of map-reduce approach to the problem, although as it turned out it doesn't really require a full-blown map-reduce run. It is multi-process, so any multi-core machine with >8GB RAM does a very quick job.

To start the process just run './run_mapper_reducer.sh'
It should create a snapshot of the HashDB that will be loaded into memory by all worker processes. This greatly reduces the I/O load of the processing (to zero, in fact), making it even more nicely parallelizable.

The process takes a few minutes, depending on your setup (make sure to keep the # of workers below your # of CPU cores, since each worker is a full Python process and context switches are expensive), and should end with a single file called outputX.dot.
That file should be easily visualized by any graph software (example: http://fluid.media.mit.edu/people/roy/media/tree_of_knowledge5.png).
I used the excellent Gephi (http://gephi.org/).

The result contains the culled network, with weights appropriately set for edges and nodes, and node labels also set for nice visualization.
Node weight is determined by the number of first-link paths going through it; edge weight is essentially the same.

For example, the outputX.dot may look like:

digraph {
phenomenon [weight=720546,label="Phenomenon"];
baal_teshuva_movement -> phenomenon [weight=2];
feedback -> phenomenon [weight=58];
phenomena -> phenomenon [weight=42];
the_spooklight -> phenomenon [weight=2];
tea_leaf_paradox -> phenomenon [weight=1];
cognitive_capture -> phenomenon [weight=1];
...
}

Acknowledgements
----------------
Thanks to Aaron Zinman and Doug Fritz for their help.

Enjoy!
Roy.
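P.S. mapper.py can also trace the first-link path of a single article: pass the normalized title as an argument, e.g. 'python mapper.py some_article_title' (normalized meaning lowercase with non-word characters replaced by underscores, the way zmq_wikiarticle_client.normalizeLink produces keys; the example title here is a placeholder).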

--------------------------------------------------------------------------------
/mapper.py:
--------------------------------------------------------------------------------
from kyotocabinet import *
import sys
from operator import itemgetter
from pprint import PrettyPrinter
import pickle
from multiprocessing import Pool
import time

pp = PrettyPrinter(indent = 4)

def followOne(k,v,db,histogram,update_hist):
    # find the "terminal" node for this source
    # (not efficient; the path could be reused for other sources)
    visited = [v]
    while db[v] and db[v] not in visited:
        #print "\t ->", db[v]
        visited.append(db[v]) # break the cycle
        v = db[v]
        if v == 'philosophy': break
    #if v not in ['philosophy','data_storage_device','association_football','transmission__telecommunications_','comparison','accounting_software','advocacy_group','recording','bloom','isotorpy']:
    #print k,'\n\t->',
    #print "\n\t->".join(visited)

    if histogram is None:
        print "\n\t->".join(visited)
        return

    for i in range(len(visited)-1):
        v = visited[i]
        v1 = visited[i+1]
        if not v1 in histogram:
            histogram[v1] = {'_w':1,v:1}
        else:
            histogram[v1]['_w'] = histogram[v1]['_w'] + 1
            if not v in histogram[v1]: histogram[v1][v] = 0
            histogram[v1][v] = histogram[v1][v] + 1
    '''
    if v not in histogram:
        if update_hist:
            histogram[v] = {'w':1}
            if len(visited)>1: histogram[v][visited[-2]] = {'w':1} # keep populating histogram
            if len(visited)>2: histogram[v][visited[-2]][visited[-3]] = {'w':1}
            if len(visited)>3: histogram[v][visited[-2]][visited[-3]][visited[-4]] = {'w':1}
            if len(visited)>4: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]] = {'w':1}


    else:
        histogram[v]['w'] = histogram[v]['w'] + 1
        if len(visited)>1:
            if visited[-2] not in histogram[v]: histogram[v][visited[-2]] = {'w':1}
            else: histogram[v][visited[-2]]['w'] = histogram[v][visited[-2]]['w'] + 1
        if len(visited)>2:
            if visited[-3] not in histogram[v][visited[-2]]: histogram[v][visited[-2]][visited[-3]] = {'w':1}
            else: histogram[v][visited[-2]][visited[-3]]['w'] = histogram[v][visited[-2]][visited[-3]]['w'] + 1
        if len(visited)>3:
            if visited[-4] not in histogram[v][visited[-2]][visited[-3]]: histogram[v][visited[-2]][visited[-3]][visited[-4]]= {'w':1}
            else: histogram[v][visited[-2]][visited[-3]][visited[-4]]['w'] = histogram[v][visited[-2]][visited[-3]][visited[-4]]['w'] + 1
        if len(visited)>4:
            if visited[-5] not in histogram[v][visited[-2]][visited[-3]][visited[-4]]: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]] = {'w':1}
            else: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]]['w'] = histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]]['w'] + 1
    '''

p_db = None

def traverse(start_index):
    global p_db

    histogram = {}
    update_hist = True

    if p_db is None:
        p_db = DB()
        p_db.open(":") # ":" opens an on-memory stash DB; was: "tempcabinet.kch",DB.OREADER | DB.ONOLOCK
        print >>sys.stderr, "load db snapshot"
        p_db.load_snapshot('tempcabinet.snapshot')

    start_time = time.time()

    '''
    db = DB(opts=[DB.GCONCURRENT])
    # open the database, reader, no lock
    if not db.open("tempcabinet.kch", DB.OREADER | DB.ONOLOCK):
        print >>sys.stderr, "open error: " + str(db.error())
        sys.exit(0)
    '''
    # traverse records
    #for i in range(1,2):
    print "traverse, jump to",start_index

    cur = p_db.cursor()
    cur.jump(start_index)
    count = 0
    #while True:
    for j in range (1,10000):
        rec = cur.get(True)
        if not rec: break
        (k,v) = rec
        followOne(k,v,p_db,histogram,update_hist)
        if j % 1000 == 0: print j,"hist:",len(histogram)

    # keep only the 55 heaviest nodes of this worker's partial histogram
    histogram_temp = sorted(histogram.items(),key=lambda x: x[1]["_w"])
    histogram = dict(histogram_temp[-55:])

    print >>sys.stderr, "done traverse ",start_index

    #cur.jump()
    #while True:
    #    rec = cur.get(True)
    #    if not rec: break
    #    print rec[0],':',rec[1]
    # pp.pprint(sorted(histogram.items(), key=lambda x: x[1]["w"]))

    histogram_name = 'histogram_'+start_index+'.pickle'
    p = pickle.Pickler(open(histogram_name,'w'))
    p.dump(histogram)

    cur.disable()
    #p_db.close()

    print "traverse took {0} seconds".format(time.time()-start_time)

    return histogram_name

if __name__=="__main__":

    print >>sys.stderr, "preparing key jumps for workers"
    db = DB()
    #db.open("tempcabinet.kch",DB.OREADER | DB.ONOLOCK)
    db.open(":")
    print >>sys.stderr, "load db snapshot"
    db.load_snapshot('tempcabinet.snapshot')

    if len(sys.argv) > 1:
        if not db[sys.argv[1]]:
            print "can't find key ",sys.argv[1]
        else:
            followOne(sys.argv[1],db[sys.argv[1]],db,None,False)
        exit()
    # else:
    cur = db.cursor()
    cur.jump()
    keys = []
    for i in range(0,db.count()):
        rec = cur.get(True)
        if not rec: break
        (k,v) = rec
        if i % 10000 == 0: keys.append(k) # one jump-in key per 10000 records

    cur.disable()
    db.close()

    pool = Pool(processes=6)
    try:
        print "histograms created: ",pool.map(traverse,keys)
    except(KeyboardInterrupt):
        print "Killing all processes..."
        pool.terminate()
        pool.join()

    # db.close()

--------------------------------------------------------------------------------
/zmq_wikiarticle_client.py:
--------------------------------------------------------------------------------
import zmq,re,sys,traceback
from lxml import etree
import simpleWiki

verbose = False

def removeBalanced(article_text,delim_open,delim_close):
    stack = []
    ptr = 0
    nothingDone = False
    while not nothingDone: #article_text.find(delim_open) > -1 or article_text.find(delim_close) > -1 or # not efficient...
        nothingDone = True
        open_pos = article_text.find(delim_open,ptr)
        close_pos = article_text.find(delim_close,ptr)
        if open_pos > -1 and open_pos < close_pos:
            ptr = open_pos + len(delim_open)
            if verbose:
                print "found ",delim_open," at ",open_pos
            if len(stack)>0 and open_pos == stack[-1]: # in case we already found this..
                if verbose:
                    print 'skipping...'
                continue
            stack.append(open_pos)
            nothingDone = False
        elif close_pos > -1:
            if verbose:
                print "found ",delim_close," at ",close_pos
            #ptr = close_pos
            try:
                from_pos = stack.pop()
                to_pos = close_pos+len(delim_close)
                article_text = article_text[:from_pos] + article_text[to_pos:]
                if len(stack) > 0:
                    ptr = stack[-1] + len(delim_open)
                else:
                    ptr = 0
                if verbose:
                    print "delete {0} to {1}, ptr = {2}".format(from_pos,to_pos,ptr)
                nothingDone = False
            except(IndexError):
                break # some error I probably don't want to deal with...
    return article_text

def getFirstLink(article_text):
    firstlink = article_text[article_text.find('[[')+2:article_text.find(']]')]
    if firstlink.find('|') > -1:
        firstlink = firstlink[:firstlink.find('|')]
    return firstlink.strip().lower()

def normalizeLink(link):
    link = re.sub('\#.*$','',link)
    link = re.sub('\W','_',link.strip().lower())
    return link

def writeLink(title,link,fh):
    link = normalizeLink(link)
    title = normalizeLink(title)
    if not link == title and len(link) != 0 and len(title) != 0:
        linkstr = "{0} -> {1}".format(title,link)
        if verbose: print "link: ", linkstr
        fh.write(linkstr+'\n')
        fh.flush()


class article_client:
    def __init__(self,outputfile,number=-1):
        # outputfile is a file *name*; three output files are derived from it,
        # one for each of links no. 3, 4 and 5 of every article
        self.linksFiles = [idx+"_"+outputfile for idx in ["third","fourth","fifth"]]
        self.linksFilesHandles = [open(filename,'w') for filename in self.linksFiles]
        self.linksFile = self.linksFilesHandles[0] # used by parseText() in debug runs
        self.title = 'temp'
        self.number = number

    def start_client(self):
        context = zmq.Context()

        # Socket to talk to server
        print "Connecting to article server..."
        socket = context.socket(zmq.REQ)
        socket.connect ("tcp://localhost:5555")

        # self.linksFile = output_file #'links_'+sys.argv[1]+'.DOT','w')

        while True:
            try:
                if verbose:
                    print "Sending request "
                socket.send ("GIVE_ARTICLE")

                # Get the reply.
                message = socket.recv()
                #print "Received reply [", message, "]"
                if message.find('ALL_DONE') == 0:
                    break

                self.parseResponse(message)


                #break
                #continue
            except(KeyboardInterrupt):
                print "exiting..."
                break

    def parseResponse(self,message):
        root = etree.fromstring(message)

        self.title = root.find("title").text.encode('ascii','replace').strip().lower()
        #print >> sys.stderr, self.title, "[",self.number,"]"

        if verbose:
            print "parsing article ",self.title

        try:
            article_text = root.xpath("/page/revision/text")[0].text.encode('ascii','replace')
        except(AttributeError):
            if verbose:
                print "can't read text!"
            return False

        if re.search('\{\{(disambig.*?|geodis)\}\}',article_text) is not None or self.title.find('(disambiguation)') > -1:
            if verbose:
                print "This is a disambig..."
            #break
            return False


        if verbose:
            print "is this a redirect? ", root.find("redirect") != None
", root.find("redirect") != None 128 | 129 | #if root.find("redirect") != None: 130 | #link = getFirstLink(article_text) 131 | try: 132 | #link = simpleWiki.getMediaWikiFirstLink(article_text) 133 | links = simpleWiki.getNFirstLinks(article_text,5) # get first 5 links 134 | #link = simpleWiki.getNthLink(article_text,2) 135 | for i in range(0,2): # scan the last 3 (link # 3,4,5) 136 | link = links[2+i] 137 | writeLink(self.title,link,self.linksFilesHandles[i]) # write each link to a diff file 138 | except: 139 | exc_type, exc_value, exc_traceback = sys.exc_info() 140 | traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) 141 | 142 | 143 | return True 144 | 145 | 146 | 147 | # return self.parseText(article_text) 148 | 149 | def parseText(self,article_text): 150 | try: 151 | #link = simpleWiki.getMediaWikiFirstLink(article_text) 152 | link = simpleWiki.getNthLink(article_text,2) 153 | writeLink(self.title,link,self.linksFile) 154 | except: 155 | exc_type, exc_value, exc_traceback = sys.exc_info() 156 | traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) 157 | ''' 158 | article_text = removeBalanced(article_text,'{{','}}') 159 | #article_text = removeBalanced(article_text,'(',')') 160 | 161 | # article_text = re.sub(r'\{\{[\s\S]*?\}\}','',article_text) 162 | article_text = re.sub(r'\[\[([Ii]mage|[Ff]ile)[\s\S]*?\]\]\n','',article_text) # remove image links 163 | # article_text = re.sub(r'\([\s\S]*?\)','',article_text) # remove paretheses 164 | article_text = re.sub(r'<\!--[\s\S]*?-->','',article_text) # remove html remarks 165 | article_text = re.sub(r'<!--[\s\S]*?-->','',article_text) # remove html remarks 166 | article_text = re.sub(r'\:\'\'.*?\'\'','',article_text) # remove wiki italics 167 | article_text = re.sub(r'<ref[\s\S]*?</ref>','',article_text) # revmoe refs 168 | article_text = re.sub(r'\(from \[\[[\s\S]*?\)','',article_text) 169 | article_text = re.sub(r'\[\[wikt\:[\s\S]*?\]\]','',article_text) # wikitionary links 170 | 171 | if verbose: 172 | print article_text 173 | 174 | firstlink = getFirstLink(article_text) 175 | writeLink(self.title,firstlink,self.linksFile) 176 | ''' 177 | 178 | return True 179 | 180 | if __name__ == "__main__": 181 | client = article_client(sys.stdout) 182 | # client.start_client('output'+sys.argv[1]+'.DOT') 183 | if len(sys.argv) > 2 and sys.argv[2] == 'verbose': verbose = True 184 | client.parseText(open(sys.argv[1],'r').read()) 185 | --------------------------------------------------------------------------------