├── run_mapper_reducer.sh
├── copy_to_remote.py
├── article_clients_runner.py
├── parse_xml.py
├── parselinks.py
├── zmq_wikiarticle_server.py
├── simpleWiki.py
├── reducer.py
├── README
├── mapper.py
└── zmq_wikiarticle_client.py
/run_mapper_reducer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python -c "from kyotocabinet import *; db=DB(); db.open('tempcabinet.kch',DB.OREADER); db.dump_snapshot('tempcabinet.snapshot'); db.close()"
4 | rm histogram_*
5 | nice python ../mapper.py > /dev/null
6 | nice python ../reducer.py > output5.dot
7 |
--------------------------------------------------------------------------------
/copy_to_remote.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 |
4 | def file_len(fname):
5 | p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
6 | stderr=subprocess.PIPE)
7 | result, err = p.communicate()
8 | if p.returncode != 0:
9 | raise IOError(err)
10 | return int(result.strip().split()[0])
11 |
12 | flen = file_len(sys.argv[1])
13 | print "size of ",sys.argv[1]," is ",str(flen)
14 |
15 |
16 | for i in range(2000,flen,2000):
17 | print "copy {0} to {1}".format(i,i+2000)
18 | subprocess.call(["/Users/roysh/wikidb/copy_remote.sh",str(i),sys.argv[1]])
19 |
--------------------------------------------------------------------------------
/article_clients_runner.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 | from zmq_wikiarticle_client import *
3 |
4 | def f(num):
5 | filenm = 'links_{0}.DOT'.format(num)
6 | print "start links file ",filenm
7 |     client = article_client(filenm,number=num) # article_client expects a filename string and derives its output files from it
8 | client.start_client()
9 | return True
10 |
11 | NPROCESSES = 6
12 |
13 | if __name__ == '__main__':
14 | #zmq_wikiarticle_client.verbose = True
15 | context = zmq.Context()
16 |
17 | # Socket to talk to server
18 | # print "Connecting to article server..."
19 | # socket = context.socket(zmq.REQ)
20 | # socket.connect ("tcp://localhost:5555")
21 |
22 | # print "ordering server to restart"
23 | # socket.send("START_OVER")
24 |
25 | pool = Pool(processes=NPROCESSES) # start worker processes
26 |
27 | pool.map(f, range(NPROCESSES))
28 |
--------------------------------------------------------------------------------
/parse_xml.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys,re,os
4 | from cStringIO import StringIO
5 |
6 | noncharre = re.compile('\W')
7 |
8 | if __name__ == "__main__":
9 | page = StringIO()
10 | gotid = False
11 | while 1:
12 | next = sys.stdin.readline() # read a one-line string
13 | if not next: # or an empty string at EOF
14 | break
15 |
16 | page.write(next)
17 |
18 |         if next.strip().find('<title>') > -1:
19 |             title = next[next.find('title>')+6:next.find('</title>')]
20 |             title = noncharre.sub('_',title).lower()
21 |
22 |         # the page <id> follows the title; keep only the first one per page
23 |         if next.strip().find('<id>') > -1 and not gotid:
24 |             title = next[next.find('<id>')+4:next.find('</id>')]
25 |             gotid = True
26 |
27 |         if next.strip().find('</page>') > -1:
28 | filename = title+'.xml'
29 | if not os.path.exists(filename):
30 | open(filename,'w').write(page.getvalue())
31 | page.close()
32 | page = StringIO()
33 | gotid = False
34 |
35 |
--------------------------------------------------------------------------------
/parselinks.py:
--------------------------------------------------------------------------------
1 | from kyotocabinet import *
2 | import sys
3 |
4 | # create the database object
5 | db = DB()
6 |
7 | # open the database
8 | if not db.open("tempcabinet.kch", DB.OWRITER | DB.OCREATE):
9 | print >>sys.stderr, "open error: " + str(db.error())
10 |     sys.exit(1)
11 |
12 | if not db.clear():
13 |     print >>sys.stderr, "can't clear: "+str(db.error())
14 |     sys.exit(1)
15 |
16 | inc = 0
17 |
18 | fh = open(sys.argv[1],'r')
19 | for line in fh.xreadlines():
20 | # if line == None:
21 | # break
22 | if line.find(" -> ") == -1: continue
23 | line = line.strip()
24 |
25 | try:
26 | (k,v) = line.split(" -> ")
27 | except(ValueError):
28 | continue
29 | v = v.strip()
30 | k = k.strip()
31 | db[k] = v
32 | # if not db.get(v):
33 | # db[v] = 0
34 |
35 | # def incproc(key,value):
36 | # return value + 1
37 | #    db.accept(v,incproc) # attempt using a visitor procedure
38 | # db[v] = int(db[v]) + 1
39 |
40 | print >>sys.stderr, "done reading file"
41 |
42 | db.copy("backup.kch") # make a backup
43 |
44 | # close the database
45 | if not db.close():
46 | print >>sys.stderr, "close error: " + str(db.error())
47 |
48 |
--------------------------------------------------------------------------------
/zmq_wikiarticle_server.py:
--------------------------------------------------------------------------------
1 | import zmq
2 | import time
3 | import sys,re,os
4 | from cStringIO import StringIO
5 |
6 | def roll_to_start(fh):
7 | last_pos = fh.tell()
8 | line = fh.readline()
9 |     while line.find('<page>') == -1:
10 | last_pos = fh.tell()
11 | line = fh.readline()
12 | fh.seek(last_pos)
13 |
14 |
15 | print "opening zmq server"
16 |
17 | context = zmq.Context()
18 | socket = context.socket(zmq.REP)
19 | socket.bind("tcp://*:5555")
20 |
21 | noncharre = re.compile('\W')
22 |
23 | fh = open('enwiki-latest-pages-articles.xml','r')
24 | roll_to_start(fh)
25 | print "starting pages at ",fh.tell()
26 |
27 | print "serving clients"
28 | serving = True
29 | while serving:
30 | # Wait for next request from client
31 | message = socket.recv()
32 | #print "Received request: ", message
33 |
34 |     if message.find("START_OVER") > -1:
35 |         print "starting over..."
36 |         roll_to_start(fh)
37 |         socket.send("OK") # a REP socket must send a reply before it can recv again
38 |         continue
39 | page = StringIO()
40 | gotid = False
41 | filerunning = True
42 | response = ''
43 | while filerunning:
44 | #next = sys.stdin.readline() # read a one-line string
45 | next = fh.readline()
46 | if not next: # or an empty string at EOF
47 | filerunning = False
48 | serving = False
49 | response = "ALL_DONE"
50 | else:
51 |             # if not next.find('<page>') > -1:
52 |             #    continue # roll forward until the beginning of a page
53 |
54 | page.write(next)
55 |
56 |             if next.strip().find('<title>') > -1:
57 |                 title = next[next.find('title>')+6:next.find('</title>')]
58 |                 title = noncharre.sub('_',title).lower()
59 |
60 |             # the page <id> is not needed here
61 |             # if next.strip().find('<id>') > -1:
62 |             #    title = next[next.find('<id>')+4:next.find('</id>')]
63 |             #    gotid = True
64 |
65 |             # a whole <page> element has been buffered - send it to the client
66 |             if next.strip().find('</page>') > -1:
67 |                 response = page.getvalue()
68 |                 page.close()
69 |                 page = StringIO()
70 |                 gotid = False
71 |                 filerunning = False
72 |
73 | #print "serving: ",title
74 | # Send reply back to client
75 | socket.send(response)
76 |
77 |
--------------------------------------------------------------------------------
/simpleWiki.py:
--------------------------------------------------------------------------------
1 | from pyparsing import *
2 | import sys,re
3 |
4 | debug = False
5 |
6 | def getMediaWikiFirstLink(text):
7 | links = parseMediaWiki(text)
8 |
9 | if len(links) == 0:
10 | raise ParseException('no links found')
11 |
12 | links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
13 | firstlink = links[0]
14 | if firstlink.find('|') > -1:
15 | firstlink = firstlink[:firstlink.find('|')]
16 |
17 | return firstlink
18 |
19 |
20 | def getNthLink(text,N):
21 | links = parseMediaWiki(text)
22 | if len(links) == 0:
23 | raise ParseException('no links found')
24 | links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
25 | if len(links) < N:
26 | Nlink = links[-1] #take the last one...
27 | else:
28 | Nlink = links[N-1]
29 | if Nlink.find('|') > -1:
30 | Nlink = Nlink[:Nlink.find('|')]
31 |
32 | return Nlink
33 |
34 | def getNFirstLinks(text,N):
35 | links = parseMediaWiki(text)
36 | if len(links) == 0:
37 | raise ParseException('no links found')
38 | links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
39 | if len(links) < N:
40 | Nlinks = links
41 | else:
42 |         Nlinks = links[0:N]
43 | returnLinks = []
44 | for ln in Nlinks:
45 | if ln.find('|') > -1:
46 | ln = ln[:ln.find('|')]
47 | returnLinks.append(ln)
48 | return returnLinks
49 |
50 | def parseMediaWiki(text):
51 |
52 |     # ############################# Grammar #################################
53 |
54 | textNoStop = Regex('[^\s\{\}\[\]\(\)]+')
55 |     myHtmlComment = QuotedString("<!--",endQuoteChar="-->",multiline=True)
56 | regularText = (textNoStop ^ Literal("[") ^ Literal("]") )
57 |
58 | regularBrackets = Forward()
59 | regularBrackets << Combine(Literal("(") + ZeroOrMore(Regex('[^\(\)]+') ^ regularBrackets) + Literal(")"))
60 |
61 | link = Forward()
62 | link << Combine( Literal("[[").suppress() + ZeroOrMore(Regex('[^\[\]]+') ^ link) + Literal("]]").suppress())
63 |
64 | curlyShit = Forward()
65 | curlyShit << Combine( Literal("{{") + ZeroOrMore( Regex('[^\{\}]+') ^ curlyShit ) + Literal("}}") , joinString=" ")
66 |
67 | curlyCurlyBar = QuotedString("{|",endQuoteChar="|}",multiline=True)+Optional(QuotedString("}",endQuoteChar="|}",multiline=True))
68 | strangeCurlyBar = QuotedString("|",endQuoteChar="|}",multiline=True) #+NotAny(Literal("}")) # strangely it may also appear like this...
69 | curlyBar = curlyCurlyBar ^ strangeCurlyBar
70 |
71 | strangeBeginRemark = Combine(Literal(":") + QuotedString("''") , joinString=" ")
72 |
73 | if debug:
74 | wikiMarkup = OneOrMore(regularText ^ strangeBeginRemark ^ curlyBar ^ curlyShit ^ myHtmlComment ^ link ^ regularBrackets)
75 | else:
76 | wikiMarkup = Optional(OneOrMore(regularText.suppress() ^ strangeBeginRemark.suppress() ^ curlyBar.suppress() ^ curlyShit.suppress() ^ myHtmlComment.suppress() ^ link ^ regularBrackets.suppress()))
77 |
78 | return wikiMarkup.parseString(text)
79 |
80 | if __name__=="__main__":
81 | if len(sys.argv) > 2 and sys.argv[2] == "debug": debug = True
82 | text = open(sys.argv[1]).read().decode('utf-8').encode('ascii','ignore')
83 | print "Original\n\n",text
84 | print "parse\n\n",parseMediaWiki(text),"\n\n"
85 | print "first link\n\n",getMediaWikiFirstLink(text),"\n\n"
86 |
--------------------------------------------------------------------------------
/reducer.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import pickle
3 | import glob,pprint,sys,re
4 |
5 | def combine_recurse(hist1,hist2):
6 | hist1_k = list(set(hist1.keys()) - set(['w']))
7 |
8 | if 'w' in hist1.keys(): hist1['w'] += hist2['w']
9 |
10 | for v2 in list(set(hist2.keys()) - set(['w'])):
11 | if v2 in hist1.keys():
12 | #print "combine ",v2
13 | # already exists - combine recurse
14 | #hist1[v2]['w'] = hist1[v2]['w'] + hist2[v2]['w']
15 | combine_recurse(hist1[v2],hist2[v2])
16 | else:
17 | hist1[v2] = hist2[v2] # just add
18 |
19 |
20 | def reduce(hist_q):
21 | # if hist_q.empty(): return
22 |
23 | # get two histograms
24 | (hist1,hist2) = hist_q.get(True)
25 |
26 | return combine(hist1,hist2)
27 |
28 | def combine(hist1,hist2):
29 | print "combining: {0} and {1}".format(hist1,hist2)
30 |
31 | # combine them
32 | hist1_o = pickle.Unpickler(open(hist1,'r')).load()
33 | hist2_o = pickle.Unpickler(open(hist2,'r')).load()
34 | combine_recurse(hist1_o,hist2_o)
35 |
36 | # pickle the result
37 | filename = '_'+hist1
38 | pickle.Pickler(open(filename,'w')).dump(hist1_o)
39 |
40 | return filename
41 |
42 | def flatten(hist):
43 | flat = {}
44 | if hist is None or hist.keys() is None: return {}
45 | for i in hist.keys():
46 | if not isinstance(hist[i],{}.__class__) or hist[i] is None: continue
47 | flat[i] = hist[i]
48 | subflat = flatten(hist[i])
49 | if subflat is not None and len(subflat)>0: flat.update(subflat)
50 |     return flat
51 | def newcombine(hist1,hist2):
52 | for v in hist2.keys():
53 | if v == '_w': continue
54 |
55 | if v in hist1:
56 | hist1[v]['_w'] = hist1[v]['_w'] + hist2[v]['_w']
57 | # hist1[v].update(hist2[v])
58 | for k in hist2[v]:
59 | if k in hist1[v]: hist1[v][k] = hist1[v][k] + hist2[v][k]
60 | else: hist1[v][k] = hist2[v][k]
61 | if len(hist1[v]) > 80:
62 | #pprint.PrettyPrinter(2).pprint(hist[v])
63 | histogram_temp = sorted(hist1[v].items(),key=lambda x: x[1])
64 | #pprint.PrettyPrinter(2).pprint(histogram_temp[-25:])
65 | hist1[v] = dict(histogram_temp[-25:])
66 | #print "cull internal ",v, ", biggest:",histogram_temp[-25],"now:",len(hist[v])
67 | else:
68 | hist1[v] = hist2[v]
69 |
70 | def textify_title(title):
71 | title = re.sub("\_"," ",title) #underscores
72 | title = re.sub("\s+"," ",title) #extra spaces
73 | return " ".join([w.capitalize() for w in title.split()]) #capitalize
74 |
75 | if __name__=="__main__":
76 | # q = multiprocessing.Queue()
77 | # q.put(('histogram_1979_ml1.pickle','histogram_1990_great_american_bank_classic.pickle'))
78 | # reduce(q)
79 | files = glob.glob("histogram_*")
80 | # newh = combine(files.pop(),files.pop())
81 | # for filen in glob.glob("histogram_*"):
82 | # newh = combine(newh,filen)
83 |
84 | print >>sys.stderr, "load ",files[-1]
85 | hist = pickle.Unpickler(open(files.pop(),'r')).load()
86 | for f in files:
87 | print >>sys.stderr, "combine with ",f
88 | newcombine(hist,pickle.Unpickler(open(f,'r')).load())
89 | #pprint.PrettyPrinter(2).pprint(hist)
90 |
91 | print "digraph {"
92 | for k in hist:
93 | print "{0} [weight={1},label=\"{2}\"];".format(k,hist[k]['_w'],textify_title(k))
94 | for v in hist[k]:
95 | if v == '_w': continue
96 | print "{1} -> {0} [weight={2}];".format(k,v,hist[k][v])
97 | print "}"
98 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Tools for parsing the Wikipedia database in the MediaWiki format, and (potentially) distributed tools for getting the linkage scheme of the complete database.
2 |
3 | Where To Start
4 | --------------
5 | First you should download a dump of the Wikipedia database: http://en.wikipedia.org/wiki/Wikipedia:Database_download
6 |
7 | Then, use the distributed tools: zmq_wikiarticle_server.py and zmq_wikiarticle_client.py
8 | They are based on a ZeroMQ pipe that shuffles the raw articles from the database out to worker processes.
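
The exchange is a plain REQ/REP round trip; roughly, each worker does the following (this mirrors the two scripts):

    socket.send("GIVE_ARTICLE")   # ask for work
    message = socket.recv()       # one raw <page>...</page> block, or "ALL_DONE" once the dump is exhausted

Sending "START_OVER" instead makes the server seek back to the first page of the dump.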
9 |
10 | The server looks for the "enwiki-latest-pages-articles.xml" file that is contained in the archive you downloaded.
11 | It opens a ZMQ server on port 5555 and waits for the clients to request articles.
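
To start the server, run it from the directory that holds the dump (the XML path is relative):

    python zmq_wikiarticle_server.py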
12 |
13 | To start the client pool, run 'python article_clients_runner.py'
14 | It will open NPROCESSES (6 by default, configurable) worker processes of the 'zmq_wikiarticle_client' module.
15 | Each client requests a page, parses it using pyparsing (more on this here: http://www.morethantechnical.com/2011/06/16/getting-all-the-links-from-a-mediawiki-format-using-pyparsing/) and saves the first links it finds in a text file.
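
Each line in the output is a normalized "source -> target" pair (titles lowercased, non-word characters replaced with underscores), e.g.:

    baal_teshuva_movement -> phenomenon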
16 |
17 | The client can be debugged by running 'python zmq_wikiarticle_client.py raw_article.txt'
18 |
19 | The workers take some hours to complete parsing the whole DB. This process can be greatly parallelized, distributed, etc., as you can run the workers in a cluster. The work is CPU intensive.
20 |
21 | The parser code exists in simpleWiki.py.
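
It can also be exercised on its own; given a file with raw wiki markup it prints the parse result and the first link found:

    python simpleWiki.py raw_article.txt

Passing "debug" as a second argument keeps the otherwise-suppressed tokens in the parse output.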
22 |
23 | First-Link Paths
24 | ----------------
25 | So much for generic parsing; I moved on to getting the first-link path (http://en.wikipedia.org/wiki/Wikipedia_talk:Get_to_Philosophy#A_plea_to_the_authors_of_the_tools_at_ryanelmquist.com_and_xefer.com).
26 |
27 | After the workers are done, you will end up with links_*.DOT files.
28 | They are in the DOT language format, which means GraphViz and other tools can read them, but they can't really be visualized as-is since the linkage is way too dense...
29 | So the next step is to run:
30 |
31 | cat links_* > links_all.DOT
32 | python parselinks.py links_all.DOT
33 |
34 | This will build a KyotoCabinet (http://fallabs.com/kyotocabinet/pythondoc/) HashDB of all the linkage: tempcabinet.kch.
35 | It should be ~600MB.
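
The cabinet simply maps each normalized article title to the target of one of its first links, so a quick sanity check looks something like this (the key here is just an example):

    from kyotocabinet import DB
    db = DB()
    db.open("tempcabinet.kch", DB.OREADER)
    print db["feedback"]   # e.g. "phenomenon"
    db.close()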
36 |
37 | The next step is traversing the first-link paths for all articles, and it is done using mapper.py and reducer.py.
38 | As the names may suggest, this is a kind of map-reduce approach to the problem, although as it turned out it doesn't really require a full-blown map-reduce run. It is multi-process, so any multi-core machine with >8GB of RAM does a very quick job.
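
At its core the mapper just walks the cabinet: for every article it keeps following the stored first link until it reaches 'philosophy', runs into a cycle, or hits a dead end, counting every edge it crosses (this is the heart of followOne in mapper.py):

    visited = [v]
    while db[v] and db[v] not in visited:
        visited.append(db[v])   # remembering the path also breaks cycles
        v = db[v]
        if v == 'philosophy': break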
39 |
40 | To start the process just run './run_mapper_reducer.sh'
41 | It first creates a snapshot of the HashDB, which every worker process then loads into memory. This cuts the I/O load of the processing to essentially zero, making it even more nicely parallelizable.
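
The snapshot trick is just KyotoCabinet's dump/load pair: the shell script dumps the on-disk cabinet once, and each mapper worker loads it into an in-memory database:

    db.open('tempcabinet.kch', DB.OREADER); db.dump_snapshot('tempcabinet.snapshot')   # once, in run_mapper_reducer.sh
    db.open(':'); db.load_snapshot('tempcabinet.snapshot')                             # in every mapper worker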
42 |
43 | The process takes a few minutes, depending on your setup (keep the number of workers below the number of CPU cores, since the Python worker processes are CPU-bound and context switches are expensive), and should end with a single file called outputX.dot.
44 | That file should be easy to visualize with most graph software (example: http://fluid.media.mit.edu/people/roy/media/tree_of_knowledge5.png).
45 | I used the excellent Gephi (http://gephi.org/).
46 |
47 | The result contains the culled network, with weights appropriately set for edges and nodes, and node labels set for nice visualization.
48 | Node weight is determined by the number of first-link paths going through it; edge weight is essentially the same.
49 |
50 | For example, outputX.dot may look like:
51 |
52 | digraph {
53 | phenomenon [weight=720546,label="Phenomenon"];
54 | baal_teshuva_movement -> phenomenon [weight=2];
55 | feedback -> phenomenon [weight=58];
56 | phenomena -> phenomenon [weight=42];
57 | the_spooklight -> phenomenon [weight=2];
58 | tea_leaf_paradox -> phenomenon [weight=1];
59 | cognitive_capture -> phenomenon [weight=1];
60 | ...
61 | }
62 |
63 | Acknowledgements
64 | ----------------
65 | Thanks to Aaron Zinman and Doug Fritz for their help.
66 |
67 | Enjoy!
68 | Roy.
69 |
--------------------------------------------------------------------------------
/mapper.py:
--------------------------------------------------------------------------------
1 | from kyotocabinet import *
2 | import sys
3 | from operator import itemgetter
4 | from pprint import PrettyPrinter
5 | import pickle
6 | from multiprocessing import Pool
7 | import time
8 |
9 | pp = PrettyPrinter(indent = 4)
10 |
11 | def followOne(k,v,db,histogram,update_hist):
12 | # find the "terminal" node for this source
13 | # (not efficient, can utilize the path for other sources)
14 | visited = [v]
15 | while db[v] and db[v] not in visited:
16 | #print "\t ->", db[v]
17 | visited.append(db[v]) # break the cycle
18 | v = db[v]
19 | if v == 'philosophy': break
20 | #if v not in ['philosophy','data_storage_device','association_football','transmission__telecommunications_','comparison','accounting_software','advocacy_group','recording','bloom','isotorpy']:
21 | #print k,'\n\t->',
22 | #print "\n\t->".join(visited)
23 |
24 | if histogram is None:
25 | print "\n\t->".join(visited)
26 | return
27 |
28 | for i in range(len(visited)-1):
29 | v = visited[i]
30 | v1 = visited[i+1]
31 | if not v1 in histogram:
32 | histogram[v1] = {'_w':1,v:1}
33 | else:
34 | histogram[v1]['_w'] = histogram[v1]['_w'] + 1
35 | if not v in histogram[v1]: histogram[v1][v] = 0
36 | histogram[v1][v] = histogram[v1][v] + 1
37 | '''
38 | if v not in histogram:
39 | if update_hist:
40 | histogram[v] = {'w':1}
41 | if len(visited)>1: histogram[v][visited[-2]] = {'w':1} # keep populating histogram
42 | if len(visited)>2: histogram[v][visited[-2]][visited[-3]] = {'w':1}
43 | if len(visited)>3: histogram[v][visited[-2]][visited[-3]][visited[-4]] = {'w':1}
44 | if len(visited)>4: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]] = {'w':1}
45 |
46 |
47 | else:
48 | histogram[v]['w'] = histogram[v]['w'] + 1
49 | if len(visited)>1:
50 | if visited[-2] not in histogram[v]: histogram[v][visited[-2]] = {'w':1}
51 | else: histogram[v][visited[-2]]['w'] = histogram[v][visited[-2]]['w'] + 1
52 | if len(visited)>2:
53 | if visited[-3] not in histogram[v][visited[-2]]: histogram[v][visited[-2]][visited[-3]] = {'w':1}
54 | else: histogram[v][visited[-2]][visited[-3]]['w'] = histogram[v][visited[-2]][visited[-3]]['w'] + 1
55 | if len(visited)>3:
56 | if visited[-4] not in histogram[v][visited[-2]][visited[-3]]: histogram[v][visited[-2]][visited[-3]][visited[-4]]= {'w':1}
57 | else: histogram[v][visited[-2]][visited[-3]][visited[-4]]['w'] = histogram[v][visited[-2]][visited[-3]][visited[-4]]['w'] + 1
58 | if len(visited)>4:
59 | if visited[-5] not in histogram[v][visited[-2]][visited[-3]][visited[-4]]: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]] = {'w':1}
60 | else: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]]['w'] = histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]]['w'] + 1
61 | '''
62 |
63 | p_db = None
64 |
65 | def traverse(start_index):
66 | global p_db
67 |
68 | histogram = {}
69 | update_hist = True
70 |
71 | if p_db is None:
72 | p_db = DB()
73 | p_db.open(":") #"tempcabinet.kch",DB.OREADER | DB.ONOLOCK)
74 | print >>sys.stderr, "load db snapshot"
75 | p_db.load_snapshot('tempcabinet.snapshot')
76 |
77 | start_time = time.time()
78 |
79 | '''
80 | db = DB(opts=[DB.GCONCURRENT])
81 | # open the database, reader, no lock
82 | if not db.open("tempcabinet.kch", DB.OREADER | DB.ONOLOCK):
83 | print >>sys.stderr, "open error: " + str(db.error())
84 | sys.exit(0)
85 | '''
86 | # traverse records
87 | #for i in range(1,2):
88 | print "traverse, jump to",start_index
89 |
90 | cur = p_db.cursor()
91 | cur.jump(start_index)
92 | count = 0
93 | #while True:
94 |     for j in range(1,10001): # one 10000-record chunk per worker, matching the key sampling stride in __main__
95 | rec = cur.get(True)
96 | if not rec: break
97 | (k,v) = rec
98 | followOne(k,v,p_db,histogram,update_hist)
99 | if j % 1000 == 0: print j,"hist:",len(histogram)
100 |
101 | histogram_temp = sorted(histogram.items(),key=lambda x: x[1]["_w"])
102 | histogram = dict(histogram_temp[-55:])
103 |
104 | print >>sys.stderr, "done traverse ",start_index
105 |
106 | #cur.jump()
107 | #while True:
108 | # rec = cur.get(True)
109 | # if not rec: break
110 | # print rec[0],':',rec[1]
111 | # pp.pprint(sorted(histogram.items(), key=lambda x: x[1]["w"]))
112 |
113 | histogram_name = 'histogram_'+start_index+'.pickle'
114 | p = pickle.Pickler(open(histogram_name,'w'))
115 | p.dump(histogram)
116 |
117 | cur.disable()
118 | #p_db.close()
119 |
120 | print "traverse took {0} seconds".format(time.time()-start_time)
121 |
122 | return histogram_name
123 |
124 | if __name__=="__main__":
125 |
126 | print >>sys.stderr, "preparing keys jumps for workers"
127 | db = DB()
128 | #db.open("tempcabinet.kch",DB.OREADER | DB.ONOLOCK)
129 | db.open(":")
130 | print >>sys.stderr, "load db snapshot"
131 | db.load_snapshot('tempcabinet.snapshot')
132 |
133 | if len(sys.argv) > 1:
134 | if not db[sys.argv[1]]:
135 |             print "can't find key ",sys.argv[1]
136 | else:
137 | followOne(sys.argv[1],db[sys.argv[1]],db,None,False)
138 | exit()
139 | # else:
140 | cur = db.cursor()
141 | cur.jump()
142 | keys = []
143 | for i in range(0,db.count()):
144 | rec = cur.get(True)
145 | if not rec: break
146 | (k,v) = rec
147 | if i % 10000 == 0: keys.append(k)
148 |
149 | cur.disable()
150 | db.close()
151 |
152 | pool = Pool(processes=6)
153 | try:
154 | print "histograms created: ",pool.map(traverse,keys)
155 | except(KeyboardInterrupt):
156 | print "Killing all processes..."
157 | pool.terminate()
158 | pool.join()
159 |
160 | # db.close()
161 |
--------------------------------------------------------------------------------
/zmq_wikiarticle_client.py:
--------------------------------------------------------------------------------
1 | import zmq,re,sys,traceback
2 | from lxml import etree
3 | import simpleWiki
4 |
5 | verbose = False
6 |
7 | def removeBalanced(article_text,delim_open,delim_close):
8 | stack = []
9 | ptr = 0
10 | nothingDone = False
11 | while not nothingDone: #article_text.find(delim_open) > -1 or article_text.find(delim_close) > -1 or # not efficient...
12 | nothingDone = True
13 | open_pos = article_text.find(delim_open,ptr)
14 | close_pos = article_text.find(delim_close,ptr)
15 | if open_pos > -1 and open_pos < close_pos:
16 | ptr = open_pos + len(delim_open)
17 | if verbose:
18 | print "found ",delim_open," at ",open_pos
19 | if len(stack)>0 and open_pos == stack[-1]: # in case we already found this..
20 | if verbose:
21 | print 'skipping...'
22 | continue
23 | stack.append(open_pos)
24 | nothingDone = False
25 | elif close_pos > -1:
26 | if verbose:
27 | print "found ",delim_close," at ",close_pos
28 | #ptr = close_pos
29 | try:
30 | from_pos = stack.pop()
31 | to_pos = close_pos+len(delim_close)
32 | article_text = article_text[:from_pos] + article_text[to_pos:]
33 | if len(stack) > 0:
34 | ptr = stack[-1] + len(delim_open)
35 | else:
36 | ptr = 0
37 | if verbose:
38 | print "delete {0} to {1}, ptr = {2}".format(from_pos,to_pos,ptr)
39 | nothingDone = False
40 | except(IndexError):
41 | break #some error i probably don't want to deal with...
42 | return article_text
43 |
44 | def getFirstLink(article_text):
45 | firstlink = article_text[article_text.find('[[')+2:article_text.find(']]')]
46 | if firstlink.find('|') > -1:
47 | firstlink = firstlink[:firstlink.find('|')]
48 | return firstlink.strip().lower()
49 |
50 | def normalizeLink(link):
51 | link = re.sub('\#.*$','',link)
52 | link = re.sub('\W','_',link.strip().lower())
53 | return link
54 |
55 | def writeLink(title,link,fh):
56 | link = normalizeLink(link)
57 | title = normalizeLink(title)
58 | if not link == title and len(link) != 0 and len(title) != 0:
59 | linkstr = "{0} -> {1}".format(title,link)
60 | if verbose: print "link: ", linkstr
61 | fh.write(linkstr+'\n')
62 | fh.flush()
63 |
64 |
65 | class article_client:
66 | def __init__(self,outputfile,number=-1):
67 |         self.linksFiles = [idx+"_"+outputfile for idx in ["third","forth","fifth"]] # outputfile is a filename string; three per-link output files are derived from it
68 | self.linksFilesHandles = [open(filename,'w') for filename in self.linksFiles]
69 | self.title = 'temp'
70 | self.number = number
71 |
72 | def start_client(self):
73 | context = zmq.Context()
74 |
75 | # Socket to talk to server
76 | print "Connecting to article server..."
77 | socket = context.socket(zmq.REQ)
78 | socket.connect ("tcp://localhost:5555")
79 |
80 | # self.linksFile = output_file #'links_'+sys.argv[1]+'.DOT','w')
81 |
82 | while True:
83 | try:
84 | if verbose:
85 | print "Sending request "
86 | socket.send ("GIVE_ARTICLE")
87 |
88 | # Get the reply.
89 | message = socket.recv()
90 | #print "Received reply [", message, "]"
91 | if message.find('ALL_DONE') == 0:
92 | break
93 |
94 | self.parseResponse(message)
95 |
96 |
97 | #break
98 | #continue
99 | except(KeyboardInterrupt):
100 | print "exiting..."
101 | break
102 |
103 | def parseResponse(self,message):
104 | root = etree.fromstring(message)
105 |
106 | self.title = root.find("title").text.encode('ascii','replace').strip().lower()
107 | #print >> sys.stderr, self.title, "[",self.number,"]"
108 |
109 | if verbose:
110 | print "parsing article ",self.title
111 |
112 | try:
113 | article_text = root.xpath("/page/revision/text")[0].text.encode('ascii','replace')
114 | except(AttributeError):
115 | if verbose:
116 | print "can't read text!"
117 | return False
118 |
119 | if re.search('\{\{(disambig.*?|geodis)\}\}',article_text) is not None or self.title.find('(disambiguation)') > -1:
120 | if verbose:
121 | print "This is a disambig..."
122 | #break
123 | return False
124 |
125 |
126 | if verbose:
127 | print "is this a redirect? ", root.find("redirect") != None
128 |
129 | #if root.find("redirect") != None:
130 | #link = getFirstLink(article_text)
131 | try:
132 | #link = simpleWiki.getMediaWikiFirstLink(article_text)
133 | links = simpleWiki.getNFirstLinks(article_text,5) # get first 5 links
134 | #link = simpleWiki.getNthLink(article_text,2)
135 |             for i in range(0,3): # scan the last 3 (link # 3,4,5)
136 | link = links[2+i]
137 | writeLink(self.title,link,self.linksFilesHandles[i]) # write each link to a diff file
138 | except:
139 | exc_type, exc_value, exc_traceback = sys.exc_info()
140 | traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
141 |
142 |
143 | return True
144 |
145 |
146 |
147 | # return self.parseText(article_text)
148 |
149 | def parseText(self,article_text):
150 | try:
151 | #link = simpleWiki.getMediaWikiFirstLink(article_text)
152 | link = simpleWiki.getNthLink(article_text,2)
153 |             writeLink(self.title,link,self.linksFilesHandles[0]) # parseText is only used for debugging; write to the first derived file
154 | except:
155 | exc_type, exc_value, exc_traceback = sys.exc_info()
156 | traceback.print_tb(exc_traceback, limit=1, file=sys.stderr)
157 | '''
158 | article_text = removeBalanced(article_text,'{{','}}')
159 | #article_text = removeBalanced(article_text,'(',')')
160 |
161 | # article_text = re.sub(r'\{\{[\s\S]*?\}\}','',article_text)
162 | article_text = re.sub(r'\[\[([Ii]mage|[Ff]ile)[\s\S]*?\]\]\n','',article_text) # remove image links
163 | # article_text = re.sub(r'\([\s\S]*?\)','',article_text) # remove paretheses
164 | article_text = re.sub(r'<\!--[\s\S]*?-->','',article_text) # remove html remarks
165 | article_text = re.sub(r'','',article_text) # remove html remarks
166 | article_text = re.sub(r'\:\'\'.*?\'\'','',article_text) # remove wiki italics
167 |             article_text = re.sub(r'<ref[\s\S]*?</ref>','',article_text) # remove refs
168 | article_text = re.sub(r'\(from \[\[[\s\S]*?\)','',article_text)
169 | article_text = re.sub(r'\[\[wikt\:[\s\S]*?\]\]','',article_text) # wikitionary links
170 |
171 | if verbose:
172 | print article_text
173 |
174 | firstlink = getFirstLink(article_text)
175 | writeLink(self.title,firstlink,self.linksFile)
176 | '''
177 |
178 | return True
179 |
180 | if __name__ == "__main__":
181 |     client = article_client('debug_links.DOT') # a filename string is expected (this debug name is arbitrary), not a stream
182 | # client.start_client('output'+sys.argv[1]+'.DOT')
183 | if len(sys.argv) > 2 and sys.argv[2] == 'verbose': verbose = True
184 | client.parseText(open(sys.argv[1],'r').read())
185 |
--------------------------------------------------------------------------------
]