├── run_mapper_reducer.sh
├── copy_to_remote.py
├── article_clients_runner.py
├── parse_xml.py
├── parselinks.py
├── zmq_wikiarticle_server.py
├── simpleWiki.py
├── reducer.py
├── README
├── mapper.py
└── zmq_wikiarticle_client.py

/run_mapper_reducer.sh:
--------------------------------------------------------------------------------
#!/bin/sh

python -c "from kyotocabinet import *; db=DB(); db.open('tempcabinet.kch',DB.OREADER); db.dump_snapshot('tempcabinet.snapshot'); db.close()"
rm histogram_*
nice python ../mapper.py > /dev/null
nice python ../reducer.py > output5.dot

--------------------------------------------------------------------------------
/copy_to_remote.py:
--------------------------------------------------------------------------------
import subprocess
import sys

def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])

flen = file_len(sys.argv[1])
print "size of ", sys.argv[1], " is ", str(flen)


for i in range(2000, flen, 2000):
    print "copy {0} to {1}".format(i, i + 2000)
    subprocess.call(["/Users/roysh/wikidb/copy_remote.sh", str(i), sys.argv[1]])

--------------------------------------------------------------------------------
/article_clients_runner.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool
import zmq
from zmq_wikiarticle_client import *

def f(num):
    filenm = 'links_{0}.DOT'.format(num)
    print "start links file ", filenm
    # pass the file *name*: article_client derives its output files from it
    # (passing an open file handle here crashes the string concatenation
    # in article_client.__init__)
    client = article_client(filenm, number=num)
    client.start_client()
    return True

NPROCESSES = 6

if __name__ == '__main__':
    #zmq_wikiarticle_client.verbose = True
    context = zmq.Context()

    # Socket to talk to server
    # print "Connecting to article server..."
    # socket = context.socket(zmq.REQ)
    # socket.connect ("tcp://localhost:5555")

    # print "ordering server to restart"
    # socket.send("START_OVER")

    pool = Pool(processes=NPROCESSES) # start worker processes

    pool.map(f, range(NPROCESSES))

--------------------------------------------------------------------------------
/parse_xml.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import sys,re,os
from cStringIO import StringIO

noncharre = re.compile('\W')

if __name__ == "__main__":
    page = StringIO()
    gotid = False
    while 1:
        next = sys.stdin.readline() # read a one-line string
        if not next: # or an empty string at EOF
            break

        page.write(next)

        if next.strip().find('<title>') > -1:
            title = next[next.find('title>')+6:next.find('</title')]
            title = noncharre.sub('-',title)
            title = title[:200]

        if not gotid and next.strip().find('<id>') > -1:
            title = next[next.find('<id>')+4:next.find('</id')] + '_' + title
            gotid = True

        if next.strip().find('</page') > -1:
            filename = title+'.xml'
            if not os.path.exists(filename):
                open(filename,'w').write(page.getvalue())
            page.close()
            page = StringIO()
            gotid = False

--------------------------------------------------------------------------------
/parselinks.py:
--------------------------------------------------------------------------------
from kyotocabinet import *
import sys

# create the database object
db = DB()

# open the database
if not db.open("tempcabinet.kch", DB.OWRITER | DB.OCREATE):
    print >>sys.stderr, "open error: " + str(db.error())
    sys.exit(1)

if not db.clear():
    print >>sys.stderr, "can't clear: " + str(db.error())
    sys.exit(1)

inc = 0

fh = open(sys.argv[1],'r')
for line in fh.xreadlines():
    # if line == None:
    #     break
    if line.find(" -> ") == -1: continue
    line = line.strip()

    try:
        (k,v) = line.split(" -> ")
    except(ValueError):
        continue
    v = v.strip()
    k = k.strip()
    db[k] = v
    # if not db.get(v):
    #     db[v] = 0

    # def incproc(key,value):
    #     return value + 1
    # db.accept(v,incproc) # attempt using a visitor procedure
    # db[v] = int(db[v]) + 1

print >>sys.stderr, "done reading file"

db.copy("backup.kch") # make a backup

# close the database
if not db.close():
    print >>sys.stderr, "close error: " + str(db.error())

--------------------------------------------------------------------------------
/zmq_wikiarticle_server.py:
--------------------------------------------------------------------------------
import zmq
import time
import sys,re,os
from cStringIO import StringIO

def roll_to_start(fh):
    # skip forward to just before the first <page> element
    last_pos = fh.tell()
    line = fh.readline()
    while line.find('<page>') == -1:
        last_pos = fh.tell()
        line = fh.readline()
    fh.seek(last_pos)


print "opening zmq server"

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5555")

noncharre = re.compile('\W')

fh = open('enwiki-latest-pages-articles.xml','r')
roll_to_start(fh)
print "starting pages at ",fh.tell()

print "serving clients"
serving = True
while serving:
    # Wait for next request from client
    message = socket.recv()
    #print "Received request: ", message

    if message.find("START_OVER") > -1:
        print "starting over..."
        roll_to_start(fh)
        socket.send("OK") # a REP socket must answer every request before the next recv()
        continue

    page = StringIO()
    gotid = False
    filerunning = True
    response = ''
    while filerunning:
        #next = sys.stdin.readline() # read a one-line string
        next = fh.readline()
        if not next: # or an empty string at EOF
            filerunning = False
            serving = False
            response = "ALL_DONE"
        else:
            # if not next.find('<page') > -1:
            #     continue # roll forward until the beginning of a page

            page.write(next)

            if next.strip().find('<title>') > -1:
                title = next[next.find('title>')+6:next.find('</title')]
                # title = noncharre.sub('-',title)
                # title = title[:200]
                #
                # if not gotid and next.strip().find('<id>') > -1:
                #     title = next[next.find('<id>')+4:next.find('</id')] + '_' + title
                #     gotid = True

            # done reading one page
            if next.strip().find('</page') > -1:
                response = page.getvalue()
                page.close()
                page = StringIO()
                gotid = False
                filerunning = False

    #print "serving: ",title
    # Send reply back to client
    socket.send(response)

--------------------------------------------------------------------------------
/simpleWiki.py:
--------------------------------------------------------------------------------
from pyparsing import *
import sys,re

debug = False

def getMediaWikiFirstLink(text):
    links = parseMediaWiki(text)

    if len(links) == 0:
        raise ParseException('no links found')

    links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
    firstlink = links[0]
    if firstlink.find('|') > -1:
        firstlink = firstlink[:firstlink.find('|')]

    return firstlink


def getNthLink(text,N):
    links = parseMediaWiki(text)
    if len(links) == 0:
        raise ParseException('no links found')
    links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
    if len(links) < N:
        Nlink = links[-1] # take the last one...
    else:
        Nlink = links[N-1]
    if Nlink.find('|') > -1:
        Nlink = Nlink[:Nlink.find('|')]

    return Nlink

def getNFirstLinks(text,N):
    links = parseMediaWiki(text)
    if len(links) == 0:
        raise ParseException('no links found')
    links = [ln for ln in links if re.search('^(file|image)',ln.lower()) is None]
    if len(links) < N:
        Nlinks = links
    else:
        Nlinks = links[0:N] # the first N links (the old 0:N-1 slice dropped the Nth)
    returnLinks = []
    for ln in Nlinks:
        if ln.find('|') > -1:
            ln = ln[:ln.find('|')]
        returnLinks.append(ln)
    return returnLinks

def parseMediaWiki(text):

    # ############################# Grammar #################################

    textNoStop = Regex('[^\s\{\}\[\]\(\)]+')
    myHtmlComment = QuotedString("<!--",endQuoteChar="-->",multiline=True)
    regularText = (textNoStop ^ Literal("[") ^ Literal("]") )

    regularBrackets = Forward()
    regularBrackets << Combine(Literal("(") + ZeroOrMore(Regex('[^\(\)]+') ^ regularBrackets) + Literal(")"))

    link = Forward()
    link << Combine( Literal("[[").suppress() + ZeroOrMore(Regex('[^\[\]]+') ^ link) + Literal("]]").suppress())

    curlyShit = Forward()
    curlyShit << Combine( Literal("{{") + ZeroOrMore( Regex('[^\{\}]+') ^ curlyShit ) + Literal("}}") , joinString=" ")

    curlyCurlyBar = QuotedString("{|",endQuoteChar="|}",multiline=True)+Optional(QuotedString("}",endQuoteChar="|}",multiline=True))
    strangeCurlyBar = QuotedString("|",endQuoteChar="|}",multiline=True) #+NotAny(Literal("}")) # strangely it may also appear like this...
    curlyBar = curlyCurlyBar ^ strangeCurlyBar

    strangeBeginRemark = Combine(Literal(":") + QuotedString("''") , joinString=" ")

    if debug:
        wikiMarkup = OneOrMore(regularText ^ strangeBeginRemark ^ curlyBar ^ curlyShit ^ myHtmlComment ^ link ^ regularBrackets)
    else:
        wikiMarkup = Optional(OneOrMore(regularText.suppress() ^ strangeBeginRemark.suppress() ^ curlyBar.suppress() ^ curlyShit.suppress() ^ myHtmlComment.suppress() ^ link ^ regularBrackets.suppress()))

    return wikiMarkup.parseString(text)

if __name__=="__main__":
    if len(sys.argv) > 2 and sys.argv[2] == "debug": debug = True
    text = open(sys.argv[1]).read().decode('utf-8').encode('ascii','ignore')
    print "Original\n\n",text
    print "parse\n\n",parseMediaWiki(text),"\n\n"
    print "first link\n\n",getMediaWikiFirstLink(text),"\n\n"

--------------------------------------------------------------------------------
/reducer.py:
--------------------------------------------------------------------------------
import multiprocessing
import pickle
import glob,pprint,sys,re

def combine_recurse(hist1,hist2):
    hist1_k = list(set(hist1.keys()) - set(['w']))

    if 'w' in hist1.keys(): hist1['w'] += hist2['w']

    for v2 in list(set(hist2.keys()) - set(['w'])):
        if v2 in hist1.keys():
            #print "combine ",v2
            # already exists - combine recursively
            #hist1[v2]['w'] = hist1[v2]['w'] + hist2[v2]['w']
            combine_recurse(hist1[v2],hist2[v2])
        else:
            hist1[v2] = hist2[v2] # just add


def reduce(hist_q):
    # if hist_q.empty(): return

    # get two histograms
    (hist1,hist2) = hist_q.get(True)

    return combine(hist1,hist2)

def combine(hist1,hist2):
    print "combining: {0} and {1}".format(hist1,hist2)

    # combine them
    hist1_o = pickle.Unpickler(open(hist1,'r')).load()
    hist2_o = pickle.Unpickler(open(hist2,'r')).load()
    combine_recurse(hist1_o,hist2_o)

    # pickle the result
    filename = '_'+hist1
    pickle.Pickler(open(filename,'w')).dump(hist1_o)

    return filename

def flatten(hist):
    flat = {}
    if hist is None or hist.keys() is None: return {}
    for i in hist.keys():
        if not isinstance(hist[i],dict) or hist[i] is None: continue
        flat[i] = hist[i]
        subflat = flatten(hist[i])
        if subflat is not None and len(subflat)>0: flat.update(subflat)
    return flat # this return was missing; the function silently produced None

def newcombine(hist1,hist2):
    for v in hist2.keys():
        if v == '_w': continue

        if v in hist1:
            hist1[v]['_w'] = hist1[v]['_w'] + hist2[v]['_w']
            # hist1[v].update(hist2[v])
            for k in hist2[v]:
                if k == '_w': continue # already summed above; adding it in the loop too double-counted the weight
                if k in hist1[v]: hist1[v][k] = hist1[v][k] + hist2[v][k]
                else: hist1[v][k] = hist2[v][k]
            if len(hist1[v]) > 80:
                #pprint.PrettyPrinter(2).pprint(hist1[v])
                histogram_temp = sorted(hist1[v].items(),key=lambda x: x[1])
                #pprint.PrettyPrinter(2).pprint(histogram_temp[-25:])
                hist1[v] = dict(histogram_temp[-25:])
                #print "cull internal ",v, ", biggest:",histogram_temp[-25],"now:",len(hist1[v])
        else:
            hist1[v] = hist2[v]

def textify_title(title):
    title = re.sub("\_"," ",title) # underscores
    title = re.sub("\s+"," ",title) # extra spaces
    return " ".join([w.capitalize() for w in title.split()]) # capitalize

if __name__=="__main__":
    # q = multiprocessing.Queue()
    # q.put(('histogram_1979_ml1.pickle','histogram_1990_great_american_bank_classic.pickle'))
    # reduce(q)
    files = glob.glob("histogram_*")
    # newh = combine(files.pop(),files.pop())
    # for filen in glob.glob("histogram_*"):
    #     newh = combine(newh,filen)

    print >>sys.stderr, "load ",files[-1]
    hist = pickle.Unpickler(open(files.pop(),'r')).load()
    for f in files:
        print >>sys.stderr, "combine with ",f
        newcombine(hist,pickle.Unpickler(open(f,'r')).load())
    #pprint.PrettyPrinter(2).pprint(hist)

    print "digraph {"
    for k in hist:
        print "{0} [weight={1},label=\"{2}\"];".format(k,hist[k]['_w'],textify_title(k))
        for v in hist[k]:
            if v == '_w': continue
            print "{1} -> {0} [weight={2}];".format(k,v,hist[k][v])
    print "}"

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
Tools for parsing the Wikipedia database in the MediaWiki format, and (potentially) distributed tools for getting the linkage scheme of the complete database.

Where To Start
--------------
First you should download a dump of the Wikipedia database: http://en.wikipedia.org/wiki/Wikipedia:Database_download

Then, use the distributed tools: zmq_wikiarticle_server.py and zmq_wikiarticle_client.py
They are based on a ZeroMQ pipe that shuffles the raw articles from the database onto worker processes.

The server looks for the "enwiki-latest-pages-articles.xml" file that is contained in the tar.gz you downloaded.
It opens a ZeroMQ server on port 5555 and waits for the clients to request an article.

To start the clients pool, run 'python article_clients_runner.py'
It will open 6 (configurable...) processes of the 'zmq_wikiarticle_client' module.
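For example, a typical session looks like (server in one terminal, clients pool in another):

python zmq_wikiarticle_server.py
python article_clients_runner.py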
Each client requests a page, parses it using pyparsing (more on this here: http://www.morethantechnical.com/2011/06/16/getting-all-the-links-from-a-mediawiki-format-using-pyparsing/) and saves all the first found links in a text file.

The client can be debugged by running 'python zmq_wikiarticle_client.py raw_article.txt'

The workers take some hours to complete parsing the whole DB. This process can be greatly parallelized and distributed, since you can run the workers on a cluster. The work is CPU-intensive.

The parser code lives in simpleWiki.py.

First-Link Paths
----------------
So much for generic parsing; I moved on to getting the first-link path (http://en.wikipedia.org/wiki/Wikipedia_talk:Get_to_Philosophy#A_plea_to_the_authors_of_the_tools_at_ryanelmquist.com_and_xefer.com).

After the workers are done, you will end up with links_*.DOT files.
They are in the DOT language format, which means GraphViz and others can read them, but they can't really be visualized as the linkage is way too dense...
So the next step is to run:

cat links_* > links_all.DOT
python parselinks.py links_all.DOT

This will build a KyotoCabinet (http://fallabs.com/kyotocabinet/pythondoc/) HashDB of all the linkage: tempcabinet.kch.
It should be ~600MB.

The next step is traversing the first-link paths for all articles, and it is done using mapper.py and reducer.py.
As the names may suggest, this is a kind of map-reduce approach to the problem, although as it turned out it doesn't really require a full-blown map-reduce run. It is multi-process, so any multi-core machine with >8GB RAM does a very quick job.

To start the process just run './run_mapper_reducer.sh'
It should create a snapshot of the HashDB that will be loaded into memory by all worker processes. This greatly reduces the I/O load of the processing (to zero, in fact), making it even more nicely parallelizable.

The process takes a few minutes, depending on your setup (make sure to keep the # of workers below your # of CPU cores, since each worker is a full Python process and context switches are expensive), and should end with a single file called outputX.dot.
That file should be easily visualized by any graph software (example: http://fluid.media.mit.edu/people/roy/media/tree_of_knowledge5.png).
I used the excellent Gephi (http://gephi.org/).

The result contains the culled network, with weights appropriately set for edges and nodes, and node labels also set for nice visualization.
Node weight is determined by the number of first-link paths going through it; edge weight is essentially the same.

For example, the outputX.dot may look like:

digraph {
phenomenon [weight=720546,label="Phenomenon"];
baal_teshuva_movement -> phenomenon [weight=2];
feedback -> phenomenon [weight=58];
phenomena -> phenomenon [weight=42];
the_spooklight -> phenomenon [weight=2];
tea_leaf_paradox -> phenomenon [weight=1];
cognitive_capture -> phenomenon [weight=1];
...
}

Acknowledgements
----------------
Thanks to Aaron Zinman and Doug Fritz for their help.

Enjoy!
Roy.
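P.S. mapper.py can also trace the first-link path of a single article: pass the normalized title as an argument, e.g. 'python mapper.py some_article_title' (normalized meaning lowercase with non-word characters replaced by underscores, the way zmq_wikiarticle_client.normalizeLink produces keys; the example title here is a placeholder).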

--------------------------------------------------------------------------------
/mapper.py:
--------------------------------------------------------------------------------
from kyotocabinet import *
import sys
from operator import itemgetter
from pprint import PrettyPrinter
import pickle
from multiprocessing import Pool
import time

pp = PrettyPrinter(indent = 4)

def followOne(k,v,db,histogram,update_hist):
    # find the "terminal" node for this source
    # (not efficient; the path could be reused for other sources)
    visited = [v]
    while db[v] and db[v] not in visited:
        #print "\t ->", db[v]
        visited.append(db[v]) # break the cycle
        v = db[v]
        if v == 'philosophy': break
    #if v not in ['philosophy','data_storage_device','association_football','transmission__telecommunications_','comparison','accounting_software','advocacy_group','recording','bloom','isotorpy']:
    #print k,'\n\t->',
    #print "\n\t->".join(visited)

    if histogram is None:
        print "\n\t->".join(visited)
        return

    for i in range(len(visited)-1):
        v = visited[i]
        v1 = visited[i+1]
        if not v1 in histogram:
            histogram[v1] = {'_w':1,v:1}
        else:
            histogram[v1]['_w'] = histogram[v1]['_w'] + 1
            if not v in histogram[v1]: histogram[v1][v] = 0
            histogram[v1][v] = histogram[v1][v] + 1
    '''
    if v not in histogram:
        if update_hist:
            histogram[v] = {'w':1}
            if len(visited)>1: histogram[v][visited[-2]] = {'w':1} # keep populating histogram
            if len(visited)>2: histogram[v][visited[-2]][visited[-3]] = {'w':1}
            if len(visited)>3: histogram[v][visited[-2]][visited[-3]][visited[-4]] = {'w':1}
            if len(visited)>4: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]] = {'w':1}


    else:
        histogram[v]['w'] = histogram[v]['w'] + 1
        if len(visited)>1:
            if visited[-2] not in histogram[v]: histogram[v][visited[-2]] = {'w':1}
            else: histogram[v][visited[-2]]['w'] = histogram[v][visited[-2]]['w'] + 1
        if len(visited)>2:
            if visited[-3] not in histogram[v][visited[-2]]: histogram[v][visited[-2]][visited[-3]] = {'w':1}
            else: histogram[v][visited[-2]][visited[-3]]['w'] = histogram[v][visited[-2]][visited[-3]]['w'] + 1
        if len(visited)>3:
            if visited[-4] not in histogram[v][visited[-2]][visited[-3]]: histogram[v][visited[-2]][visited[-3]][visited[-4]]= {'w':1}
            else: histogram[v][visited[-2]][visited[-3]][visited[-4]]['w'] = histogram[v][visited[-2]][visited[-3]][visited[-4]]['w'] + 1
        if len(visited)>4:
            if visited[-5] not in histogram[v][visited[-2]][visited[-3]][visited[-4]]: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]] = {'w':1}
            else: histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]]['w'] = histogram[v][visited[-2]][visited[-3]][visited[-4]][visited[-5]]['w'] + 1
    '''

p_db = None

def traverse(start_index):
    global p_db

    histogram = {}
    update_hist = True

    if p_db is None:
        p_db = DB()
        p_db.open(":") # ":" opens an on-memory stash DB; was: "tempcabinet.kch",DB.OREADER | DB.ONOLOCK
        print >>sys.stderr, "load db snapshot"
        p_db.load_snapshot('tempcabinet.snapshot')

    start_time = time.time()

    '''
    db = DB(opts=[DB.GCONCURRENT])
    # open the database, reader, no lock
    if not db.open("tempcabinet.kch", DB.OREADER | DB.ONOLOCK):
        print >>sys.stderr, "open error: " + str(db.error())
        sys.exit(0)
    '''
    # traverse records
    #for i in range(1,2):
    print "traverse, jump to",start_index

    cur = p_db.cursor()
    cur.jump(start_index)
    count = 0
    #while True:
    for j in range (1,10000):
        rec = cur.get(True)
        if not rec: break
        (k,v) = rec
        followOne(k,v,p_db,histogram,update_hist)
        if j % 1000 == 0: print j,"hist:",len(histogram)

    # keep only the 55 heaviest nodes of this worker's partial histogram
    histogram_temp = sorted(histogram.items(),key=lambda x: x[1]["_w"])
    histogram = dict(histogram_temp[-55:])

    print >>sys.stderr, "done traverse ",start_index

    #cur.jump()
    #while True:
    #    rec = cur.get(True)
    #    if not rec: break
    #    print rec[0],':',rec[1]
    # pp.pprint(sorted(histogram.items(), key=lambda x: x[1]["w"]))

    histogram_name = 'histogram_'+start_index+'.pickle'
    p = pickle.Pickler(open(histogram_name,'w'))
    p.dump(histogram)

    cur.disable()
    #p_db.close()

    print "traverse took {0} seconds".format(time.time()-start_time)

    return histogram_name

if __name__=="__main__":

    print >>sys.stderr, "preparing key jumps for workers"
    db = DB()
    #db.open("tempcabinet.kch",DB.OREADER | DB.ONOLOCK)
    db.open(":")
    print >>sys.stderr, "load db snapshot"
    db.load_snapshot('tempcabinet.snapshot')

    if len(sys.argv) > 1:
        if not db[sys.argv[1]]:
            print "can't find key ",sys.argv[1]
        else:
            followOne(sys.argv[1],db[sys.argv[1]],db,None,False)
        exit()
    # else:
    cur = db.cursor()
    cur.jump()
    keys = []
    for i in range(0,db.count()):
        rec = cur.get(True)
        if not rec: break
        (k,v) = rec
        if i % 10000 == 0: keys.append(k) # one jump-in key per 10000 records

    cur.disable()
    db.close()

    pool = Pool(processes=6)
    try:
        print "histograms created: ",pool.map(traverse,keys)
    except(KeyboardInterrupt):
        print "Killing all processes..."
        pool.terminate()
        pool.join()

    # db.close()

--------------------------------------------------------------------------------
/zmq_wikiarticle_client.py:
--------------------------------------------------------------------------------
import zmq,re,sys,traceback
from lxml import etree
import simpleWiki

verbose = False

def removeBalanced(article_text,delim_open,delim_close):
    stack = []
    ptr = 0
    nothingDone = False
    while not nothingDone: #article_text.find(delim_open) > -1 or article_text.find(delim_close) > -1 or # not efficient...
        nothingDone = True
        open_pos = article_text.find(delim_open,ptr)
        close_pos = article_text.find(delim_close,ptr)
        if open_pos > -1 and open_pos < close_pos:
            ptr = open_pos + len(delim_open)
            if verbose:
                print "found ",delim_open," at ",open_pos
            if len(stack)>0 and open_pos == stack[-1]: # in case we already found this..
                if verbose:
                    print 'skipping...'
                continue
            stack.append(open_pos)
            nothingDone = False
        elif close_pos > -1:
            if verbose:
                print "found ",delim_close," at ",close_pos
            #ptr = close_pos
            try:
                from_pos = stack.pop()
                to_pos = close_pos+len(delim_close)
                article_text = article_text[:from_pos] + article_text[to_pos:]
                if len(stack) > 0:
                    ptr = stack[-1] + len(delim_open)
                else:
                    ptr = 0
                if verbose:
                    print "delete {0} to {1}, ptr = {2}".format(from_pos,to_pos,ptr)
                nothingDone = False
            except(IndexError):
                break # some error I probably don't want to deal with...
    return article_text

def getFirstLink(article_text):
    firstlink = article_text[article_text.find('[[')+2:article_text.find(']]')]
    if firstlink.find('|') > -1:
        firstlink = firstlink[:firstlink.find('|')]
    return firstlink.strip().lower()

def normalizeLink(link):
    link = re.sub('\#.*$','',link)
    link = re.sub('\W','_',link.strip().lower())
    return link

def writeLink(title,link,fh):
    link = normalizeLink(link)
    title = normalizeLink(title)
    if not link == title and len(link) != 0 and len(title) != 0:
        linkstr = "{0} -> {1}".format(title,link)
        if verbose: print "link: ", linkstr
        fh.write(linkstr+'\n')
        fh.flush()


class article_client:
    def __init__(self,outputfile,number=-1):
        # outputfile is a file *name*; three output files are derived from it,
        # one for each of links no. 3, 4 and 5 of every article
        self.linksFiles = [idx+"_"+outputfile for idx in ["third","fourth","fifth"]]
        self.linksFilesHandles = [open(filename,'w') for filename in self.linksFiles]
        self.linksFile = self.linksFilesHandles[0] # used by parseText() in debug runs
        self.title = 'temp'
        self.number = number

    def start_client(self):
        context = zmq.Context()

        # Socket to talk to server
        print "Connecting to article server..."
        socket = context.socket(zmq.REQ)
        socket.connect ("tcp://localhost:5555")

        # self.linksFile = output_file #'links_'+sys.argv[1]+'.DOT','w')

        while True:
            try:
                if verbose:
                    print "Sending request "
                socket.send ("GIVE_ARTICLE")

                # Get the reply.
                message = socket.recv()
                #print "Received reply [", message, "]"
                if message.find('ALL_DONE') == 0:
                    break

                self.parseResponse(message)


                #break
                #continue
            except(KeyboardInterrupt):
                print "exiting..."
                break

    def parseResponse(self,message):
        root = etree.fromstring(message)

        self.title = root.find("title").text.encode('ascii','replace').strip().lower()
        #print >> sys.stderr, self.title, "[",self.number,"]"

        if verbose:
            print "parsing article ",self.title

        try:
            article_text = root.xpath("/page/revision/text")[0].text.encode('ascii','replace')
        except(AttributeError):
            if verbose:
                print "can't read text!"
            return False

        if re.search('\{\{(disambig.*?|geodis)\}\}',article_text) is not None or self.title.find('(disambiguation)') > -1:
            if verbose:
                print "This is a disambig..."
            #break
            return False


        if verbose:
            print "is this a redirect? ", root.find("redirect") != None
", root.find("redirect") != None 128 | 129 | #if root.find("redirect") != None: 130 | #link = getFirstLink(article_text) 131 | try: 132 | #link = simpleWiki.getMediaWikiFirstLink(article_text) 133 | links = simpleWiki.getNFirstLinks(article_text,5) # get first 5 links 134 | #link = simpleWiki.getNthLink(article_text,2) 135 | for i in range(0,2): # scan the last 3 (link # 3,4,5) 136 | link = links[2+i] 137 | writeLink(self.title,link,self.linksFilesHandles[i]) # write each link to a diff file 138 | except: 139 | exc_type, exc_value, exc_traceback = sys.exc_info() 140 | traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) 141 | 142 | 143 | return True 144 | 145 | 146 | 147 | # return self.parseText(article_text) 148 | 149 | def parseText(self,article_text): 150 | try: 151 | #link = simpleWiki.getMediaWikiFirstLink(article_text) 152 | link = simpleWiki.getNthLink(article_text,2) 153 | writeLink(self.title,link,self.linksFile) 154 | except: 155 | exc_type, exc_value, exc_traceback = sys.exc_info() 156 | traceback.print_tb(exc_traceback, limit=1, file=sys.stderr) 157 | ''' 158 | article_text = removeBalanced(article_text,'{{','}}') 159 | #article_text = removeBalanced(article_text,'(',')') 160 | 161 | # article_text = re.sub(r'\{\{[\s\S]*?\}\}','',article_text) 162 | article_text = re.sub(r'\[\[([Ii]mage|[Ff]ile)[\s\S]*?\]\]\n','',article_text) # remove image links 163 | # article_text = re.sub(r'\([\s\S]*?\)','',article_text) # remove paretheses 164 | article_text = re.sub(r'<\!--[\s\S]*?-->','',article_text) # remove html remarks 165 | article_text = re.sub(r'<!--[\s\S]*?-->','',article_text) # remove html remarks 166 | article_text = re.sub(r'\:\'\'.*?\'\'','',article_text) # remove wiki italics 167 | article_text = re.sub(r'<ref[\s\S]*?</ref>','',article_text) # revmoe refs 168 | article_text = re.sub(r'\(from \[\[[\s\S]*?\)','',article_text) 169 | article_text = re.sub(r'\[\[wikt\:[\s\S]*?\]\]','',article_text) # wikitionary links 170 | 171 | if verbose: 172 | print article_text 173 | 174 | firstlink = getFirstLink(article_text) 175 | writeLink(self.title,firstlink,self.linksFile) 176 | ''' 177 | 178 | return True 179 | 180 | if __name__ == "__main__": 181 | client = article_client(sys.stdout) 182 | # client.start_client('output'+sys.argv[1]+'.DOT') 183 | if len(sys.argv) > 2 and sys.argv[2] == 'verbose': verbose = True 184 | client.parseText(open(sys.argv[1],'r').read()) 185 | --------------------------------------------------------------------------------