├── clean_ttl.py ├── create_pagenames_db.py ├── master_clean.py ├── pres_clean.py ├── query.py ├── readme.md ├── requirements.txt ├── static ├── .DS_Store ├── css │ └── index.css ├── images │ ├── arrow.gif │ ├── arrow.png │ ├── cat.jpg │ ├── logo-background.png │ ├── mem_test.png │ ├── sadpanda.jpg │ └── wikigraph_ss.png └── js │ ├── graph.js │ ├── index.js │ └── summary.js ├── templates └── index.html ├── test_continuity.py └── wikigraph.py /clean_ttl.py: -------------------------------------------------------------------------------- 1 | def clean_ttl(read_path, write_path): 2 | with open(read_path, 'rb') as p, open(write_path, 'wb') as c: 3 | p.next() 4 | for line in p: 5 | l = line.split() 6 | source = l[0][29:-1] 7 | target = l[2][29:-1] 8 | c.write(source + '\t' + target + '\n') 9 | 10 | if __name__ == '__main__': 11 | print "Cleaning page links..." 12 | clean_ttl('data/page_links_en.ttl', 'data/cleaned_links.tsv') 13 | print "Cleaning redirects..." 14 | clean_ttl('data/redirects_en.ttl', 'data/cleaned_redirects.tsv') 15 | print "Done!" -------------------------------------------------------------------------------- /create_pagenames_db.py: -------------------------------------------------------------------------------- 1 | import csv, sqlite3 2 | 3 | conn = sqlite3.connect('pagenames.db') 4 | curs = conn.cursor() 5 | curs.execute('CREATE TABLE pagenames (code INTEGER PRIMARY KEY, title TEXT, title_lower TEXT, degrees INTEGER);') 6 | 7 | with open('data/nodes.tsv', 'rb') as csv_file: 8 | sql_insert = 'INSERT INTO pagenames VALUES(?, ?, ?, ?)' 9 | reader = csv.reader(csv_file, delimiter='\t') 10 | reader.next() 11 | for row in reader: 12 | title = unicode(row[1].replace('_', ' '), 'utf8') 13 | code = int(row[0]) 14 | degrees = int(row[3]) 15 | to_db = [code, title, title.lower(), degrees] 16 | curs.execute(sql_insert, to_db) 17 | 18 | conn.commit() 19 | 20 | curs.execute('CREATE UNIQUE INDEX codes ON pagenames(code);') 21 | curs.execute('CREATE INDEX titles ON pagenames(title);') 22 | curs.execute('CREATE INDEX titles_lower ON pagenames(title_lower);') 23 | curs.execute('CREATE INDEX degrees ON pagenames(degrees);') 24 | 25 | conn.close() -------------------------------------------------------------------------------- /master_clean.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urllib2 3 | 4 | def redirects_dict(redirects_path): 5 | """Iterates through the redirects file and creates a set of redirect page 6 | names.""" 7 | 8 | redirects = {} 9 | with open(redirects_path, 'rb') as reds: 10 | for line in reds: 11 | l = line.split('\t') 12 | source = l[0] 13 | target = l[1].rstrip() 14 | redirects.setdefault(source, target) 15 | 16 | return redirects 17 | 18 | def convert_to_unicode(title): 19 | title = urllib2.unquote(title) 20 | return title 21 | 22 | def assemble_dict(link_path, redirects): 23 | """Iterates through the pagelinks file and returns a dictionary containing 24 | information about the page, its unique code, and what it links to (if 25 | anything). 
26 | 27 | Example of returned dictionary: 28 | {'page1': {'code': 41, 29 | 'title': 'page1', 30 | 'links': set([42, 108])}}""" 31 | 32 | with open(link_path, 'rb') as paths: 33 | data = {} 34 | foo = 0 35 | code_counter = 0 36 | 37 | t0 = time.time() 38 | for line in paths: 39 | l = line.split('\t') 40 | start = l[0] 41 | end = l[1].rstrip() 42 | 43 | if end[:5] == "File:" or end[:9] == "Category:": 44 | continue 45 | 46 | if start in redirects or start == end: 47 | continue 48 | 49 | if end[:2] == "S:" or end[:4] == "Help:": 50 | continue 51 | 52 | if '%' in start: 53 | start = convert_to_unicode(start) 54 | 55 | if '%' in end: 56 | end = convert_to_unicode(end) 57 | 58 | if end in redirects: # if start points to a redirect page 59 | end = redirects[end] # replace it with the real page 60 | 61 | if data.get(start, 0) == 0: 62 | data[start] = {'title': start, 63 | 'links': {end}} 64 | else: 65 | data[start]['links'].add(end) 66 | 67 | if data.get(end, 0) == 0: 68 | data[end] = {'title': end, 69 | 'links': set()} 70 | 71 | foo += 1 72 | if foo % 10000000 == 0: 73 | x = foo/1000000 74 | y = (time.time() - t0)/60 75 | print "%d million lines read in %.2f minutes" % (x, y) 76 | 77 | return data 78 | 79 | def find_deadends(data): 80 | """Iterates through the page links dictionary, and for every page without 81 | outgoing links, adds the code number to the 'deadends' set and deletes the 82 | key from the dictionary.""" 83 | 84 | deadends = set() 85 | keys = data.keys() 86 | for key in keys: 87 | value = data[key] 88 | if not value['links']: 89 | deadends.add(value['title']) 90 | del data[key] # remove key from data 91 | 92 | return deadends, keys 93 | 94 | def prune_deadends(data, deadends, keys): 95 | """Iterates through the page links dictionary, and for every page, removes 96 | links that are in the 'deadends' set.""" 97 | 98 | for key in keys: 99 | value = data.get(key) 100 | if value is not None: 101 | links = value['links'].copy() 102 | for link in links: 103 | if link in deadends: 104 | value['links'].remove(link) 105 | 106 | def recode_data(data): 107 | """Iterates through the page links dictionary and assigns a code to 108 | every page. Returns a dictionary of title:code lookups.""" 109 | 110 | codes = {} 111 | code_counter = 0 112 | for key, value in data.iteritems(): 113 | data[key].update({'code': code_counter}) 114 | codes[value['title']] = code_counter 115 | code_counter += 1 116 | 117 | return codes 118 | 119 | def write_rels(data, rels_path, codes): 120 | """Iterates through the page links dictionary and writes the results to 121 | the rels.tsv file (start, end, link_type).""" 122 | 123 | with open(rels_path, 'wb+') as rels: 124 | rels.write('start\tend\ttype\n') 125 | for value in data.values(): 126 | code = str(value['code']) 127 | if value['links']: 128 | for link in value['links']: 129 | rels.write(code + '\t' + str(codes[link]) + '\tLINKS_TO\n') 130 | 131 | def write_nodes(data, nodes_path): 132 | """Iterates through the page links dictionary (sorted by code number) 133 | and writes the results to the nodes.tsv file (code, title, label, degrees). 
134 | """ 135 | 136 | with open(nodes_path, 'wb+') as nodes: 137 | nodes.write('node\tname\tl:label\tdegrees\n') 138 | for page in sorted(data.values(), key=lambda k: k['code']): 139 | code = str(page['code']) 140 | deg = str(len(page['links'])) 141 | nodes.write(code + '\t' + page['title'] + '\tPage\t'+ deg + '\n') 142 | 143 | def clean_data(): 144 | """Reads a tab-separated file of Wikipedia links and creates one tsv file for 145 | page links and one for pages. First it assembles a dictionary of redirect 146 | pages, then it creates a page links dictionary, filtering out redirects and 147 | specific page types. Next, pages with no outgoing links are removed and 148 | their title is added to a 'deadend' set. Then, pages in the dictionary 149 | remove links to pages in the deadend set. Finally, the dictionary is 150 | parsed and information is written to two .tsv files.""" 151 | 152 | print "Creating set of redirect pages..." 153 | redirects = redirects_dict('data/cleaned_redirects.tsv') 154 | print "Reading page links..." 155 | data = assemble_dict('data/cleaned_links.tsv', redirects) 156 | raw_length = len(data) 157 | print "Page links dictionary created with %d pages." % raw_length 158 | print "Finding deadends..." 159 | deadends, keys = find_deadends(data) 160 | print "Pruning %d deadends..." % len(deadends) 161 | prune_deadends(data, deadends, keys) 162 | print "Recoding data..." 163 | codes = recode_data(data) 164 | perc = (len(data)/float(raw_length))*100 165 | print "Pages pruned, now %d pages (%.2f%% of original)." % (len(data), perc) 166 | print "Writing 'rels.tsv'..." 167 | write_rels(data, 'data/rels.tsv', codes) 168 | print "Writing 'nodes.tsv'..." 169 | write_nodes(data, 'data/nodes.tsv') 170 | print "Done!" 171 | 172 | if __name__ == "__main__": 173 | clean_data() -------------------------------------------------------------------------------- /pres_clean.py: -------------------------------------------------------------------------------- 1 | PRESIDENTS = set([ 2 | 'George_Washington', 'John_Adams', 'Thomas_Jefferson', 3 | 'James_Madison', 'James_Monroe','John_Quincy_Adams', 4 | 'Andrew_Jackson','Martin_Van_Buren', 'William_Henry_Harrison', 5 | 'John_Tyler', 'James_K._Polk', 'Zachary_Taylor', 6 | 'Millard_Fillmore', 'Franklin_Pierce', 'James_Buchanan', 7 | 'Abraham_Lincoln', 'Andrew_Johnson', 'Ulysses_S._Grant', 8 | 'Rutherford_B._Hayes', 'James_A._Garfield', 'Chester_A._Arthur', 9 | 'Grover_Cleveland', 'Benjamin_Harrison', 'William_McKinley', 10 | 'Theodore_Roosevelt', 'William_Howard_Taft', 'Woodrow_Wilson', 11 | 'Warren_G._Harding', 'Calvin_Coolidge', 'Herbert_Hoover', 12 | 'Franklin_D._Roosevelt', 'Harry_S._Truman', 13 | 'Dwight_D._Eisenhower', 'John_F._Kennedy', 'Lyndon_B._Johnson', 14 | 'Richard_Nixon', 'Gerald_Ford', 'Jimmy_Carter', 'Ronald_Reagan', 15 | 'George_H._W._Bush', 'Bill_Clinton', 'George_W._Bush', 16 | 'Barack_Obama' 17 | ]) 18 | 19 | ### This creates the presidents subgraph 20 | # parse page_links, if source or target is a president, write it 21 | with open('data/cleaned_links.tsv', 'r') as f, open('data/pres_links.tsv', 'wb+') as p: 22 | for line in f: 23 | l = line.split('\t') 24 | start = l[0] 25 | end = l[1].rstrip() 26 | if start in PRESIDENTS or end in PRESIDENTS: 27 | p.write(line) -------------------------------------------------------------------------------- /query.py: -------------------------------------------------------------------------------- 1 | from py2neo import neo4j 2 | import json, time 3 | 4 | def find_shortest_path(node1, node2): 5 
| """Connects to graph database, then creates and sends query to graph 6 | database. Returns the shortest path between two nodes. 7 | Format: (67149)-[:'LINKS_TO']->(421)""" 8 | 9 | graph_db = neo4j.GraphDatabaseService() 10 | 11 | t0 = time.time() 12 | 13 | query = neo4j.CypherQuery( 14 | graph_db, 15 | """MATCH (m:Page {node:{n1}}), (n:Page {node:{n2}}), 16 | p = shortestPath((m)-[*..10]->(n)) RETURN p""" 17 | ) 18 | try: 19 | path = query.execute_one(n1=node1, n2=node2) 20 | except: 21 | path = None 22 | 23 | t1 = time.time() 24 | 25 | print "\nShortest Path:", path 26 | print "Time elapsed: %.2f seconds" % (t1 - t0) 27 | 28 | return path 29 | 30 | def parse_node(node, in_path): 31 | """Extract title and code from a node object. Returns a dict of information.""" 32 | 33 | code, deg, title = node.get_properties().values() 34 | title = title.replace('_', ' ') 35 | 36 | if title == "Basque people": # special exception for a changed redirect 37 | title = "Basques" 38 | 39 | node_dict = {'code': int(code), 40 | 'title': title, 41 | 'degrees': deg, 42 | 'group': 'none'} 43 | 44 | if in_path: 45 | node_dict['group'] = 'path' 46 | 47 | return node_dict 48 | 49 | def parse_rel(rel, in_path): 50 | """Extract node code from a relationship object. Returns a dict of 51 | information.""" 52 | 53 | start_id = rel.start_node.get_properties()['node'] 54 | end_id = rel.end_node.get_properties()['node'] 55 | 56 | rel_dict = {'source': int(start_id), 57 | 'target': int(end_id), 58 | 'value': 0} 59 | 60 | if in_path: 61 | rel_dict['value'] = 1 62 | 63 | return rel_dict 64 | 65 | def parse_node_objs(node_objs_list, in_path=False): 66 | """Takes a list of node objects. Returns dict of node dicts.""" 67 | 68 | nodes = {} 69 | 70 | for node in node_objs_list: 71 | node_dict = parse_node(node, in_path=in_path) 72 | if node_dict['code'] not in nodes: 73 | nodes[node_dict['code']] = node_dict 74 | 75 | return nodes 76 | 77 | def parse_rel_objs(rel_objs_list, in_path=False): 78 | """Takes a list of relationship objects. Returns list of rel dicts.""" 79 | 80 | rel_dict_list = [parse_rel(rel=rel, in_path=in_path) for rel in rel_objs_list] 81 | 82 | return rel_dict_list 83 | 84 | def find_other_nodes(node_objs_list): 85 | """Takes a list of node objects. Returns list of rel dicts and list of 86 | node dicts.""" 87 | 88 | rels = [] 89 | nodes = [] 90 | 91 | for node in node_objs_list: 92 | 93 | for rel in node.match_incoming(limit=8): 94 | rels.append(rel) 95 | nodes.append(rel.start_node) 96 | 97 | for rel in node.match_outgoing(limit=8): 98 | rels.append(rel) 99 | nodes.append(rel.end_node) 100 | 101 | rel_dict_list = parse_rel_objs(rels) 102 | node_dict_list = parse_node_objs(nodes) 103 | 104 | return rel_dict_list, node_dict_list 105 | 106 | def merge_node_dicts(path_nodes, npath_nodes): 107 | """Takes and merges the two dictionaries of node dicts. Returns list of 108 | node dicts.""" 109 | 110 | d = dict(npath_nodes.items() + path_nodes.items()) 111 | node_dict_list = [node_dict for node_dict in d.values()] 112 | 113 | return node_dict_list 114 | 115 | def parse_nodes_and_rels(path): 116 | """Takes a path object. 
Returns two lists, one for rel dicts and one for 117 | node dicts.""" 118 | 119 | # rel dict list for main path 120 | path_rels = parse_rel_objs(rel_objs_list=path.relationships, in_path=True) 121 | 122 | # parse nodes, create list of unique nodes 123 | path_nodes = parse_node_objs(node_objs_list=path.nodes, in_path=True) 124 | 125 | # this is a quick/dirty way to grab the names for each path node in order 126 | path_names = [] 127 | for node in path.nodes: 128 | path_dict = node.get_properties().values()[0] 129 | path_names.append({'title': path_nodes[int(path_dict)]['title'], 130 | 'code': path_nodes[int(path_dict)]['code']}) 131 | 132 | # rel dict list for secondary rels 133 | npath_rels, npath_nodes = find_other_nodes(node_objs_list=path.nodes) 134 | 135 | # filter out reversed or duplicate paths in the path rels 136 | for rel in npath_rels: 137 | for path in path_rels: 138 | if rel['source'] == path['target'] and rel['target'] == path['source']: 139 | rel['value'] = 1 # include it in the path 140 | if rel['source'] == path['source'] and rel['target'] == path['target']: 141 | npath_rels.remove(rel) # remove duplicates 142 | 143 | # combine the two lists for nodes and rels 144 | rels_list = path_rels + npath_rels 145 | nodes_list = merge_node_dicts(path_nodes, npath_nodes) 146 | 147 | return rels_list, nodes_list, path_names 148 | 149 | def create_lists(node1, node2): 150 | """Request the shortest path between two nodes from the database. Assemble 151 | list of nodes and relationships from the path, then process to recode their 152 | IDs. Write output to a JSON file.""" 153 | 154 | path = find_shortest_path(str(node1), str(node2)) 155 | 156 | if path: 157 | 158 | rels_list, nodes_list, path_names = parse_nodes_and_rels(path) 159 | 160 | codes = {} 161 | id_counter = 0 162 | 163 | for node in nodes_list: # create a dict to translate id codes 164 | node_id = node['code'] 165 | if node_id not in codes: 166 | codes[node_id] = id_counter 167 | id_counter += 1 168 | 169 | for rel in rels_list: # look up the source and target in codes 170 | rel['source'] = codes[rel['source']] 171 | rel['target'] = codes[rel['target']] 172 | 173 | response = """{ "path": %s, "results": { "directed": true, "nodes": %s, 174 | "links": %s, "multigraph": false }}""" % (json.dumps(path_names), json.dumps(nodes_list), json.dumps(rels_list)) 175 | 176 | else: 177 | response = '{ "path": "None", "results": "None" }' 178 | 179 | return response 180 | 181 | if __name__ == '__main__': 182 | print create_lists('335354', '3778612') # Abraham Lincoln to Astronomy 183 | 184 | 185 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | wikiGraph 2 | =========== 3 | What connects two topics on Wikipedia? For example, how many links do you have to click to get from Harry Potter to the Spanish Inquisition?* Combining trivia nerdery with graph theory, wikiGraph allows users to find and explore the paths within Wikipedia. 4 | 5 | You can check out the project [here](http://wikigraph.erikaarnold.com). 6 | 7 | *It takes a minimum of 3 clicks. 
Here's one path: Harry Potter -> British literature -> the spread of the printing press -> the Spanish Inquisition 8 | 9 | ###Contents 10 | - [Features](#features) 11 | - [Data cleaning](#data-cleaning) 12 | - [Queries](#queries) 13 | - [Data visualization](#data-visualization) 14 | - [Improving response time](#improving-response-time) 15 | - [Deployment](#deployment) 16 | 17 | #### Features 18 | *Current* 19 | - [x] Wikipedia page links imported into a graph database (Neo4j) 20 | - [x] Python wrapper queries database for shortest path between two nodes, outputs path and secondary relationships as JSON (py2neo) 21 | - [x] Result rendered as a force-directed graph (d3.js) 22 | - [x] Flask app renders HTML and handles AJAX requests to the database 23 | - [x] Flask app and database deployed (EC2, Apache) 24 | - [x] Search suggest for page titles (typeahead.js, SQLite) 25 | - [x] Embed page images on nodes within the rendered graph (Wikipedia API) 26 | - [x] Option to generate a pseudo-random search query 27 | - [x] Nodes are sized/colored based on the number of links to other nodes 28 | - [x] Incorporate summary of path pages as mouseover tooltips (Wikipedia API) 29 | 30 | *Future* 31 | 32 | - [ ] Path responses cached (CouchDB) 33 | - [ ] Snippets of each path node displayed in the mouseover tooltip (Beautiful Soup) 34 | 35 | ![wikigraph_screenshot](static/images/wikigraph_ss.png) 36 | 37 | #### Data cleaning 38 | I downloaded RDF files (.ttl) for page links and redirects from [DBPedia](http://wiki.dbpedia.org/Downloads2014). Here's what the raw page links file looks like: 39 | ``` 40 | . 41 | . 42 | ``` 43 | Wikipedia is big! This file includes over 150 million relationships. To reduce the file size before filtering data, I ran clean_ttl.py to pull out the page names from the source and target and write them to a tab-separated file. This significantly reduced the file sizes for both links and redirects (23GB -> 6.2GB, 980MB -> 275MB). 44 | 45 | I then used master_clean.py to parse and clean the page links. As a first pass, this meant removing redirect pages and duplicates. After looking at the output, I realized that almost half of the page links were to pages that had no outgoing links--they were dead-ends. Some were specific types of links (e.g. File, Category, Help) so I could filter those. *(Side note: Why don't they keep track of inter-Wikipedia links on Category pages? Those could be useful.)* 46 | 47 | However, even outside those categories, almost half of the pages in the file never linked to anything else. I decided to modify my cleaning script to remove the dead-ends--and any links pointing to them--from my data (see [Pruning the Graph](#pruning-the-graph) for my rationale). 48 | 49 | Here is the main function in master_clean.py: 50 | 51 | ```python 52 | def clean_data(): 53 | """Reads a tab-separated file of page links, creates one tab-separated 54 | file for page links (rels.tsv) and one for pages (nodes.tsv). First it 55 | assembles a dictionary of redirect pages, then it creates a page link 56 | dictionary, filtering out redirects and specific page types. Then, pages 57 | with no outgoing links are removed and their code is added to a 'dead-end' 58 | set. Pages with links to pages in the dead-end set remove those links. 
Finally, the dictionary is parsed and information is 59 |     written to two .tsv files.""" 60 | 61 |     redirects = redirects_dict('data/cleaned_redirects.tsv') 62 |     data = assemble_dict('data/cleaned_links.tsv', redirects) 63 |     deadends, keys = find_deadends(data) 64 |     prune_deadends(data, deadends, keys) 65 |     codes = recode_data(data) 66 |     write_rels(data, 'data/rels.tsv', codes) 67 |     write_nodes(data, 'data/nodes.tsv') 68 | ``` 69 | I needed page codes to be continuous, so they are assigned only after the dictionary is pruned. I also wrote test_continuity.py to verify that the resulting nodes.tsv file has continuous codes. 70 | 71 | The script is quite memory-intensive: even on a server with 15GB of RAM, it took about 30 minutes to execute--but it worked! After cleaning, the complete graph has about 4.5 million nodes and 110 million edges. The data are stored in two tsv files: a list of all relationships and a list of all nodes. 72 | 73 | __nodes.tsv__ (160MB) 74 | ``` 75 | node title l:label degrees 76 | 0 Alabama Pages 83 77 | 1 Andrew Jackson Pages 51 78 | ``` 79 | __rels.tsv__ (2.5GB) 80 | ``` 81 | start end type 82 | 0 1 LINKS_TO 83 | 2 3 LINKS_TO 84 | ``` 85 | I used Michael Hunger's [batch import tool](https://github.com/jexp/batch-import/tree/20) to insert the data into a [Neo4j](http://neo4j.com/) graph database. Also, after much research and many failed batch imports, I appended ```batch_import.csv.quotes=false``` to **batch.properties** because stray double quotes in the page titles cause a lookup failure when importing relationships (a problem that had not surfaced in the presidents subgraph). 86 | 87 | #####Database and model 88 | Within the database, the data model is quite simple: (Page) -[:LINKS_TO]-> (Page). All nodes have a label (Page) and three properties (node, title, degrees). All relationships are unidirectional and hold no properties. 89 | 90 | Within the database I applied a constraint on all nodes asserting that their id ('node') is unique. This dramatically decreased query response time, as the database no longer had to do a full scan of the nodes for each lookup. 91 | ``` 92 | CREATE CONSTRAINT ON (p:Page) ASSERT p.node IS UNIQUE; 93 | ``` 94 | 95 | #### Queries 96 | I used Nigel Small's Python library [py2neo](http://nigelsmall.com/py2neo/1.6/) to interact with Neo4j's RESTful web service interface. query.py translates my shortest-path request into a CypherQuery object, queries the database, and returns the results as a Path object. 97 | ```python 98 | query = neo4j.CypherQuery( 99 |     graph_db, 100 |     """MATCH (m:Page {node:{n1}}), (n:Page {node:{n2}}), 101 |     p = shortestPath((m)-[*..20]->(n)) RETURN p""" 102 | ) 103 | query.execute(n1=node1, n2=node2) 104 | ``` 105 | The script then traverses this path object, adding a sample of incoming and outgoing links for each path node, as well as deduping nodes and relationships. For the [d3 library](http://d3js.org/) to graph this result, the ids need to be recoded to be sequential (starting from 0). Finally, the nodes and relationships are formatted and returned as JSON.
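The recoding step is essentially a dictionary lookup built in one pass (a minimal sketch of the approach in query.py's `create_lists()`; here `nodes_list` and `rels_list` stand for the deduped node and relationship dicts described above):
```python
codes = {}                     # maps each database id to a new sequential id
for node in nodes_list:
    codes.setdefault(node['code'], len(codes))

for rel in rels_list:          # rewrite each link to use the sequential ids
    rel['source'] = codes[rel['source']]
    rel['target'] = codes[rel['target']]
```
The assembled response looks like this: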
106 | ``` 107 | { 108 | "directed": true, 109 | "nodes": [ 110 | { 111 | "degrees": 22, 112 | "node": 0, 113 | "title": "William Persse", 114 | "group": "path" 115 | }, 116 | { 117 | "degrees": 102, 118 | "node": 1, 119 | "title": "George Washington", 120 | "group": "path" 121 | }, 122 | { 123 | "degrees": 35, 124 | "node": 2, 125 | "title": "American Presidents: Life Portraits", 126 | "group": "none" 127 | } 128 | ], 129 | "links": [ 130 | { 131 | "start": 0, 132 | "end": 1, 133 | "value": 1 134 | }, 135 | { 136 | "start": 1, 137 | "end": 2, 138 | "value": 0 139 | } 140 | ], 141 | "multigraph": false 142 | } 143 | ``` 144 | 145 | #### Data visualization 146 | When planning this project, I envisioned the result of a query as an interactive graph. I wanted not only to see the shortest path between two pages but also explore the pages' context and connections. 147 | 148 | wikigraph.py is a small [Flask](http://flask.pocoo.org/) app that handles AJAX requests to the databases. graph.js implements the graph visualization with the d3 library while index.js handles everything else. 149 | 150 | The returned path is displayed in two ways: as a force-directed graph layout and as a simple list of page titles. Both are rendered as SVG images with d3. Page images are displayed in the nodes via clipped-path, as patterning the image over a circle decreases performance during movement. 151 | 152 | Wikipedia page images and extracts are sourced from Wikipedia via the [Wikimedia API](http://www.mediawiki.org/wiki/API:Main_page). The first AJAX request occurs upon the query request (for the start and end nodes), then again once the result is received (for the inner path nodes). URLs and extracts are stored in a global variable (*queryImages*). There are further requests when the user mouses over non-path nodes. 153 | 154 | These AJAX requests are chained together in a set of deferred promises to ensure asynchonous calls return before the next request begins. 155 | 156 | ##### User input 157 | To help users input page names correctly (as well as to suggest possible queries) I implemented a predictive seach tool with [typeahead.js](https://twitter.github.io/typeahead.js/). It uses AJAX requests to query an indexed [SQLite](http://www.sqlite.org/) database that holds the page titles, codes, and their degrees. It also includes a titles_lower column in order to optimize title lookups with 'LIKE'. This database is generated with create_pagenames_db.py and has the following schema: 158 | ``` 159 | CREATE TABLE pagenames (code INTEGER PRIMARY KEY, title TEXT, title_lower TEXT, degrees INTEGER); 160 | CREATE UNIQUE INDEX codes ON pagenames(code); 161 | CREATE INDEX degrees ON pagenames(degrees); 162 | CREATE INDEX titles ON pagenames(title); 163 | CREATE INDEX titles_lower ON pagenames(title_lower); 164 | ``` 165 | #### Improving query response time 166 | As I played around with the database, I realized that a responsive shortest-path query of such a large database would take some refinement and I first wanted to figure out how to display my data, deploy the application, etc. I needed a smaller subgraph to play with until my query time improved. 167 | 168 | I wrote pres_clean.py to sample the pagelinks file for only those pages and links that include the names of U.S. Presidents. After cleaning, this graph had 77 thousand nodes and 140 thousand relationships. I built most of my application using this database, then I scaled everything to use the full database. 
169 | 170 | Complete graph | Subgraph 171 | -------------- | ----------- 172 | 4.5m nodes | 77k nodes 173 | 110m links | 140k links 174 | 175 | At the start of the project, I decided there were at least four possible approaches to improve response time. I've tackled three of them so far and I've seen improvements with each: 176 | - [x] Scale vertically (tweak memory allocation, use larger machine) 177 | - [x] More efficient query (change query parameters, possibly rewrite algorithm) 178 | - [x] Prune graph if possible (remove trailing linked tails?) 179 | - [ ] Scale horizontally (distributed processing, e.g. [Giraph](http://giraph.apache.org/)) 180 | 181 | #####Scale vertically 182 | My first approach to improve response time for the full database was to fiddle with Neo4j's memory settings. The settings in **neo4j.properties** (e.g. *neostore.nodestore.db.mapped_memory*) didn't have a large impact on query time. I just set them to be as large as their counterpart was on disk. I had more success with *java.initmemory* and *java.maxmemory* (in **neo4j-wrapper.conf**). 183 | 184 | Each time I increased both init and max memory, I ran the same query three times and recorded the response time. My MacBook Air has 4GB of RAM, which seems to coincide with the dramatic improvement in query time (1400 sec to 60 sec) after passing the 4GB mark. *(Sidenote: This is odd, considering all advice I've seen suggests to leave 1-2GB for the OS, etc.)* 185 | 186 | ![Memory Test Results](static/images/mem_test.png) 187 | 188 | Then, I deployed the database to a larger machine (see [Deployment](#deployment) below). I scaled the java memory settings to the new specs, but the query time only halved (60 sec to 30 sec) despite the four-fold increase in RAM. 189 | 190 | #####Query efficiency 191 | I chose to use the built-in shortest-path algorithm for Neo4j, even though I've been unable to find out exactly what the algorithm is. It is breadth-first, which seems like a good approach. [Here](https://groups.google.com/forum/#!topic/neo4j/GiQPwQC_rII) is the closest description I've found: 192 | 193 | >The shortest path algorithm (i.e. paths with as few relationships as possible) uses breadth first, alternating sides between each visited relationship, not between each fully expanded depth. So if one side expands into fewer relationships than the other, that side can continue to new depths and find candidates even if the other side doesn't advance. When candidates are found the current depths will be fully expanded and shortest candidates will be selected and returned. 194 | 195 | The good folks on the [Neo4j Google Group](https://groups.google.com/forum/#!forum/neo4j) then suggested that the initial lookup of the two nodes was likely the slowest factor (rather than the pathfinding algorithm). This was my original query: 196 | ```python 197 | query = neo4j.CypherQuery( 198 | graph_db, 199 | """MATCH (m {node:'%s'}), (n {node:'%s'}), 200 | p = shortestPath((m)-[*..20]->(n)) RETURN p""" % (node1, node2) 201 | ) 202 | query.execute_one() 203 | ``` 204 | I then added a [constraint](#database-and-model) in the database for the Page label (all nodes are Pages) to express that node id is unique. I modified my query to use the Page label in the node lookup, as well as pass the nodes as arguments (instead of via string substitution). These two changes had the largest impact on query response time--from 30 seconds to 0.3 seconds for some queries. 
205 | 206 | Here's the final query: 207 | ```python 208 | query = neo4j.CypherQuery( 209 |     graph_db, 210 |     """MATCH (m:Page {node:{n1}}), (n:Page {node:{n2}}), 211 |     p = shortestPath((m)-[*..20]->(n)) RETURN p""" 212 | ) 213 | query.execute_one(n1=node1, n2=node2) 214 | ``` 215 | 216 | According to the [Neo4j manual](http://neo4j.com/docs/stable/query-constraints.html), unique constraints ensure that property values are unique for all nodes with a specific label. Additionally, unique constraints add an index on the value--and this is the index used for lookups. *(Sidenote: It's interesting that auto-indexing (on the 'node' property) hadn't had a similar effect.)* 217 | 218 | ##### Pruning the graph 219 | I was very surprised to find that over half of the pages in the page links dataset had no outgoing links. After some poking around on Wikipedia, I discovered that most of these 'dead-ends' are [red links](http://en.wikipedia.org/wiki/Wikipedia:Red_link), links that point to a page that does not yet exist. For some dead-ends, when I visited the source page, I could not find the link pointing to them. The DBPedia 2014 dataset is based on dumps from April/May 2014, so perhaps some dead-ends are pages that have since been deleted. 220 | 221 | In any case, I decided that for the purposes of my project, I was not interested in keeping pages that did not exist. Finding a path from such a page would be futile, and why would you want to find a path to one? Additionally, since there were so many dead-end links, even a one-pass removal of dead-ends would essentially halve my database, improving performance. 222 | 223 | #### Deployment 224 | This code was tested on Amazon's [EC2](http://aws.amazon.com/ec2/) using [Apache](http://httpd.apache.org/) as a web server. The database is housed on a 30 GiB EBS volume. Currently it is on an r3.large server with 15GB of RAM, and a shortest-path query of the full database takes just 0.5 seconds. Since EC2 instances do not come with swap space configured, I set up the 32GB SSD ephemeral instance storage as a paging (swap) partition to give the database extra memory headroom if needed.
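A minimal sketch of that swap setup (assuming the ephemeral volume shows up as /dev/xvdb; the device name varies by instance type):
```
sudo mkswap /dev/xvdb    # format the ephemeral volume as swap space
sudo swapon /dev/xvdb    # enable it for the running instance
swapon -s                # confirm the swap partition is active
```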
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | CouchDB==1.0 2 | Flask==0.10.1 3 | Jinja2==2.7.3 4 | MarkupSafe==0.23 5 | Werkzeug==0.9.6 6 | itsdangerous==0.24 7 | py2neo==1.6.4 8 | wsgiref==0.1.2 9 | -------------------------------------------------------------------------------- /static/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/.DS_Store -------------------------------------------------------------------------------- /static/css/index.css: -------------------------------------------------------------------------------- 1 | /******HTML STUFF******/ 2 | :focus { 3 | outline:none; 4 | border-bottom: solid 2px #999; 5 | } /*changes chrome's focus action*/ 6 | 7 | body { 8 | background-color: #fff; 9 | color: #666; 10 | margin: 0; 11 | font-family: sans-serif; 12 | font-weight: 100; 13 | font-size: 18px; 14 | text-align: center; 15 | } 16 | 17 | .logo { 18 | width: 100%; 19 | } 20 | 21 | .border { 22 | background: url('../images/logo-background.png'); 23 | height: 20px; 24 | } 25 | 26 | .background { 27 | background-color: #333; 28 | padding: 10px 0 10px; 29 | text-align: left; 30 | } 31 | 32 | .title { 33 | font-family: 'Signika', sans-serif; 34 | color: #fff; 35 | font-size: 40px; 36 | display: inline-block; 37 | margin-left: 15%; 38 | } 39 | 40 | .title:hover { 41 | cursor: pointer; 42 | } 43 | 44 | .blurb { 45 | font-family: 'Nothing You Could Do', sans-serif; 46 | font-size: 20px; 47 | display: inline-block; 48 | color: #fff; 49 | margin-left: 10px; 50 | } 51 | 52 | .info { 53 | color: #999; 54 | text-align: right; 55 | position: absolute; 56 | right: 15%; 57 | top: 50px; 58 | } 59 | 60 | .info:hover { 61 | cursor: pointer; 62 | color: #fff; 63 | } 64 | 65 | .query-form { 66 | margin: 30px; 67 | text-align: center; 68 | } 69 | 70 | input { 71 | border: none; 72 | border-bottom: solid 1px #999; 73 | margin: 0 10px 0 10px; 74 | font-size: 16px; 75 | } 76 | 77 | .button { 78 | margin-left: 5px; 79 | color: #fff; 80 | background-color: #999; 81 | border: solid 1px #999; 82 | border-radius: 10px; 83 | position: relative; 84 | font-family: 'Signika', sans-serif; 85 | } 86 | 87 | .button:hover { 88 | cursor: pointer; 89 | background-color: #666; 90 | border: solid 1px #666; 91 | } 92 | 93 | .wtf { 94 | border: none; 95 | border-radius: 16px; 96 | background-color: #B85427; 97 | padding: 2px 8px 2px 8px; 98 | } 99 | 100 | .wtf:hover { 101 | background-color: #FF7436; 102 | border: none; 103 | cursor: auto; 104 | } 105 | 106 | .about { 107 | width: 60%; 108 | margin: auto; 109 | text-align: left; 110 | padding-top: 20px; 111 | line-height: 150%; 112 | } 113 | 114 | h3 { 115 | text-align: center; 116 | font-family: 'Nothing You Could Do'; 117 | } 118 | 119 | #sadpanda { 120 | border: solid 1px #666; 121 | border-radius: 10px; 122 | height: 150px; 123 | } 124 | 125 | /*TYPEAHEAD STUFF*/ 126 | .tt-query { 127 | box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); 128 | } 129 | 130 | .tt-hint { 131 | color: #999 132 | } 133 | 134 | .tt-dropdown-menu { 135 | margin-top: 12px; 136 | padding: 8px 0; 137 | background-color: #fff; 138 | border: 1px solid #ccc; 139 | border-radius: 8px; 140 | box-shadow: 0 5px 10px rgba(0,0,0,.2); 141 | max-height: 200px; 142 | width: 200px; 143 | overflow-y: auto; 144 | text-align: left; 145 | 
} 146 | 147 | .tt-suggestion { 148 | padding: 2px 20px; 149 | font-size: 14px; 150 | line-height: 18px; 151 | } 152 | 153 | .tt-cursor { 154 | color: #fff; 155 | background-color: #0097cf; 156 | cursor: pointer; 157 | } 158 | 159 | .tt-suggestion p { 160 | margin: 0; 161 | } 162 | 163 | 164 | /*RESULTS STUFF*/ 165 | .results { 166 | width: 100%; 167 | /*display: inline-block;*/ 168 | } 169 | 170 | .graph { 171 | display: inline-block; 172 | min-width: 350px; 173 | } 174 | 175 | .sidebar { 176 | display: inline-block; 177 | vertical-align: top; 178 | } 179 | 180 | .details { 181 | width: 200px; 182 | margin: auto; 183 | border-radius: 5px; 184 | position: absolute; 185 | top: 172px; 186 | min-height: 200px; 187 | border: solid 2px; 188 | padding: 5px; 189 | background-color: #E5E5E5; 190 | } 191 | 192 | .hidden { 193 | display: none; 194 | } 195 | 196 | .help { 197 | font-size: 14px; 198 | padding: 5px; 199 | } 200 | 201 | .page-image { 202 | margin-top: 5px; 203 | } 204 | 205 | .page-title { 206 | margin-top: 10px; 207 | padding: 5px; 208 | } 209 | 210 | .page-extract { 211 | font-size: 12px; 212 | padding: 5px; 213 | } 214 | 215 | .page { 216 | display: inline-block; 217 | vertical-align: top; 218 | } 219 | 220 | .squareimg { 221 | width: 90px; 222 | height: 90px; 223 | border-radius: 45px; 224 | overflow: hidden; 225 | border: solid 2px black; 226 | margin: 10px; 227 | background-color: #fff; 228 | } 229 | 230 | .loading { 231 | background: url("../images/arrow.gif") no-repeat center; 232 | position: relative; 233 | font-size: 30px; 234 | top: 35px; 235 | width: 50px; 236 | height: 30px; 237 | } 238 | -------------------------------------------------------------------------------- /static/images/arrow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/arrow.gif -------------------------------------------------------------------------------- /static/images/arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/arrow.png -------------------------------------------------------------------------------- /static/images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/cat.jpg -------------------------------------------------------------------------------- /static/images/logo-background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/logo-background.png -------------------------------------------------------------------------------- /static/images/mem_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/mem_test.png -------------------------------------------------------------------------------- /static/images/sadpanda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/sadpanda.jpg 
-------------------------------------------------------------------------------- /static/images/wikigraph_ss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/wikigraph_ss.png -------------------------------------------------------------------------------- /static/js/graph.js: -------------------------------------------------------------------------------- 1 | function drawGraph(json) { 2 | 3 | var pathLength = Object.keys(queryInfo).length; 4 | 5 | // establish width and height of the svg 6 | var width = 600; 7 | var height = pathLength * 110; 8 | 9 | // color established as a scale 10 | var color = d3.scale.category20(); 11 | 12 | // appends svg tag to graph div 13 | var svg = d3.select('.graph').append('svg') 14 | .attr('width', width) 15 | .attr('height', height); 16 | 17 | // this function handles the parameters of the force-directed layout 18 | var force = d3.layout.force() 19 | .gravity(0.05) 20 | .linkDistance(function(d) { 21 | if (d.value == 1) { 22 | return 115; 23 | } else { 24 | // if not in the path, distance is a random number from 70 to 100 25 | return Math.floor(Math.random() * (100 - 70)) + 70; 26 | } 27 | }) 28 | .charge(-100) 29 | .size([width, height]); 30 | 31 | var defs = svg.append('defs'); 32 | 33 | // this appends the marker tag to the svg tag, applies arrowhead attributes 34 | defs.selectAll('marker') 35 | .data(['arrow']) 36 | .enter().append('svg:marker') 37 | .attr('id', String) 38 | .attr('viewBox', '0 -5 10 10') 39 | .attr('refX', 9) 40 | .attr('markerWidth', 7) 41 | .attr('markerHeight', 7) 42 | .attr('orient', 'auto') 43 | .style('fill', '#666') 44 | .append('svg:path') 45 | .attr('d', 'M0,-4L10,0L0,4Z'); 46 | 47 | // this helps predict where the arrowhead should be on the link path 48 | var diagonal = d3.svg.diagonal() 49 | .projection(function(d) { 50 | return [d.y, d.x]; 51 | }); 52 | 53 | // establish links 54 | var link = svg.selectAll('.link') 55 | .data(json.links) 56 | .enter().append('path') 57 | .attr('class', 'link') 58 | .style('stroke', '#666') 59 | .style('opacity', 0.6) 60 | .attr('marker-end', 'url(#arrow)') 61 | .attr('d', diagonal); 62 | 63 | // establish nodes 64 | var node = svg.selectAll('g.node') 65 | .data(json.nodes) 66 | .enter().append('svg:g') 67 | .attr('class', 'node') 68 | .attr('id', function(d) { 69 | return d.title + '|' + d.code; 70 | }) 71 | .call(force.drag); 72 | 73 | // define path ndoes 74 | var pathNode = node.filter(function(d) { 75 | return d.group == 'path'; 76 | }); 77 | 78 | // define non-path nodes 79 | var nonPathNode = node.filter(function(d) { 80 | return d.group != 'path'; 81 | }); 82 | 83 | // define path links 84 | var pathLink = link.filter(function(d) { 85 | return d.value == 1; 86 | }); 87 | 88 | var start; 89 | Object.keys(queryInfo).forEach(function(key) { 90 | // identify the start node 91 | var img = queryInfo[key]; 92 | if (img.code === 0) { 93 | start = key; 94 | } 95 | // define clip paths for each path node 96 | defs.append('clipPath') 97 | .attr('id', 'img' + key.toString()) 98 | .append('circle') 99 | .attr('r', 45); 100 | }); 101 | 102 | // define the start node 103 | var startNode = pathNode.filter(function(d) { 104 | return d.code == start; 105 | }); 106 | 107 | // add styling for the path links 108 | pathLink 109 | .style('stroke-width', '3px') 110 | .style('opacity', 1) 111 | .attr('marker-end', 'url(#arrow)'); 112 | 113 | // append colored circles 
to non-path nodes 114 | nonPathNode.append('circle') 115 | .attr('r', function(d) { 116 | var size; 117 | // upper bound for scaling size on degrees 118 | if (d.degrees > 600) { 119 | // assign radius to node attribute for arrowhead placement 120 | d.radius = 18; 121 | } else { 122 | d.radius = d.degrees * 0.02 + 7; // scales linearly with degrees 123 | } 124 | return d.radius; 125 | }) 126 | .style('fill', function(d) { 127 | return color(d.degrees); 128 | }); 129 | 130 | // append white circles to path nodes (for the .gifs) 131 | pathNode.append('circle') 132 | .attr('r', function(d) { 133 | d.radius = 45; 134 | return d.radius; 135 | }) 136 | .style('fill', '#fff'); 137 | 138 | // append clip-path to each path node 139 | pathNode.append('image') 140 | .attr('xlink:href', function(d) { return queryInfo[d.code].url;}) 141 | .attr('x', function(d) { return -queryInfo[d.code].width / 2;}) 142 | // this seems to help for portraits, where height > width 143 | .attr('y', function(d) { 144 | var imgHeight = queryInfo[d.code].height; 145 | var offset; 146 | if (h > queryInfo[d.code].width) { 147 | offset = 12; 148 | } else { 149 | offset = 0; 150 | } 151 | return -(imgHeight / 2) + offset; 152 | }) 153 | .attr('height', function(d) { return queryInfo[d.code].height;}) 154 | .attr('width', function(d) { return queryInfo[d.code].width;}) 155 | .attr('clip-path', function(d) { 156 | var clipPathID = 'img' + d.code; 157 | return 'url(#' + clipPathID + ')'; // unique clip path for this node 158 | }); 159 | 160 | // append empty circle to each path node as an outline 161 | pathNode.append('circle') 162 | .attr('r', 45) 163 | .style('fill', 'none') 164 | .style('stroke', '#333') 165 | .style('stroke-width', '2px'); 166 | 167 | // fix the x and y coordinates for the start node 168 | startNode.each(function(d) { 169 | d.fixed = true; 170 | d.x = width/pathLength; 171 | d.y = height/pathLength; 172 | }); 173 | 174 | // this calls the function force on the nodes and links 175 | force 176 | .nodes(json.nodes) 177 | .links(json.links) 178 | .start(); 179 | 180 | // this block occurs each time 'tick' is called by d3 181 | force.on('tick', function() { 182 | node.attr('cx', function(d) { 183 | d.x = Math.max(15, Math.min(width - 15, d.x)); 184 | return d.x; 185 | }) 186 | .attr('cy', function(d) { 187 | d.y = Math.max(15, Math.min(height - 15, d.y)); 188 | return d.y; 189 | }) 190 | .attr('transform', function(d) { 191 | return 'translate(' + d.x + ',' + d.y + ')'; 192 | }); 193 | link.attr('x1', function(d) { return d.source.x; }) 194 | .attr('y1', function(d) { return d.source.y; }) 195 | .attr('x2', function(d) { return d.target.x; }) 196 | .attr('y2', function(d) { return d.target.y; }) 197 | // this places arrowheads based on radius 198 | .attr('d', function(d) { 199 | // Total difference in x and y from source to target 200 | diffX = d.target.x - d.source.x; 201 | diffY = d.target.y - d.source.y; 202 | // Length of path from center of source node to center of target node 203 | pathLength = Math.sqrt((diffX * diffX) + (diffY * diffY)); 204 | // x and y distances from center to outside edge of target node 205 | offsetX = (diffX * d.target.radius) / pathLength; 206 | offsetY = (diffY * d.target.radius) / pathLength; 207 | return 'M' + d.source.x + ',' + d.source.y + 'L' + 208 | (d.target.x - offsetX) + ',' + (d.target.y - offsetY); 209 | }); 210 | }); 211 | 212 | } -------------------------------------------------------------------------------- /static/js/index.js: 
-------------------------------------------------------------------------------- 1 | 2 | // Global variables 3 | var CODES; // this object will be populated once the user inputs two pages 4 | var response; // global variable for the graph db response 5 | var queryInfo; // an object to organize and pass information to the graph 6 | var imageURLs; // an array for the start and end node images (to retain order) 7 | 8 | /** 9 | * INPUT-RELATED 10 | */ 11 | 12 | var startField = $('#start-node'); 13 | var endField = $('#end-node'); 14 | var aboutDiv = $('.about'); 15 | 16 | // sets up the request parameters for Typeahead 17 | var pageNames = new Bloodhound({ 18 | datumTokenizer: function(d) { 19 | return Bloodhound.tokenizers.whitespace(d.value); 20 | }, 21 | queryTokenizer: Bloodhound.tokenizers.whitespace, 22 | limit: 50, 23 | remote: { 24 | url: '/page-names?query=%QUERY', 25 | filter: function(pageNames) { 26 | // Map the remote source JSON array to a JavaScript array 27 | return $.map(pageNames.results, function(page) { 28 | return { 29 | value: page.title, 30 | code: page.code 31 | }; 32 | }); 33 | } 34 | } 35 | }); 36 | 37 | // send input to the pagenames db, take the first result as the query input 38 | function feelingLucky(inputField, node) { 39 | if (!(CODES[node])) { 40 | return $.get( 41 | '/page-names', 42 | 'query=' + inputField.val(), 43 | function(data) { 44 | var result = data.results[0]; // uses the first result 45 | inputField.val(result.title); 46 | CODES[node] = {'title': result.title, 47 | 'code': result.code.toString()}; 48 | }); 49 | } 50 | } 51 | 52 | // send request to the pagenames db, write the results to the input fields 53 | function getRandomPages() { 54 | $.get('/random-query', 55 | function(data) { 56 | var node1 = data.results[0]; 57 | var node2 = data.results[1]; 58 | CODES.node1 = {'title': node1.title, 59 | 'code': node1.code.toString()}; 60 | CODES.node2 = {'title': node2.title, 61 | 'code': node2.code.toString()}; 62 | startField.val(node1.title); // fill in the search fields 63 | endField.val(node2.title); 64 | }); 65 | } 66 | 67 | // swap both the input field and the global CODES values 68 | function reverseQuery() { 69 | var x = startField.val(); 70 | startField.val(endField.val()); 71 | endField.val(x); 72 | var y = CODES.node1; 73 | CODES.node1 = CODES.node2; 74 | CODES.node2 = y; 75 | } 76 | 77 | // take the input field value and assign it to CODES 78 | function decodeInput(d, node) { 79 | CODES[node] = {'title': d.value, 80 | 'code': d.code.toString()}; 81 | } 82 | 83 | // when the 'Go' button is clicked, check for both values, then run query 84 | $('input#submit-query').click(function() { 85 | clearPartial(); 86 | aboutDiv.addClass('hidden'); 87 | var checkFirst = feelingLucky(startField, 'node1'); 88 | var checkLast = feelingLucky(endField, 'node2'); 89 | $.when( 90 | checkFirst, 91 | checkLast 92 | ).then(function(data) { 93 | query(); 94 | }); 95 | }); 96 | 97 | // get and display random pages when the 'Random' button is clicked 98 | $('input#random-query').click(function() { 99 | getRandomPages(); 100 | }); 101 | 102 | // reverse the pages when the 'Reverse' button is clicked 103 | $('input#reverse-query').click(function() { 104 | reverseQuery(); 105 | }); 106 | 107 | // sets up the typeahead on the two input fields 108 | $('.scrollable-dropdown-menu .typeahead').typeahead(null, { 109 | name: 'pageNames', 110 | displayKey: 'value', 111 | source: pageNames.ttAdapter() 112 | }); 113 | 114 | // delete the code value as soon as the user clicks 
into an input field 115 | startField.focus(function() { 116 | delete CODES['node1']; 117 | }); 118 | 119 | endField.focus(function() { 120 | delete CODES['node2']; 121 | }); 122 | 123 | // when a suggested title is selected, write that value to CODES 124 | startField.on('typeahead:selected typeahead:autocompleted', function (e, d) { 125 | decodeInput(d, 'node1'); 126 | }); 127 | 128 | endField.on('typeahead:selected typeahead:autocompleted', function (e, d) { 129 | decodeInput(d, 'node2'); 130 | }); 131 | 132 | // select all text when the user clicks into an input field 133 | $('input[type=text]').focus(function() { 134 | this.select(); 135 | }); 136 | 137 | // initialize the bloodhound 138 | pageNames.initialize(); 139 | 140 | /** 141 | * QUERY-RELATED 142 | */ 143 | 144 | var pathDiv = $('.loading-images'); 145 | 146 | // create and return a query URL for images, based on desired size, number of 147 | // pages, and the page titles 148 | function makeQueryURL(size, numPages, pagesParams) { 149 | var queryURL = 'http://en.wikipedia.org/w/api.php?action=query&' + 150 | 'format=json&redirects&prop=pageimages&pithumbsize='+ size + 151 | 'px&pilimit=' + numPages + '&titles=' + pagesParams + '&callback=?'; 152 | return queryURL; 153 | } 154 | 155 | // create and return an object with information about the thumbnail image, if 156 | // available, else use information about the default cat image 157 | function createThumbnailObject(page) { 158 | var thumbnail, width, height; 159 | if ('thumbnail' in page) { 160 | thumbnail = page.thumbnail.source; 161 | width = page.thumbnail.width; 162 | height = page.thumbnail.height; 163 | } else { 164 | thumbnail = '../static/images/cat.jpg'; 165 | width = 100; 166 | height = 100; 167 | } 168 | var item = {'title': page.title, 169 | 'thumbnail': thumbnail, 170 | 'width': width, 171 | 'height': height}; 172 | return item; 173 | } 174 | 175 | // create and return an HTML snippet using the page's code and image url 176 | function makeHTMLSnippet(code, thumbnail) { 177 | var html = '
<div class="page" id="page' + code + '">' + 178 |         '<img class="squareimg" src="' + thumbnail + '"></div>
'; 179 | return html; 180 | } 181 | 182 | // add information about a page to the global variable queryInfo 183 | function addImage(item, code) { 184 | queryInfo[code] = {'url': item.thumbnail, 185 | 'title': item.title, 186 | 'height': item.height, 187 | 'width': item.width}; 188 | } 189 | 190 | // create and return both HTML snippets for the two query pages, and update 191 | // the global variables queryInfo and imageURLs 192 | function addQueryInfo(data) { 193 | var pageObject = data.query.pages; 194 | var htmlSnippets = {}; 195 | Object.keys(pageObject).forEach(function(pageKey) { 196 | item = createThumbnailObject(pageObject[pageKey]); 197 | if (item.title == CODES.node1.title) { 198 | code = 0; 199 | } else { 200 | code = 1; 201 | } 202 | htmlSnippets[code] = makeHTMLSnippet(code, item.thumbnail); 203 | addImage(item, CODES['node' + (code + 1)].code); 204 | imageURLs[code] = {'title': item.title, 205 | 'thumbnail': item.thumbnail}; 206 | }); 207 | return htmlSnippets; 208 | } 209 | 210 | // compare a title to each page object in response.path and return the 211 | // matching page object's code number 212 | function getPathCode(title) { 213 | for (var i = 0; i < response.path.length; i++) { 214 | if (response.path[i].title == title) { 215 | return response.path[i].code; 216 | } 217 | } 218 | } 219 | 220 | // parse the results of an AJAX images request and add information about those 221 | // images to queryInfo 222 | function addPathImages(data) { 223 | var pageObject = data.query.pages; 224 | Object.keys(pageObject).forEach(function(pageKey) { 225 | var item = createThumbnailObject(pageObject[pageKey]); 226 | addImage(item, getPathCode(item.title)); 227 | }); 228 | } 229 | 230 | // updates queryInfo with index numbers for ordering purposes 231 | function updateIndexCodes() { 232 | response.path.forEach(function(node) { 233 | if (!(node.code in queryInfo)) { 234 | queryInfo[node.code] = queryInfo['undefined']; 235 | delete queryInfo['undefined']; 236 | var old_index = response.path.indexOf(node); 237 | response.path[old_index] = {'code': node.code, 238 | 'title': queryInfo[node.code].title}; 239 | } 240 | queryInfo[node.code].code = response.path.indexOf(node); 241 | }); 242 | } 243 | 244 | // given a query URL, request a number of page images from Wikipedia, then 245 | // update queryInfo and add index codes to each page to retain path order 246 | function getPathImages(queryURL) { 247 | return $.getJSON( 248 | queryURL, 249 | function(data) { 250 | addPathImages(data); 251 | updateIndexCodes(); 252 | }); 253 | } 254 | 255 | // parse the inner nodes of the path, if they exist then assemble a query URL 256 | // and request those images from Wikipedia 257 | function getInnerImages() { 258 | var inner = response.path.slice(1, -1); 259 | var numPages = inner.length; 260 | var innerNodes = []; 261 | inner.forEach(function(node) { 262 | innerNodes.push(node.title); 263 | }); 264 | var pagesParams; 265 | if (numPages > 1) { 266 | pagesParams = innerNodes.join('|'); 267 | } else { pagesParams = innerNodes; } 268 | 269 | if (inner.length === 0) { 270 | return false; 271 | } else { 272 | var queryURL = makeQueryURL(150, numPages, pagesParams); 273 | return getPathImages(queryURL); 274 | } 275 | } 276 | 277 | // assemble and request query for start/end images from Wikipedia and append 278 | // those to the path div, then request a shortest path from the graph database, 279 | // then get images for the resulting path, update the index codes, draw the 280 | // grah, and set up event handlers for 
the sidebar 281 | function query() { 282 | var pagesParams = CODES.node1.title + '|' + CODES.node2.title; 283 | var queryURL = makeQueryURL(150, 2, pagesParams); 284 | $.when( 285 | $.getJSON( 286 | queryURL, 287 | function(data) { 288 | var htmlSnippets = addQueryInfo(data); 289 | Object.keys(htmlSnippets).forEach(function(node) { 290 | pathDiv.append(htmlSnippets[node]); 291 | }); 292 | $('#page0').after('
'); 293 | }), 294 | $.get( 295 | '/query', 296 | CODES, 297 | function(data) { 298 | response = JSON.parse(data); 299 | }) 300 | ).then(function() { 301 | try { 302 | return getInnerImages(); 303 | } catch(err) {} 304 | }).done(function() { 305 | pathDiv.empty(); 306 | if (response.path != 'None') { 307 | updateIndexCodes(); 308 | pathDiv.empty(); 309 | drawGraph(response.results); 310 | buildSidebar(); 311 | } else { 312 | $('.path-not-found').removeClass('hidden'); 313 | } 314 | }); 315 | } 316 | 317 | /** 318 | * PATH-RELATED 319 | */ 320 | 321 | var detailsDiv = $('.details'); 322 | var pageImage = $('.page-image'); 323 | var pageTitle = $('.page-title'); 324 | var pageExtract = $('.page-extract'); 325 | 326 | // create and return the query URL for extracts from Wikipedia's API, based on 327 | // number of pages and their titles 328 | function makeExtractURL(numPages, pageParams) { 329 | var extractURL = 'http://en.wikipedia.org/w/api.php?action=query&' + 330 | 'prop=extracts&format=json&exsentences=3&explaintext=&exintro=&' + 331 | 'exlimit=' + numPages + '&titles=' + pageParams + '&callback=?'; 332 | return extractURL; 333 | } 334 | 335 | // create URLs for the page and extract queries for a title, then execute the 336 | // image request, update queryInfo, then execute the extract request, updata 337 | // queryInfo, then add both to their respective DOM elements 338 | function getImageAndExtract(title, code, that) { 339 | var queryURL = makeQueryURL(150, 1, title); 340 | var extractURL = makeExtractURL(1, title); 341 | $.when( 342 | $.getJSON( 343 | queryURL, 344 | function(data) { 345 | var pageObject = data.query.pages; 346 | Object.keys(pageObject).forEach(function(pageKey) { 347 | var item = createThumbnailObject(pageObject[pageKey]); 348 | addImage(item, code); 349 | }); 350 | }) 351 | ).then(function(data) { 352 | return $.getJSON( 353 | extractURL, 354 | function(data) { 355 | var thing = data.query.pages; 356 | var page = thing[Object.keys(thing)[0]]; 357 | var text = page.extract; 358 | queryInfo[code].extract = text; 359 | }); 360 | }).done(function(data) { 361 | // only write to div if user is still hovering 362 | if ($(that).is(':hover')) { 363 | pageImage.html(''); 365 | pageExtract.html(queryInfo[code].extract); 366 | } 367 | }); 368 | } 369 | 370 | // toggles whether the sidebar is displayed 371 | function toggleSidebar() { 372 | detailsDiv.toggleClass('hidden'); 373 | } 374 | 375 | // clears all divs in the sidebar 376 | function clearSidebar() { 377 | toggleSidebar(); 378 | pageImage.empty(); 379 | pageTitle.empty(); 380 | pageExtract.empty(); 381 | } 382 | 383 | // opens an external window for the wikipedia page for a given title 384 | function externalLink() { 385 | $('.node').dblclick(function() { 386 | var title = this.id.split('|')[0]; 387 | window.open('http://en.wikipedia.org/wiki/' + title); 388 | }); 389 | } 390 | 391 | // request extracts for pages in the returned path, update queryInfo with those 392 | function getPathExtracts(numPages, pageParams) { 393 | var extractURL = makeExtractURL(numPages, pageParams); 394 | $.getJSON( 395 | extractURL, 396 | function(data) { 397 | var extracts = data.query.pages; 398 | Object.keys(extracts).forEach(function(key) { 399 | var text = extracts[key].extract; 400 | var code = getPathCode(extracts[key].title); 401 | queryInfo[code].extract = text; 402 | }); 403 | }); 404 | } 405 | 406 | // handles all mouseover and mouseout events for nodes, requesting information 407 | // from Wikipedia if the information is not in 
queryInfo already 408 | function mouseoverHandler() { 409 | $('.node').mouseover(function(e) { // mouseover event handler 410 | toggleSidebar(); 411 | var info = this.id.split('|'); 412 | var title = info[0]; 413 | var code = info[1]; 414 | pageTitle.html(title); 415 | if (code in queryInfo) { 416 | pageImage.html(''); 418 | pageExtract.html(queryInfo[code].extract); 419 | } else { 420 | getImageAndExtract(title, code, this); 421 | } 422 | }); 423 | $('.node').mouseout(function(e) { 424 | clearSidebar(); 425 | }); 426 | externalLink(); 427 | } 428 | 429 | // takes the result of a query and requests images and extracts from Wikipedia, 430 | // then sets up the node mouseover handler 431 | function buildSidebar() { 432 | var pathNodes = []; 433 | response.path.forEach(function(node) { 434 | pathNodes.push(node.title); 435 | }); 436 | var pageParams = pathNodes.join('|'); 437 | var numPages = pathNodes.length; 438 | getSummaryImages(numPages, pageParams); // get thumbnails for summary 439 | getPathExtracts(numPages, pageParams); // get extracts for path nodes 440 | mouseoverHandler(); 441 | } 442 | 443 | /** 444 | * PAGE-RELATED 445 | */ 446 | 447 | var wtfDiv = $('.wtf'); 448 | var helpDiv = $('.help'); 449 | var queryForm = $('.query-form'); 450 | 451 | // clears the information for a new query, retains information about previous 452 | // searches 453 | function clearPartial() { 454 | $('svg').remove(); 455 | pathDiv.empty(); 456 | queryInfo = {}; 457 | imageURLs = []; 458 | $('.path-not-found').addClass('hidden'); 459 | } 460 | 461 | // full clear of all global variables and input fields 462 | function clearAll() { 463 | CODES = {}; 464 | response = ''; 465 | startField.val(''); 466 | endField.val(''); 467 | clearPartial(); 468 | } 469 | 470 | // toggles display for the help button upon mouseover and mouseout 471 | wtfDiv.mouseover(function() { 472 | clearSidebar(); 473 | helpDiv.toggleClass('hidden'); 474 | }); 475 | 476 | wtfDiv.mouseout(function() { 477 | clearSidebar(); 478 | helpDiv.toggleClass('hidden'); 479 | }); 480 | 481 | // toggles display for the query-form when the title is clicked 482 | $('.title').click(function() { 483 | clearAll(); 484 | queryForm.removeClass('hidden'); 485 | aboutDiv.addClass('hidden'); 486 | }); 487 | 488 | // toggles display for the information page when the 'About' button is clicked 489 | $('.info').click(function() { 490 | clearPartial(); 491 | queryForm.addClass('hidden'); 492 | aboutDiv.removeClass('hidden'); 493 | }); 494 | 495 | // clears everything upon page load 496 | clearAll(); 497 | -------------------------------------------------------------------------------- /static/js/summary.js: -------------------------------------------------------------------------------- 1 | function getSummaryImages(numPages, pageParams) { 2 | var queryURL = makeQueryURL(60, numPages, pageParams); 3 | $.getJSON( 4 | queryURL, 5 | function(data) { 6 | var pageObject = data.query.pages; 7 | Object.keys(pageObject).forEach(function(pageKey) { 8 | var item = createThumbnailObject(pageObject[pageKey]); 9 | var code = getPathCode(item.title); 10 | queryInfo[code].tinyurl = item.thumbnail; 11 | queryInfo[code].tinyHeight = item.height; 12 | queryInfo[code].tinyWidth = item.width; 13 | }); 14 | displaySummary(response.path); 15 | }); 16 | } 17 | 18 | function displaySummary(path) { 19 | // draw SVG based on length of path 20 | var svg = d3.select(".summary").append("svg") 21 | .attr("width", 200) 22 | .attr("height", function(d) { 23 | return 50 * path.length; 24 | }); 25 | 26 | var
defs = svg.append("defs"); 27 | 28 | // define clip paths for each path node 29 | Object.keys(queryInfo).forEach(function(key) { 30 | defs.append("clipPath") 31 | .attr("id", 'timg' + key.toString()) 32 | .append("circle") 33 | .attr("r", 20); 34 | }); 35 | 36 | // establish nodes 37 | var tinyNode = svg.selectAll("g.tinyNode") 38 | .data(path) 39 | .enter().append("svg:g") 40 | .attr("class", "tinyNode") 41 | .attr("transform", function(d) { // nodes are placed based on order 42 | var index = queryInfo[d.code].code; 43 | var yValue = 25 + (index * 50); 44 | return "translate(23, " + yValue + ")"; 45 | }) 46 | .attr("id", function(d) {return d.title + '|' + d.code;}); 47 | 48 | // append clip-path to each path node 49 | tinyNode.append("image") 50 | .attr("xlink:href", function(d) { 51 | return queryInfo[d.code].tinyurl; 52 | }) 53 | .attr("x", function(d) { return -queryInfo[d.code].tinyWidth / 2;}) 54 | .attr("y", function(d) { 55 | var imgHeight = queryInfo[d.code].tinyHeight; 56 | var offset; 57 | if (imgHeight > queryInfo[d.code].tinyWidth) { // portrait thumbnails are nudged down slightly 58 | offset = 6; 59 | } else { 60 | offset = 0; 61 | } 62 | return -(imgHeight / 2) + offset; 63 | }) 64 | .attr("height", function(d) { return queryInfo[d.code].tinyHeight;}) 65 | .attr("width", function(d) { return queryInfo[d.code].tinyWidth;}) 66 | .attr("clip-path", function(d) { 67 | var clipPathID = 'timg' + d.code; 68 | return "url(#" + clipPathID + ")"; // unique clip path for this node 69 | }); 70 | 71 | // append empty circle to nodes as an outline 72 | tinyNode.append("circle") 73 | .attr("r", 20) 74 | .style("stroke", "#333") 75 | .style("stroke-width", "2px") 76 | .style("fill", "none"); 77 | 78 | tinyNode.append("foreignObject") // necessary for wrapped title strings 79 | .attr({width: 145, height: 45}) 80 | .attr({x: 30, y: function(d) { 81 | var len = d.title.length; 82 | if (len > 42) { 83 | return -22; 84 | } else if (len > 20) { 85 | return -12; 86 | } else { 87 | return -6; 88 | } 89 | }}) 90 | .append("xhtml:body") 91 | .append("xhtml:div") 92 | .style({ 93 | "font-size": "14px", 94 | "text-align": "left", 95 | "padding-left": "1px" 96 | }) 97 | .html(function(d) {return d.title;}); 98 | 99 | } 100 | 101 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | <!-- head markup stripped; page title: "wikiGraph" -->
What connects two topics on Wikipedia?

For example, how many links do you have to click to get from Harry Potter to the Spanish Inquisition?* Combining trivia nerdery with graph theory, wikiGraph allows users to find and explore the paths within Wikipedia.

The data are sourced from Wikipedia and DBpedia, without which this project would not be possible.

This app was created by Erika Arnold as a final project for Hackbright Academy in Fall 2014. You can read more about the implementation and details on the GitHub page.

*It takes a minimum of 3 clicks. Here's one path: Harry Potter → United Kingdom → Basques → Spanish Inquisition
<!-- remaining page markup stripped -->
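The page copy above frames wikiGraph's core question as a shortest-path problem over the link graph. Below is a minimal sketch of that idea, assuming an in-memory adjacency dict and a breadth-first search; it is not the app's actual search (wikigraph.py hands the real query to query.create_lists), and the toy graph and shortest_path helper here are invented for illustration.

from collections import deque

def shortest_path(links, start, end):
    """Breadth-first search over an adjacency dict {page: set(linked pages)}.
    Returns the list of page titles from start to end, or None if no path exists."""
    came_from = {start: None}            # page -> page we arrived from
    queue = deque([start])
    while queue:
        page = queue.popleft()
        if page == end:
            path = []
            while page is not None:      # walk the parent pointers back to the start
                path.append(page)
                page = came_from[page]
            return path[::-1]
        for neighbor in links.get(page, ()):
            if neighbor not in came_from:
                came_from[neighbor] = page
                queue.append(neighbor)
    return None

# Toy graph, invented for illustration (not real link data):
toy_links = {
    'Harry Potter': {'United Kingdom'},
    'United Kingdom': {'Basques', 'London'},
    'Basques': {'Spanish Inquisition'},
}
# shortest_path(toy_links, 'Harry Potter', 'Spanish Inquisition')
# -> ['Harry Potter', 'United Kingdom', 'Basques', 'Spanish Inquisition'], i.e. 3 clicks

Because breadth-first search visits pages in order of click distance, the first time it reaches the target it has found a minimum-hop path, which is what the "minimum of 3 clicks" footnote refers to.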
-------------------------------------------------------------------------------- /test_continuity.py: -------------------------------------------------------------------------------- 1 | print "Testing for continuity..." 2 | with open('data/nodes.tsv', 'rb') as f: 3 | f.next() 4 | f.next() 5 | last_start = 0 6 | for line in f: 7 | l = line.split('\t') 8 | start = int(l[0]) 9 | if (last_start + 1) != start: 10 | print "ALERT! %d did not match last_start (%d) + 1" % (start, last_start) 11 | 12 | last_start = start 13 | print "Done." -------------------------------------------------------------------------------- /wikigraph.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, jsonify 2 | import query, sqlite3, random, time, os 3 | 4 | app = Flask(__name__) 5 | app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'lisaneedsbraces') 6 | 7 | def connect(): 8 | 9 | cursor = sqlite3.connect('data/pagenames.db').cursor() 10 | return cursor 11 | 12 | @app.route('/') 13 | def index(): 14 | 15 | return render_template('index.html') 16 | 17 | @app.route('/query') 18 | def get_path(): 19 | 20 | node2, node1, code2, code1 = request.args.values() 21 | path_query = node1.replace(' ', '_') + '|' + node2.replace(' ', '_') 22 | print "%s (%s) -> %s (%s)?" % (node1, code1, node2, code2) 23 | response = query.create_lists(str(code1), str(code2)) 24 | 25 | return response 26 | 27 | @app.route('/page-names') 28 | def get_page_names(): 29 | 30 | entry = request.args.get("query").lower() 31 | print "Requesting page names for '%s'..." % entry 32 | t0 = time.time() 33 | 34 | cursor = connect() 35 | query1 = 'SELECT code, title FROM pagenames WHERE title_lower = ?' 36 | row = cursor.execute(query1, (entry,)).fetchone() 37 | 38 | results = [{ 'title': row[1], 'code': row[0] }] if row is not None else [] 39 | 40 | query2 = '''SELECT code, title 41 | FROM pagenames 42 | WHERE title LIKE ? 43 | OR title LIKE ? 44 | LIMIT 50;''' 45 | 46 | rows = cursor.execute(query2, (entry + '%', '% ' + entry, )) 47 | results.extend([{ 'title': row[1], 'code': row[0] } for row in rows]) 48 | response = jsonify(**{ 'results': results }) 49 | 50 | t1 = time.time() 51 | print "DB responded with %d results in %0.2f seconds" % (len(results), t1 - t0) 52 | 53 | return response 54 | 55 | @app.route('/random-query') 56 | def get_random_names(): 57 | 58 | print "Requesting two random pages..." 59 | t0 = time.time() 60 | 61 | cursor = connect() 62 | query = '''SELECT code, title 63 | FROM pagenames 64 | WHERE degrees > 150 65 | AND title NOT BETWEEN 'List' and 'Lisu' 66 | AND NOT title BETWEEN '0' and '9}' 67 | ORDER BY RANDOM() 68 | LIMIT 2''' 69 | 70 | rows = cursor.execute(query) 71 | results = [{ 'title': row[1].replace('_', ' '), 'code': row[0] } for row in rows] 72 | response = jsonify(**{ 'results': results }) 73 | 74 | t1 = time.time() 75 | print "DB responded in %0.2f seconds" % (t1 - t0) 76 | 77 | return response 78 | 79 | if __name__ == '__main__': 80 | app.run(debug=True) 81 | # app.run(host="54") --------------------------------------------------------------------------------
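As a usage note, the two lookup routes in wikigraph.py can be exercised without a browser through Flask's built-in test client. The snippet below is illustrative only and not part of the repo; it assumes the pagenames database has already been built and sits where wikigraph.py expects it.

# Illustrative only -- not part of the repo.
from wikigraph import app

client = app.test_client()

# Autocomplete lookup: exact lower-cased title match first, then LIKE matches.
resp = client.get('/page-names', query_string={'query': 'harry potter'})
print resp.data    # JSON like {"results": [{"title": ..., "code": ...}, ...]}

# Two random, well-connected pages (degrees > 150).
resp = client.get('/random-query')
print resp.data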