├── clean_ttl.py ├── create_pagenames_db.py ├── master_clean.py ├── pres_clean.py ├── query.py ├── readme.md ├── requirements.txt ├── static ├── .DS_Store ├── css │ └── index.css ├── images │ ├── arrow.gif │ ├── arrow.png │ ├── cat.jpg │ ├── logo-background.png │ ├── mem_test.png │ ├── sadpanda.jpg │ └── wikigraph_ss.png └── js │ ├── graph.js │ ├── index.js │ └── summary.js ├── templates └── index.html ├── test_continuity.py └── wikigraph.py /clean_ttl.py: -------------------------------------------------------------------------------- 1 | def clean_ttl(read_path, write_path): 2 | with open(read_path, 'rb') as p, open(write_path, 'wb') as c: 3 | p.next() 4 | for line in p: 5 | l = line.split() 6 | source = l[0][29:-1] 7 | target = l[2][29:-1] 8 | c.write(source + '\t' + target + '\n') 9 | 10 | if __name__ == '__main__': 11 | print "Cleaning page links..." 12 | clean_ttl('data/page_links_en.ttl', 'data/cleaned_links.tsv') 13 | print "Cleaning redirects..." 14 | clean_ttl('data/redirects_en.ttl', 'data/cleaned_redirects.tsv') 15 | print "Done!" -------------------------------------------------------------------------------- /create_pagenames_db.py: -------------------------------------------------------------------------------- 1 | import csv, sqlite3 2 | 3 | conn = sqlite3.connect('pagenames.db') 4 | curs = conn.cursor() 5 | curs.execute('CREATE TABLE pagenames (code INTEGER PRIMARY KEY, title TEXT, title_lower TEXT, degrees INTEGER);') 6 | 7 | with open('data/nodes.tsv', 'rb') as csv_file: 8 | sql_insert = 'INSERT INTO pagenames VALUES(?, ?, ?, ?)' 9 | reader = csv.reader(csv_file, delimiter='\t') 10 | reader.next() 11 | for row in reader: 12 | title = unicode(row[1].replace('_', ' '), 'utf8') 13 | code = int(row[0]) 14 | degrees = int(row[3]) 15 | to_db = [code, title, title.lower(), degrees] 16 | curs.execute(sql_insert, to_db) 17 | 18 | conn.commit() 19 | 20 | curs.execute('CREATE UNIQUE INDEX codes ON pagenames(code);') 21 | curs.execute('CREATE INDEX titles ON pagenames(title);') 22 | curs.execute('CREATE INDEX titles_lower ON pagenames(title_lower);') 23 | curs.execute('CREATE INDEX degrees ON pagenames(degrees);') 24 | 25 | conn.close() -------------------------------------------------------------------------------- /master_clean.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urllib2 3 | 4 | def redirects_dict(redirects_path): 5 | """Iterates through the redirects file and creates a set of redirect page 6 | names.""" 7 | 8 | redirects = {} 9 | with open(redirects_path, 'rb') as reds: 10 | for line in reds: 11 | l = line.split('\t') 12 | source = l[0] 13 | target = l[1].rstrip() 14 | redirects.setdefault(source, target) 15 | 16 | return redirects 17 | 18 | def convert_to_unicode(title): 19 | title = urllib2.unquote(title) 20 | return title 21 | 22 | def assemble_dict(link_path, redirects): 23 | """Iterates through the pagelinks file and returns a dictionary containing 24 | information about the page, its unique code, and what it links to (if 25 | anything). 
26 | 27 | Example of returned dictionary: 28 | {'page1': {'code': 41, 29 | 'title': 'page1', 30 | 'links': set([42, 108])}}""" 31 | 32 | with open(link_path, 'rb') as paths: 33 | data = {} 34 | foo = 0 35 | code_counter = 0 36 | 37 | t0 = time.time() 38 | for line in paths: 39 | l = line.split('\t') 40 | start = l[0] 41 | end = l[1].rstrip() 42 | 43 | if end[:5] == "File:" or end[:9] == "Category:": 44 | continue 45 | 46 | if start in redirects or start == end: 47 | continue 48 | 49 | if end[:2] == "S:" or end[:4] == "Help:": 50 | continue 51 | 52 | if '%' in start: 53 | start = convert_to_unicode(start) 54 | 55 | if '%' in end: 56 | end = convert_to_unicode(end) 57 | 58 | if end in redirects: # if start points to a redirect page 59 | end = redirects[end] # replace it with the real page 60 | 61 | if data.get(start, 0) == 0: 62 | data[start] = {'title': start, 63 | 'links': {end}} 64 | else: 65 | data[start]['links'].add(end) 66 | 67 | if data.get(end, 0) == 0: 68 | data[end] = {'title': end, 69 | 'links': set()} 70 | 71 | foo += 1 72 | if foo % 10000000 == 0: 73 | x = foo/1000000 74 | y = (time.time() - t0)/60 75 | print "%d million lines read in %.2f minutes" % (x, y) 76 | 77 | return data 78 | 79 | def find_deadends(data): 80 | """Iterates through the page links dictionary, and for every page without 81 | outgoing links, adds the code number to the 'deadends' set and deletes the 82 | key from the dictionary.""" 83 | 84 | deadends = set() 85 | keys = data.keys() 86 | for key in keys: 87 | value = data[key] 88 | if not value['links']: 89 | deadends.add(value['title']) 90 | del data[key] # remove key from data 91 | 92 | return deadends, keys 93 | 94 | def prune_deadends(data, deadends, keys): 95 | """Iterates through the page links dictionary, and for every page, removes 96 | links that are in the 'deadends' set.""" 97 | 98 | for key in keys: 99 | value = data.get(key) 100 | if value is not None: 101 | links = value['links'].copy() 102 | for link in links: 103 | if link in deadends: 104 | value['links'].remove(link) 105 | 106 | def recode_data(data): 107 | """Iterates through the page links dictionary and assigns a code to 108 | every page. Returns a dictionary of title:code lookups.""" 109 | 110 | codes = {} 111 | code_counter = 0 112 | for key, value in data.iteritems(): 113 | data[key].update({'code': code_counter}) 114 | codes[value['title']] = code_counter 115 | code_counter += 1 116 | 117 | return codes 118 | 119 | def write_rels(data, rels_path, codes): 120 | """Iterates through the page links dictionary and writes the results to 121 | the rels.tsv file (start, end, link_type).""" 122 | 123 | with open(rels_path, 'wb+') as rels: 124 | rels.write('start\tend\ttype\n') 125 | for value in data.values(): 126 | code = str(value['code']) 127 | if value['links']: 128 | for link in value['links']: 129 | rels.write(code + '\t' + str(codes[link]) + '\tLINKS_TO\n') 130 | 131 | def write_nodes(data, nodes_path): 132 | """Iterates through the page links dictionary (sorted by code number) 133 | and writes the results to the nodes.tsv file (code, title, label, degrees). 
134 | """ 135 | 136 | with open(nodes_path, 'wb+') as nodes: 137 | nodes.write('node\tname\tl:label\tdegrees\n') 138 | for page in sorted(data.values(), key=lambda k: k['code']): 139 | code = str(page['code']) 140 | deg = str(len(page['links'])) 141 | nodes.write(code + '\t' + page['title'] + '\tPage\t'+ deg + '\n') 142 | 143 | def clean_data(): 144 | """Reads a tab-separated file of Wikipedia links and creates one tsv file for 145 | page links and one for pages. First it assembles a dictionary of redirect 146 | pages, then it creates a page links dictionary, filtering out redirects and 147 | specific page types. Next, pages with no outgoing links are removed and 148 | their title is added to a 'deadend' set. Then, pages in the dictionary 149 | remove links to pages in the deadend set. Finally, the dictionary is 150 | parsed and information is written to two .tsv files.""" 151 | 152 | print "Creating set of redirect pages..." 153 | redirects = redirects_dict('data/cleaned_redirects.tsv') 154 | print "Reading page links..." 155 | data = assemble_dict('data/cleaned_links.tsv', redirects) 156 | raw_length = len(data) 157 | print "Page links dictionary created with %d pages." % raw_length 158 | print "Finding deadends..." 159 | deadends, keys = find_deadends(data) 160 | print "Pruning %d deadends..." % len(deadends) 161 | prune_deadends(data, deadends, keys) 162 | print "Recoding data..." 163 | codes = recode_data(data) 164 | perc = (len(data)/float(raw_length))*100 165 | print "Pages pruned, now %d pages (%.2f%% of original)." % (len(data), perc) 166 | print "Writing 'rels.tsv'..." 167 | write_rels(data, 'data/rels.tsv', codes) 168 | print "Writing 'nodes.tsv'..." 169 | write_nodes(data, 'data/nodes.tsv') 170 | print "Done!" 171 | 172 | if __name__ == "__main__": 173 | clean_data() -------------------------------------------------------------------------------- /pres_clean.py: -------------------------------------------------------------------------------- 1 | PRESIDENTS = set([ 2 | 'George_Washington', 'John_Adams', 'Thomas_Jefferson', 3 | 'James_Madison', 'James_Monroe','John_Quincy_Adams', 4 | 'Andrew_Jackson','Martin_Van_Buren', 'William_Henry_Harrison', 5 | 'John_Tyler', 'James_K._Polk', 'Zachary_Taylor', 6 | 'Millard_Fillmore', 'Franklin_Pierce', 'James_Buchanan', 7 | 'Abraham_Lincoln', 'Andrew_Johnson', 'Ulysses_S._Grant', 8 | 'Rutherford_B._Hayes', 'James_A._Garfield', 'Chester_A._Arthur', 9 | 'Grover_Cleveland', 'Benjamin_Harrison', 'William_McKinley', 10 | 'Theodore_Roosevelt', 'William_Howard_Taft', 'Woodrow_Wilson', 11 | 'Warren_G._Harding', 'Calvin_Coolidge', 'Herbert_Hoover', 12 | 'Franklin_D._Roosevelt', 'Harry_S._Truman', 13 | 'Dwight_D._Eisenhower', 'John_F._Kennedy', 'Lyndon_B._Johnson', 14 | 'Richard_Nixon', 'Gerald_Ford', 'Jimmy_Carter', 'Ronald_Reagan', 15 | 'George_H._W._Bush', 'Bill_Clinton', 'George_W._Bush', 16 | 'Barack_Obama' 17 | ]) 18 | 19 | ### This creates the presidents subgraph 20 | # parse page_links, if source or target is a president, write it 21 | with open('data/cleaned_links.tsv', 'r') as f, open('data/pres_links.tsv', 'wb+') as p: 22 | for line in f: 23 | l = line.split('\t') 24 | start = l[0] 25 | end = l[1].rstrip() 26 | if start in PRESIDENTS or end in PRESIDENTS: 27 | p.write(line) -------------------------------------------------------------------------------- /query.py: -------------------------------------------------------------------------------- 1 | from py2neo import neo4j 2 | import json, time 3 | 4 | def find_shortest_path(node1, node2): 5 
| """Connects to graph database, then creates and sends query to graph 6 | database. Returns the shortest path between two nodes. 7 | Format: (67149)-[:'LINKS_TO']->(421)""" 8 | 9 | graph_db = neo4j.GraphDatabaseService() 10 | 11 | t0 = time.time() 12 | 13 | query = neo4j.CypherQuery( 14 | graph_db, 15 | """MATCH (m:Page {node:{n1}}), (n:Page {node:{n2}}), 16 | p = shortestPath((m)-[*..10]->(n)) RETURN p""" 17 | ) 18 | try: 19 | path = query.execute_one(n1=node1, n2=node2) 20 | except: 21 | path = None 22 | 23 | t1 = time.time() 24 | 25 | print "\nShortest Path:", path 26 | print "Time elapsed: %.2f seconds" % (t1 - t0) 27 | 28 | return path 29 | 30 | def parse_node(node, in_path): 31 | """Extract title and code from a node object. Returns a dict of information.""" 32 | 33 | code, deg, title = node.get_properties().values() 34 | title = title.replace('_', ' ') 35 | 36 | if title == "Basque people": # special exception for a changed redirect 37 | title = "Basques" 38 | 39 | node_dict = {'code': int(code), 40 | 'title': title, 41 | 'degrees': deg, 42 | 'group': 'none'} 43 | 44 | if in_path: 45 | node_dict['group'] = 'path' 46 | 47 | return node_dict 48 | 49 | def parse_rel(rel, in_path): 50 | """Extract node code from a relationship object. Returns a dict of 51 | information.""" 52 | 53 | start_id = rel.start_node.get_properties()['node'] 54 | end_id = rel.end_node.get_properties()['node'] 55 | 56 | rel_dict = {'source': int(start_id), 57 | 'target': int(end_id), 58 | 'value': 0} 59 | 60 | if in_path: 61 | rel_dict['value'] = 1 62 | 63 | return rel_dict 64 | 65 | def parse_node_objs(node_objs_list, in_path=False): 66 | """Takes a list of node objects. Returns dict of node dicts.""" 67 | 68 | nodes = {} 69 | 70 | for node in node_objs_list: 71 | node_dict = parse_node(node, in_path=in_path) 72 | if node_dict['code'] not in nodes: 73 | nodes[node_dict['code']] = node_dict 74 | 75 | return nodes 76 | 77 | def parse_rel_objs(rel_objs_list, in_path=False): 78 | """Takes a list of relationship objects. Returns list of rel dicts.""" 79 | 80 | rel_dict_list = [parse_rel(rel=rel, in_path=in_path) for rel in rel_objs_list] 81 | 82 | return rel_dict_list 83 | 84 | def find_other_nodes(node_objs_list): 85 | """Takes a list of node objects. Returns list of rel dicts and list of 86 | node dicts.""" 87 | 88 | rels = [] 89 | nodes = [] 90 | 91 | for node in node_objs_list: 92 | 93 | for rel in node.match_incoming(limit=8): 94 | rels.append(rel) 95 | nodes.append(rel.start_node) 96 | 97 | for rel in node.match_outgoing(limit=8): 98 | rels.append(rel) 99 | nodes.append(rel.end_node) 100 | 101 | rel_dict_list = parse_rel_objs(rels) 102 | node_dict_list = parse_node_objs(nodes) 103 | 104 | return rel_dict_list, node_dict_list 105 | 106 | def merge_node_dicts(path_nodes, npath_nodes): 107 | """Takes and merges the two dictionaries of node dicts. Returns list of 108 | node dicts.""" 109 | 110 | d = dict(npath_nodes.items() + path_nodes.items()) 111 | node_dict_list = [node_dict for node_dict in d.values()] 112 | 113 | return node_dict_list 114 | 115 | def parse_nodes_and_rels(path): 116 | """Takes a path object. 
Returns two lists, one for rel dicts and one for 117 | node dicts.""" 118 | 119 | # rel dict list for main path 120 | path_rels = parse_rel_objs(rel_objs_list=path.relationships, in_path=True) 121 | 122 | # parse nodes, create list of unique nodes 123 | path_nodes = parse_node_objs(node_objs_list=path.nodes, in_path=True) 124 | 125 | # this is a quick/dirty way to grab the names for each path node in order 126 | path_names = [] 127 | for node in path.nodes: 128 | path_dict = node.get_properties().values()[0] 129 | path_names.append({'title': path_nodes[int(path_dict)]['title'], 130 | 'code': path_nodes[int(path_dict)]['code']}) 131 | 132 | # rel dict list for secondary rels 133 | npath_rels, npath_nodes = find_other_nodes(node_objs_list=path.nodes) 134 | 135 | # filter out reversed or duplicate paths in the path rels 136 | for rel in npath_rels: 137 | for path in path_rels: 138 | if rel['source'] == path['target'] and rel['target'] == path['source']: 139 | rel['value'] = 1 # include it in the path 140 | if rel['source'] == path['source'] and rel['target'] == path['target']: 141 | npath_rels.remove(rel) # remove duplicates 142 | 143 | # combine the two lists for nodes and rels 144 | rels_list = path_rels + npath_rels 145 | nodes_list = merge_node_dicts(path_nodes, npath_nodes) 146 | 147 | return rels_list, nodes_list, path_names 148 | 149 | def create_lists(node1, node2): 150 | """Request the shortest path between two nodes from the database. Assemble 151 | list of nodes and relationships from the path, then process to recode their 152 | IDs. Write output to a JSON file.""" 153 | 154 | path = find_shortest_path(str(node1), str(node2)) 155 | 156 | if path: 157 | 158 | rels_list, nodes_list, path_names = parse_nodes_and_rels(path) 159 | 160 | codes = {} 161 | id_counter = 0 162 | 163 | for node in nodes_list: # create a dict to translate id codes 164 | node_id = node['code'] 165 | if node_id not in codes: 166 | codes[node_id] = id_counter 167 | id_counter += 1 168 | 169 | for rel in rels_list: # look up the source and target in codes 170 | rel['source'] = codes[rel['source']] 171 | rel['target'] = codes[rel['target']] 172 | 173 | response = """{ "path": %s, "results": { "directed": true, "nodes": %s, 174 | "links": %s, "multigraph": false }}""" % (json.dumps(path_names), json.dumps(nodes_list), json.dumps(rels_list)) 175 | 176 | else: 177 | response = '{ "path": "None", "results": "None" }' 178 | 179 | return response 180 | 181 | if __name__ == '__main__': 182 | print create_lists('335354', '3778612') # Abraham Lincoln to Astronomy 183 | 184 | 185 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | wikiGraph 2 | =========== 3 | What connects two topics on Wikipedia? For example, how many links do you have to click to get from Harry Potter to the Spanish Inquisition?* Combining trivia nerdery with graph theory, wikiGraph allows users to find and explore the paths within Wikipedia. 4 | 5 | You can check out the project [here](http://wikigraph.erikaarnold.com). 6 | 7 | *It takes a minimum of 3 clicks. 
Here's one path: Harry Potter -> British literature -> the spread of the printing press -> the Spanish Inquisition 8 | 9 | ###Contents 10 | - [Features](#features) 11 | - [Data cleaning](#data-cleaning) 12 | - [Queries](#queries) 13 | - [Data visualization](#data-visualization) 14 | - [Improving response time](#improving-response-time) 15 | - [Deployment](#deployment) 16 | 17 | #### Features 18 | *Current* 19 | - [x] Wikipedia page links imported into a graph database (Neo4j) 20 | - [x] Python wrapper queries database for shortest path between two nodes, outputs path and secondary relationships as JSON (py2neo) 21 | - [x] Result rendered as a force-directed graph (d3.js) 22 | - [x] Flask app renders HTML and handles AJAX requests to the database 23 | - [x] Flask app and database deployed (EC2, Apache) 24 | - [x] Search suggest for page titles (typeahead.js, SQLite) 25 | - [x] Embed page images on nodes within the rendered graph (Wikipedia API) 26 | - [x] Option to generate a pseudo-random search query 27 | - [x] Nodes are sized/colored based on the number of links to other nodes 28 | - [x] Incorporate summary of path pages as mouseover tooltips (Wikipedia API) 29 | 30 | *Future* 31 | 32 | - [ ] Path responses cached (CouchDB) 33 | - [ ] Snippets of each path node displayed in the mouseover tooltip (Beautiful Soup) 34 | 35 | ![wikigraph_screenshot](static/images/wikigraph_ss.png) 36 | 37 | #### Data cleaning 38 | I downloaded RDF files (.ttl) for page links and redirects from [DBPedia](http://wiki.dbpedia.org/Downloads2014). Here's what the raw page links file looks like: 39 | ``` 40 | . 41 | . 42 | ``` 43 | Wikipedia is big! This file includes over 150 million relationships. To reduce the file size before filtering data, I ran clean_ttl.py to pull out the page names from the source and target and write them to a tab-separated file. This significantly reduced the file sizes for both links and redirects (23GB -> 6.2GB, 980MB -> 275MB). 44 | 45 | I then used master_clean.py to parse and clean the page links. As a first pass, this meant removing redirect pages and duplicates. After looking at the output, I realized that almost half of the page links were to pages that had no outgoing links--they were dead-ends. Some were specific types of links (e.g. File, Category, Help) so I could filter those. *(Side note: Why don't they keep track of inter-Wikipedia links on Category pages? Those could be useful.)* 46 | 47 | However, even outside those categories, almost half of the pages in the file never linked to anything else. I decided to modify my cleaning script to remove the dead-ends--and any links pointing to them--from my data (see [Pruning the Graph](#pruning-the-graph) for my rationale). 48 | 49 | Here is the main function in master_clean.py: 50 | 51 | ```python 52 | def clean_data(): 53 | """Reads a tab-separated file of page links, creates one tab-separated 54 | file for page links (rels.tsv) and one for pages (nodes.tsv). First it 55 | assembles a dictionary of redirect pages, then it creates a page link 56 | dictionary, filtering out redirects and specific page types. Then, pages 57 | with no outgoing links are removed and their code is added to a 'dead-end' 58 | set. Pages with links to pages in the dead-end set remove those links. 
Finally, the dictionary is parsed and information is 59 |     written to two .tsv files.""" 60 | 61 |     redirects = redirects_dict('data/cleaned_redirects.tsv') 62 |     data = assemble_dict('data/cleaned_links.tsv', redirects) 63 |     deadends, keys = find_deadends(data) 64 |     prune_deadends(data, deadends, keys) 65 |     codes = recode_data(data) 66 |     write_rels(data, 'data/rels.tsv', codes) 67 |     write_nodes(data, 'data/nodes.tsv') 68 | ``` 69 | I needed page codes to be continuous, so they are assigned only after the dictionary is pruned. I also wrote test_continuity.py to verify that the resulting nodes.tsv file has continuous codes. 70 | 71 | The script is quite memory-intensive: even on a server with 15GB of RAM, it took about 30 minutes to execute--but it worked! After cleaning, the complete graph has about 4.5 million nodes and 110 million edges. The data are stored in two tsv files: a list of all relationships and a list of all nodes. 72 | 73 | __nodes.tsv__ (160MB) 74 | ``` 75 | node title l:label degrees 76 | 0 Alabama Pages 83 77 | 1 Andrew Jackson Pages 51 78 | ``` 79 | __rels.tsv__ (2.5GB) 80 | ``` 81 | start end type 82 | 0 1 LINKS_TO 83 | 2 3 LINKS_TO 84 | ``` 85 | I used Michael Hunger's [batch import tool](https://github.com/jexp/batch-import/tree/20) to insert the data into a [Neo4j](http://neo4j.com/) graph database. Also, after much research and many failed batch imports, I appended ```batch_import.csv.quotes=false``` to **batch.properties** because stray double quotes in the page titles cause a lookup failure when importing relationships (a problem that had not surfaced in the presidents subgraph). 86 | 87 | #####Database and model 88 | Within the database, the data model is quite simple: (Page) -[:LINKS_TO]-> (Page). All nodes have a label (Page) and three properties (node, title, degrees). All relationships are unidirectional and hold no properties. 89 | 90 | Within the database I applied a constraint on all nodes asserting that their id ('node') is unique. This dramatically decreased query response time, as the database no longer had to do a full scan of the nodes for each lookup. 91 | ``` 92 | CREATE CONSTRAINT ON (p:Page) ASSERT p.node IS UNIQUE; 93 | ``` 94 | 95 | #### Queries 96 | I used Nigel Small's Python library [py2neo](http://nigelsmall.com/py2neo/1.6/) to interact with Neo4j's RESTful web service interface. query.py translates my shortest-path request into a CypherQuery object, queries the database, and returns the results as a Path object. 97 | ```python 98 | query = neo4j.CypherQuery( 99 |     graph_db, 100 |     """MATCH (m:Page {node:{n1}}), (n:Page {node:{n2}}), 101 |     p = shortestPath((m)-[*..20]->(n)) RETURN p""" 102 | ) 103 | query.execute(n1=node1, n2=node2) 104 | ``` 105 | The script then traverses this path object, adding a sample of incoming and outgoing links for each path node, as well as deduping nodes and relationships. For the [d3 library](http://d3js.org/) to graph this result, the ids need to be recoded to be sequential (starting from 0). Finally, the nodes and relationships are formatted and returned as JSON.
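The recoding step is essentially a dictionary lookup built in one pass (a minimal sketch of the approach in query.py's `create_lists()`; here `nodes_list` and `rels_list` stand for the deduped node and relationship dicts described above):
```python
codes = {}                     # maps each database id to a new sequential id
for node in nodes_list:
    codes.setdefault(node['code'], len(codes))

for rel in rels_list:          # rewrite each link to use the sequential ids
    rel['source'] = codes[rel['source']]
    rel['target'] = codes[rel['target']]
```
The assembled response looks like this: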
106 | ``` 107 | { 108 | "directed": true, 109 | "nodes": [ 110 | { 111 | "degrees": 22, 112 | "node": 0, 113 | "title": "William Persse", 114 | "group": "path" 115 | }, 116 | { 117 | "degrees": 102, 118 | "node": 1, 119 | "title": "George Washington", 120 | "group": "path" 121 | }, 122 | { 123 | "degrees": 35, 124 | "node": 2, 125 | "title": "American Presidents: Life Portraits", 126 | "group": "none" 127 | } 128 | ], 129 | "links": [ 130 | { 131 | "start": 0, 132 | "end": 1, 133 | "value": 1 134 | }, 135 | { 136 | "start": 1, 137 | "end": 2, 138 | "value": 0 139 | } 140 | ], 141 | "multigraph": false 142 | } 143 | ``` 144 | 145 | #### Data visualization 146 | When planning this project, I envisioned the result of a query as an interactive graph. I wanted not only to see the shortest path between two pages but also explore the pages' context and connections. 147 | 148 | wikigraph.py is a small [Flask](http://flask.pocoo.org/) app that handles AJAX requests to the databases. graph.js implements the graph visualization with the d3 library while index.js handles everything else. 149 | 150 | The returned path is displayed in two ways: as a force-directed graph layout and as a simple list of page titles. Both are rendered as SVG images with d3. Page images are displayed in the nodes via clipped-path, as patterning the image over a circle decreases performance during movement. 151 | 152 | Wikipedia page images and extracts are sourced from Wikipedia via the [Wikimedia API](http://www.mediawiki.org/wiki/API:Main_page). The first AJAX request occurs upon the query request (for the start and end nodes), then again once the result is received (for the inner path nodes). URLs and extracts are stored in a global variable (*queryImages*). There are further requests when the user mouses over non-path nodes. 153 | 154 | These AJAX requests are chained together in a set of deferred promises to ensure asynchonous calls return before the next request begins. 155 | 156 | ##### User input 157 | To help users input page names correctly (as well as to suggest possible queries) I implemented a predictive seach tool with [typeahead.js](https://twitter.github.io/typeahead.js/). It uses AJAX requests to query an indexed [SQLite](http://www.sqlite.org/) database that holds the page titles, codes, and their degrees. It also includes a titles_lower column in order to optimize title lookups with 'LIKE'. This database is generated with create_pagenames_db.py and has the following schema: 158 | ``` 159 | CREATE TABLE pagenames (code INTEGER PRIMARY KEY, title TEXT, title_lower TEXT, degrees INTEGER); 160 | CREATE UNIQUE INDEX codes ON pagenames(code); 161 | CREATE INDEX degrees ON pagenames(degrees); 162 | CREATE INDEX titles ON pagenames(title); 163 | CREATE INDEX titles_lower ON pagenames(title_lower); 164 | ``` 165 | #### Improving query response time 166 | As I played around with the database, I realized that a responsive shortest-path query of such a large database would take some refinement and I first wanted to figure out how to display my data, deploy the application, etc. I needed a smaller subgraph to play with until my query time improved. 167 | 168 | I wrote pres_clean.py to sample the pagelinks file for only those pages and links that include the names of U.S. Presidents. After cleaning, this graph had 77 thousand nodes and 140 thousand relationships. I built most of my application using this database, then I scaled everything to use the full database. 
169 | 170 | Complete graph | Subgraph 171 | -------------- | ----------- 172 | 4.5m nodes | 77k nodes 173 | 110m links | 140k links 174 | 175 | At the start of the project, I decided there were at least four possible approaches to improve response time. I've tackled three of them so far and I've seen improvements with each: 176 | - [x] Scale vertically (tweak memory allocation, use larger machine) 177 | - [x] More efficient query (change query parameters, possibly rewrite algorithm) 178 | - [x] Prune graph if possible (remove trailing linked tails?) 179 | - [ ] Scale horizontally (distributed processing, e.g. [Giraph](http://giraph.apache.org/)) 180 | 181 | #####Scale vertically 182 | My first approach to improve response time for the full database was to fiddle with Neo4j's memory settings. The settings in **neo4j.properties** (e.g. *neostore.nodestore.db.mapped_memory*) didn't have a large impact on query time. I just set them to be as large as their counterpart was on disk. I had more success with *java.initmemory* and *java.maxmemory* (in **neo4j-wrapper.conf**). 183 | 184 | Each time I increased both init and max memory, I ran the same query three times and recorded the response time. My MacBook Air has 4GB of RAM, which seems to coincide with the dramatic improvement in query time (1400 sec to 60 sec) after passing the 4GB mark. *(Sidenote: This is odd, considering all advice I've seen suggests to leave 1-2GB for the OS, etc.)* 185 | 186 | ![Memory Test Results](static/images/mem_test.png) 187 | 188 | Then, I deployed the database to a larger machine (see [Deployment](#deployment) below). I scaled the java memory settings to the new specs, but the query time only halved (60 sec to 30 sec) despite the four-fold increase in RAM. 189 | 190 | #####Query efficiency 191 | I chose to use the built-in shortest-path algorithm for Neo4j, even though I've been unable to find out exactly what the algorithm is. It is breadth-first, which seems like a good approach. [Here](https://groups.google.com/forum/#!topic/neo4j/GiQPwQC_rII) is the closest description I've found: 192 | 193 | >The shortest path algorithm (i.e. paths with as few relationships as possible) uses breadth first, alternating sides between each visited relationship, not between each fully expanded depth. So if one side expands into fewer relationships than the other, that side can continue to new depths and find candidates even if the other side doesn't advance. When candidates are found the current depths will be fully expanded and shortest candidates will be selected and returned. 194 | 195 | The good folks on the [Neo4j Google Group](https://groups.google.com/forum/#!forum/neo4j) then suggested that the initial lookup of the two nodes was likely the slowest factor (rather than the pathfinding algorithm). This was my original query: 196 | ```python 197 | query = neo4j.CypherQuery( 198 | graph_db, 199 | """MATCH (m {node:'%s'}), (n {node:'%s'}), 200 | p = shortestPath((m)-[*..20]->(n)) RETURN p""" % (node1, node2) 201 | ) 202 | query.execute_one() 203 | ``` 204 | I then added a [constraint](#database-and-model) in the database for the Page label (all nodes are Pages) to express that node id is unique. I modified my query to use the Page label in the node lookup, as well as pass the nodes as arguments (instead of via string substitution). These two changes had the largest impact on query response time--from 30 seconds to 0.3 seconds for some queries. 
205 | 206 | Here's the final query: 207 | ```python 208 | query = neo4j.CypherQuery( 209 |     graph_db, 210 |     """MATCH (m:Page {node:{n1}}), (n:Page {node:{n2}}), 211 |     p = shortestPath((m)-[*..20]->(n)) RETURN p""" 212 | ) 213 | query.execute_one(n1=node1, n2=node2) 214 | ``` 215 | 216 | According to the [Neo4j manual](http://neo4j.com/docs/stable/query-constraints.html), unique constraints ensure that property values are unique for all nodes with a specific label. Additionally, unique constraints add an index on the value--and this is the index used for lookups. *(Sidenote: It's interesting that auto-indexing (on the 'node' property) hadn't had a similar effect.)* 217 | 218 | ##### Pruning the graph 219 | I was very surprised to find that over half of the pages in the page links dataset had no outgoing links. After some poking around on Wikipedia, I discovered that most of these 'dead-ends' are [red links](http://en.wikipedia.org/wiki/Wikipedia:Red_link), links that point to a page that does not yet exist. For some dead-ends, when I visited the source page, I could not find the link pointing to them. The DBPedia 2014 dataset is based on dumps from April/May 2014, so perhaps some dead-ends are pages that have since been deleted. 220 | 221 | In any case, I decided that for the purposes of my project, I was not interested in keeping pages that did not exist. Finding a path from such a page would be futile, and why would you want to find a path to one? Additionally, since there were so many dead-end links, even a one-pass removal of dead-ends would essentially halve my database, improving performance. 222 | 223 | #### Deployment 224 | This code was tested on Amazon's [EC2](http://aws.amazon.com/ec2/) using [Apache](http://httpd.apache.org/) as a web server. The database is housed on a 30 GiB EBS volume. Currently it is on an r3.large server with 15GB of RAM, and a shortest-path query of the full database takes just 0.5 seconds. Since EC2 instances do not come with swap space configured, I set up the 32GB SSD ephemeral instance storage as a paging (swap) partition to give the database extra memory headroom if needed.
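A minimal sketch of that swap setup (assuming the ephemeral volume shows up as /dev/xvdb; the device name varies by instance type):
```
sudo mkswap /dev/xvdb    # format the ephemeral volume as swap space
sudo swapon /dev/xvdb    # enable it for the running instance
swapon -s                # confirm the swap partition is active
```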
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | CouchDB==1.0 2 | Flask==0.10.1 3 | Jinja2==2.7.3 4 | MarkupSafe==0.23 5 | Werkzeug==0.9.6 6 | itsdangerous==0.24 7 | py2neo==1.6.4 8 | wsgiref==0.1.2 9 | -------------------------------------------------------------------------------- /static/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/.DS_Store -------------------------------------------------------------------------------- /static/css/index.css: -------------------------------------------------------------------------------- 1 | /******HTML STUFF******/ 2 | :focus { 3 | outline:none; 4 | border-bottom: solid 2px #999; 5 | } /*changes chrome's focus action*/ 6 | 7 | body { 8 | background-color: #fff; 9 | color: #666; 10 | margin: 0; 11 | font-family: sans-serif; 12 | font-weight: 100; 13 | font-size: 18px; 14 | text-align: center; 15 | } 16 | 17 | .logo { 18 | width: 100%; 19 | } 20 | 21 | .border { 22 | background: url('../images/logo-background.png'); 23 | height: 20px; 24 | } 25 | 26 | .background { 27 | background-color: #333; 28 | padding: 10px 0 10px; 29 | text-align: left; 30 | } 31 | 32 | .title { 33 | font-family: 'Signika', sans-serif; 34 | color: #fff; 35 | font-size: 40px; 36 | display: inline-block; 37 | margin-left: 15%; 38 | } 39 | 40 | .title:hover { 41 | cursor: pointer; 42 | } 43 | 44 | .blurb { 45 | font-family: 'Nothing You Could Do', sans-serif; 46 | font-size: 20px; 47 | display: inline-block; 48 | color: #fff; 49 | margin-left: 10px; 50 | } 51 | 52 | .info { 53 | color: #999; 54 | text-align: right; 55 | position: absolute; 56 | right: 15%; 57 | top: 50px; 58 | } 59 | 60 | .info:hover { 61 | cursor: pointer; 62 | color: #fff; 63 | } 64 | 65 | .query-form { 66 | margin: 30px; 67 | text-align: center; 68 | } 69 | 70 | input { 71 | border: none; 72 | border-bottom: solid 1px #999; 73 | margin: 0 10px 0 10px; 74 | font-size: 16px; 75 | } 76 | 77 | .button { 78 | margin-left: 5px; 79 | color: #fff; 80 | background-color: #999; 81 | border: solid 1px #999; 82 | border-radius: 10px; 83 | position: relative; 84 | font-family: 'Signika', sans-serif; 85 | } 86 | 87 | .button:hover { 88 | cursor: pointer; 89 | background-color: #666; 90 | border: solid 1px #666; 91 | } 92 | 93 | .wtf { 94 | border: none; 95 | border-radius: 16px; 96 | background-color: #B85427; 97 | padding: 2px 8px 2px 8px; 98 | } 99 | 100 | .wtf:hover { 101 | background-color: #FF7436; 102 | border: none; 103 | cursor: auto; 104 | } 105 | 106 | .about { 107 | width: 60%; 108 | margin: auto; 109 | text-align: left; 110 | padding-top: 20px; 111 | line-height: 150%; 112 | } 113 | 114 | h3 { 115 | text-align: center; 116 | font-family: 'Nothing You Could Do'; 117 | } 118 | 119 | #sadpanda { 120 | border: solid 1px #666; 121 | border-radius: 10px; 122 | height: 150px; 123 | } 124 | 125 | /*TYPEAHEAD STUFF*/ 126 | .tt-query { 127 | box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); 128 | } 129 | 130 | .tt-hint { 131 | color: #999 132 | } 133 | 134 | .tt-dropdown-menu { 135 | margin-top: 12px; 136 | padding: 8px 0; 137 | background-color: #fff; 138 | border: 1px solid #ccc; 139 | border-radius: 8px; 140 | box-shadow: 0 5px 10px rgba(0,0,0,.2); 141 | max-height: 200px; 142 | width: 200px; 143 | overflow-y: auto; 144 | text-align: left; 145 | 
} 146 | 147 | .tt-suggestion { 148 | padding: 2px 20px; 149 | font-size: 14px; 150 | line-height: 18px; 151 | } 152 | 153 | .tt-cursor { 154 | color: #fff; 155 | background-color: #0097cf; 156 | cursor: pointer; 157 | } 158 | 159 | .tt-suggestion p { 160 | margin: 0; 161 | } 162 | 163 | 164 | /*RESULTS STUFF*/ 165 | .results { 166 | width: 100%; 167 | /*display: inline-block;*/ 168 | } 169 | 170 | .graph { 171 | display: inline-block; 172 | min-width: 350px; 173 | } 174 | 175 | .sidebar { 176 | display: inline-block; 177 | vertical-align: top; 178 | } 179 | 180 | .details { 181 | width: 200px; 182 | margin: auto; 183 | border-radius: 5px; 184 | position: absolute; 185 | top: 172px; 186 | min-height: 200px; 187 | border: solid 2px; 188 | padding: 5px; 189 | background-color: #E5E5E5; 190 | } 191 | 192 | .hidden { 193 | display: none; 194 | } 195 | 196 | .help { 197 | font-size: 14px; 198 | padding: 5px; 199 | } 200 | 201 | .page-image { 202 | margin-top: 5px; 203 | } 204 | 205 | .page-title { 206 | margin-top: 10px; 207 | padding: 5px; 208 | } 209 | 210 | .page-extract { 211 | font-size: 12px; 212 | padding: 5px; 213 | } 214 | 215 | .page { 216 | display: inline-block; 217 | vertical-align: top; 218 | } 219 | 220 | .squareimg { 221 | width: 90px; 222 | height: 90px; 223 | border-radius: 45px; 224 | overflow: hidden; 225 | border: solid 2px black; 226 | margin: 10px; 227 | background-color: #fff; 228 | } 229 | 230 | .loading { 231 | background: url("../images/arrow.gif") no-repeat center; 232 | position: relative; 233 | font-size: 30px; 234 | top: 35px; 235 | width: 50px; 236 | height: 30px; 237 | } 238 | -------------------------------------------------------------------------------- /static/images/arrow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/arrow.gif -------------------------------------------------------------------------------- /static/images/arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/arrow.png -------------------------------------------------------------------------------- /static/images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/cat.jpg -------------------------------------------------------------------------------- /static/images/logo-background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/logo-background.png -------------------------------------------------------------------------------- /static/images/mem_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/mem_test.png -------------------------------------------------------------------------------- /static/images/sadpanda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/sadpanda.jpg 
-------------------------------------------------------------------------------- /static/images/wikigraph_ss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erabug/wikigraph/08fd74ea020f5eedec641e5878128985c1e6c758/static/images/wikigraph_ss.png -------------------------------------------------------------------------------- /static/js/graph.js: -------------------------------------------------------------------------------- 1 | function drawGraph(json) { 2 | 3 | var pathLength = Object.keys(queryInfo).length; 4 | 5 | // establish width and height of the svg 6 | var width = 600; 7 | var height = pathLength * 110; 8 | 9 | // color established as a scale 10 | var color = d3.scale.category20(); 11 | 12 | // appends svg tag to graph div 13 | var svg = d3.select('.graph').append('svg') 14 | .attr('width', width) 15 | .attr('height', height); 16 | 17 | // this function handles the parameters of the force-directed layout 18 | var force = d3.layout.force() 19 | .gravity(0.05) 20 | .linkDistance(function(d) { 21 | if (d.value == 1) { 22 | return 115; 23 | } else { 24 | // if not in the path, distance is a random number from 70 to 100 25 | return Math.floor(Math.random() * (100 - 70)) + 70; 26 | } 27 | }) 28 | .charge(-100) 29 | .size([width, height]); 30 | 31 | var defs = svg.append('defs'); 32 | 33 | // this appends the marker tag to the svg tag, applies arrowhead attributes 34 | defs.selectAll('marker') 35 | .data(['arrow']) 36 | .enter().append('svg:marker') 37 | .attr('id', String) 38 | .attr('viewBox', '0 -5 10 10') 39 | .attr('refX', 9) 40 | .attr('markerWidth', 7) 41 | .attr('markerHeight', 7) 42 | .attr('orient', 'auto') 43 | .style('fill', '#666') 44 | .append('svg:path') 45 | .attr('d', 'M0,-4L10,0L0,4Z'); 46 | 47 | // this helps predict where the arrowhead should be on the link path 48 | var diagonal = d3.svg.diagonal() 49 | .projection(function(d) { 50 | return [d.y, d.x]; 51 | }); 52 | 53 | // establish links 54 | var link = svg.selectAll('.link') 55 | .data(json.links) 56 | .enter().append('path') 57 | .attr('class', 'link') 58 | .style('stroke', '#666') 59 | .style('opacity', 0.6) 60 | .attr('marker-end', 'url(#arrow)') 61 | .attr('d', diagonal); 62 | 63 | // establish nodes 64 | var node = svg.selectAll('g.node') 65 | .data(json.nodes) 66 | .enter().append('svg:g') 67 | .attr('class', 'node') 68 | .attr('id', function(d) { 69 | return d.title + '|' + d.code; 70 | }) 71 | .call(force.drag); 72 | 73 | // define path ndoes 74 | var pathNode = node.filter(function(d) { 75 | return d.group == 'path'; 76 | }); 77 | 78 | // define non-path nodes 79 | var nonPathNode = node.filter(function(d) { 80 | return d.group != 'path'; 81 | }); 82 | 83 | // define path links 84 | var pathLink = link.filter(function(d) { 85 | return d.value == 1; 86 | }); 87 | 88 | var start; 89 | Object.keys(queryInfo).forEach(function(key) { 90 | // identify the start node 91 | var img = queryInfo[key]; 92 | if (img.code === 0) { 93 | start = key; 94 | } 95 | // define clip paths for each path node 96 | defs.append('clipPath') 97 | .attr('id', 'img' + key.toString()) 98 | .append('circle') 99 | .attr('r', 45); 100 | }); 101 | 102 | // define the start node 103 | var startNode = pathNode.filter(function(d) { 104 | return d.code == start; 105 | }); 106 | 107 | // add styling for the path links 108 | pathLink 109 | .style('stroke-width', '3px') 110 | .style('opacity', 1) 111 | .attr('marker-end', 'url(#arrow)'); 112 | 113 | // append colored circles 
to non-path nodes 114 | nonPathNode.append('circle') 115 | .attr('r', function(d) { 116 | var size; 117 | // upper bound for scaling size on degrees 118 | if (d.degrees > 600) { 119 | // assign radius to node attribute for arrowhead placement 120 | d.radius = 18; 121 | } else { 122 | d.radius = d.degrees * 0.02 + 7; // scales linearly with degrees 123 | } 124 | return d.radius; 125 | }) 126 | .style('fill', function(d) { 127 | return color(d.degrees); 128 | }); 129 | 130 | // append white circles to path nodes (for the .gifs) 131 | pathNode.append('circle') 132 | .attr('r', function(d) { 133 | d.radius = 45; 134 | return d.radius; 135 | }) 136 | .style('fill', '#fff'); 137 | 138 | // append clip-path to each path node 139 | pathNode.append('image') 140 | .attr('xlink:href', function(d) { return queryInfo[d.code].url;}) 141 | .attr('x', function(d) { return -queryInfo[d.code].width / 2;}) 142 | // this seems to help for portraits, where height > width 143 | .attr('y', function(d) { 144 | var imgHeight = queryInfo[d.code].height; 145 | var offset; 146 | if (h > queryInfo[d.code].width) { 147 | offset = 12; 148 | } else { 149 | offset = 0; 150 | } 151 | return -(imgHeight / 2) + offset; 152 | }) 153 | .attr('height', function(d) { return queryInfo[d.code].height;}) 154 | .attr('width', function(d) { return queryInfo[d.code].width;}) 155 | .attr('clip-path', function(d) { 156 | var clipPathID = 'img' + d.code; 157 | return 'url(#' + clipPathID + ')'; // unique clip path for this node 158 | }); 159 | 160 | // append empty circle to each path node as an outline 161 | pathNode.append('circle') 162 | .attr('r', 45) 163 | .style('fill', 'none') 164 | .style('stroke', '#333') 165 | .style('stroke-width', '2px'); 166 | 167 | // fix the x and y coordinates for the start node 168 | startNode.each(function(d) { 169 | d.fixed = true; 170 | d.x = width/pathLength; 171 | d.y = height/pathLength; 172 | }); 173 | 174 | // this calls the function force on the nodes and links 175 | force 176 | .nodes(json.nodes) 177 | .links(json.links) 178 | .start(); 179 | 180 | // this block occurs each time 'tick' is called by d3 181 | force.on('tick', function() { 182 | node.attr('cx', function(d) { 183 | d.x = Math.max(15, Math.min(width - 15, d.x)); 184 | return d.x; 185 | }) 186 | .attr('cy', function(d) { 187 | d.y = Math.max(15, Math.min(height - 15, d.y)); 188 | return d.y; 189 | }) 190 | .attr('transform', function(d) { 191 | return 'translate(' + d.x + ',' + d.y + ')'; 192 | }); 193 | link.attr('x1', function(d) { return d.source.x; }) 194 | .attr('y1', function(d) { return d.source.y; }) 195 | .attr('x2', function(d) { return d.target.x; }) 196 | .attr('y2', function(d) { return d.target.y; }) 197 | // this places arrowheads based on radius 198 | .attr('d', function(d) { 199 | // Total difference in x and y from source to target 200 | diffX = d.target.x - d.source.x; 201 | diffY = d.target.y - d.source.y; 202 | // Length of path from center of source node to center of target node 203 | pathLength = Math.sqrt((diffX * diffX) + (diffY * diffY)); 204 | // x and y distances from center to outside edge of target node 205 | offsetX = (diffX * d.target.radius) / pathLength; 206 | offsetY = (diffY * d.target.radius) / pathLength; 207 | return 'M' + d.source.x + ',' + d.source.y + 'L' + 208 | (d.target.x - offsetX) + ',' + (d.target.y - offsetY); 209 | }); 210 | }); 211 | 212 | } -------------------------------------------------------------------------------- /static/js/index.js: 
-------------------------------------------------------------------------------- 1 | 2 | // Global variables 3 | var CODES; // this object will be populated once the user inputs two pages 4 | var response; // global variable for the graph db response 5 | var queryInfo; // an object to organize and pass information to the graph 6 | var imageURLs; // an array for the start and end node images (to retain order) 7 | 8 | /** 9 | * INPUT-RELATED 10 | */ 11 | 12 | var startField = $('#start-node'); 13 | var endField = $('#end-node'); 14 | var aboutDiv = $('.about'); 15 | 16 | // sets up the request parameters for Typeahead 17 | var pageNames = new Bloodhound({ 18 | datumTokenizer: function(d) { 19 | return Bloodhound.tokenizers.whitespace(d.value); 20 | }, 21 | queryTokenizer: Bloodhound.tokenizers.whitespace, 22 | limit: 50, 23 | remote: { 24 | url: '/page-names?query=%QUERY', 25 | filter: function(pageNames) { 26 | // Map the remote source JSON array to a JavaScript array 27 | return $.map(pageNames.results, function(page) { 28 | return { 29 | value: page.title, 30 | code: page.code 31 | }; 32 | }); 33 | } 34 | } 35 | }); 36 | 37 | // send input to the pagenames db, take the first result as the query input 38 | function feelingLucky(inputField, node) { 39 | if (!(CODES[node])) { 40 | return $.get( 41 | '/page-names', 42 | 'query=' + inputField.val(), 43 | function(data) { 44 | var result = data.results[0]; // uses the first result 45 | inputField.val(result.title); 46 | CODES[node] = {'title': result.title, 47 | 'code': result.code.toString()}; 48 | }); 49 | } 50 | } 51 | 52 | // send request to the pagenames db, write the results to the input fields 53 | function getRandomPages() { 54 | $.get('/random-query', 55 | function(data) { 56 | var node1 = data.results[0]; 57 | var node2 = data.results[1]; 58 | CODES.node1 = {'title': node1.title, 59 | 'code': node1.code.toString()}; 60 | CODES.node2 = {'title': node2.title, 61 | 'code': node2.code.toString()}; 62 | startField.val(node1.title); // fill in the search fields 63 | endField.val(node2.title); 64 | }); 65 | } 66 | 67 | // swap both the input field and the global CODES values 68 | function reverseQuery() { 69 | var x = startField.val(); 70 | startField.val(endField.val()); 71 | endField.val(x); 72 | var y = CODES.node1; 73 | CODES.node1 = CODES.node2; 74 | CODES.node2 = y; 75 | } 76 | 77 | // take the input field value and assign it to CODES 78 | function decodeInput(d, node) { 79 | CODES[node] = {'title': d.value, 80 | 'code': d.code.toString()}; 81 | } 82 | 83 | // when the 'Go' button is clicked, check for both values, then run query 84 | $('input#submit-query').click(function() { 85 | clearPartial(); 86 | aboutDiv.addClass('hidden'); 87 | var checkFirst = feelingLucky(startField, 'node1'); 88 | var checkLast = feelingLucky(endField, 'node2'); 89 | $.when( 90 | checkFirst, 91 | checkLast 92 | ).then(function(data) { 93 | query(); 94 | }); 95 | }); 96 | 97 | // get and display random pages when the 'Random' button is clicked 98 | $('input#random-query').click(function() { 99 | getRandomPages(); 100 | }); 101 | 102 | // reverse the pages when the 'Reverse' button is clicked 103 | $('input#reverse-query').click(function() { 104 | reverseQuery(); 105 | }); 106 | 107 | // sets up the typeahead on the two input fields 108 | $('.scrollable-dropdown-menu .typeahead').typeahead(null, { 109 | name: 'pageNames', 110 | displayKey: 'value', 111 | source: pageNames.ttAdapter() 112 | }); 113 | 114 | // delete the code value as soon as the user clicks 
into an input field 115 | startField.focus(function() { 116 | delete CODES['node1']; 117 | }); 118 | 119 | endField.focus(function() { 120 | delete CODES['node2']; 121 | }); 122 | 123 | // when a suggested title is selected, write that value to CODES 124 | startField.on('typeahead:selected typeahead:autocompleted', function (e, d) { 125 | decodeInput(d, 'node1'); 126 | }); 127 | 128 | endField.on('typeahead:selected typeahead:autocompleted', function (e, d) { 129 | decodeInput(d, 'node2'); 130 | }); 131 | 132 | // select all text when the user clicks into an input field 133 | $('input[type=text]').focus(function() { 134 | this.select(); 135 | }); 136 | 137 | // initialize the bloodhound 138 | pageNames.initialize(); 139 | 140 | /** 141 | * QUERY-RELATED 142 | */ 143 | 144 | var pathDiv = $('.loading-images'); 145 | 146 | // create and return a query URL for images, based on desired size, number of 147 | // pages, and the page titles 148 | function makeQueryURL(size, numPages, pagesParams) { 149 | var queryURL = 'http://en.wikipedia.org/w/api.php?action=query&' + 150 | 'format=json&redirects&prop=pageimages&pithumbsize='+ size + 151 | 'px&pilimit=' + numPages + '&titles=' + pagesParams + '&callback=?'; 152 | return queryURL; 153 | } 154 | 155 | // create and return an object with information about the thumbnail image, if 156 | // available, else use information about the default cat image 157 | function createThumbnailObject(page) { 158 | var thumbnail, width, height; 159 | if ('thumbnail' in page) { 160 | thumbnail = page.thumbnail.source; 161 | width = page.thumbnail.width; 162 | height = page.thumbnail.height; 163 | } else { 164 | thumbnail = '../static/images/cat.jpg'; 165 | width = 100; 166 | height = 100; 167 | } 168 | var item = {'title': page.title, 169 | 'thumbnail': thumbnail, 170 | 'width': width, 171 | 'height': height}; 172 | return item; 173 | } 174 | 175 | // create and return an HTML snippet using the page's code and image url 176 | function makeHTMLSnippet(code, thumbnail) { 177 | var html = '
<div class="page" id="page' + code + '">' + 178 |         '<img class="squareimg" src="' + thumbnail + '"></div>
'; 179 | return html; 180 | } 181 | 182 | // add information about a page to the global variable queryInfo 183 | function addImage(item, code) { 184 | queryInfo[code] = {'url': item.thumbnail, 185 | 'title': item.title, 186 | 'height': item.height, 187 | 'width': item.width}; 188 | } 189 | 190 | // create and return both HTML snippets for the two query pages, and update 191 | // the global variables queryInfo and imageURLs 192 | function addQueryInfo(data) { 193 | var pageObject = data.query.pages; 194 | var htmlSnippets = {}; 195 | Object.keys(pageObject).forEach(function(pageKey) { 196 | item = createThumbnailObject(pageObject[pageKey]); 197 | if (item.title == CODES.node1.title) { 198 | code = 0; 199 | } else { 200 | code = 1; 201 | } 202 | htmlSnippets[code] = makeHTMLSnippet(code, item.thumbnail); 203 | addImage(item, CODES['node' + (code + 1)].code); 204 | imageURLs[code] = {'title': item.title, 205 | 'thumbnail': item.thumbnail}; 206 | }); 207 | return htmlSnippets; 208 | } 209 | 210 | // compare a title to each page object in response.path and return the 211 | // matching page object's code number 212 | function getPathCode(title) { 213 | for (var i = 0; i < response.path.length; i++) { 214 | if (response.path[i].title == title) { 215 | return response.path[i].code; 216 | } 217 | } 218 | } 219 | 220 | // parse the results of an AJAX images request and add information about those 221 | // images to queryInfo 222 | function addPathImages(data) { 223 | var pageObject = data.query.pages; 224 | Object.keys(pageObject).forEach(function(pageKey) { 225 | var item = createThumbnailObject(pageObject[pageKey]); 226 | addImage(item, getPathCode(item.title)); 227 | }); 228 | } 229 | 230 | // updates queryInfo with index numbers for ordering purposes 231 | function updateIndexCodes() { 232 | response.path.forEach(function(node) { 233 | if (!(node.code in queryInfo)) { 234 | queryInfo[node.code] = queryInfo['undefined']; 235 | delete queryInfo['undefined']; 236 | var old_index = response.path.indexOf(node); 237 | response.path[old_index] = {'code': node.code, 238 | 'title': queryInfo[node.code].title}; 239 | } 240 | queryInfo[node.code].code = response.path.indexOf(node); 241 | }); 242 | } 243 | 244 | // given a query URL, request a number of page images from Wikipedia, then 245 | // update queryInfo and add index codes to each page to retain path order 246 | function getPathImages(queryURL) { 247 | return $.getJSON( 248 | queryURL, 249 | function(data) { 250 | addPathImages(data); 251 | updateIndexCodes(); 252 | }); 253 | } 254 | 255 | // parse the inner nodes of the path, if they exist then assemble a query URL 256 | // and request those images from Wikipedia 257 | function getInnerImages() { 258 | var inner = response.path.slice(1, -1); 259 | var numPages = inner.length; 260 | var innerNodes = []; 261 | inner.forEach(function(node) { 262 | innerNodes.push(node.title); 263 | }); 264 | var pagesParams; 265 | if (numPages > 1) { 266 | pagesParams = innerNodes.join('|'); 267 | } else { pagesParams = innerNodes; } 268 | 269 | if (inner.length === 0) { 270 | return false; 271 | } else { 272 | var queryURL = makeQueryURL(150, numPages, pagesParams); 273 | return getPathImages(queryURL); 274 | } 275 | } 276 | 277 | // assemble and request query for start/end images from Wikipedia and append 278 | // those to the path div, then request a shortest path from the graph database, 279 | // then get images for the resulting path, update the index codes, draw the 280 | // grah, and set up event handlers for 
the sidebar 281 | function query() { 282 | var pagesParams = CODES.node1.title + '|' + CODES.node2.title; 283 | var queryURL = makeQueryURL(150, 2, pagesParams); 284 | $.when( 285 | $.getJSON( 286 | queryURL, 287 | function(data) { 288 | var htmlSnippets = addQueryInfo(data); 289 | Object.keys(htmlSnippets).forEach(function(node) { 290 | pathDiv.append(htmlSnippets[node]); 291 | }); 292 | $('#page0').after('
'); 293 | }), 294 | $.get( 295 | '/query', 296 | CODES, 297 | function(data) { 298 | response = JSON.parse(data); 299 | }) 300 | ).then(function() { 301 | try { 302 | return getInnerImages(); 303 | } catch(err) {} 304 | }).done(function() { 305 | pathDiv.empty(); 306 | if (response.path != 'None') { 307 | updateIndexCodes(); 308 | pathDiv.empty(); 309 | drawGraph(response.results); 310 | buildSidebar(); 311 | } else { 312 | $('.path-not-found').removeClass('hidden'); 313 | } 314 | }); 315 | } 316 | 317 | /** 318 | * PATH-RELATED 319 | */ 320 | 321 | var detailsDiv = $('.details'); 322 | var pageImage = $('.page-image'); 323 | var pageTitle = $('.page-title'); 324 | var pageExtract = $('.page-extract'); 325 | 326 | // create and return the query URL for extracts from Wikipedia's API, based on 327 | // number of pages and their titles 328 | function makeExtractURL(numPages, pageParams) { 329 | var extractURL = 'http://en.wikipedia.org/w/api.php?action=query&' + 330 | 'prop=extracts&format=json&exsentences=3&explaintext=&exintro=&' + 331 | 'exlimit=' + numPages + '&titles=' + pageParams + '&callback=?'; 332 | return extractURL; 333 | } 334 | 335 | // create URLs for the page and extract queries for a title, then execute the 336 | // image request, update queryInfo, then execute the extract request, updata 337 | // queryInfo, then add both to their respective DOM elements 338 | function getImageAndExtract(title, code, that) { 339 | var queryURL = makeQueryURL(150, 1, title); 340 | var extractURL = makeExtractURL(1, title); 341 | $.when( 342 | $.getJSON( 343 | queryURL, 344 | function(data) { 345 | var pageObject = data.query.pages; 346 | Object.keys(pageObject).forEach(function(pageKey) { 347 | var item = createThumbnailObject(pageObject[pageKey]); 348 | addImage(item, code); 349 | }); 350 | }) 351 | ).then(function(data) { 352 | return $.getJSON( 353 | extractURL, 354 | function(data) { 355 | var thing = data.query.pages; 356 | var page = thing[Object.keys(thing)[0]]; 357 | var text = page.extract; 358 | queryInfo[code].extract = text; 359 | }); 360 | }).done(function(data) { 361 | // only write to div if user is still hovering 362 | if ($(that).is(':hover')) { 363 | pageImage.html(''); 365 | pageExtract.html(queryInfo[code].extract); 366 | } 367 | }); 368 | } 369 | 370 | // toggles whether the sidebar is displayed 371 | function toggleSidebar() { 372 | detailsDiv.toggleClass('hidden'); 373 | } 374 | 375 | // clears all divs in the sidebar 376 | function clearSidebar() { 377 | toggleSidebar(); 378 | pageImage.empty(); 379 | pageTitle.empty(); 380 | pageExtract.empty(); 381 | } 382 | 383 | // opens an external window for the wikipedia page for a given title 384 | function externalLink() { 385 | $('.node').dblclick(function() { 386 | var title = this.id.split('|')[0]; 387 | window.open('http://en.wikipedia.org/wiki/' + title); 388 | }); 389 | } 390 | 391 | // request extracts for pages in the returned path, update queryInfo with those 392 | function getPathExtracts(numPages, pageParams) { 393 | var extractURL = makeExtractURL(numPages, pageParams); 394 | $.getJSON( 395 | extractURL, 396 | function(data) { 397 | var extracts = data.query.pages; 398 | Object.keys(extracts).forEach(function(key) { 399 | var text = extracts[key].extract; 400 | var code = getPathCode(extracts[key].title); 401 | queryInfo[code].extract = text; 402 | }); 403 | }); 404 | } 405 | 406 | // handles all mouseover and mouseout events for nodes, requesting information 407 | // from Wikipedia if the information is not in 
queryInfo already 408 | function mouseoverHandler() { 409 | $('.node').mouseover(function(e) { // mouseover event handler 410 | toggleSidebar(); 411 | var info = this.id.split('|'); 412 | var title = info[0]; 413 | var code = info[1]; 414 | pageTitle.html(title); 415 | if (code in queryInfo) { 416 | pageImage.html(''); 418 | pageExtract.html(queryInfo[code].extract); 419 | } else { 420 | getImageAndExtract(title, code, this); 421 | } 422 | }); 423 | $('.node').mouseout(function(e) { 424 | clearSidebar(); 425 | }); 426 | externalLink(); 427 | } 428 | 429 | // takes the result of a query and requests images and extracts from Wikipedia, 430 | // then sets up the node mouseover handler 431 | function buildSidebar() { 432 | var pathNodes = []; 433 | response.path.forEach(function(node) { 434 | pathNodes.push(node.title); 435 | }); 436 | var pageParams = pathNodes.join('|'); 437 | var numPages = pathNodes.length; 438 | getSummaryImages(numPages, pageParams); // get thumbnails for summary 439 | getPathExtracts(numPages, pageParams); // get extracts for path nodes 440 | mouseoverHandler(); 441 | } 442 | 443 | /** 444 | * PAGE-RELATED 445 | */ 446 | 447 | var wtfDiv = $('.wtf'); 448 | var helpDiv = $('.help'); 449 | var queryForm = $('.query-form'); 450 | 451 | // clears the information for a new query, retains information about previous 452 | // searches 453 | function clearPartial() { 454 | $('svg').remove(); 455 | pathDiv.empty(); 456 | queryInfo = {}; 457 | imageURLs = []; 458 | $('.path-not-found').addClass('hidden'); 459 | } 460 | 461 | // full clear of all global variables and input fields 462 | function clearAll() { 463 | CODES = {}; 464 | response = ''; 465 | startField.val(''); 466 | endField.val(''); 467 | clearPartial(); 468 | } 469 | 470 | // toggles display for the help button upon mouseover and mouseout 471 | wtfDiv.mouseover(function() { 472 | clearSidebar(); 473 | helpDiv.toggleClass('hidden'); 474 | }); 475 | 476 | wtfDiv.mouseout(function() { 477 | clearSidebar(); 478 | helpDiv.toggleClass('hidden'); 479 | }); 480 | 481 | // toggles display for the query-form when the title is clicked 482 | $('.title').click(function() { 483 | clearAll(); 484 | queryForm.removeClass('hidden'); 485 | aboutDiv.addClass('hidden'); 486 | }); 487 | 488 | // toggles display for the information page when the 'About' button is clicked 489 | $('.info').click(function() { 490 | clearPartial(); 491 | queryForm.addClass('hidden'); 492 | aboutDiv.removeClass('hidden'); 493 | }); 494 | 495 | // clears everything upon page load 496 | clearAll(); 497 | -------------------------------------------------------------------------------- /static/js/summary.js: -------------------------------------------------------------------------------- 1 | function getSummaryImages(numPages, pageParams) { 2 | var queryURL = makeQueryURL(60, numPages, pageParams); 3 | $.getJSON( 4 | queryURL, 5 | function(data) { 6 | var pageObject = data.query.pages; 7 | Object.keys(pageObject).forEach(function(pageKey) { 8 | var item = createThumbnailObject(pageObject[pageKey]); 9 | var code = getPathCode(item.title); 10 | queryInfo[code].tinyurl = item.thumbnail; 11 | queryInfo[code].tinyHeight = item.height; 12 | queryInfo[code].tinyWidth = item.width; 13 | }); 14 | displaySummary(response.path); 15 | }); 16 | } 17 | 18 | function displaySummary(path) { 19 | // draw SVG based on length of path 20 | var svg = d3.select(".summary").append("svg") 21 | .attr("width", 200) 22 | .attr("height", function(d) { 23 | return 50 * path.length; 24 | }); 25 | 26 | var
defs = svg.append("defs"); 27 | 28 | // define clip paths for each path node 29 | Object.keys(queryInfo).forEach(function(key) { 30 | defs.append("clipPath") 31 | .attr("id", 'timg' + key.toString()) 32 | .append("circle") 33 | .attr("r", 20); 34 | }); 35 | 36 | // establish nodes 37 | var tinyNode = svg.selectAll("g.tinyNode") 38 | .data(path) 39 | .enter().append("svg:g") 40 | .attr("class", "tinyNode") 41 | .attr("transform", function(d) { // nodes are placed based on order 42 | var index = queryInfo[d.code].code; 43 | var yValue = 25 + (index * 50); 44 | return "translate(23, " + yValue + ")"; 45 | }) 46 | .attr("id", function(d) {return d.title + '|' + d.code;}); 47 | 48 | // append clip-path to each path node 49 | tinyNode.append("image") 50 | .attr("xlink:href", function(d) { 51 | return queryInfo[d.code].tinyurl; 52 | }) 53 | .attr("x", function(d) { return -queryInfo[d.code].tinyWidth / 2;}) 54 | .attr("y", function(d) { 55 | var imgHeight = queryInfo[d.code].tinyHeight; 56 | var offset; 57 | if (imgHeight > queryInfo[d.code].tinyWidth) { // portrait thumbnails are nudged down slightly 58 | offset = 6; 59 | } else { 60 | offset = 0; 61 | } 62 | return -(imgHeight / 2) + offset; 63 | }) 64 | .attr("height", function(d) { return queryInfo[d.code].tinyHeight;}) 65 | .attr("width", function(d) { return queryInfo[d.code].tinyWidth;}) 66 | .attr("clip-path", function(d) { 67 | var clipPathID = 'timg' + d.code; 68 | return "url(#" + clipPathID + ")"; // unique clip path for this node 69 | }); 70 | 71 | // append empty circle to nodes as an outline 72 | tinyNode.append("circle") 73 | .attr("r", 20) 74 | .style("stroke", "#333") 75 | .style("stroke-width", "2px") 76 | .style("fill", "none"); 77 | 78 | tinyNode.append("foreignObject") // necessary for wrapped title strings 79 | .attr({width: 145, height: 45}) 80 | .attr({x: 30, y: function(d) { 81 | var len = d.title.length; 82 | if (len > 42) { 83 | return -22; 84 | } else if (len > 20) { 85 | return -12; 86 | } else { 87 | return -6; 88 | } 89 | }}) 90 | .append("xhtml:body") 91 | .append("xhtml:div") 92 | .style({ 93 | "font-size": "14px", 94 | "text-align": "left", 95 | "padding-left": "1px" 96 | }) 97 | .html(function(d) {return d.title;}); 98 | 99 | } 100 | 101 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | <!-- head markup stripped; page title: "wikiGraph" -->
What connects two topics on Wikipedia?

For example, how many links do you have to click to get from Harry Potter to the Spanish Inquisition?* Combining trivia nerdery with graph theory, wikiGraph allows users to find and explore the paths within Wikipedia.

The data are sourced from Wikipedia and DBpedia, without which this project would not be possible.

This app was created by Erika Arnold as a final project for Hackbright Academy in Fall 2014. You can read more about the implementation and details on the GitHub page.

*It takes a minimum of 3 clicks. Here's one path: Harry Potter → United Kingdom → Basques → Spanish Inquisition
<!-- remaining page markup stripped -->
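The page copy above frames wikiGraph's core question as a shortest-path problem over the link graph. Below is a minimal sketch of that idea, assuming an in-memory adjacency dict and a breadth-first search; it is not the app's actual search (wikigraph.py hands the real query to query.create_lists), and the toy graph and shortest_path helper here are invented for illustration.

from collections import deque

def shortest_path(links, start, end):
    """Breadth-first search over an adjacency dict {page: set(linked pages)}.
    Returns the list of page titles from start to end, or None if no path exists."""
    came_from = {start: None}            # page -> page we arrived from
    queue = deque([start])
    while queue:
        page = queue.popleft()
        if page == end:
            path = []
            while page is not None:      # walk the parent pointers back to the start
                path.append(page)
                page = came_from[page]
            return path[::-1]
        for neighbor in links.get(page, ()):
            if neighbor not in came_from:
                came_from[neighbor] = page
                queue.append(neighbor)
    return None

# Toy graph, invented for illustration (not real link data):
toy_links = {
    'Harry Potter': {'United Kingdom'},
    'United Kingdom': {'Basques', 'London'},
    'Basques': {'Spanish Inquisition'},
}
# shortest_path(toy_links, 'Harry Potter', 'Spanish Inquisition')
# -> ['Harry Potter', 'United Kingdom', 'Basques', 'Spanish Inquisition'], i.e. 3 clicks

Because breadth-first search visits pages in order of click distance, the first time it reaches the target it has found a minimum-hop path, which is what the "minimum of 3 clicks" footnote refers to.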
-------------------------------------------------------------------------------- /test_continuity.py: -------------------------------------------------------------------------------- 1 | print "Testing for continuity..." 2 | with open('data/nodes.tsv', 'rb') as f: 3 | f.next() 4 | f.next() 5 | last_start = 0 6 | for line in f: 7 | l = line.split('\t') 8 | start = int(l[0]) 9 | if (last_start + 1) != start: 10 | print "ALERT! %d did not match last_start (%d) + 1" % (start, last_start) 11 | 12 | last_start = start 13 | print "Done." -------------------------------------------------------------------------------- /wikigraph.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, jsonify 2 | import query, sqlite3, random, time, os 3 | 4 | app = Flask(__name__) 5 | app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'lisaneedsbraces') 6 | 7 | def connect(): 8 | 9 | cursor = sqlite3.connect('data/pagenames.db').cursor() 10 | return cursor 11 | 12 | @app.route('/') 13 | def index(): 14 | 15 | return render_template('index.html') 16 | 17 | @app.route('/query') 18 | def get_path(): 19 | 20 | node2, node1, code2, code1 = request.args.values() 21 | path_query = node1.replace(' ', '_') + '|' + node2.replace(' ', '_') 22 | print "%s (%s) -> %s (%s)?" % (node1, code1, node2, code2) 23 | response = query.create_lists(str(code1), str(code2)) 24 | 25 | return response 26 | 27 | @app.route('/page-names') 28 | def get_page_names(): 29 | 30 | entry = request.args.get("query").lower() 31 | print "Requesting page names for '%s'..." % entry 32 | t0 = time.time() 33 | 34 | cursor = connect() 35 | query1 = 'SELECT code, title FROM pagenames WHERE title_lower = ?' 36 | row = cursor.execute(query1, (entry,)).fetchone() 37 | 38 | results = [{ 'title': row[1], 'code': row[0] }] if row is not None else [] 39 | 40 | query2 = '''SELECT code, title 41 | FROM pagenames 42 | WHERE title LIKE ? 43 | OR title LIKE ? 44 | LIMIT 50;''' 45 | 46 | rows = cursor.execute(query2, (entry + '%', '% ' + entry, )) 47 | results.extend([{ 'title': row[1], 'code': row[0] } for row in rows]) 48 | response = jsonify(**{ 'results': results }) 49 | 50 | t1 = time.time() 51 | print "DB responded with %d results in %0.2f seconds" % (len(results), t1 - t0) 52 | 53 | return response 54 | 55 | @app.route('/random-query') 56 | def get_random_names(): 57 | 58 | print "Requesting two random pages..." 59 | t0 = time.time() 60 | 61 | cursor = connect() 62 | query = '''SELECT code, title 63 | FROM pagenames 64 | WHERE degrees > 150 65 | AND title NOT BETWEEN 'List' and 'Lisu' 66 | AND NOT title BETWEEN '0' and '9}' 67 | ORDER BY RANDOM() 68 | LIMIT 2''' 69 | 70 | rows = cursor.execute(query) 71 | results = [{ 'title': row[1].replace('_', ' '), 'code': row[0] } for row in rows] 72 | response = jsonify(**{ 'results': results }) 73 | 74 | t1 = time.time() 75 | print "DB responded in %0.2f seconds" % (t1 - t0) 76 | 77 | return response 78 | 79 | if __name__ == '__main__': 80 | app.run(debug=True) 81 | # app.run(host="54") --------------------------------------------------------------------------------
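As a usage note, the two lookup routes in wikigraph.py can be exercised without a browser through Flask's built-in test client. The snippet below is illustrative only and not part of the repo; it assumes the pagenames database has already been built and sits where wikigraph.py expects it.

# Illustrative only -- not part of the repo.
from wikigraph import app

client = app.test_client()

# Autocomplete lookup: exact lower-cased title match first, then LIKE matches.
resp = client.get('/page-names', query_string={'query': 'harry potter'})
print resp.data    # JSON like {"results": [{"title": ..., "code": ...}, ...]}

# Two random, well-connected pages (degrees > 150).
resp = client.get('/random-query')
print resp.data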