├── 1-download-sitemaps.py
├── 2-crawl.py
├── 3-jsonify-games.py
├── 4-jsonify-geeklists.py
├── README.md
├── game.py
└── requirements.txt

--------------------------------------------------------------------------------
/1-download-sitemaps.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from httplib2 import Http
import datetime
import os
import sys
import time

http = Http()

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

DUMP_DIR = os.path.join("BoardGameGeek.xml", DATE_DIR)

SITEMAP_DIRECTORY = os.path.join(DUMP_DIR, "maps")
if not os.path.exists(SITEMAP_DIRECTORY):
    os.makedirs(SITEMAP_DIRECTORY)

def req(*args, **kwargs):
    """Make an HTTP request, returning (None, None) rather than dying."""
    try:
        response, body = http.request(*args, **kwargs)
    except Exception, e:
        print "Could not request %r %r: %s" % (args, kwargs, e)
        return None, None
    return response, body

response, body = req('http://boardgamegeek.com/sitemapindex')

soup = BeautifulSoup(body, "lxml")
for loc in soup.find_all("loc"):
    url = loc.string.strip()
    filename = url[url.rindex("sitemap_")+len("sitemap_"):]
    path = os.path.join(SITEMAP_DIRECTORY, filename)
    if os.path.exists(path):
        # Already downloaded on a previous run; the crawl is resumable.
        continue
    print "%s -> %s" % (url, path)
    response, body = req(url)
    open(path, "w").write(body)
    time.sleep(1)
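
Aside (an editor's sketch, not part of the original repo): the sitemap handling
above boils down to pulling every <loc> URL out of the index document. A
minimal, self-contained version, using a made-up index; the real URLs and
file names will differ:

from bs4 import BeautifulSoup

SAMPLE_INDEX = """
<sitemapindex>
  <sitemap><loc>http://boardgamegeek.com/sitemap_geekitems_boardgame_page_1</loc></sitemap>
  <sitemap><loc>http://boardgamegeek.com/sitemap_geeklists_page_1</loc></sitemap>
</sitemapindex>
"""

soup = BeautifulSoup(SAMPLE_INDEX, "lxml")
for loc in soup.find_all("loc"):
    url = loc.string.strip()
    # Everything after the last "sitemap_" becomes the local file name.
    print url[url.rindex("sitemap_") + len("sitemap_"):]
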
--------------------------------------------------------------------------------
/2-crawl.py:
--------------------------------------------------------------------------------
from httplib2 import Http
import datetime
import os
import re
import sys
import time

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

DUMP_DIR = os.path.join("BoardGameGeek.xml", DATE_DIR)

SITEMAP_DIRECTORY = os.path.join(DUMP_DIR, "maps")
GAME_OUTPUT_DIRECTORY = os.path.join(DUMP_DIR, "boardgame_batches")
GEEKLIST_OUTPUT_DIRECTORY = os.path.join(DUMP_DIR, "geeklist")
GAME_NUMBER = re.compile("/boardgame/([0-9]+)/")
GEEKLIST_NUMBER = re.compile("/geeklist/([0-9]+)/")

for d in GAME_OUTPUT_DIRECTORY, GEEKLIST_OUTPUT_DIRECTORY:
    if not os.path.exists(d):
        os.makedirs(d)

BATCH_SIZE = 20

BOARDGAME_URL = "http://boardgamegeek.com/xmlapi/boardgame/%s?comments=1&stats=1"
GEEKLIST_URL = "http://boardgamegeek.com/xmlapi/geeklist/%s?comments=1"

http = Http()

def req(*args, **kwargs):
    """Make an HTTP request, returning (None, None) rather than dying."""
    try:
        response, body = http.request(*args, **kwargs)
    except Exception, e:
        print "Could not request %r %r: %s" % (args, kwargs, e)
        return None, None
    return response, body

def download_geeklist(number):
    filename = os.path.join(GEEKLIST_OUTPUT_DIRECTORY, "geeklist-%s.xml" % number)
    if os.path.exists(filename):
        print "Skipping %s" % filename
        return False
    url = GEEKLIST_URL % number
    if number in ("36742", "35076", "34435", "30058", "29485", "16221",
                  "8785", "4368", "49088"):
        # For whatever reason these are known to be bad.
        return False
    print "Downloading geeklist %s" % number
    response, body = req(
        url, "GET", headers = {
            "Accept-Encoding": "gzip,deflate",
            "User-Agent" : "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" })
    if body is not None:
        open(filename, "w").write(body)
    return True

def download_boardgame_batch(numbers):
    url = BOARDGAME_URL % ",".join(numbers)
    if len(numbers) == 1:
        filename = "boardgame-" + numbers[0] + ".xml"
    else:
        filename = "boardgame-" + numbers[0] + "-" + numbers[-1] + ".xml"
    path = os.path.join(GAME_OUTPUT_DIRECTORY, filename)
    if os.path.exists(path):
        print "Skipping %s, already present." % path
        return False
    print filename, url
    response, body = req(url, "GET", headers={
        "Accept-Encoding": "gzip,deflate",
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i586; de; rv:5.0) Gecko/20100101 Firefox/5.0" })
    if body is not None:
        open(path, "w").write(body)
    return True

def crawl_boardgame_file(filename):
    """Download the listing for every board game in a single site map file."""
    print "Processing %s" % filename
    numbers = []
    for line in open(filename):
        match = GAME_NUMBER.search(line)
        if match is not None:
            (number,) = match.groups()
            numbers.append(number)
        if len(numbers) >= BATCH_SIZE:
            try:
                made_request = download_boardgame_batch(numbers)
            except Exception, e:
                # Retry once on a transient failure.
                made_request = download_boardgame_batch(numbers)
            if made_request:
                time.sleep(1)
            numbers = []
    # Do one last, possibly short, batch.
    if len(numbers) > 0:
        download_boardgame_batch(numbers)

def crawl_geeklist_file(filename):
    """Download every geeklist mentioned in a single site map file."""
    for line in open(filename):
        match = GEEKLIST_NUMBER.search(line)
        if match is not None:
            number = match.groups()[0]
            made_request = download_geeklist(number)
            if made_request:
                time.sleep(0.5)

def crawl_boardgames():
    """Download the listing for every board game in the site map."""
    for filename in os.listdir(SITEMAP_DIRECTORY):
        if '_boardgame_' in filename:
            crawl_boardgame_file(os.path.join(SITEMAP_DIRECTORY, filename))

def crawl_geeklists():
    """Download every geeklist mentioned in the site map."""
    for filename in os.listdir(SITEMAP_DIRECTORY):
        if 'geeklist' in filename:
            crawl_geeklist_file(os.path.join(SITEMAP_DIRECTORY, filename))

crawl_boardgames()
crawl_geeklists()
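
Aside (an editor's sketch, not part of the original repo): the batch URLs
above come from fixed-size chunking of the game IDs, since the XML API accepts
a comma-separated list of IDs. A standalone version with made-up IDs:

BOARDGAME_URL = "http://boardgamegeek.com/xmlapi/boardgame/%s?comments=1&stats=1"
BATCH_SIZE = 20

ids = [str(n) for n in range(1, 45)]  # 44 made-up game IDs
for start in range(0, len(ids), BATCH_SIZE):
    batch = ids[start:start + BATCH_SIZE]
    # One request covers up to BATCH_SIZE games.
    print BOARDGAME_URL % ",".join(batch)
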
--------------------------------------------------------------------------------
/3-jsonify-games.py:
--------------------------------------------------------------------------------
from pdb import set_trace
import os
from game import Game
import jsonpickle
import datetime
import sys

INPUT_PATH = "BoardGameGeek.xml/%s/boardgame_batches/"
OUTPUT_PATH = "BoardGameGeek.json/%s/boardgame_batches/"

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

input_dir = INPUT_PATH % DATE_DIR
output_dir = OUTPUT_PATH % DATE_DIR
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for filename in sorted(os.listdir(input_dir)):
    output_filename = filename.replace(".xml", ".json")
    output_path = os.path.join(output_dir, output_filename)
    if os.path.exists(output_path):
        print "Skipping %s" % output_filename
    else:
        print "Writing %s" % output_path
        game = Game.from_xml(open(os.path.join(input_dir, filename)))
        open(output_path, "w").write(jsonpickle.encode(game))
--------------------------------------------------------------------------------
/4-jsonify-geeklists.py:
--------------------------------------------------------------------------------
from pdb import set_trace
import os
from game import Geeklist
import jsonpickle
import datetime
import sys

INPUT_PATH = "BoardGameGeek.xml/%s/geeklist/"
OUTPUT_PATH = "BoardGameGeek.json/%s/geeklist/"

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

input_dir = INPUT_PATH % DATE_DIR
output_dir = OUTPUT_PATH % DATE_DIR
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for filename in sorted(os.listdir(input_dir)):
    output_filename = filename.replace(".xml", ".json")
    output_path = os.path.join(output_dir, output_filename)
    if os.path.exists(output_path):
        print "Skipping %s" % output_filename
    else:
        input_path = os.path.join(input_dir, filename)
        print "%s -> %s" % (input_path, output_path)
        geeklist = Geeklist.from_xml(open(input_path))
        if geeklist is None:
            print "Couldn't scrape that one."
        else:
            open(output_path, "w").write(jsonpickle.encode(geeklist))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Board Game Geek data dump scripts
=================================

This is a series of hacky scripts that use BoardGameGeek's API to
download a complete dump of BoardGameGeek's database of board games
and the "Geeklists" created by site users. I run these scripts every
July and publish the historical data to track changes in the board
game industry and how people's feelings about games change over time.

Even if you need this data, you probably don't need to run these
scripts yourself, but here they are in case I stop doing this project
or whatever.

To get a complete data dump, run the scripts in order. Each script
takes an optional YYYYMM argument naming the dump; by default it uses
the current year and month. The first two scripts download the data;
the last two convert it from XML to a more usable JSON format. Every
step skips files that already exist, so an interrupted run can simply
be restarted.

Downloading the games will take a few hours; downloading the
geeklists takes considerably longer.
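As a minimal sketch of the round trip (the `202407` tag and the batch
filename below are illustrative, not guaranteed to exist):

    # Run, in order:
    #   python 1-download-sitemaps.py 202407
    #   python 2-crawl.py 202407
    #   python 3-jsonify-games.py 202407
    #   python 4-jsonify-geeklists.py 202407
    # Any converted record can then be loaded back with jsonpickle, provided
    # game.py is importable so the Game class can be reconstructed:
    import jsonpickle

    game = jsonpickle.decode(open(
        "BoardGameGeek.json/202407/boardgame_batches/boardgame-1-20.json").read())
    print game.name
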
--------------------------------------------------------------------------------
/game.py:
--------------------------------------------------------------------------------
from pdb import set_trace
from bs4 import BeautifulSoup
from dateutil.parser import parse as parse2
import time

def parse(date):
    """Parse a date string into a Unix timestamp."""
    dt = parse2(date)
    return time.mktime(dt.timetuple())

class Scraper(object):

    def contents_string(self, tag):
        if tag is None:
            return None
        value = tag.string
        if value is None:
            # It's more than one string.
            value = "\n".join(unicode(x).strip() for x in tag.contents)
        else:
            value = value.strip()
        return value

    def get_tag_value(self, soup, name):
        tag = soup.find(name)
        s = tag.string
        if not s:
            return None
        return s.strip()

    def get_from_soup(self, soup, attribute):
        value = None
        value_tag = getattr(soup, attribute, None)
        if value_tag is not None:
            value = self.contents_string(value_tag)
            value_tag.extract()
        return value

    def get_date_from_soup(self, soup, attribute):
        value = self.get_from_soup(soup, attribute)
        if value is None:
            return value
        return parse(value)

    def set_from_soup(self, soup, attribute):
        value = self.get_from_soup(soup, attribute)
        setattr(self, attribute, value)

    def get_number_from_soup(self, soup, attribute):
        value = self.get_from_soup(soup, attribute)
        if value is not None and value != '':
            value = int(value)
        return value

    def set_number_from_soup(self, soup, attribute):
        value = self.get_number_from_soup(soup, attribute)
        setattr(self, attribute, value)


class Game(Scraper):

    @classmethod
    def from_xml(cls, data):
        data = data.read()
        soup = BeautifulSoup(data, "xml", from_encoding="utf8").boardgame
        # Remove whitespace-only strings between tags.
        for i in soup:
            if isinstance(i, unicode) and i.strip() == '':
                i.extract()

        game = cls()
        game.objectid = soup['objectid']

        # Collect the game's names: primary name first, the rest sorted.
        game.name = []
        first_name = None
        for name_tag in soup.find_all("name", recursive=False):
            name = name_tag.string
            if name_tag.get('primary') == 'true':
                first_name = name.strip()
            else:
                game.name.append(name.strip())
            name_tag.extract()
        game.name.sort()
        if first_name is not None:
            game.name.insert(0, first_name)

        for attribute in (
            "yearpublished", "minplayers", "maxplayers", "playingtime",
            "age"):
            game.set_number_from_soup(soup, attribute)

        for attribute in ("description", "thumbnail"):
            game.set_from_soup(soup, attribute)

        i = soup.image
        if i:
            game.image = i.text
        else:
            game.image = None

        game.collect_comments(soup)

        for list_name in (
            'boardgamedesigner', 'boardgamepublisher',
            'boardgamecategory', 'boardgamesubdomain',
            'boardgamehonor', 'boardgamepodcastepisode',
            'boardgameversion', 'boardgamefamily', 'boardgamemechanic',
            'boardgameartist'):
            game.collect_list_from_soup(soup, list_name)

        game.collect_numplayers_poll(soup)
        game.collect_language_dependence_poll(soup)
        game.collect_player_age_poll(soup)

        # Flag any polls we don't have a specific handler for.
        game.collect_polls_from_soup(soup)

        game.collect_ranks(soup)
        game.collect_stats(soup)
        return game
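    # Illustrative only (an editor's sketch, not the official API schema):
    # from_xml above assumes each <boardgame> looks roughly like
    #
    #   <boardgame objectid="13">
    #     <name primary="true">Some Game</name>
    #     <yearpublished>1995</yearpublished>
    #     <boardgamedesigner objectid="11">Some Designer</boardgamedesigner>
    #     <poll name="suggested_numplayers">...</poll>
    #     <comment username="someone" rating="8">Loved it.</comment>
    #   </boardgame>
    #
    # Each handler extract()s the tags it consumes, so by the time
    # collect_polls_from_soup runs, only unrecognized polls remain.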
    def collect_numplayers_poll(self, soup):
        poll_tag = soup.find("poll", {"name": "suggested_numplayers"},
                             recursive=False)
        self.numplayers = {}
        if poll_tag is None:
            return
        for results in poll_tag.find_all('results'):
            key = results['numplayers']
            self.numplayers[key] = []
            for value in 'Best', 'Recommended', 'Not Recommended':
                result = results.find("result", value=value)
                if result is not None:
                    result = result['numvotes']
                # A missing option stays None so the positions line up.
                self.numplayers[key].append(result)
        poll_tag.extract()

    def collect_language_dependence_poll(self, soup):
        poll_tag = soup.find("poll", {"name": "language_dependence"},
                             recursive=False)
        self.language_dependence = {}
        if poll_tag is None:
            return
        for result in poll_tag.find_all('result'):
            key = int(result['level'])
            self.language_dependence[key] = int(result['numvotes'])
        poll_tag.extract()

    def collect_player_age_poll(self, soup):
        poll_tag = soup.find("poll", {"name": "suggested_playerage"},
                             recursive=False)
        self.suggested_player_age = {}
        if poll_tag is None:
            return
        for result in poll_tag.find_all('result'):
            key = result['value']
            self.suggested_player_age[key] = int(result['numvotes'])
        poll_tag.extract()

    def collect_polls_from_soup(self, soup):
        for poll_tag in soup.find_all("poll", recursive=False):
            print "WEIRD POLL DUDE %s" % unicode(poll_tag)

    def collect_comments(self, soup):
        self.comments = []
        for comment_tag in soup.find_all("comment", recursive=False):
            rating = comment_tag.get('rating')
            if rating is None or rating == 'N/A':
                rating = None
            else:
                rating = float(rating)
            comment = [
                comment_tag.get('username'), rating,
                "\n".join(unicode(x).strip() for x in comment_tag.contents)]
            self.comments.append(comment)
            comment_tag.extract()

    def float(self, v):
        """Like float(), but passes None and empty strings through as None."""
        if not v:
            return None
        return float(v)

    def collect_stats(self, soup):
        self.rating = {}
        self.rating['num_responses'] = self.get_tag_value(soup, 'usersrated')
        self.rating['average'] = self.float(self.get_tag_value(soup, 'average'))
        self.rating['bayes_average'] = self.float(
            self.get_tag_value(soup, 'bayesaverage'))
        self.rating['stddev'] = self.float(self.get_tag_value(soup, 'stddev'))
        self.rating['median'] = self.float(self.get_tag_value(soup, 'median'))

        self.weight = {}
        self.weight['num_responses'] = self.get_tag_value(soup, 'numweights')
        self.weight['average'] = self.get_tag_value(soup, 'averageweight')

        self.ownership = {}
        self.ownership['commented'] = self.get_tag_value(soup, 'numcomments')
        self.ownership['owned'] = self.get_tag_value(soup, 'owned')
        self.ownership['trading'] = self.get_tag_value(soup, 'trading')
        self.ownership['wanting'] = self.get_tag_value(soup, 'wanting')
        self.ownership['wishing'] = self.get_tag_value(soup, 'wishing')
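    # For example (tag and ID made up for illustration), a tag like
    #   <boardgamedesigner objectid="11">Jane Designer</boardgamedesigner>
    # ends up as game.designer == [["Jane Designer", "11"]]; the method
    # below strips the "boardgame" prefix when naming the attribute.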
    def collect_list_from_soup(self, soup, attribute):
        l = []
        for tag in soup.find_all(attribute, recursive=False):
            objectid = tag['objectid']
            contents = tag.string.strip()
            tag.extract()
            l.append([contents, objectid])
        new_attr = attribute
        if attribute.startswith('boardgame'):
            new_attr = attribute[len('boardgame'):]
        setattr(self, new_attr, l)

    def collect_ranks(self, soup):
        self.ranks = {}
        for rank in soup.find_all('rank'):
            name = rank['name']
            friendly_name = rank['friendlyname']
            value = rank['value']
            type = rank['type']
            bayes_average = rank['bayesaverage']
            self.ranks[name] = [friendly_name, value, type, bayes_average]
            rank.extract()


class Geeklist(Scraper):

    @classmethod
    def from_xml(cls, data):
        geek = cls()
        soup = BeautifulSoup(data, "xml", from_encoding="utf8")
        if soup.error is not None:
            return None
        soup = soup.geeklist
        if not soup:
            return None
        geek.gather_edits(soup)

        for key in 'username', 'title', 'description':
            geek.set_from_soup(soup, key)

        geek.postdate = geek.get_date_from_soup(soup, 'postdate')
        geek.set_number_from_soup(soup, 'thumbs')

        geek.gather_items(soup)
        geek.gather_comments(soup)
        return geek

    def gather_edits(self, soup):
        self.edits = []
        for timestamp in soup.find_all('editdate'):
            self.edits.append(parse(timestamp.string))

    def gather_items(self, soup):
        self.items = []
        for item_tag in soup.find_all('item', recursive=False):
            item = {}
            for key in (
                'objecttype', 'subtype', 'objectid', 'objectname',
                'username', 'thumbs', 'imageid'):
                item[key] = item_tag[key]
            item['postdate'] = parse(item_tag['postdate'])
            item['editdate'] = parse(item_tag['editdate'])
            item['body'] = self.contents_string(item_tag.body)
            item['comments'] = []
            for comment_tag in item_tag.find_all('comment', recursive=False):
                item['comments'].append(self.comment_from(comment_tag))
            self.items.append(item)

    def gather_comments(self, soup):
        self.comments = [
            self.comment_from(comment_tag) for comment_tag in
            soup.find_all('comment', recursive=False)]

    def comment_from(self, tag):
        comment = {}
        comment['username'] = tag['username']
        comment['thumbs'] = int(tag['thumbs'])
        comment['postdate'] = parse(tag['postdate'])
        comment['editdate'] = parse(tag['editdate'])
        comment['body'] = self.contents_string(tag)
        return comment
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
httplib2
jsonpickle
lxml
python-dateutil
--------------------------------------------------------------------------------