├── 1-download-sitemaps.py
├── 2-crawl.py
├── 3-jsonify-games.py
├── 4-jsonify-geeklists.py
├── README.md
├── game.py
└── requirements.txt

--------------------------------------------------------------------------------
/1-download-sitemaps.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from httplib2 import Http
import datetime
import os
import sys
import time

http = Http()

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

DUMP_DIR = os.path.join("BoardGameGeek.xml", DATE_DIR)

SITEMAP_DIRECTORY = os.path.join(DUMP_DIR, "maps")
if not os.path.exists(SITEMAP_DIRECTORY):
    os.makedirs(SITEMAP_DIRECTORY)

def req(*args, **kwargs):
    """Make an HTTP request, returning (None, None) rather than dying."""
    try:
        response, body = http.request(*args, **kwargs)
    except Exception, e:
        print "Could not request %r %r: %s" % (args, kwargs, e)
        return None, None
    return response, body

response, body = req('http://boardgamegeek.com/sitemapindex')

soup = BeautifulSoup(body, "lxml")
for loc in soup.find_all("loc"):
    url = loc.string.strip()
    filename = url[url.rindex("sitemap_")+len("sitemap_"):]
    path = os.path.join(SITEMAP_DIRECTORY, filename)
    if os.path.exists(path):
        # Already downloaded on a previous run; the crawl is resumable.
        continue
    print "%s -> %s" % (url, path)
    response, body = req(url)
    open(path, "w").write(body)
    time.sleep(1)
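
Aside (an editor's sketch, not part of the original repo): the sitemap handling
above boils down to pulling every <loc> URL out of the index document. A
minimal, self-contained version, using a made-up index; the real URLs and
file names will differ:

from bs4 import BeautifulSoup

SAMPLE_INDEX = """
<sitemapindex>
  <sitemap><loc>http://boardgamegeek.com/sitemap_geekitems_boardgame_page_1</loc></sitemap>
  <sitemap><loc>http://boardgamegeek.com/sitemap_geeklists_page_1</loc></sitemap>
</sitemapindex>
"""

soup = BeautifulSoup(SAMPLE_INDEX, "lxml")
for loc in soup.find_all("loc"):
    url = loc.string.strip()
    # Everything after the last "sitemap_" becomes the local file name.
    print url[url.rindex("sitemap_") + len("sitemap_"):]
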
--------------------------------------------------------------------------------
/2-crawl.py:
--------------------------------------------------------------------------------
from httplib2 import Http
import datetime
import os
import re
import sys
import time

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

DUMP_DIR = os.path.join("BoardGameGeek.xml", DATE_DIR)

SITEMAP_DIRECTORY = os.path.join(DUMP_DIR, "maps")
GAME_OUTPUT_DIRECTORY = os.path.join(DUMP_DIR, "boardgame_batches")
GEEKLIST_OUTPUT_DIRECTORY = os.path.join(DUMP_DIR, "geeklist")
GAME_NUMBER = re.compile("/boardgame/([0-9]+)/")
GEEKLIST_NUMBER = re.compile("/geeklist/([0-9]+)/")

for d in GAME_OUTPUT_DIRECTORY, GEEKLIST_OUTPUT_DIRECTORY:
    if not os.path.exists(d):
        os.makedirs(d)

BATCH_SIZE = 20

BOARDGAME_URL = "http://boardgamegeek.com/xmlapi/boardgame/%s?comments=1&stats=1"
GEEKLIST_URL = "http://boardgamegeek.com/xmlapi/geeklist/%s?comments=1"

http = Http()

def req(*args, **kwargs):
    """Make an HTTP request, returning (None, None) rather than dying."""
    try:
        response, body = http.request(*args, **kwargs)
    except Exception, e:
        print "Could not request %r %r: %s" % (args, kwargs, e)
        return None, None
    return response, body

def download_geeklist(number):
    filename = os.path.join(GEEKLIST_OUTPUT_DIRECTORY, "geeklist-%s.xml" % number)
    if os.path.exists(filename):
        print "Skipping %s" % filename
        return False
    url = GEEKLIST_URL % number
    if number in ("36742", "35076", "34435", "30058", "29485", "16221",
                  "8785", "4368", "49088"):
        # For whatever reason these are known to be bad.
        return False
    print "Downloading geeklist %s" % number
    response, body = req(
        url, "GET", headers = {
            "Accept-Encoding": "gzip,deflate",
            "User-Agent" : "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" })
    if body is not None:
        open(filename, "w").write(body)
    return True

def download_boardgame_batch(numbers):
    url = BOARDGAME_URL % ",".join(numbers)
    if len(numbers) == 1:
        filename = "boardgame-" + numbers[0] + ".xml"
    else:
        filename = "boardgame-" + numbers[0] + "-" + numbers[-1] + ".xml"
    path = os.path.join(GAME_OUTPUT_DIRECTORY, filename)
    if os.path.exists(path):
        print "Skipping %s, already present." % path
        return False
    print filename, url
    response, body = req(url, "GET", headers={
        "Accept-Encoding": "gzip,deflate",
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i586; de; rv:5.0) Gecko/20100101 Firefox/5.0" })
    if body is not None:
        open(path, "w").write(body)
    return True

def crawl_boardgame_file(filename):
    """Download the listing for every board game in a single site map file."""
    print "Processing %s" % filename
    numbers = []
    for line in open(filename):
        match = GAME_NUMBER.search(line)
        if match is not None:
            (number,) = match.groups()
            numbers.append(number)
        if len(numbers) >= BATCH_SIZE:
            try:
                made_request = download_boardgame_batch(numbers)
            except Exception, e:
                # Retry once on a transient failure.
                made_request = download_boardgame_batch(numbers)
            if made_request:
                time.sleep(1)
            numbers = []
    # Do one last, possibly short, batch.
    if len(numbers) > 0:
        download_boardgame_batch(numbers)

def crawl_geeklist_file(filename):
    """Download every geeklist mentioned in a single site map file."""
    for line in open(filename):
        match = GEEKLIST_NUMBER.search(line)
        if match is not None:
            number = match.groups()[0]
            made_request = download_geeklist(number)
            if made_request:
                time.sleep(0.5)

def crawl_boardgames():
    """Download the listing for every board game in the site map."""
    for filename in os.listdir(SITEMAP_DIRECTORY):
        if '_boardgame_' in filename:
            crawl_boardgame_file(os.path.join(SITEMAP_DIRECTORY, filename))

def crawl_geeklists():
    """Download every geeklist mentioned in the site map."""
    for filename in os.listdir(SITEMAP_DIRECTORY):
        if 'geeklist' in filename:
            crawl_geeklist_file(os.path.join(SITEMAP_DIRECTORY, filename))

crawl_boardgames()
crawl_geeklists()
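
Aside (an editor's sketch, not part of the original repo): the batch URLs
above come from fixed-size chunking of the game IDs, since the XML API accepts
a comma-separated list of IDs. A standalone version with made-up IDs:

BOARDGAME_URL = "http://boardgamegeek.com/xmlapi/boardgame/%s?comments=1&stats=1"
BATCH_SIZE = 20

ids = [str(n) for n in range(1, 45)]  # 44 made-up game IDs
for start in range(0, len(ids), BATCH_SIZE):
    batch = ids[start:start + BATCH_SIZE]
    # One request covers up to BATCH_SIZE games.
    print BOARDGAME_URL % ",".join(batch)
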
--------------------------------------------------------------------------------
/3-jsonify-games.py:
--------------------------------------------------------------------------------
from pdb import set_trace
import os
from game import Game
import jsonpickle
import datetime
import sys

INPUT_PATH = "BoardGameGeek.xml/%s/boardgame_batches/"
OUTPUT_PATH = "BoardGameGeek.json/%s/boardgame_batches/"

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

input_dir = INPUT_PATH % DATE_DIR
output_dir = OUTPUT_PATH % DATE_DIR
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for filename in sorted(os.listdir(input_dir)):
    output_filename = filename.replace(".xml", ".json")
    output_path = os.path.join(output_dir, output_filename)
    if os.path.exists(output_path):
        print "Skipping %s" % output_filename
    else:
        print "Writing %s" % output_path
        game = Game.from_xml(open(os.path.join(input_dir, filename)))
        open(output_path, "w").write(jsonpickle.encode(game))
--------------------------------------------------------------------------------
/4-jsonify-geeklists.py:
--------------------------------------------------------------------------------
from pdb import set_trace
import os
from game import Geeklist
import jsonpickle
import datetime
import sys

INPUT_PATH = "BoardGameGeek.xml/%s/geeklist/"
OUTPUT_PATH = "BoardGameGeek.json/%s/geeklist/"

if len(sys.argv) > 1:
    DATE_DIR = sys.argv[1]
else:
    DATE_DIR = datetime.datetime.now().strftime("%Y%m")

input_dir = INPUT_PATH % DATE_DIR
output_dir = OUTPUT_PATH % DATE_DIR
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for filename in sorted(os.listdir(input_dir)):
    output_filename = filename.replace(".xml", ".json")
    output_path = os.path.join(output_dir, output_filename)
    if os.path.exists(output_path):
        print "Skipping %s" % output_filename
    else:
        input_path = os.path.join(input_dir, filename)
        print "%s -> %s" % (input_path, output_path)
        geeklist = Geeklist.from_xml(open(input_path))
        if geeklist is None:
            print "Couldn't scrape that one."
        else:
            open(output_path, "w").write(jsonpickle.encode(geeklist))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Board Game Geek data dump scripts
=================================

This is a series of hacky scripts that use BoardGameGeek's API to
download a complete dump of BoardGameGeek's database of board games
and the "Geeklists" created by site users. I run these scripts every
July and publish the historical data to track changes in the board
game industry and how people's feelings about games change over time.

Even if you need this data, you probably don't need to run these
scripts yourself, but here they are in case I stop doing this project
or whatever.

To get a complete data dump, run the scripts in order. Each script
takes an optional YYYYMM argument naming the dump; by default it uses
the current year and month. The first two scripts download the data;
the last two convert it from XML to a more usable JSON format. Every
step skips files that already exist, so an interrupted run can simply
be restarted.

Downloading the games will take a few hours; downloading the
geeklists takes considerably longer.
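As a minimal sketch of the round trip (the `202407` tag and the batch
filename below are illustrative, not guaranteed to exist):

    # Run, in order:
    #   python 1-download-sitemaps.py 202407
    #   python 2-crawl.py 202407
    #   python 3-jsonify-games.py 202407
    #   python 4-jsonify-geeklists.py 202407
    # Any converted record can then be loaded back with jsonpickle, provided
    # game.py is importable so the Game class can be reconstructed:
    import jsonpickle

    game = jsonpickle.decode(open(
        "BoardGameGeek.json/202407/boardgame_batches/boardgame-1-20.json").read())
    print game.name
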
--------------------------------------------------------------------------------
/game.py:
--------------------------------------------------------------------------------
from pdb import set_trace
from bs4 import BeautifulSoup
from dateutil.parser import parse as parse2
import time

def parse(date):
    """Parse a date string into a Unix timestamp."""
    dt = parse2(date)
    return time.mktime(dt.timetuple())

class Scraper(object):

    def contents_string(self, tag):
        if tag is None:
            return None
        value = tag.string
        if value is None:
            # It's more than one string.
            value = "\n".join(unicode(x).strip() for x in tag.contents)
        else:
            value = value.strip()
        return value

    def get_tag_value(self, soup, name):
        tag = soup.find(name)
        s = tag.string
        if not s:
            return None
        return s.strip()

    def get_from_soup(self, soup, attribute):
        value = None
        value_tag = getattr(soup, attribute, None)
        if value_tag is not None:
            value = self.contents_string(value_tag)
            value_tag.extract()
        return value

    def get_date_from_soup(self, soup, attribute):
        value = self.get_from_soup(soup, attribute)
        if value is None:
            return value
        return parse(value)

    def set_from_soup(self, soup, attribute):
        value = self.get_from_soup(soup, attribute)
        setattr(self, attribute, value)

    def get_number_from_soup(self, soup, attribute):
        value = self.get_from_soup(soup, attribute)
        if value is not None and value != '':
            value = int(value)
        return value

    def set_number_from_soup(self, soup, attribute):
        value = self.get_number_from_soup(soup, attribute)
        setattr(self, attribute, value)


class Game(Scraper):

    @classmethod
    def from_xml(cls, data):
        data = data.read()
        soup = BeautifulSoup(data, "xml", from_encoding="utf8").boardgame
        # Remove whitespace-only strings between tags.
        for i in soup:
            if isinstance(i, unicode) and i.strip() == '':
                i.extract()

        game = cls()
        game.objectid = soup['objectid']

        # Collect the game's names: primary name first, the rest sorted.
        game.name = []
        first_name = None
        for name_tag in soup.find_all("name", recursive=False):
            name = name_tag.string
            if name_tag.get('primary') == 'true':
                first_name = name.strip()
            else:
                game.name.append(name.strip())
            name_tag.extract()
        game.name.sort()
        if first_name is not None:
            game.name.insert(0, first_name)

        for attribute in (
            "yearpublished", "minplayers", "maxplayers", "playingtime",
            "age"):
            game.set_number_from_soup(soup, attribute)

        for attribute in ("description", "thumbnail"):
            game.set_from_soup(soup, attribute)

        i = soup.image
        if i:
            game.image = i.text
        else:
            game.image = None

        game.collect_comments(soup)

        for list_name in (
            'boardgamedesigner', 'boardgamepublisher',
            'boardgamecategory', 'boardgamesubdomain',
            'boardgamehonor', 'boardgamepodcastepisode',
            'boardgameversion', 'boardgamefamily', 'boardgamemechanic',
            'boardgameartist'):
            game.collect_list_from_soup(soup, list_name)

        game.collect_numplayers_poll(soup)
        game.collect_language_dependence_poll(soup)
        game.collect_player_age_poll(soup)

        # Flag any polls we don't have a specific handler for.
        game.collect_polls_from_soup(soup)

        game.collect_ranks(soup)
        game.collect_stats(soup)
        return game
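    # Illustrative only (an editor's sketch, not the official API schema):
    # from_xml above assumes each <boardgame> looks roughly like
    #
    #   <boardgame objectid="13">
    #     <name primary="true">Some Game</name>
    #     <yearpublished>1995</yearpublished>
    #     <boardgamedesigner objectid="11">Some Designer</boardgamedesigner>
    #     <poll name="suggested_numplayers">...</poll>
    #     <comment username="someone" rating="8">Loved it.</comment>
    #   </boardgame>
    #
    # Each handler extract()s the tags it consumes, so by the time
    # collect_polls_from_soup runs, only unrecognized polls remain.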
    def collect_numplayers_poll(self, soup):
        poll_tag = soup.find("poll", {"name": "suggested_numplayers"},
                             recursive=False)
        self.numplayers = {}
        if poll_tag is None:
            return
        for results in poll_tag.find_all('results'):
            key = results['numplayers']
            self.numplayers[key] = []
            for value in 'Best', 'Recommended', 'Not Recommended':
                result = results.find("result", value=value)
                if result is not None:
                    result = result['numvotes']
                # A missing option stays None so the positions line up.
                self.numplayers[key].append(result)
        poll_tag.extract()

    def collect_language_dependence_poll(self, soup):
        poll_tag = soup.find("poll", {"name": "language_dependence"},
                             recursive=False)
        self.language_dependence = {}
        if poll_tag is None:
            return
        for result in poll_tag.find_all('result'):
            key = int(result['level'])
            self.language_dependence[key] = int(result['numvotes'])
        poll_tag.extract()

    def collect_player_age_poll(self, soup):
        poll_tag = soup.find("poll", {"name": "suggested_playerage"},
                             recursive=False)
        self.suggested_player_age = {}
        if poll_tag is None:
            return
        for result in poll_tag.find_all('result'):
            key = result['value']
            self.suggested_player_age[key] = int(result['numvotes'])
        poll_tag.extract()

    def collect_polls_from_soup(self, soup):
        for poll_tag in soup.find_all("poll", recursive=False):
            print "WEIRD POLL DUDE %s" % unicode(poll_tag)

    def collect_comments(self, soup):
        self.comments = []
        for comment_tag in soup.find_all("comment", recursive=False):
            rating = comment_tag.get('rating')
            if rating is None or rating == 'N/A':
                rating = None
            else:
                rating = float(rating)
            comment = [
                comment_tag.get('username'), rating,
                "\n".join(unicode(x).strip() for x in comment_tag.contents)]
            self.comments.append(comment)
            comment_tag.extract()

    def float(self, v):
        """Like float(), but passes None and empty strings through as None."""
        if not v:
            return None
        return float(v)

    def collect_stats(self, soup):
        self.rating = {}
        self.rating['num_responses'] = self.get_tag_value(soup, 'usersrated')
        self.rating['average'] = self.float(self.get_tag_value(soup, 'average'))
        self.rating['bayes_average'] = self.float(
            self.get_tag_value(soup, 'bayesaverage'))
        self.rating['stddev'] = self.float(self.get_tag_value(soup, 'stddev'))
        self.rating['median'] = self.float(self.get_tag_value(soup, 'median'))

        self.weight = {}
        self.weight['num_responses'] = self.get_tag_value(soup, 'numweights')
        self.weight['average'] = self.get_tag_value(soup, 'averageweight')

        self.ownership = {}
        self.ownership['commented'] = self.get_tag_value(soup, 'numcomments')
        self.ownership['owned'] = self.get_tag_value(soup, 'owned')
        self.ownership['trading'] = self.get_tag_value(soup, 'trading')
        self.ownership['wanting'] = self.get_tag_value(soup, 'wanting')
        self.ownership['wishing'] = self.get_tag_value(soup, 'wishing')
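    # For example (tag and ID made up for illustration), a tag like
    #   <boardgamedesigner objectid="11">Jane Designer</boardgamedesigner>
    # ends up as game.designer == [["Jane Designer", "11"]]; the method
    # below strips the "boardgame" prefix when naming the attribute.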
    def collect_list_from_soup(self, soup, attribute):
        l = []
        for tag in soup.find_all(attribute, recursive=False):
            objectid = tag['objectid']
            contents = tag.string.strip()
            tag.extract()
            l.append([contents, objectid])
        new_attr = attribute
        if attribute.startswith('boardgame'):
            new_attr = attribute[len('boardgame'):]
        setattr(self, new_attr, l)

    def collect_ranks(self, soup):
        self.ranks = {}
        for rank in soup.find_all('rank'):
            name = rank['name']
            friendly_name = rank['friendlyname']
            value = rank['value']
            type = rank['type']
            bayes_average = rank['bayesaverage']
            self.ranks[name] = [friendly_name, value, type, bayes_average]
            rank.extract()


class Geeklist(Scraper):

    @classmethod
    def from_xml(cls, data):
        geek = cls()
        soup = BeautifulSoup(data, "xml", from_encoding="utf8")
        if soup.error is not None:
            return None
        soup = soup.geeklist
        if not soup:
            return None
        geek.gather_edits(soup)

        for key in 'username', 'title', 'description':
            geek.set_from_soup(soup, key)

        geek.postdate = geek.get_date_from_soup(soup, 'postdate')
        geek.set_number_from_soup(soup, 'thumbs')

        geek.gather_items(soup)
        geek.gather_comments(soup)
        return geek

    def gather_edits(self, soup):
        self.edits = []
        for timestamp in soup.find_all('editdate'):
            self.edits.append(parse(timestamp.string))

    def gather_items(self, soup):
        self.items = []
        for item_tag in soup.find_all('item', recursive=False):
            item = {}
            for key in (
                'objecttype', 'subtype', 'objectid', 'objectname',
                'username', 'thumbs', 'imageid'):
                item[key] = item_tag[key]
            item['postdate'] = parse(item_tag['postdate'])
            item['editdate'] = parse(item_tag['editdate'])
            item['body'] = self.contents_string(item_tag.body)
            item['comments'] = []
            for comment_tag in item_tag.find_all('comment', recursive=False):
                item['comments'].append(self.comment_from(comment_tag))
            self.items.append(item)

    def gather_comments(self, soup):
        self.comments = [
            self.comment_from(comment_tag) for comment_tag in
            soup.find_all('comment', recursive=False)]

    def comment_from(self, tag):
        comment = {}
        comment['username'] = tag['username']
        comment['thumbs'] = int(tag['thumbs'])
        comment['postdate'] = parse(tag['postdate'])
        comment['editdate'] = parse(tag['editdate'])
        comment['body'] = self.contents_string(tag)
        return comment
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4
httplib2
jsonpickle
lxml
python-dateutil
--------------------------------------------------------------------------------