├── .gitignore
├── LICENSE
├── README.md
├── README_DEPLOY
├── espn_scraper
│   ├── __init__.py
│   └── tests
│       ├── __init__.py
│       └── test_espn.py
├── example.py
├── setup.cfg
├── setup.py
└── tox.ini

/.gitignore:
--------------------------------------------------------------------------------
1 | /dist
2 | /build
3 | /.eggs
4 | /*.egg
5 | /*.egg-info
6 | *.pyc
7 | /venv
8 | *.log
9 | /.ipynb_checkpoints
10 | *.ipynb
11 | *.swp
12 | /cached_json
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2013 Thomas Park
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This library scrapes espn.com scoreboards, boxscores, and play-by-plays for most major sports: NFL, MLB, NBA, NCAAF, NCAAB, NCAAW, WNBA, and NHL. It can optionally save the data locally for quick lookup later. All of the functions can be found in `espn_scraper/__init__.py`. Some example usage can be found in `example.py` or in the README below.
2 | 
3 | Other ESPN API endpoints can be found at https://gist.github.com/nntrn/ee26cb2a0716de0947a0a4e9a157bc1c
4 | 
5 | Test coverage is currently pretty minimal. I would welcome a pull request adding more robust tests in `espn_scraper/tests/test_espn.py`.
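To run the existing tests locally you can use tox, which is configured in `tox.ini` below. A minimal sketch (it assumes a python3.10 interpreter is available, since that is the configured environment):

```
$ pip3 install tox
$ tox r
```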
6 | 
7 | ## Example Usage
8 | 
9 | Install espn_scraper
10 | 
11 | `pip3 install espn_scraper`
12 | 
13 | Start a python shell
14 | `$ python3`
15 | 
16 | Import the package
17 | `>>> import espn_scraper as espn`
18 | 
19 | To view the supported leagues
20 | ```
21 | >>> espn.get_leagues()
22 | ['nfl', 'ncf', 'mlb', 'nba', 'ncb', 'ncw', 'wnba', 'nhl']
23 | ```
24 | 
25 | Before retrieving ESPN data, let's write a pretty JSON printer function
26 | ```
27 | import json
28 | def ppjson(data):
29 |     print(json.dumps(data, indent=2, sort_keys=True))
30 | ```
31 | 
32 | Now let's use it to print the current NFL team names and their abbreviations
33 | ```
34 | >>> ppjson(espn.get_teams("nfl"))
35 | https://www.espn.com/nfl/teams
36 | [
37 |   {
38 |     "id": "buf",
39 |     "name": "Buffalo Bills"
40 |   },
41 |   {
42 |     "id": "mia",
43 |     "name": "Miami Dolphins"
44 |   },
45 |   {
46 |     "id": "ne",
47 |     "name": "New England Patriots"
48 |   },
49 |   {
50 |     "id": "nyj",
51 |     "name": "New York Jets"
52 |   },
53 |   {
54 |     "id": "bal",
55 |     "name": "Baltimore Ravens"
56 |   },
57 |   {
58 |     "id": "cin",
59 |     "name": "Cincinnati Bengals"
60 |   },
61 |   {
62 |     "id": "cle",
63 |     "name": "Cleveland Browns"
64 |   },
65 |   {
66 |     "id": "pit",
67 |     "name": "Pittsburgh Steelers"
68 |   },
69 |   {
70 |     "id": "hou",
71 |     "name": "Houston Texans"
72 |   },
73 |   {
74 |     "id": "ind",
75 |     "name": "Indianapolis Colts"
76 |   },
77 |   {
78 |     "id": "jax",
79 |     "name": "Jacksonville Jaguars"
80 |   },
81 |   {
82 |     "id": "ten",
83 |     "name": "Tennessee Titans"
84 |   },
85 |   {
86 |     "id": "den",
87 |     "name": "Denver Broncos"
88 |   },
89 |   {
90 |     "id": "kc",
91 |     "name": "Kansas City Chiefs"
92 |   },
93 |   {
94 |     "id": "lac",
95 |     "name": "Los Angeles Chargers"
96 |   },
97 |   {
98 |     "id": "oak",
99 |     "name": "Oakland Raiders"
100 |   },
101 |   {
102 |     "id": "dal",
103 |     "name": "Dallas Cowboys"
104 |   },
105 |   {
106 |     "id": "nyg",
107 |     "name": "New York Giants"
108 |   },
109 |   {
110 |     "id": "phi",
111 |     "name": "Philadelphia Eagles"
112 |   },
113 |   {
114 |     "id": "wsh",
115 |     "name": "Washington Redskins"
116 |   },
117 |   {
118 |     "id": "chi",
119 |     "name": "Chicago Bears"
120 |   },
121 |   {
122 |     "id": "det",
123 |     "name": "Detroit Lions"
124 |   },
125 |   {
126 |     "id": "gb",
127 |     "name": "Green Bay Packers"
128 |   },
129 |   {
130 |     "id": "min",
131 |     "name": "Minnesota Vikings"
132 |   },
133 |   {
134 |     "id": "atl",
135 |     "name": "Atlanta Falcons"
136 |   },
137 |   {
138 |     "id": "car",
139 |     "name": "Carolina Panthers"
140 |   },
141 |   {
142 |     "id": "no",
143 |     "name": "New Orleans Saints"
144 |   },
145 |   {
146 |     "id": "tb",
147 |     "name": "Tampa Bay Buccaneers"
148 |   },
149 |   {
150 |     "id": "ari",
151 |     "name": "Arizona Cardinals"
152 |   },
153 |   {
154 |     "id": "lar",
155 |     "name": "Los Angeles Rams"
156 |   },
157 |   {
158 |     "id": "sf",
159 |     "name": "San Francisco 49ers"
160 |   },
161 |   {
162 |     "id": "sea",
163 |     "name": "Seattle Seahawks"
164 |   }
165 | ]
166 | ```
167 | 
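Since `get_teams()` returns a plain list of dicts, you can build your own lookup structures from it. A minimal sketch (`names_by_id` is just an illustrative name):

```
>>> teams = espn.get_teams("nfl")
https://www.espn.com/nfl/teams
>>> names_by_id = {team["id"]: team["name"] for team in teams}
>>> names_by_id["gb"]
'Green Bay Packers'
```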
168 | To get the teams and their abbreviations for an older season, use the `get_standings(league, season_year)` function. For example, see the old NBA divisions from 2004, including the Seattle SuperSonics
169 | ```
170 | >>> ppjson(espn.get_standings("nba", 2004))
171 | https://www.espn.com/nba/standings/_/season/2004/group/division
172 | {
173 |   "conferences": {
174 |     "Eastern Conference": {
175 |       "divisions": {
176 |         "Atlantic": {
177 |           "teams": [
178 |             {
179 |               "abbr": "",
180 |               "name": "New Jersey Nets"
181 |             },
182 |             {
183 |               "abbr": "MIA",
184 |               "name": "Miami Heat"
185 |             },
186 |             {
187 |               "abbr": "NY",
188 |               "name": "New York Knicks"
189 |             },
190 |             {
191 |               "abbr": "BOS",
192 |               "name": "Boston Celtics"
193 |             },
194 |             {
195 |               "abbr": "PHI",
196 |               "name": "Philadelphia 76ers"
197 |             },
198 |             {
199 |               "abbr": "WSH",
200 |               "name": "Washington Wizards"
201 |             },
202 |             {
203 |               "abbr": "ORL",
204 |               "name": "Orlando Magic"
205 |             }
206 |           ]
207 |         },
208 |         "Central": {
209 |           "teams": [
210 |             {
211 |               "abbr": "IND",
212 |               "name": "Indiana Pacers"
213 |             },
214 |             {
215 |               "abbr": "DET",
216 |               "name": "Detroit Pistons"
217 |             },
218 |             {
219 |               "abbr": "",
220 |               "name": "New Orleans Hornets"
221 |             },
222 |             {
223 |               "abbr": "MIL",
224 |               "name": "Milwaukee Bucks"
225 |             },
226 |             {
227 |               "abbr": "CLE",
228 |               "name": "Cleveland Cavaliers"
229 |             },
230 |             {
231 |               "abbr": "TOR",
232 |               "name": "Toronto Raptors"
233 |             },
234 |             {
235 |               "abbr": "ATL",
236 |               "name": "Atlanta Hawks"
237 |             },
238 |             {
239 |               "abbr": "CHI",
240 |               "name": "Chicago Bulls"
241 |             }
242 |           ]
243 |         },
244 |         "Southeast": {
245 |           "teams": []
246 |         }
247 |       }
248 |     },
249 |     "Western Conference": {
250 |       "divisions": {
251 |         "Midwest": {
252 |           "teams": [
253 |             {
254 |               "abbr": "MIN",
255 |               "name": "Minnesota Timberwolves"
256 |             },
257 |             {
258 |               "abbr": "SA",
259 |               "name": "San Antonio Spurs"
260 |             },
261 |             {
262 |               "abbr": "DAL",
263 |               "name": "Dallas Mavericks"
264 |             },
265 |             {
266 |               "abbr": "MEM",
267 |               "name": "Memphis Grizzlies"
268 |             },
269 |             {
270 |               "abbr": "HOU",
271 |               "name": "Houston Rockets"
272 |             },
273 |             {
274 |               "abbr": "DEN",
275 |               "name": "Denver Nuggets"
276 |             },
277 |             {
278 |               "abbr": "UTAH",
279 |               "name": "Utah Jazz"
280 |             }
281 |           ]
282 |         },
283 |         "Pacific": {
284 |           "teams": [
285 |             {
286 |               "abbr": "LAL",
287 |               "name": "Los Angeles Lakers"
288 |             },
289 |             {
290 |               "abbr": "SAC",
291 |               "name": "Sacramento Kings"
292 |             },
293 |             {
294 |               "abbr": "POR",
295 |               "name": "Portland Trail Blazers"
296 |             },
297 |             {
298 |               "abbr": "GS",
299 |               "name": "Golden State Warriors"
300 |             },
301 |             {
302 |               "abbr": "",
303 |               "name": "Seattle SuperSonics"
304 |             },
305 |             {
306 |               "abbr": "PHX",
307 |               "name": "Phoenix Suns"
308 |             },
309 |             {
310 |               "abbr": "LAC",
311 |               "name": "LA Clippers"
312 |             }
313 |           ]
314 |         }
315 |       }
316 |     }
317 |   }
318 | }
319 | ```
320 | 
321 | ESPN JSON data can usually be found by appending "&_xhr=1" to the urls. For example, the HTML for a recent NFL playoff game is at
322 | 
323 | https://www.espn.com/nfl/boxscore?gameId=401131040 and the JSON link is
324 | 
325 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
326 | 
327 | To retrieve the JSON data we can run
328 | ```
329 | >>> espn.get_url("https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1")
330 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
331 | {'gameId': 401131040, 'DTCpackages': {'p...
332 | ```
333 | 
334 | You'll notice that the retrieved url was printed to the console, which means espn_scraper hit espn.com with a live request. If you'll be making many of these requests to parse the data, it's best to download and cache the data locally.
335 | 
336 | First make a directory to hold the cached data
337 | `mkdir cached_data`
338 | 
339 | Then pass the cached_data folder path to `espn.get_url()` as an argument
340 | ```
341 | >>> data = espn.get_url("https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1", "cached_data")
342 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
343 | ```
344 | 
345 | This JSON is now stored locally
346 | ```
347 | $ ls cached_data/nfl/boxscore/
348 | 'https:||www.espn.com|nfl|boxscore?gameId=401131040&_xhr=1.json'
349 | ```
350 | 
351 | If we request this same boxscore again and pass the cached data folder, it will be served locally from the saved JSON file
352 | ```
353 | >>> data = espn.get_url("https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1", "cached_data")
354 | >>> 
355 | ```
356 | 
357 | Notice that no url is printed after the request, indicating that no request was made to any outside url.
358 | 
359 | If you know the espn game id, you can get the JSON recap, boxscore, playbyplay, conversation, or gamecast data, e.g.
360 | 
361 | ```
362 | >>> for data_type in ["recap", "boxscore", "playbyplay", "conversation", "gamecast"]:
363 | ...     url = espn.get_game_url(data_type, "nfl", 401131040)
364 | ...     data = espn.get_url(url)
365 | ... 
366 | https://www.espn.com/nfl/recap?gameId=401131040&_xhr=1
367 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
368 | https://www.espn.com/nfl/playbyplay?gameId=401131040&_xhr=1
369 | https://www.espn.com/nfl/conversation?gameId=401131040&_xhr=1
370 | https://www.espn.com/nfl/gamecast?gameId=401131040&_xhr=1
371 | ```
372 | If you want to get all the game ids for a season you can use the `get_all_scoreboard_urls(league, season_year)` function
373 | ```
374 | >>> espn.get_all_scoreboard_urls("nba", 2016)
375 | https://www.espn.com/nba/scoreboard/_/date/20151101?_xhr=1
376 | ['https://www.espn.com/nba/scoreboard/_/date/20151001?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20151002?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20151003?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20151004?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20...
377 | ```
378 | 
379 | You can then get all the espn game_ids for a season by parsing the events for each scoreboard url. For example, after setting `scoreboard_urls = espn.get_all_scoreboard_urls("nfl", 2016)`:
380 | ```
381 | >>> game_ids = []
382 | >>> for scoreboard_url in scoreboard_urls:
383 | ...     data = espn.get_url(scoreboard_url, cached_path="cached_data")
384 | ...     for event in data['content']['sbData']['events']:
385 | ...         if event['id'] not in game_ids:
386 | ...             game_ids.append(event['id'])
387 | ... 
388 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/1?_xhr=1
389 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2?_xhr=1
390 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/3?_xhr=1
391 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/4?_xhr=1
392 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/5?_xhr=1
393 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/1?_xhr=1
394 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/2?_xhr=1
395 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/3?_xhr=1
396 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/4?_xhr=1
397 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/5?_xhr=1
398 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/6?_xhr=1
399 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/7?_xhr=1
400 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/8?_xhr=1
401 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/9?_xhr=1
402 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/10?_xhr=1
403 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/11?_xhr=1
404 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/12?_xhr=1
405 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/13?_xhr=1
406 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/14?_xhr=1
407 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/15?_xhr=1
408 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/16?_xhr=1
409 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/17?_xhr=1
410 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/1?_xhr=1
411 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/2?_xhr=1
412 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/3?_xhr=1
413 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/4?_xhr=1
414 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/5?_xhr=1
415 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/4/week/1?_xhr=1
416 | >>> print(game_ids)
417 | ['400868831', '400874795', '400874854',...
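418 | ```

With the game ids in hand, you can pull and cache any of the game-level feeds for the whole season. A minimal sketch combining the pieces above (it assumes the `game_ids` list and `cached_data` directory from this walkthrough):

```
>>> for game_id in game_ids:
...     url = espn.get_game_url("boxscore", "nfl", game_id)
...     data = espn.get_url(url, "cached_data")
... 
```

The first pass makes one request per game; reruns are then served entirely from the local json files.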
419 | 
--------------------------------------------------------------------------------
/README_DEPLOY:
--------------------------------------------------------------------------------
1 | # update setup.py to the new version
2 | python3 setup.py sdist bdist_wheel --universal
3 | twine upload --repository espn_scraper dist/*
4 | 
5 | # test code
6 | tox r
7 | 
--------------------------------------------------------------------------------
/espn_scraper/__init__.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pytz
3 | from dateutil import parser
4 | from dateutil.relativedelta import relativedelta
5 | import datetime
6 | import os.path
7 | import requests
8 | from bs4 import BeautifulSoup
9 | BASE_URL = "https://www.espn.com"
10 | QUERY_STRING = "_xhr=1"
11 | # ESPN seems to be blocking requests with default blank headers
12 | DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"}
13 | API_v2_BASE_URL = "https://site.api.espn.com/apis/site/v2/sports"
14 | 
15 | ## General functions
16 | def retry_request(url, headers=DEFAULT_HEADERS):
17 |     """Get a url and return the response, retrying up to 3 times if it fails initially"""
18 |     session = requests.Session()
19 |     session.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))  # mount https, the scheme BASE_URL actually uses
20 |     res = session.get(url=url, allow_redirects=True, headers=headers)
21 |     session.close()
22 |     return res
23 | 
24 | def get_soup(res):
25 |     return BeautifulSoup(res.text, "lxml")
26 | 
27 | def get_new_json(url, headers=DEFAULT_HEADERS):
28 |     print(url)
29 |     res = retry_request(url, headers)
30 |     if res.status_code == 200:
31 |         return res.json()
32 |     else:
33 |         print("ERROR:", res.status_code)
34 |         return {"error_code": res.status_code, "error_msg": "URL Error"}
35 | 
36 | def get_new_html_soup(url, headers=DEFAULT_HEADERS):
37 |     print(url)
38 |     res = retry_request(url, headers)
39 |     if res.status_code == 200:
40 |         return get_soup(res)
41 |     else:
42 |         print("ERROR: ESPN", res.status_code)
43 |         return {"error_code": res.status_code, "error_msg": "ESPN Error"}
44 | 
45 | ## Get constants
46 | def get_date_leagues():
47 |     return ["mlb","nba","ncb","ncw","wnba","nhl"]
48 | 
49 | def longify_league(league):
50 |     if league == "ncb":
51 |         league = "mens-college-basketball"
52 |     elif league == "ncw":
53 |         league = "womens-college-basketball"
54 |     elif league == "ncf":
55 |         league = "college-football"
56 |     return league
57 | 
58 | def get_week_leagues():
59 |     return ["nfl","ncf"]
60 | 
61 | def get_ncb_groups():
62 |     return [50,55,56,100]
63 | 
64 | def get_ncw_groups():
65 |     return [50,55,100]
66 | 
67 | def get_ncf_groups():
68 |     return [80,81]
69 | 
70 | def get_leagues():
71 |     """ Return a list of supported leagues """
72 |     return get_week_leagues() + get_date_leagues()
73 | 
74 | def get_no_scoreboard_json_leagues():
75 |     """ Scoreboard json isn't easily available for some leagues; we have to grab the game_ids from the sportscenter api url instead """
76 |     return ["wnba", "nhl"]
77 | 
78 | def get_sport(league):
79 |     if league in ["nba","wnba","ncb","ncw"]:
80 |         return "basketball"
81 |     elif league in ["mlb"]:
82 |         return "baseball"
83 |     elif league in ["nfl","ncf"]:
84 |         return "football"
85 |     elif league in ["nhl"]:
86 |         return "hockey"
87 | 
88 | ## Get urls
89 | def get_sportscenter_api_url(league, dates, sport=None):
90 |     """ Alternative API endpoint """
91 |     if sport == None:
92 |         sport = get_sport(league)
93 |     return "https://sportscenter.api.espn.com/apis/v1/events?sport={}&league={}&dates={}".format(sport, league, dates)
94 | 
95 | def get_date_scoreboard_url(league, dates, groups=None, sport=None, limit=None):
96 |     """ Return a scoreboard url for a league that uses dates (non-football, but the function also works for football) """
97 |     if sport == None:
98 |         sport = get_sport(league)
99 |     league = longify_league(league)
100 |     url = "{}/{}/{}/scoreboard?dates={}".format(API_v2_BASE_URL, sport, league, dates)
101 |     if groups != None:
102 |         if limit == None:
103 |             limit = 1000
104 |         url += "&groups={}".format(groups)
105 |     if limit != None:
106 |         url += "&limit={}".format(limit)
107 |     return url
108 | 
109 | def get_week_scoreboard_url(league, season_year, season_type=None, week=None, groups=None, sport=None):
110 |     """ Return a scoreboard url for a league that uses weeks (football)
111 |     Example urls
112 |     By year: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?limit=1000&dates=2022
113 |     By year, seasontype, week: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates=2022&seasontype=2&week=1
114 |     By date range: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates=20200901-20210228
115 |     By date: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates=20200901"""
116 | 
117 |     if sport == None:
118 |         sport = get_sport(league)
119 |     league = longify_league(league)
120 |     url = "{}/{}/{}/scoreboard?dates={}".format(API_v2_BASE_URL, sport, league, season_year)
121 |     if season_type != None:
122 |         url += "&seasontype={}".format(season_type)
123 |     if week != None:
124 |         url += "&week={}".format(week)
125 |     # TODO implement groups?
126 |     return url
127 | 
128 | def get_game_url(url_type, league, espn_id):
129 |     valid_url_types = ["recap", "boxscore", "playbyplay", "conversation", "gamecast"]
130 |     if url_type not in valid_url_types:
131 |         raise ValueError("Unknown url_type for get_game_url. Valid url_types are {}".format(valid_url_types))
132 |     return "{}/{}/{}?gameId={}&{}".format(BASE_URL, league, url_type, espn_id, QUERY_STRING)
133 | 
134 | def get_current_scoreboard_urls(league, offset=0):
135 |     """ Return a list of the current scoreboard urls for a league
136 |     For date leagues the optional offset is in days
137 |     For week leagues the optional offset is in weeks """
138 |     urls = []
139 |     if league in get_date_leagues():
140 |         date_str = (datetime.datetime.now() + relativedelta(days=+offset)).strftime("%Y%m%d")
141 |         if league == "ncb":
142 |             for group in get_ncb_groups():
143 |                 urls.append(get_date_scoreboard_url(league, date_str, group))
144 |         elif league == "ncw":
145 |             for group in get_ncw_groups():
146 |                 urls.append(get_date_scoreboard_url(league, date_str, group))
147 |         else:
148 |             urls.append(get_date_scoreboard_url(league, date_str))
149 |         return urls
150 |     elif league in get_week_leagues():
151 |         # need to add a timezone to now to compare with the timezoned entry datetimes later
152 |         dt = datetime.datetime.now(pytz.utc) + relativedelta(weeks=+offset)
153 |         # guess the league season_year
154 |         if dt.month > 2:
155 |             guessed_season_year = dt.year
156 |         else:
157 |             guessed_season_year = dt.year - 1
158 |         calendar = get_calendar(league, guessed_season_year)
159 |         for season_type in calendar:
160 |             if 'entries' in season_type:
161 |                 for entry in season_type['entries']:
162 |                     if dt >= parser.parse(entry['startDate']) and dt <= parser.parse(entry['endDate']):
163 |                         if league == "ncf":
164 |                             for group in get_ncf_groups():
165 |                                 urls.append(get_week_scoreboard_url(league, guessed_season_year, season_type['value'], entry['value'], group))
166 |                         else:
167 |                             urls.append(get_week_scoreboard_url(league, guessed_season_year, season_type['value'], entry['value']))
168 |         return urls
169 |     else:
170 |         raise ValueError("Unknown league for get_current_scoreboard_urls")
171 | 
172 | def get_all_scoreboard_urls(league, season_year):
173 |     """ Return a list of all scoreboard urls for a given league and season year """
174 |     urls = []
175 |     if league in get_date_leagues():
176 |         start_datetime, end_datetime = get_season_start_end_datetimes(league, season_year)
177 |         while start_datetime < end_datetime:
178 |             if league == "ncb":
179 |                 for group in get_ncb_groups():
180 |                     urls.append(get_date_scoreboard_url(league, start_datetime.strftime("%Y%m%d"), group))
181 |             elif league == "ncw":
182 |                 for group in get_ncw_groups():
183 |                     urls.append(get_date_scoreboard_url(league, start_datetime.strftime("%Y%m%d"), group))
184 |             else:
185 |                 urls.append(get_date_scoreboard_url(league, start_datetime.strftime("%Y%m%d")))
186 |             start_datetime += relativedelta(days=+1)
187 |         return urls
188 |     elif league in get_week_leagues():
189 |         calendar = get_calendar(league, season_year)
190 |         for season_type in calendar:
191 |             if 'entries' in season_type:
192 |                 for entry in season_type['entries']:
193 |                     if league == "ncf":
194 |                         for group in get_ncf_groups():
195 |                             urls.append(get_week_scoreboard_url(league, season_year, season_type['value'], entry['value'], group))
196 |                     else:
197 |                         urls.append(get_week_scoreboard_url(league, season_year, season_type['value'], entry['value']))
198 |         return urls
199 |     else:
200 |         raise ValueError("Unknown league for get_all_scoreboard_urls")
201 | 
202 | ## Get stuff from URL or filenames
203 | def get_league_from_url(url):
204 |     return url.split('.com/')[1].split('/')[0]
205 | 
206 | def get_date_from_scoreboard_url(url):
207 |     league = get_league_from_url(url)
208 |     if league == "nhl":
209 |         return url.split("?date=")[1].split("&")[0]
210 |     else:
211 |         return url.split('/')[-1].split('?')[0]
212 | 
213 | def get_data_type_from_url(url):
214 |     """ Guess and return the data_type based on the url """
215 |     data_type = None
216 |     valid_data_types = ["scoreboard", "recap", "boxscore", "playbyplay", "conversation", "gamecast"]
217 |     for valid_data_type in valid_data_types:
218 |         if valid_data_type in url:
219 |             data_type = valid_data_type
220 |             break
221 |     if data_type == None:
222 |         raise ValueError("Unknown data_type for url. Url must contain one of {}".format(valid_data_types))
223 |     return data_type
224 | 
225 | def get_filename_ext(filename):
226 |     if filename.endswith(".json"):
227 |         return "json"
228 |     elif filename.endswith(".html"):
229 |         return "html"
230 |     else:
231 |         raise ValueError("Unknown filename extension for {}".format(filename))
232 | 
233 | ## Get requests helpers
234 | def get_season_start_end_datetimes_helper(url):
235 |     # TODO use cached replies if scoreboard url is older than 1 year
236 |     scoreboard = get_url(url)
237 |     return parser.parse(scoreboard['content']['sbData']['leagues'][0]['calendarStartDate']), parser.parse(scoreboard['content']['sbData']['leagues'][0]['calendarEndDate'])
238 | 
239 | def get_season_start_end_datetimes(league, season_year):
240 |     """ Guess a random date in a league's season and return its calendar start and end dates; only non-football leagues adhere to this format """
241 |     if league == "mlb":
242 |         return get_season_start_end_datetimes_helper(get_date_scoreboard_url(league, str(season_year) + "0415"))
243 |     elif league == "nba":
244 |         return get_season_start_end_datetimes_helper(get_date_scoreboard_url(league, str(season_year - 1) + "1101"))
245 |     elif league == "ncb" or league == "ncw":
246 |         return get_season_start_end_datetimes_helper(get_date_scoreboard_url(league, str(season_year - 1) + "1130"))
247 |     elif league == "wnba":
248 |         # hardcode wnba start and end dates, assumed to be April 20 thru Oct 31
249 |         return datetime.datetime(season_year,4,20, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc), datetime.datetime(season_year,10,31, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc)
250 |     elif league == "nhl":
251 |         # hardcode nhl start and end dates, assumed to be Oct 1 thru June 30
252 |         return datetime.datetime(season_year-1,10,1, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc), datetime.datetime(season_year,6,30, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc)
253 |     else:
254 |         raise ValueError("League must be {} to get season start and end datetimes".format(get_date_leagues()))
255 | 
256 | def get_filename(cached_json_path, league, data_type, url):
257 |     """ Build a full filename with directories for a given league, data_type and url """
258 |     # add a trailing slash to cached_json_path if necessary
259 |     if cached_json_path[-1] != "/":
260 |         cached_json_path += "/"
261 |     dir_path = cached_json_path + league + "/" + data_type + "/"
262 |     # create a league directory and data_type directory in cached_json if they don't already exist
263 |     if not os.path.exists(dir_path):
264 |         os.makedirs(dir_path)
265 |     # create the filename with / replaced by |
266 |     filename = dir_path + url.replace('/','|')
267 |     ext = ".json"
268 |     if not filename.endswith(ext):
269 |         filename = filename + ext
270 |     return filename
271 | 
272 | def get_cached(filename):
273 |     """ Return cached json if it exists """
274 |     data = None
275 |     if os.path.isfile(filename):
276 |         ext = get_filename_ext(filename)
277 |         if ext == "json":
278 |             with open(filename) as json_data:
279 |                 data = json.load(json_data)
280 |         elif ext == "html":
281 |             data = BeautifulSoup(open(filename), "lxml")
282 |     return data
283 | 
284 | ## Get requests
285 | def get_teams(league):
286 |     """ Returns a list of teams with ids and names """
287 |     teams = []
288 |     if league == "ncf":
289 |         # espn's college football teams page only lists fbs (blank division)
290 |         # need to grab teams from the standings page instead if we want all the fbs and fcs teams
291 |         for division in ["","fcs-i-aa"]:
292 |             url = BASE_URL + "/college-football/standings/_/view/" + division
293 |             print(url)
294 |             soup = get_soup(retry_request(url))
295 |             selector = ".hide-mobile"
296 |             team_divs = soup.select(selector)
297 |             for team_div in team_divs:
298 |                 teams.append({'id': team_div.find("a")['href'].split('/')[-2], 'name': team_div.text})
299 |     else:
300 |         url = BASE_URL + "/" + league + "/teams"
301 |         print(url)
302 |         soup = get_soup(retry_request(url))
303 |         if league == "wnba":
304 |             selector = "div.pl3"
305 |         else:
306 |             selector = "div.mt3"
307 |         team_divs = soup.select(selector)
308 |         for team_div in team_divs:
309 |             teams.append({'id': team_div.find("a")['href'].split('/')[-2], 'name': team_div.find("h2").text})
310 |     return teams
311 | 
312 | def get_standings(league, season_year, college_division=None):
313 |     standings = {"conferences": {}}
314 |     if league in ["nhl","nfl","mlb","nba","wnba","ncf","ncb","ncw"]:
315 |         if league == "ncf" and college_division == None:
316 |             # default to fbs
317 |             college_division = ""
318 |         if college_division:
319 |             valid_college_divisions = ["fbs", "fcs", "fcs-i-aa", "d2", "d3"]
320 |             if college_division == "fcs":
321 |                 college_division = "fcs-i-aa"
322 |             if college_division in valid_college_divisions:
323 |                 url = "{}/{}/standings/_/season/{}/view/{}".format(BASE_URL, league, season_year, college_division)
324 |             else:
325 |                 raise ValueError("College division must be none or {}".format(",".join(valid_college_divisions)))
326 |         elif league in ["wnba"]:
327 |             url = "{}/{}/standings/_/season/{}/group/conference".format(BASE_URL, league, season_year)
328 |         else:
329 |             url = "{}/{}/standings/_/season/{}/group/division".format(BASE_URL, league, season_year)
330 | 
331 |         print(url)
332 |         soup = get_soup(retry_request(url))
333 |         standings_divs = soup.find_all("div", class_="standings__table")
334 | 
335 |         for i in range(len(standings_divs)):
336 |             conference_name = standings_divs[i].find("div", class_="Table__Title").text
337 |             standings["conferences"][conference_name] = {"divisions": {}}
338 |             division = "" # default blank division name
339 |             teams_table = standings_divs[i].find("table", class_="Table--fixed-left")
340 |             trs = teams_table.find_all("tr")
341 |             for tr in trs:
342 |                 if "subgroup-headers" in tr["class"]:
343 |                     division = tr.text # replace the default blank division name
344 |                     standings["conferences"][conference_name]["divisions"][division] = {"teams": []}
345 |                 elif tr.text != "":
346 |                     if division == "" and standings["conferences"][conference_name]["divisions"] == {}:
347 |                         standings["conferences"][conference_name]["divisions"][division] = {"teams": []}
348 |                     team = {}
349 |                     team_span_tag = tr.find("td", class_="Table__TD").find("span", class_="hide-mobile")
350 |                     team_a_tag = team_span_tag.find("a")
351 |                     if team_a_tag is None:
352 |                         # some teams are now defunct with no espn links
353 |                         team["name"] = team_span_tag.text.strip()
354 |                         team["abbr"] = ""
355 |                     else:
356 |                         team["name"] = team_a_tag.text
357 |                         if league in ["ncf","ncb","ncw"]:
358 |                             team["abbr"] = team_a_tag["href"].split("/id/")[1].split("/")[0].upper()
359 |                         else:
360 |                             team["abbr"] = team_a_tag["href"].split("/name/")[1].split("/")[0].upper()
361 |                     standings["conferences"][conference_name]["divisions"][division]["teams"].append(team)
362 | 
363 |     return standings
364 | 
365 | def get_calendar(league, date_or_season_year):
366 |     """ Return a calendar for a league and season_year """
367 |     if league in get_week_leagues():
368 |         url = get_week_scoreboard_url(league, date_or_season_year, 2, 1)
369 |     elif league in get_date_leagues():
370 |         url = get_date_scoreboard_url(league, date_or_season_year)
371 |     # TODO use cached replies for older urls
372 |     return get_url(url)['leagues'][0]['calendar']
373 | 
374 | def get_url(url, cached_path=None):
375 |     """ Retrieve ESPN JSON data or an HTML BeautifulSoup object, either from the cache or via a new request """
376 |     data_type = get_data_type_from_url(url)
377 |     league = get_league_from_url(url)
378 |     # Disabled: for wnba and nhl we could use the sportscenter api instead
379 |     # to retrieve game_ids and basic game data:
380 |     # if data_type == "scoreboard":
381 |     #     if league in get_no_scoreboard_json_leagues():
382 |     #         url = get_sportscenter_api_url(league, get_date_from_scoreboard_url(url))
383 | 
384 |     return get_cached_url(url, league, data_type, cached_path)
385 | 
386 | def get_cached_url(url, league, data_type, cached_path, headers=DEFAULT_HEADERS):
387 |     """ get_url helper for when you want to specify the league and data_type yourself (e.g. for non espn.com links) """
388 |     if cached_path:
389 |         filename = get_filename(cached_path, league, data_type, url)
390 |         data = get_cached(filename)
391 |     else:
392 |         data = None
393 |     if data == None:
394 |         # no usable cached copy, so fetch fresh json
395 |         data = get_new_json(url, headers)
396 |         # don't cache if we got an ESPN error response
397 |         if cached_path and "error_code" not in data:
398 |             with open(filename, 'w') as f:
399 |                 json.dump(data, f, ensure_ascii=False, indent=2, sort_keys=True)
400 |     return data
401 | 
--------------------------------------------------------------------------------
/espn_scraper/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andr3w321/espn_scraper/86f15f32e649a188887b4599b074ec100ca60dcf/espn_scraper/tests/__init__.py
--------------------------------------------------------------------------------
/espn_scraper/tests/test_espn.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import espn_scraper as espn
3 | 
4 | def boxscore_helper(self, league, espn_id):
5 |     data = espn.get_url(espn.get_game_url("boxscore", league, espn_id))
6 |     self.assertEqual(data['page']['content']['gamepackage']['gmStrp']['gid'], str(espn_id))
7 | 
8 | def standings_helper(self, league, season_year, expected_n_teams):
9 |     standings = espn.get_standings(league, season_year)
10 |     n_teams = 0
11 |     for conference in standings["conferences"]:
12 |         for division in standings["conferences"][conference]["divisions"]:
13 |             for team in standings["conferences"][conference]["divisions"][division]["teams"]:
14 |                 n_teams += 1
15 |     self.assertEqual(n_teams, expected_n_teams)
16 | 
17 | class TestEspn(TestCase):
18 |     # leagues
19 |     def test_get_leagues(self):
20 |         self.assertEqual(len(espn.get_leagues()), 8)
21 |     # teams
22 |     def test_get_num_nfl_teams(self):
23 |         self.assertEqual(len(espn.get_teams("nfl")), 32)
24 |     def test_get_num_mlb_teams(self):
25 |         self.assertEqual(len(espn.get_teams("mlb")), 30)
26 |     def test_get_num_nba_teams(self):
27 |         self.assertEqual(len(espn.get_teams("nba")), 30)
28 |     def test_get_num_ncf_teams(self):
29 |         self.assertGreater(len(espn.get_teams("ncf")), 256)
30 |     def test_get_num_ncb_teams(self):
31 |         self.assertGreater(len(espn.get_teams("ncb")), 353)
32 |     def test_get_num_ncw_teams(self):
33 |         self.assertGreater(len(espn.get_teams("ncw")), 300)
34 |     def test_get_num_wnba_teams(self):
35 |         self.assertEqual(len(espn.get_teams("wnba")), 12)
36 |     def test_get_num_nhl_teams(self):
37 |         self.assertGreater(len(espn.get_teams("nhl")), 31)
38 | 
39 |     # scoreboards
40 |     def test_nfl_scoreboard(self):
41 |         url = espn.get_week_scoreboard_url("nfl", 2015, 2, 1)
42 |         data = espn.get_url(url)
43 |         self.assertEqual(len(data['events']), 16)
44 |     def test_mlb_scoreboard(self):
45 |         url = espn.get_date_scoreboard_url("mlb", "20160601")
46 |         data = espn.get_url(url)
47 |         self.assertEqual(len(data['events']), 15)
48 |     def test_ncf_scoreboard(self):
49 |         url = espn.get_week_scoreboard_url("ncf", 2016, 2, 3)
50 |         data = espn.get_url(url)
51 |         self.assertEqual(len(data['events']), 21)
52 |     def test_ncb_scoreboard(self):
53 |         url = espn.get_date_scoreboard_url("ncb", "20170211", 50)
54 |         data = espn.get_url(url)
55 |         self.assertEqual(len(data['events']), 146)
56 |     def test_ncw_scoreboard(self):
57 |         url = espn.get_date_scoreboard_url("ncw", "20170120")
58 |         data = espn.get_url(url)
59 |         self.assertEqual(len(data['events']), 3)
60 |     def test_wnba_scoreboard(self):
61 |         url = espn.get_date_scoreboard_url("wnba", "20160710")
62 |         data = espn.get_url(url)
63 |         self.assertEqual(len(data['events']), 5)
64 |     def test_nhl_scoreboard(self):
65 |         url = espn.get_date_scoreboard_url("nhl", "20170328")
66 |         data = espn.get_url(url)
67 |         self.assertEqual(len(data['events']), 11)
68 | 
69 |     # scoreboards for a year
70 |     def test_get_all_2016_nfl_scoreboard_urls(self):
71 |         scoreboards = espn.get_all_scoreboard_urls("nfl", 2016)
72 |         self.assertEqual(len(scoreboards), 28)
73 | 
74 |     # boxscores
75 |     def test_nfl_boxscore(self):
76 |         boxscore_helper(self, "nfl", 400874484)
77 |     def test_nba_boxscore(self):
78 |         boxscore_helper(self, "nba", 400900498)
79 |     def test_mlb_boxscore(self):
80 |         boxscore_helper(self, "mlb", 370328119)
81 |     def test_ncb_boxscore(self):
82 |         boxscore_helper(self, "ncb", 400947330)
83 |     def test_ncf_boxscore(self):
84 |         boxscore_helper(self, "ncf", 400868977)
85 |     def test_ncw_boxscore(self):
86 |         boxscore_helper(self, "ncw", 400947271)
87 |     def test_wnba_boxscore(self):
88 |         boxscore_helper(self, "wnba", 400910431)
89 |     def test_nhl_boxscore(self):
90 |         boxscore_helper(self, "nhl", 400885533)
91 | 
92 |     # standings
93 |     def test_nfl_2016_standings(self):
94 |         standings_helper(self, "nfl", 2016, 32)
95 |     def test_ncf_2016_standings(self):
96 |         standings_helper(self, "ncf", 2016, 128)
97 |     def test_mlb_2016_standings(self):
98 |         standings_helper(self, "mlb", 2016, 30)
99 |     def test_nba_2016_standings(self):
100 |         standings_helper(self, "nba", 2016, 30)
101 |     def test_ncb_2016_standings(self):
102 |         standings_helper(self, "ncb", 2016, 351)
103 |     def test_ncw_2016_standings(self):
104 |         standings_helper(self, "ncw", 2016, 349)
105 |     def test_wnba_2016_standings(self):
106 |         standings_helper(self, "wnba", 2016, 12)
107 |     def test_nhl_2016_standings(self):
108 |         standings_helper(self, "nhl", 2016, 30)
109 | 
110 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | import espn_scraper as espn
2 | import json
3 | 
4 | # pretty print json helper
5 | def ppjson(data):
6 |     print(json.dumps(data, indent=2, sort_keys=True))
7 | 
8 | leagues = espn.get_leagues()
9 | print(leagues)
10 | for league in leagues:
11 |     teams = espn.get_teams(league)
12 |     print(league, len(teams))
13 | 
14 | # print nfl 2016 postseason scores
15 | scoreboard_urls = espn.get_all_scoreboard_urls("nfl", 2016)
16 | for scoreboard_url in scoreboard_urls:
17 |     data = espn.get_url(scoreboard_url, cached_path="cached_json")
18 |     for event in data['events']:
19 |         if event['season']['type'] == 3:
20 |             print(event['season']['type'],
21 |                   event['season']['year'],
22 |                   event['competitions'][0]['competitors'][0]['team']['abbreviation'],
23 |                   event['competitions'][0]['competitors'][0]['score'],
24 |                   event['competitions'][0]['competitors'][1]['team']['abbreviation'],
25 |                   event['competitions'][0]['competitors'][1]['score'])
26 | 
27 | url = espn.get_game_url("boxscore", "nba", 400900498)
28 | json_data = espn.get_url(url)
29 | # ppjson(json_data)  # print the full long json
30 | 
31 | print(json_data['page']['content']['gamepackage']['bxscr'][0]['tm']['dspNm'])
32 | ppjson(json_data['page']['content']['gamepackage']['bxscr'][0]['stats'][0])
33 | 
34 | url = espn.get_game_url("playbyplay", "ncf", 400868977)
35 | json_data = espn.get_url(url)
36 | for play_number, play in enumerate(json_data['page']['content']['gamepackage']['allPlys']):
37 |     if 'headline' not in play:
38 |         continue
39 |     print(play_number, play['teamName'], play['headline'], play['description'])
40 | 
41 | 
42 | # NHL example
43 | url = espn.get_game_url("boxscore", "nhl", 400885533)
44 | json_data = espn.get_url(url)
45 | away_team = json_data['page']['content']['gamepackage']['bxscr'][0]['tm']['abbrev']
46 | home_team = json_data['page']['content']['gamepackage']['bxscr'][1]['tm']['abbrev']
47 | away_score = json_data['page']['content']['gamepackage']['scrSumm']['lnscrs']['awy'][3]
48 | home_score = json_data['page']['content']['gamepackage']['scrSumm']['lnscrs']['hme'][3]
49 | print(away_team, away_score, home_team, home_score)
50 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description_file = README.md
3 | [bdist_wheel]
4 | universal = 1
5 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(name='espn_scraper',
4 |       version='0.15.00',
5 |       description='ESPN scraper for major sports',
6 |       url='http://github.com/andr3w321/espn_scraper',
7 |       author='Andrew Rennhack',
8 |       author_email='andr3w321@gmail.com',
9 |       license='MIT',
10 |       install_requires=[
11 |           'pytz',
12 |           'python-dateutil',
13 |           'requests',
14 |           'bs4',
15 |           'lxml'
16 |       ],
17 |       test_suite='nose.collector',
18 |       tests_require=['nose'],
19 |       packages=['espn_scraper'],
20 |       zip_safe=False)
21 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | env_list =
3 |     py310
4 | minversion = 4.14.2
5 | 
6 | [testenv]
7 | description = run the tests with pytest
8 | package = wheel
9 | wheel_build_env = .pkg
10 | deps =
11 |     pytest>=6
12 | commands =
13 |     pytest {tty:--color=yes} {posargs}
14 | 
--------------------------------------------------------------------------------