├── .gitignore
├── LICENSE
├── README.md
├── README_DEPLOY
├── espn_scraper
│   ├── __init__.py
│   └── tests
│       ├── __init__.py
│       └── test_espn.py
├── example.py
├── setup.cfg
├── setup.py
└── tox.ini

/.gitignore:
--------------------------------------------------------------------------------
1 | /dist
2 | /build
3 | /.eggs
4 | /*.egg
5 | /*.egg-info
6 | *.pyc
7 | /venv
8 | *.log
9 | /.ipynb_checkpoints
10 | *.ipynb
11 | *.swp
12 | /cached_json
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2013 Thomas Park
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This library scrapes espn.com scoreboards, boxscores, and play-by-plays for most major sports: NFL, MLB, NBA, NCAAF, NCAAB, NCAAW, WNBA, and NHL. It can optionally save the data locally for quick lookup later. All of the functions can be found in `espn_scraper/__init__.py`. Some example usage can be found in `example.py` or in the README below.
2 | 
3 | Other ESPN API endpoints can be found at https://gist.github.com/nntrn/ee26cb2a0716de0947a0a4e9a157bc1c
4 | 
5 | Test coverage is currently pretty minimal. I would welcome a pull request adding more robust tests in `espn_scraper/tests/test_espn.py`.
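To run the existing tests locally you can use tox, which is configured in `tox.ini` below. A minimal sketch (it assumes a python3.10 interpreter is available, since that is the configured environment):

```
$ pip3 install tox
$ tox r
```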
6 | 
7 | ## Example Usage
8 | 
9 | Install espn_scraper
10 | 
11 | `pip3 install espn_scraper`
12 | 
13 | Start a python shell
14 | `$ python3`
15 | 
16 | Import the package
17 | `>>> import espn_scraper as espn`
18 | 
19 | To view the supported leagues
20 | ```
21 | >>> espn.get_leagues()
22 | ['nfl', 'ncf', 'mlb', 'nba', 'ncb', 'ncw', 'wnba', 'nhl']
23 | ```
24 | 
25 | Before retrieving ESPN data, let's write a pretty JSON printer function
26 | ```
27 | import json
28 | def ppjson(data):
29 |     print(json.dumps(data, indent=2, sort_keys=True))
30 | ```
31 | 
32 | Now let's use it to print the current NFL team names and their abbreviations
33 | ```
34 | >>> ppjson(espn.get_teams("nfl"))
35 | https://www.espn.com/nfl/teams
36 | [
37 |   {
38 |     "id": "buf",
39 |     "name": "Buffalo Bills"
40 |   },
41 |   {
42 |     "id": "mia",
43 |     "name": "Miami Dolphins"
44 |   },
45 |   {
46 |     "id": "ne",
47 |     "name": "New England Patriots"
48 |   },
49 |   {
50 |     "id": "nyj",
51 |     "name": "New York Jets"
52 |   },
53 |   {
54 |     "id": "bal",
55 |     "name": "Baltimore Ravens"
56 |   },
57 |   {
58 |     "id": "cin",
59 |     "name": "Cincinnati Bengals"
60 |   },
61 |   {
62 |     "id": "cle",
63 |     "name": "Cleveland Browns"
64 |   },
65 |   {
66 |     "id": "pit",
67 |     "name": "Pittsburgh Steelers"
68 |   },
69 |   {
70 |     "id": "hou",
71 |     "name": "Houston Texans"
72 |   },
73 |   {
74 |     "id": "ind",
75 |     "name": "Indianapolis Colts"
76 |   },
77 |   {
78 |     "id": "jax",
79 |     "name": "Jacksonville Jaguars"
80 |   },
81 |   {
82 |     "id": "ten",
83 |     "name": "Tennessee Titans"
84 |   },
85 |   {
86 |     "id": "den",
87 |     "name": "Denver Broncos"
88 |   },
89 |   {
90 |     "id": "kc",
91 |     "name": "Kansas City Chiefs"
92 |   },
93 |   {
94 |     "id": "lac",
95 |     "name": "Los Angeles Chargers"
96 |   },
97 |   {
98 |     "id": "oak",
99 |     "name": "Oakland Raiders"
100 |   },
101 |   {
102 |     "id": "dal",
103 |     "name": "Dallas Cowboys"
104 |   },
105 |   {
106 |     "id": "nyg",
107 |     "name": "New York Giants"
108 |   },
109 |   {
110 |     "id": "phi",
111 |     "name": "Philadelphia Eagles"
112 |   },
113 |   {
114 |     "id": "wsh",
115 |     "name": "Washington Redskins"
116 |   },
117 |   {
118 |     "id": "chi",
119 |     "name": "Chicago Bears"
120 |   },
121 |   {
122 |     "id": "det",
123 |     "name": "Detroit Lions"
124 |   },
125 |   {
126 |     "id": "gb",
127 |     "name": "Green Bay Packers"
128 |   },
129 |   {
130 |     "id": "min",
131 |     "name": "Minnesota Vikings"
132 |   },
133 |   {
134 |     "id": "atl",
135 |     "name": "Atlanta Falcons"
136 |   },
137 |   {
138 |     "id": "car",
139 |     "name": "Carolina Panthers"
140 |   },
141 |   {
142 |     "id": "no",
143 |     "name": "New Orleans Saints"
144 |   },
145 |   {
146 |     "id": "tb",
147 |     "name": "Tampa Bay Buccaneers"
148 |   },
149 |   {
150 |     "id": "ari",
151 |     "name": "Arizona Cardinals"
152 |   },
153 |   {
154 |     "id": "lar",
155 |     "name": "Los Angeles Rams"
156 |   },
157 |   {
158 |     "id": "sf",
159 |     "name": "San Francisco 49ers"
160 |   },
161 |   {
162 |     "id": "sea",
163 |     "name": "Seattle Seahawks"
164 |   }
165 | ]
166 | ```
167 | 
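Since `get_teams()` returns a plain list of dicts, you can build your own lookup structures from it. A minimal sketch (`names_by_id` is just an illustrative name):

```
>>> teams = espn.get_teams("nfl")
https://www.espn.com/nfl/teams
>>> names_by_id = {team["id"]: team["name"] for team in teams}
>>> names_by_id["gb"]
'Green Bay Packers'
```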
168 | To get the teams and their abbreviations for an older season, use the `get_standings(league, season_year)` function. For example, see the old NBA divisions from 2004, including the Seattle SuperSonics
169 | ```
170 | >>> ppjson(espn.get_standings("nba", 2004))
171 | https://www.espn.com/nba/standings/_/season/2004/group/division
172 | {
173 |   "conferences": {
174 |     "Eastern Conference": {
175 |       "divisions": {
176 |         "Atlantic": {
177 |           "teams": [
178 |             {
179 |               "abbr": "",
180 |               "name": "New Jersey Nets"
181 |             },
182 |             {
183 |               "abbr": "MIA",
184 |               "name": "Miami Heat"
185 |             },
186 |             {
187 |               "abbr": "NY",
188 |               "name": "New York Knicks"
189 |             },
190 |             {
191 |               "abbr": "BOS",
192 |               "name": "Boston Celtics"
193 |             },
194 |             {
195 |               "abbr": "PHI",
196 |               "name": "Philadelphia 76ers"
197 |             },
198 |             {
199 |               "abbr": "WSH",
200 |               "name": "Washington Wizards"
201 |             },
202 |             {
203 |               "abbr": "ORL",
204 |               "name": "Orlando Magic"
205 |             }
206 |           ]
207 |         },
208 |         "Central": {
209 |           "teams": [
210 |             {
211 |               "abbr": "IND",
212 |               "name": "Indiana Pacers"
213 |             },
214 |             {
215 |               "abbr": "DET",
216 |               "name": "Detroit Pistons"
217 |             },
218 |             {
219 |               "abbr": "",
220 |               "name": "New Orleans Hornets"
221 |             },
222 |             {
223 |               "abbr": "MIL",
224 |               "name": "Milwaukee Bucks"
225 |             },
226 |             {
227 |               "abbr": "CLE",
228 |               "name": "Cleveland Cavaliers"
229 |             },
230 |             {
231 |               "abbr": "TOR",
232 |               "name": "Toronto Raptors"
233 |             },
234 |             {
235 |               "abbr": "ATL",
236 |               "name": "Atlanta Hawks"
237 |             },
238 |             {
239 |               "abbr": "CHI",
240 |               "name": "Chicago Bulls"
241 |             }
242 |           ]
243 |         },
244 |         "Southeast": {
245 |           "teams": []
246 |         }
247 |       }
248 |     },
249 |     "Western Conference": {
250 |       "divisions": {
251 |         "Midwest": {
252 |           "teams": [
253 |             {
254 |               "abbr": "MIN",
255 |               "name": "Minnesota Timberwolves"
256 |             },
257 |             {
258 |               "abbr": "SA",
259 |               "name": "San Antonio Spurs"
260 |             },
261 |             {
262 |               "abbr": "DAL",
263 |               "name": "Dallas Mavericks"
264 |             },
265 |             {
266 |               "abbr": "MEM",
267 |               "name": "Memphis Grizzlies"
268 |             },
269 |             {
270 |               "abbr": "HOU",
271 |               "name": "Houston Rockets"
272 |             },
273 |             {
274 |               "abbr": "DEN",
275 |               "name": "Denver Nuggets"
276 |             },
277 |             {
278 |               "abbr": "UTAH",
279 |               "name": "Utah Jazz"
280 |             }
281 |           ]
282 |         },
283 |         "Pacific": {
284 |           "teams": [
285 |             {
286 |               "abbr": "LAL",
287 |               "name": "Los Angeles Lakers"
288 |             },
289 |             {
290 |               "abbr": "SAC",
291 |               "name": "Sacramento Kings"
292 |             },
293 |             {
294 |               "abbr": "POR",
295 |               "name": "Portland Trail Blazers"
296 |             },
297 |             {
298 |               "abbr": "GS",
299 |               "name": "Golden State Warriors"
300 |             },
301 |             {
302 |               "abbr": "",
303 |               "name": "Seattle SuperSonics"
304 |             },
305 |             {
306 |               "abbr": "PHX",
307 |               "name": "Phoenix Suns"
308 |             },
309 |             {
310 |               "abbr": "LAC",
311 |               "name": "LA Clippers"
312 |             }
313 |           ]
314 |         }
315 |       }
316 |     }
317 |   }
318 | }
319 | ```
320 | 
321 | ESPN JSON data can usually be found by appending "&_xhr=1" to the urls. For example, the HTML for a recent NFL playoff game is at
322 | 
323 | https://www.espn.com/nfl/boxscore?gameId=401131040 and the JSON link is
324 | 
325 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
326 | 
327 | To retrieve the JSON data we can run
328 | ```
329 | >>> espn.get_url("https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1")
330 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
331 | {'gameId': 401131040, 'DTCpackages': {'p...
332 | ```
333 | 
334 | You'll notice that the retrieved url was printed to the console, which means espn_scraper hit espn.com with a live request. If you'll be making many of these requests to parse the data, it's best to download and cache the data locally.
335 | 
336 | First make a directory to hold the cached data
337 | `mkdir cached_data`
338 | 
339 | Then pass the cached_data folder path to `espn.get_url()` as an argument
340 | ```
341 | >>> data = espn.get_url("https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1", "cached_data")
342 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
343 | ```
344 | 
345 | This JSON is now stored locally
346 | ```
347 | $ ls cached_data/nfl/boxscore/
348 | 'https:||www.espn.com|nfl|boxscore?gameId=401131040&_xhr=1.json'
349 | ```
350 | 
351 | If we request this same boxscore again and pass the cached data folder, it will be served locally from the saved JSON file
352 | ```
353 | >>> data = espn.get_url("https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1", "cached_data")
354 | >>> 
355 | ```
356 | 
357 | Notice that no url is printed after the request, indicating that no request was made to any outside url.
358 | 
359 | If you know the espn game id, you can get the JSON recap, boxscore, playbyplay, conversation, or gamecast data, e.g.
360 | 
361 | ```
362 | >>> for data_type in ["recap", "boxscore", "playbyplay", "conversation", "gamecast"]:
363 | ...     url = espn.get_game_url(data_type, "nfl", 401131040)
364 | ...     data = espn.get_url(url)
365 | ... 
366 | https://www.espn.com/nfl/recap?gameId=401131040&_xhr=1
367 | https://www.espn.com/nfl/boxscore?gameId=401131040&_xhr=1
368 | https://www.espn.com/nfl/playbyplay?gameId=401131040&_xhr=1
369 | https://www.espn.com/nfl/conversation?gameId=401131040&_xhr=1
370 | https://www.espn.com/nfl/gamecast?gameId=401131040&_xhr=1
371 | ```
372 | If you want to get all the game ids for a season you can use the `get_all_scoreboard_urls(league, season_year)` function
373 | ```
374 | >>> espn.get_all_scoreboard_urls("nba", 2016)
375 | https://www.espn.com/nba/scoreboard/_/date/20151101?_xhr=1
376 | ['https://www.espn.com/nba/scoreboard/_/date/20151001?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20151002?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20151003?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20151004?_xhr=1', 'https://www.espn.com/nba/scoreboard/_/date/20...
377 | ```
378 | 
379 | You can then get all the espn game_ids for a season by parsing the events for each scoreboard url. For example, after setting `scoreboard_urls = espn.get_all_scoreboard_urls("nfl", 2016)`:
380 | ```
381 | >>> game_ids = []
382 | >>> for scoreboard_url in scoreboard_urls:
383 | ...     data = espn.get_url(scoreboard_url, cached_path="cached_data")
384 | ...     for event in data['content']['sbData']['events']:
385 | ...         if event['id'] not in game_ids:
386 | ...             game_ids.append(event['id'])
387 | ... 
388 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/1?_xhr=1
389 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2?_xhr=1
390 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/3?_xhr=1
391 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/4?_xhr=1
392 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/5?_xhr=1
393 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/1?_xhr=1
394 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/2?_xhr=1
395 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/3?_xhr=1
396 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/4?_xhr=1
397 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/5?_xhr=1
398 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/6?_xhr=1
399 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/7?_xhr=1
400 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/8?_xhr=1
401 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/9?_xhr=1
402 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/10?_xhr=1
403 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/11?_xhr=1
404 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/12?_xhr=1
405 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/13?_xhr=1
406 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/14?_xhr=1
407 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/15?_xhr=1
408 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/16?_xhr=1
409 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/2/week/17?_xhr=1
410 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/1?_xhr=1
411 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/2?_xhr=1
412 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/3?_xhr=1
413 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/4?_xhr=1
414 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/3/week/5?_xhr=1
415 | https://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/4/week/1?_xhr=1
416 | >>> print(game_ids)
417 | ['400868831', '400874795', '400874854',...
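418 | ```

With the game ids in hand, you can pull and cache any of the game-level feeds for the whole season. A minimal sketch combining the pieces above (it assumes the `game_ids` list and `cached_data` directory from this walkthrough):

```
>>> for game_id in game_ids:
...     url = espn.get_game_url("boxscore", "nfl", game_id)
...     data = espn.get_url(url, "cached_data")
... 
```

The first pass makes one request per game; reruns are then served entirely from the local json files.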
419 | 
--------------------------------------------------------------------------------
/README_DEPLOY:
--------------------------------------------------------------------------------
1 | # update setup.py to the new version
2 | python3 setup.py sdist bdist_wheel --universal
3 | twine upload --repository espn_scraper dist/*
4 | 
5 | # test code
6 | tox r
7 | 
--------------------------------------------------------------------------------
/espn_scraper/__init__.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pytz
3 | from dateutil import parser
4 | from dateutil.relativedelta import relativedelta
5 | import datetime
6 | import os.path
7 | import requests
8 | from bs4 import BeautifulSoup
9 | BASE_URL = "https://www.espn.com"
10 | QUERY_STRING = "_xhr=1"
11 | # ESPN seems to be blocking requests with default blank headers
12 | DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"}
13 | API_v2_BASE_URL = "https://site.api.espn.com/apis/site/v2/sports"
14 | 
15 | ## General functions
16 | def retry_request(url, headers=DEFAULT_HEADERS):
17 |     """Get a url and return the response, retrying up to 3 times if it fails initially"""
18 |     session = requests.Session()
19 |     session.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))  # mount https, the scheme BASE_URL actually uses
20 |     res = session.get(url=url, allow_redirects=True, headers=headers)
21 |     session.close()
22 |     return res
23 | 
24 | def get_soup(res):
25 |     return BeautifulSoup(res.text, "lxml")
26 | 
27 | def get_new_json(url, headers=DEFAULT_HEADERS):
28 |     print(url)
29 |     res = retry_request(url, headers)
30 |     if res.status_code == 200:
31 |         return res.json()
32 |     else:
33 |         print("ERROR:", res.status_code)
34 |         return {"error_code": res.status_code, "error_msg": "URL Error"}
35 | 
36 | def get_new_html_soup(url, headers=DEFAULT_HEADERS):
37 |     print(url)
38 |     res = retry_request(url, headers)
39 |     if res.status_code == 200:
40 |         return get_soup(res)
41 |     else:
42 |         print("ERROR: ESPN", res.status_code)
43 |         return {"error_code": res.status_code, "error_msg": "ESPN Error"}
44 | 
45 | ## Get constants
46 | def get_date_leagues():
47 |     return ["mlb","nba","ncb","ncw","wnba","nhl"]
48 | 
49 | def longify_league(league):
50 |     if league == "ncb":
51 |         league = "mens-college-basketball"
52 |     elif league == "ncw":
53 |         league = "womens-college-basketball"
54 |     elif league == "ncf":
55 |         league = "college-football"
56 |     return league
57 | 
58 | def get_week_leagues():
59 |     return ["nfl","ncf"]
60 | 
61 | def get_ncb_groups():
62 |     return [50,55,56,100]
63 | 
64 | def get_ncw_groups():
65 |     return [50,55,100]
66 | 
67 | def get_ncf_groups():
68 |     return [80,81]
69 | 
70 | def get_leagues():
71 |     """ Return a list of supported leagues """
72 |     return get_week_leagues() + get_date_leagues()
73 | 
74 | def get_no_scoreboard_json_leagues():
75 |     """ Scoreboard json isn't easily available for some leagues; we have to grab the game_ids from the sportscenter api url instead """
76 |     return ["wnba", "nhl"]
77 | 
78 | def get_sport(league):
79 |     if league in ["nba","wnba","ncb","ncw"]:
80 |         return "basketball"
81 |     elif league in ["mlb"]:
82 |         return "baseball"
83 |     elif league in ["nfl","ncf"]:
84 |         return "football"
85 |     elif league in ["nhl"]:
86 |         return "hockey"
87 | 
88 | ## Get urls
89 | def get_sportscenter_api_url(league, dates, sport=None):
90 |     """ Alternative API endpoint """
91 |     if sport == None:
92 |         sport = get_sport(league)
93 |     return "https://sportscenter.api.espn.com/apis/v1/events?sport={}&league={}&dates={}".format(sport, league, dates)
94 | 
95 | def get_date_scoreboard_url(league, dates, groups=None, sport=None, limit=None):
96 |     """ Return a scoreboard url for a league that uses dates (non-football, but the function also works for football) """
97 |     if sport == None:
98 |         sport = get_sport(league)
99 |     league = longify_league(league)
100 |     url = "{}/{}/{}/scoreboard?dates={}".format(API_v2_BASE_URL, sport, league, dates)
101 |     if groups != None:
102 |         if limit == None:
103 |             limit = 1000
104 |         url += "&groups={}".format(groups)
105 |     if limit != None:
106 |         url += "&limit={}".format(limit)
107 |     return url
108 | 
109 | def get_week_scoreboard_url(league, season_year, season_type=None, week=None, groups=None, sport=None):
110 |     """ Return a scoreboard url for a league that uses weeks (football)
111 |     Example urls
112 |     By year: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?limit=1000&dates=2022
113 |     By year, seasontype, week: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates=2022&seasontype=2&week=1
114 |     By date range: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates=20200901-20210228
115 |     By date: https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates=20200901"""
116 | 
117 |     if sport == None:
118 |         sport = get_sport(league)
119 |     league = longify_league(league)
120 |     url = "{}/{}/{}/scoreboard?dates={}".format(API_v2_BASE_URL, sport, league, season_year)
121 |     if season_type != None:
122 |         url += "&seasontype={}".format(season_type)
123 |     if week != None:
124 |         url += "&week={}".format(week)
125 |     # TODO implement groups?
126 |     return url
127 | 
128 | def get_game_url(url_type, league, espn_id):
129 |     valid_url_types = ["recap", "boxscore", "playbyplay", "conversation", "gamecast"]
130 |     if url_type not in valid_url_types:
131 |         raise ValueError("Unknown url_type for get_game_url. Valid url_types are {}".format(valid_url_types))
132 |     return "{}/{}/{}?gameId={}&{}".format(BASE_URL, league, url_type, espn_id, QUERY_STRING)
133 | 
134 | def get_current_scoreboard_urls(league, offset=0):
135 |     """ Return a list of the current scoreboard urls for a league
136 |     For date leagues the optional offset is in days
137 |     For week leagues the optional offset is in weeks """
138 |     urls = []
139 |     if league in get_date_leagues():
140 |         date_str = (datetime.datetime.now() + relativedelta(days=+offset)).strftime("%Y%m%d")
141 |         if league == "ncb":
142 |             for group in get_ncb_groups():
143 |                 urls.append(get_date_scoreboard_url(league, date_str, group))
144 |         elif league == "ncw":
145 |             for group in get_ncw_groups():
146 |                 urls.append(get_date_scoreboard_url(league, date_str, group))
147 |         else:
148 |             urls.append(get_date_scoreboard_url(league, date_str))
149 |         return urls
150 |     elif league in get_week_leagues():
151 |         # need to add a timezone to now to compare with the timezoned entry datetimes later
152 |         dt = datetime.datetime.now(pytz.utc) + relativedelta(weeks=+offset)
153 |         # guess the league season_year
154 |         if dt.month > 2:
155 |             guessed_season_year = dt.year
156 |         else:
157 |             guessed_season_year = dt.year - 1
158 |         calendar = get_calendar(league, guessed_season_year)
159 |         for season_type in calendar:
160 |             if 'entries' in season_type:
161 |                 for entry in season_type['entries']:
162 |                     if dt >= parser.parse(entry['startDate']) and dt <= parser.parse(entry['endDate']):
163 |                         if league == "ncf":
164 |                             for group in get_ncf_groups():
165 |                                 urls.append(get_week_scoreboard_url(league, guessed_season_year, season_type['value'], entry['value'], group))
166 |                         else:
167 |                             urls.append(get_week_scoreboard_url(league, guessed_season_year, season_type['value'], entry['value']))
168 |         return urls
169 |     else:
170 |         raise ValueError("Unknown league for get_current_scoreboard_urls")
171 | 
172 | def get_all_scoreboard_urls(league, season_year):
173 |     """ Return a list of all scoreboard urls for a given league and season year """
174 |     urls = []
175 |     if league in get_date_leagues():
176 |         start_datetime, end_datetime = get_season_start_end_datetimes(league, season_year)
177 |         while start_datetime < end_datetime:
178 |             if league == "ncb":
179 |                 for group in get_ncb_groups():
180 |                     urls.append(get_date_scoreboard_url(league, start_datetime.strftime("%Y%m%d"), group))
181 |             elif league == "ncw":
182 |                 for group in get_ncw_groups():
183 |                     urls.append(get_date_scoreboard_url(league, start_datetime.strftime("%Y%m%d"), group))
184 |             else:
185 |                 urls.append(get_date_scoreboard_url(league, start_datetime.strftime("%Y%m%d")))
186 |             start_datetime += relativedelta(days=+1)
187 |         return urls
188 |     elif league in get_week_leagues():
189 |         calendar = get_calendar(league, season_year)
190 |         for season_type in calendar:
191 |             if 'entries' in season_type:
192 |                 for entry in season_type['entries']:
193 |                     if league == "ncf":
194 |                         for group in get_ncf_groups():
195 |                             urls.append(get_week_scoreboard_url(league, season_year, season_type['value'], entry['value'], group))
196 |                     else:
197 |                         urls.append(get_week_scoreboard_url(league, season_year, season_type['value'], entry['value']))
198 |         return urls
199 |     else:
200 |         raise ValueError("Unknown league for get_all_scoreboard_urls")
201 | 
202 | ## Get stuff from URL or filenames
203 | def get_league_from_url(url):
204 |     return url.split('.com/')[1].split('/')[0]
205 | 
206 | def get_date_from_scoreboard_url(url):
207 |     league = get_league_from_url(url)
208 |     if league == "nhl":
209 |         return url.split("?date=")[1].split("&")[0]
210 |     else:
211 |         return url.split('/')[-1].split('?')[0]
212 | 
213 | def get_data_type_from_url(url):
214 |     """ Guess and return the data_type based on the url """
215 |     data_type = None
216 |     valid_data_types = ["scoreboard", "recap", "boxscore", "playbyplay", "conversation", "gamecast"]
217 |     for valid_data_type in valid_data_types:
218 |         if valid_data_type in url:
219 |             data_type = valid_data_type
220 |             break
221 |     if data_type == None:
222 |         raise ValueError("Unknown data_type for url. Url must contain one of {}".format(valid_data_types))
223 |     return data_type
224 | 
225 | def get_filename_ext(filename):
226 |     if filename.endswith(".json"):
227 |         return "json"
228 |     elif filename.endswith(".html"):
229 |         return "html"
230 |     else:
231 |         raise ValueError("Unknown filename extension for {}".format(filename))
232 | 
233 | ## Get requests helpers
234 | def get_season_start_end_datetimes_helper(url):
235 |     # TODO use cached replies if scoreboard url is older than 1 year
236 |     scoreboard = get_url(url)
237 |     return parser.parse(scoreboard['content']['sbData']['leagues'][0]['calendarStartDate']), parser.parse(scoreboard['content']['sbData']['leagues'][0]['calendarEndDate'])
238 | 
239 | def get_season_start_end_datetimes(league, season_year):
240 |     """ Guess a random date in a league's season and return its calendar start and end dates; only non-football leagues adhere to this format """
241 |     if league == "mlb":
242 |         return get_season_start_end_datetimes_helper(get_date_scoreboard_url(league, str(season_year) + "0415"))
243 |     elif league == "nba":
244 |         return get_season_start_end_datetimes_helper(get_date_scoreboard_url(league, str(season_year - 1) + "1101"))
245 |     elif league == "ncb" or league == "ncw":
246 |         return get_season_start_end_datetimes_helper(get_date_scoreboard_url(league, str(season_year - 1) + "1130"))
247 |     elif league == "wnba":
248 |         # hardcode wnba start and end dates, assumed to be April 20 thru Oct 31
249 |         return datetime.datetime(season_year,4,20, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc), datetime.datetime(season_year,10,31, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc)
250 |     elif league == "nhl":
251 |         # hardcode nhl start and end dates, assumed to be Oct 1 thru June 30
252 |         return datetime.datetime(season_year-1,10,1, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc), datetime.datetime(season_year,6,30, tzinfo=pytz.timezone("US/Eastern")).astimezone(pytz.utc)
253 |     else:
254 |         raise ValueError("League must be {} to get season start and end datetimes".format(get_date_leagues()))
255 | 
256 | def get_filename(cached_json_path, league, data_type, url):
257 |     """ Build a full filename with directories for a given league, data_type and url """
258 |     # add a trailing slash to cached_json_path if necessary
259 |     if cached_json_path[-1] != "/":
260 |         cached_json_path += "/"
261 |     dir_path = cached_json_path + league + "/" + data_type + "/"
262 |     # create a league directory and data_type directory in cached_json if they don't already exist
263 |     if not os.path.exists(dir_path):
264 |         os.makedirs(dir_path)
265 |     # create the filename with / replaced by |
266 |     filename = dir_path + url.replace('/','|')
267 |     ext = ".json"
268 |     if not filename.endswith(ext):
269 |         filename = filename + ext
270 |     return filename
271 | 
272 | def get_cached(filename):
273 |     """ Return cached json if it exists """
274 |     data = None
275 |     if os.path.isfile(filename):
276 |         ext = get_filename_ext(filename)
277 |         if ext == "json":
278 |             with open(filename) as json_data:
279 |                 data = json.load(json_data)
280 |         elif ext == "html":
281 |             data = BeautifulSoup(open(filename), "lxml")
282 |     return data
283 | 
284 | ## Get requests
285 | def get_teams(league):
286 |     """ Returns a list of teams with ids and names """
287 |     teams = []
288 |     if league == "ncf":
289 |         # espn's college football teams page only lists fbs (blank division)
290 |         # need to grab teams from the standings page instead if we want all the fbs and fcs teams
291 |         for division in ["","fcs-i-aa"]:
292 |             url = BASE_URL + "/college-football/standings/_/view/" + division
293 |             print(url)
294 |             soup = get_soup(retry_request(url))
295 |             selector = ".hide-mobile"
296 |             team_divs = soup.select(selector)
297 |             for team_div in team_divs:
298 |                 teams.append({'id': team_div.find("a")['href'].split('/')[-2], 'name': team_div.text})
299 |     else:
300 |         url = BASE_URL + "/" + league + "/teams"
301 |         print(url)
302 |         soup = get_soup(retry_request(url))
303 |         if league == "wnba":
304 |             selector = "div.pl3"
305 |         else:
306 |             selector = "div.mt3"
307 |         team_divs = soup.select(selector)
308 |         for team_div in team_divs:
309 |             teams.append({'id': team_div.find("a")['href'].split('/')[-2], 'name': team_div.find("h2").text})
310 |     return teams
311 | 
312 | def get_standings(league, season_year, college_division=None):
313 |     standings = {"conferences": {}}
314 |     if league in ["nhl","nfl","mlb","nba","wnba","ncf","ncb","ncw"]:
315 |         if league == "ncf" and college_division == None:
316 |             # default to fbs
317 |             college_division = ""
318 |         if college_division:
319 |             valid_college_divisions = ["fbs", "fcs", "fcs-i-aa", "d2", "d3"]
320 |             if college_division == "fcs":
321 |                 college_division = "fcs-i-aa"
322 |             if college_division in valid_college_divisions:
323 |                 url = "{}/{}/standings/_/season/{}/view/{}".format(BASE_URL, league, season_year, college_division)
324 |             else:
325 |                 raise ValueError("College division must be none or {}".format(",".join(valid_college_divisions)))
326 |         elif league in ["wnba"]:
327 |             url = "{}/{}/standings/_/season/{}/group/conference".format(BASE_URL, league, season_year)
328 |         else:
329 |             url = "{}/{}/standings/_/season/{}/group/division".format(BASE_URL, league, season_year)
330 | 
331 |         print(url)
332 |         soup = get_soup(retry_request(url))
333 |         standings_divs = soup.find_all("div", class_="standings__table")
334 | 
335 |         for i in range(len(standings_divs)):
336 |             conference_name = standings_divs[i].find("div", class_="Table__Title").text
337 |             standings["conferences"][conference_name] = {"divisions": {}}
338 |             division = "" # default blank division name
339 |             teams_table = standings_divs[i].find("table", class_="Table--fixed-left")
340 |             trs = teams_table.find_all("tr")
341 |             for tr in trs:
342 |                 if "subgroup-headers" in tr["class"]:
343 |                     division = tr.text # replace the default blank division name
344 |                     standings["conferences"][conference_name]["divisions"][division] = {"teams": []}
345 |                 elif tr.text != "":
346 |                     if division == "" and standings["conferences"][conference_name]["divisions"] == {}:
347 |                         standings["conferences"][conference_name]["divisions"][division] = {"teams": []}
348 |                     team = {}
349 |                     team_span_tag = tr.find("td", class_="Table__TD").find("span", class_="hide-mobile")
350 |                     team_a_tag = team_span_tag.find("a")
351 |                     if team_a_tag is None:
352 |                         # some teams are now defunct with no espn links
353 |                         team["name"] = team_span_tag.text.strip()
354 |                         team["abbr"] = ""
355 |                     else:
356 |                         team["name"] = team_a_tag.text
357 |                         if league in ["ncf","ncb","ncw"]:
358 |                             team["abbr"] = team_a_tag["href"].split("/id/")[1].split("/")[0].upper()
359 |                         else:
360 |                             team["abbr"] = team_a_tag["href"].split("/name/")[1].split("/")[0].upper()
361 |                     standings["conferences"][conference_name]["divisions"][division]["teams"].append(team)
362 | 
363 |     return standings
364 | 
365 | def get_calendar(league, date_or_season_year):
366 |     """ Return a calendar for a league and season_year """
367 |     if league in get_week_leagues():
368 |         url = get_week_scoreboard_url(league, date_or_season_year, 2, 1)
369 |     elif league in get_date_leagues():
370 |         url = get_date_scoreboard_url(league, date_or_season_year)
371 |     # TODO use cached replies for older urls
372 |     return get_url(url)['leagues'][0]['calendar']
373 | 
374 | def get_url(url, cached_path=None):
375 |     """ Retrieve ESPN JSON data or an HTML BeautifulSoup object, either from the cache or via a new request """
376 |     data_type = get_data_type_from_url(url)
377 |     league = get_league_from_url(url)
378 |     # Disabled: for wnba and nhl we could use the sportscenter api instead
379 |     # to retrieve game_ids and basic game data:
380 |     # if data_type == "scoreboard":
381 |     #     if league in get_no_scoreboard_json_leagues():
382 |     #         url = get_sportscenter_api_url(league, get_date_from_scoreboard_url(url))
383 | 
384 |     return get_cached_url(url, league, data_type, cached_path)
385 | 
386 | def get_cached_url(url, league, data_type, cached_path, headers=DEFAULT_HEADERS):
387 |     """ get_url helper for when you want to specify the league and data_type yourself (e.g. for non espn.com links) """
388 |     if cached_path:
389 |         filename = get_filename(cached_path, league, data_type, url)
390 |         data = get_cached(filename)
391 |     else:
392 |         data = None
393 |     if data == None:
394 |         # no usable cached copy, so fetch fresh json
395 |         data = get_new_json(url, headers)
396 |         # don't cache if we got an ESPN error response
397 |         if cached_path and "error_code" not in data:
398 |             with open(filename, 'w') as f:
399 |                 json.dump(data, f, ensure_ascii=False, indent=2, sort_keys=True)
400 |     return data
401 | 
--------------------------------------------------------------------------------
/espn_scraper/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andr3w321/espn_scraper/86f15f32e649a188887b4599b074ec100ca60dcf/espn_scraper/tests/__init__.py
--------------------------------------------------------------------------------
/espn_scraper/tests/test_espn.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import espn_scraper as espn
3 | 
4 | def boxscore_helper(self, league, espn_id):
5 |     data = espn.get_url(espn.get_game_url("boxscore", league, espn_id))
6 |     self.assertEqual(data['page']['content']['gamepackage']['gmStrp']['gid'], str(espn_id))
7 | 
8 | def standings_helper(self, league, season_year, expected_n_teams):
9 |     standings = espn.get_standings(league, season_year)
10 |     n_teams = 0
11 |     for conference in standings["conferences"]:
12 |         for division in standings["conferences"][conference]["divisions"]:
13 |             for team in standings["conferences"][conference]["divisions"][division]["teams"]:
14 |                 n_teams += 1
15 |     self.assertEqual(n_teams, expected_n_teams)
16 | 
17 | class TestEspn(TestCase):
18 |     # leagues
19 |     def test_get_leagues(self):
20 |         self.assertEqual(len(espn.get_leagues()), 8)
21 |     # teams
22 |     def test_get_num_nfl_teams(self):
23 |         self.assertEqual(len(espn.get_teams("nfl")), 32)
24 |     def test_get_num_mlb_teams(self):
25 |         self.assertEqual(len(espn.get_teams("mlb")), 30)
26 |     def test_get_num_nba_teams(self):
27 |         self.assertEqual(len(espn.get_teams("nba")), 30)
28 |     def test_get_num_ncf_teams(self):
29 |         self.assertGreater(len(espn.get_teams("ncf")), 256)
30 |     def test_get_num_ncb_teams(self):
31 |         self.assertGreater(len(espn.get_teams("ncb")), 353)
32 |     def test_get_num_ncw_teams(self):
33 |         self.assertGreater(len(espn.get_teams("ncw")), 300)
34 |     def test_get_num_wnba_teams(self):
35 |         self.assertEqual(len(espn.get_teams("wnba")), 12)
36 |     def test_get_num_nhl_teams(self):
37 |         self.assertGreater(len(espn.get_teams("nhl")), 31)
38 | 
39 |     # scoreboards
40 |     def test_nfl_scoreboard(self):
41 |         url = espn.get_week_scoreboard_url("nfl", 2015, 2, 1)
42 |         data = espn.get_url(url)
43 |         self.assertEqual(len(data['events']), 16)
44 |     def test_mlb_scoreboard(self):
45 |         url = espn.get_date_scoreboard_url("mlb", "20160601")
46 |         data = espn.get_url(url)
47 |         self.assertEqual(len(data['events']), 15)
48 |     def test_ncf_scoreboard(self):
49 |         url = espn.get_week_scoreboard_url("ncf", 2016, 2, 3)
50 |         data = espn.get_url(url)
51 |         self.assertEqual(len(data['events']), 21)
52 |     def test_ncb_scoreboard(self):
53 |         url = espn.get_date_scoreboard_url("ncb", "20170211", 50)
54 |         data = espn.get_url(url)
55 |         self.assertEqual(len(data['events']), 146)
56 |     def test_ncw_scoreboard(self):
57 |         url = espn.get_date_scoreboard_url("ncw", "20170120")
58 |         data = espn.get_url(url)
59 |         self.assertEqual(len(data['events']), 3)
60 |     def test_wnba_scoreboard(self):
61 |         url = espn.get_date_scoreboard_url("wnba", "20160710")
62 |         data = espn.get_url(url)
63 |         self.assertEqual(len(data['events']), 5)
64 |     def test_nhl_scoreboard(self):
65 |         url = espn.get_date_scoreboard_url("nhl", "20170328")
66 |         data = espn.get_url(url)
67 |         self.assertEqual(len(data['events']), 11)
68 | 
69 |     # scoreboards for a year
70 |     def test_get_all_2016_nfl_scoreboard_urls(self):
71 |         scoreboards = espn.get_all_scoreboard_urls("nfl", 2016)
72 |         self.assertEqual(len(scoreboards), 28)
73 | 
74 |     # boxscores
75 |     def test_nfl_boxscore(self):
76 |         boxscore_helper(self, "nfl", 400874484)
77 |     def test_nba_boxscore(self):
78 |         boxscore_helper(self, "nba", 400900498)
79 |     def test_mlb_boxscore(self):
80 |         boxscore_helper(self, "mlb", 370328119)
81 |     def test_ncb_boxscore(self):
82 |         boxscore_helper(self, "ncb", 400947330)
83 |     def test_ncf_boxscore(self):
84 |         boxscore_helper(self, "ncf", 400868977)
85 |     def test_ncw_boxscore(self):
86 |         boxscore_helper(self, "ncw", 400947271)
87 |     def test_wnba_boxscore(self):
88 |         boxscore_helper(self, "wnba", 400910431)
89 |     def test_nhl_boxscore(self):
90 |         boxscore_helper(self, "nhl", 400885533)
91 | 
92 |     # standings
93 |     def test_nfl_2016_standings(self):
94 |         standings_helper(self, "nfl", 2016, 32)
95 |     def test_ncf_2016_standings(self):
96 |         standings_helper(self, "ncf", 2016, 128)
97 |     def test_mlb_2016_standings(self):
98 |         standings_helper(self, "mlb", 2016, 30)
99 |     def test_nba_2016_standings(self):
100 |         standings_helper(self, "nba", 2016, 30)
101 |     def test_ncb_2016_standings(self):
102 |         standings_helper(self, "ncb", 2016, 351)
103 |     def test_ncw_2016_standings(self):
104 |         standings_helper(self, "ncw", 2016, 349)
105 |     def test_wnba_2016_standings(self):
106 |         standings_helper(self, "wnba", 2016, 12)
107 |     def test_nhl_2016_standings(self):
108 |         standings_helper(self, "nhl", 2016, 30)
109 | 
110 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | import espn_scraper as espn
2 | import json
3 | 
4 | # pretty print json helper
5 | def ppjson(data):
6 |     print(json.dumps(data, indent=2, sort_keys=True))
7 | 
8 | leagues = espn.get_leagues()
9 | print(leagues)
10 | for league in leagues:
11 |     teams = espn.get_teams(league)
12 |     print(league, len(teams))
13 | 
14 | # print nfl 2016 postseason scores
15 | scoreboard_urls = espn.get_all_scoreboard_urls("nfl", 2016)
16 | for scoreboard_url in scoreboard_urls:
17 |     data = espn.get_url(scoreboard_url, cached_path="cached_json")
18 |     for event in data['events']:
19 |         if event['season']['type'] == 3:
20 |             print(event['season']['type'],
21 |                   event['season']['year'],
22 |                   event['competitions'][0]['competitors'][0]['team']['abbreviation'],
23 |                   event['competitions'][0]['competitors'][0]['score'],
24 |                   event['competitions'][0]['competitors'][1]['team']['abbreviation'],
25 |                   event['competitions'][0]['competitors'][1]['score'])
26 | 
27 | url = espn.get_game_url("boxscore", "nba", 400900498)
28 | json_data = espn.get_url(url)
29 | # ppjson(json_data)  # print the full long json
30 | 
31 | print(json_data['page']['content']['gamepackage']['bxscr'][0]['tm']['dspNm'])
32 | ppjson(json_data['page']['content']['gamepackage']['bxscr'][0]['stats'][0])
33 | 
34 | url = espn.get_game_url("playbyplay", "ncf", 400868977)
35 | json_data = espn.get_url(url)
36 | for play_number, play in enumerate(json_data['page']['content']['gamepackage']['allPlys']):
37 |     if 'headline' not in play:
38 |         continue
39 |     print(play_number, play['teamName'], play['headline'], play['description'])
40 | 
41 | 
42 | # NHL example
43 | url = espn.get_game_url("boxscore", "nhl", 400885533)
44 | json_data = espn.get_url(url)
45 | away_team = json_data['page']['content']['gamepackage']['bxscr'][0]['tm']['abbrev']
46 | home_team = json_data['page']['content']['gamepackage']['bxscr'][1]['tm']['abbrev']
47 | away_score = json_data['page']['content']['gamepackage']['scrSumm']['lnscrs']['awy'][3]
48 | home_score = json_data['page']['content']['gamepackage']['scrSumm']['lnscrs']['hme'][3]
49 | print(away_team, away_score, home_team, home_score)
50 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description_file = README.md
3 | [bdist_wheel]
4 | universal = 1
5 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(name='espn_scraper',
4 |       version='0.15.00',
5 |       description='ESPN scraper for major sports',
6 |       url='http://github.com/andr3w321/espn_scraper',
7 |       author='Andrew Rennhack',
8 |       author_email='andr3w321@gmail.com',
9 |       license='MIT',
10 |       install_requires=[
11 |           'pytz',
12 |           'python-dateutil',
13 |           'requests',
14 |           'bs4',
15 |           'lxml'
16 |       ],
17 |       test_suite='nose.collector',
18 |       tests_require=['nose'],
19 |       packages=['espn_scraper'],
20 |       zip_safe=False)
21 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | env_list =
3 |     py310
4 | minversion = 4.14.2
5 | 
6 | [testenv]
7 | description = run the tests with pytest
8 | package = wheel
9 | wheel_build_env = .pkg
10 | deps =
11 |     pytest>=6
12 | commands =
13 |     pytest {tty:--color=yes} {posargs}
14 | 
--------------------------------------------------------------------------------