├── .style.yapf ├── README.md ├── setup.cfg ├── sportsref ├── nba │ ├── __init__.py │ ├── teams.py │ ├── players.py │ ├── seasons.py │ ├── boxscores.py │ └── pbp.py ├── options.py ├── __init__.py ├── nfl │ ├── __init__.py │ ├── winProb.py │ ├── finders │ │ ├── __init__.py │ │ ├── PSF.py │ │ └── GPF.py │ ├── seasons.py │ ├── players.py │ ├── boxscores.py │ ├── teams.py │ └── pbp.py ├── decorators.py └── utils.py ├── setup.py ├── pyproject.toml ├── .pre-commit-config.yaml └── .gitignore /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | COLUMN_LIMIT=100 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sportsref 2 | Scraping sports data from sports-reference.com and related sites 3 | 4 | NOTE: Very much still a WIP. Feel free to use, just bear in mind that the API 5 | is subject to change. Documentation is on the to-do list, once the API is a bit 6 | more rigid. 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 100 3 | multi_line_output = 3 4 | include_trailing_comma = True 5 | 6 | [flake8] 7 | max-line-length = 100 8 | exclude = 9 | .git, 10 | .venv, 11 | build, 12 | dist 13 | ignore = 14 | E203, # whitespace before ':' 15 | E265, # block comment 16 | W503, # line break before binary operator (e.g. `and` or `or`) 17 | -------------------------------------------------------------------------------- /sportsref/nba/__init__.py: -------------------------------------------------------------------------------- 1 | from . import boxscores 2 | from . import pbp 3 | from . import seasons 4 | from . 
import teams 5 | 6 | from .boxscores import BoxScore 7 | from .seasons import Season 8 | from .teams import Team 9 | from .players import Player 10 | 11 | BASE_URL = "http://www.basketball-reference.com" 12 | 13 | __all__ = [ 14 | "BASE_URL", 15 | "boxscores", 16 | "BoxScore", 17 | "pbp", 18 | "seasons", 19 | "Season", 20 | "teams", 21 | "Team", 22 | "players", 23 | "Player", 24 | ] 25 | -------------------------------------------------------------------------------- /sportsref/options.py: -------------------------------------------------------------------------------- 1 | OPTIONS = {"cache": True, "memoize": True} 2 | 3 | 4 | def get_option(option): 5 | option = option.lower() 6 | if option in OPTIONS: 7 | return OPTIONS[option] 8 | else: 9 | # TODO: log 10 | print(f"option {option} not recognized") 11 | return None 12 | 13 | 14 | def set_option(option, value): 15 | option = option.lower() 16 | if option in OPTIONS: 17 | OPTIONS[option] = value 18 | else: 19 | # TODO: log 20 | print(f"option {option} not recognized") 21 | -------------------------------------------------------------------------------- /sportsref/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | SITE_ABBREV = { 4 | "http://www.pro-football-reference.com": "pfr", 5 | "http://www.basketball-reference.com": "bkref", 6 | "http://www.sports-reference.com/cfb": "ncaaf", 7 | "http://www.sports-reference.com/cbb": "ncaab", 8 | } 9 | 10 | from sportsref.options import get_option, set_option 11 | from sportsref import decorators, utils, nfl, nba 12 | 13 | __all__ = [ 14 | "decorators", 15 | "utils", 16 | "nfl", 17 | "nba", 18 | "get_option", 19 | "set_option", 20 | "SITE_ABBREV", 21 | ] 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="sportsref", 5 | version="0.13.0", 6 | description="Scraping data from sports-reference.com and related sites", 7 | url="https://github.com/mdgoldberg/sportsref", 8 | author="Matt Goldberg", 9 | author_email="matt.goldberg7@gmail.com", 10 | packages=find_packages(), 11 | install_requires=[ 12 | "appdirs", 13 | "boltons", 14 | "mementos", 15 | "numexpr", 16 | "numpy", 17 | "pandas", 18 | "pyquery", 19 | "requests", 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sportsref" 3 | version = "0.13.0" 4 | description = "" 5 | authors = ["Matt Goldberg <matt.goldberg7@gmail.com>"] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.6.1" 9 | mementos = "^1.3.1" 10 | numexpr = "^2.7.1" 11 | numpy = "^1.19.4" 12 | pandas = "^1.1.4" 13 | pyquery = "^1.4.3" 14 | requests = "^2.25.0" 15 | 16 | [tool.poetry.dev-dependencies] 17 | black = "^20.8b1" 18 | flake8 = "^3.8.4" 19 | ipdb = "^0.13.4" 20 | ipython = "^7.15.0" 21 | jupyter = "^1.0.0" 22 | pre-commit = "^2.9.2" 23 | pylint = "^2.6.0" 24 | 25 | [build-system] 26 | requires = ["poetry-core>=1.0.0"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3
| repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.0.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-added-large-files 11 | - repo: https://github.com/prettier/prettier 12 | rev: 1.18.2 13 | hooks: 14 | - id: prettier 15 | - repo: https://github.com/psf/black 16 | rev: stable 17 | hooks: 18 | - id: black 19 | # - repo: https://github.com/pre-commit/mirrors-mypy 20 | # rev: "v0.782" 21 | # hooks: 22 | # - id: mypy 23 | -------------------------------------------------------------------------------- /sportsref/nfl/__init__.py: -------------------------------------------------------------------------------- 1 | from . import finders 2 | from . import teams 3 | from . import players 4 | from . import boxscores 5 | 6 | # from . import winProb 7 | from . import pbp 8 | 9 | from .players import Player 10 | from .seasons import Season 11 | from .teams import Team 12 | from .boxscores import BoxScore 13 | from .finders import GamePlayFinder, PlayerSeasonFinder 14 | 15 | BASE_URL = "http://www.pro-football-reference.com" 16 | 17 | # modules/variables to expose 18 | __all__ = [ 19 | "BASE_URL", 20 | "finders", 21 | "GamePlayFinder", 22 | "PlayerSeasonFinder", 23 | "boxscores", 24 | "BoxScore", 25 | "players", 26 | "Player", 27 | "seasons", 28 | "Season", 29 | "teams", 30 | "Team", 31 | # "winProb", 32 | "pbp", 33 | ] 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Misc 60 | **/.DS_Store 61 | **/*.swp 62 | **/*.json 63 | **/.R* 64 | scripts/ 65 | csv/ 66 | -------------------------------------------------------------------------------- /sportsref/nfl/winProb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm 3 | 4 | 5 | def initialWinProb(line): 6 | """Gets the initial win probability of a game given its Vegas line. 7 | 8 | :line: The Vegas line from the home team's perspective (negative means 9 | home team is favored). 10 | :returns: A float in [0., 100.] that represents the win probability. 
11 | """ 12 | line = float(line) 13 | probWin = 1.0 - norm.cdf(0.5, -line, 13.86) 14 | probTie = norm.cdf(0.5, -line, 13.86) - norm.cdf(-0.5, -line, 13.86) 15 | return 100.0 * (probWin + 0.5 * probTie) 16 | 17 | 18 | def winProb(line, margin, secsElapsed, expPts): 19 | line = float(line) 20 | margin = float(margin) 21 | expPts = float(expPts) 22 | baseMean = -line 23 | baseStd = 13.46 24 | expMargin = margin + expPts 25 | minRemain = 60 - secsElapsed / 60 + 0.00001 26 | adjMean = baseMean * minRemain / 60 27 | adjStd = baseStd / np.sqrt(60 / minRemain) 28 | probWin = 1.0 - norm.cdf(-expMargin + 0.5, adjMean, adjStd) 29 | probTie = norm.cdf(-expMargin + 0.5, adjMean, adjStd) - norm.cdf( 30 | -expMargin - 0.5, adjMean, adjStd 31 | ) 32 | return 100.0 * (probWin + 0.5 * probTie) 33 | -------------------------------------------------------------------------------- /sportsref/nba/teams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyquery import PyQuery as pq 3 | 4 | import sportsref 5 | 6 | 7 | class Team(object, metaclass=sportsref.decorators.Cached): 8 | def __init__(self, team_id): 9 | self.team_id = team_id.upper() 10 | 11 | def __eq__(self, other): 12 | return self.team_id == other.team_id 13 | 14 | def __hash__(self): 15 | return hash(self.team_id) 16 | 17 | @sportsref.decorators.memoize 18 | def team_year_url(self, yr_str): 19 | return f"{sportsref.nba.BASE_URL}/teams/{self.team_id}/{yr_str}.htm" 20 | 21 | @sportsref.decorators.memoize 22 | def get_main_doc(self): 23 | team_url = f"{sportsref.nba.BASE_URL}/teams/{self.team_id}" 24 | main_doc = pq(sportsref.utils.get_html(team_url)) 25 | return main_doc 26 | 27 | @sportsref.decorators.memoize 28 | def get_year_doc(self, yr_str): 29 | return pq(sportsref.utils.get_html(self.team_year_url(yr_str))) 30 | 31 | @sportsref.decorators.memoize 32 | def name(self): 33 | """Returns the real name of the franchise given the team ID. 34 | 35 | Examples: 36 | 'BOS' -> 'Boston Celtics' 37 | 'NJN' -> 'Brooklyn Nets' 38 | 39 | :returns: A string corresponding to the team's full name. 40 | """ 41 | doc = self.get_main_doc() 42 | name = doc('div#info h1[itemprop="name"]').text() 43 | return name 44 | 45 | @sportsref.decorators.memoize 46 | def roster(self, year): 47 | """Returns the roster table for the given year. 48 | 49 | :year: The year for which we want the roster; defaults to current year. 50 | :returns: A DataFrame containing roster information for that year. 51 | """ 52 | doc = self.get_year_doc(year) 53 | table = doc("table#roster") 54 | df = sportsref.utils.parse_table(table) 55 | df["years_experience"] = ( 56 | df["years_experience"].replace("R", 0).replace("", np.nan).astype(float) 57 | ) 58 | return df 59 | 60 | # TODO: kind_rpb 61 | @sportsref.decorators.memoize 62 | def schedule(self, year): 63 | """Gets schedule information for a team-season. 64 | 65 | :year: The year for which we want the schedule. 66 | :returns: DataFrame of schedule information. 67 | """ 68 | doc = self.get_year_doc(f"{year}_games") 69 | table = doc("table#games") 70 | df = sportsref.utils.parse_table(table) 71 | return df 72 | -------------------------------------------------------------------------------- /sportsref/nfl/finders/__init__.py: -------------------------------------------------------------------------------- 1 | from . import GPF 2 | from . 
import PSF 3 | 4 | from .PSF import PlayerSeasonFinder 5 | from .GPF import GamePlayFinder 6 | 7 | # modules/variables to expose 8 | __all__ = ["PlayerSeasonFinder", "GamePlayFinder"] 9 | 10 | # Fill in PlayerSeasonFinder docstring 11 | 12 | IOD = PSF.inputs_options_defaults() 13 | 14 | paramStr = "\n".join( 15 | ':param {}: default="{}"'.format(name, ",".join(dct["value"])) 16 | for name, dct in sorted(IOD.items()) 17 | ) 18 | optsStr = "\n".join( 19 | "{}: {}".format(name, ",".join('"{}"'.format(opt) for opt in dct["options"])) 20 | if len(dct["options"]) <= 10 21 | else "{}: {}...{}".format( 22 | name, 23 | ",".join('"{}"'.format(opt) for opt in dct["options"][:10]), 24 | ",".join('"{}"'.format(opt) for opt in dct["options"][-2:]), 25 | ) 26 | for name, dct in sorted(IOD.items()) 27 | ) 28 | 29 | 30 | PSF.PlayerSeasonFinder.__doc__ = """ 31 | Finds player-seasons that match criteria supplied by keyword arguments. 32 | 33 | * Can use tm or team for team_id. 34 | * Can use yr, year, yrs, or years for year_min, year_max. 35 | * Can use [draft_]pos, [draft_]position, [draft_]positions for a shortcut for 36 | [draft_]positions. 37 | 38 | Options for inputs: 39 | {} 40 | 41 | {} 42 | :returns: list of matching player-season tuples 43 | :rtype: [(player ID, season year)] 44 | 45 | """.format( 46 | paramStr, optsStr 47 | ) 48 | 49 | # clean up namespace 50 | del IOD, paramStr, optsStr 51 | 52 | 53 | # Fill in GamePlayFinder docstring 54 | 55 | IOD = GPF.inputs_options_defaults() 56 | 57 | paramStr = "\n".join( 58 | ':param {}: default="{}"'.format(name, ",".join(dct["value"])) 59 | for name, dct in sorted(IOD.items()) 60 | ) 61 | 62 | optsStr = "\n".join( 63 | "{}: {}".format(name, ",".join('"{}"'.format(opt) for opt in dct["options"])) 64 | if len(dct["options"]) <= 10 65 | else "{}: {}...{}".format( 66 | name, 67 | ",".join('"{}"'.format(opt) for opt in dct["options"][:10]), 68 | ",".join('"{}"'.format(opt) for opt in dct["options"][-2:]), 69 | ) 70 | for name, dct in sorted(IOD.items()) 71 | ) 72 | 73 | GPF.GamePlayFinder.__doc__ = """ 74 | Finds plays that match criteria supplied by keyword arguments. 75 | 76 | * Can use tm or team instead of team_id. 77 | * Can use yr, year, yrs, or years instead of year_min, year_max. 78 | * For multi-valued options (like down or rush direction), separate values with 79 | commas or use a list. 80 | * For options that are yes/no/either or yes/no/any, -1 is either/any, 0 is no, 81 | 1 is yes. 82 | 83 | Options for the inputs: 84 | {} 85 | 86 | {} 87 | :returns: Pandas dataframe of plays 88 | :rtype: pd.DataFrame 89 | """.format( 90 | paramStr, optsStr 91 | ) 92 | 93 | # clean up namespace 94 | del IOD, paramStr, optsStr 95 | -------------------------------------------------------------------------------- /sportsref/nfl/seasons.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | 3 | import sportsref 4 | 5 | 6 | __all__ = ["Season"] 7 | 8 | 9 | class Season(object, metaclass=sportsref.decorators.Cached): 10 | 11 | """Object representing a given NFL season.""" 12 | 13 | def __init__(self, year): 14 | """Initializes a Season object for an NFL season. 15 | 16 | :year: The year of the season we want. 
17 | """ 18 | self.yr = int(year) 19 | 20 | def __eq__(self, other): 21 | return self.yr == other.yr 22 | 23 | def __hash__(self): 24 | return hash(self.yr) 25 | 26 | def __repr__(self): 27 | return "Season({})".format(self.yr) 28 | 29 | def _subpage_url(self, page): 30 | return sportsref.nfl.BASE_URL + "/years/{}/{}.htm".format(self.yr, page) 31 | 32 | @sportsref.decorators.memoize 33 | def get_main_doc(self): 34 | """Returns PyQuery object for the main season URL. 35 | :returns: PyQuery object. 36 | """ 37 | url = sportsref.nfl.BASE_URL + "/years/{}/".format(self.yr) 38 | return pq(sportsref.utils.get_html(url)) 39 | 40 | @sportsref.decorators.memoize 41 | def get_sub_doc(self, subpage): 42 | """Returns PyQuery object for a given subpage URL. 43 | :subpage: The subpage of the season, e.g. 'per_game'. 44 | :returns: PyQuery object. 45 | """ 46 | html = sportsref.utils.get_html(self._subpage_url(subpage)) 47 | return pq(html) 48 | 49 | @sportsref.decorators.memoize 50 | def get_team_ids(self): 51 | """Returns a list of the team IDs for the given year. 52 | :returns: List of team IDs. 53 | """ 54 | return sportsref.nfl.teams.list_teams(self.yr) 55 | 56 | @sportsref.decorators.memoize 57 | def team_ids_to_names(self): 58 | """Mapping from 3-letter team IDs to full team names. 59 | 60 | :returns: Dictionary with team IDs as keys and full team strings as 61 | values. 62 | """ 63 | return sportsref.nfl.teams.team_names(self.yr) 64 | 65 | @sportsref.decorators.memoize 66 | def team_names_to_ids(self): 67 | """Mapping from full team names to 3-letter team IDs. 68 | :returns: Dictionary with tean names as keys and team IDs as values. 69 | """ 70 | return sportsref.nfl.teams.team_ids(self.yr) 71 | 72 | @sportsref.decorators.memoize 73 | def _get_player_stats_table(self, subpage, table_id): 74 | """Helper function for player season stats. 75 | 76 | :identifier: string identifying the type of stat, e.g. 'passing'. 77 | :returns: A DataFrame of stats. 78 | """ 79 | doc = self.get_sub_doc(subpage) 80 | table = doc("table#{}".format(table_id)) 81 | df = sportsref.utils.parse_table(table) 82 | return df 83 | 84 | def player_stats_passing(self): 85 | """Returns a DataFrame of passing player stats for a season.""" 86 | return self._get_player_stats_table("passing", "passing") 87 | 88 | def player_stats_rushing(self): 89 | """Returns a DataFrame of rushing player stats for a season.""" 90 | return self._get_player_stats_table("rushing", "rushing_and_receiving") 91 | 92 | def player_stats_receiving(self): 93 | """Returns a DataFrame of receiving player stats for a season.""" 94 | return self._get_player_stats_table("receiving", "receiving") 95 | -------------------------------------------------------------------------------- /sportsref/decorators.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import datetime 3 | import functools 4 | import getpass 5 | import hashlib 6 | import os 7 | import re 8 | import time 9 | 10 | import appdirs 11 | import mementos 12 | import pandas as pd 13 | from pyquery import PyQuery as pq 14 | 15 | import sportsref 16 | 17 | 18 | # TODO: move PSFConstants and GPFConstants to appdirs cache dir 19 | def switch_to_dir(dir_path): 20 | """ 21 | Decorator that switches to given directory before executing function, and 22 | then returning to orignal directory. 
23 | """ 24 | 25 | def decorator(func): 26 | @functools.wraps(func) 27 | def wrapper(*args, **kwargs): 28 | orig_cwd = os.getcwd() 29 | os.chdir(dir_path) 30 | ret = func(*args, **kwargs) 31 | os.chdir(orig_cwd) 32 | return ret 33 | 34 | return wrapper 35 | 36 | return decorator 37 | 38 | 39 | def _days_valid_pfr(url): 40 | # boxscores are static, but refresh quarterly to be sure 41 | if "boxscore" in url: 42 | return 90 43 | # important dates 44 | today = datetime.date.today() 45 | start_of_season = datetime.date(today.year, 8, 15) 46 | end_of_season = datetime.date(today.year, 2, 15) 47 | # check for a year in the filename 48 | m = re.search(r"(\d{4})", url) 49 | if m: 50 | # if it was a year prior to the current season, we're good 51 | year = int(m.group(1)) 52 | cur_season = today.year - (today <= end_of_season) 53 | if year < cur_season: 54 | return 90 55 | # if it's the offseason, refresh cache twice a month 56 | if end_of_season < today < start_of_season: 57 | return 15 58 | # otherwise, refresh every 2 days 59 | return 2 60 | 61 | 62 | def _days_valid_bkref(url): 63 | # boxscores are static, but refresh quarterly to be sure 64 | if "boxscore" in url: 65 | return 90 66 | # important dates 67 | today = datetime.date.today() 68 | start_of_season = datetime.date(today.year, 10, 1) 69 | end_of_season = datetime.date(today.year, 7, 1) 70 | # check for a year in the filename 71 | m = re.search(r"(\d{4})", url) 72 | if m: 73 | # if it was a year prior to the current season, we're good 74 | year = int(m.group(1)) 75 | cur_season = today.year - (today <= end_of_season) + 1 76 | if year < cur_season: 77 | return 90 78 | # if it's the offseason, refresh cache once a month 79 | if end_of_season < today < start_of_season: 80 | return 30 81 | # otherwise, refresh every 2 days 82 | return 2 83 | 84 | 85 | def _days_valid_cfb(url): 86 | # TODO: caching for CFB 87 | return 365 88 | 89 | 90 | def cache(func): 91 | """Caches the HTML returned by the specified function `func`. Caches it in 92 | the user cache determined by the appdirs package. 
93 | """ 94 | 95 | CACHE_DIR = appdirs.user_cache_dir("sportsref", getpass.getuser()) 96 | os.makedirs(CACHE_DIR, exist_ok=True) 97 | 98 | @functools.wraps(func) 99 | def wrapper(url): 100 | # hash based on the URL 101 | file_hash = hashlib.md5() 102 | encoded_url = url.encode(errors="replace") 103 | file_hash.update(encoded_url) 104 | file_hash = file_hash.hexdigest() 105 | filename = f"{CACHE_DIR}/{file_hash}" 106 | 107 | sport_id = None 108 | for a_base_url, a_sport_id in sportsref.SITE_ABBREV.items(): 109 | if url.startswith(a_base_url): 110 | sport_id = a_sport_id 111 | break 112 | else: 113 | # TODO: log 114 | print(f"No sport ID found for {url}, not able to check cache") 115 | 116 | # check whether cache is valid or stale 117 | file_exists = os.path.isfile(filename) 118 | if sport_id and file_exists: 119 | cur_time = int(time.time()) 120 | mod_time = int(os.path.getmtime(filename)) 121 | days_since_mod = datetime.timedelta(seconds=(cur_time - mod_time)).days 122 | # TODO: refactor _days_valid_ functions to not use globals 123 | days_cache_valid = globals()[f"_days_valid_{sport_id}"](url) 124 | cache_is_valid = days_since_mod < days_cache_valid 125 | else: 126 | cache_is_valid = False 127 | 128 | # if file found and cache is valid, read from file 129 | allow_caching = sportsref.get_option("cache") 130 | if file_exists and cache_is_valid and allow_caching: 131 | with open(filename, "r", encoding="utf-8", errors="replace") as f: 132 | text = f.read() 133 | # otherwise, execute function and cache results 134 | else: 135 | text = func(url) 136 | with open(filename, "w+", encoding="utf-8") as f: 137 | f.write(text) 138 | return text 139 | 140 | return wrapper 141 | 142 | 143 | def get_class_instance_key(cls, args, kwargs): 144 | """ 145 | Returns a unique identifier for a class instantiation. 146 | """ 147 | identifiers = [id(cls)] 148 | for arg in args: 149 | identifiers.append(id(arg)) 150 | identifiers.extend((k, id(v)) for k, v in list(kwargs.items())) 151 | return tuple(sorted(identifiers)) 152 | 153 | 154 | # used as a metaclass for classes that should be memoized 155 | # (technically not a decorator, but it's similar enough) 156 | Cached = mementos.memento_factory("Cached", get_class_instance_key) 157 | 158 | 159 | def memoize(fun): 160 | """A decorator for memoizing functions. 161 | 162 | Only works on functions that take simple arguments - arguments that take 163 | list-like or dict-like arguments will not be memoized, and this function 164 | will raise a TypeError. 
165 | """ 166 | 167 | @functools.wraps(fun) 168 | def wrapper(*args, **kwargs): 169 | 170 | do_memoization = sportsref.get_option("memoize") 171 | if not do_memoization: 172 | return fun(*args, **kwargs) 173 | 174 | hash_args = tuple(args) 175 | hash_kwargs = frozenset(sorted(kwargs.items())) 176 | key = (hash_args, hash_kwargs) 177 | 178 | def _copy(v): 179 | if isinstance(v, pq): 180 | return v.clone() 181 | else: 182 | return copy.deepcopy(v) 183 | 184 | try: 185 | ret = _copy(cache[key]) 186 | return ret 187 | except KeyError: 188 | cache[key] = fun(*args, **kwargs) 189 | ret = _copy(cache[key]) 190 | return ret 191 | except TypeError: 192 | print( 193 | f"memoization type error in function {fun.__name__} for arguments {key}" 194 | ) 195 | raise 196 | 197 | cache = {} 198 | return wrapper 199 | 200 | 201 | def kind_rpb(include_type=False): 202 | def decorator(fun): 203 | """Supports functions that return a DataFrame and have a `kind` keyword 204 | argument that specifies regular season ('R'), playoffs ('P'), or both 205 | ('B'). If given 'B', it will call the function with both 'R' and 'P' 206 | and concatenate the results. 207 | """ 208 | 209 | @functools.wraps(fun) 210 | def wrapper(*args, **kwargs): 211 | kind = kwargs.get("kind", "R").upper() 212 | if kind == "B": 213 | kwargs["kind"] = "R" 214 | reg = fun(*args, **kwargs) 215 | if include_type: 216 | reg["is_playoffs"] = False 217 | kwargs["kind"] = "P" 218 | poffs = fun(*args, **kwargs) 219 | if include_type: 220 | poffs["is_playoffs"] = True 221 | return pd.concat((reg, poffs), ignore_index=True) 222 | else: 223 | df = fun(*args, **kwargs) 224 | if include_type: 225 | df["is_playoffs"] = kind == "P" 226 | return df 227 | 228 | return wrapper 229 | 230 | return decorator 231 | -------------------------------------------------------------------------------- /sportsref/nfl/finders/PSF.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | import time 5 | import urllib.parse 6 | 7 | from pyquery import PyQuery as pq 8 | 9 | from ... import decorators, utils 10 | 11 | PSF_URL = "http://www.pro-football-reference.com/" "play-index/psl_finder.cgi" 12 | 13 | PSF_CONSTANTS_FILENAME = "PSFConstants.json" 14 | 15 | 16 | def PlayerSeasonFinder(**kwargs): 17 | """ Docstring will be filled in by __init__.py """ 18 | 19 | if "offset" not in kwargs: 20 | kwargs["offset"] = 0 21 | 22 | playerSeasons = [] 23 | while True: 24 | querystring = _kwargs_to_qs(**kwargs) 25 | url = "{}?{}".format(PSF_URL, querystring) 26 | if kwargs.get("verbose", False): 27 | print(url) 28 | html = utils.get_html(url) 29 | doc = pq(html) 30 | table = doc("table#results") 31 | df = utils.parse_table(table) 32 | if df.empty: 33 | break 34 | 35 | thisSeason = list(zip(df.player_id, df.year)) 36 | playerSeasons.extend(thisSeason) 37 | 38 | if doc('*:contains("Next Page")'): 39 | kwargs["offset"] += 100 40 | else: 41 | break 42 | 43 | return playerSeasons 44 | 45 | 46 | def _kwargs_to_qs(**kwargs): 47 | """Converts kwargs given to PSF to a querystring. 48 | 49 | :returns: the querystring. 
50 | """ 51 | # start with defaults 52 | inpOptDef = inputs_options_defaults() 53 | opts = {name: dct["value"] for name, dct in list(inpOptDef.items())} 54 | 55 | # clean up keys and values 56 | for k, v in list(kwargs.items()): 57 | del kwargs[k] 58 | # bool => 'Y'|'N' 59 | if isinstance(v, bool): 60 | kwargs[k] = "Y" if v else "N" 61 | # tm, team => team_id 62 | elif k.lower() in ("tm", "team"): 63 | kwargs["team_id"] = v 64 | # yr, year, yrs, years => year_min, year_max 65 | elif k.lower() in ("yr", "year", "yrs", "years"): 66 | if isinstance(v, collections.Iterable): 67 | lst = list(v) 68 | kwargs["year_min"] = min(lst) 69 | kwargs["year_max"] = max(lst) 70 | elif isinstance(v, str): 71 | v = list(map(int, v.split(","))) 72 | kwargs["year_min"] = min(v) 73 | kwargs["year_max"] = max(v) 74 | else: 75 | kwargs["year_min"] = v 76 | kwargs["year_max"] = v 77 | # pos, position, positions => pos[] 78 | elif k.lower() in ("pos", "position", "positions"): 79 | if isinstance(v, str): 80 | v = v.split(",") 81 | elif not isinstance(v, collections.Iterable): 82 | v = [v] 83 | kwargs["pos[]"] = v 84 | # draft_pos, ... => draft_pos[] 85 | elif k.lower() in ( 86 | "draft_pos", 87 | "draftpos", 88 | "draftposition", 89 | "draftpositions", 90 | "draft_position", 91 | "draft_positions", 92 | ): 93 | if isinstance(v, str): 94 | v = v.split(",") 95 | elif not isinstance(v, collections.Iterable): 96 | v = [v] 97 | kwargs["draft_pos[]"] = v 98 | # if not one of these cases, put it back in kwargs 99 | else: 100 | kwargs[k] = v 101 | 102 | # update based on kwargs 103 | for k, v in list(kwargs.items()): 104 | # if overwriting a default, overwrite it (with a list so the 105 | # opts -> querystring list comp works) 106 | if k in opts or k in ("pos[]", "draft_pos[]"): 107 | # if multiple values separated by commas, split em 108 | if isinstance(v, str): 109 | v = v.split(",") 110 | # otherwise, make sure it's a list 111 | elif not isinstance(v, collections.Iterable): 112 | v = [v] 113 | # then, add list of values to the querystring dict *opts* 114 | opts[k] = v 115 | if "draft" in k: 116 | opts["draft"] = [1] 117 | 118 | opts["request"] = [1] 119 | opts["offset"] = [kwargs.get("offset", 0)] 120 | 121 | qs = "&".join( 122 | "{}={}".format(urllib.parse.quote_plus(name), val) 123 | for name, vals in sorted(opts.items()) 124 | for val in vals 125 | ) 126 | 127 | return qs 128 | 129 | 130 | @decorators.switch_to_dir(os.path.dirname(os.path.realpath(__file__))) 131 | def inputs_options_defaults(): 132 | """Handles scraping options for player-season finder form. 133 | 134 | :returns: {'name1': {'value': val, 'options': [opt1, ...] }, ... 
} 135 | """ 136 | # set time variables 137 | if os.path.isfile(PSF_CONSTANTS_FILENAME): 138 | modtime = int(os.path.getmtime(PSF_CONSTANTS_FILENAME)) 139 | curtime = int(time.time()) 140 | # if file found and it's been <= a week 141 | if os.path.isfile(PSF_CONSTANTS_FILENAME) and curtime - modtime <= 7 * 24 * 60 * 60: 142 | 143 | # just read the dict from cached file 144 | with open(PSF_CONSTANTS_FILENAME, "r") as const_f: 145 | def_dict = json.load(const_f) 146 | 147 | # otherwise, we must regenerate the dict and rewrite it 148 | else: 149 | 150 | print("Regenerating PSFConstants file") 151 | 152 | html = utils.get_html(PSF_URL) 153 | doc = pq(html) 154 | 155 | def_dict = {} 156 | # start with input elements 157 | for inp in doc("form#psl_finder input[name]"): 158 | name = inp.attrib["name"] 159 | # add blank dict if not present 160 | if name not in def_dict: 161 | def_dict[name] = { 162 | "value": set(), 163 | "options": set(), 164 | "type": inp.attrib["type"], 165 | } 166 | 167 | # handle checkboxes and radio buttons 168 | if inp.attrib["type"] in ("checkbox", "radio"): 169 | # deal with default value 170 | if "checked" in inp.attrib: 171 | def_dict[name]["value"].add(inp.attrib["value"]) 172 | # add to options 173 | def_dict[name]["options"].add(inp.attrib["value"]) 174 | # handle other types of inputs (only other type is hidden?) 175 | else: 176 | def_dict[name]["value"].add(inp.attrib.get("value", "")) 177 | 178 | # deal with dropdowns (select elements) 179 | for sel in doc.items("form#psl_finder select[name]"): 180 | name = sel.attr["name"] 181 | # add blank dict if not present 182 | if name not in def_dict: 183 | def_dict[name] = {"value": set(), "options": set(), "type": "select"} 184 | 185 | # deal with default value 186 | defaultOpt = sel("option[selected]") 187 | if len(defaultOpt): 188 | defaultOpt = defaultOpt[0] 189 | def_dict[name]["value"].add(defaultOpt.attrib.get("value", "")) 190 | else: 191 | def_dict[name]["value"].add(sel("option")[0].attrib.get("value", "")) 192 | 193 | # deal with options 194 | def_dict[name]["options"] = { 195 | opt.attrib["value"] for opt in sel("option") if opt.attrib.get("value") 196 | } 197 | 198 | def_dict.pop("request", None) 199 | def_dict.pop("use_favorites", None) 200 | 201 | with open(PSF_CONSTANTS_FILENAME, "w+") as f: 202 | for k in def_dict: 203 | try: 204 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"]), key=int) 205 | def_dict[k]["options"] = sorted( 206 | list(def_dict[k]["options"]), key=int 207 | ) 208 | except Exception: 209 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"])) 210 | def_dict[k]["options"] = sorted(list(def_dict[k]["options"])) 211 | json.dump(def_dict, f) 212 | 213 | return def_dict 214 | -------------------------------------------------------------------------------- /sportsref/nfl/finders/GPF.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | import time 5 | 6 | from pyquery import PyQuery as pq 7 | 8 | from ... import decorators, utils 9 | from .. 
import pbp 10 | 11 | GPF_URL = "http://www.pro-football-reference.com/" "play-index/play_finder.cgi" 12 | 13 | GPF_CONSTANTS_FILENAME = "GPFConstants.json" 14 | 15 | 16 | def GamePlayFinder(**kwargs): 17 | """ Docstring will be filled in by __init__.py """ 18 | 19 | querystring = _kwargs_to_qs(**kwargs) 20 | url = "{}?{}".format(GPF_URL, querystring) 21 | # if verbose, print url 22 | if kwargs.get("verbose", False): 23 | print(url) 24 | html = utils.get_html(url) 25 | doc = pq(html) 26 | 27 | # parse 28 | table = doc("table#all_plays") 29 | plays = utils.parse_table(table) 30 | 31 | # parse score column 32 | if "score" in plays.columns: 33 | oScore, dScore = list(zip(*plays.score.apply(lambda s: s.split("-")))) 34 | plays["teamScore"] = oScore 35 | plays["oppScore"] = dScore 36 | # add parsed pbp info 37 | if "description" in plays.columns: 38 | plays = pbp.expand_details(plays, detailCol="description") 39 | 40 | return plays 41 | 42 | 43 | def _kwargs_to_qs(**kwargs): 44 | """Converts kwargs given to GPF to a querystring. 45 | 46 | :returns: the querystring. 47 | """ 48 | # start with defaults 49 | inpOptDef = inputs_options_defaults() 50 | opts = {name: dct["value"] for name, dct in list(inpOptDef.items())} 51 | 52 | # clean up keys and values 53 | for k, v in list(kwargs.items()): 54 | # pID, playerID => player_id 55 | if k.lower() in ("pid", "playerid"): 56 | del kwargs[k] 57 | kwargs["player_id"] = v 58 | # player_id can accept rel URLs 59 | if k == "player_id": 60 | if v.startswith("/players/"): 61 | kwargs[k] = utils.rel_url_to_id(v) 62 | # bool => 'Y'|'N' 63 | if isinstance(v, bool): 64 | kwargs[k] = "Y" if v else "N" 65 | # tm, team => team_id 66 | if k.lower() in ("tm", "team"): 67 | del kwargs[k] 68 | kwargs["team_id"] = v 69 | # yr_min, yr_max => year_min, year_max 70 | if k.lower() in ("yr_min", "yr_max"): 71 | del kwargs[k] 72 | if k.lower() == "yr_min": 73 | kwargs["year_min"] = int(v) 74 | else: 75 | kwargs["year_max"] = int(v) 76 | # wk_min, wk_max => week_num_min, week_num_max 77 | if k.lower() in ("wk_min", "wk_max"): 78 | del kwargs[k] 79 | if k.lower() == "wk_min": 80 | kwargs["week_num_min"] = int(v) 81 | else: 82 | kwargs["week_num_max"] = int(v) 83 | # yr, year, yrs, years => year_min, year_max 84 | if k.lower() in ("yr", "year", "yrs", "years"): 85 | del kwargs[k] 86 | if isinstance(v, collections.Iterable): 87 | lst = list(v) 88 | kwargs["year_min"] = min(lst) 89 | kwargs["year_max"] = max(lst) 90 | elif isinstance(v, str): 91 | v = list(map(int, v.split(","))) 92 | kwargs["year_min"] = min(v) 93 | kwargs["year_max"] = max(v) 94 | else: 95 | kwargs["year_min"] = v 96 | kwargs["year_max"] = v 97 | # wk, week, wks, weeks => week_num_min, week_num_max 98 | if k.lower() in ("wk", "week", "wks", "weeks"): 99 | del kwargs[k] 100 | if isinstance(v, collections.Iterable): 101 | lst = list(v) 102 | kwargs["week_num_min"] = min(lst) 103 | kwargs["week_num_max"] = max(lst) 104 | elif isinstance(v, str): 105 | v = list(map(int, v.split(","))) 106 | kwargs["week_num_min"] = min(v) 107 | kwargs["week_num_max"] = max(v) 108 | else: 109 | kwargs["week_num_min"] = v 110 | kwargs["week_num_max"] = v 111 | # if playoff_round defined, then turn on playoff flag 112 | if k == "playoff_round": 113 | kwargs["game_type"] = "P" 114 | if isinstance(v, str): 115 | v = v.split(",") 116 | if not isinstance(v, collections.Iterable): 117 | v = [v] 118 | 119 | # reset values to blank for defined kwargs 120 | for k in kwargs: 121 | if k in opts: 122 | opts[k] = [] 123 | 124 | # update based on 
kwargs 125 | for k, v in list(kwargs.items()): 126 | # if overwriting a default, overwrite it 127 | if k in opts: 128 | # if multiple values separated by commas, split em 129 | if isinstance(v, str): 130 | v = v.split(",") 131 | elif not isinstance(v, collections.Iterable): 132 | v = [v] 133 | for val in v: 134 | opts[k].append(val) 135 | 136 | opts["request"] = [1] 137 | 138 | qs = "&".join( 139 | "{}={}".format(name, val) for name, vals in sorted(opts.items()) for val in vals 140 | ) 141 | 142 | return qs 143 | 144 | 145 | @decorators.switch_to_dir(os.path.dirname(os.path.realpath(__file__))) 146 | def inputs_options_defaults(): 147 | """Handles scraping options for play finder form. 148 | 149 | :returns: {'name1': {'value': val, 'options': [opt1, ...] }, ... } 150 | 151 | """ 152 | # set time variables 153 | if os.path.isfile(GPF_CONSTANTS_FILENAME): 154 | modtime = int(os.path.getmtime(GPF_CONSTANTS_FILENAME)) 155 | curtime = int(time.time()) 156 | # if file found and it's been <= a week 157 | if os.path.isfile(GPF_CONSTANTS_FILENAME) and curtime - modtime <= 7 * 24 * 60 * 60: 158 | 159 | # just read the dict from the cached file 160 | with open(GPF_CONSTANTS_FILENAME, "r") as const_f: 161 | def_dict = json.load(const_f) 162 | 163 | # otherwise, we must regenerate the dict and rewrite it 164 | else: 165 | 166 | print("Regenerating GPFConstants file") 167 | 168 | html = utils.get_html(GPF_URL) 169 | doc = pq(html) 170 | 171 | def_dict = {} 172 | # start with input elements 173 | for inp in doc("form#play_finder input[name]"): 174 | name = inp.attrib["name"] 175 | # add blank dict if not present 176 | if name not in def_dict: 177 | def_dict[name] = {"value": set(), "options": set(), "type": inp.type} 178 | 179 | val = inp.attrib.get("value", "") 180 | # handle checkboxes and radio buttons 181 | if inp.type in ("checkbox", "radio"): 182 | # deal with default value 183 | if "checked" in inp.attrib: 184 | def_dict[name]["value"].add(val) 185 | # add to options 186 | def_dict[name]["options"].add(val) 187 | # handle other types of inputs (only other type is hidden?) 
188 | else: 189 | def_dict[name]["value"].add(val) 190 | 191 | # for dropdowns (select elements) 192 | for sel in doc.items("form#play_finder select[name]"): 193 | name = sel.attr["name"] 194 | # add blank dict if not present 195 | if name not in def_dict: 196 | def_dict[name] = {"value": set(), "options": set(), "type": "select"} 197 | 198 | # deal with default value 199 | defaultOpt = sel("option[selected]") 200 | if len(defaultOpt): 201 | defaultOpt = defaultOpt[0] 202 | def_dict[name]["value"].add(defaultOpt.attrib.get("value", "")) 203 | else: 204 | def_dict[name]["value"].add(sel("option")[0].attrib.get("value", "")) 205 | 206 | # deal with options 207 | def_dict[name]["options"] = { 208 | opt.attrib["value"] for opt in sel("option") if opt.attrib.get("value") 209 | } 210 | 211 | # ignore QB kneels by default 212 | def_dict["include_kneels"]["value"] = ["0"] 213 | 214 | def_dict.pop("request", None) 215 | def_dict.pop("use_favorites", None) 216 | 217 | with open(GPF_CONSTANTS_FILENAME, "w+") as f: 218 | for k in def_dict: 219 | try: 220 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"]), key=int) 221 | def_dict[k]["options"] = sorted( 222 | list(def_dict[k]["options"]), key=int 223 | ) 224 | except Exception: 225 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"])) 226 | def_dict[k]["options"] = sorted(list(def_dict[k]["options"])) 227 | json.dump(def_dict, f) 228 | 229 | return def_dict 230 | -------------------------------------------------------------------------------- /sportsref/nba/players.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | 4 | from pyquery import PyQuery as pq 5 | 6 | import sportsref 7 | 8 | __all__ = ["Player"] 9 | 10 | 11 | class Player(object, metaclass=sportsref.decorators.Cached): 12 | 13 | """Each instance of this class represents an NBA player, uniquely 14 | identified by a player ID. The instance methods give various data available 15 | from the player's Basketball Reference player page.""" 16 | 17 | def __init__(self, player_id): 18 | self.player_id = player_id 19 | self.url_base = f"{sportsref.nba.BASE_URL}/players/{player_id[0]}/{player_id}" 20 | self.main_url = self.url_base + ".htm" 21 | 22 | def __eq__(self, other): 23 | return self.player_id == other.player_id 24 | 25 | def __hash__(self): 26 | return hash(self.player_id) 27 | 28 | def __repr__(self): 29 | return f"Player({self.player_id})" 30 | 31 | def __str__(self): 32 | return self.name() 33 | 34 | @sportsref.decorators.memoize 35 | def get_main_doc(self): 36 | return pq(sportsref.utils.get_html(self.main_url)) 37 | 38 | @sportsref.decorators.memoize 39 | def get_sub_doc(self, rel_url): 40 | url = f"{self.url_base}/{rel_url}" 41 | return pq(sportsref.utils.get_html(url)) 42 | 43 | @sportsref.decorators.memoize 44 | def name(self): 45 | """Returns the name of the player as a string.""" 46 | doc = self.get_main_doc() 47 | return doc('h1[itemprop="name"]').text() 48 | 49 | @sportsref.decorators.memoize 50 | def age(self, year, month=2, day=1): 51 | """Returns the age of the player on a given date. 52 | 53 | :year: int representing the year. 54 | :month: int representing the month (1-12). 55 | :day: int representing the day within the month (1-31). 56 | :returns: Age in years as a float. 
57 | """ 58 | doc = self.get_main_doc() 59 | date_string = doc('span[itemprop="birthDate"]').attr("data-birth") 60 | regex = r"(\d{4})\-(\d{2})\-(\d{2})" 61 | date_args = list(map(int, re.match(regex, date_string).groups())) 62 | birth_date = datetime.date(*date_args) 63 | age_date = datetime.date(year=year, month=month, day=day) 64 | delta = age_date - birth_date 65 | age = delta.days / 365.0 66 | return age 67 | 68 | @sportsref.decorators.memoize 69 | def position(self): 70 | """TODO: Docstring for position. 71 | :returns: TODO 72 | """ 73 | raise Exception("not yet implemented - nba.Player.position") 74 | 75 | @sportsref.decorators.memoize 76 | def height(self): 77 | """Returns the player's height (in inches). 78 | :returns: An int representing a player's height in inches. 79 | """ 80 | doc = self.get_main_doc() 81 | raw = doc('span[itemprop="height"]').text() 82 | try: 83 | feet, inches = list(map(int, raw.split("-"))) 84 | return feet * 12 + inches 85 | except ValueError: 86 | return None 87 | 88 | @sportsref.decorators.memoize 89 | def weight(self): 90 | """Returns the player's weight (in pounds). 91 | :returns: An int representing a player's weight in pounds. 92 | """ 93 | doc = self.get_main_doc() 94 | raw = doc('span[itemprop="weight"]').text() 95 | try: 96 | weight = re.match(r"(\d+)lb", raw).group(1) 97 | return int(weight) 98 | except ValueError: 99 | return None 100 | 101 | @sportsref.decorators.memoize 102 | def hand(self): 103 | """Returns the player's handedness. 104 | :returns: 'L' for left-handed, 'R' for right-handed. 105 | """ 106 | doc = self.get_main_doc() 107 | hand = re.search(r"Shoots:\s*(L|R)", doc.text()).group(1) 108 | return hand 109 | 110 | @sportsref.decorators.memoize 111 | def draft_pick(self): 112 | """Returns when in the draft the player was picked. 113 | :returns: TODO 114 | """ 115 | doc = self.get_main_doc() 116 | try: 117 | p_tags = doc("div#meta p") 118 | draft_p_tag = next( 119 | p for p in list(p_tags.items()) if p.text().lower().startswith("draft") 120 | ) 121 | draft_pick = int( 122 | re.search(r"(\d+)\w{,3}\s+?overall", draft_p_tag.text()).group(1) 123 | ) 124 | return draft_pick 125 | except Exception: 126 | return None 127 | 128 | @sportsref.decorators.memoize 129 | def draft_year(self): 130 | """Returns the year the player was selected (or undrafted). 131 | :returns: TODO 132 | """ 133 | raise Exception("not yet implemented - nba.Player.draft_year") 134 | 135 | @sportsref.decorators.kind_rpb(include_type=True) 136 | def _get_stats_table(self, table_id, kind="R", summary=False): 137 | """Gets a stats table from the player page; helper function that does 138 | the work for per-game, per-100-poss, etc. stats. 139 | 140 | :table_id: the ID of the HTML table. 141 | :kind: specifies regular season, playoffs, or both. One of 'R', 'P', 142 | 'B'. Defaults to 'R'. 143 | :returns: A DataFrame of stats. 
144 | """ 145 | doc = self.get_main_doc() 146 | table_id = f"table#{'playoffs_' if kind == 'P' else ''}{table_id}" 147 | table = doc(table_id) 148 | df = sportsref.utils.parse_table(table, flatten=(not summary), footer=summary) 149 | return df 150 | 151 | @sportsref.decorators.memoize 152 | def stats_per_game(self, kind="R", summary=False): 153 | """Returns a DataFrame of per-game box score stats.""" 154 | return self._get_stats_table("per_game", kind=kind, summary=summary) 155 | 156 | @sportsref.decorators.memoize 157 | def stats_totals(self, kind="R", summary=False): 158 | """Returns a DataFrame of total box score statistics by season.""" 159 | return self._get_stats_table("totals", kind=kind, summary=summary) 160 | 161 | @sportsref.decorators.memoize 162 | def stats_per36(self, kind="R", summary=False): 163 | """Returns a DataFrame of per-36-minutes stats.""" 164 | return self._get_stats_table("per_minute", kind=kind, summary=summary) 165 | 166 | @sportsref.decorators.memoize 167 | def stats_per100(self, kind="R", summary=False): 168 | """Returns a DataFrame of per-100-possession stats.""" 169 | return self._get_stats_table("per_poss", kind=kind, summary=summary) 170 | 171 | @sportsref.decorators.memoize 172 | def stats_advanced(self, kind="R", summary=False): 173 | """Returns a DataFrame of advanced stats.""" 174 | return self._get_stats_table("advanced", kind=kind, summary=summary) 175 | 176 | @sportsref.decorators.memoize 177 | def stats_shooting(self, kind="R", summary=False): 178 | """Returns a DataFrame of shooting stats.""" 179 | return self._get_stats_table("shooting", kind=kind, summary=summary) 180 | 181 | @sportsref.decorators.memoize 182 | def stats_adjusted_shooting(self, kind="R", summary=False): 183 | """Returns a DataFrame of adjusted shooting stats.""" 184 | return self._get_stats_table("adj-shooting", kind=kind, summary=summary) 185 | 186 | @sportsref.decorators.memoize 187 | def stats_pbp(self, kind="R", summary=False): 188 | """Returns a DataFrame of play-by-play stats.""" 189 | return self._get_stats_table("pbp", kind=kind, summary=summary) 190 | 191 | @sportsref.decorators.memoize 192 | @sportsref.decorators.kind_rpb(include_type=True) 193 | def gamelog_basic(self, year, kind="R"): 194 | """Returns a table of a player's basic game-by-game stats for a season. 195 | 196 | :param year: The year representing the desired season. 197 | :param kind: specifies regular season, playoffs, or both. One of 'R', 198 | 'P', 'B'. Defaults to 'R'. 199 | :returns: A DataFrame of the player's standard boxscore stats from each 200 | game of the season. 201 | :rtype: pd.DataFrame 202 | """ 203 | doc = self.get_sub_doc(f"gamelog/{year}") 204 | table = ( 205 | doc("table#pgl_basic_playoffs") if kind == "P" else doc("table#pgl_basic") 206 | ) 207 | df = sportsref.utils.parse_table(table) 208 | return df 209 | 210 | @sportsref.decorators.memoize 211 | @sportsref.decorators.kind_rpb(include_type=True) 212 | def gamelog_advanced(self, year, kind="R"): 213 | """Returns a table of a player's advanced game-by-game stats for a 214 | season. 215 | 216 | :param year: The year representing the desired season. 217 | :param kind: specifies regular season, playoffs, or both. One of 'R', 218 | 'P', 'B'. Defaults to 'R'. 219 | :returns: A DataFrame of the player's advanced stats from each game of 220 | the season. 
221 | :rtype: pd.DataFrame 222 | """ 223 | doc = self.get_sub_doc(f"gamelog-advanced/{year}") 224 | table = ( 225 | doc("table#pgl_advanced_playoffs") 226 | if kind == "P" 227 | else doc("table#pgl_advanced") 228 | ) 229 | df = sportsref.utils.parse_table(table) 230 | return df 231 | -------------------------------------------------------------------------------- /sportsref/nba/seasons.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pyquery import PyQuery as pq 3 | 4 | import sportsref 5 | 6 | 7 | class Season(object, metaclass=sportsref.decorators.Cached): 8 | 9 | """Object representing a given NBA season.""" 10 | 11 | def __init__(self, year): 12 | """Initializes a Season object for an NBA season. 13 | 14 | :year: The year of the season we want. 15 | """ 16 | self.yr = int(year) 17 | 18 | def __eq__(self, other): 19 | return self.yr == other.yr 20 | 21 | def __hash__(self): 22 | return hash(self.yr) 23 | 24 | def __repr__(self): 25 | return "Season({})".format(self.yr) 26 | 27 | def _subpage_url(self, page): 28 | return sportsref.nba.BASE_URL + "/leagues/NBA_{}_{}.html".format(self.yr, page) 29 | 30 | @sportsref.decorators.memoize 31 | def get_main_doc(self): 32 | """Returns PyQuery object for the main season URL. 33 | :returns: PyQuery object. 34 | """ 35 | url = sportsref.nba.BASE_URL + "/leagues/NBA_{}.html".format(self.yr) 36 | return pq(sportsref.utils.get_html(url)) 37 | 38 | @sportsref.decorators.memoize 39 | def get_sub_doc(self, subpage): 40 | """Returns PyQuery object for a given subpage URL. 41 | :subpage: The subpage of the season, e.g. 'per_game'. 42 | :returns: PyQuery object. 43 | """ 44 | html = sportsref.utils.get_html(self._subpage_url(subpage)) 45 | return pq(html) 46 | 47 | @sportsref.decorators.memoize 48 | def get_team_ids(self): 49 | """Returns a list of the team IDs for the given year. 50 | :returns: List of team IDs. 51 | """ 52 | df = self.team_stats_per_game() 53 | if not df.empty: 54 | return df.index.tolist() 55 | else: 56 | print("ERROR: no teams found") 57 | return [] 58 | 59 | @sportsref.decorators.memoize 60 | def team_ids_to_names(self): 61 | """Mapping from 3-letter team IDs to full team names. 62 | :returns: Dictionary with team IDs as keys and full team strings as 63 | values. 64 | """ 65 | doc = self.get_main_doc() 66 | table = doc("table#team-stats-per_game") 67 | flattened = sportsref.utils.parse_table(table, flatten=True) 68 | unflattened = sportsref.utils.parse_table(table, flatten=False) 69 | team_ids = flattened["team_id"] 70 | team_names = unflattened["team_name"] 71 | if len(team_names) != len(team_ids): 72 | raise Exception("team names and team IDs don't align") 73 | return dict(list(zip(team_ids, team_names))) 74 | 75 | @sportsref.decorators.memoize 76 | def team_names_to_ids(self): 77 | """Mapping from full team names to 3-letter team IDs. 78 | :returns: Dictionary with tean names as keys and team IDs as values. 79 | """ 80 | d = self.team_ids_to_names() 81 | return {v: k for k, v in list(d.items())} 82 | 83 | @sportsref.decorators.memoize 84 | @sportsref.decorators.kind_rpb(include_type=True) 85 | def schedule(self, kind="R"): 86 | """Returns a list of BoxScore IDs for every game in the season. 87 | Only needs to handle 'R' or 'P' options because decorator handles 'B'. 88 | 89 | :param kind: 'R' for regular season, 'P' for playoffs, 'B' for both. 90 | Defaults to 'R'. 91 | :returns: DataFrame of schedule information. 
92 | :rtype: pd.DataFrame 93 | """ 94 | kind = kind.upper()[0] 95 | dfs = [] 96 | 97 | # get games from each month 98 | for month in ( 99 | "october", 100 | "november", 101 | "december", 102 | "january", 103 | "february", 104 | "march", 105 | "april", 106 | "may", 107 | "june", 108 | ): 109 | try: 110 | doc = self.get_sub_doc("games-{}".format(month)) 111 | except ValueError: 112 | continue 113 | table = doc("table#schedule") 114 | df = sportsref.utils.parse_table(table) 115 | dfs.append(df) 116 | df = pd.concat(dfs).reset_index(drop=True) 117 | 118 | # figure out how many regular season games 119 | try: 120 | sportsref.utils.get_html( 121 | "{}/playoffs/NBA_{}.html".format(sportsref.nba.BASE_URL, self.yr) 122 | ) 123 | is_past_season = True 124 | except ValueError: 125 | is_past_season = False 126 | 127 | if is_past_season: 128 | team_per_game = self.team_stats_per_game() 129 | n_reg_games = int(team_per_game.g.sum() // 2) 130 | else: 131 | n_reg_games = len(df) 132 | 133 | # subset appropriately based on `kind` 134 | if kind == "P": 135 | return df.iloc[n_reg_games:] 136 | else: 137 | return df.iloc[:n_reg_games] 138 | 139 | def finals_winner(self): 140 | """Returns the team ID for the winner of that year's NBA Finals. 141 | :returns: 3-letter team ID for champ. 142 | """ 143 | raise NotImplementedError("nba.Season.finals_winner") 144 | 145 | def finals_loser(self): 146 | """Returns the team ID for the loser of that year's NBA Finals. 147 | :returns: 3-letter team ID for runner-up. 148 | """ 149 | raise NotImplementedError("nba.Season.finals_loser") 150 | 151 | def standings(self): 152 | """Returns a DataFrame containing standings information.""" 153 | doc = self.get_sub_doc("standings") 154 | 155 | east_table = doc("table#confs_standings_E") 156 | east_df = sportsref.utils.parse_table(east_table) 157 | east_df.sort_values("wins", ascending=False, inplace=True) 158 | east_df["seed"] = list(range(1, len(east_df) + 1)) 159 | east_df["conference"] = "E" 160 | 161 | west_table = doc("table#confs_standings_W") 162 | west_df = sportsref.utils.parse_table(west_table) 163 | west_df.sort_values("wins", ascending=False, inplace=True) 164 | west_df["seed"] = list(range(1, len(west_df) + 1)) 165 | west_df["conference"] = "W" 166 | 167 | full_df = pd.concat([east_df, west_df], axis=0).reset_index(drop=True) 168 | full_df["gb"] = [ 169 | gb if isinstance(gb, int) or isinstance(gb, float) else 0 170 | for gb in full_df["gb"] 171 | ] 172 | full_df = full_df.drop("has_class_full_table", axis=1) 173 | 174 | expanded_table = doc("table#expanded_standings") 175 | expanded_df = sportsref.utils.parse_table(expanded_table) 176 | 177 | full_df = pd.merge(full_df, expanded_df, on="team_id") 178 | return full_df 179 | 180 | @sportsref.decorators.memoize 181 | def _get_team_stats_table(self, selector): 182 | """Helper function for stats tables on season pages. 
Returns a 183 | DataFrame.""" 184 | doc = self.get_main_doc() 185 | table = doc(selector) 186 | df = sportsref.utils.parse_table(table) 187 | df.set_index("team_id", inplace=True) 188 | return df 189 | 190 | def team_stats_per_game(self): 191 | """Returns a Pandas DataFrame of each team's basic per-game stats for 192 | the season.""" 193 | return self._get_team_stats_table("table#team-stats-per_game") 194 | 195 | def opp_stats_per_game(self): 196 | """Returns a Pandas DataFrame of each team's opponent's basic per-game 197 | stats for the season.""" 198 | return self._get_team_stats_table("table#opponent-stats-per_game") 199 | 200 | def team_stats_totals(self): 201 | """Returns a Pandas DataFrame of each team's basic stat totals for the 202 | season.""" 203 | return self._get_team_stats_table("table#team-stats-base") 204 | 205 | def opp_stats_totals(self): 206 | """Returns a Pandas DataFrame of each team's opponent's basic stat 207 | totals for the season.""" 208 | return self._get_team_stats_table("table#opponent-stats-base") 209 | 210 | def misc_stats(self): 211 | """Returns a Pandas DataFrame of miscellaneous stats about each team's 212 | season.""" 213 | return self._get_team_stats_table("table#misc_stats") 214 | 215 | def team_stats_shooting(self): 216 | """Returns a Pandas DataFrame of each team's shooting stats for the 217 | season.""" 218 | return self._get_team_stats_table("table#team_shooting") 219 | 220 | def opp_stats_shooting(self): 221 | """Returns a Pandas DataFrame of each team's opponent's shooting stats 222 | for the season.""" 223 | return self._get_team_stats_table("table#opponent_shooting") 224 | 225 | @sportsref.decorators.memoize 226 | def _get_player_stats_table(self, identifier): 227 | """Helper function for player season stats. 228 | 229 | :identifier: string identifying the type of stat, e.g. 'per_game'. 230 | :returns: A DataFrame of stats. 
231 | """ 232 | doc = self.get_sub_doc(identifier) 233 | table = doc("table#{}_stats".format(identifier)) 234 | df = sportsref.utils.parse_table(table) 235 | return df 236 | 237 | def player_stats_per_game(self): 238 | """Returns a DataFrame of per-game player stats for a season.""" 239 | return self._get_player_stats_table("per_game") 240 | 241 | def player_stats_totals(self): 242 | """Returns a DataFrame of player stat totals for a season.""" 243 | return self._get_player_stats_table("totals") 244 | 245 | def player_stats_per36(self): 246 | """Returns a DataFrame of player per-36 min stats for a season.""" 247 | return self._get_player_stats_table("per_minute") 248 | 249 | def player_stats_per100(self): 250 | """Returns a DataFrame of player per-100 poss stats for a season.""" 251 | return self._get_player_stats_table("per_poss") 252 | 253 | def player_stats_advanced(self): 254 | """Returns a DataFrame of player advanced stats for a season.""" 255 | return self._get_player_stats_table("advanced") 256 | 257 | def mvp_voting(self): 258 | """Returns a DataFrame containing information about MVP voting.""" 259 | raise NotImplementedError("nba.Season.mvp_voting") 260 | 261 | def roy_voting(self): 262 | """Returns a DataFrame containing information about ROY voting.""" 263 | url = "{}/awards/awards_{}.html".format(sportsref.nba.BASE_URL, self.yr) 264 | doc = pq(sportsref.utils.get_html(url)) 265 | table = doc("table#roy") 266 | df = sportsref.utils.parse_table(table) 267 | return df 268 | -------------------------------------------------------------------------------- /sportsref/utils.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import multiprocessing 3 | import re 4 | import threading 5 | import time 6 | 7 | import pandas as pd 8 | import requests 9 | from pyquery import PyQuery as pq 10 | 11 | import sportsref 12 | 13 | # time between requests, in seconds 14 | THROTTLE_DELAY = 0.5 15 | 16 | # variables used to throttle requests across processes 17 | throttle_thread_lock = threading.Lock() 18 | throttle_process_lock = multiprocessing.Lock() 19 | last_request_time = multiprocessing.Value( 20 | ctypes.c_longdouble, time.time() - 10 * THROTTLE_DELAY 21 | ) 22 | 23 | 24 | @sportsref.decorators.cache 25 | def get_html(url): 26 | """Gets the HTML for the given URL using a GET request. 27 | 28 | :url: the absolute URL of the desired page. 29 | :returns: a string of HTML. 30 | """ 31 | global last_request_time 32 | with throttle_process_lock: 33 | with throttle_thread_lock: 34 | # sleep until THROTTLE_DELAY secs have passed since last request 35 | wait_left = THROTTLE_DELAY - (time.time() - last_request_time.value) 36 | if wait_left > 0: 37 | time.sleep(wait_left) 38 | 39 | # make request 40 | response = requests.get(url) 41 | 42 | # update last request time for throttling 43 | last_request_time.value = time.time() 44 | 45 | # raise ValueError on 4xx status code, get rid of comments, and return 46 | if 400 <= response.status_code < 500: 47 | raise ValueError( 48 | f'Status Code {response.status_code} received fetching URL "{url}"' 49 | ) 50 | html = response.text 51 | html = html.replace("<!--", "").replace("-->", "") 52 | 53 | return html 54 | 55 | 56 | def parse_table(table, flatten=True, footer=False): 57 | """Parses a table from sports-reference sites into a pandas dataframe. 58 | 59 | :param table: the PyQuery object representing the HTML table 60 | :param flatten: if True, flattens relative URLs to IDs.
61 | all fields as text without cleaning.
62 | :param footer: If True, returns the summary/footer of the page. Recommended
63 | to use this with flatten=False. Defaults to False.
64 | :returns: pd.DataFrame
65 | """
66 | if not len(table):
67 | return pd.DataFrame()
68 |
69 | # get columns
70 | columns = [
71 | c.attrib["data-stat"] for c in table("thead tr:not([class]) th[data-stat]")
72 | ]
73 |
74 | # get data
75 | rows = list(
76 | table("tbody tr" if not footer else "tfoot tr")
77 | .not_(".thead, .stat_total, .stat_average")
78 | .items()
79 | )
80 | data = [
81 | [flatten_links(td) if flatten else td.text() for td in row.items("th,td")]
82 | for row in rows
83 | ]
84 |
85 | # make DataFrame
86 | df = pd.DataFrame(data, columns=columns, dtype="float")
87 |
88 | # add has_class columns
89 | all_classes = set(
90 | cls for row in rows if row.attr["class"] for cls in row.attr["class"].split()
91 | )
92 | for cls in all_classes:
93 | df["has_class_" + cls] = [
94 | bool(row.attr["class"] and cls in row.attr["class"].split()) for row in rows
95 | ]
96 |
97 | # cleaning the DataFrame
98 |
99 | df.drop(["ranker", "Xxx", "Yyy", "Zzz"], axis=1, inplace=True, errors="ignore")
100 |
101 | # year_id -> year (as int)
102 | if "year_id" in df.columns:
103 | df.rename(columns={"year_id": "year"}, inplace=True)
104 | if flatten:
105 | df.year = df.year.fillna(method="ffill")
106 | df["year"] = df.year.map(lambda s: str(s)[:4]).astype(int)
107 |
108 | # pos -> position
109 | if "pos" in df.columns:
110 | df.rename(columns={"pos": "position"}, inplace=True)
111 |
112 | # boxscore_word, game_date -> boxscore_id and separate into Y, M, D columns
113 | for bs_id_col in ("boxscore_word", "game_date", "box_score_text"):
114 | if bs_id_col in df.columns:
115 | df.rename(columns={bs_id_col: "boxscore_id"}, inplace=True)
116 | break
117 |
118 | # ignore *, +, and other characters used to note things
119 | df.replace(re.compile(r"[\*\+\u2605]", re.U), "", inplace=True)
120 | for col in df.columns:
121 | if hasattr(df[col], "str"):
122 | df[col] = df[col].str.strip()
123 |
124 | # player -> player_id and/or player_name
125 | if "player" in df.columns:
126 | if flatten:
127 | df.rename(columns={"player": "player_id"}, inplace=True)
128 | # when flattening, keep a column for names
129 | player_names = parse_table(table, flatten=False)["player_name"]
130 | df["player_name"] = player_names
131 | else:
132 | df.rename(columns={"player": "player_name"}, inplace=True)
133 |
134 | # team, team_name -> team_id
135 | for team_col in ("team", "team_name"):
136 | if team_col in df.columns:
137 | # first, get rid of faulty rows
138 | df = df.loc[~df[team_col].isin(["XXX"])]
139 | if flatten:
140 | df.rename(columns={team_col: "team_id"}, inplace=True)
141 |
142 | # season -> int
143 | if "season" in df.columns and flatten:
144 | df["season"] = df["season"].astype(int)
145 |
146 | # handle date_game columns (different types)
147 | if "date_game" in df.columns and flatten:
148 | date_re = r"month=(?P<month>\d+)&day=(?P<day>\d+)&year=(?P<year>\d+)"
149 | date_df = df["date_game"].str.extract(date_re, expand=True)
150 | if date_df.notnull().all(axis=1).any():
151 | df = pd.concat((df, date_df), axis=1)
152 | else:
153 | df.rename(columns={"date_game": "boxscore_id"}, inplace=True)
154 |
155 | # game_location -> is_home
156 | if "game_location" in df.columns and flatten:
157 | df["game_location"] = df["game_location"].isnull()
158 | df.rename(columns={"game_location": "is_home"}, inplace=True)
159 |
160 | # mp: (min:sec) -> float(min + sec / 60), notes -> NaN, new column
161 | if "mp" in df.columns and df.dtypes["mp"] == object and flatten:
162 | mp_df = (
163 | df["mp"].str.extract(r"(?P<m>\d+):(?P<s>\d+)", expand=True).astype(float)
164 | )
165 | no_match = mp_df.isnull().all(axis=1)
166 | if no_match.any():
167 | df.loc[no_match, "note"] = df.loc[no_match, "mp"]
168 | df["mp"] = mp_df["m"] + mp_df["s"] / 60
169 |
170 | # converts number-y things to floats
171 | def convert_to_float(val):
172 | # percentages: (number%) -> float(number * 0.01)
173 | m = re.search(r"([-\.\d]+)\%", val if isinstance(val, str) else str(val), re.U)
174 | try:
175 | if m:
176 | # e.g. "48.1%" -> 0.481
177 | return float(m.group(1)) / 100
178 |
179 | except ValueError:
180 | return val
181 | # salaries: $ABC,DEF,GHI -> float(ABCDEFGHI)
182 | m = re.search(r"\$[\d,]+", val if isinstance(val, str) else str(val), re.U)
183 | try:
184 | if m:
185 | return float(re.sub(r"\$|,", "", val))
186 | except Exception:
187 | return val
188 | # generally try to coerce to float, unless it's an int or bool
189 | try:
190 | if isinstance(val, (int, bool)):
191 | return val
192 | else:
193 | return float(val)
194 | except Exception:
195 | return val
196 |
197 | if flatten:
198 | df = df.applymap(convert_to_float)
199 |
200 | df = df.loc[df.astype(bool).any(axis=1)]
201 |
202 | return df
203 |
204 |
205 | def parse_info_table(table):
206 | """Parses an info table, like the "Game Info" table or the "Officials"
207 | table on the PFR Boxscore page. Keys are lower case and have spaces/special
208 | characters converted to underscores.
209 |
210 | :table: PyQuery object representing the HTML table.
211 | :returns: A dictionary representing the information.
212 | """
213 | ret = {}
214 | for tr in list(table("tr").not_(".thead").items()):
215 | th, td = list(tr("th, td").items())
216 | key = th.text().lower()
217 | key = re.sub(r"\W", "_", key)
218 | val = sportsref.utils.flatten_links(td)
219 | ret[key] = val
220 | return ret
221 |
222 |
223 | def parse_awards_table(table):
224 | """Parses an awards table, like the "Pro Bowls" table on a PFR player page.
225 |
226 | :table: PyQuery object representing the HTML table.
227 | :returns: A list of the entries in the table, with flattened links.
228 | """
229 | return [flatten_links(tr) for tr in list(table("tr").items())]
230 |
231 |
232 | def flatten_links(td, _recurse=False):
233 | """Flattens relative URLs within text of a table cell to IDs and returns
234 | the result.
235 |
236 | :td: the PyQuery object for the HTML to convert
237 | :returns: the string with the links flattened to IDs
238 | """
239 |
240 | # helper function to flatten individual strings/links
241 | def _flatten_node(c):
242 | if isinstance(c, str):
243 | return c
244 | elif "href" in c.attrib:
245 | c_id = rel_url_to_id(c.attrib["href"])
246 | return c_id if c_id else c.text_content()
247 | else:
248 | return flatten_links(pq(c), _recurse=True)
249 |
250 | # if there's no text, just return None
251 | if td is None or not td.text():
252 | return "" if _recurse else None
253 |
254 | td.remove("span.note")
255 | return "".join(_flatten_node(c) for c in td.contents())
256 |
257 |
258 | @sportsref.decorators.memoize
259 | def rel_url_to_id(url):
260 | """Converts a relative URL to a unique ID.
261 |
262 | Here, 'ID' refers generally to the unique ID for a given 'type' that a
263 | given datum has.
For example, 'BradTo00' is Tom Brady's player ID - this 264 | corresponds to his relative URL, '/players/B/BradTo00.htm'. Similarly, 265 | '201409070dal' refers to the boxscore of the SF @ DAL game on 09/07/14. 266 | 267 | Supported types: 268 | * player/... 269 | * boxscores/... 270 | * teams/... 271 | * years/... 272 | * leagues/... 273 | * awards/... 274 | * coaches/... 275 | * officials/... 276 | * schools/... 277 | * schools/high_schools.cgi?id=... 278 | 279 | :returns: ID associated with the given relative URL. 280 | """ 281 | year_regex = r".*/years/(\d{4}).*|.*/gamelog/(\d{4}).*" 282 | player_regex = r".*/players/(?:\w/)?(.+?)(?:/|\.html?)" 283 | boxscores_regex = r".*/boxscores/(.+?)\.html?" 284 | team_regex = r".*/teams/(\w{3})/.*" 285 | coach_regex = r".*/coaches/(.+?)\.html?" 286 | stadium_regex = r".*/stadiums/(.+?)\.html?" 287 | ref_regex = r".*/officials/(.+?r)\.html?" 288 | college_regex = r".*/schools/(\S+?)/.*|.*college=([^&]+)" 289 | hs_regex = r".*/schools/high_schools\.cgi\?id=([^\&]{8})" 290 | bs_date_regex = r".*/boxscores/index\.f?cgi\?(month=\d+&day=\d+&year=\d+)" 291 | league_regex = r".*/leagues/(.*_\d{4}).*" 292 | award_regex = r".*/awards/(.+)\.htm" 293 | 294 | regexes = [ 295 | year_regex, 296 | player_regex, 297 | boxscores_regex, 298 | team_regex, 299 | coach_regex, 300 | stadium_regex, 301 | ref_regex, 302 | college_regex, 303 | hs_regex, 304 | bs_date_regex, 305 | league_regex, 306 | award_regex, 307 | ] 308 | 309 | for regex in regexes: 310 | match = re.match(regex, url, re.I) 311 | if match: 312 | return [_f for _f in match.groups() if _f][0] 313 | 314 | # things we don't want to match but don't want to print a WARNING 315 | if any(url.startswith(s) for s in ("/play-index/",)): 316 | return url 317 | 318 | print(f'WARNING. 
NO MATCH WAS FOUND FOR "{url}"') 319 | return url 320 | -------------------------------------------------------------------------------- /sportsref/nfl/players.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | import urllib.parse 4 | 5 | from pyquery import PyQuery as pq 6 | 7 | import sportsref 8 | 9 | __all__ = ["Player"] 10 | 11 | 12 | class Player(object, metaclass=sportsref.decorators.Cached): 13 | def __init__(self, player_id): 14 | self.player_id = player_id 15 | self.mainURL = (sportsref.nfl.BASE_URL + "/players/{0[0]}/{0}.htm").format( 16 | self.player_id 17 | ) 18 | 19 | def __eq__(self, other): 20 | return self.player_id == other.player_id 21 | 22 | def __hash__(self): 23 | return hash(self.player_id) 24 | 25 | def __repr__(self): 26 | return "Player({})".format(self.player_id) 27 | 28 | def __str__(self): 29 | return self.name() 30 | 31 | def __reduce__(self): 32 | return Player, (self.player_id,) 33 | 34 | def _subpage_url(self, page, year=None): 35 | # if no year, return career version 36 | if year is None: 37 | return urllib.parse.urljoin( 38 | self.mainURL, "{}/{}/".format(self.player_id, page) 39 | ) 40 | # otherwise, return URL for a given year 41 | else: 42 | return urllib.parse.urljoin( 43 | self.mainURL, "{}/{}/{}/".format(self.player_id, page, year) 44 | ) 45 | 46 | @sportsref.decorators.memoize 47 | def get_doc(self): 48 | doc = pq(sportsref.utils.get_html(self.mainURL)) 49 | return doc 50 | 51 | @sportsref.decorators.memoize 52 | def name(self): 53 | doc = self.get_doc() 54 | name = doc("div#meta h1:first").text() 55 | return name 56 | 57 | @sportsref.decorators.memoize 58 | def age(self, year, month=9, day=1): 59 | doc = self.get_doc() 60 | span = doc("div#meta span#necro-birth") 61 | birthstring = span.attr("data-birth") 62 | try: 63 | dateargs = re.match(r"(\d{4})\-(\d{2})\-(\d{2})", birthstring).groups() 64 | dateargs = list(map(int, dateargs)) 65 | birthDate = datetime.date(*dateargs) 66 | delta = datetime.date(year=year, month=month, day=day) - birthDate 67 | age = delta.days / 365 68 | return age 69 | except Exception: 70 | return None 71 | 72 | @sportsref.decorators.memoize 73 | def position(self): 74 | doc = self.get_doc() 75 | rawText = ( 76 | doc("div#meta p").filter(lambda i, e: "Position" in e.text_content()).text() 77 | ) 78 | rawPos = re.search(r"Position\W*(\S+)", rawText, re.I).group(1) 79 | allPositions = rawPos.split("-") 80 | # right now, returning just the primary position for those with 81 | # multiple positions 82 | return allPositions[0] 83 | 84 | @sportsref.decorators.memoize 85 | def height(self): 86 | doc = self.get_doc() 87 | rawText = doc('div#meta p span[itemprop="height"]').text() 88 | try: 89 | feet, inches = list(map(int, rawText.split("-"))) 90 | return feet * 12 + inches 91 | except ValueError: 92 | return None 93 | 94 | @sportsref.decorators.memoize 95 | def weight(self): 96 | doc = self.get_doc() 97 | rawText = doc('div#meta p span[itemprop="weight"]').text() 98 | try: 99 | weight = re.match(r"(\d+)lb", rawText, re.I).group(1) 100 | return int(weight) 101 | except AttributeError: 102 | return None 103 | 104 | @sportsref.decorators.memoize 105 | def hand(self): 106 | doc = self.get_doc() 107 | try: 108 | rawText = ( 109 | doc("div#meta p") 110 | .filter(lambda i, e: "Throws" in e.text_content()) 111 | .text() 112 | ) 113 | rawHand = re.search(r"Throws\W+(\S+)", rawText, re.I).group(1) 114 | except AttributeError: 115 | return None 116 | return rawHand[0] # 
'L' or 'R' 117 | 118 | @sportsref.decorators.memoize 119 | def current_team(self): 120 | doc = self.get_doc() 121 | team = doc("div#meta p").filter(lambda i, e: "Team" in e.text_content()) 122 | text = sportsref.utils.flatten_links(team) 123 | try: 124 | m = re.match(r"Team: (\w{3})", text) 125 | return m.group(1) 126 | except Exception: 127 | return None 128 | 129 | @sportsref.decorators.memoize 130 | def draft_pick(self): 131 | doc = self.get_doc() 132 | rawDraft = ( 133 | doc("div#meta p").filter(lambda i, e: "Draft" in e.text_content()).text() 134 | ) 135 | m = re.search(r"Draft.*? round \((\d+).*?overall\)", rawDraft, re.I) 136 | # if not drafted or taken in supplemental draft, return NaN 137 | if m is None or "Supplemental" in rawDraft: 138 | return None 139 | else: 140 | return int(m.group(1)) 141 | 142 | @sportsref.decorators.memoize 143 | def draft_class(self): 144 | doc = self.get_doc() 145 | rawDraft = ( 146 | doc("div#meta p").filter(lambda i, e: "Draft" in e.text_content()).text() 147 | ) 148 | m = re.search(r"Draft.*?of the (\d{4}) NFL", rawDraft, re.I) 149 | if not m: 150 | return None 151 | else: 152 | return int(m.group(1)) 153 | 154 | @sportsref.decorators.memoize 155 | def draft_team(self): 156 | doc = self.get_doc() 157 | rawDraft = doc("div#meta p").filter(lambda i, e: "Draft" in e.text_content()) 158 | try: 159 | draftStr = sportsref.utils.flatten_links(rawDraft) 160 | m = re.search(r"Draft\W+(\w+)", draftStr) 161 | return m.group(1) 162 | except Exception: 163 | return None 164 | 165 | @sportsref.decorators.memoize 166 | def college(self): 167 | doc = self.get_doc() 168 | rawText = doc("div#meta p").filter(lambda i, e: "College" in e.text_content()) 169 | cleanedText = sportsref.utils.flatten_links(rawText) 170 | college = re.search(r"College:\s*(\S+)", cleanedText).group(1) 171 | return college 172 | 173 | @sportsref.decorators.memoize 174 | def high_school(self): 175 | doc = self.get_doc() 176 | rawText = doc("div#meta p").filter( 177 | lambda i, e: "High School" in e.text_content() 178 | ) 179 | cleanedText = sportsref.utils.flatten_links(rawText) 180 | hs = re.search(r"High School:\s*(\S+)", cleanedText).group(1) 181 | return hs 182 | 183 | @sportsref.decorators.memoize 184 | @sportsref.decorators.kind_rpb(include_type=True) 185 | def gamelog(self, year=None, kind="R"): 186 | """Gets the career gamelog of the given player. 187 | :kind: One of 'R', 'P', or 'B' (for regular season, playoffs, or both). 188 | Case-insensitive; defaults to 'R'. 189 | :year: The year for which the gamelog should be returned; if None, 190 | return entire career gamelog. Defaults to None. 191 | :returns: A DataFrame with the player's career gamelog. 192 | """ 193 | url = self._subpage_url("gamelog", None) # year is filtered later 194 | doc = pq(sportsref.utils.get_html(url)) 195 | table = doc("table#stats") if kind == "R" else doc("table#stats_playoffs") 196 | df = sportsref.utils.parse_table(table) 197 | if year is not None: 198 | df = df.query("year == @year").reset_index(drop=True) 199 | return df 200 | 201 | @sportsref.decorators.memoize 202 | @sportsref.decorators.kind_rpb(include_type=True) 203 | def passing(self, kind="R"): 204 | """Gets yearly passing stats for the player. 205 | 206 | :kind: One of 'R', 'P', or 'B'. Case-insensitive; defaults to 'R'. 207 | :returns: Pandas DataFrame with passing stats. 
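Usage sketch (hypothetical; "BradTo00" is the sample player ID documented
in sportsref.utils.rel_url_to_id, and network access to
pro-football-reference.com is assumed)::

    player = Player("BradTo00")
    career_passing = player.passing(kind="B")  # regular season + playoffs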
208 | """ 209 | doc = self.get_doc() 210 | table = doc("table#passing") if kind == "R" else doc("table#passing_playoffs") 211 | df = sportsref.utils.parse_table(table) 212 | return df 213 | 214 | @sportsref.decorators.memoize 215 | @sportsref.decorators.kind_rpb(include_type=True) 216 | def rushing_and_receiving(self, kind="R"): 217 | """Gets yearly rushing/receiving stats for the player. 218 | 219 | :kind: One of 'R', 'P', or 'B'. Case-insensitive; defaults to 'R'. 220 | :returns: Pandas DataFrame with rushing/receiving stats. 221 | """ 222 | doc = self.get_doc() 223 | table = ( 224 | doc("table#rushing_and_receiving") 225 | if kind == "R" 226 | else doc("table#rushing_and_receiving_playoffs") 227 | ) 228 | if not table: 229 | table = ( 230 | doc("table#receiving_and_rushing") 231 | if kind == "R" 232 | else doc("table#receiving_and_rushing_playoffs") 233 | ) 234 | df = sportsref.utils.parse_table(table) 235 | return df 236 | 237 | @sportsref.decorators.memoize 238 | @sportsref.decorators.kind_rpb(include_type=True) 239 | def defense(self, kind="R"): 240 | """Gets yearly defense stats for the player (also has AV stats for OL). 241 | 242 | :kind: One of 'R', 'P', or 'B'. Case-insensitive; defaults to 'R'. 243 | :returns: Pandas DataFrame with rushing/receiving stats. 244 | """ 245 | doc = self.get_doc() 246 | table = doc("table#defense") if kind == "R" else doc("table#defense_playoffs") 247 | df = sportsref.utils.parse_table(table) 248 | return df 249 | 250 | def _plays(self, year, play_type, expand_details): 251 | """Returns a DataFrame of plays for a given year for a given play type 252 | (like rushing, receiving, or passing). 253 | 254 | :year: The year for the season. 255 | :play_type: A type of play for which there are plays (as of this 256 | writing, either "passing", "rushing", or "receiving") 257 | :expand_details: Bool for whether PBP should be parsed. 258 | :returns: A DataFrame of plays, each row is a play. Returns None if 259 | there were no such plays in that year. 260 | """ 261 | url = self._subpage_url("{}-plays".format(play_type), year) 262 | doc = pq(sportsref.utils.get_html(url)) 263 | table = doc("table#all_plays") 264 | if table: 265 | if expand_details: 266 | plays = sportsref.nfl.pbp.expand_details( 267 | sportsref.utils.parse_table(table), detailCol="description" 268 | ) 269 | return plays 270 | else: 271 | return sportsref.utils.parse_table(table) 272 | else: 273 | return None 274 | 275 | @sportsref.decorators.memoize 276 | def passing_plays(self, year, expand_details=True): 277 | """Returns a pbp DataFrame of a player's passing plays in a season. 278 | 279 | :year: The year for the season. 280 | :expand_details: bool for whether PBP should be parsed. 281 | :returns: A DataFrame of stats, each row is a play. 282 | """ 283 | return self._plays(year, "passing", expand_details) 284 | 285 | @sportsref.decorators.memoize 286 | def rushing_plays(self, year, expand_details=True): 287 | """Returns a pbp DataFrame of a player's rushing plays in a season. 288 | 289 | :year: The year for the season. 290 | :expand_details: bool for whether PBP should be parsed. 291 | :returns: A DataFrame of stats, each row is a play. 292 | """ 293 | return self._plays(year, "rushing", expand_details) 294 | 295 | @sportsref.decorators.memoize 296 | def receiving_plays(self, year, expand_details=True): 297 | """Returns a pbp DataFrame of a player's receiving plays in a season. 298 | 299 | :year: The year for the season. 300 | :expand_details: bool for whether PBP should be parsed. 
301 | :returns: A DataFrame of stats, each row is a play. 302 | """ 303 | return self._plays(year, "receiving", expand_details) 304 | 305 | @sportsref.decorators.memoize 306 | def splits(self, year=None): 307 | """Returns a DataFrame of splits data for a player-year. 308 | 309 | :year: The year for the season in question. If None, returns career 310 | splits. 311 | :returns: A DataFrame of splits data. 312 | """ 313 | # get the table 314 | url = self._subpage_url("splits", year) 315 | doc = pq(sportsref.utils.get_html(url)) 316 | table = doc("table#stats") 317 | df = sportsref.utils.parse_table(table) 318 | # cleaning the data 319 | if not df.empty: 320 | df.split_id.fillna(method="ffill", inplace=True) 321 | return df 322 | 323 | @sportsref.decorators.memoize 324 | def advanced_splits(self, year=None): 325 | """Returns a DataFrame of advanced splits data for a player-year. Note: 326 | only go back to 2012. 327 | 328 | :year: The year for the season in question. If None, returns career 329 | advanced splits. 330 | :returns: A DataFrame of advanced splits data. 331 | """ 332 | # get the table 333 | url = self._subpage_url("splits", year) 334 | doc = pq(sportsref.utils.get_html(url)) 335 | table = doc("table#advanced_splits") 336 | df = sportsref.utils.parse_table(table) 337 | # cleaning the data 338 | if not df.empty: 339 | df.split_type.fillna(method="ffill", inplace=True) 340 | return df 341 | 342 | @sportsref.decorators.memoize 343 | def _simple_year_award(self, award_id): 344 | """Template for simple award functions that simply list years, such as 345 | pro bowls and first-team all pro. 346 | 347 | :award_id: The div ID that is appended to "leaderboard_" in selecting 348 | the table's div. 349 | :returns: List of years for the award. 350 | """ 351 | doc = self.get_doc() 352 | table = doc("div#leaderboard_{} table".format(award_id)) 353 | return list(map(int, sportsref.utils.parse_awards_table(table))) 354 | 355 | def pro_bowls(self): 356 | """Returns a list of years in which the player made the Pro Bowl.""" 357 | return self._simple_year_award("pro_bowls") 358 | 359 | def first_team_all_pros(self): 360 | """Returns a list of years in which the player made 1st-Tm All Pro.""" 361 | return self._simple_year_award("all_pro") 362 | 363 | # TODO: other awards like MVP, OPOY, DPOY, NFL Top 100, etc. 
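# Example usage (an illustrative sketch; "BradTo00" is the sample player ID
# documented in sportsref.utils.rel_url_to_id, and live network access to
# pro-football-reference.com is assumed):
#
#     from sportsref.nfl.players import Player
#     brady = Player("BradTo00")
#     print(brady.name(), brady.position(), brady.draft_class())
#     gamelog_2014 = brady.gamelog(year=2014, kind="R")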
364 | -------------------------------------------------------------------------------- /sportsref/nfl/boxscores.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | import re 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pyquery import PyQuery as pq 8 | 9 | import sportsref 10 | 11 | __all__ = ["BoxScore"] 12 | 13 | 14 | class BoxScore(object, metaclass=sportsref.decorators.Cached): 15 | def __init__(self, boxscore_id): 16 | self.boxscore_id = boxscore_id 17 | 18 | def __eq__(self, other): 19 | return self.boxscore_id == other.boxscore_id 20 | 21 | def __hash__(self): 22 | return hash(self.boxscore_id) 23 | 24 | def __repr__(self): 25 | return "BoxScore({})".format(self.boxscore_id) 26 | 27 | def __str__(self): 28 | return "{} Week {}: {} @ {}".format( 29 | self.season(), self.week(), self.away(), self.home() 30 | ) 31 | 32 | def __reduce__(self): 33 | return BoxScore, (self.boxscore_id,) 34 | 35 | @sportsref.decorators.memoize 36 | def get_doc(self): 37 | url = sportsref.nfl.BASE_URL + "/boxscores/{}.htm".format(self.boxscore_id) 38 | doc = pq(sportsref.utils.get_html(url)) 39 | return doc 40 | 41 | @sportsref.decorators.memoize 42 | def date(self): 43 | """Returns the date of the game. See Python datetime.date documentation 44 | for more. 45 | :returns: A datetime.date object with year, month, and day attributes. 46 | """ 47 | match = re.match(r"(\d{4})(\d{2})(\d{2})", self.boxscore_id) 48 | year, month, day = list(map(int, match.groups())) 49 | return datetime.date(year=year, month=month, day=day) 50 | 51 | @sportsref.decorators.memoize 52 | def weekday(self): 53 | """Returns the day of the week on which the game occurred. 54 | :returns: String representation of the day of the week for the game. 55 | 56 | """ 57 | days = [ 58 | "Monday", 59 | "Tuesday", 60 | "Wednesday", 61 | "Thursday", 62 | "Friday", 63 | "Saturday", 64 | "Sunday", 65 | ] 66 | date = self.date() 67 | wd = date.weekday() 68 | return days[wd] 69 | 70 | @sportsref.decorators.memoize 71 | def home(self): 72 | """Returns home team ID. 73 | :returns: 3-character string representing home team's ID. 74 | """ 75 | doc = self.get_doc() 76 | table = doc("table.linescore") 77 | relURL = table("tr").eq(2)("a").eq(2).attr["href"] 78 | home = sportsref.utils.rel_url_to_id(relURL) 79 | return home 80 | 81 | @sportsref.decorators.memoize 82 | def away(self): 83 | """Returns away team ID. 84 | :returns: 3-character string representing away team's ID. 85 | """ 86 | doc = self.get_doc() 87 | table = doc("table.linescore") 88 | relURL = table("tr").eq(1)("a").eq(2).attr["href"] 89 | away = sportsref.utils.rel_url_to_id(relURL) 90 | return away 91 | 92 | @sportsref.decorators.memoize 93 | def home_score(self): 94 | """Returns score of the home team. 95 | :returns: int of the home score. 96 | """ 97 | doc = self.get_doc() 98 | table = doc("table.linescore") 99 | home_score = table("tr").eq(2)("td")[-1].text_content() 100 | return int(home_score) 101 | 102 | @sportsref.decorators.memoize 103 | def away_score(self): 104 | """Returns score of the away team. 105 | :returns: int of the away score. 106 | """ 107 | doc = self.get_doc() 108 | table = doc("table.linescore") 109 | away_score = table("tr").eq(1)("td")[-1].text_content() 110 | return int(away_score) 111 | 112 | @sportsref.decorators.memoize 113 | def winner(self): 114 | """Returns the team ID of the winning team. 
Returns NaN if a tie.""" 115 | hmScore = self.home_score() 116 | awScore = self.away_score() 117 | if hmScore > awScore: 118 | return self.home() 119 | elif hmScore < awScore: 120 | return self.away() 121 | else: 122 | return None 123 | 124 | @sportsref.decorators.memoize 125 | def week(self): 126 | """Returns the week in which this game took place. 18 is WC round, 19 127 | is Div round, 20 is CC round, 21 is SB. 128 | :returns: Integer from 1 to 21. 129 | """ 130 | doc = self.get_doc() 131 | raw = doc("div#div_other_scores h2 a").attr["href"] 132 | match = re.match(r"/years/{}/week_(\d+)\.htm".format(self.season()), raw) 133 | if match: 134 | return int(match.group(1)) 135 | else: 136 | return 21 # super bowl is week 21 137 | 138 | @sportsref.decorators.memoize 139 | def season(self): 140 | """ 141 | Returns the year ID of the season in which this game took place. 142 | Useful for week 17 January games. 143 | 144 | :returns: An int representing the year of the season. 145 | """ 146 | date = self.date() 147 | return date.year - 1 if date.month <= 3 else date.year 148 | 149 | @sportsref.decorators.memoize 150 | def starters(self): 151 | """Returns a DataFrame where each row is an entry in the starters table 152 | from PFR. 153 | 154 | The columns are: 155 | * player_id - the PFR player ID for the player (note that this column 156 | is not necessarily all unique; that is, one player can be a starter in 157 | multiple positions, in theory). 158 | * playerName - the listed name of the player; this too is not 159 | necessarily unique. 160 | * position - the position at which the player started for their team. 161 | * team - the team for which the player started. 162 | * home - True if the player's team was at home, False if they were away 163 | * offense - True if the player is starting on an offensive position, 164 | False if defense. 165 | 166 | :returns: A pandas DataFrame. See the description for details. 167 | """ 168 | doc = self.get_doc() 169 | a = doc("table#vis_starters") 170 | h = doc("table#home_starters") 171 | data = [] 172 | for h, table in enumerate((a, h)): 173 | team = self.home() if h else self.away() 174 | for i, row in enumerate(table("tbody tr").items()): 175 | datum = {} 176 | datum["player_id"] = sportsref.utils.rel_url_to_id( 177 | row("a")[0].attrib["href"] 178 | ) 179 | datum["playerName"] = row("th").text() 180 | datum["position"] = row("td").text() 181 | datum["team"] = team 182 | datum["home"] = h == 1 183 | datum["offense"] = i <= 10 184 | data.append(datum) 185 | return pd.DataFrame(data) 186 | 187 | @sportsref.decorators.memoize 188 | def line(self): 189 | doc = self.get_doc() 190 | table = doc("table#game_info") 191 | giTable = sportsref.utils.parse_info_table(table) 192 | line_text = giTable.get("vegas_line", None) 193 | if line_text is None: 194 | return None 195 | m = re.match(r"(.+?) ([\-\.\d]+)$", line_text) 196 | if m: 197 | favorite, line = m.groups() 198 | line = float(line) 199 | # give in terms of the home team 200 | year = self.season() 201 | if favorite != sportsref.nfl.teams.team_names(year)[self.home()]: 202 | line = -line 203 | else: 204 | line = 0 205 | return line 206 | 207 | @sportsref.decorators.memoize 208 | def surface(self): 209 | """The playing surface on which the game was played. 210 | 211 | :returns: string representing the type of surface. Returns np.nan if 212 | not avaiable. 
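Example (a sketch; "201409070dal" is the sample boxscore ID documented in
sportsref.utils.rel_url_to_id)::

    BoxScore("201409070dal").surface()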
213 | """
214 | doc = self.get_doc()
215 | table = doc("table#game_info")
216 | giTable = sportsref.utils.parse_info_table(table)
217 | return giTable.get("surface", np.nan)
218 |
219 | @sportsref.decorators.memoize
220 | def over_under(self):
221 | """
222 | Returns the over/under for the game as a float, or np.nan if not
223 | available.
224 | """
225 | doc = self.get_doc()
226 | table = doc("table#game_info")
227 | giTable = sportsref.utils.parse_info_table(table)
228 | if "over_under" in giTable:
229 | ou = giTable["over_under"]
230 | return float(ou.split()[0])
231 | else:
232 | return None
233 |
234 | @sportsref.decorators.memoize
235 | def coin_toss(self):
236 | """Gets information relating to the opening coin toss.
237 |
238 | Keys are:
239 | * wonToss - contains the ID of the team that won the toss
240 | * deferred - bool whether the team that won the toss deferred it
241 |
242 | :returns: Dictionary of coin toss-related info.
243 | """
244 | doc = self.get_doc()
245 | table = doc("table#game_info")
246 | giTable = sportsref.utils.parse_info_table(table)
247 | if "Won Toss" in giTable:
248 | # TODO: finish coinToss function
249 | pass
250 | else:
251 | return None
252 |
253 | @sportsref.decorators.memoize
254 | def weather(self):
255 | """Returns a dictionary of weather-related info.
256 |
257 | Keys of the returned dict:
258 | * temp
259 | * windChill
260 | * relHumidity
261 | * windMPH
262 |
263 | :returns: Dict of weather data.
264 | """
265 | doc = self.get_doc()
266 | table = doc("table#game_info")
267 | giTable = sportsref.utils.parse_info_table(table)
268 | if "weather" in giTable:
269 | regex = (
270 | r"(?:(?P<temp>\-?\d+) degrees )?"
271 | r"(?:relative humidity (?P<relHumidity>\d+)%, )?"
272 | r"(?:wind (?P<windMPH>\d+) mph, )?"
273 | r"(?:wind chill (?P<windChill>\-?\d+))?"
274 | )
275 | m = re.match(regex, giTable["weather"])
276 | d = m.groupdict()
277 |
278 | # cast values to int
279 | for k in d:
280 | try:
281 | d[k] = int(d[k])
282 | except TypeError:
283 | pass
284 |
285 | # one-off fixes
286 | d["windChill"] = d["windChill"] if pd.notnull(d["windChill"]) else d["temp"]
287 | d["windMPH"] = d["windMPH"] if pd.notnull(d["windMPH"]) else 0
288 | return d
289 | else:
290 | # no weather found, because it's a dome
291 | # TODO: what's relative humidity in a dome?
292 | return {"temp": 70, "windChill": 70, "relHumidity": None, "windMPH": 0}
293 |
294 | @sportsref.decorators.memoize
295 | def pbp(self):
296 | """Returns a dataframe of the play-by-play data from the game.
297 |
298 | Order of function calls:
299 | 1. parse_table on the play-by-play table
300 | 2. expand_details
301 | - calls parse_play_details & _clean_features
302 | 3. _add_team_columns
303 | 4. various fixes to clean data
304 | 5. _add_team_features
305 |
306 | :returns: pandas DataFrame of play-by-play. Similar to GPF.
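Usage sketch (hypothetical, reusing the sample boxscore ID from
sportsref.utils.rel_url_to_id; the win-probability columns rely on
sportsref.nfl.winProb)::

    bs = BoxScore("201409070dal")
    plays = bs.pbp()
    plays[["secsElapsed", "home_wp", "home_wpa", "distToGoal"]].head()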
307 | """ 308 | doc = self.get_doc() 309 | table = doc("table#pbp") 310 | df = sportsref.utils.parse_table(table) 311 | # make the following features conveniently available on each row 312 | df["boxscore_id"] = self.boxscore_id 313 | df["home"] = self.home() 314 | df["away"] = self.away() 315 | df["season"] = self.season() 316 | df["week"] = self.week() 317 | feats = sportsref.nfl.pbp.expand_details(df) 318 | 319 | # add team and opp columns by iterating through rows 320 | df = sportsref.nfl.pbp._add_team_columns(feats) 321 | # add WPA column (requires diff, can't be done row-wise) 322 | df["home_wpa"] = df.home_wp.diff() 323 | # lag score columns, fill in 0-0 to start 324 | for col in ("home_wp", "pbp_score_hm", "pbp_score_aw"): 325 | if col in df.columns: 326 | df[col] = df[col].shift(1) 327 | df.loc[0, ["pbp_score_hm", "pbp_score_aw"]] = 0 328 | # fill in WP NaN's 329 | df.home_wp.fillna(method="ffill", inplace=True) 330 | # fix first play border after diffing/shifting for WP and WPA 331 | firstPlaysOfGame = df[df.secsElapsed == 0].index 332 | line = self.line() 333 | for i in firstPlaysOfGame: 334 | initwp = sportsref.nfl.winProb.initialWinProb(line) 335 | df.loc[i, "home_wp"] = initwp 336 | df.loc[i, "home_wpa"] = df.loc[i + 1, "home_wp"] - initwp 337 | # fix last play border after diffing/shifting for WP and WPA 338 | lastPlayIdx = df.index[-1] 339 | lastPlayWP = df.loc[lastPlayIdx, "home_wp"] 340 | # if a tie, final WP is 50%; otherwise, determined by winner 341 | winner = self.winner() 342 | finalWP = 50.0 if pd.isnull(winner) else (winner == self.home()) * 100.0 343 | df.loc[lastPlayIdx, "home_wpa"] = finalWP - lastPlayWP 344 | # fix WPA for timeouts and plays after timeouts 345 | timeouts = df[df.isTimeout].index 346 | for to in timeouts: 347 | df.loc[to, "home_wpa"] = 0.0 348 | if to + 2 in df.index: 349 | wpa = df.loc[to + 2, "home_wp"] - df.loc[to + 1, "home_wp"] 350 | else: 351 | wpa = finalWP - df.loc[to + 1, "home_wp"] 352 | df.loc[to + 1, "home_wpa"] = wpa 353 | # add team-related features to DataFrame 354 | df = sportsref.nfl.pbp._add_team_features(df) 355 | # fill distToGoal NaN's 356 | df["distToGoal"] = np.where(df.isKickoff, 65, df.distToGoal) 357 | df.distToGoal.fillna(method="bfill", inplace=True) 358 | df.distToGoal.fillna(method="ffill", inplace=True) # for last play 359 | 360 | return df 361 | 362 | @sportsref.decorators.memoize 363 | def ref_info(self): 364 | """Gets a dictionary of ref positions and the ref IDs of the refs for 365 | that game. 366 | 367 | :returns: A dictionary of ref positions and IDs. 368 | """ 369 | doc = self.get_doc() 370 | table = doc("table#officials") 371 | return sportsref.utils.parse_info_table(table) 372 | 373 | @sportsref.decorators.memoize 374 | def player_stats(self): 375 | """Gets the stats for offense, defense, returning, and kicking of 376 | individual players in the game. 377 | :returns: A DataFrame containing individual player stats. 
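Example (a sketch; the offense, defense, returns, and kicking tables are
merged on their shared columns, one row per player)::

    stats = BoxScore("201409070dal").player_stats()
    stats.set_index("player_id").head()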
378 | """ 379 | doc = self.get_doc() 380 | tableIDs = ("player_offense", "player_defense", "returns", "kicking") 381 | dfs = [] 382 | for tID in tableIDs: 383 | table = doc("table#{}".format(tID)) 384 | dfs.append(sportsref.utils.parse_table(table)) 385 | dfs = [df for df in dfs if not df.empty] 386 | df = functools.reduce( 387 | lambda x, y: pd.merge( 388 | x, y, how="outer", on=list(set(x.columns) & set(y.columns)) 389 | ), 390 | dfs, 391 | ).reset_index(drop=True) 392 | return df 393 | 394 | @sportsref.decorators.memoize 395 | def snap_counts(self): 396 | """Gets the snap counts for both teams' players and returns them in a 397 | DataFrame. Note: only goes back to 2012. 398 | 399 | :returns: DataFrame of snap count data 400 | """ 401 | # TODO: combine duplicate players, see 201312150mia - ThomDa03 402 | doc = self.get_doc() 403 | table_ids = ("vis_snap_counts", "home_snap_counts") 404 | tms = (self.away(), self.home()) 405 | df = pd.concat( 406 | [ 407 | sportsref.utils.parse_table(doc("table#{}".format(table_id))).assign( 408 | is_home=bool(i), team=tms[i], opp=tms[i * -1 + 1] 409 | ) 410 | for i, table_id in enumerate(table_ids) 411 | ] 412 | ) 413 | if df.empty: 414 | return df 415 | return df.set_index("player_id") 416 | -------------------------------------------------------------------------------- /sportsref/nfl/teams.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pyquery import PyQuery as pq 6 | 7 | import sportsref 8 | 9 | __all__ = ["team_names", "team_ids", "list_teams", "Team"] 10 | 11 | 12 | @sportsref.decorators.memoize 13 | def team_names(year): 14 | """Returns a mapping from team ID to full team name for a given season. 15 | Example of a full team name: "New England Patriots" 16 | 17 | :year: The year of the season in question (as an int). 18 | :returns: A dictionary with teamID keys and full team name values. 19 | """ 20 | doc = pq(sportsref.utils.get_html(sportsref.nfl.BASE_URL + "/teams/")) 21 | active_table = doc("table#teams_active") 22 | active_df = sportsref.utils.parse_table(active_table) 23 | inactive_table = doc("table#teams_inactive") 24 | inactive_df = sportsref.utils.parse_table(inactive_table) 25 | df = pd.concat((active_df, inactive_df)) 26 | df = df.loc[~df["has_class_partial_table"]] 27 | ids = df.team_id.str[:3].values 28 | names = [tr("th a") for tr in list(active_table("tr").items())] 29 | names.extend(tr("th a") for tr in list(inactive_table("tr").items())) 30 | names = [_f for _f in names if _f] 31 | names = [lst[0].text_content() for lst in names] 32 | # combine IDs and team names into pandas series 33 | series = pd.Series(names, index=ids) 34 | # create a mask to filter to teams from the given year 35 | mask = ((df.year_min <= year) & (year <= df.year_max)).values 36 | # filter, convert to a dict, and return 37 | return series[mask].to_dict() 38 | 39 | 40 | @sportsref.decorators.memoize 41 | def team_ids(year): 42 | """Returns a mapping from team name to team ID for a given season. Inverse 43 | mapping of team_names. Example of a full team name: "New England Patriots" 44 | 45 | :year: The year of the season in question (as an int). 46 | :returns: A dictionary with full team name keys and teamID values. 47 | """ 48 | names = team_names(year) 49 | return {v: k for k, v in list(names.items())} 50 | 51 | 52 | @sportsref.decorators.memoize 53 | def list_teams(year): 54 | """Returns a list of team IDs for a given season. 
55 | 56 | :year: The year of the season in question (as an int). 57 | :returns: A list of team IDs. 58 | """ 59 | return list(team_names(year).keys()) 60 | 61 | 62 | class Team(object, metaclass=sportsref.decorators.Cached): 63 | def __init__(self, teamID): 64 | self.teamID = teamID 65 | 66 | def __eq__(self, other): 67 | return self.teamID == other.teamID 68 | 69 | def __hash__(self): 70 | return hash(self.teamID) 71 | 72 | def __repr__(self): 73 | return "Team({})".format(self.teamID) 74 | 75 | def __str__(self): 76 | return self.name() 77 | 78 | def __reduce__(self): 79 | return Team, (self.teamID,) 80 | 81 | @sportsref.decorators.memoize 82 | def team_year_url(self, yr_str): 83 | return sportsref.nfl.BASE_URL + "/teams/{}/{}.htm".format(self.teamID, yr_str) 84 | 85 | @sportsref.decorators.memoize 86 | def get_main_doc(self): 87 | relURL = "/teams/{}".format(self.teamID) 88 | teamURL = sportsref.nfl.BASE_URL + relURL 89 | mainDoc = pq(sportsref.utils.get_html(teamURL)) 90 | return mainDoc 91 | 92 | @sportsref.decorators.memoize 93 | def get_year_doc(self, yr_str): 94 | return pq(sportsref.utils.get_html(self.team_year_url(yr_str))) 95 | 96 | @sportsref.decorators.memoize 97 | def name(self): 98 | """Returns the real name of the franchise given the team ID. 99 | 100 | Examples: 101 | 'nwe' -> 'New England Patriots' 102 | 'sea' -> 'Seattle Seahawks' 103 | 104 | :returns: A string corresponding to the team's full name. 105 | """ 106 | doc = self.get_main_doc() 107 | headerwords = doc("div#meta h1")[0].text_content().split() 108 | lastIdx = headerwords.index("Franchise") 109 | teamwords = headerwords[:lastIdx] 110 | return " ".join(teamwords) 111 | 112 | @sportsref.decorators.memoize 113 | def roster(self, year): 114 | """Returns the roster table for the given year. 115 | 116 | :year: The year for which we want the roster; defaults to current year. 117 | :returns: A DataFrame containing roster information for that year. 118 | """ 119 | doc = self.get_year_doc("{}_roster".format(year)) 120 | roster_table = doc("table#games_played_team") 121 | df = sportsref.utils.parse_table(roster_table) 122 | starter_table = doc("table#starters") 123 | if not starter_table.empty: 124 | start_df = sportsref.utils.parse_table(starter_table) 125 | start_df = start_df.dropna(axis=0, subset=["position"]) 126 | starters = start_df.set_index("position").player_id 127 | df["is_starter"] = df.player_id.isin(starters) 128 | df["starting_pos"] = df.player_id.map( 129 | lambda pid: ( 130 | starters[starters == pid].index[0] 131 | if pid in starters.values 132 | else None 133 | ) 134 | ) 135 | return df 136 | 137 | @sportsref.decorators.memoize 138 | def boxscores(self, year): 139 | """Gets list of BoxScore objects corresponding to the box scores from 140 | that year. 141 | 142 | :year: The year for which we want the boxscores; defaults to current 143 | year. 144 | :returns: np.array of strings representing boxscore IDs. 145 | """ 146 | doc = self.get_year_doc(year) 147 | table = doc("table#games") 148 | df = sportsref.utils.parse_table(table) 149 | if df.empty: 150 | return np.array([]) 151 | return df.boxscore_id.values 152 | 153 | @sportsref.decorators.memoize 154 | def _year_info_pq(self, year, keyword): 155 | """Returns a PyQuery object containing the info from the meta div at 156 | the top of the team year page with the given keyword. 157 | 158 | :year: Int representing the season. 159 | :keyword: A keyword to filter to a single p tag in the meta div. 160 | :returns: A PyQuery object for the selected p element. 
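Example (a sketch; "nwe" is the sample team ID from Team.name, and "Coach"
is the keyword used by head_coaches_by_game below)::

    Team("nwe")._year_info_pq(2015, "Coach").text()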
161 | """ 162 | doc = self.get_year_doc(year) 163 | p_tags = doc("div#meta div:not(.logo) p") 164 | texts = [p_tag.text_content().strip() for p_tag in p_tags] 165 | try: 166 | return next( 167 | pq(p_tag) 168 | for p_tag, text in zip(p_tags, texts) 169 | if keyword.lower() in text.lower() 170 | ) 171 | except StopIteration: 172 | if len(texts): 173 | raise ValueError("Keyword not found in any p tag.") 174 | else: 175 | raise ValueError("No meta div p tags found.") 176 | 177 | # TODO: add functions for OC, DC, PF, PA, W-L, etc. 178 | # TODO: Also give a function at BoxScore.homeCoach and BoxScore.awayCoach 179 | # TODO: BoxScore needs a gameNum function to do this? 180 | 181 | @sportsref.decorators.memoize 182 | def head_coaches_by_game(self, year): 183 | """Returns head coach data by game. 184 | 185 | :year: An int representing the season in question. 186 | :returns: An array with an entry per game of the season that the team 187 | played (including playoffs). Each entry is the head coach's ID for that 188 | game in the season. 189 | """ 190 | coach_str = self._year_info_pq(year, "Coach").text() 191 | regex = r"(\S+?) \((\d+)-(\d+)-(\d+)\)" 192 | coachAndTenure = [] 193 | m = True 194 | while m: 195 | m = re.search(regex, coach_str) 196 | coachID, wins, losses, ties = m.groups() 197 | # nextIndex = m.end(4) + 1 198 | # coachStr = coachStr[nextIndex:] 199 | tenure = int(wins) + int(losses) + int(ties) 200 | coachAndTenure.append((coachID, tenure)) 201 | 202 | coachIDs = [cID for cID, games in coachAndTenure for _ in range(games)] 203 | return np.array(coachIDs[::-1]) 204 | 205 | @sportsref.decorators.memoize 206 | def wins(self, year): 207 | """Returns the # of regular season wins a team in a year. 208 | 209 | :year: The year for the season in question. 210 | :returns: The number of regular season wins. 211 | """ 212 | schedule = self.schedule(year) 213 | if schedule.empty: 214 | return np.nan 215 | return schedule.query("week_num <= 17").is_win.sum() 216 | 217 | @sportsref.decorators.memoize 218 | def schedule(self, year): 219 | """Returns a DataFrame with schedule information for the given year. 220 | 221 | :year: The year for the season in question. 222 | :returns: Pandas DataFrame with schedule information. 223 | """ 224 | doc = self.get_year_doc(year) 225 | table = doc("table#games") 226 | df = sportsref.utils.parse_table(table) 227 | if df.empty: 228 | return pd.DataFrame() 229 | df = df.loc[df["week_num"].notnull()] 230 | df["week_num"] = np.arange(len(df)) + 1 231 | df["is_win"] = df["game_outcome"] == "W" 232 | df["is_loss"] = df["game_outcome"] == "L" 233 | df["is_tie"] = df["game_outcome"] == "T" 234 | df["is_bye"] = df["game_outcome"].isnull() 235 | df["is_ot"] = df["overtime"].notnull() 236 | return df 237 | 238 | @sportsref.decorators.memoize 239 | def srs(self, year): 240 | """Returns the SRS (Simple Rating System) for a team in a year. 241 | 242 | :year: The year for the season in question. 243 | :returns: A float of SRS. 244 | """ 245 | try: 246 | srs_text = self._year_info_pq(year, "SRS").text() 247 | except ValueError: 248 | return None 249 | m = re.match(r"SRS\s*?:\s*?(\S+)", srs_text) 250 | if m: 251 | return float(m.group(1)) 252 | else: 253 | return None 254 | 255 | @sportsref.decorators.memoize 256 | def sos(self, year): 257 | """Returns the SOS (Strength of Schedule) for a team in a year, based 258 | on SRS. 259 | 260 | :year: The year for the season in question. 261 | :returns: A float of SOS. 
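Example (a sketch, pairing SOS with the SRS rating it is based on)::

    team = Team("nwe")
    rating, schedule_strength = team.srs(2015), team.sos(2015)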
262 | """ 263 | try: 264 | sos_text = self._year_info_pq(year, "SOS").text() 265 | except ValueError: 266 | return None 267 | m = re.search(r"SOS\s*:\s*(\S+)", sos_text) 268 | if m: 269 | return float(m.group(1)) 270 | else: 271 | return None 272 | 273 | @sportsref.decorators.memoize 274 | def off_coordinator(self, year): 275 | """Returns the coach ID for the team's OC in a given year. 276 | 277 | :year: An int representing the year. 278 | :returns: A string containing the coach ID of the OC. 279 | """ 280 | try: 281 | oc_anchor = self._year_info_pq(year, "Offensive Coordinator")("a") 282 | if oc_anchor: 283 | return oc_anchor.attr["href"] 284 | except ValueError: 285 | return None 286 | 287 | @sportsref.decorators.memoize 288 | def def_coordinator(self, year): 289 | """Returns the coach ID for the team's DC in a given year. 290 | 291 | :year: An int representing the year. 292 | :returns: A string containing the coach ID of the DC. 293 | """ 294 | try: 295 | dc_anchor = self._year_info_pq(year, "Defensive Coordinator")("a") 296 | if dc_anchor: 297 | return dc_anchor.attr["href"] 298 | except ValueError: 299 | return None 300 | 301 | @sportsref.decorators.memoize 302 | def stadium(self, year): 303 | """Returns the ID for the stadium in which the team played in a given 304 | year. 305 | 306 | :year: The year in question. 307 | :returns: A string representing the stadium ID. 308 | """ 309 | anchor = self._year_info_pq(year, "Stadium")("a") 310 | return sportsref.utils.rel_url_to_id(anchor.attr["href"]) 311 | 312 | @sportsref.decorators.memoize 313 | def off_scheme(self, year): 314 | """Returns the name of the offensive scheme the team ran in the given 315 | year. 316 | 317 | :year: Int representing the season year. 318 | :returns: A string representing the offensive scheme. 319 | """ 320 | scheme_text = self._year_info_pq(year, "Offensive Scheme").text() 321 | m = re.search(r"Offensive Scheme[:\s]*(.+)\s*", scheme_text, re.I) 322 | if m: 323 | return m.group(1) 324 | else: 325 | return None 326 | 327 | @sportsref.decorators.memoize 328 | def def_alignment(self, year): 329 | """Returns the name of the defensive alignment the team ran in the 330 | given year. 331 | 332 | :year: Int representing the season year. 333 | :returns: A string representing the defensive alignment. 334 | """ 335 | scheme_text = self._year_info_pq(year, "Defensive Alignment").text() 336 | m = re.search(r"Defensive Alignment[:\s]*(.+)\s*", scheme_text, re.I) 337 | if m: 338 | return m.group(1) 339 | else: 340 | return None 341 | 342 | @sportsref.decorators.memoize 343 | def team_stats(self, year): 344 | """Returns a Series (dict-like) of team stats from the team-season 345 | page. 346 | 347 | :year: Int representing the season. 348 | :returns: A Series of team stats. 349 | """ 350 | doc = self.get_year_doc(year) 351 | table = doc("table#team_stats") 352 | df = sportsref.utils.parse_table(table) 353 | if df.empty: 354 | return pd.Series() 355 | return df.loc[df.player_id == "Team Stats"].iloc[0] 356 | 357 | @sportsref.decorators.memoize 358 | def opp_stats(self, year): 359 | """Returns a Series (dict-like) of the team's opponent's stats from the 360 | team-season page. 361 | 362 | :year: Int representing the season. 363 | :returns: A Series of team stats. 364 | """ 365 | doc = self.get_year_doc(year) 366 | table = doc("table#team_stats") 367 | df = sportsref.utils.parse_table(table) 368 | return df.loc[df.player_id == "Opp. 
Stats"].iloc[0] 369 | 370 | @sportsref.decorators.memoize 371 | def passing(self, year): 372 | doc = self.get_year_doc(year) 373 | table = doc("table#passing") 374 | df = sportsref.utils.parse_table(table) 375 | return df 376 | 377 | @sportsref.decorators.memoize 378 | def rushing_and_receiving(self, year): 379 | doc = self.get_year_doc(year) 380 | table = doc("#rushing_and_receiving") 381 | df = sportsref.utils.parse_table(table) 382 | return df 383 | 384 | @sportsref.decorators.memoize 385 | def off_splits(self, year): 386 | """Returns a DataFrame of offensive team splits for a season. 387 | 388 | :year: int representing the season. 389 | :returns: Pandas DataFrame of split data. 390 | """ 391 | doc = self.get_year_doc("{}_splits".format(year)) 392 | tables = doc("table.stats_table") 393 | dfs = [sportsref.utils.parse_table(table) for table in list(tables.items())] 394 | dfs = [ 395 | df.assign(split=df.columns[0]).rename( 396 | columns={df.columns[0]: "split_value"} 397 | ) 398 | for df in dfs 399 | ] 400 | if not dfs: 401 | return pd.DataFrame() 402 | return pd.concat(dfs).reset_index(drop=True) 403 | 404 | @sportsref.decorators.memoize 405 | def def_splits(self, year): 406 | """Returns a DataFrame of defensive team splits (i.e. opponent splits) 407 | for a season. 408 | 409 | :year: int representing the season. 410 | :returns: Pandas DataFrame of split data. 411 | """ 412 | doc = self.get_year_doc("{}_opp_splits".format(year)) 413 | tables = doc("table.stats_table") 414 | dfs = [sportsref.utils.parse_table(table) for table in list(tables.items())] 415 | dfs = [ 416 | df.assign(split=df.columns[0]).rename( 417 | columns={df.columns[0]: "split_value"} 418 | ) 419 | for df in dfs 420 | ] 421 | if not dfs: 422 | return pd.DataFrame() 423 | return pd.concat(dfs).reset_index(drop=True) 424 | -------------------------------------------------------------------------------- /sportsref/nba/boxscores.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyquery import PyQuery as pq 7 | 8 | import sportsref 9 | 10 | CLOCK_REGEX = re.compile(r"(\d+):(\d+)\.(\d+)") 11 | 12 | 13 | class BoxScore(object, metaclass=sportsref.decorators.Cached): 14 | def __init__(self, boxscore_id): 15 | self.boxscore_id = boxscore_id 16 | 17 | def __eq__(self, other): 18 | return self.boxscore_id == other.boxscore_id 19 | 20 | def __hash__(self): 21 | return hash(self.boxscore_id) 22 | 23 | def __repr__(self): 24 | return f"BoxScore({self.boxscore_id})" 25 | 26 | @sportsref.decorators.memoize 27 | def get_main_doc(self): 28 | url = f"{sportsref.nba.BASE_URL}/boxscores/{self.boxscore_id}.html" 29 | doc = pq(sportsref.utils.get_html(url)) 30 | return doc 31 | 32 | @sportsref.decorators.memoize 33 | def get_subpage_doc(self, page): 34 | url = f"{sportsref.nba.BASE_URL}/boxscores/{page}/{self.boxscore_id}.html" 35 | doc = pq(sportsref.utils.get_html(url)) 36 | return doc 37 | 38 | @sportsref.decorators.memoize 39 | def date(self): 40 | """Returns the date of the game. See Python datetime.date documentation 41 | for more. 42 | :returns: A datetime.date object with year, month, and day attributes. 
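Example (a sketch; NBA boxscore IDs begin with the game date)::

    BoxScore("201604130PHO").date()  # datetime.date(2016, 4, 13)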
43 | """ 44 | match = re.match(r"(\d{4})(\d{2})(\d{2})", self.boxscore_id) 45 | year, month, day = list(map(int, match.groups())) 46 | return datetime.date(year=year, month=month, day=day) 47 | 48 | @sportsref.decorators.memoize 49 | def weekday(self): 50 | days = [ 51 | "Monday", 52 | "Tuesday", 53 | "Wednesday", 54 | "Thursday", 55 | "Friday", 56 | "Saturday", 57 | "Sunday", 58 | ] 59 | date = self.date() 60 | wd = date.weekday() 61 | return days[wd] 62 | 63 | @sportsref.decorators.memoize 64 | def linescore(self): 65 | """Returns the linescore for the game as a DataFrame.""" 66 | doc = self.get_main_doc() 67 | table = doc("table#line_score") 68 | df = sportsref.utils.parse_table(table) 69 | df.index = ["away", "home"] 70 | return df 71 | 72 | @sportsref.decorators.memoize 73 | def home(self): 74 | """Returns home team ID. 75 | :returns: 3-character string representing home team's ID. 76 | """ 77 | linescore = self.linescore() 78 | return linescore.loc["home", "team_id"] 79 | 80 | @sportsref.decorators.memoize 81 | def away(self): 82 | """Returns away team ID. 83 | :returns: 3-character string representing away team's ID. 84 | """ 85 | linescore = self.linescore() 86 | return linescore.loc["away", "team_id"] 87 | 88 | @sportsref.decorators.memoize 89 | def home_score(self): 90 | """Returns score of the home team. 91 | :returns: int of the home score. 92 | """ 93 | linescore = self.linescore() 94 | return linescore.loc["home", "T"] 95 | 96 | @sportsref.decorators.memoize 97 | def away_score(self): 98 | """Returns score of the away team. 99 | :returns: int of the away score. 100 | """ 101 | linescore = self.linescore() 102 | return linescore.loc["away", "T"] 103 | 104 | @sportsref.decorators.memoize 105 | def winner(self): 106 | """Returns the team ID of the winning team. Returns NaN if a tie.""" 107 | hm_score = self.home_score() 108 | aw_score = self.away_score() 109 | if hm_score > aw_score: 110 | return self.home() 111 | elif hm_score < aw_score: 112 | return self.away() 113 | else: 114 | return None 115 | 116 | @sportsref.decorators.memoize 117 | def season(self): 118 | """ 119 | Returns the year ID of the season in which this game took place. 120 | 121 | :returns: An int representing the year of the season. 122 | """ 123 | d = self.date() 124 | if d.month >= 9: 125 | return d.year + 1 126 | else: 127 | return d.year 128 | 129 | def _get_player_stats(self, table_id_fmt): 130 | """Returns a DataFrame of player stats from the game (either basic or 131 | advanced, depending on the argument. 132 | 133 | :param table_id_fmt: Format string for str.format with a placeholder 134 | for the team ID (e.g. 
'box-{}-game-basic') 135 | :returns: DataFrame of player stats 136 | """ 137 | 138 | # get data 139 | doc = self.get_main_doc() 140 | tms = self.away(), self.home() 141 | team_table_ids = [table_id_fmt.format(tm.upper()) for tm in tms] 142 | tables = [doc(f"table#{table_id}") for table_id in team_table_ids] 143 | dfs = [sportsref.utils.parse_table(table) for table in tables] 144 | 145 | # clean data and add features 146 | for i, (tm, df) in enumerate(zip(tms, dfs)): 147 | no_time = df["mp"] == 0 148 | stat_cols = [ 149 | col for col, dtype in list(df.dtypes.items()) if dtype != "object" 150 | ] 151 | df.loc[no_time, stat_cols] = 0 152 | df["team_id"] = tm 153 | df["is_home"] = i == 1 154 | df["is_starter"] = [p < 5 for p in range(df.shape[0])] 155 | df.drop_duplicates(subset="player_id", keep="first", inplace=True) 156 | 157 | return pd.concat(dfs) 158 | 159 | @sportsref.decorators.memoize 160 | def basic_stats(self): 161 | """Returns a DataFrame of basic player stats from the game.""" 162 | return self._get_player_stats("box-{}-game-basic") 163 | 164 | @sportsref.decorators.memoize 165 | def advanced_stats(self): 166 | """Returns a DataFrame of advanced player stats from the game.""" 167 | return self._get_player_stats("box-{}-game-advanced") 168 | 169 | @sportsref.decorators.memoize 170 | def pbp(self, dense_lineups=False, sparse_lineups=False): 171 | """Returns a dataframe of the play-by-play data from the game. 172 | 173 | :param dense_lineups: If True, adds 10 columns containing the names of 174 | the players on the court. Defaults to False. 175 | :param sparse_lineups: If True, adds binary columns denoting whether a 176 | given player is in the game at the time of a pass. Defaults to 177 | False. 178 | :returns: pandas DataFrame of play-by-play. Similar to GPF. 
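Usage sketch (hypothetical game ID; the columns shown are among those
constructed below)::

    bs = BoxScore("201604130PHO")
    plays = bs.pbp(dense_lineups=False, sparse_lineups=False)
    plays[["quarter", "clock_str", "poss_id", "hm_score"]].tail()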
179 | """ 180 | try: 181 | doc = self.get_subpage_doc("pbp") 182 | except Exception: 183 | raise ValueError( 184 | f"Error fetching PBP subpage for boxscore {self.boxscore_id}" 185 | ) 186 | 187 | table = doc("table#pbp") 188 | trs = [ 189 | tr 190 | for tr in list(table("tr").items()) 191 | if ( 192 | not tr.attr["class"] 193 | or (tr.attr["id"] and tr.attr["id"].startswith("q")) 194 | ) 195 | ] 196 | rows = [tr.children("td") for tr in trs] 197 | n_rows = len(trs) 198 | data = [] 199 | cur_qtr = 0 200 | 201 | for i in range(n_rows): 202 | tr = trs[i] 203 | row = rows[i] 204 | play = {} 205 | 206 | # increment cur_qtr when we hit a new quarter 207 | if tr.attr["id"] and tr.attr["id"].startswith("q"): 208 | assert int(tr.attr["id"][1:]) == cur_qtr + 1 209 | cur_qtr += 1 210 | continue 211 | 212 | # add time of play to entry 213 | clock_str = row.eq(0).text() 214 | mins, secs, tenths = list( 215 | map(int, re.match(CLOCK_REGEX, clock_str).groups()) 216 | ) 217 | secs_in_period = 12 * 60 * min(cur_qtr, 4) + 5 * 60 * ( 218 | cur_qtr - 4 if cur_qtr > 4 else 0 219 | ) 220 | secs_elapsed = secs_in_period - (60 * mins + secs + 0.1 * tenths) 221 | play["secs_elapsed"] = secs_elapsed 222 | play["clock_str"] = clock_str 223 | play["quarter"] = cur_qtr 224 | 225 | # handle single play description 226 | # ex: beginning/end of quarter, jump ball 227 | if row.length == 2: 228 | desc = row.eq(1) 229 | # handle jump balls 230 | if desc.text().lower().startswith("jump ball: "): 231 | play["is_jump_ball"] = True 232 | jump_ball_str = sportsref.utils.flatten_links(desc) 233 | play.update( 234 | sportsref.nba.pbp.parse_play( 235 | self.boxscore_id, jump_ball_str, is_home=None 236 | ) 237 | ) 238 | # ignore rows marking beginning/end of quarters 239 | elif desc.text().lower().startswith( 240 | "start of " 241 | ) or desc.text().lower().startswith("end of "): 242 | continue 243 | # if another case, log and continue 244 | else: 245 | if not desc.text().lower().startswith("end of "): 246 | print( 247 | f"{self.boxscore_id}, Q{cur_qtr}, {clock_str} other case: {desc.text()}" 248 | ) 249 | continue 250 | 251 | # handle team play description 252 | # ex: shot, turnover, rebound, foul, sub, etc. 
253 | elif row.length == 6: 254 | aw_desc, hm_desc = row.eq(1), row.eq(5) 255 | is_hm_play = bool(hm_desc.text()) 256 | desc = hm_desc if is_hm_play else aw_desc 257 | desc = sportsref.utils.flatten_links(desc) 258 | # parse the play 259 | new_play = sportsref.nba.pbp.parse_play( 260 | self.boxscore_id, desc, is_hm_play 261 | ) 262 | if not new_play: 263 | continue 264 | elif isinstance(new_play, list): 265 | # this happens when a row needs to be expanded to 2 rows; 266 | # ex: double personal foul -> two PF rows 267 | 268 | # first, update and append the first row 269 | orig_play = dict(play) 270 | play.update(new_play[0]) 271 | data.append(play) 272 | # second, set up the second row to be appended below 273 | play = orig_play 274 | new_play = new_play[1] 275 | elif new_play.get("is_error"): 276 | print(f"can't parse: {desc}, boxscore: {self.boxscore_id}") 277 | # import pdb; pdb.set_trace() 278 | play.update(new_play) 279 | 280 | # otherwise, I don't know what this was 281 | else: 282 | raise Exception(f"don't know how to handle row of length {row.length}") 283 | 284 | data.append(play) 285 | 286 | # convert to DataFrame and clean columns 287 | df = pd.DataFrame.from_records(data) 288 | df.sort_values("secs_elapsed", inplace=True, kind="mergesort") 289 | df = sportsref.nba.pbp.clean_features(df) 290 | 291 | # add columns for home team, away team, boxscore_id, date 292 | away, home = self.away(), self.home() 293 | df["home"] = home 294 | df["away"] = away 295 | df["boxscore_id"] = self.boxscore_id 296 | df["season"] = self.season() 297 | date = self.date() 298 | df["year"] = date.year 299 | df["month"] = date.month 300 | df["day"] = date.day 301 | 302 | def _clean_rebs(df): 303 | df.reset_index(drop=True, inplace=True) 304 | no_reb_after = ( 305 | ((df.fta_num < df.tot_fta) | df.is_ftm | df.get("is_tech_fta", False)) 306 | .shift(1) 307 | .fillna(False) 308 | ) 309 | no_reb_before = ((df.fta_num == df.tot_fta)).shift(-1).fillna(False) 310 | se_end_qtr = df.loc[df.clock_str == "0:00.0", "secs_elapsed"].unique() 311 | no_reb_when = df.secs_elapsed.isin(se_end_qtr) 312 | drop_mask = (df.rebounder == "Team") & ( 313 | no_reb_after | no_reb_before | no_reb_when 314 | ) 315 | df.drop(df.loc[drop_mask].index, axis=0, inplace=True) 316 | df.reset_index(drop=True, inplace=True) 317 | return df 318 | 319 | # get rid of 'rebounds' after FTM, non-final FTA, or tech FTA 320 | df = _clean_rebs(df) 321 | 322 | # track possession number for each possession 323 | # TODO: see 201604130PHO, secs_elapsed == 2756 324 | # things that end a poss: 325 | # FGM, dreb, TO, end of Q, made last FT, lost jump ball, 326 | # def goaltending, shot clock violation 327 | new_poss = (df.off_team == df.home).diff().fillna(False) 328 | # def rebound considered part of the new possession 329 | df["poss_id"] = np.cumsum(new_poss) + df.is_dreb 330 | # create poss_id with rebs -> new possessions for granular groupbys 331 | poss_id_reb = np.cumsum(new_poss | df.is_reb) 332 | 333 | # make sure plays with the same clock time are in the right order 334 | # TODO: make sort_cols depend on what cols are in the play? 
335 | # or combine related plays, like and-1 shot and foul 336 | # issues come up with FGA after timeout in 201604130LAL 337 | # issues come up with PF between FGA and DREB in 201604120SAS 338 | sort_cols = [ 339 | col 340 | for col in [ 341 | "is_reb", 342 | "is_fga", 343 | "is_pf", 344 | "is_tech_foul", 345 | "is_ejection", 346 | "is_tech_fta", 347 | "is_timeout", 348 | "is_pf_fta", 349 | "fta_num", 350 | "is_viol", 351 | "is_to", 352 | "is_jump_ball", 353 | "is_sub", 354 | ] 355 | if col in df.columns 356 | ] 357 | asc_true = ["fta_num"] 358 | ascend = [(col in asc_true) for col in sort_cols] 359 | for label, group in df.groupby([df.secs_elapsed, poss_id_reb]): 360 | if len(group) > 1: 361 | df.loc[group.index, :] = group.sort_values( 362 | sort_cols, ascending=ascend, kind="mergesort" 363 | ).values 364 | 365 | # 2nd pass: get rid of 'rebounds' after FTM, non-final FTA, etc. 366 | df = _clean_rebs(df) 367 | 368 | # makes sure off/def and poss_id are correct for subs after rearranging 369 | # some possessions above 370 | df.loc[df["is_sub"], ["off_team", "def_team", "poss_id"]] = np.nan 371 | df.off_team.fillna(method="bfill", inplace=True) 372 | df.def_team.fillna(method="bfill", inplace=True) 373 | df.poss_id.fillna(method="bfill", inplace=True) 374 | # make off_team and def_team NaN for jump balls 375 | if "is_jump_ball" in df.columns: 376 | df.loc[df["is_jump_ball"], ["off_team", "def_team"]] = np.nan 377 | 378 | # make sure 'off_team' is always the team shooting FTs, even on techs 379 | # (impt for keeping track of the score) 380 | if "is_tech_fta" in df.columns: 381 | tech_fta = df["is_tech_fta"] 382 | df.loc[tech_fta, "off_team"] = df.loc[tech_fta, "fta_team"] 383 | df.loc[tech_fta, "def_team"] = np.where( 384 | df.loc[tech_fta, "off_team"] == home, away, home 385 | ) 386 | df.drop("fta_team", axis=1, inplace=True) 387 | # redefine poss_id_reb 388 | new_poss = (df.off_team == df.home).diff().fillna(False) 389 | poss_id_reb = np.cumsum(new_poss | df.is_reb) 390 | 391 | # get rid of redundant subs 392 | for (se, tm, pnum), group in df[df.is_sub].groupby( 393 | [df.secs_elapsed, df.sub_team, poss_id_reb] 394 | ): 395 | if len(group) > 1: 396 | sub_in = set() 397 | sub_out = set() 398 | # first, figure out who's in and who's out after subs 399 | for i, row in group.iterrows(): 400 | if row["sub_in"] in sub_out: 401 | sub_out.remove(row["sub_in"]) 402 | else: 403 | sub_in.add(row["sub_in"]) 404 | if row["sub_out"] in sub_in: 405 | sub_in.remove(row["sub_out"]) 406 | else: 407 | sub_out.add(row["sub_out"]) 408 | assert len(sub_in) == len(sub_out) 409 | # second, add those subs 410 | n_subs = len(sub_in) 411 | for idx, p_in, p_out in zip(group.index[:n_subs], sub_in, sub_out): 412 | assert df.loc[idx, "is_sub"] 413 | df.loc[idx, "sub_in"] = p_in 414 | df.loc[idx, "sub_out"] = p_out 415 | df.loc[idx, "sub_team"] = tm 416 | df.loc[idx, "detail"] = f"{p_in} enters the game for {p_out}" 417 | # third, if applicable, remove old sub entries when there are 418 | # redundant subs 419 | n_extra = len(group) - len(sub_in) 420 | if n_extra: 421 | extra_idxs = group.index[-n_extra:] 422 | df.drop(extra_idxs, axis=0, inplace=True) 423 | 424 | df.reset_index(drop=True, inplace=True) 425 | 426 | # add column for pts and score 427 | df["pts"] = df["is_ftm"] + 2 * df["is_fgm"] + (df["is_fgm"] & df["is_three"]) 428 | df["hm_pts"] = np.where(df.off_team == df.home, df.pts, 0) 429 | df["aw_pts"] = np.where(df.off_team == df.away, df.pts, 0) 430 | df["hm_score"] = np.cumsum(df["hm_pts"]) 431 | 
df["aw_score"] = np.cumsum(df["aw_pts"]) 432 | 433 | # more helpful columns 434 | # "play" is differentiated from "poss" by counting OReb as new play 435 | # "plays" end with non-and1 FGA, TO, last non-tech FTA, or end of qtr 436 | # (or double lane viol) 437 | new_qtr = df.quarter.diff().shift(-1).fillna(False).astype(bool) # noqa 438 | and1 = ( # noqa 439 | df.is_fgm 440 | & df.is_pf.shift(-1).fillna(False) 441 | & df.is_fta.shift(-2).fillna(False) 442 | & ~df.secs_elapsed.diff().shift(-1).fillna(False).astype(bool) 443 | ) 444 | double_lane = df.get("viol_type") == "double lane" # noqa 445 | 446 | new_play = df.eval( 447 | "(is_fga & ~(@and1)) | is_to | @new_qtr |" 448 | "(is_fta & ~is_tech_fta & fta_num == tot_fta) |" 449 | "@double_lane" 450 | ) 451 | df["play_id"] = np.cumsum(new_play).shift(1).fillna(0) 452 | df["hm_off"] = df.off_team == df.home 453 | 454 | # get lineup data 455 | if dense_lineups: 456 | df = pd.concat((df, sportsref.nba.pbp.get_dense_lineups(df)), axis=1) 457 | if sparse_lineups: 458 | df = pd.concat((df, sportsref.nba.pbp.get_sparse_lineups(df)), axis=1) 459 | 460 | # TODO: add shot clock as a feature 461 | 462 | return df 463 | -------------------------------------------------------------------------------- /sportsref/nfl/pbp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import sportsref 8 | 9 | RUSH_OPTS = { 10 | "left end": "LE", 11 | "left tackle": "LT", 12 | "left guard": "LG", 13 | "up the middle": "M", 14 | "middle": "M", 15 | "right end": "RE", 16 | "right tackle": "RT", 17 | "right guard": "RG", 18 | } 19 | PASS_OPTS = { 20 | "short left": "SL", 21 | "short middle": "SM", 22 | "short right": "SR", 23 | "deep left": "DL", 24 | "deep middle": "DM", 25 | "deep right": "DR", 26 | } 27 | 28 | 29 | def expand_details(df, detailCol="detail"): 30 | """Expands the details column of the given dataframe and returns the 31 | resulting DataFrame. 32 | 33 | :df: The input DataFrame. 34 | :detailCol: The detail column name. 35 | :returns: Returns DataFrame with new columns from pbp parsing. 36 | """ 37 | df = copy.deepcopy(df) 38 | df["detail"] = df[detailCol] 39 | dicts = [ 40 | sportsref.nfl.pbp.parse_play_details(detail) for detail in df["detail"].values 41 | ] 42 | # clean up unmatched details 43 | cols = {c for d in dicts if d for c in list(d.keys())} 44 | blankEntry = {c: np.nan for c in cols} 45 | newDicts = [d if d else blankEntry for d in dicts] 46 | # get details DataFrame and merge it with original to create main DataFrame 47 | details = pd.DataFrame(newDicts) 48 | df = pd.merge(df, details, left_index=True, right_index=True) 49 | # add isError column 50 | errors = [i for i, d in enumerate(dicts) if d is None] 51 | df["isError"] = False 52 | df.loc[errors, "isError"] = True 53 | # fill in some NaN's necessary for _clean_features 54 | df.loc[0, "qtr_time_remain"] = "15:00" 55 | df.qtr_time_remain.fillna(method="bfill", inplace=True) 56 | df.qtr_time_remain.fillna( 57 | pd.Series(np.where(df.quarter == 4, "0:00", "15:00")), inplace=True 58 | ) 59 | # use _clean_features to clean up and add columns 60 | new_df = df.apply(_clean_features, axis=1) 61 | return new_df 62 | 63 | 64 | @sportsref.decorators.memoize 65 | def parse_play_details(details): 66 | """Parses play details from play-by-play string and returns structured 67 | data. 
68 | 
69 |     :details: detail string for play
70 |     :returns: dictionary of play attributes
71 |     """
72 | 
73 |     # if input isn't a string, return None
74 |     if not isinstance(details, str):
75 |         return None
76 | 
77 |     rushOptRE = r"(?P<rushDir>{})".format(r"|".join(list(RUSH_OPTS.keys())))
78 |     passOptRE = r"(?P<passLoc>{})".format(r"|".join(list(PASS_OPTS.keys())))
79 | 
80 |     playerRE = r"\S{6,8}\d{2}"
81 | 
82 |     # initialize return dictionary - struct
83 |     struct = {}
84 | 
85 |     # handle challenges
86 |     # TODO: record the play both before & after an overturned challenge
87 |     challengeRE = re.compile(
88 |         r".+\. (?P<challenger>.+?) challenged.*? the play was "
89 |         r"(?P<callUpheld>upheld|overturned)\.",
90 |         re.IGNORECASE,
91 |     )
92 |     match = challengeRE.search(details)
93 |     if match:
94 |         struct["isChallenge"] = True
95 |         struct.update(match.groupdict())
96 |         # if overturned, only record updated play
97 |         if "overturned" in details:
98 |             overturnedIdx = details.index("overturned.")
99 |             newStart = overturnedIdx + len("overturned.")
100 |             details = details[newStart:].strip()
101 |     else:
102 |         struct["isChallenge"] = False
103 | 
104 |     # TODO: expand on laterals
105 |     struct["isLateral"] = details.find("lateral") != -1
106 | 
107 |     # create rushing regex
108 |     rusherRE = r"(?P<rusher>{0})".format(playerRE)
109 |     rushOptRE = r"(?: {})?".format(rushOptRE)
110 |     rushYardsRE = r"(?:(?:(?P<rushYds>\-?\d+) yards?)|(?:no gain))"
111 |     # cases: tackle, fumble, td, penalty
112 |     tackleRE = (
113 |         r"(?: \(tackle by (?P<tackler1>{0})"
114 |         r"(?: and (?P<tackler2>{0}))?\))?".format(playerRE)
115 |     )
116 |     # currently, plays with multiple fumbles record the original fumbler
117 |     # and the final fumble recoverer
118 |     fumbleRE = (
119 |         r"(?:"
120 |         r"\.? ?(?P<fumbler>{0}) fumbles"
121 |         r"(?: \(forced by (?P<fumbForcer>{0})\))?"
122 |         r"(?:.*, recovered by (?P<fumbRecoverer>{0}) at )?"
123 |         r"(?:, ball out of bounds at )?"
124 |         r"(?:(?P<fumbRecFieldSide>[a-z]+)?\-?(?P<fumbRecYdLine>\-?\d+))?"
125 |         r"(?: and returned for (?P<fumbRetYds>\-?\d*) yards)?"
126 |         r")?".format(playerRE)
127 |     )
128 |     tdSafetyRE = r"(?:(?P<isTD>, touchdown)|(?P<isSafety>, safety))?"
129 |     # TODO: offsetting penalties
130 |     penaltyRE = (
131 |         r"(?:.*?"
132 |         r"\. Penalty on (?P<penOn>{0}|): "
133 |         r"(?P<penalty>[^\(,]+)"
134 |         r"(?: \((?P<penDeclined>Declined)\)|"
135 |         r", (?P<penYds>\d*) yards?)"
136 |         r"(?: \(no play\))?"
137 |         r")?".format(playerRE)
138 |     )
139 | 
140 |     rushREstr = (r"{}{}(?: for {}{}{}{}{})?").format(
141 |         rusherRE, rushOptRE, rushYardsRE, tackleRE, fumbleRE, tdSafetyRE, penaltyRE
142 |     )
143 |     rushRE = re.compile(rushREstr, re.IGNORECASE)
144 | 
145 |     # create passing regex
146 |     # TODO: capture "defended by X" for defensive stats
147 |     passerRE = r"(?P<passer>{0})".format(playerRE)
148 |     sackRE = (
149 |         r"(?:sacked (?:by (?P<sacker1>{0})(?: and (?P<sacker2>{0}))? )?"
150 |         r"for (?P<sackYds>\-?\d+) yards?)".format(playerRE)
151 |     )
152 |     # create throw RE
153 |     completeRE = r"pass (?P<isComplete>(?:in)?complete)"
154 |     passOptRE = r"(?: {})?".format(passOptRE)
155 |     targetedRE = r"(?: (?:to |intended for )?(?P<target>{0}))?".format(playerRE)
156 |     passYardsRE = r"(?: for (?:(?P<passYds>\-?\d+) yards?|no gain))"
157 |     intRE = (
158 |         r"(?: is intercepted by (?P<interceptor>{0}) at ".format(playerRE)
159 |         + r"(?:(?P<intFieldSide>[a-z]*)?\-?(?P<intYdLine>\-?\d*))?"
160 |         + r"(?: and returned for (?P<intRetYds>\-?\d+) yards?\.?)?)?"
161 |     )
162 |     throwRE = r"(?:{}{}{}(?:(?:{}|{}){})?)".format(
163 |         completeRE, passOptRE, targetedRE, passYardsRE, intRE, tackleRE
164 |     )
165 |     passREstr = (r"{} (?:{}|{})(?:{}{}{})?").format(
166 |         passerRE, sackRE, throwRE, fumbleRE, tdSafetyRE, penaltyRE
167 |     )
168 |     passRE = re.compile(passREstr, re.IGNORECASE)
169 | 
170 |     # create kickoff regex
171 |     koKickerRE = r"(?P<koKicker>{0})".format(playerRE)
172 |     koYardsRE = (
173 |         r" kicks (?:off|(?P<isOnside>onside))" r" (?:(?P<koYds>\d+) yards?|no gain)"
174 |     )
175 |     nextREs = []
176 |     nextREs.append(
177 |         (
178 |             r", (?:returned|recovered) by (?P<koReturner>{0})(?: for "
179 |             r"(?:(?P<koRetYds>\-?\d+) yards?|no gain))?"
180 |         ).format(playerRE)
181 |     )
182 |     nextREs.append(
183 |         (
184 |             r"(?P<isMuffedCatch>, muffed catch by )(?P<muffedBy>{0}),"
185 |             r"(?: recovered by (?P<muffRecoverer>{0}))?"
186 |         ).format(playerRE)
187 |         + r"(?: and returned for (?:(?P<muffRetYds>\-?\d+) yards|no gain))?"
188 |     )
189 |     nextREs.append(r", recovered by (?P<koRecoverer>{0})".format(playerRE))
190 |     nextREs.append(r"(?P<oob>, out of bounds)")
191 |     nextREs.append(r"(?P<isTouchback>, touchback)")
192 |     # TODO: test the following line to fix a small subset of cases
193 |     # (ex: muff -> oob)
194 |     nextRE = "".join(r"(?:{})?".format(nre) for nre in nextREs)
195 |     kickoffREstr = r"{}{}{}{}{}{}{}".format(
196 |         koKickerRE, koYardsRE, nextRE, tackleRE, fumbleRE, tdSafetyRE, penaltyRE
197 |     )
198 |     kickoffRE = re.compile(kickoffREstr, re.IGNORECASE)
199 | 
200 |     # create timeout regex
201 |     timeoutREstr = r"Timeout #(?P<timeoutNum>\d) by (?P<timeoutTeam>.+)"
202 |     timeoutRE = re.compile(timeoutREstr, re.IGNORECASE)
203 | 
204 |     # create FG regex
205 |     fgKickerRE = r"(?P<fgKicker>{0})".format(playerRE)
206 |     fgBaseRE = r" (?P<fgDist>\d+) yard field goal" r" (?P<fgGood>good|no good)"
207 |     fgBlockRE = (
208 |         r"(?:, (?P<isBlocked>blocked) by "
209 |         r"(?P<fgBlocker>{0}))?".format(playerRE)
210 |         + r"(?:, recovered by (?P<fgBlockRecoverer>{0}))?".format(playerRE)
211 |         + r"(?: and returned for (?:(?P<fgBlockRetYds>\-?\d+) yards?|no gain))?"
212 |     )
213 |     fgREstr = r"{}{}{}{}{}".format(
214 |         fgKickerRE, fgBaseRE, fgBlockRE, tdSafetyRE, penaltyRE
215 |     )
216 |     fgRE = re.compile(fgREstr, re.IGNORECASE)
217 | 
218 |     # create punt regex
219 |     punterRE = r".*?(?P<punter>{0})".format(playerRE)
220 |     puntBlockRE = (
221 |         (
222 |             r" punts, (?P<isBlocked>blocked) by (?P<puntBlocker>{0})"
223 |             r"(?:, recovered by (?P<puntBlockRecoverer>{0})"
224 |         ).format(playerRE)
225 |         + r"(?: and returned (?:(?P<puntBlockRetYds>\-?\d+) yards|no gain))?)?"
226 |     )
227 |     puntYdsRE = r" punts (?P<puntYds>\d+) yards?"
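    # (note: as with the kickoff pattern above, the punt pattern below ORs
    # together several mutually exclusive outcomes -- fair catch, out of
    # bounds, muffed catch, or a normal return -- so at most one of those
    # groups is populated for a given play)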
228 |     nextREs = []
229 |     nextREs.append(
230 |         r", (?P<isFairCatch>fair catch) by (?P<fairCatcher>{0})".format(playerRE)
231 |     )
232 |     nextREs.append(r", (?P<oob>out of bounds)")
233 |     nextREs.append(
234 |         (
235 |             r"(?P<isMuffedCatch>, muffed catch by )(?P<muffedBy>{0}),"
236 |             r" recovered by (?P<muffRecoverer>{0})"
237 |         ).format(playerRE)
238 |         + r" and returned for "
239 |         + r"(?:(?P<muffRetYds>\d+) yards|no gain)"
240 |     )
241 |     nextREs.append(
242 |         r", returned by (?P<puntReturner>{0}) for ".format(playerRE)
243 |         + r"(?:(?P<puntRetYds>\-?\d+) yards?|no gain)"
244 |     )
245 |     nextRE = r"(?:{})?".format("|".join(nextREs))
246 |     puntREstr = r"{}(?:{}|{}){}{}{}{}{}".format(
247 |         punterRE,
248 |         puntBlockRE,
249 |         puntYdsRE,
250 |         nextRE,
251 |         tackleRE,
252 |         fumbleRE,
253 |         tdSafetyRE,
254 |         penaltyRE,
255 |     )
256 |     puntRE = re.compile(puntREstr, re.IGNORECASE)
257 | 
258 |     # create kneel regex
259 |     kneelREstr = (
260 |         r"(?P<kneelQB>{0}) kneels for ".format(playerRE)
261 |         + r"(?:(?P<kneelYds>\-?\d+) yards?|no gain)"
262 |     )
263 |     kneelRE = re.compile(kneelREstr, re.IGNORECASE)
264 | 
265 |     # create spike regex
266 |     spikeREstr = r"(?P<spikeQB>{0}) spiked the ball".format(playerRE)
267 |     spikeRE = re.compile(spikeREstr, re.IGNORECASE)
268 | 
269 |     # create XP regex
270 |     extraPointREstr = (
271 |         r"(?:(?P<xpKicker>{0}) kicks)? ?extra point " r"(?P<xpGood>good|no good)"
272 |     ).format(playerRE)
273 |     extraPointRE = re.compile(extraPointREstr, re.IGNORECASE)
274 | 
275 |     # create 2pt conversion regex
276 |     twoPointREstr = (
277 |         r"Two Point Attempt: (?P<twoPoint>.*?),?\s+conversion\s+"
278 |         r"(?P<twoPointSuccess>succeeds|fails)"
279 |     )
280 |     twoPointRE = re.compile(twoPointREstr, re.IGNORECASE)
281 | 
282 |     # create penalty regex
283 |     psPenaltyREstr = (
284 |         r"^Penalty on (?P<penOn>{0}|".format(playerRE)
285 |         + r"\w{3}): "
286 |         + r"(?P<penalty>[^\(,]+)(?: \((?P<penDeclined>Declined)\)|"
287 |         + r", (?P<penYds>\d*) yards?|"
288 |         + r".*?(?: \(no play\)))"
289 |     )
290 |     psPenaltyRE = re.compile(psPenaltyREstr, re.IGNORECASE)
291 | 
292 |     # try parsing as a kickoff
293 |     match = kickoffRE.search(details)
294 |     if match:
295 |         # parse as a kickoff
296 |         struct["isKickoff"] = True
297 |         struct.update(match.groupdict())
298 |         return struct
299 | 
300 |     # try parsing as a timeout
301 |     match = timeoutRE.search(details)
302 |     if match:
303 |         # parse as timeout
304 |         struct["isTimeout"] = True
305 |         struct.update(match.groupdict())
306 |         return struct
307 | 
308 |     # try parsing as a field goal
309 |     match = fgRE.search(details)
310 |     if match:
311 |         # parse as a field goal
312 |         struct["isFieldGoal"] = True
313 |         struct.update(match.groupdict())
314 |         return struct
315 | 
316 |     # try parsing as a punt
317 |     match = puntRE.search(details)
318 |     if match:
319 |         # parse as a punt
320 |         struct["isPunt"] = True
321 |         struct.update(match.groupdict())
322 |         return struct
323 | 
324 |     # try parsing as a kneel
325 |     match = kneelRE.search(details)
326 |     if match:
327 |         # parse as a kneel
328 |         struct["isKneel"] = True
329 |         struct.update(match.groupdict())
330 |         return struct
331 | 
332 |     # try parsing as a spike
333 |     match = spikeRE.search(details)
334 |     if match:
335 |         # parse as a spike
336 |         struct["isSpike"] = True
337 |         struct.update(match.groupdict())
338 |         return struct
339 | 
340 |     # try parsing as an XP
341 |     match = extraPointRE.search(details)
342 |     if match:
343 |         # parse as an XP
344 |         struct["isXP"] = True
345 |         struct.update(match.groupdict())
346 |         return struct
347 | 
348 |     # try parsing as a 2-point conversion
349 |     match = twoPointRE.search(details)
350 |     if match:
351 |         # parse as a 2-point conversion
352 |         struct["isTwoPoint"] = True
353 |         struct["twoPointSuccess"] = match.group("twoPointSuccess")
354
| realPlay = sportsref.nfl.pbp.parse_play_details(match.group("twoPoint")) 355 | if realPlay: 356 | struct.update(realPlay) 357 | return struct 358 | 359 | # try parsing as a pass 360 | match = passRE.search(details) 361 | if match: 362 | # parse as a pass 363 | struct["isPass"] = True 364 | struct.update(match.groupdict()) 365 | return struct 366 | 367 | # try parsing as a pre-snap penalty 368 | match = psPenaltyRE.search(details) 369 | if match: 370 | # parse as a pre-snap penalty 371 | struct["isPresnapPenalty"] = True 372 | struct.update(match.groupdict()) 373 | return struct 374 | 375 | # try parsing as a run 376 | match = rushRE.search(details) 377 | if match: 378 | # parse as a run 379 | struct["isRun"] = True 380 | struct.update(match.groupdict()) 381 | return struct 382 | 383 | return None 384 | 385 | 386 | def _clean_features(struct): 387 | """Cleans up the features collected in parse_play_details. 388 | 389 | :struct: Pandas Series of features parsed from details string. 390 | :returns: the same dict, but with cleaner features (e.g., convert bools, 391 | ints, etc.) 392 | """ 393 | struct = dict(struct) 394 | # First, clean up play type bools 395 | ptypes = [ 396 | "isKickoff", 397 | "isTimeout", 398 | "isFieldGoal", 399 | "isPunt", 400 | "isKneel", 401 | "isSpike", 402 | "isXP", 403 | "isTwoPoint", 404 | "isPresnapPenalty", 405 | "isPass", 406 | "isRun", 407 | ] 408 | for pt in ptypes: 409 | struct[pt] = struct[pt] if pd.notnull(struct.get(pt)) else False 410 | # Second, clean up other existing variables on a one-off basis 411 | struct["callUpheld"] = struct.get("callUpheld") == "upheld" 412 | struct["fgGood"] = struct.get("fgGood") == "good" 413 | struct["isBlocked"] = struct.get("isBlocked") == "blocked" 414 | struct["isComplete"] = struct.get("isComplete") == "complete" 415 | struct["isFairCatch"] = struct.get("isFairCatch") == "fair catch" 416 | struct["isMuffedCatch"] = pd.notnull(struct.get("isMuffedCatch")) 417 | struct["isNoPlay"] = ( 418 | " (no play)" in struct["detail"] 419 | and "penalty enforced in end zone" not in struct["detail"] 420 | if struct.get("detail") 421 | else False 422 | ) 423 | struct["isOnside"] = struct.get("isOnside") == "onside" 424 | struct["isSack"] = pd.notnull(struct.get("sackYds")) 425 | struct["isSafety"] = struct.get("isSafety") == ", safety" or ( 426 | struct.get("detail") and "enforced in end zone, safety" in struct["detail"] 427 | ) 428 | struct["isTD"] = struct.get("isTD") == ", touchdown" 429 | struct["isTouchback"] = struct.get("isTouchback") == ", touchback" 430 | struct["oob"] = pd.notnull(struct.get("oob")) 431 | struct["passLoc"] = PASS_OPTS.get(struct.get("passLoc"), np.nan) 432 | if struct["isPass"]: 433 | pyds = struct["passYds"] 434 | struct["passYds"] = pyds if pd.notnull(pyds) else 0 435 | if pd.notnull(struct["penalty"]): 436 | struct["penalty"] = struct["penalty"].strip() 437 | struct["penDeclined"] = struct.get("penDeclined") == "Declined" 438 | if struct["quarter"] == "OT": 439 | struct["quarter"] = 5 440 | struct["rushDir"] = RUSH_OPTS.get(struct.get("rushDir"), np.nan) 441 | if struct["isRun"]: 442 | ryds = struct["rushYds"] 443 | struct["rushYds"] = ryds if pd.notnull(ryds) else 0 444 | year = struct.get("season", np.nan) 445 | struct["timeoutTeam"] = sportsref.nfl.teams.team_ids(year).get( 446 | struct.get("timeoutTeam"), np.nan 447 | ) 448 | struct["twoPointSuccess"] = struct.get("twoPointSuccess") == "succeeds" 449 | struct["xpGood"] = struct.get("xpGood") == "good" 450 | 451 | # Third, ensure types are correct 452 
| bool_vars = [ 453 | "fgGood", 454 | "isBlocked", 455 | "isChallenge", 456 | "isComplete", 457 | "isFairCatch", 458 | "isFieldGoal", 459 | "isKickoff", 460 | "isKneel", 461 | "isLateral", 462 | "isNoPlay", 463 | "isPass", 464 | "isPresnapPenalty", 465 | "isPunt", 466 | "isRun", 467 | "isSack", 468 | "isSafety", 469 | "isSpike", 470 | "isTD", 471 | "isTimeout", 472 | "isTouchback", 473 | "isTwoPoint", 474 | "isXP", 475 | "isMuffedCatch", 476 | "oob", 477 | "penDeclined", 478 | "twoPointSuccess", 479 | "xpGood", 480 | ] 481 | int_vars = [ 482 | "down", 483 | "fgBlockRetYds", 484 | "fgDist", 485 | "fumbRecYdLine", 486 | "fumbRetYds", 487 | "intRetYds", 488 | "intYdLine", 489 | "koRetYds", 490 | "koYds", 491 | "muffRetYds", 492 | "pbp_score_aw", 493 | "pbp_score_hm", 494 | "passYds", 495 | "penYds", 496 | "puntBlockRetYds", 497 | "puntRetYds", 498 | "puntYds", 499 | "quarter", 500 | "rushYds", 501 | "sackYds", 502 | "timeoutNum", 503 | "ydLine", 504 | "yds_to_go", 505 | ] 506 | float_vars = ["exp_pts_after", "exp_pts_before", "home_wp"] 507 | string_vars = [ 508 | "challenger", 509 | "detail", 510 | "fairCatcher", 511 | "fgBlockRecoverer", 512 | "fgBlocker", 513 | "fgKicker", 514 | "fieldSide", 515 | "fumbForcer", 516 | "fumbRecFieldSide", 517 | "fumbRecoverer", 518 | "fumbler", 519 | "intFieldSide", 520 | "interceptor", 521 | "kneelQB", 522 | "koKicker", 523 | "koReturner", 524 | "muffRecoverer", 525 | "muffedBy", 526 | "passLoc", 527 | "passer", 528 | "penOn", 529 | "penalty", 530 | "puntBlockRecoverer", 531 | "puntBlocker", 532 | "puntReturner", 533 | "punter", 534 | "qtr_time_remain", 535 | "rushDir", 536 | "rusher", 537 | "sacker1", 538 | "sacker2", 539 | "spikeQB", 540 | "tackler1", 541 | "tackler2", 542 | "target", 543 | "timeoutTeam", 544 | "xpKicker", 545 | ] 546 | for var in bool_vars: 547 | struct[var] = struct.get(var) is True 548 | for var in int_vars: 549 | try: 550 | struct[var] = int(struct.get(var)) 551 | except (ValueError, TypeError): 552 | struct[var] = np.nan 553 | for var in float_vars: 554 | try: 555 | struct[var] = float(struct.get(var)) 556 | except (ValueError, TypeError): 557 | struct[var] = np.nan 558 | for var in string_vars: 559 | if var not in struct or pd.isnull(struct[var]) or var == "": 560 | struct[var] = np.nan 561 | 562 | # Fourth, create new helper variables based on parsed variables 563 | # creating fieldSide and ydline from location 564 | if struct["isXP"]: 565 | struct["fieldSide"] = struct["ydLine"] = np.nan 566 | else: 567 | fieldSide, ydline = _loc_to_features(struct.get("location")) 568 | struct["fieldSide"] = fieldSide 569 | struct["ydLine"] = ydline 570 | # creating secsElapsed (in entire game) from qtr_time_remain and quarter 571 | if pd.notnull(struct.get("qtr_time_remain")): 572 | qtr = struct["quarter"] 573 | mins, secs = list(map(int, struct["qtr_time_remain"].split(":"))) 574 | struct["secsElapsed"] = qtr * 900 - mins * 60 - secs 575 | # creating columns for turnovers 576 | struct["isInt"] = pd.notnull(struct.get("interceptor")) 577 | struct["isFumble"] = pd.notnull(struct.get("fumbler")) 578 | # create column for isPenalty 579 | struct["isPenalty"] = pd.notnull(struct.get("penalty")) 580 | # create columns for EPA 581 | struct["team_epa"] = struct["exp_pts_after"] - struct["exp_pts_before"] 582 | struct["opp_epa"] = struct["exp_pts_before"] - struct["exp_pts_after"] 583 | return pd.Series(struct) 584 | 585 | 586 | def _loc_to_features(loc): 587 | """Converts a location string "{Half}, {YardLine}" into a tuple of those 588 | values, the 
second being an int. 589 | 590 | :l: The string from the play by play table representing location. 591 | :returns: A tuple that separates out the values, making them missing 592 | (np.nan) when necessary. 593 | 594 | """ 595 | if loc: 596 | if isinstance(loc, str): 597 | loc = loc.strip() 598 | if " " in loc: 599 | r = loc.split() 600 | r[0] = r[0].lower() 601 | r[1] = int(r[1]) 602 | else: 603 | r = (np.nan, int(loc)) 604 | elif isinstance(loc, float): 605 | return (np.nan, 50) 606 | else: 607 | r = (np.nan, np.nan) 608 | return r 609 | 610 | 611 | def _add_team_columns(features): 612 | """Function that adds 'team' and 'opp' columns to the features by iterating 613 | through the rows in order. A precondition is that the features dicts are in 614 | order in a continuous game sense and that all rows are from the same game. 615 | 616 | :features: A DataFrame with each row representing each play (in order). 617 | :returns: A similar DataFrame but with 'team' and 'opp' columns added. 618 | """ 619 | features = features.to_dict("records") 620 | curTm = curOpp = None 621 | playAfterKickoff = False 622 | # fill in team and opp columns 623 | for row in features: 624 | # if it's a kickoff or the play after a kickoff, 625 | # figure out who has possession manually 626 | if row["isKickoff"] or playAfterKickoff: 627 | curTm, curOpp = _team_and_opp(row) 628 | else: 629 | curTm, curOpp = _team_and_opp(row, curTm, curOpp) 630 | row["team"], row["opp"] = curTm, curOpp 631 | # set playAfterKickoff 632 | playAfterKickoff = row["isKickoff"] 633 | 634 | features = pd.DataFrame(features) 635 | features.team.fillna(method="bfill", inplace=True) 636 | features.opp.fillna(method="bfill", inplace=True) 637 | # ffill for last row 638 | features.team.fillna(method="ffill", inplace=True) 639 | features.opp.fillna(method="ffill", inplace=True) 640 | return features 641 | 642 | 643 | def _team_and_opp(struct, curTm=None, curOpp=None): 644 | """Given a dict representing a play and the current team with the ball, 645 | returns (team, opp) where team is the team with the ball and opp is the 646 | team without the ball at the end of the play. 647 | 648 | :struct: A Series/dict representing the play. 649 | :curTm: The current team with the ball; None means it's the first play of 650 | the game or the offensive team on the previous play's offensive team was 651 | somehow undetermined. 652 | :curOpp: The current team on defense; None means same as curTm. 653 | :returns: (team, opp) tuple where team and opp are the 3-character team IDs 654 | or the offensive and defensive teams respectively. 
655 | """ 656 | # if we don't know the current team, figure it out 657 | if pd.isnull(curTm): 658 | if struct["isRun"]: 659 | pID = struct["rusher"] 660 | elif struct["isPass"]: 661 | pID = struct["passer"] 662 | elif struct["isFieldGoal"]: 663 | pID = struct["fgKicker"] 664 | elif struct["isPunt"]: 665 | pID = struct["punter"] 666 | elif struct["isXP"]: 667 | pID = struct["xpKicker"] 668 | elif struct["isKickoff"]: 669 | pID = struct["koKicker"] 670 | elif struct["isSpike"]: 671 | pID = struct["spikeQB"] 672 | elif struct["isKneel"]: 673 | pID = struct["kneelQB"] 674 | else: 675 | pID = None 676 | curTm = curOpp = np.nan 677 | bs = sportsref.nfl.boxscores.BoxScore(struct["boxscore_id"]) 678 | if pID and len(pID) == 3: 679 | curTm = pID 680 | curOpp = bs.away() if bs.home() == curTm else bs.home() 681 | elif pID: 682 | player = sportsref.nfl.Player(pID) 683 | gamelog = player.gamelog(kind="B") 684 | curTm = gamelog.loc[ 685 | gamelog.boxscore_id == struct["boxscore_id"], "team_id" 686 | ].item() 687 | curOpp = bs.home() if bs.home() != curTm else bs.away() 688 | 689 | return curTm, curOpp 690 | 691 | # use row's class to determine when possession changes 692 | if struct["has_class_divider"]: 693 | return curOpp, curTm 694 | else: 695 | return curTm, curOpp 696 | 697 | 698 | def _add_team_features(df): 699 | """Adds extra convenience features based on teams with and without 700 | possession, with the precondition that the there are 'team' and 'opp' 701 | specified in row. 702 | 703 | :df: A DataFrame representing a game's play-by-play data after 704 | _clean_features has been called and 'team' and 'opp' have been added by 705 | _add_team_columns. 706 | :returns: A dict with new features in addition to previous features. 707 | """ 708 | assert df.team.notnull().all() 709 | 710 | homeOnOff = df["team"] == df["home"] 711 | # create column for distToGoal 712 | df["distToGoal"] = np.where( 713 | df["team"] != df["fieldSide"], df["ydLine"], 100 - df["ydLine"] 714 | ) 715 | df["distToGoal"] = np.where(df["isXP"] | df["isTwoPoint"], 2, df["distToGoal"]) 716 | # create column for each team's WP 717 | df["team_wp"] = np.where(homeOnOff, df["home_wp"], 100.0 - df["home_wp"]) 718 | df["opp_wp"] = 100.0 - df["team_wp"] 719 | # create columns for each team's WPA 720 | df["team_wpa"] = np.where(homeOnOff, df["home_wpa"], -df["home_wpa"]) 721 | df["opp_wpa"] = -df["team_wpa"] 722 | # create column for offense and defense scores if not already there 723 | assert df["boxscore_id"].nunique() == 1 724 | bs_id = df["boxscore_id"].values[0] 725 | bs = sportsref.nfl.boxscores.BoxScore(bs_id) 726 | df["team_score"] = np.where( 727 | df["team"] == bs.home(), df["pbp_score_hm"], df["pbp_score_aw"] 728 | ) 729 | df["opp_score"] = np.where( 730 | df["team"] == bs.home(), df["pbp_score_aw"], df["pbp_score_hm"] 731 | ) 732 | 733 | return df 734 | -------------------------------------------------------------------------------- /sportsref/nba/pbp.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import sportsref 7 | 8 | HM_LINEUP_COLS = ["hm_player{}".format(i) for i in range(1, 6)] 9 | AW_LINEUP_COLS = ["aw_player{}".format(i) for i in range(1, 6)] 10 | ALL_LINEUP_COLS = AW_LINEUP_COLS + HM_LINEUP_COLS 11 | 12 | PLAYER_RE = r"\w{0,7}\d{2}" 13 | 14 | # parsing field goal attempts 15 | shot_re = ( 16 | rf"(?P{PLAYER_RE}) " 17 | r"(?Pmakes|misses) " 18 | r"(?P2|3)\-pt " 19 | r"(?Pjump shot|hook shot|layup|dunk) " 20 | 
r"(?:from (?P\d+) ft|at rim)" 21 | ) 22 | assist_re = rf" \(assist by (?P{PLAYER_RE})\)" 23 | block_re = rf" \(block by (?P{PLAYER_RE})\)" 24 | SHOT_RE = re.compile(rf"{shot_re}(?:{assist_re}|{block_re})?", flags=re.I) 25 | 26 | # parsing jump balls 27 | jump_re = ( 28 | rf"Jump ball: (?P{PLAYER_RE}) vs\. (?P{PLAYER_RE})" 29 | rf"(?: \((?P{PLAYER_RE}) gains possession\))?" 30 | ) 31 | JUMP_RE = re.compile(jump_re, flags=re.I) 32 | 33 | # parsing rebounds 34 | reb_re = rf"(?POffensive|Defensive) rebound by (?P{PLAYER_RE}|Team)" 35 | REB_RE = re.compile(reb_re, flags=re.I) 36 | 37 | # parsing free throws 38 | ft_re = ( 39 | rf"(?P{PLAYER_RE}) (?Pmakes|misses) " 40 | r"(?Ptechnical )?(?Pflagrant )?" 41 | r"(?Pclear path )?free throw" 42 | r"(?: (?P\d+) of (?P\d+))?" 43 | ) 44 | FT_RE = re.compile(ft_re, flags=re.I) 45 | 46 | # parsing substitutions 47 | sub_re = rf"(?P{PLAYER_RE}) enters the game for (?P{PLAYER_RE})" 48 | SUB_RE = re.compile(sub_re, flags=re.I) 49 | 50 | # parsing turnovers 51 | to_reasons = rf"(?P[^;]+)(?:; steal by (?P{PLAYER_RE}))?" 52 | to_re = rf"Turnover by (?P{PLAYER_RE}|Team) \((?:{to_reasons})\)" 53 | TO_RE = re.compile(to_re, flags=re.I) 54 | 55 | # parsing shooting fouls 56 | shot_foul_re = ( 57 | r"Shooting(?P block)? foul " 58 | rf"by (?P{PLAYER_RE})" 59 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 60 | ) 61 | SHOT_FOUL_RE = re.compile(shot_foul_re, flags=re.I) 62 | 63 | # parsing offensive fouls 64 | off_foul_re = ( 65 | r"Offensive(?P charge)? foul " 66 | rf"by (?P{PLAYER_RE})" 67 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 68 | ) 69 | OFF_FOUL_RE = re.compile(off_foul_re, flags=re.I) 70 | 71 | # parsing personal fouls 72 | foul_re = ( 73 | r"Personal (?Ptake )?(?Pblock )?" 74 | rf"foul by (?P{PLAYER_RE})(?: \(drawn by " 75 | rf"(?P{PLAYER_RE})\))?" 76 | ) 77 | FOUL_RE = re.compile(foul_re, flags=re.I) 78 | 79 | # parsing loose ball fouls 80 | loose_ball_re = ( 81 | rf"Loose ball foul by (?P{PLAYER_RE})" 82 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 83 | ) 84 | LOOSE_BALL_RE = re.compile(loose_ball_re, flags=re.I) 85 | 86 | # parsing away from play fouls 87 | away_from_ball_re = ( 88 | rf"Away from play foul by (?P{PLAYER_RE})" 89 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 90 | ) 91 | AWAY_FROM_BALL_RE = re.compile(away_from_ball_re, flags=re.I) 92 | 93 | # parsing inbound fouls 94 | inbound_re = ( 95 | rf"Inbound foul by (?P{PLAYER_RE})" 96 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 97 | ) 98 | INBOUND_RE = re.compile(inbound_re, flags=re.I) 99 | 100 | # parsing flagrant fouls 101 | flagrant_re = ( 102 | rf"Flagrant foul type (?P1|2) by (?P{PLAYER_RE})" 103 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 104 | ) 105 | FLAGRANT_RE = re.compile(flagrant_re, flags=re.I) 106 | 107 | # parsing clear path fouls 108 | clear_path_re = ( 109 | rf"Clear path foul by (?P{PLAYER_RE})" 110 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 111 | ) 112 | CLEAR_PATH_RE = re.compile(clear_path_re, flags=re.I) 113 | 114 | # parsing timeouts 115 | timeout_re = r"(?P.*?) (?:full )?timeout" 116 | TIMEOUT_RE = re.compile(timeout_re, flags=re.I) 117 | 118 | # parsing technical fouls 119 | tech_re = ( 120 | r"(?PHanging )?" 121 | r"(?PTaunting )?" 122 | r"(?PIll def )?" 123 | r"(?PDelay )?" 124 | r"(?PNon unsport )?" 125 | r"tech(?:nical)? 
foul by " 126 | rf"(?P{PLAYER_RE}|Team)" 127 | ) 128 | TECH_RE = re.compile(tech_re, flags=re.I) 129 | 130 | # parsing ejections 131 | eject_re = rf"(?P{PLAYER_RE}|Team) ejected from game" 132 | EJECT_RE = re.compile(eject_re, flags=re.I) 133 | 134 | # parsing defensive 3 seconds techs 135 | def3_tech_re = ( 136 | r"(?:Def 3 sec tech foul|Defensive three seconds)" 137 | rf" by (?P{PLAYER_RE})" 138 | ) 139 | DEF3_TECH_RE = re.compile(def3_tech_re, flags=re.I) 140 | 141 | # parsing violations 142 | viol_re = rf"Violation by (?P{PLAYER_RE}|Team) \((?P.*)\)" 143 | VIOL_RE = re.compile(viol_re, flags=re.I) 144 | 145 | 146 | def sparse_lineup_cols(df): 147 | regex = "{}_in".format(PLAYER_RE) 148 | return [c for c in df.columns if re.match(regex, c)] 149 | 150 | 151 | def parse_play(boxscore_id, details, is_home): 152 | """Parse play details from a play-by-play string describing a play. 153 | 154 | Assuming valid input, this function returns structured data in a dictionary 155 | describing the play. If the play detail string was invalid, this function 156 | returns None. 157 | 158 | :param boxscore_id: the boxscore ID of the play 159 | :param details: detail string for the play 160 | :param is_home: bool indicating whether the offense is at home 161 | :param returns: dictionary of play attributes or None if invalid 162 | :rtype: dictionary or None 163 | """ 164 | # if input isn't a string, return None 165 | if not details or not isinstance(details, str): 166 | return None 167 | 168 | bs = sportsref.nba.BoxScore(boxscore_id) 169 | aw, hm = bs.away(), bs.home() 170 | season = sportsref.nba.Season(bs.season()) 171 | hm_roster = set(bs.basic_stats().query("is_home == True").player_id.values) 172 | 173 | play = {} 174 | play["detail"] = details 175 | play["home"] = hm 176 | play["away"] = aw 177 | play["is_home_play"] = is_home 178 | 179 | match = re.match(SHOT_RE, details) 180 | if match: 181 | play["is_fga"] = True 182 | play.update(match.groupdict()) 183 | play["shot_dist"] = play["shot_dist"] if play["shot_dist"] is not None else 0 184 | play["shot_dist"] = int(play["shot_dist"]) 185 | play["is_fgm"] = play["is_fgm"] == "makes" 186 | play["is_three"] = play["is_three"] == "3" 187 | play["is_assist"] = pd.notnull(play.get("assister")) 188 | play["is_block"] = pd.notnull(play.get("blocker")) 189 | shooter_home = play["shooter"] in hm_roster 190 | play["off_team"] = hm if shooter_home else aw 191 | play["def_team"] = aw if shooter_home else hm 192 | return play 193 | 194 | match = re.match(JUMP_RE, details) 195 | if match: 196 | play["is_jump_ball"] = True 197 | play.update(match.groupdict()) 198 | return play 199 | 200 | match = re.match(REB_RE, details) 201 | if match: 202 | play["is_reb"] = True 203 | play.update(match.groupdict()) 204 | play["is_oreb"] = play["is_oreb"].lower() == "offensive" 205 | play["is_dreb"] = not play["is_oreb"] 206 | play["is_team_rebound"] = play["rebounder"] == "Team" 207 | if play["is_team_rebound"]: 208 | play["reb_team"], other = (hm, aw) if is_home else (aw, hm) 209 | else: 210 | reb_home = play["rebounder"] in hm_roster 211 | play["reb_team"], other = (hm, aw) if reb_home else (aw, hm) 212 | play["off_team"] = play["reb_team"] if play["is_oreb"] else other 213 | play["def_team"] = play["reb_team"] if play["is_dreb"] else other 214 | return play 215 | 216 | match = re.match(FT_RE, details) 217 | if match: 218 | play["is_fta"] = True 219 | play.update(match.groupdict()) 220 | play["is_ftm"] = play["is_ftm"] == "makes" 221 | play["is_tech_fta"] = 
bool(play["is_tech_fta"]) 222 | play["is_flag_fta"] = bool(play["is_flag_fta"]) 223 | play["is_clearpath_fta"] = bool(play["is_clearpath_fta"]) 224 | play["is_pf_fta"] = not play["is_tech_fta"] 225 | if play["tot_fta"]: 226 | play["tot_fta"] = int(play["tot_fta"]) 227 | if play["fta_num"]: 228 | play["fta_num"] = int(play["fta_num"]) 229 | ft_home = play["ft_shooter"] in hm_roster 230 | play["fta_team"] = hm if ft_home else aw 231 | if not play["is_tech_fta"]: 232 | play["off_team"] = hm if ft_home else aw 233 | play["def_team"] = aw if ft_home else hm 234 | return play 235 | 236 | match = re.match(SUB_RE, details) 237 | if match: 238 | play["is_sub"] = True 239 | play.update(match.groupdict()) 240 | sub_home = play["sub_in"] in hm_roster or play["sub_out"] in hm_roster 241 | play["sub_team"] = hm if sub_home else aw 242 | return play 243 | 244 | match = re.match(TO_RE, details) 245 | if match: 246 | play["is_to"] = True 247 | play.update(match.groupdict()) 248 | play["to_type"] = play["to_type"].lower() 249 | if play["to_type"] == "offensive foul": 250 | return None 251 | play["is_steal"] = pd.notnull(play["stealer"]) 252 | play["is_travel"] = play["to_type"] == "traveling" 253 | play["is_shot_clock_viol"] = play["to_type"] == "shot clock" 254 | play["is_oob"] = play["to_type"] == "step out of bounds" 255 | play["is_three_sec_viol"] = play["to_type"] == "3 sec" 256 | play["is_backcourt_viol"] = play["to_type"] == "back court" 257 | play["is_off_goaltend"] = play["to_type"] == "offensive goaltending" 258 | play["is_double_dribble"] = play["to_type"] == "dbl dribble" 259 | play["is_discont_dribble"] = play["to_type"] == "discontinued dribble" 260 | play["is_carry"] = play["to_type"] == "palming" 261 | if play["to_by"] == "Team": 262 | play["off_team"] = hm if is_home else aw 263 | play["def_team"] = aw if is_home else hm 264 | else: 265 | to_home = play["to_by"] in hm_roster 266 | play["off_team"] = hm if to_home else aw 267 | play["def_team"] = aw if to_home else hm 268 | return play 269 | 270 | match = re.match(SHOT_FOUL_RE, details) 271 | if match: 272 | play["is_pf"] = True 273 | play["is_shot_foul"] = True 274 | play.update(match.groupdict()) 275 | play["is_block_foul"] = bool(play["is_block_foul"]) 276 | foul_on_home = play["fouler"] in hm_roster 277 | play["off_team"] = aw if foul_on_home else hm 278 | play["def_team"] = hm if foul_on_home else aw 279 | play["foul_team"] = play["def_team"] 280 | return play 281 | 282 | match = re.match(OFF_FOUL_RE, details) 283 | if match: 284 | play["is_pf"] = True 285 | play["is_off_foul"] = True 286 | play["is_to"] = True 287 | play["to_type"] = "offensive foul" 288 | play.update(match.groupdict()) 289 | play["is_charge"] = bool(play["is_charge"]) 290 | play["fouler"] = play["to_by"] 291 | foul_on_home = play["fouler"] in hm_roster 292 | play["off_team"] = hm if foul_on_home else aw 293 | play["def_team"] = aw if foul_on_home else hm 294 | play["foul_team"] = play["off_team"] 295 | return play 296 | 297 | match = re.match(FOUL_RE, details) 298 | if match: 299 | play["is_pf"] = True 300 | play.update(match.groupdict()) 301 | play["is_take_foul"] = bool(play["is_take_foul"]) 302 | play["is_block_foul"] = bool(play["is_block_foul"]) 303 | foul_on_home = play["fouler"] in hm_roster 304 | play["off_team"] = aw if foul_on_home else hm 305 | play["def_team"] = hm if foul_on_home else aw 306 | play["foul_team"] = play["def_team"] 307 | return play 308 | 309 | # TODO: parsing double personal fouls 310 | # double_foul_re = (r'Double personal foul by (?P{0}) 
and ' 311 | # r'(?P{0})').format(PLAYER_RE) 312 | # m = re.match(double_Foul_re, details) 313 | # if m: 314 | # p['is_pf'] = True 315 | # p.update(m.groupdict()) 316 | # p['off_team'] = 317 | 318 | match = re.match(LOOSE_BALL_RE, details) 319 | if match: 320 | play["is_pf"] = True 321 | play["is_loose_ball_foul"] = True 322 | play.update(match.groupdict()) 323 | foul_home = play["fouler"] in hm_roster 324 | play["foul_team"] = hm if foul_home else aw 325 | return play 326 | 327 | # parsing punching fouls 328 | # TODO 329 | 330 | match = re.match(AWAY_FROM_BALL_RE, details) 331 | if match: 332 | play["is_pf"] = True 333 | play["is_away_from_play_foul"] = True 334 | play.update(match.groupdict()) 335 | foul_on_home = play["fouler"] in hm_roster 336 | # TODO: figure out who had the ball based on previous play 337 | play["foul_team"] = hm if foul_on_home else aw 338 | return play 339 | 340 | match = re.match(INBOUND_RE, details) 341 | if match: 342 | play["is_pf"] = True 343 | play["is_inbound_foul"] = True 344 | play.update(match.groupdict()) 345 | foul_on_home = play["fouler"] in hm_roster 346 | play["off_team"] = aw if foul_on_home else hm 347 | play["def_team"] = hm if foul_on_home else aw 348 | play["foul_team"] = play["def_team"] 349 | return play 350 | 351 | match = re.match(FLAGRANT_RE, details) 352 | if match: 353 | play["is_pf"] = True 354 | play["is_flagrant"] = True 355 | play.update(match.groupdict()) 356 | foul_on_home = play["fouler"] in hm_roster 357 | play["foul_team"] = hm if foul_on_home else aw 358 | return play 359 | 360 | match = re.match(CLEAR_PATH_RE, details) 361 | if match: 362 | play["is_pf"] = True 363 | play["is_clear_path_foul"] = True 364 | play.update(match.groupdict()) 365 | foul_on_home = play["fouler"] in hm_roster 366 | play["off_team"] = aw if foul_on_home else hm 367 | play["def_team"] = hm if foul_on_home else aw 368 | play["foul_team"] = play["def_team"] 369 | return play 370 | 371 | match = re.match(TIMEOUT_RE, details) 372 | if match: 373 | play["is_timeout"] = True 374 | play.update(match.groupdict()) 375 | is_official_to = play["timeout_team"].lower() == "official" 376 | name_to_id = season.team_names_to_ids() 377 | play["timeout_team"] = ( 378 | "Official" 379 | if is_official_to 380 | else name_to_id.get(hm, name_to_id.get(aw, play["timeout_team"])) 381 | ) 382 | return play 383 | 384 | match = re.match(TECH_RE, details) 385 | if match: 386 | play["is_tech_foul"] = True 387 | play.update(match.groupdict()) 388 | play["is_hanging"] = bool(play["is_hanging"]) 389 | play["is_taunting"] = bool(play["is_taunting"]) 390 | play["is_ill_def"] = bool(play["is_ill_def"]) 391 | play["is_delay"] = bool(play["is_delay"]) 392 | play["is_unsport"] = bool(play["is_unsport"]) 393 | foul_on_home = play["tech_fouler"] in hm_roster 394 | play["foul_team"] = hm if foul_on_home else aw 395 | return play 396 | 397 | match = re.match(EJECT_RE, details) 398 | if match: 399 | play["is_ejection"] = True 400 | play.update(match.groupdict()) 401 | if play["ejectee"] == "Team": 402 | play["ejectee_team"] = hm if is_home else aw 403 | else: 404 | eject_home = play["ejectee"] in hm_roster 405 | play["ejectee_team"] = hm if eject_home else aw 406 | return play 407 | 408 | match = re.match(DEF3_TECH_RE, details) 409 | if match: 410 | play["is_tech_foul"] = True 411 | play["is_def_three_secs"] = True 412 | play.update(match.groupdict()) 413 | foul_on_home = play["tech_fouler"] in hm_roster 414 | play["off_team"] = aw if foul_on_home else hm 415 | play["def_team"] = hm if 
foul_on_home else aw 416 | play["foul_team"] = play["def_team"] 417 | return play 418 | 419 | match = re.match(VIOL_RE, details) 420 | if match: 421 | play["is_viol"] = True 422 | play.update(match.groupdict()) 423 | if play["viol_type"] == "kicked_ball": 424 | play["is_to"] = True 425 | play["to_by"] = play["violator"] 426 | if play["violator"] == "Team": 427 | play["viol_team"] = hm if is_home else aw 428 | else: 429 | viol_home = play["violator"] in hm_roster 430 | play["viol_team"] = hm if viol_home else aw 431 | return play 432 | 433 | play["is_error"] = True 434 | return play 435 | 436 | 437 | def clean_features(df): 438 | """Fixes up columns of the passed DataFrame, such as casting T/F columns to 439 | boolean and filling in NaNs for team and opp. 440 | 441 | :param df: DataFrame of play-by-play data. 442 | :returns: Dataframe with cleaned columns. 443 | """ 444 | df = pd.DataFrame(df) 445 | 446 | bool_vals = set([True, False, None, np.nan]) 447 | sparse_cols = sparse_lineup_cols(df) 448 | for col in df: 449 | 450 | # make indicator columns boolean type (and fill in NaNs) 451 | if set(df[col].unique()[:5]) <= bool_vals: 452 | df[col] = df[col] == True # noqa 453 | 454 | # fill NaN's in sparse lineup columns to 0 455 | elif col in sparse_cols: 456 | df[col] = df[col].fillna(0) 457 | 458 | # fix free throw columns on technicals 459 | df.loc[df.is_tech_fta, ["fta_num", "tot_fta"]] = 1 460 | 461 | # fill in NaN's/fix off_team and def_team columns 462 | df.off_team.fillna(method="bfill", inplace=True) 463 | df.def_team.fillna(method="bfill", inplace=True) 464 | df.off_team.fillna(method="ffill", inplace=True) 465 | df.def_team.fillna(method="ffill", inplace=True) 466 | 467 | return df 468 | 469 | 470 | def clean_multigame_features(df): 471 | """TODO: Docstring for clean_multigame_features. 472 | 473 | :df: TODO 474 | :returns: TODO 475 | """ 476 | df = pd.DataFrame(df) 477 | if df.index.value_counts().max() > 1: 478 | df.reset_index(drop=True, inplace=True) 479 | 480 | df = clean_features(df) 481 | 482 | # if it's many games in one DataFrame, make poss_id and play_id unique 483 | for col in ("play_id", "poss_id"): 484 | diffs = df[col].diff().fillna(0) 485 | if (diffs < 0).any(): 486 | new_col = np.cumsum(diffs.astype(bool)) # noqa 487 | df.eval("{} = @new_col".format(col), inplace=True) 488 | 489 | return df 490 | 491 | 492 | def get_period_starters(df): 493 | """TODO""" 494 | 495 | def players_from_play(play): 496 | """Figures out what players are in the game based on the players 497 | mentioned in a play. Returns away and home players as two sets. 498 | 499 | :param play: A dictionary representing a parsed play. 
500 | :returns: (aw_players, hm_players) 501 | :rtype: tuple of lists 502 | """ 503 | # if it's a tech FT from between periods, don't count this play 504 | if play["clock_str"] == "12:00.0" and ( 505 | play.get("is_tech_foul") or play.get("is_tech_fta") 506 | ): 507 | return [], [] 508 | 509 | stats = sportsref.nba.BoxScore(play["boxscore_id"]).basic_stats() 510 | home_grouped = stats.groupby("is_home") 511 | hm_roster = set(home_grouped.player_id.get_group(True).values) 512 | aw_roster = set(home_grouped.player_id.get_group(False).values) 513 | player_keys = [ 514 | "assister", 515 | "away_jumper", 516 | "blocker", 517 | "drew_foul", 518 | "fouler", 519 | "ft_shooter", 520 | "gains_poss", 521 | "home_jumper", 522 | "rebounder", 523 | "shooter", 524 | "stealer", 525 | "sub_in", 526 | "sub_out", 527 | "to_by", 528 | ] 529 | players = [p for p in play[player_keys] if pd.notnull(p)] 530 | 531 | aw_players = [p for p in players if p in aw_roster] 532 | hm_players = [p for p in players if p in hm_roster] 533 | return aw_players, hm_players 534 | 535 | # create a mapping { quarter => (away_starters, home_starters) } 536 | n_periods = df.quarter.nunique() 537 | period_starters = [(set(), set()) for _ in range(n_periods)] 538 | 539 | # fill out this mapping quarter by quarter 540 | for qtr, qtr_grp in df.groupby(df.quarter): 541 | aw_starters, hm_starters = period_starters[qtr - 1] 542 | exclude = set() 543 | # loop through sets of plays that happen at the "same time" 544 | for label, time_grp in qtr_grp.groupby(qtr_grp.secs_elapsed): 545 | # first, if they sub in and weren't already starters, exclude them 546 | sub_ins = set(time_grp.sub_in.dropna().values) 547 | exclude.update(sub_ins - aw_starters - hm_starters) 548 | # second, figure out new starters from each play at this time 549 | for i, row in time_grp.iterrows(): 550 | aw_players, hm_players = players_from_play(row) 551 | # update overall sets for the quarter 552 | aw_starters.update(aw_players) 553 | hm_starters.update(hm_players) 554 | # remove excluded (subbed-in) players 555 | hm_starters -= exclude 556 | aw_starters -= exclude 557 | # check whether we have found all starters 558 | if len(hm_starters) > 5 or len(aw_starters) > 5: 559 | import ipdb 560 | 561 | ipdb.set_trace() 562 | if len(hm_starters) >= 5 and len(aw_starters) >= 5: 563 | break 564 | 565 | if len(hm_starters) != 5 or len(aw_starters) != 5: 566 | print( 567 | "WARNING: wrong number of starters for a team in Q{} of {}".format( 568 | qtr, df.boxscore_id.iloc[0] 569 | ) 570 | ) 571 | 572 | return period_starters 573 | 574 | 575 | def get_sparse_lineups(df): 576 | """TODO: Docstring for get_sparse_lineups. 
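    (In brief, based on the code below: returns a DataFrame with one
    "{player_id}_in" indicator column per player, set to 1 when that player
    is on the court for the home team, -1 when on the court for the away
    team, and 0 otherwise; dense lineups are computed first via
    get_dense_lineups if the lineup columns are not already present in df.)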
577 | 578 | :param df: TODO 579 | :returns: TODO 580 | """ 581 | 582 | # get the lineup data using get_dense_lineups if necessary 583 | if set(ALL_LINEUP_COLS) - set(df.columns): 584 | lineup_df = get_dense_lineups(df) 585 | else: 586 | lineup_df = df[ALL_LINEUP_COLS] 587 | 588 | # create the sparse representation 589 | hm_lineups = lineup_df[HM_LINEUP_COLS].values 590 | aw_lineups = lineup_df[AW_LINEUP_COLS].values 591 | # +1 for home, -1 for away 592 | hm_df = pd.DataFrame( 593 | [ 594 | {"{}_in".format(player_id): 1 for player_id in lineup} 595 | for lineup in hm_lineups 596 | ], 597 | dtype=int, 598 | ) 599 | aw_df = pd.DataFrame( 600 | [ 601 | {"{}_in".format(player_id): -1 for player_id in lineup} 602 | for lineup in aw_lineups 603 | ], 604 | dtype=int, 605 | ) 606 | sparse_df = pd.concat((hm_df, aw_df), axis=1).fillna(0) 607 | return sparse_df 608 | 609 | 610 | def get_dense_lineups(df): 611 | """Returns a new DataFrame based on the one it is passed. Specifically, it 612 | adds five columns for each team (ten total), where each column has the ID 613 | of a player on the court during the play. Assumes the DataFrame corresponds 614 | to only a single game (one unique boxscore ID). 615 | 616 | This information is figured out sequentially from the game's substitution 617 | data in the passed DataFrame, so the DataFrame passed as an argument must 618 | be from a specific BoxScore (rather than a DataFrame of non-consecutive 619 | plays). That is, the DataFrame must be of the form returned by 620 | :func:`nba.BoxScore.pbp `. 621 | 622 | .. note:: Note that the lineups reflect the teams in the game when the play 623 | happened, not after the play. For example, if a play is a substitution, 624 | the lineups for that play will be the lineups before the substituion 625 | occurs. 626 | 627 | :param df: A DataFrame of a game's play-by-play data. 628 | :returns: A DataFrame with additional lineup columns. 629 | 630 | """ 631 | assert df["boxscore_id"].nunique() == 1 632 | 633 | def lineup_dict(aw_lineup, hm_lineup): 634 | """Returns a dictionary of lineups to be converted to columns. 635 | Specifically, the columns are 'aw_player1' through 'aw_player5' and 636 | 'hm_player1' through 'hm_player5'. 637 | 638 | :param aw_lineup: The away team's current lineup. 639 | :param hm_lineup: The home team's current lineup. 640 | :returns: A dictionary of lineups. 641 | """ 642 | return { 643 | "{}_player{}".format(tm, i + 1): player 644 | for tm, lineup in zip(["aw", "hm"], [aw_lineup, hm_lineup]) 645 | for i, player in enumerate(lineup) 646 | } 647 | 648 | def handle_sub(row, aw_lineup, hm_lineup): 649 | """Modifies the aw_lineup and hm_lineup lists based on the substitution 650 | that takes place in the given row.""" 651 | assert row["is_sub"] 652 | sub_lineup = hm_lineup if row["sub_team"] == row["home"] else aw_lineup 653 | try: 654 | # make the sub 655 | idx = sub_lineup.index(row["sub_out"]) 656 | sub_lineup[idx] = row["sub_in"] 657 | except ValueError: 658 | # if the sub was double-entered and it's already been executed... 
659 | if row["sub_in"] in sub_lineup and row["sub_out"] not in sub_lineup: 660 | return aw_lineup, hm_lineup 661 | # otherwise, let's print and pretend this never happened 662 | print( 663 | "ERROR IN SUB IN {}, Q{}, {}: {}".format( 664 | row["boxscore_id"], row["quarter"], row["clock_str"], row["detail"] 665 | ) 666 | ) 667 | raise 668 | return aw_lineup, hm_lineup 669 | 670 | per_starters = get_period_starters(df) 671 | cur_qtr = 0 672 | aw_lineup, hm_lineup = [], [] 673 | df = df.reset_index(drop=True) 674 | lineups = [{} for _ in range(df.shape[0])] 675 | 676 | # loop through select plays to determine lineups 677 | sub_or_per_start = df.is_sub | df.quarter.diff().astype(bool) 678 | for i, row in df.loc[sub_or_per_start].iterrows(): 679 | if row["quarter"] > cur_qtr: 680 | # first row in a quarter 681 | assert row["quarter"] == cur_qtr + 1 682 | # first, finish up the last quarter's lineups 683 | if cur_qtr > 0 and not df.loc[i - 1, "is_sub"]: 684 | lineups[i - 1] = lineup_dict(aw_lineup, hm_lineup) 685 | # then, move on to the quarter, and enter the starting lineups 686 | cur_qtr += 1 687 | aw_lineup, hm_lineup = list(map(list, per_starters[cur_qtr - 1])) 688 | lineups[i] = lineup_dict(aw_lineup, hm_lineup) 689 | # if the first play in the quarter is a sub, handle that 690 | if row["is_sub"]: 691 | aw_lineup, hm_lineup = handle_sub(row, aw_lineup, hm_lineup) 692 | else: 693 | # during the quarter 694 | # update lineups first then change lineups based on subs 695 | lineups[i] = lineup_dict(aw_lineup, hm_lineup) 696 | if row["is_sub"]: 697 | aw_lineup, hm_lineup = handle_sub(row, aw_lineup, hm_lineup) 698 | 699 | # create and clean DataFrame 700 | lineup_df = pd.DataFrame(lineups) 701 | if lineup_df.iloc[-1].isnull().all(): 702 | lineup_df.iloc[-1] = lineup_dict(aw_lineup, hm_lineup) 703 | lineup_df = lineup_df.groupby(df.quarter).fillna(method="bfill") 704 | 705 | # fill in NaN's based on minutes played 706 | bool_mat = lineup_df.isnull() 707 | mask = bool_mat.any(axis=1) 708 | if mask.any(): 709 | bs = sportsref.nba.BoxScore(df.boxscore_id[0]) 710 | # first, get the true minutes played from the box score 711 | stats = sportsref.nba.BoxScore(df.boxscore_id.iloc[0]).basic_stats() 712 | true_mp = ( 713 | pd.Series( 714 | stats.query("mp > 0")[["player_id", "mp"]] 715 | .set_index("player_id") 716 | .to_dict()["mp"] 717 | ) 718 | * 60 719 | ) 720 | # next, calculate minutes played based on the lineup data 721 | calc_mp = pd.Series( 722 | { 723 | p: ( 724 | df.secs_elapsed.diff() * [p in row for row in lineup_df.values] 725 | ).sum() 726 | for p in stats.query("mp > 0").player_id.values 727 | } 728 | ) 729 | # finally, figure which players are missing minutes 730 | diff = true_mp - calc_mp 731 | players_missing = diff.loc[diff.abs() >= 150] 732 | hm_roster = bs.basic_stats().query("is_home == True").player_id.values 733 | missing_df = pd.DataFrame( 734 | { 735 | "secs": players_missing.values, 736 | "is_home": players_missing.index.isin(hm_roster), 737 | }, 738 | index=players_missing.index, 739 | ) 740 | 741 | if missing_df.empty: 742 | # TODO: log this as a warning (or error?) 
743 | print( 744 | "There are NaNs in the lineup data, but no players were " 745 | "found to be missing significant minutes" 746 | ) 747 | else: 748 | for is_home, group in missing_df.groupby("is_home"): 749 | player_id = group.index.item() 750 | tm_cols = ( 751 | sportsref.nba.pbp.HM_LINEUP_COLS 752 | if is_home 753 | else sportsref.nba.pbp.AW_LINEUP_COLS 754 | ) 755 | row_mask = lineup_df[tm_cols].isnull().any(axis=1) 756 | lineup_df.loc[row_mask, tm_cols] = ( 757 | lineup_df.loc[row_mask, tm_cols].fillna(player_id).values 758 | ) 759 | 760 | return lineup_df 761 | --------------------------------------------------------------------------------
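For orientation, here is a minimal usage sketch (illustrative only, not part of the package) tying the classes above together; the boxscore ID is one mentioned in a TODO comment in nba/boxscores.py, and live network access to basketball-reference.com is assumed.

# Minimal usage sketch (illustrative; not part of the repo).
import sportsref

box = sportsref.nba.BoxScore("201604130PHO")   # boxscore ID referenced in a comment above
players = box.basic_stats()                    # per-player box score rows
plays = box.pbp(dense_lineups=True)            # play-by-play with lineup columns
print(plays[["quarter", "clock_str", "hm_score", "aw_score"]].tail())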