├── .style.yapf ├── README.md ├── setup.cfg ├── sportsref ├── nba │ ├── __init__.py │ ├── teams.py │ ├── players.py │ ├── seasons.py │ ├── boxscores.py │ └── pbp.py ├── options.py ├── __init__.py ├── nfl │ ├── __init__.py │ ├── winProb.py │ ├── finders │ │ ├── __init__.py │ │ ├── PSF.py │ │ └── GPF.py │ ├── seasons.py │ ├── players.py │ ├── boxscores.py │ ├── teams.py │ └── pbp.py ├── decorators.py └── utils.py ├── setup.py ├── pyproject.toml ├── .pre-commit-config.yaml └── .gitignore /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | COLUMN_LIMIT=100 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sportsref 2 | Scraping sports data from sports-reference.com and related sites 3 | 4 | NOTE: Very much still a WIP. Feel free to use, just bear in mind that the API 5 | is subject to change. Documentation is on the to-do list, once the API is a bit 6 | more rigid. 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length = 100 3 | multi_line_output = 3 4 | include_trailing_comma = True 5 | 6 | [flake8] 7 | max-line-length = 100 8 | exclude = 9 | .git, 10 | .venv, 11 | build, 12 | dist 13 | ignore = 14 | E203, # whitespace before ':' 15 | E265, # block comment 16 | W503, # line break before binary operator (e.g. `and` or `or`) 17 | -------------------------------------------------------------------------------- /sportsref/nba/__init__.py: -------------------------------------------------------------------------------- 1 | from . import boxscores 2 | from . import pbp 3 | from . import seasons 4 | from . 
import teams 5 | 6 | from .boxscores import BoxScore 7 | from .seasons import Season 8 | from .teams import Team 9 | from .players import Player 10 | 11 | BASE_URL = "http://www.basketball-reference.com" 12 | 13 | __all__ = [ 14 | "BASE_URL", 15 | "boxscores", 16 | "BoxScore", 17 | "pbp", 18 | "seasons", 19 | "Season", 20 | "teams", 21 | "Team", 22 | "players", 23 | "Player", 24 | ] 25 | -------------------------------------------------------------------------------- /sportsref/options.py: -------------------------------------------------------------------------------- 1 | OPTIONS = {"cache": True, "memoize": True} 2 | 3 | 4 | def get_option(option): 5 | option = option.lower() 6 | if option in OPTIONS: 7 | return OPTIONS[option] 8 | else: 9 | # TODO: log 10 | print(f"option {option} not recognized") 11 | return None 12 | 13 | 14 | def set_option(option, value): 15 | option = option.lower() 16 | if option in OPTIONS: 17 | OPTIONS[option] = value 18 | else: 19 | # TODO: log 20 | print(f"option {option} not recognized") 21 | -------------------------------------------------------------------------------- /sportsref/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | SITE_ABBREV = { 4 | "http://www.pro-football-reference.com": "pfr", 5 | "http://www.basketball-reference.com": "bkref", 6 | "http://www.sports-reference.com/cfb": "ncaaf", 7 | "http://www.sports-reference.com/cbb": "ncaab", 8 | } 9 | 10 | from sportsref.options import get_option, set_option 11 | from sportsref import decorators, utils, nfl, nba 12 | 13 | __all__ = [ 14 | "decorators", 15 | "utils", 16 | "nfl", 17 | "nba", 18 | "get_option", 19 | "set_option", 20 | "SITE_ABBREV", 21 | ] 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="sportsref", 5 | version="0.13.0", 6 | description="Scraping data from sports-reference.com and related sites", 7 | url="https://github.com/mdgoldberg/sportsref", 8 | author="Matt Goldberg", 9 | author_email="matt.goldberg7@gmail.com", 10 | packages=find_packages(), 11 | install_requires=[ 12 | "appdirs", 13 | "boltons", 14 | "mementos", 15 | "numexpr", 16 | "numpy", 17 | "pandas", 18 | "pyquery", 19 | "requests", 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sportsref" 3 | version = "0.13.0" 4 | description = "" 5 | authors = ["Matt Goldberg <matt.goldberg7@gmail.com>"] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.6.1" 9 | mementos = "^1.3.1" 10 | numexpr = "^2.7.1" 11 | numpy = "^1.19.4" 12 | pandas = "^1.1.4" 13 | pyquery = "^1.4.3" 14 | requests = "^2.25.0" 15 | 16 | [tool.poetry.dev-dependencies] 17 | black = "^20.8b1" 18 | flake8 = "^3.8.4" 19 | ipdb = "^0.13.4" 20 | ipython = "^7.15.0" 21 | jupyter = "^1.0.0" 22 | pre-commit = "^2.9.2" 23 | pylint = "^2.6.0" 24 | 25 | [build-system] 26 | requires = ["poetry-core>=1.0.0"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3
| repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.0.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-added-large-files 11 | - repo: https://github.com/prettier/prettier 12 | rev: 1.18.2 13 | hooks: 14 | - id: prettier 15 | - repo: https://github.com/psf/black 16 | rev: stable 17 | hooks: 18 | - id: black 19 | # - repo: https://github.com/pre-commit/mirrors-mypy 20 | # rev: "v0.782" 21 | # hooks: 22 | # - id: mypy 23 | -------------------------------------------------------------------------------- /sportsref/nfl/__init__.py: -------------------------------------------------------------------------------- 1 | from . import finders 2 | from . import teams 3 | from . import players 4 | from . import boxscores 5 | 6 | # from . import winProb 7 | from . import pbp 8 | 9 | from .players import Player 10 | from .seasons import Season 11 | from .teams import Team 12 | from .boxscores import BoxScore 13 | from .finders import GamePlayFinder, PlayerSeasonFinder 14 | 15 | BASE_URL = "http://www.pro-football-reference.com" 16 | 17 | # modules/variables to expose 18 | __all__ = [ 19 | "BASE_URL", 20 | "finders", 21 | "GamePlayFinder", 22 | "PlayerSeasonFinder", 23 | "boxscores", 24 | "BoxScore", 25 | "players", 26 | "Player", 27 | "seasons", 28 | "Season", 29 | "teams", 30 | "Team", 31 | # "winProb", 32 | "pbp", 33 | ] 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Misc 60 | **/.DS_Store 61 | **/*.swp 62 | **/*.json 63 | **/.R* 64 | scripts/ 65 | csv/ 66 | -------------------------------------------------------------------------------- /sportsref/nfl/winProb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm 3 | 4 | 5 | def initialWinProb(line): 6 | """Gets the initial win probability of a game given its Vegas line. 7 | 8 | :line: The Vegas line from the home team's perspective (negative means 9 | home team is favored). 10 | :returns: A float in [0., 100.] that represents the win probability. 
11 | """ 12 | line = float(line) 13 | probWin = 1.0 - norm.cdf(0.5, -line, 13.86) 14 | probTie = norm.cdf(0.5, -line, 13.86) - norm.cdf(-0.5, -line, 13.86) 15 | return 100.0 * (probWin + 0.5 * probTie) 16 | 17 | 18 | def winProb(line, margin, secsElapsed, expPts): 19 | line = float(line) 20 | margin = float(margin) 21 | expPts = float(expPts) 22 | baseMean = -line 23 | baseStd = 13.46 24 | expMargin = margin + expPts 25 | minRemain = 60 - secsElapsed / 60 + 0.00001 26 | adjMean = baseMean * minRemain / 60 27 | adjStd = baseStd / np.sqrt(60 / minRemain) 28 | probWin = 1.0 - norm.cdf(-expMargin + 0.5, adjMean, adjStd) 29 | probTie = norm.cdf(-expMargin + 0.5, adjMean, adjStd) - norm.cdf( 30 | -expMargin - 0.5, adjMean, adjStd 31 | ) 32 | return 100.0 * (probWin + 0.5 * probTie) 33 | -------------------------------------------------------------------------------- /sportsref/nba/teams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyquery import PyQuery as pq 3 | 4 | import sportsref 5 | 6 | 7 | class Team(object, metaclass=sportsref.decorators.Cached): 8 | def __init__(self, team_id): 9 | self.team_id = team_id.upper() 10 | 11 | def __eq__(self, other): 12 | return self.team_id == other.team_id 13 | 14 | def __hash__(self): 15 | return hash(self.team_id) 16 | 17 | @sportsref.decorators.memoize 18 | def team_year_url(self, yr_str): 19 | return f"{sportsref.nba.BASE_URL}/teams/{self.team_id}/{yr_str}.htm" 20 | 21 | @sportsref.decorators.memoize 22 | def get_main_doc(self): 23 | team_url = f"{sportsref.nba.BASE_URL}/teams/{self.team_id}" 24 | main_doc = pq(sportsref.utils.get_html(team_url)) 25 | return main_doc 26 | 27 | @sportsref.decorators.memoize 28 | def get_year_doc(self, yr_str): 29 | return pq(sportsref.utils.get_html(self.team_year_url(yr_str))) 30 | 31 | @sportsref.decorators.memoize 32 | def name(self): 33 | """Returns the real name of the franchise given the team ID. 34 | 35 | Examples: 36 | 'BOS' -> 'Boston Celtics' 37 | 'NJN' -> 'Brooklyn Nets' 38 | 39 | :returns: A string corresponding to the team's full name. 40 | """ 41 | doc = self.get_main_doc() 42 | name = doc('div#info h1[itemprop="name"]').text() 43 | return name 44 | 45 | @sportsref.decorators.memoize 46 | def roster(self, year): 47 | """Returns the roster table for the given year. 48 | 49 | :year: The year for which we want the roster; defaults to current year. 50 | :returns: A DataFrame containing roster information for that year. 51 | """ 52 | doc = self.get_year_doc(year) 53 | table = doc("table#roster") 54 | df = sportsref.utils.parse_table(table) 55 | df["years_experience"] = ( 56 | df["years_experience"].replace("R", 0).replace("", np.nan).astype(float) 57 | ) 58 | return df 59 | 60 | # TODO: kind_rpb 61 | @sportsref.decorators.memoize 62 | def schedule(self, year): 63 | """Gets schedule information for a team-season. 64 | 65 | :year: The year for which we want the schedule. 66 | :returns: DataFrame of schedule information. 67 | """ 68 | doc = self.get_year_doc(f"{year}_games") 69 | table = doc("table#games") 70 | df = sportsref.utils.parse_table(table) 71 | return df 72 | -------------------------------------------------------------------------------- /sportsref/nfl/finders/__init__.py: -------------------------------------------------------------------------------- 1 | from . import GPF 2 | from . 
import PSF 3 | 4 | from .PSF import PlayerSeasonFinder 5 | from .GPF import GamePlayFinder 6 | 7 | # modules/variables to expose 8 | __all__ = ["PlayerSeasonFinder", "GamePlayFinder"] 9 | 10 | # Fill in PlayerSeasonFinder docstring 11 | 12 | IOD = PSF.inputs_options_defaults() 13 | 14 | paramStr = "\n".join( 15 | ':param {}: default="{}"'.format(name, ",".join(dct["value"])) 16 | for name, dct in sorted(IOD.items()) 17 | ) 18 | optsStr = "\n".join( 19 | "{}: {}".format(name, ",".join('"{}"'.format(opt) for opt in dct["options"])) 20 | if len(dct["options"]) <= 10 21 | else "{}: {}...{}".format( 22 | name, 23 | ",".join('"{}"'.format(opt) for opt in dct["options"][:10]), 24 | ",".join('"{}"'.format(opt) for opt in dct["options"][-2:]), 25 | ) 26 | for name, dct in sorted(IOD.items()) 27 | ) 28 | 29 | 30 | PSF.PlayerSeasonFinder.__doc__ = """ 31 | Finds player-seasons that match criteria supplied by keyword arguments. 32 | 33 | * Can use tm or team for team_id. 34 | * Can use yr, year, yrs, or years for year_min, year_max. 35 | * Can use [draft_]pos, [draft_]position, [draft_]positions for a shortcut for 36 | [draft_]positions. 37 | 38 | Options for inputs: 39 | {} 40 | 41 | {} 42 | :returns: list of matching player-season tuples 43 | :rtype: [(player ID, season year)] 44 | 45 | """.format( 46 | paramStr, optsStr 47 | ) 48 | 49 | # clean up namespace 50 | del IOD, paramStr, optsStr 51 | 52 | 53 | # Fill in GamePlayFinder docstring 54 | 55 | IOD = GPF.inputs_options_defaults() 56 | 57 | paramStr = "\n".join( 58 | ':param {}: default="{}"'.format(name, ",".join(dct["value"])) 59 | for name, dct in sorted(IOD.items()) 60 | ) 61 | 62 | optsStr = "\n".join( 63 | "{}: {}".format(name, ",".join('"{}"'.format(opt) for opt in dct["options"])) 64 | if len(dct["options"]) <= 10 65 | else "{}: {}...{}".format( 66 | name, 67 | ",".join('"{}"'.format(opt) for opt in dct["options"][:10]), 68 | ",".join('"{}"'.format(opt) for opt in dct["options"][-2:]), 69 | ) 70 | for name, dct in sorted(IOD.items()) 71 | ) 72 | 73 | GPF.GamePlayFinder.__doc__ = """ 74 | Finds plays that match criteria supplied by keyword arguments. 75 | 76 | * Can use tm or team instead of team_id. 77 | * Can use yr, year, yrs, or years instead of year_min, year_max. 78 | * For multi-valued options (like down or rush direction), separate values with 79 | commas or use a list. 80 | * For options that are yes/no/either or yes/no/any, -1 is either/any, 0 is no, 81 | 1 is yes. 82 | 83 | Options for the inputs: 84 | {} 85 | 86 | {} 87 | :returns: Pandas dataframe of plays 88 | :rtype: pd.DataFrame 89 | """.format( 90 | paramStr, optsStr 91 | ) 92 | 93 | # clean up namespace 94 | del IOD, paramStr, optsStr 95 | -------------------------------------------------------------------------------- /sportsref/nfl/seasons.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | 3 | import sportsref 4 | 5 | 6 | __all__ = ["Season"] 7 | 8 | 9 | class Season(object, metaclass=sportsref.decorators.Cached): 10 | 11 | """Object representing a given NFL season.""" 12 | 13 | def __init__(self, year): 14 | """Initializes a Season object for an NFL season. 15 | 16 | :year: The year of the season we want. 
17 | """ 18 | self.yr = int(year) 19 | 20 | def __eq__(self, other): 21 | return self.yr == other.yr 22 | 23 | def __hash__(self): 24 | return hash(self.yr) 25 | 26 | def __repr__(self): 27 | return "Season({})".format(self.yr) 28 | 29 | def _subpage_url(self, page): 30 | return sportsref.nfl.BASE_URL + "/years/{}/{}.htm".format(self.yr, page) 31 | 32 | @sportsref.decorators.memoize 33 | def get_main_doc(self): 34 | """Returns PyQuery object for the main season URL. 35 | :returns: PyQuery object. 36 | """ 37 | url = sportsref.nfl.BASE_URL + "/years/{}/".format(self.yr) 38 | return pq(sportsref.utils.get_html(url)) 39 | 40 | @sportsref.decorators.memoize 41 | def get_sub_doc(self, subpage): 42 | """Returns PyQuery object for a given subpage URL. 43 | :subpage: The subpage of the season, e.g. 'per_game'. 44 | :returns: PyQuery object. 45 | """ 46 | html = sportsref.utils.get_html(self._subpage_url(subpage)) 47 | return pq(html) 48 | 49 | @sportsref.decorators.memoize 50 | def get_team_ids(self): 51 | """Returns a list of the team IDs for the given year. 52 | :returns: List of team IDs. 53 | """ 54 | return sportsref.nfl.teams.list_teams(self.yr) 55 | 56 | @sportsref.decorators.memoize 57 | def team_ids_to_names(self): 58 | """Mapping from 3-letter team IDs to full team names. 59 | 60 | :returns: Dictionary with team IDs as keys and full team strings as 61 | values. 62 | """ 63 | return sportsref.nfl.teams.team_names(self.yr) 64 | 65 | @sportsref.decorators.memoize 66 | def team_names_to_ids(self): 67 | """Mapping from full team names to 3-letter team IDs. 68 | :returns: Dictionary with tean names as keys and team IDs as values. 69 | """ 70 | return sportsref.nfl.teams.team_ids(self.yr) 71 | 72 | @sportsref.decorators.memoize 73 | def _get_player_stats_table(self, subpage, table_id): 74 | """Helper function for player season stats. 75 | 76 | :identifier: string identifying the type of stat, e.g. 'passing'. 77 | :returns: A DataFrame of stats. 78 | """ 79 | doc = self.get_sub_doc(subpage) 80 | table = doc("table#{}".format(table_id)) 81 | df = sportsref.utils.parse_table(table) 82 | return df 83 | 84 | def player_stats_passing(self): 85 | """Returns a DataFrame of passing player stats for a season.""" 86 | return self._get_player_stats_table("passing", "passing") 87 | 88 | def player_stats_rushing(self): 89 | """Returns a DataFrame of rushing player stats for a season.""" 90 | return self._get_player_stats_table("rushing", "rushing_and_receiving") 91 | 92 | def player_stats_receiving(self): 93 | """Returns a DataFrame of receiving player stats for a season.""" 94 | return self._get_player_stats_table("receiving", "receiving") 95 | -------------------------------------------------------------------------------- /sportsref/decorators.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import datetime 3 | import functools 4 | import getpass 5 | import hashlib 6 | import os 7 | import re 8 | import time 9 | 10 | import appdirs 11 | import mementos 12 | import pandas as pd 13 | from pyquery import PyQuery as pq 14 | 15 | import sportsref 16 | 17 | 18 | # TODO: move PSFConstants and GPFConstants to appdirs cache dir 19 | def switch_to_dir(dir_path): 20 | """ 21 | Decorator that switches to given directory before executing function, and 22 | then returning to orignal directory. 
23 | """ 24 | 25 | def decorator(func): 26 | @functools.wraps(func) 27 | def wrapper(*args, **kwargs): 28 | orig_cwd = os.getcwd() 29 | os.chdir(dir_path) 30 | ret = func(*args, **kwargs) 31 | os.chdir(orig_cwd) 32 | return ret 33 | 34 | return wrapper 35 | 36 | return decorator 37 | 38 | 39 | def _days_valid_pfr(url): 40 | # boxscores are static, but refresh quarterly to be sure 41 | if "boxscore" in url: 42 | return 90 43 | # important dates 44 | today = datetime.date.today() 45 | start_of_season = datetime.date(today.year, 8, 15) 46 | end_of_season = datetime.date(today.year, 2, 15) 47 | # check for a year in the filename 48 | m = re.search(r"(\d{4})", url) 49 | if m: 50 | # if it was a year prior to the current season, we're good 51 | year = int(m.group(1)) 52 | cur_season = today.year - (today <= end_of_season) 53 | if year < cur_season: 54 | return 90 55 | # if it's the offseason, refresh cache twice a month 56 | if end_of_season < today < start_of_season: 57 | return 15 58 | # otherwise, refresh every 2 days 59 | return 2 60 | 61 | 62 | def _days_valid_bkref(url): 63 | # boxscores are static, but refresh quarterly to be sure 64 | if "boxscore" in url: 65 | return 90 66 | # important dates 67 | today = datetime.date.today() 68 | start_of_season = datetime.date(today.year, 10, 1) 69 | end_of_season = datetime.date(today.year, 7, 1) 70 | # check for a year in the filename 71 | m = re.search(r"(\d{4})", url) 72 | if m: 73 | # if it was a year prior to the current season, we're good 74 | year = int(m.group(1)) 75 | cur_season = today.year - (today <= end_of_season) + 1 76 | if year < cur_season: 77 | return 90 78 | # if it's the offseason, refresh cache once a month 79 | if end_of_season < today < start_of_season: 80 | return 30 81 | # otherwise, refresh every 2 days 82 | return 2 83 | 84 | 85 | def _days_valid_cfb(url): 86 | # TODO: caching for CFB 87 | return 365 88 | 89 | 90 | def cache(func): 91 | """Caches the HTML returned by the specified function `func`. Caches it in 92 | the user cache determined by the appdirs package. 
93 | """ 94 | 95 | CACHE_DIR = appdirs.user_cache_dir("sportsref", getpass.getuser()) 96 | os.makedirs(CACHE_DIR, exist_ok=True) 97 | 98 | @functools.wraps(func) 99 | def wrapper(url): 100 | # hash based on the URL 101 | file_hash = hashlib.md5() 102 | encoded_url = url.encode(errors="replace") 103 | file_hash.update(encoded_url) 104 | file_hash = file_hash.hexdigest() 105 | filename = f"{CACHE_DIR}/{file_hash}" 106 | 107 | sport_id = None 108 | for a_base_url, a_sport_id in sportsref.SITE_ABBREV.items(): 109 | if url.startswith(a_base_url): 110 | sport_id = a_sport_id 111 | break 112 | else: 113 | # TODO: log 114 | print(f"No sport ID found for {url}, not able to check cache") 115 | 116 | # check whether cache is valid or stale 117 | file_exists = os.path.isfile(filename) 118 | if sport_id and file_exists: 119 | cur_time = int(time.time()) 120 | mod_time = int(os.path.getmtime(filename)) 121 | days_since_mod = datetime.timedelta(seconds=(cur_time - mod_time)).days 122 | # TODO: refactor _days_valid_ functions to not use globals 123 | days_cache_valid = globals()[f"_days_valid_{sport_id}"](url) 124 | cache_is_valid = days_since_mod < days_cache_valid 125 | else: 126 | cache_is_valid = False 127 | 128 | # if file found and cache is valid, read from file 129 | allow_caching = sportsref.get_option("cache") 130 | if file_exists and cache_is_valid and allow_caching: 131 | with open(filename, "r", encoding="utf-8", errors="replace") as f: 132 | text = f.read() 133 | # otherwise, execute function and cache results 134 | else: 135 | text = func(url) 136 | with open(filename, "w+", encoding="utf-8") as f: 137 | f.write(text) 138 | return text 139 | 140 | return wrapper 141 | 142 | 143 | def get_class_instance_key(cls, args, kwargs): 144 | """ 145 | Returns a unique identifier for a class instantiation. 146 | """ 147 | identifiers = [id(cls)] 148 | for arg in args: 149 | identifiers.append(id(arg)) 150 | identifiers.extend((k, id(v)) for k, v in list(kwargs.items())) 151 | return tuple(sorted(identifiers)) 152 | 153 | 154 | # used as a metaclass for classes that should be memoized 155 | # (technically not a decorator, but it's similar enough) 156 | Cached = mementos.memento_factory("Cached", get_class_instance_key) 157 | 158 | 159 | def memoize(fun): 160 | """A decorator for memoizing functions. 161 | 162 | Only works on functions that take simple arguments - arguments that take 163 | list-like or dict-like arguments will not be memoized, and this function 164 | will raise a TypeError. 
165 | """ 166 | 167 | @functools.wraps(fun) 168 | def wrapper(*args, **kwargs): 169 | 170 | do_memoization = sportsref.get_option("memoize") 171 | if not do_memoization: 172 | return fun(*args, **kwargs) 173 | 174 | hash_args = tuple(args) 175 | hash_kwargs = frozenset(sorted(kwargs.items())) 176 | key = (hash_args, hash_kwargs) 177 | 178 | def _copy(v): 179 | if isinstance(v, pq): 180 | return v.clone() 181 | else: 182 | return copy.deepcopy(v) 183 | 184 | try: 185 | ret = _copy(cache[key]) 186 | return ret 187 | except KeyError: 188 | cache[key] = fun(*args, **kwargs) 189 | ret = _copy(cache[key]) 190 | return ret 191 | except TypeError: 192 | print( 193 | f"memoization type error in function {fun.__name__} for arguments {key}" 194 | ) 195 | raise 196 | 197 | cache = {} 198 | return wrapper 199 | 200 | 201 | def kind_rpb(include_type=False): 202 | def decorator(fun): 203 | """Supports functions that return a DataFrame and have a `kind` keyword 204 | argument that specifies regular season ('R'), playoffs ('P'), or both 205 | ('B'). If given 'B', it will call the function with both 'R' and 'P' 206 | and concatenate the results. 207 | """ 208 | 209 | @functools.wraps(fun) 210 | def wrapper(*args, **kwargs): 211 | kind = kwargs.get("kind", "R").upper() 212 | if kind == "B": 213 | kwargs["kind"] = "R" 214 | reg = fun(*args, **kwargs) 215 | if include_type: 216 | reg["is_playoffs"] = False 217 | kwargs["kind"] = "P" 218 | poffs = fun(*args, **kwargs) 219 | if include_type: 220 | poffs["is_playoffs"] = True 221 | return pd.concat((reg, poffs), ignore_index=True) 222 | else: 223 | df = fun(*args, **kwargs) 224 | if include_type: 225 | df["is_playoffs"] = kind == "P" 226 | return df 227 | 228 | return wrapper 229 | 230 | return decorator 231 | -------------------------------------------------------------------------------- /sportsref/nfl/finders/PSF.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | import time 5 | import urllib.parse 6 | 7 | from pyquery import PyQuery as pq 8 | 9 | from ... import decorators, utils 10 | 11 | PSF_URL = "http://www.pro-football-reference.com/" "play-index/psl_finder.cgi" 12 | 13 | PSF_CONSTANTS_FILENAME = "PSFConstants.json" 14 | 15 | 16 | def PlayerSeasonFinder(**kwargs): 17 | """ Docstring will be filled in by __init__.py """ 18 | 19 | if "offset" not in kwargs: 20 | kwargs["offset"] = 0 21 | 22 | playerSeasons = [] 23 | while True: 24 | querystring = _kwargs_to_qs(**kwargs) 25 | url = "{}?{}".format(PSF_URL, querystring) 26 | if kwargs.get("verbose", False): 27 | print(url) 28 | html = utils.get_html(url) 29 | doc = pq(html) 30 | table = doc("table#results") 31 | df = utils.parse_table(table) 32 | if df.empty: 33 | break 34 | 35 | thisSeason = list(zip(df.player_id, df.year)) 36 | playerSeasons.extend(thisSeason) 37 | 38 | if doc('*:contains("Next Page")'): 39 | kwargs["offset"] += 100 40 | else: 41 | break 42 | 43 | return playerSeasons 44 | 45 | 46 | def _kwargs_to_qs(**kwargs): 47 | """Converts kwargs given to PSF to a querystring. 48 | 49 | :returns: the querystring. 
50 | """ 51 | # start with defaults 52 | inpOptDef = inputs_options_defaults() 53 | opts = {name: dct["value"] for name, dct in list(inpOptDef.items())} 54 | 55 | # clean up keys and values 56 | for k, v in list(kwargs.items()): 57 | del kwargs[k] 58 | # bool => 'Y'|'N' 59 | if isinstance(v, bool): 60 | kwargs[k] = "Y" if v else "N" 61 | # tm, team => team_id 62 | elif k.lower() in ("tm", "team"): 63 | kwargs["team_id"] = v 64 | # yr, year, yrs, years => year_min, year_max 65 | elif k.lower() in ("yr", "year", "yrs", "years"): 66 | if isinstance(v, collections.Iterable): 67 | lst = list(v) 68 | kwargs["year_min"] = min(lst) 69 | kwargs["year_max"] = max(lst) 70 | elif isinstance(v, str): 71 | v = list(map(int, v.split(","))) 72 | kwargs["year_min"] = min(v) 73 | kwargs["year_max"] = max(v) 74 | else: 75 | kwargs["year_min"] = v 76 | kwargs["year_max"] = v 77 | # pos, position, positions => pos[] 78 | elif k.lower() in ("pos", "position", "positions"): 79 | if isinstance(v, str): 80 | v = v.split(",") 81 | elif not isinstance(v, collections.Iterable): 82 | v = [v] 83 | kwargs["pos[]"] = v 84 | # draft_pos, ... => draft_pos[] 85 | elif k.lower() in ( 86 | "draft_pos", 87 | "draftpos", 88 | "draftposition", 89 | "draftpositions", 90 | "draft_position", 91 | "draft_positions", 92 | ): 93 | if isinstance(v, str): 94 | v = v.split(",") 95 | elif not isinstance(v, collections.Iterable): 96 | v = [v] 97 | kwargs["draft_pos[]"] = v 98 | # if not one of these cases, put it back in kwargs 99 | else: 100 | kwargs[k] = v 101 | 102 | # update based on kwargs 103 | for k, v in list(kwargs.items()): 104 | # if overwriting a default, overwrite it (with a list so the 105 | # opts -> querystring list comp works) 106 | if k in opts or k in ("pos[]", "draft_pos[]"): 107 | # if multiple values separated by commas, split em 108 | if isinstance(v, str): 109 | v = v.split(",") 110 | # otherwise, make sure it's a list 111 | elif not isinstance(v, collections.Iterable): 112 | v = [v] 113 | # then, add list of values to the querystring dict *opts* 114 | opts[k] = v 115 | if "draft" in k: 116 | opts["draft"] = [1] 117 | 118 | opts["request"] = [1] 119 | opts["offset"] = [kwargs.get("offset", 0)] 120 | 121 | qs = "&".join( 122 | "{}={}".format(urllib.parse.quote_plus(name), val) 123 | for name, vals in sorted(opts.items()) 124 | for val in vals 125 | ) 126 | 127 | return qs 128 | 129 | 130 | @decorators.switch_to_dir(os.path.dirname(os.path.realpath(__file__))) 131 | def inputs_options_defaults(): 132 | """Handles scraping options for player-season finder form. 133 | 134 | :returns: {'name1': {'value': val, 'options': [opt1, ...] }, ... 
} 135 | """ 136 | # set time variables 137 | if os.path.isfile(PSF_CONSTANTS_FILENAME): 138 | modtime = int(os.path.getmtime(PSF_CONSTANTS_FILENAME)) 139 | curtime = int(time.time()) 140 | # if file found and it's been <= a week 141 | if os.path.isfile(PSF_CONSTANTS_FILENAME) and curtime - modtime <= 7 * 24 * 60 * 60: 142 | 143 | # just read the dict from cached file 144 | with open(PSF_CONSTANTS_FILENAME, "r") as const_f: 145 | def_dict = json.load(const_f) 146 | 147 | # otherwise, we must regenerate the dict and rewrite it 148 | else: 149 | 150 | print("Regenerating PSFConstants file") 151 | 152 | html = utils.get_html(PSF_URL) 153 | doc = pq(html) 154 | 155 | def_dict = {} 156 | # start with input elements 157 | for inp in doc("form#psl_finder input[name]"): 158 | name = inp.attrib["name"] 159 | # add blank dict if not present 160 | if name not in def_dict: 161 | def_dict[name] = { 162 | "value": set(), 163 | "options": set(), 164 | "type": inp.attrib["type"], 165 | } 166 | 167 | # handle checkboxes and radio buttons 168 | if inp.attrib["type"] in ("checkbox", "radio"): 169 | # deal with default value 170 | if "checked" in inp.attrib: 171 | def_dict[name]["value"].add(inp.attrib["value"]) 172 | # add to options 173 | def_dict[name]["options"].add(inp.attrib["value"]) 174 | # handle other types of inputs (only other type is hidden?) 175 | else: 176 | def_dict[name]["value"].add(inp.attrib.get("value", "")) 177 | 178 | # deal with dropdowns (select elements) 179 | for sel in doc.items("form#psl_finder select[name]"): 180 | name = sel.attr["name"] 181 | # add blank dict if not present 182 | if name not in def_dict: 183 | def_dict[name] = {"value": set(), "options": set(), "type": "select"} 184 | 185 | # deal with default value 186 | defaultOpt = sel("option[selected]") 187 | if len(defaultOpt): 188 | defaultOpt = defaultOpt[0] 189 | def_dict[name]["value"].add(defaultOpt.attrib.get("value", "")) 190 | else: 191 | def_dict[name]["value"].add(sel("option")[0].attrib.get("value", "")) 192 | 193 | # deal with options 194 | def_dict[name]["options"] = { 195 | opt.attrib["value"] for opt in sel("option") if opt.attrib.get("value") 196 | } 197 | 198 | def_dict.pop("request", None) 199 | def_dict.pop("use_favorites", None) 200 | 201 | with open(PSF_CONSTANTS_FILENAME, "w+") as f: 202 | for k in def_dict: 203 | try: 204 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"]), key=int) 205 | def_dict[k]["options"] = sorted( 206 | list(def_dict[k]["options"]), key=int 207 | ) 208 | except Exception: 209 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"])) 210 | def_dict[k]["options"] = sorted(list(def_dict[k]["options"])) 211 | json.dump(def_dict, f) 212 | 213 | return def_dict 214 | -------------------------------------------------------------------------------- /sportsref/nfl/finders/GPF.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os 4 | import time 5 | 6 | from pyquery import PyQuery as pq 7 | 8 | from ... import decorators, utils 9 | from .. 
import pbp 10 | 11 | GPF_URL = "http://www.pro-football-reference.com/" "play-index/play_finder.cgi" 12 | 13 | GPF_CONSTANTS_FILENAME = "GPFConstants.json" 14 | 15 | 16 | def GamePlayFinder(**kwargs): 17 | """ Docstring will be filled in by __init__.py """ 18 | 19 | querystring = _kwargs_to_qs(**kwargs) 20 | url = "{}?{}".format(GPF_URL, querystring) 21 | # if verbose, print url 22 | if kwargs.get("verbose", False): 23 | print(url) 24 | html = utils.get_html(url) 25 | doc = pq(html) 26 | 27 | # parse 28 | table = doc("table#all_plays") 29 | plays = utils.parse_table(table) 30 | 31 | # parse score column 32 | if "score" in plays.columns: 33 | oScore, dScore = list(zip(*plays.score.apply(lambda s: s.split("-")))) 34 | plays["teamScore"] = oScore 35 | plays["oppScore"] = dScore 36 | # add parsed pbp info 37 | if "description" in plays.columns: 38 | plays = pbp.expand_details(plays, detailCol="description") 39 | 40 | return plays 41 | 42 | 43 | def _kwargs_to_qs(**kwargs): 44 | """Converts kwargs given to GPF to a querystring. 45 | 46 | :returns: the querystring. 47 | """ 48 | # start with defaults 49 | inpOptDef = inputs_options_defaults() 50 | opts = {name: dct["value"] for name, dct in list(inpOptDef.items())} 51 | 52 | # clean up keys and values 53 | for k, v in list(kwargs.items()): 54 | # pID, playerID => player_id 55 | if k.lower() in ("pid", "playerid"): 56 | del kwargs[k] 57 | kwargs["player_id"] = v 58 | # player_id can accept rel URLs 59 | if k == "player_id": 60 | if v.startswith("/players/"): 61 | kwargs[k] = utils.rel_url_to_id(v) 62 | # bool => 'Y'|'N' 63 | if isinstance(v, bool): 64 | kwargs[k] = "Y" if v else "N" 65 | # tm, team => team_id 66 | if k.lower() in ("tm", "team"): 67 | del kwargs[k] 68 | kwargs["team_id"] = v 69 | # yr_min, yr_max => year_min, year_max 70 | if k.lower() in ("yr_min", "yr_max"): 71 | del kwargs[k] 72 | if k.lower() == "yr_min": 73 | kwargs["year_min"] = int(v) 74 | else: 75 | kwargs["year_max"] = int(v) 76 | # wk_min, wk_max => week_num_min, week_num_max 77 | if k.lower() in ("wk_min", "wk_max"): 78 | del kwargs[k] 79 | if k.lower() == "wk_min": 80 | kwargs["week_num_min"] = int(v) 81 | else: 82 | kwargs["week_num_max"] = int(v) 83 | # yr, year, yrs, years => year_min, year_max 84 | if k.lower() in ("yr", "year", "yrs", "years"): 85 | del kwargs[k] 86 | if isinstance(v, collections.Iterable): 87 | lst = list(v) 88 | kwargs["year_min"] = min(lst) 89 | kwargs["year_max"] = max(lst) 90 | elif isinstance(v, str): 91 | v = list(map(int, v.split(","))) 92 | kwargs["year_min"] = min(v) 93 | kwargs["year_max"] = max(v) 94 | else: 95 | kwargs["year_min"] = v 96 | kwargs["year_max"] = v 97 | # wk, week, wks, weeks => week_num_min, week_num_max 98 | if k.lower() in ("wk", "week", "wks", "weeks"): 99 | del kwargs[k] 100 | if isinstance(v, collections.Iterable): 101 | lst = list(v) 102 | kwargs["week_num_min"] = min(lst) 103 | kwargs["week_num_max"] = max(lst) 104 | elif isinstance(v, str): 105 | v = list(map(int, v.split(","))) 106 | kwargs["week_num_min"] = min(v) 107 | kwargs["week_num_max"] = max(v) 108 | else: 109 | kwargs["week_num_min"] = v 110 | kwargs["week_num_max"] = v 111 | # if playoff_round defined, then turn on playoff flag 112 | if k == "playoff_round": 113 | kwargs["game_type"] = "P" 114 | if isinstance(v, str): 115 | v = v.split(",") 116 | if not isinstance(v, collections.Iterable): 117 | v = [v] 118 | 119 | # reset values to blank for defined kwargs 120 | for k in kwargs: 121 | if k in opts: 122 | opts[k] = [] 123 | 124 | # update based on 
kwargs 125 | for k, v in list(kwargs.items()): 126 | # if overwriting a default, overwrite it 127 | if k in opts: 128 | # if multiple values separated by commas, split em 129 | if isinstance(v, str): 130 | v = v.split(",") 131 | elif not isinstance(v, collections.Iterable): 132 | v = [v] 133 | for val in v: 134 | opts[k].append(val) 135 | 136 | opts["request"] = [1] 137 | 138 | qs = "&".join( 139 | "{}={}".format(name, val) for name, vals in sorted(opts.items()) for val in vals 140 | ) 141 | 142 | return qs 143 | 144 | 145 | @decorators.switch_to_dir(os.path.dirname(os.path.realpath(__file__))) 146 | def inputs_options_defaults(): 147 | """Handles scraping options for play finder form. 148 | 149 | :returns: {'name1': {'value': val, 'options': [opt1, ...] }, ... } 150 | 151 | """ 152 | # set time variables 153 | if os.path.isfile(GPF_CONSTANTS_FILENAME): 154 | modtime = int(os.path.getmtime(GPF_CONSTANTS_FILENAME)) 155 | curtime = int(time.time()) 156 | # if file found and it's been <= a week 157 | if os.path.isfile(GPF_CONSTANTS_FILENAME) and curtime - modtime <= 7 * 24 * 60 * 60: 158 | 159 | # just read the dict from the cached file 160 | with open(GPF_CONSTANTS_FILENAME, "r") as const_f: 161 | def_dict = json.load(const_f) 162 | 163 | # otherwise, we must regenerate the dict and rewrite it 164 | else: 165 | 166 | print("Regenerating GPFConstants file") 167 | 168 | html = utils.get_html(GPF_URL) 169 | doc = pq(html) 170 | 171 | def_dict = {} 172 | # start with input elements 173 | for inp in doc("form#play_finder input[name]"): 174 | name = inp.attrib["name"] 175 | # add blank dict if not present 176 | if name not in def_dict: 177 | def_dict[name] = {"value": set(), "options": set(), "type": inp.type} 178 | 179 | val = inp.attrib.get("value", "") 180 | # handle checkboxes and radio buttons 181 | if inp.type in ("checkbox", "radio"): 182 | # deal with default value 183 | if "checked" in inp.attrib: 184 | def_dict[name]["value"].add(val) 185 | # add to options 186 | def_dict[name]["options"].add(val) 187 | # handle other types of inputs (only other type is hidden?) 
188 | else: 189 | def_dict[name]["value"].add(val) 190 | 191 | # for dropdowns (select elements) 192 | for sel in doc.items("form#play_finder select[name]"): 193 | name = sel.attr["name"] 194 | # add blank dict if not present 195 | if name not in def_dict: 196 | def_dict[name] = {"value": set(), "options": set(), "type": "select"} 197 | 198 | # deal with default value 199 | defaultOpt = sel("option[selected]") 200 | if len(defaultOpt): 201 | defaultOpt = defaultOpt[0] 202 | def_dict[name]["value"].add(defaultOpt.attrib.get("value", "")) 203 | else: 204 | def_dict[name]["value"].add(sel("option")[0].attrib.get("value", "")) 205 | 206 | # deal with options 207 | def_dict[name]["options"] = { 208 | opt.attrib["value"] for opt in sel("option") if opt.attrib.get("value") 209 | } 210 | 211 | # ignore QB kneels by default 212 | def_dict["include_kneels"]["value"] = ["0"] 213 | 214 | def_dict.pop("request", None) 215 | def_dict.pop("use_favorites", None) 216 | 217 | with open(GPF_CONSTANTS_FILENAME, "w+") as f: 218 | for k in def_dict: 219 | try: 220 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"]), key=int) 221 | def_dict[k]["options"] = sorted( 222 | list(def_dict[k]["options"]), key=int 223 | ) 224 | except Exception: 225 | def_dict[k]["value"] = sorted(list(def_dict[k]["value"])) 226 | def_dict[k]["options"] = sorted(list(def_dict[k]["options"])) 227 | json.dump(def_dict, f) 228 | 229 | return def_dict 230 | -------------------------------------------------------------------------------- /sportsref/nba/players.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | 4 | from pyquery import PyQuery as pq 5 | 6 | import sportsref 7 | 8 | __all__ = ["Player"] 9 | 10 | 11 | class Player(object, metaclass=sportsref.decorators.Cached): 12 | 13 | """Each instance of this class represents an NBA player, uniquely 14 | identified by a player ID. The instance methods give various data available 15 | from the player's Basketball Reference player page.""" 16 | 17 | def __init__(self, player_id): 18 | self.player_id = player_id 19 | self.url_base = f"{sportsref.nba.BASE_URL}/players/{player_id[0]}/{player_id}" 20 | self.main_url = self.url_base + ".htm" 21 | 22 | def __eq__(self, other): 23 | return self.player_id == other.player_id 24 | 25 | def __hash__(self): 26 | return hash(self.player_id) 27 | 28 | def __repr__(self): 29 | return f"Player({self.player_id})" 30 | 31 | def __str__(self): 32 | return self.name() 33 | 34 | @sportsref.decorators.memoize 35 | def get_main_doc(self): 36 | return pq(sportsref.utils.get_html(self.main_url)) 37 | 38 | @sportsref.decorators.memoize 39 | def get_sub_doc(self, rel_url): 40 | url = f"{self.url_base}/{rel_url}" 41 | return pq(sportsref.utils.get_html(url)) 42 | 43 | @sportsref.decorators.memoize 44 | def name(self): 45 | """Returns the name of the player as a string.""" 46 | doc = self.get_main_doc() 47 | return doc('h1[itemprop="name"]').text() 48 | 49 | @sportsref.decorators.memoize 50 | def age(self, year, month=2, day=1): 51 | """Returns the age of the player on a given date. 52 | 53 | :year: int representing the year. 54 | :month: int representing the month (1-12). 55 | :day: int representing the day within the month (1-31). 56 | :returns: Age in years as a float. 
57 | """ 58 | doc = self.get_main_doc() 59 | date_string = doc('span[itemprop="birthDate"]').attr("data-birth") 60 | regex = r"(\d{4})\-(\d{2})\-(\d{2})" 61 | date_args = list(map(int, re.match(regex, date_string).groups())) 62 | birth_date = datetime.date(*date_args) 63 | age_date = datetime.date(year=year, month=month, day=day) 64 | delta = age_date - birth_date 65 | age = delta.days / 365.0 66 | return age 67 | 68 | @sportsref.decorators.memoize 69 | def position(self): 70 | """TODO: Docstring for position. 71 | :returns: TODO 72 | """ 73 | raise Exception("not yet implemented - nba.Player.position") 74 | 75 | @sportsref.decorators.memoize 76 | def height(self): 77 | """Returns the player's height (in inches). 78 | :returns: An int representing a player's height in inches. 79 | """ 80 | doc = self.get_main_doc() 81 | raw = doc('span[itemprop="height"]').text() 82 | try: 83 | feet, inches = list(map(int, raw.split("-"))) 84 | return feet * 12 + inches 85 | except ValueError: 86 | return None 87 | 88 | @sportsref.decorators.memoize 89 | def weight(self): 90 | """Returns the player's weight (in pounds). 91 | :returns: An int representing a player's weight in pounds. 92 | """ 93 | doc = self.get_main_doc() 94 | raw = doc('span[itemprop="weight"]').text() 95 | try: 96 | weight = re.match(r"(\d+)lb", raw).group(1) 97 | return int(weight) 98 | except ValueError: 99 | return None 100 | 101 | @sportsref.decorators.memoize 102 | def hand(self): 103 | """Returns the player's handedness. 104 | :returns: 'L' for left-handed, 'R' for right-handed. 105 | """ 106 | doc = self.get_main_doc() 107 | hand = re.search(r"Shoots:\s*(L|R)", doc.text()).group(1) 108 | return hand 109 | 110 | @sportsref.decorators.memoize 111 | def draft_pick(self): 112 | """Returns when in the draft the player was picked. 113 | :returns: TODO 114 | """ 115 | doc = self.get_main_doc() 116 | try: 117 | p_tags = doc("div#meta p") 118 | draft_p_tag = next( 119 | p for p in list(p_tags.items()) if p.text().lower().startswith("draft") 120 | ) 121 | draft_pick = int( 122 | re.search(r"(\d+)\w{,3}\s+?overall", draft_p_tag.text()).group(1) 123 | ) 124 | return draft_pick 125 | except Exception: 126 | return None 127 | 128 | @sportsref.decorators.memoize 129 | def draft_year(self): 130 | """Returns the year the player was selected (or undrafted). 131 | :returns: TODO 132 | """ 133 | raise Exception("not yet implemented - nba.Player.draft_year") 134 | 135 | @sportsref.decorators.kind_rpb(include_type=True) 136 | def _get_stats_table(self, table_id, kind="R", summary=False): 137 | """Gets a stats table from the player page; helper function that does 138 | the work for per-game, per-100-poss, etc. stats. 139 | 140 | :table_id: the ID of the HTML table. 141 | :kind: specifies regular season, playoffs, or both. One of 'R', 'P', 142 | 'B'. Defaults to 'R'. 143 | :returns: A DataFrame of stats. 
144 | """ 145 | doc = self.get_main_doc() 146 | table_id = f"table#{'playoffs_' if kind == 'P' else ''}{table_id}" 147 | table = doc(table_id) 148 | df = sportsref.utils.parse_table(table, flatten=(not summary), footer=summary) 149 | return df 150 | 151 | @sportsref.decorators.memoize 152 | def stats_per_game(self, kind="R", summary=False): 153 | """Returns a DataFrame of per-game box score stats.""" 154 | return self._get_stats_table("per_game", kind=kind, summary=summary) 155 | 156 | @sportsref.decorators.memoize 157 | def stats_totals(self, kind="R", summary=False): 158 | """Returns a DataFrame of total box score statistics by season.""" 159 | return self._get_stats_table("totals", kind=kind, summary=summary) 160 | 161 | @sportsref.decorators.memoize 162 | def stats_per36(self, kind="R", summary=False): 163 | """Returns a DataFrame of per-36-minutes stats.""" 164 | return self._get_stats_table("per_minute", kind=kind, summary=summary) 165 | 166 | @sportsref.decorators.memoize 167 | def stats_per100(self, kind="R", summary=False): 168 | """Returns a DataFrame of per-100-possession stats.""" 169 | return self._get_stats_table("per_poss", kind=kind, summary=summary) 170 | 171 | @sportsref.decorators.memoize 172 | def stats_advanced(self, kind="R", summary=False): 173 | """Returns a DataFrame of advanced stats.""" 174 | return self._get_stats_table("advanced", kind=kind, summary=summary) 175 | 176 | @sportsref.decorators.memoize 177 | def stats_shooting(self, kind="R", summary=False): 178 | """Returns a DataFrame of shooting stats.""" 179 | return self._get_stats_table("shooting", kind=kind, summary=summary) 180 | 181 | @sportsref.decorators.memoize 182 | def stats_adjusted_shooting(self, kind="R", summary=False): 183 | """Returns a DataFrame of adjusted shooting stats.""" 184 | return self._get_stats_table("adj-shooting", kind=kind, summary=summary) 185 | 186 | @sportsref.decorators.memoize 187 | def stats_pbp(self, kind="R", summary=False): 188 | """Returns a DataFrame of play-by-play stats.""" 189 | return self._get_stats_table("pbp", kind=kind, summary=summary) 190 | 191 | @sportsref.decorators.memoize 192 | @sportsref.decorators.kind_rpb(include_type=True) 193 | def gamelog_basic(self, year, kind="R"): 194 | """Returns a table of a player's basic game-by-game stats for a season. 195 | 196 | :param year: The year representing the desired season. 197 | :param kind: specifies regular season, playoffs, or both. One of 'R', 198 | 'P', 'B'. Defaults to 'R'. 199 | :returns: A DataFrame of the player's standard boxscore stats from each 200 | game of the season. 201 | :rtype: pd.DataFrame 202 | """ 203 | doc = self.get_sub_doc(f"gamelog/{year}") 204 | table = ( 205 | doc("table#pgl_basic_playoffs") if kind == "P" else doc("table#pgl_basic") 206 | ) 207 | df = sportsref.utils.parse_table(table) 208 | return df 209 | 210 | @sportsref.decorators.memoize 211 | @sportsref.decorators.kind_rpb(include_type=True) 212 | def gamelog_advanced(self, year, kind="R"): 213 | """Returns a table of a player's advanced game-by-game stats for a 214 | season. 215 | 216 | :param year: The year representing the desired season. 217 | :param kind: specifies regular season, playoffs, or both. One of 'R', 218 | 'P', 'B'. Defaults to 'R'. 219 | :returns: A DataFrame of the player's advanced stats from each game of 220 | the season. 
221 | :rtype: pd.DataFrame 222 | """ 223 | doc = self.get_sub_doc(f"gamelog-advanced/{year}") 224 | table = ( 225 | doc("table#pgl_advanced_playoffs") 226 | if kind == "P" 227 | else doc("table#pgl_advanced") 228 | ) 229 | df = sportsref.utils.parse_table(table) 230 | return df 231 | -------------------------------------------------------------------------------- /sportsref/nba/seasons.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pyquery import PyQuery as pq 3 | 4 | import sportsref 5 | 6 | 7 | class Season(object, metaclass=sportsref.decorators.Cached): 8 | 9 | """Object representing a given NBA season.""" 10 | 11 | def __init__(self, year): 12 | """Initializes a Season object for an NBA season. 13 | 14 | :year: The year of the season we want. 15 | """ 16 | self.yr = int(year) 17 | 18 | def __eq__(self, other): 19 | return self.yr == other.yr 20 | 21 | def __hash__(self): 22 | return hash(self.yr) 23 | 24 | def __repr__(self): 25 | return "Season({})".format(self.yr) 26 | 27 | def _subpage_url(self, page): 28 | return sportsref.nba.BASE_URL + "/leagues/NBA_{}_{}.html".format(self.yr, page) 29 | 30 | @sportsref.decorators.memoize 31 | def get_main_doc(self): 32 | """Returns PyQuery object for the main season URL. 33 | :returns: PyQuery object. 34 | """ 35 | url = sportsref.nba.BASE_URL + "/leagues/NBA_{}.html".format(self.yr) 36 | return pq(sportsref.utils.get_html(url)) 37 | 38 | @sportsref.decorators.memoize 39 | def get_sub_doc(self, subpage): 40 | """Returns PyQuery object for a given subpage URL. 41 | :subpage: The subpage of the season, e.g. 'per_game'. 42 | :returns: PyQuery object. 43 | """ 44 | html = sportsref.utils.get_html(self._subpage_url(subpage)) 45 | return pq(html) 46 | 47 | @sportsref.decorators.memoize 48 | def get_team_ids(self): 49 | """Returns a list of the team IDs for the given year. 50 | :returns: List of team IDs. 51 | """ 52 | df = self.team_stats_per_game() 53 | if not df.empty: 54 | return df.index.tolist() 55 | else: 56 | print("ERROR: no teams found") 57 | return [] 58 | 59 | @sportsref.decorators.memoize 60 | def team_ids_to_names(self): 61 | """Mapping from 3-letter team IDs to full team names. 62 | :returns: Dictionary with team IDs as keys and full team strings as 63 | values. 64 | """ 65 | doc = self.get_main_doc() 66 | table = doc("table#team-stats-per_game") 67 | flattened = sportsref.utils.parse_table(table, flatten=True) 68 | unflattened = sportsref.utils.parse_table(table, flatten=False) 69 | team_ids = flattened["team_id"] 70 | team_names = unflattened["team_name"] 71 | if len(team_names) != len(team_ids): 72 | raise Exception("team names and team IDs don't align") 73 | return dict(list(zip(team_ids, team_names))) 74 | 75 | @sportsref.decorators.memoize 76 | def team_names_to_ids(self): 77 | """Mapping from full team names to 3-letter team IDs. 78 | :returns: Dictionary with tean names as keys and team IDs as values. 79 | """ 80 | d = self.team_ids_to_names() 81 | return {v: k for k, v in list(d.items())} 82 | 83 | @sportsref.decorators.memoize 84 | @sportsref.decorators.kind_rpb(include_type=True) 85 | def schedule(self, kind="R"): 86 | """Returns a list of BoxScore IDs for every game in the season. 87 | Only needs to handle 'R' or 'P' options because decorator handles 'B'. 88 | 89 | :param kind: 'R' for regular season, 'P' for playoffs, 'B' for both. 90 | Defaults to 'R'. 91 | :returns: DataFrame of schedule information. 
92 | :rtype: pd.DataFrame 93 | """ 94 | kind = kind.upper()[0] 95 | dfs = [] 96 | 97 | # get games from each month 98 | for month in ( 99 | "october", 100 | "november", 101 | "december", 102 | "january", 103 | "february", 104 | "march", 105 | "april", 106 | "may", 107 | "june", 108 | ): 109 | try: 110 | doc = self.get_sub_doc("games-{}".format(month)) 111 | except ValueError: 112 | continue 113 | table = doc("table#schedule") 114 | df = sportsref.utils.parse_table(table) 115 | dfs.append(df) 116 | df = pd.concat(dfs).reset_index(drop=True) 117 | 118 | # figure out how many regular season games 119 | try: 120 | sportsref.utils.get_html( 121 | "{}/playoffs/NBA_{}.html".format(sportsref.nba.BASE_URL, self.yr) 122 | ) 123 | is_past_season = True 124 | except ValueError: 125 | is_past_season = False 126 | 127 | if is_past_season: 128 | team_per_game = self.team_stats_per_game() 129 | n_reg_games = int(team_per_game.g.sum() // 2) 130 | else: 131 | n_reg_games = len(df) 132 | 133 | # subset appropriately based on `kind` 134 | if kind == "P": 135 | return df.iloc[n_reg_games:] 136 | else: 137 | return df.iloc[:n_reg_games] 138 | 139 | def finals_winner(self): 140 | """Returns the team ID for the winner of that year's NBA Finals. 141 | :returns: 3-letter team ID for champ. 142 | """ 143 | raise NotImplementedError("nba.Season.finals_winner") 144 | 145 | def finals_loser(self): 146 | """Returns the team ID for the loser of that year's NBA Finals. 147 | :returns: 3-letter team ID for runner-up. 148 | """ 149 | raise NotImplementedError("nba.Season.finals_loser") 150 | 151 | def standings(self): 152 | """Returns a DataFrame containing standings information.""" 153 | doc = self.get_sub_doc("standings") 154 | 155 | east_table = doc("table#confs_standings_E") 156 | east_df = sportsref.utils.parse_table(east_table) 157 | east_df.sort_values("wins", ascending=False, inplace=True) 158 | east_df["seed"] = list(range(1, len(east_df) + 1)) 159 | east_df["conference"] = "E" 160 | 161 | west_table = doc("table#confs_standings_W") 162 | west_df = sportsref.utils.parse_table(west_table) 163 | west_df.sort_values("wins", ascending=False, inplace=True) 164 | west_df["seed"] = list(range(1, len(west_df) + 1)) 165 | west_df["conference"] = "W" 166 | 167 | full_df = pd.concat([east_df, west_df], axis=0).reset_index(drop=True) 168 | full_df["gb"] = [ 169 | gb if isinstance(gb, int) or isinstance(gb, float) else 0 170 | for gb in full_df["gb"] 171 | ] 172 | full_df = full_df.drop("has_class_full_table", axis=1) 173 | 174 | expanded_table = doc("table#expanded_standings") 175 | expanded_df = sportsref.utils.parse_table(expanded_table) 176 | 177 | full_df = pd.merge(full_df, expanded_df, on="team_id") 178 | return full_df 179 | 180 | @sportsref.decorators.memoize 181 | def _get_team_stats_table(self, selector): 182 | """Helper function for stats tables on season pages. 
Returns a 183 | DataFrame.""" 184 | doc = self.get_main_doc() 185 | table = doc(selector) 186 | df = sportsref.utils.parse_table(table) 187 | df.set_index("team_id", inplace=True) 188 | return df 189 | 190 | def team_stats_per_game(self): 191 | """Returns a Pandas DataFrame of each team's basic per-game stats for 192 | the season.""" 193 | return self._get_team_stats_table("table#team-stats-per_game") 194 | 195 | def opp_stats_per_game(self): 196 | """Returns a Pandas DataFrame of each team's opponent's basic per-game 197 | stats for the season.""" 198 | return self._get_team_stats_table("table#opponent-stats-per_game") 199 | 200 | def team_stats_totals(self): 201 | """Returns a Pandas DataFrame of each team's basic stat totals for the 202 | season.""" 203 | return self._get_team_stats_table("table#team-stats-base") 204 | 205 | def opp_stats_totals(self): 206 | """Returns a Pandas DataFrame of each team's opponent's basic stat 207 | totals for the season.""" 208 | return self._get_team_stats_table("table#opponent-stats-base") 209 | 210 | def misc_stats(self): 211 | """Returns a Pandas DataFrame of miscellaneous stats about each team's 212 | season.""" 213 | return self._get_team_stats_table("table#misc_stats") 214 | 215 | def team_stats_shooting(self): 216 | """Returns a Pandas DataFrame of each team's shooting stats for the 217 | season.""" 218 | return self._get_team_stats_table("table#team_shooting") 219 | 220 | def opp_stats_shooting(self): 221 | """Returns a Pandas DataFrame of each team's opponent's shooting stats 222 | for the season.""" 223 | return self._get_team_stats_table("table#opponent_shooting") 224 | 225 | @sportsref.decorators.memoize 226 | def _get_player_stats_table(self, identifier): 227 | """Helper function for player season stats. 228 | 229 | :identifier: string identifying the type of stat, e.g. 'per_game'. 230 | :returns: A DataFrame of stats. 
231 | """ 232 | doc = self.get_sub_doc(identifier) 233 | table = doc("table#{}_stats".format(identifier)) 234 | df = sportsref.utils.parse_table(table) 235 | return df 236 | 237 | def player_stats_per_game(self): 238 | """Returns a DataFrame of per-game player stats for a season.""" 239 | return self._get_player_stats_table("per_game") 240 | 241 | def player_stats_totals(self): 242 | """Returns a DataFrame of player stat totals for a season.""" 243 | return self._get_player_stats_table("totals") 244 | 245 | def player_stats_per36(self): 246 | """Returns a DataFrame of player per-36 min stats for a season.""" 247 | return self._get_player_stats_table("per_minute") 248 | 249 | def player_stats_per100(self): 250 | """Returns a DataFrame of player per-100 poss stats for a season.""" 251 | return self._get_player_stats_table("per_poss") 252 | 253 | def player_stats_advanced(self): 254 | """Returns a DataFrame of player advanced stats for a season.""" 255 | return self._get_player_stats_table("advanced") 256 | 257 | def mvp_voting(self): 258 | """Returns a DataFrame containing information about MVP voting.""" 259 | raise NotImplementedError("nba.Season.mvp_voting") 260 | 261 | def roy_voting(self): 262 | """Returns a DataFrame containing information about ROY voting.""" 263 | url = "{}/awards/awards_{}.html".format(sportsref.nba.BASE_URL, self.yr) 264 | doc = pq(sportsref.utils.get_html(url)) 265 | table = doc("table#roy") 266 | df = sportsref.utils.parse_table(table) 267 | return df 268 | -------------------------------------------------------------------------------- /sportsref/utils.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import multiprocessing 3 | import re 4 | import threading 5 | import time 6 | 7 | import pandas as pd 8 | import requests 9 | from pyquery import PyQuery as pq 10 | 11 | import sportsref 12 | 13 | # time between requests, in seconds 14 | THROTTLE_DELAY = 0.5 15 | 16 | # variables used to throttle requests across processes 17 | throttle_thread_lock = threading.Lock() 18 | throttle_process_lock = multiprocessing.Lock() 19 | last_request_time = multiprocessing.Value( 20 | ctypes.c_longdouble, time.time() - 10 * THROTTLE_DELAY 21 | ) 22 | 23 | 24 | @sportsref.decorators.cache 25 | def get_html(url): 26 | """Gets the HTML for the given URL using a GET request. 27 | 28 | :url: the absolute URL of the desired page. 29 | :returns: a string of HTML. 30 | """ 31 | global last_request_time 32 | with throttle_process_lock: 33 | with throttle_thread_lock: 34 | # sleep until THROTTLE_DELAY secs have passed since last request 35 | wait_left = THROTTLE_DELAY - (time.time() - last_request_time.value) 36 | if wait_left > 0: 37 | time.sleep(wait_left) 38 | 39 | # make request 40 | response = requests.get(url) 41 | 42 | # update last request time for throttling 43 | last_request_time.value = time.time() 44 | 45 | # raise ValueError on 4xx status code, get rid of comments, and return 46 | if 400 <= response.status_code < 500: 47 | raise ValueError( 48 | f'Status Code {response.status_code} received fetching URL "{url}"' 49 | ) 50 | html = response.text 51 | html = html.replace("<!--", "").replace("-->", "") 52 | 53 | return html 54 | 55 | 56 | def parse_table(table, flatten=True, footer=False): 57 | """Parses a table from sports-reference sites into a pandas dataframe. 58 | 59 | :param table: the PyQuery object representing the HTML table 60 | :param flatten: if True, flattens relative URLs to IDs.
61 | all fields as text without cleaning.
62 | :param footer: If True, returns the summary/footer of the page. Recommended
63 | to use this with flatten=False. Defaults to False.
64 | :returns: pd.DataFrame
65 | """
66 | if not len(table):
67 | return pd.DataFrame()
68 |
69 | # get columns
70 | columns = [
71 | c.attrib["data-stat"] for c in table("thead tr:not([class]) th[data-stat]")
72 | ]
73 |
74 | # get data
75 | rows = list(
76 | table("tbody tr" if not footer else "tfoot tr")
77 | .not_(".thead, .stat_total, .stat_average")
78 | .items()
79 | )
80 | data = [
81 | [flatten_links(td) if flatten else td.text() for td in row.items("th,td")]
82 | for row in rows
83 | ]
84 |
85 | # make DataFrame
86 | df = pd.DataFrame(data, columns=columns, dtype="float")
87 |
88 | # add has_class columns
89 | all_classes = set(
90 | cls for row in rows if row.attr["class"] for cls in row.attr["class"].split()
91 | )
92 | for cls in all_classes:
93 | df["has_class_" + cls] = [
94 | bool(row.attr["class"] and cls in row.attr["class"].split()) for row in rows
95 | ]
96 |
97 | # cleaning the DataFrame
98 |
99 | df.drop(["ranker", "Xxx", "Yyy", "Zzz"], axis=1, inplace=True, errors="ignore")
100 |
101 | # year_id -> year (as int)
102 | if "year_id" in df.columns:
103 | df.rename(columns={"year_id": "year"}, inplace=True)
104 | if flatten:
105 | df.year = df.year.fillna(method="ffill")
106 | df["year"] = df.year.map(lambda s: str(s)[:4]).astype(int)
107 |
108 | # pos -> position
109 | if "pos" in df.columns:
110 | df.rename(columns={"pos": "position"}, inplace=True)
111 |
112 | # boxscore_word, game_date -> boxscore_id and separate into Y, M, D columns
113 | for bs_id_col in ("boxscore_word", "game_date", "box_score_text"):
114 | if bs_id_col in df.columns:
115 | df.rename(columns={bs_id_col: "boxscore_id"}, inplace=True)
116 | break
117 |
118 | # ignore *, +, and other characters used to note things
119 | df.replace(re.compile(r"[\*\+\u2605]", re.U), "", inplace=True)
120 | for col in df.columns:
121 | if hasattr(df[col], "str"):
122 | df[col] = df[col].str.strip()
123 |
124 | # player -> player_id and/or player_name
125 | if "player" in df.columns:
126 | if flatten:
127 | df.rename(columns={"player": "player_id"}, inplace=True)
128 | # when flattening, keep a column for names
129 | player_names = parse_table(table, flatten=False)["player_name"]
130 | df["player_name"] = player_names
131 | else:
132 | df.rename(columns={"player": "player_name"}, inplace=True)
133 |
134 | # team, team_name -> team_id
135 | for team_col in ("team", "team_name"):
136 | if team_col in df.columns:
137 | # first, get rid of faulty rows
138 | df = df.loc[~df[team_col].isin(["XXX"])]
139 | if flatten:
140 | df.rename(columns={team_col: "team_id"}, inplace=True)
141 |
142 | # season -> int
143 | if "season" in df.columns and flatten:
144 | df["season"] = df["season"].astype(int)
145 |
146 | # handle date_game columns (different types)
147 | if "date_game" in df.columns and flatten:
148 | date_re = r"month=(?P<month>\d+)&day=(?P<day>\d+)&year=(?P<year>\d+)"
149 | date_df = df["date_game"].str.extract(date_re, expand=True)
150 | if date_df.notnull().all(axis=1).any():
151 | df = pd.concat((df, date_df), axis=1)
152 | else:
153 | df.rename(columns={"date_game": "boxscore_id"}, inplace=True)
154 |
155 | # game_location -> is_home
156 | if "game_location" in df.columns and flatten:
157 | df["game_location"] = df["game_location"].isnull()
158 | df.rename(columns={"game_location": "is_home"}, inplace=True)
159 |
160 | # mp: (min:sec) -> float(min + sec / 60), notes -> NaN, new column
161 | if "mp" in df.columns and df.dtypes["mp"] == object and flatten:
162 | mp_df = (
163 | df["mp"].str.extract(r"(?P<m>\d+):(?P<s>\d+)", expand=True).astype(float)
164 | )
165 | no_match = mp_df.isnull().all(axis=1)
166 | if no_match.any():
167 | df.loc[no_match, "note"] = df.loc[no_match, "mp"]
168 | df["mp"] = mp_df["m"] + mp_df["s"] / 60
169 |
170 | # converts number-y things to floats
171 | def convert_to_float(val):
172 | # percentages: (number%) -> float(number * 0.01)
173 | m = re.search(r"([-\.\d]+)\%", val if isinstance(val, str) else str(val), re.U)
174 | try:
175 | if m:
176 | # e.g. "48.1%" -> 0.481
177 | return float(m.group(1)) / 100
178 |
179 | except ValueError:
180 | return val
181 | # salaries: $ABC,DEF,GHI -> float(ABCDEFGHI)
182 | m = re.search(r"\$[\d,]+", val if isinstance(val, str) else str(val), re.U)
183 | try:
184 | if m:
185 | return float(re.sub(r"\$|,", "", val))
186 | except Exception:
187 | return val
188 | # generally try to coerce to float, unless it's an int or bool
189 | try:
190 | if isinstance(val, (int, bool)):
191 | return val
192 | else:
193 | return float(val)
194 | except Exception:
195 | return val
196 |
197 | if flatten:
198 | df = df.applymap(convert_to_float)
199 |
200 | df = df.loc[df.astype(bool).any(axis=1)]
201 |
202 | return df
203 |
204 |
205 | def parse_info_table(table):
206 | """Parses an info table, like the "Game Info" table or the "Officials"
207 | table on the PFR Boxscore page. Keys are lower case and have spaces/special
208 | characters converted to underscores.
209 |
210 | :table: PyQuery object representing the HTML table.
211 | :returns: A dictionary representing the information.
212 | """
213 | ret = {}
214 | for tr in list(table("tr").not_(".thead").items()):
215 | th, td = list(tr("th, td").items())
216 | key = th.text().lower()
217 | key = re.sub(r"\W", "_", key)
218 | val = sportsref.utils.flatten_links(td)
219 | ret[key] = val
220 | return ret
221 |
222 |
223 | def parse_awards_table(table):
224 | """Parses an awards table, like the "Pro Bowls" table on a PFR player page.
225 |
226 | :table: PyQuery object representing the HTML table.
227 | :returns: A list of the entries in the table, with flattened links.
228 | """
229 | return [flatten_links(tr) for tr in list(table("tr").items())]
230 |
231 |
232 | def flatten_links(td, _recurse=False):
233 | """Flattens relative URLs within text of a table cell to IDs and returns
234 | the result.
235 |
236 | :td: the PyQuery object for the HTML to convert
237 | :returns: the string with the links flattened to IDs
238 | """
239 |
240 | # helper function to flatten individual strings/links
241 | def _flatten_node(c):
242 | if isinstance(c, str):
243 | return c
244 | elif "href" in c.attrib:
245 | c_id = rel_url_to_id(c.attrib["href"])
246 | return c_id if c_id else c.text_content()
247 | else:
248 | return flatten_links(pq(c), _recurse=True)
249 |
250 | # if there's no text, just return None
251 | if td is None or not td.text():
252 | return "" if _recurse else None
253 |
254 | td.remove("span.note")
255 | return "".join(_flatten_node(c) for c in td.contents())
256 |
257 |
258 | @sportsref.decorators.memoize
259 | def rel_url_to_id(url):
260 | """Converts a relative URL to a unique ID.
261 |
262 | Here, 'ID' refers generally to the unique ID for a given 'type' that a
263 | given datum has.
For example, 'BradTo00' is Tom Brady's player ID - this 264 | corresponds to his relative URL, '/players/B/BradTo00.htm'. Similarly, 265 | '201409070dal' refers to the boxscore of the SF @ DAL game on 09/07/14. 266 | 267 | Supported types: 268 | * player/... 269 | * boxscores/... 270 | * teams/... 271 | * years/... 272 | * leagues/... 273 | * awards/... 274 | * coaches/... 275 | * officials/... 276 | * schools/... 277 | * schools/high_schools.cgi?id=... 278 | 279 | :returns: ID associated with the given relative URL. 280 | """ 281 | year_regex = r".*/years/(\d{4}).*|.*/gamelog/(\d{4}).*" 282 | player_regex = r".*/players/(?:\w/)?(.+?)(?:/|\.html?)" 283 | boxscores_regex = r".*/boxscores/(.+?)\.html?" 284 | team_regex = r".*/teams/(\w{3})/.*" 285 | coach_regex = r".*/coaches/(.+?)\.html?" 286 | stadium_regex = r".*/stadiums/(.+?)\.html?" 287 | ref_regex = r".*/officials/(.+?r)\.html?" 288 | college_regex = r".*/schools/(\S+?)/.*|.*college=([^&]+)" 289 | hs_regex = r".*/schools/high_schools\.cgi\?id=([^\&]{8})" 290 | bs_date_regex = r".*/boxscores/index\.f?cgi\?(month=\d+&day=\d+&year=\d+)" 291 | league_regex = r".*/leagues/(.*_\d{4}).*" 292 | award_regex = r".*/awards/(.+)\.htm" 293 | 294 | regexes = [ 295 | year_regex, 296 | player_regex, 297 | boxscores_regex, 298 | team_regex, 299 | coach_regex, 300 | stadium_regex, 301 | ref_regex, 302 | college_regex, 303 | hs_regex, 304 | bs_date_regex, 305 | league_regex, 306 | award_regex, 307 | ] 308 | 309 | for regex in regexes: 310 | match = re.match(regex, url, re.I) 311 | if match: 312 | return [_f for _f in match.groups() if _f][0] 313 | 314 | # things we don't want to match but don't want to print a WARNING 315 | if any(url.startswith(s) for s in ("/play-index/",)): 316 | return url 317 | 318 | print(f'WARNING. 
NO MATCH WAS FOUND FOR "{url}"') 319 | return url 320 | -------------------------------------------------------------------------------- /sportsref/nfl/players.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | import urllib.parse 4 | 5 | from pyquery import PyQuery as pq 6 | 7 | import sportsref 8 | 9 | __all__ = ["Player"] 10 | 11 | 12 | class Player(object, metaclass=sportsref.decorators.Cached): 13 | def __init__(self, player_id): 14 | self.player_id = player_id 15 | self.mainURL = (sportsref.nfl.BASE_URL + "/players/{0[0]}/{0}.htm").format( 16 | self.player_id 17 | ) 18 | 19 | def __eq__(self, other): 20 | return self.player_id == other.player_id 21 | 22 | def __hash__(self): 23 | return hash(self.player_id) 24 | 25 | def __repr__(self): 26 | return "Player({})".format(self.player_id) 27 | 28 | def __str__(self): 29 | return self.name() 30 | 31 | def __reduce__(self): 32 | return Player, (self.player_id,) 33 | 34 | def _subpage_url(self, page, year=None): 35 | # if no year, return career version 36 | if year is None: 37 | return urllib.parse.urljoin( 38 | self.mainURL, "{}/{}/".format(self.player_id, page) 39 | ) 40 | # otherwise, return URL for a given year 41 | else: 42 | return urllib.parse.urljoin( 43 | self.mainURL, "{}/{}/{}/".format(self.player_id, page, year) 44 | ) 45 | 46 | @sportsref.decorators.memoize 47 | def get_doc(self): 48 | doc = pq(sportsref.utils.get_html(self.mainURL)) 49 | return doc 50 | 51 | @sportsref.decorators.memoize 52 | def name(self): 53 | doc = self.get_doc() 54 | name = doc("div#meta h1:first").text() 55 | return name 56 | 57 | @sportsref.decorators.memoize 58 | def age(self, year, month=9, day=1): 59 | doc = self.get_doc() 60 | span = doc("div#meta span#necro-birth") 61 | birthstring = span.attr("data-birth") 62 | try: 63 | dateargs = re.match(r"(\d{4})\-(\d{2})\-(\d{2})", birthstring).groups() 64 | dateargs = list(map(int, dateargs)) 65 | birthDate = datetime.date(*dateargs) 66 | delta = datetime.date(year=year, month=month, day=day) - birthDate 67 | age = delta.days / 365 68 | return age 69 | except Exception: 70 | return None 71 | 72 | @sportsref.decorators.memoize 73 | def position(self): 74 | doc = self.get_doc() 75 | rawText = ( 76 | doc("div#meta p").filter(lambda i, e: "Position" in e.text_content()).text() 77 | ) 78 | rawPos = re.search(r"Position\W*(\S+)", rawText, re.I).group(1) 79 | allPositions = rawPos.split("-") 80 | # right now, returning just the primary position for those with 81 | # multiple positions 82 | return allPositions[0] 83 | 84 | @sportsref.decorators.memoize 85 | def height(self): 86 | doc = self.get_doc() 87 | rawText = doc('div#meta p span[itemprop="height"]').text() 88 | try: 89 | feet, inches = list(map(int, rawText.split("-"))) 90 | return feet * 12 + inches 91 | except ValueError: 92 | return None 93 | 94 | @sportsref.decorators.memoize 95 | def weight(self): 96 | doc = self.get_doc() 97 | rawText = doc('div#meta p span[itemprop="weight"]').text() 98 | try: 99 | weight = re.match(r"(\d+)lb", rawText, re.I).group(1) 100 | return int(weight) 101 | except AttributeError: 102 | return None 103 | 104 | @sportsref.decorators.memoize 105 | def hand(self): 106 | doc = self.get_doc() 107 | try: 108 | rawText = ( 109 | doc("div#meta p") 110 | .filter(lambda i, e: "Throws" in e.text_content()) 111 | .text() 112 | ) 113 | rawHand = re.search(r"Throws\W+(\S+)", rawText, re.I).group(1) 114 | except AttributeError: 115 | return None 116 | return rawHand[0] # 
'L' or 'R' 117 | 118 | @sportsref.decorators.memoize 119 | def current_team(self): 120 | doc = self.get_doc() 121 | team = doc("div#meta p").filter(lambda i, e: "Team" in e.text_content()) 122 | text = sportsref.utils.flatten_links(team) 123 | try: 124 | m = re.match(r"Team: (\w{3})", text) 125 | return m.group(1) 126 | except Exception: 127 | return None 128 | 129 | @sportsref.decorators.memoize 130 | def draft_pick(self): 131 | doc = self.get_doc() 132 | rawDraft = ( 133 | doc("div#meta p").filter(lambda i, e: "Draft" in e.text_content()).text() 134 | ) 135 | m = re.search(r"Draft.*? round \((\d+).*?overall\)", rawDraft, re.I) 136 | # if not drafted or taken in supplemental draft, return NaN 137 | if m is None or "Supplemental" in rawDraft: 138 | return None 139 | else: 140 | return int(m.group(1)) 141 | 142 | @sportsref.decorators.memoize 143 | def draft_class(self): 144 | doc = self.get_doc() 145 | rawDraft = ( 146 | doc("div#meta p").filter(lambda i, e: "Draft" in e.text_content()).text() 147 | ) 148 | m = re.search(r"Draft.*?of the (\d{4}) NFL", rawDraft, re.I) 149 | if not m: 150 | return None 151 | else: 152 | return int(m.group(1)) 153 | 154 | @sportsref.decorators.memoize 155 | def draft_team(self): 156 | doc = self.get_doc() 157 | rawDraft = doc("div#meta p").filter(lambda i, e: "Draft" in e.text_content()) 158 | try: 159 | draftStr = sportsref.utils.flatten_links(rawDraft) 160 | m = re.search(r"Draft\W+(\w+)", draftStr) 161 | return m.group(1) 162 | except Exception: 163 | return None 164 | 165 | @sportsref.decorators.memoize 166 | def college(self): 167 | doc = self.get_doc() 168 | rawText = doc("div#meta p").filter(lambda i, e: "College" in e.text_content()) 169 | cleanedText = sportsref.utils.flatten_links(rawText) 170 | college = re.search(r"College:\s*(\S+)", cleanedText).group(1) 171 | return college 172 | 173 | @sportsref.decorators.memoize 174 | def high_school(self): 175 | doc = self.get_doc() 176 | rawText = doc("div#meta p").filter( 177 | lambda i, e: "High School" in e.text_content() 178 | ) 179 | cleanedText = sportsref.utils.flatten_links(rawText) 180 | hs = re.search(r"High School:\s*(\S+)", cleanedText).group(1) 181 | return hs 182 | 183 | @sportsref.decorators.memoize 184 | @sportsref.decorators.kind_rpb(include_type=True) 185 | def gamelog(self, year=None, kind="R"): 186 | """Gets the career gamelog of the given player. 187 | :kind: One of 'R', 'P', or 'B' (for regular season, playoffs, or both). 188 | Case-insensitive; defaults to 'R'. 189 | :year: The year for which the gamelog should be returned; if None, 190 | return entire career gamelog. Defaults to None. 191 | :returns: A DataFrame with the player's career gamelog. 192 | """ 193 | url = self._subpage_url("gamelog", None) # year is filtered later 194 | doc = pq(sportsref.utils.get_html(url)) 195 | table = doc("table#stats") if kind == "R" else doc("table#stats_playoffs") 196 | df = sportsref.utils.parse_table(table) 197 | if year is not None: 198 | df = df.query("year == @year").reset_index(drop=True) 199 | return df 200 | 201 | @sportsref.decorators.memoize 202 | @sportsref.decorators.kind_rpb(include_type=True) 203 | def passing(self, kind="R"): 204 | """Gets yearly passing stats for the player. 205 | 206 | :kind: One of 'R', 'P', or 'B'. Case-insensitive; defaults to 'R'. 207 | :returns: Pandas DataFrame with passing stats. 
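Usage sketch (hypothetical; "BradTo00" is the sample player ID documented
in sportsref.utils.rel_url_to_id, and network access to
pro-football-reference.com is assumed)::

    player = Player("BradTo00")
    career_passing = player.passing(kind="B")  # regular season + playoffs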
208 | """ 209 | doc = self.get_doc() 210 | table = doc("table#passing") if kind == "R" else doc("table#passing_playoffs") 211 | df = sportsref.utils.parse_table(table) 212 | return df 213 | 214 | @sportsref.decorators.memoize 215 | @sportsref.decorators.kind_rpb(include_type=True) 216 | def rushing_and_receiving(self, kind="R"): 217 | """Gets yearly rushing/receiving stats for the player. 218 | 219 | :kind: One of 'R', 'P', or 'B'. Case-insensitive; defaults to 'R'. 220 | :returns: Pandas DataFrame with rushing/receiving stats. 221 | """ 222 | doc = self.get_doc() 223 | table = ( 224 | doc("table#rushing_and_receiving") 225 | if kind == "R" 226 | else doc("table#rushing_and_receiving_playoffs") 227 | ) 228 | if not table: 229 | table = ( 230 | doc("table#receiving_and_rushing") 231 | if kind == "R" 232 | else doc("table#receiving_and_rushing_playoffs") 233 | ) 234 | df = sportsref.utils.parse_table(table) 235 | return df 236 | 237 | @sportsref.decorators.memoize 238 | @sportsref.decorators.kind_rpb(include_type=True) 239 | def defense(self, kind="R"): 240 | """Gets yearly defense stats for the player (also has AV stats for OL). 241 | 242 | :kind: One of 'R', 'P', or 'B'. Case-insensitive; defaults to 'R'. 243 | :returns: Pandas DataFrame with rushing/receiving stats. 244 | """ 245 | doc = self.get_doc() 246 | table = doc("table#defense") if kind == "R" else doc("table#defense_playoffs") 247 | df = sportsref.utils.parse_table(table) 248 | return df 249 | 250 | def _plays(self, year, play_type, expand_details): 251 | """Returns a DataFrame of plays for a given year for a given play type 252 | (like rushing, receiving, or passing). 253 | 254 | :year: The year for the season. 255 | :play_type: A type of play for which there are plays (as of this 256 | writing, either "passing", "rushing", or "receiving") 257 | :expand_details: Bool for whether PBP should be parsed. 258 | :returns: A DataFrame of plays, each row is a play. Returns None if 259 | there were no such plays in that year. 260 | """ 261 | url = self._subpage_url("{}-plays".format(play_type), year) 262 | doc = pq(sportsref.utils.get_html(url)) 263 | table = doc("table#all_plays") 264 | if table: 265 | if expand_details: 266 | plays = sportsref.nfl.pbp.expand_details( 267 | sportsref.utils.parse_table(table), detailCol="description" 268 | ) 269 | return plays 270 | else: 271 | return sportsref.utils.parse_table(table) 272 | else: 273 | return None 274 | 275 | @sportsref.decorators.memoize 276 | def passing_plays(self, year, expand_details=True): 277 | """Returns a pbp DataFrame of a player's passing plays in a season. 278 | 279 | :year: The year for the season. 280 | :expand_details: bool for whether PBP should be parsed. 281 | :returns: A DataFrame of stats, each row is a play. 282 | """ 283 | return self._plays(year, "passing", expand_details) 284 | 285 | @sportsref.decorators.memoize 286 | def rushing_plays(self, year, expand_details=True): 287 | """Returns a pbp DataFrame of a player's rushing plays in a season. 288 | 289 | :year: The year for the season. 290 | :expand_details: bool for whether PBP should be parsed. 291 | :returns: A DataFrame of stats, each row is a play. 292 | """ 293 | return self._plays(year, "rushing", expand_details) 294 | 295 | @sportsref.decorators.memoize 296 | def receiving_plays(self, year, expand_details=True): 297 | """Returns a pbp DataFrame of a player's receiving plays in a season. 298 | 299 | :year: The year for the season. 300 | :expand_details: bool for whether PBP should be parsed. 
301 | :returns: A DataFrame of stats, each row is a play. 302 | """ 303 | return self._plays(year, "receiving", expand_details) 304 | 305 | @sportsref.decorators.memoize 306 | def splits(self, year=None): 307 | """Returns a DataFrame of splits data for a player-year. 308 | 309 | :year: The year for the season in question. If None, returns career 310 | splits. 311 | :returns: A DataFrame of splits data. 312 | """ 313 | # get the table 314 | url = self._subpage_url("splits", year) 315 | doc = pq(sportsref.utils.get_html(url)) 316 | table = doc("table#stats") 317 | df = sportsref.utils.parse_table(table) 318 | # cleaning the data 319 | if not df.empty: 320 | df.split_id.fillna(method="ffill", inplace=True) 321 | return df 322 | 323 | @sportsref.decorators.memoize 324 | def advanced_splits(self, year=None): 325 | """Returns a DataFrame of advanced splits data for a player-year. Note: 326 | only go back to 2012. 327 | 328 | :year: The year for the season in question. If None, returns career 329 | advanced splits. 330 | :returns: A DataFrame of advanced splits data. 331 | """ 332 | # get the table 333 | url = self._subpage_url("splits", year) 334 | doc = pq(sportsref.utils.get_html(url)) 335 | table = doc("table#advanced_splits") 336 | df = sportsref.utils.parse_table(table) 337 | # cleaning the data 338 | if not df.empty: 339 | df.split_type.fillna(method="ffill", inplace=True) 340 | return df 341 | 342 | @sportsref.decorators.memoize 343 | def _simple_year_award(self, award_id): 344 | """Template for simple award functions that simply list years, such as 345 | pro bowls and first-team all pro. 346 | 347 | :award_id: The div ID that is appended to "leaderboard_" in selecting 348 | the table's div. 349 | :returns: List of years for the award. 350 | """ 351 | doc = self.get_doc() 352 | table = doc("div#leaderboard_{} table".format(award_id)) 353 | return list(map(int, sportsref.utils.parse_awards_table(table))) 354 | 355 | def pro_bowls(self): 356 | """Returns a list of years in which the player made the Pro Bowl.""" 357 | return self._simple_year_award("pro_bowls") 358 | 359 | def first_team_all_pros(self): 360 | """Returns a list of years in which the player made 1st-Tm All Pro.""" 361 | return self._simple_year_award("all_pro") 362 | 363 | # TODO: other awards like MVP, OPOY, DPOY, NFL Top 100, etc. 
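# Example usage (an illustrative sketch; "BradTo00" is the sample player ID
# documented in sportsref.utils.rel_url_to_id, and live network access to
# pro-football-reference.com is assumed):
#
#     from sportsref.nfl.players import Player
#     brady = Player("BradTo00")
#     print(brady.name(), brady.position(), brady.draft_class())
#     gamelog_2014 = brady.gamelog(year=2014, kind="R")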
364 | -------------------------------------------------------------------------------- /sportsref/nfl/boxscores.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | import re 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from pyquery import PyQuery as pq 8 | 9 | import sportsref 10 | 11 | __all__ = ["BoxScore"] 12 | 13 | 14 | class BoxScore(object, metaclass=sportsref.decorators.Cached): 15 | def __init__(self, boxscore_id): 16 | self.boxscore_id = boxscore_id 17 | 18 | def __eq__(self, other): 19 | return self.boxscore_id == other.boxscore_id 20 | 21 | def __hash__(self): 22 | return hash(self.boxscore_id) 23 | 24 | def __repr__(self): 25 | return "BoxScore({})".format(self.boxscore_id) 26 | 27 | def __str__(self): 28 | return "{} Week {}: {} @ {}".format( 29 | self.season(), self.week(), self.away(), self.home() 30 | ) 31 | 32 | def __reduce__(self): 33 | return BoxScore, (self.boxscore_id,) 34 | 35 | @sportsref.decorators.memoize 36 | def get_doc(self): 37 | url = sportsref.nfl.BASE_URL + "/boxscores/{}.htm".format(self.boxscore_id) 38 | doc = pq(sportsref.utils.get_html(url)) 39 | return doc 40 | 41 | @sportsref.decorators.memoize 42 | def date(self): 43 | """Returns the date of the game. See Python datetime.date documentation 44 | for more. 45 | :returns: A datetime.date object with year, month, and day attributes. 46 | """ 47 | match = re.match(r"(\d{4})(\d{2})(\d{2})", self.boxscore_id) 48 | year, month, day = list(map(int, match.groups())) 49 | return datetime.date(year=year, month=month, day=day) 50 | 51 | @sportsref.decorators.memoize 52 | def weekday(self): 53 | """Returns the day of the week on which the game occurred. 54 | :returns: String representation of the day of the week for the game. 55 | 56 | """ 57 | days = [ 58 | "Monday", 59 | "Tuesday", 60 | "Wednesday", 61 | "Thursday", 62 | "Friday", 63 | "Saturday", 64 | "Sunday", 65 | ] 66 | date = self.date() 67 | wd = date.weekday() 68 | return days[wd] 69 | 70 | @sportsref.decorators.memoize 71 | def home(self): 72 | """Returns home team ID. 73 | :returns: 3-character string representing home team's ID. 74 | """ 75 | doc = self.get_doc() 76 | table = doc("table.linescore") 77 | relURL = table("tr").eq(2)("a").eq(2).attr["href"] 78 | home = sportsref.utils.rel_url_to_id(relURL) 79 | return home 80 | 81 | @sportsref.decorators.memoize 82 | def away(self): 83 | """Returns away team ID. 84 | :returns: 3-character string representing away team's ID. 85 | """ 86 | doc = self.get_doc() 87 | table = doc("table.linescore") 88 | relURL = table("tr").eq(1)("a").eq(2).attr["href"] 89 | away = sportsref.utils.rel_url_to_id(relURL) 90 | return away 91 | 92 | @sportsref.decorators.memoize 93 | def home_score(self): 94 | """Returns score of the home team. 95 | :returns: int of the home score. 96 | """ 97 | doc = self.get_doc() 98 | table = doc("table.linescore") 99 | home_score = table("tr").eq(2)("td")[-1].text_content() 100 | return int(home_score) 101 | 102 | @sportsref.decorators.memoize 103 | def away_score(self): 104 | """Returns score of the away team. 105 | :returns: int of the away score. 106 | """ 107 | doc = self.get_doc() 108 | table = doc("table.linescore") 109 | away_score = table("tr").eq(1)("td")[-1].text_content() 110 | return int(away_score) 111 | 112 | @sportsref.decorators.memoize 113 | def winner(self): 114 | """Returns the team ID of the winning team. 
Returns NaN if a tie.""" 115 | hmScore = self.home_score() 116 | awScore = self.away_score() 117 | if hmScore > awScore: 118 | return self.home() 119 | elif hmScore < awScore: 120 | return self.away() 121 | else: 122 | return None 123 | 124 | @sportsref.decorators.memoize 125 | def week(self): 126 | """Returns the week in which this game took place. 18 is WC round, 19 127 | is Div round, 20 is CC round, 21 is SB. 128 | :returns: Integer from 1 to 21. 129 | """ 130 | doc = self.get_doc() 131 | raw = doc("div#div_other_scores h2 a").attr["href"] 132 | match = re.match(r"/years/{}/week_(\d+)\.htm".format(self.season()), raw) 133 | if match: 134 | return int(match.group(1)) 135 | else: 136 | return 21 # super bowl is week 21 137 | 138 | @sportsref.decorators.memoize 139 | def season(self): 140 | """ 141 | Returns the year ID of the season in which this game took place. 142 | Useful for week 17 January games. 143 | 144 | :returns: An int representing the year of the season. 145 | """ 146 | date = self.date() 147 | return date.year - 1 if date.month <= 3 else date.year 148 | 149 | @sportsref.decorators.memoize 150 | def starters(self): 151 | """Returns a DataFrame where each row is an entry in the starters table 152 | from PFR. 153 | 154 | The columns are: 155 | * player_id - the PFR player ID for the player (note that this column 156 | is not necessarily all unique; that is, one player can be a starter in 157 | multiple positions, in theory). 158 | * playerName - the listed name of the player; this too is not 159 | necessarily unique. 160 | * position - the position at which the player started for their team. 161 | * team - the team for which the player started. 162 | * home - True if the player's team was at home, False if they were away 163 | * offense - True if the player is starting on an offensive position, 164 | False if defense. 165 | 166 | :returns: A pandas DataFrame. See the description for details. 167 | """ 168 | doc = self.get_doc() 169 | a = doc("table#vis_starters") 170 | h = doc("table#home_starters") 171 | data = [] 172 | for h, table in enumerate((a, h)): 173 | team = self.home() if h else self.away() 174 | for i, row in enumerate(table("tbody tr").items()): 175 | datum = {} 176 | datum["player_id"] = sportsref.utils.rel_url_to_id( 177 | row("a")[0].attrib["href"] 178 | ) 179 | datum["playerName"] = row("th").text() 180 | datum["position"] = row("td").text() 181 | datum["team"] = team 182 | datum["home"] = h == 1 183 | datum["offense"] = i <= 10 184 | data.append(datum) 185 | return pd.DataFrame(data) 186 | 187 | @sportsref.decorators.memoize 188 | def line(self): 189 | doc = self.get_doc() 190 | table = doc("table#game_info") 191 | giTable = sportsref.utils.parse_info_table(table) 192 | line_text = giTable.get("vegas_line", None) 193 | if line_text is None: 194 | return None 195 | m = re.match(r"(.+?) ([\-\.\d]+)$", line_text) 196 | if m: 197 | favorite, line = m.groups() 198 | line = float(line) 199 | # give in terms of the home team 200 | year = self.season() 201 | if favorite != sportsref.nfl.teams.team_names(year)[self.home()]: 202 | line = -line 203 | else: 204 | line = 0 205 | return line 206 | 207 | @sportsref.decorators.memoize 208 | def surface(self): 209 | """The playing surface on which the game was played. 210 | 211 | :returns: string representing the type of surface. Returns np.nan if 212 | not avaiable. 
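Example (a sketch; "201409070dal" is the sample boxscore ID documented in
sportsref.utils.rel_url_to_id)::

    BoxScore("201409070dal").surface()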
213 | """
214 | doc = self.get_doc()
215 | table = doc("table#game_info")
216 | giTable = sportsref.utils.parse_info_table(table)
217 | return giTable.get("surface", np.nan)
218 |
219 | @sportsref.decorators.memoize
220 | def over_under(self):
221 | """
222 | Returns the over/under for the game as a float, or np.nan if not
223 | available.
224 | """
225 | doc = self.get_doc()
226 | table = doc("table#game_info")
227 | giTable = sportsref.utils.parse_info_table(table)
228 | if "over_under" in giTable:
229 | ou = giTable["over_under"]
230 | return float(ou.split()[0])
231 | else:
232 | return None
233 |
234 | @sportsref.decorators.memoize
235 | def coin_toss(self):
236 | """Gets information relating to the opening coin toss.
237 |
238 | Keys are:
239 | * wonToss - contains the ID of the team that won the toss
240 | * deferred - bool whether the team that won the toss deferred it
241 |
242 | :returns: Dictionary of coin toss-related info.
243 | """
244 | doc = self.get_doc()
245 | table = doc("table#game_info")
246 | giTable = sportsref.utils.parse_info_table(table)
247 | if "Won Toss" in giTable:
248 | # TODO: finish coinToss function
249 | pass
250 | else:
251 | return None
252 |
253 | @sportsref.decorators.memoize
254 | def weather(self):
255 | """Returns a dictionary of weather-related info.
256 |
257 | Keys of the returned dict:
258 | * temp
259 | * windChill
260 | * relHumidity
261 | * windMPH
262 |
263 | :returns: Dict of weather data.
264 | """
265 | doc = self.get_doc()
266 | table = doc("table#game_info")
267 | giTable = sportsref.utils.parse_info_table(table)
268 | if "weather" in giTable:
269 | regex = (
270 | r"(?:(?P<temp>\-?\d+) degrees )?"
271 | r"(?:relative humidity (?P<relHumidity>\d+)%, )?"
272 | r"(?:wind (?P<windMPH>\d+) mph, )?"
273 | r"(?:wind chill (?P<windChill>\-?\d+))?"
274 | )
275 | m = re.match(regex, giTable["weather"])
276 | d = m.groupdict()
277 |
278 | # cast values to int
279 | for k in d:
280 | try:
281 | d[k] = int(d[k])
282 | except TypeError:
283 | pass
284 |
285 | # one-off fixes
286 | d["windChill"] = d["windChill"] if pd.notnull(d["windChill"]) else d["temp"]
287 | d["windMPH"] = d["windMPH"] if pd.notnull(d["windMPH"]) else 0
288 | return d
289 | else:
290 | # no weather found, because it's a dome
291 | # TODO: what's relative humidity in a dome?
292 | return {"temp": 70, "windChill": 70, "relHumidity": None, "windMPH": 0}
293 |
294 | @sportsref.decorators.memoize
295 | def pbp(self):
296 | """Returns a dataframe of the play-by-play data from the game.
297 |
298 | Order of function calls:
299 | 1. parse_table on the play-by-play table
300 | 2. expand_details
301 | - calls parse_play_details & _clean_features
302 | 3. _add_team_columns
303 | 4. various fixes to clean data
304 | 5. _add_team_features
305 |
306 | :returns: pandas DataFrame of play-by-play. Similar to GPF.
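Usage sketch (hypothetical, reusing the sample boxscore ID from
sportsref.utils.rel_url_to_id; the win-probability columns rely on
sportsref.nfl.winProb)::

    bs = BoxScore("201409070dal")
    plays = bs.pbp()
    plays[["secsElapsed", "home_wp", "home_wpa", "distToGoal"]].head()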
307 | """ 308 | doc = self.get_doc() 309 | table = doc("table#pbp") 310 | df = sportsref.utils.parse_table(table) 311 | # make the following features conveniently available on each row 312 | df["boxscore_id"] = self.boxscore_id 313 | df["home"] = self.home() 314 | df["away"] = self.away() 315 | df["season"] = self.season() 316 | df["week"] = self.week() 317 | feats = sportsref.nfl.pbp.expand_details(df) 318 | 319 | # add team and opp columns by iterating through rows 320 | df = sportsref.nfl.pbp._add_team_columns(feats) 321 | # add WPA column (requires diff, can't be done row-wise) 322 | df["home_wpa"] = df.home_wp.diff() 323 | # lag score columns, fill in 0-0 to start 324 | for col in ("home_wp", "pbp_score_hm", "pbp_score_aw"): 325 | if col in df.columns: 326 | df[col] = df[col].shift(1) 327 | df.loc[0, ["pbp_score_hm", "pbp_score_aw"]] = 0 328 | # fill in WP NaN's 329 | df.home_wp.fillna(method="ffill", inplace=True) 330 | # fix first play border after diffing/shifting for WP and WPA 331 | firstPlaysOfGame = df[df.secsElapsed == 0].index 332 | line = self.line() 333 | for i in firstPlaysOfGame: 334 | initwp = sportsref.nfl.winProb.initialWinProb(line) 335 | df.loc[i, "home_wp"] = initwp 336 | df.loc[i, "home_wpa"] = df.loc[i + 1, "home_wp"] - initwp 337 | # fix last play border after diffing/shifting for WP and WPA 338 | lastPlayIdx = df.index[-1] 339 | lastPlayWP = df.loc[lastPlayIdx, "home_wp"] 340 | # if a tie, final WP is 50%; otherwise, determined by winner 341 | winner = self.winner() 342 | finalWP = 50.0 if pd.isnull(winner) else (winner == self.home()) * 100.0 343 | df.loc[lastPlayIdx, "home_wpa"] = finalWP - lastPlayWP 344 | # fix WPA for timeouts and plays after timeouts 345 | timeouts = df[df.isTimeout].index 346 | for to in timeouts: 347 | df.loc[to, "home_wpa"] = 0.0 348 | if to + 2 in df.index: 349 | wpa = df.loc[to + 2, "home_wp"] - df.loc[to + 1, "home_wp"] 350 | else: 351 | wpa = finalWP - df.loc[to + 1, "home_wp"] 352 | df.loc[to + 1, "home_wpa"] = wpa 353 | # add team-related features to DataFrame 354 | df = sportsref.nfl.pbp._add_team_features(df) 355 | # fill distToGoal NaN's 356 | df["distToGoal"] = np.where(df.isKickoff, 65, df.distToGoal) 357 | df.distToGoal.fillna(method="bfill", inplace=True) 358 | df.distToGoal.fillna(method="ffill", inplace=True) # for last play 359 | 360 | return df 361 | 362 | @sportsref.decorators.memoize 363 | def ref_info(self): 364 | """Gets a dictionary of ref positions and the ref IDs of the refs for 365 | that game. 366 | 367 | :returns: A dictionary of ref positions and IDs. 368 | """ 369 | doc = self.get_doc() 370 | table = doc("table#officials") 371 | return sportsref.utils.parse_info_table(table) 372 | 373 | @sportsref.decorators.memoize 374 | def player_stats(self): 375 | """Gets the stats for offense, defense, returning, and kicking of 376 | individual players in the game. 377 | :returns: A DataFrame containing individual player stats. 
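Example (a sketch; the offense, defense, returns, and kicking tables are
merged on their shared columns, one row per player)::

    stats = BoxScore("201409070dal").player_stats()
    stats.set_index("player_id").head()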
378 | """ 379 | doc = self.get_doc() 380 | tableIDs = ("player_offense", "player_defense", "returns", "kicking") 381 | dfs = [] 382 | for tID in tableIDs: 383 | table = doc("table#{}".format(tID)) 384 | dfs.append(sportsref.utils.parse_table(table)) 385 | dfs = [df for df in dfs if not df.empty] 386 | df = functools.reduce( 387 | lambda x, y: pd.merge( 388 | x, y, how="outer", on=list(set(x.columns) & set(y.columns)) 389 | ), 390 | dfs, 391 | ).reset_index(drop=True) 392 | return df 393 | 394 | @sportsref.decorators.memoize 395 | def snap_counts(self): 396 | """Gets the snap counts for both teams' players and returns them in a 397 | DataFrame. Note: only goes back to 2012. 398 | 399 | :returns: DataFrame of snap count data 400 | """ 401 | # TODO: combine duplicate players, see 201312150mia - ThomDa03 402 | doc = self.get_doc() 403 | table_ids = ("vis_snap_counts", "home_snap_counts") 404 | tms = (self.away(), self.home()) 405 | df = pd.concat( 406 | [ 407 | sportsref.utils.parse_table(doc("table#{}".format(table_id))).assign( 408 | is_home=bool(i), team=tms[i], opp=tms[i * -1 + 1] 409 | ) 410 | for i, table_id in enumerate(table_ids) 411 | ] 412 | ) 413 | if df.empty: 414 | return df 415 | return df.set_index("player_id") 416 | -------------------------------------------------------------------------------- /sportsref/nfl/teams.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pyquery import PyQuery as pq 6 | 7 | import sportsref 8 | 9 | __all__ = ["team_names", "team_ids", "list_teams", "Team"] 10 | 11 | 12 | @sportsref.decorators.memoize 13 | def team_names(year): 14 | """Returns a mapping from team ID to full team name for a given season. 15 | Example of a full team name: "New England Patriots" 16 | 17 | :year: The year of the season in question (as an int). 18 | :returns: A dictionary with teamID keys and full team name values. 19 | """ 20 | doc = pq(sportsref.utils.get_html(sportsref.nfl.BASE_URL + "/teams/")) 21 | active_table = doc("table#teams_active") 22 | active_df = sportsref.utils.parse_table(active_table) 23 | inactive_table = doc("table#teams_inactive") 24 | inactive_df = sportsref.utils.parse_table(inactive_table) 25 | df = pd.concat((active_df, inactive_df)) 26 | df = df.loc[~df["has_class_partial_table"]] 27 | ids = df.team_id.str[:3].values 28 | names = [tr("th a") for tr in list(active_table("tr").items())] 29 | names.extend(tr("th a") for tr in list(inactive_table("tr").items())) 30 | names = [_f for _f in names if _f] 31 | names = [lst[0].text_content() for lst in names] 32 | # combine IDs and team names into pandas series 33 | series = pd.Series(names, index=ids) 34 | # create a mask to filter to teams from the given year 35 | mask = ((df.year_min <= year) & (year <= df.year_max)).values 36 | # filter, convert to a dict, and return 37 | return series[mask].to_dict() 38 | 39 | 40 | @sportsref.decorators.memoize 41 | def team_ids(year): 42 | """Returns a mapping from team name to team ID for a given season. Inverse 43 | mapping of team_names. Example of a full team name: "New England Patriots" 44 | 45 | :year: The year of the season in question (as an int). 46 | :returns: A dictionary with full team name keys and teamID values. 47 | """ 48 | names = team_names(year) 49 | return {v: k for k, v in list(names.items())} 50 | 51 | 52 | @sportsref.decorators.memoize 53 | def list_teams(year): 54 | """Returns a list of team IDs for a given season. 
55 | 56 | :year: The year of the season in question (as an int). 57 | :returns: A list of team IDs. 58 | """ 59 | return list(team_names(year).keys()) 60 | 61 | 62 | class Team(object, metaclass=sportsref.decorators.Cached): 63 | def __init__(self, teamID): 64 | self.teamID = teamID 65 | 66 | def __eq__(self, other): 67 | return self.teamID == other.teamID 68 | 69 | def __hash__(self): 70 | return hash(self.teamID) 71 | 72 | def __repr__(self): 73 | return "Team({})".format(self.teamID) 74 | 75 | def __str__(self): 76 | return self.name() 77 | 78 | def __reduce__(self): 79 | return Team, (self.teamID,) 80 | 81 | @sportsref.decorators.memoize 82 | def team_year_url(self, yr_str): 83 | return sportsref.nfl.BASE_URL + "/teams/{}/{}.htm".format(self.teamID, yr_str) 84 | 85 | @sportsref.decorators.memoize 86 | def get_main_doc(self): 87 | relURL = "/teams/{}".format(self.teamID) 88 | teamURL = sportsref.nfl.BASE_URL + relURL 89 | mainDoc = pq(sportsref.utils.get_html(teamURL)) 90 | return mainDoc 91 | 92 | @sportsref.decorators.memoize 93 | def get_year_doc(self, yr_str): 94 | return pq(sportsref.utils.get_html(self.team_year_url(yr_str))) 95 | 96 | @sportsref.decorators.memoize 97 | def name(self): 98 | """Returns the real name of the franchise given the team ID. 99 | 100 | Examples: 101 | 'nwe' -> 'New England Patriots' 102 | 'sea' -> 'Seattle Seahawks' 103 | 104 | :returns: A string corresponding to the team's full name. 105 | """ 106 | doc = self.get_main_doc() 107 | headerwords = doc("div#meta h1")[0].text_content().split() 108 | lastIdx = headerwords.index("Franchise") 109 | teamwords = headerwords[:lastIdx] 110 | return " ".join(teamwords) 111 | 112 | @sportsref.decorators.memoize 113 | def roster(self, year): 114 | """Returns the roster table for the given year. 115 | 116 | :year: The year for which we want the roster; defaults to current year. 117 | :returns: A DataFrame containing roster information for that year. 118 | """ 119 | doc = self.get_year_doc("{}_roster".format(year)) 120 | roster_table = doc("table#games_played_team") 121 | df = sportsref.utils.parse_table(roster_table) 122 | starter_table = doc("table#starters") 123 | if not starter_table.empty: 124 | start_df = sportsref.utils.parse_table(starter_table) 125 | start_df = start_df.dropna(axis=0, subset=["position"]) 126 | starters = start_df.set_index("position").player_id 127 | df["is_starter"] = df.player_id.isin(starters) 128 | df["starting_pos"] = df.player_id.map( 129 | lambda pid: ( 130 | starters[starters == pid].index[0] 131 | if pid in starters.values 132 | else None 133 | ) 134 | ) 135 | return df 136 | 137 | @sportsref.decorators.memoize 138 | def boxscores(self, year): 139 | """Gets list of BoxScore objects corresponding to the box scores from 140 | that year. 141 | 142 | :year: The year for which we want the boxscores; defaults to current 143 | year. 144 | :returns: np.array of strings representing boxscore IDs. 145 | """ 146 | doc = self.get_year_doc(year) 147 | table = doc("table#games") 148 | df = sportsref.utils.parse_table(table) 149 | if df.empty: 150 | return np.array([]) 151 | return df.boxscore_id.values 152 | 153 | @sportsref.decorators.memoize 154 | def _year_info_pq(self, year, keyword): 155 | """Returns a PyQuery object containing the info from the meta div at 156 | the top of the team year page with the given keyword. 157 | 158 | :year: Int representing the season. 159 | :keyword: A keyword to filter to a single p tag in the meta div. 160 | :returns: A PyQuery object for the selected p element. 
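Example (a sketch; "nwe" is the sample team ID from Team.name, and "Coach"
is the keyword used by head_coaches_by_game below)::

    Team("nwe")._year_info_pq(2015, "Coach").text()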
161 | """ 162 | doc = self.get_year_doc(year) 163 | p_tags = doc("div#meta div:not(.logo) p") 164 | texts = [p_tag.text_content().strip() for p_tag in p_tags] 165 | try: 166 | return next( 167 | pq(p_tag) 168 | for p_tag, text in zip(p_tags, texts) 169 | if keyword.lower() in text.lower() 170 | ) 171 | except StopIteration: 172 | if len(texts): 173 | raise ValueError("Keyword not found in any p tag.") 174 | else: 175 | raise ValueError("No meta div p tags found.") 176 | 177 | # TODO: add functions for OC, DC, PF, PA, W-L, etc. 178 | # TODO: Also give a function at BoxScore.homeCoach and BoxScore.awayCoach 179 | # TODO: BoxScore needs a gameNum function to do this? 180 | 181 | @sportsref.decorators.memoize 182 | def head_coaches_by_game(self, year): 183 | """Returns head coach data by game. 184 | 185 | :year: An int representing the season in question. 186 | :returns: An array with an entry per game of the season that the team 187 | played (including playoffs). Each entry is the head coach's ID for that 188 | game in the season. 189 | """ 190 | coach_str = self._year_info_pq(year, "Coach").text() 191 | regex = r"(\S+?) \((\d+)-(\d+)-(\d+)\)" 192 | coachAndTenure = [] 193 | m = True 194 | while m: 195 | m = re.search(regex, coach_str) 196 | coachID, wins, losses, ties = m.groups() 197 | # nextIndex = m.end(4) + 1 198 | # coachStr = coachStr[nextIndex:] 199 | tenure = int(wins) + int(losses) + int(ties) 200 | coachAndTenure.append((coachID, tenure)) 201 | 202 | coachIDs = [cID for cID, games in coachAndTenure for _ in range(games)] 203 | return np.array(coachIDs[::-1]) 204 | 205 | @sportsref.decorators.memoize 206 | def wins(self, year): 207 | """Returns the # of regular season wins a team in a year. 208 | 209 | :year: The year for the season in question. 210 | :returns: The number of regular season wins. 211 | """ 212 | schedule = self.schedule(year) 213 | if schedule.empty: 214 | return np.nan 215 | return schedule.query("week_num <= 17").is_win.sum() 216 | 217 | @sportsref.decorators.memoize 218 | def schedule(self, year): 219 | """Returns a DataFrame with schedule information for the given year. 220 | 221 | :year: The year for the season in question. 222 | :returns: Pandas DataFrame with schedule information. 223 | """ 224 | doc = self.get_year_doc(year) 225 | table = doc("table#games") 226 | df = sportsref.utils.parse_table(table) 227 | if df.empty: 228 | return pd.DataFrame() 229 | df = df.loc[df["week_num"].notnull()] 230 | df["week_num"] = np.arange(len(df)) + 1 231 | df["is_win"] = df["game_outcome"] == "W" 232 | df["is_loss"] = df["game_outcome"] == "L" 233 | df["is_tie"] = df["game_outcome"] == "T" 234 | df["is_bye"] = df["game_outcome"].isnull() 235 | df["is_ot"] = df["overtime"].notnull() 236 | return df 237 | 238 | @sportsref.decorators.memoize 239 | def srs(self, year): 240 | """Returns the SRS (Simple Rating System) for a team in a year. 241 | 242 | :year: The year for the season in question. 243 | :returns: A float of SRS. 244 | """ 245 | try: 246 | srs_text = self._year_info_pq(year, "SRS").text() 247 | except ValueError: 248 | return None 249 | m = re.match(r"SRS\s*?:\s*?(\S+)", srs_text) 250 | if m: 251 | return float(m.group(1)) 252 | else: 253 | return None 254 | 255 | @sportsref.decorators.memoize 256 | def sos(self, year): 257 | """Returns the SOS (Strength of Schedule) for a team in a year, based 258 | on SRS. 259 | 260 | :year: The year for the season in question. 261 | :returns: A float of SOS. 
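Example (a sketch, pairing SOS with the SRS rating it is based on)::

    team = Team("nwe")
    rating, schedule_strength = team.srs(2015), team.sos(2015)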
262 | """ 263 | try: 264 | sos_text = self._year_info_pq(year, "SOS").text() 265 | except ValueError: 266 | return None 267 | m = re.search(r"SOS\s*:\s*(\S+)", sos_text) 268 | if m: 269 | return float(m.group(1)) 270 | else: 271 | return None 272 | 273 | @sportsref.decorators.memoize 274 | def off_coordinator(self, year): 275 | """Returns the coach ID for the team's OC in a given year. 276 | 277 | :year: An int representing the year. 278 | :returns: A string containing the coach ID of the OC. 279 | """ 280 | try: 281 | oc_anchor = self._year_info_pq(year, "Offensive Coordinator")("a") 282 | if oc_anchor: 283 | return oc_anchor.attr["href"] 284 | except ValueError: 285 | return None 286 | 287 | @sportsref.decorators.memoize 288 | def def_coordinator(self, year): 289 | """Returns the coach ID for the team's DC in a given year. 290 | 291 | :year: An int representing the year. 292 | :returns: A string containing the coach ID of the DC. 293 | """ 294 | try: 295 | dc_anchor = self._year_info_pq(year, "Defensive Coordinator")("a") 296 | if dc_anchor: 297 | return dc_anchor.attr["href"] 298 | except ValueError: 299 | return None 300 | 301 | @sportsref.decorators.memoize 302 | def stadium(self, year): 303 | """Returns the ID for the stadium in which the team played in a given 304 | year. 305 | 306 | :year: The year in question. 307 | :returns: A string representing the stadium ID. 308 | """ 309 | anchor = self._year_info_pq(year, "Stadium")("a") 310 | return sportsref.utils.rel_url_to_id(anchor.attr["href"]) 311 | 312 | @sportsref.decorators.memoize 313 | def off_scheme(self, year): 314 | """Returns the name of the offensive scheme the team ran in the given 315 | year. 316 | 317 | :year: Int representing the season year. 318 | :returns: A string representing the offensive scheme. 319 | """ 320 | scheme_text = self._year_info_pq(year, "Offensive Scheme").text() 321 | m = re.search(r"Offensive Scheme[:\s]*(.+)\s*", scheme_text, re.I) 322 | if m: 323 | return m.group(1) 324 | else: 325 | return None 326 | 327 | @sportsref.decorators.memoize 328 | def def_alignment(self, year): 329 | """Returns the name of the defensive alignment the team ran in the 330 | given year. 331 | 332 | :year: Int representing the season year. 333 | :returns: A string representing the defensive alignment. 334 | """ 335 | scheme_text = self._year_info_pq(year, "Defensive Alignment").text() 336 | m = re.search(r"Defensive Alignment[:\s]*(.+)\s*", scheme_text, re.I) 337 | if m: 338 | return m.group(1) 339 | else: 340 | return None 341 | 342 | @sportsref.decorators.memoize 343 | def team_stats(self, year): 344 | """Returns a Series (dict-like) of team stats from the team-season 345 | page. 346 | 347 | :year: Int representing the season. 348 | :returns: A Series of team stats. 349 | """ 350 | doc = self.get_year_doc(year) 351 | table = doc("table#team_stats") 352 | df = sportsref.utils.parse_table(table) 353 | if df.empty: 354 | return pd.Series() 355 | return df.loc[df.player_id == "Team Stats"].iloc[0] 356 | 357 | @sportsref.decorators.memoize 358 | def opp_stats(self, year): 359 | """Returns a Series (dict-like) of the team's opponent's stats from the 360 | team-season page. 361 | 362 | :year: Int representing the season. 363 | :returns: A Series of team stats. 364 | """ 365 | doc = self.get_year_doc(year) 366 | table = doc("table#team_stats") 367 | df = sportsref.utils.parse_table(table) 368 | return df.loc[df.player_id == "Opp. 
Stats"].iloc[0] 369 | 370 | @sportsref.decorators.memoize 371 | def passing(self, year): 372 | doc = self.get_year_doc(year) 373 | table = doc("table#passing") 374 | df = sportsref.utils.parse_table(table) 375 | return df 376 | 377 | @sportsref.decorators.memoize 378 | def rushing_and_receiving(self, year): 379 | doc = self.get_year_doc(year) 380 | table = doc("#rushing_and_receiving") 381 | df = sportsref.utils.parse_table(table) 382 | return df 383 | 384 | @sportsref.decorators.memoize 385 | def off_splits(self, year): 386 | """Returns a DataFrame of offensive team splits for a season. 387 | 388 | :year: int representing the season. 389 | :returns: Pandas DataFrame of split data. 390 | """ 391 | doc = self.get_year_doc("{}_splits".format(year)) 392 | tables = doc("table.stats_table") 393 | dfs = [sportsref.utils.parse_table(table) for table in list(tables.items())] 394 | dfs = [ 395 | df.assign(split=df.columns[0]).rename( 396 | columns={df.columns[0]: "split_value"} 397 | ) 398 | for df in dfs 399 | ] 400 | if not dfs: 401 | return pd.DataFrame() 402 | return pd.concat(dfs).reset_index(drop=True) 403 | 404 | @sportsref.decorators.memoize 405 | def def_splits(self, year): 406 | """Returns a DataFrame of defensive team splits (i.e. opponent splits) 407 | for a season. 408 | 409 | :year: int representing the season. 410 | :returns: Pandas DataFrame of split data. 411 | """ 412 | doc = self.get_year_doc("{}_opp_splits".format(year)) 413 | tables = doc("table.stats_table") 414 | dfs = [sportsref.utils.parse_table(table) for table in list(tables.items())] 415 | dfs = [ 416 | df.assign(split=df.columns[0]).rename( 417 | columns={df.columns[0]: "split_value"} 418 | ) 419 | for df in dfs 420 | ] 421 | if not dfs: 422 | return pd.DataFrame() 423 | return pd.concat(dfs).reset_index(drop=True) 424 | -------------------------------------------------------------------------------- /sportsref/nba/boxscores.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import re 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pyquery import PyQuery as pq 7 | 8 | import sportsref 9 | 10 | CLOCK_REGEX = re.compile(r"(\d+):(\d+)\.(\d+)") 11 | 12 | 13 | class BoxScore(object, metaclass=sportsref.decorators.Cached): 14 | def __init__(self, boxscore_id): 15 | self.boxscore_id = boxscore_id 16 | 17 | def __eq__(self, other): 18 | return self.boxscore_id == other.boxscore_id 19 | 20 | def __hash__(self): 21 | return hash(self.boxscore_id) 22 | 23 | def __repr__(self): 24 | return f"BoxScore({self.boxscore_id})" 25 | 26 | @sportsref.decorators.memoize 27 | def get_main_doc(self): 28 | url = f"{sportsref.nba.BASE_URL}/boxscores/{self.boxscore_id}.html" 29 | doc = pq(sportsref.utils.get_html(url)) 30 | return doc 31 | 32 | @sportsref.decorators.memoize 33 | def get_subpage_doc(self, page): 34 | url = f"{sportsref.nba.BASE_URL}/boxscores/{page}/{self.boxscore_id}.html" 35 | doc = pq(sportsref.utils.get_html(url)) 36 | return doc 37 | 38 | @sportsref.decorators.memoize 39 | def date(self): 40 | """Returns the date of the game. See Python datetime.date documentation 41 | for more. 42 | :returns: A datetime.date object with year, month, and day attributes. 
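Example (a sketch; NBA boxscore IDs begin with the game date)::

    BoxScore("201604130PHO").date()  # datetime.date(2016, 4, 13)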
43 | """ 44 | match = re.match(r"(\d{4})(\d{2})(\d{2})", self.boxscore_id) 45 | year, month, day = list(map(int, match.groups())) 46 | return datetime.date(year=year, month=month, day=day) 47 | 48 | @sportsref.decorators.memoize 49 | def weekday(self): 50 | days = [ 51 | "Monday", 52 | "Tuesday", 53 | "Wednesday", 54 | "Thursday", 55 | "Friday", 56 | "Saturday", 57 | "Sunday", 58 | ] 59 | date = self.date() 60 | wd = date.weekday() 61 | return days[wd] 62 | 63 | @sportsref.decorators.memoize 64 | def linescore(self): 65 | """Returns the linescore for the game as a DataFrame.""" 66 | doc = self.get_main_doc() 67 | table = doc("table#line_score") 68 | df = sportsref.utils.parse_table(table) 69 | df.index = ["away", "home"] 70 | return df 71 | 72 | @sportsref.decorators.memoize 73 | def home(self): 74 | """Returns home team ID. 75 | :returns: 3-character string representing home team's ID. 76 | """ 77 | linescore = self.linescore() 78 | return linescore.loc["home", "team_id"] 79 | 80 | @sportsref.decorators.memoize 81 | def away(self): 82 | """Returns away team ID. 83 | :returns: 3-character string representing away team's ID. 84 | """ 85 | linescore = self.linescore() 86 | return linescore.loc["away", "team_id"] 87 | 88 | @sportsref.decorators.memoize 89 | def home_score(self): 90 | """Returns score of the home team. 91 | :returns: int of the home score. 92 | """ 93 | linescore = self.linescore() 94 | return linescore.loc["home", "T"] 95 | 96 | @sportsref.decorators.memoize 97 | def away_score(self): 98 | """Returns score of the away team. 99 | :returns: int of the away score. 100 | """ 101 | linescore = self.linescore() 102 | return linescore.loc["away", "T"] 103 | 104 | @sportsref.decorators.memoize 105 | def winner(self): 106 | """Returns the team ID of the winning team. Returns NaN if a tie.""" 107 | hm_score = self.home_score() 108 | aw_score = self.away_score() 109 | if hm_score > aw_score: 110 | return self.home() 111 | elif hm_score < aw_score: 112 | return self.away() 113 | else: 114 | return None 115 | 116 | @sportsref.decorators.memoize 117 | def season(self): 118 | """ 119 | Returns the year ID of the season in which this game took place. 120 | 121 | :returns: An int representing the year of the season. 122 | """ 123 | d = self.date() 124 | if d.month >= 9: 125 | return d.year + 1 126 | else: 127 | return d.year 128 | 129 | def _get_player_stats(self, table_id_fmt): 130 | """Returns a DataFrame of player stats from the game (either basic or 131 | advanced, depending on the argument. 132 | 133 | :param table_id_fmt: Format string for str.format with a placeholder 134 | for the team ID (e.g. 
'box-{}-game-basic') 135 | :returns: DataFrame of player stats 136 | """ 137 | 138 | # get data 139 | doc = self.get_main_doc() 140 | tms = self.away(), self.home() 141 | team_table_ids = [table_id_fmt.format(tm.upper()) for tm in tms] 142 | tables = [doc(f"table#{table_id}") for table_id in team_table_ids] 143 | dfs = [sportsref.utils.parse_table(table) for table in tables] 144 | 145 | # clean data and add features 146 | for i, (tm, df) in enumerate(zip(tms, dfs)): 147 | no_time = df["mp"] == 0 148 | stat_cols = [ 149 | col for col, dtype in list(df.dtypes.items()) if dtype != "object" 150 | ] 151 | df.loc[no_time, stat_cols] = 0 152 | df["team_id"] = tm 153 | df["is_home"] = i == 1 154 | df["is_starter"] = [p < 5 for p in range(df.shape[0])] 155 | df.drop_duplicates(subset="player_id", keep="first", inplace=True) 156 | 157 | return pd.concat(dfs) 158 | 159 | @sportsref.decorators.memoize 160 | def basic_stats(self): 161 | """Returns a DataFrame of basic player stats from the game.""" 162 | return self._get_player_stats("box-{}-game-basic") 163 | 164 | @sportsref.decorators.memoize 165 | def advanced_stats(self): 166 | """Returns a DataFrame of advanced player stats from the game.""" 167 | return self._get_player_stats("box-{}-game-advanced") 168 | 169 | @sportsref.decorators.memoize 170 | def pbp(self, dense_lineups=False, sparse_lineups=False): 171 | """Returns a dataframe of the play-by-play data from the game. 172 | 173 | :param dense_lineups: If True, adds 10 columns containing the names of 174 | the players on the court. Defaults to False. 175 | :param sparse_lineups: If True, adds binary columns denoting whether a 176 | given player is in the game at the time of a pass. Defaults to 177 | False. 178 | :returns: pandas DataFrame of play-by-play. Similar to GPF. 
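Usage sketch (hypothetical game ID; the columns shown are among those
constructed below)::

    bs = BoxScore("201604130PHO")
    plays = bs.pbp(dense_lineups=False, sparse_lineups=False)
    plays[["quarter", "clock_str", "poss_id", "hm_score"]].tail()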
179 | """ 180 | try: 181 | doc = self.get_subpage_doc("pbp") 182 | except Exception: 183 | raise ValueError( 184 | f"Error fetching PBP subpage for boxscore {self.boxscore_id}" 185 | ) 186 | 187 | table = doc("table#pbp") 188 | trs = [ 189 | tr 190 | for tr in list(table("tr").items()) 191 | if ( 192 | not tr.attr["class"] 193 | or (tr.attr["id"] and tr.attr["id"].startswith("q")) 194 | ) 195 | ] 196 | rows = [tr.children("td") for tr in trs] 197 | n_rows = len(trs) 198 | data = [] 199 | cur_qtr = 0 200 | 201 | for i in range(n_rows): 202 | tr = trs[i] 203 | row = rows[i] 204 | play = {} 205 | 206 | # increment cur_qtr when we hit a new quarter 207 | if tr.attr["id"] and tr.attr["id"].startswith("q"): 208 | assert int(tr.attr["id"][1:]) == cur_qtr + 1 209 | cur_qtr += 1 210 | continue 211 | 212 | # add time of play to entry 213 | clock_str = row.eq(0).text() 214 | mins, secs, tenths = list( 215 | map(int, re.match(CLOCK_REGEX, clock_str).groups()) 216 | ) 217 | secs_in_period = 12 * 60 * min(cur_qtr, 4) + 5 * 60 * ( 218 | cur_qtr - 4 if cur_qtr > 4 else 0 219 | ) 220 | secs_elapsed = secs_in_period - (60 * mins + secs + 0.1 * tenths) 221 | play["secs_elapsed"] = secs_elapsed 222 | play["clock_str"] = clock_str 223 | play["quarter"] = cur_qtr 224 | 225 | # handle single play description 226 | # ex: beginning/end of quarter, jump ball 227 | if row.length == 2: 228 | desc = row.eq(1) 229 | # handle jump balls 230 | if desc.text().lower().startswith("jump ball: "): 231 | play["is_jump_ball"] = True 232 | jump_ball_str = sportsref.utils.flatten_links(desc) 233 | play.update( 234 | sportsref.nba.pbp.parse_play( 235 | self.boxscore_id, jump_ball_str, is_home=None 236 | ) 237 | ) 238 | # ignore rows marking beginning/end of quarters 239 | elif desc.text().lower().startswith( 240 | "start of " 241 | ) or desc.text().lower().startswith("end of "): 242 | continue 243 | # if another case, log and continue 244 | else: 245 | if not desc.text().lower().startswith("end of "): 246 | print( 247 | f"{self.boxscore_id}, Q{cur_qtr}, {clock_str} other case: {desc.text()}" 248 | ) 249 | continue 250 | 251 | # handle team play description 252 | # ex: shot, turnover, rebound, foul, sub, etc. 
253 | elif row.length == 6: 254 | aw_desc, hm_desc = row.eq(1), row.eq(5) 255 | is_hm_play = bool(hm_desc.text()) 256 | desc = hm_desc if is_hm_play else aw_desc 257 | desc = sportsref.utils.flatten_links(desc) 258 | # parse the play 259 | new_play = sportsref.nba.pbp.parse_play( 260 | self.boxscore_id, desc, is_hm_play 261 | ) 262 | if not new_play: 263 | continue 264 | elif isinstance(new_play, list): 265 | # this happens when a row needs to be expanded to 2 rows; 266 | # ex: double personal foul -> two PF rows 267 | 268 | # first, update and append the first row 269 | orig_play = dict(play) 270 | play.update(new_play[0]) 271 | data.append(play) 272 | # second, set up the second row to be appended below 273 | play = orig_play 274 | new_play = new_play[1] 275 | elif new_play.get("is_error"): 276 | print(f"can't parse: {desc}, boxscore: {self.boxscore_id}") 277 | # import pdb; pdb.set_trace() 278 | play.update(new_play) 279 | 280 | # otherwise, I don't know what this was 281 | else: 282 | raise Exception(f"don't know how to handle row of length {row.length}") 283 | 284 | data.append(play) 285 | 286 | # convert to DataFrame and clean columns 287 | df = pd.DataFrame.from_records(data) 288 | df.sort_values("secs_elapsed", inplace=True, kind="mergesort") 289 | df = sportsref.nba.pbp.clean_features(df) 290 | 291 | # add columns for home team, away team, boxscore_id, date 292 | away, home = self.away(), self.home() 293 | df["home"] = home 294 | df["away"] = away 295 | df["boxscore_id"] = self.boxscore_id 296 | df["season"] = self.season() 297 | date = self.date() 298 | df["year"] = date.year 299 | df["month"] = date.month 300 | df["day"] = date.day 301 | 302 | def _clean_rebs(df): 303 | df.reset_index(drop=True, inplace=True) 304 | no_reb_after = ( 305 | ((df.fta_num < df.tot_fta) | df.is_ftm | df.get("is_tech_fta", False)) 306 | .shift(1) 307 | .fillna(False) 308 | ) 309 | no_reb_before = ((df.fta_num == df.tot_fta)).shift(-1).fillna(False) 310 | se_end_qtr = df.loc[df.clock_str == "0:00.0", "secs_elapsed"].unique() 311 | no_reb_when = df.secs_elapsed.isin(se_end_qtr) 312 | drop_mask = (df.rebounder == "Team") & ( 313 | no_reb_after | no_reb_before | no_reb_when 314 | ) 315 | df.drop(df.loc[drop_mask].index, axis=0, inplace=True) 316 | df.reset_index(drop=True, inplace=True) 317 | return df 318 | 319 | # get rid of 'rebounds' after FTM, non-final FTA, or tech FTA 320 | df = _clean_rebs(df) 321 | 322 | # track possession number for each possession 323 | # TODO: see 201604130PHO, secs_elapsed == 2756 324 | # things that end a poss: 325 | # FGM, dreb, TO, end of Q, made last FT, lost jump ball, 326 | # def goaltending, shot clock violation 327 | new_poss = (df.off_team == df.home).diff().fillna(False) 328 | # def rebound considered part of the new possession 329 | df["poss_id"] = np.cumsum(new_poss) + df.is_dreb 330 | # create poss_id with rebs -> new possessions for granular groupbys 331 | poss_id_reb = np.cumsum(new_poss | df.is_reb) 332 | 333 | # make sure plays with the same clock time are in the right order 334 | # TODO: make sort_cols depend on what cols are in the play? 
335 | # or combine related plays, like and-1 shot and foul 336 | # issues come up with FGA after timeout in 201604130LAL 337 | # issues come up with PF between FGA and DREB in 201604120SAS 338 | sort_cols = [ 339 | col 340 | for col in [ 341 | "is_reb", 342 | "is_fga", 343 | "is_pf", 344 | "is_tech_foul", 345 | "is_ejection", 346 | "is_tech_fta", 347 | "is_timeout", 348 | "is_pf_fta", 349 | "fta_num", 350 | "is_viol", 351 | "is_to", 352 | "is_jump_ball", 353 | "is_sub", 354 | ] 355 | if col in df.columns 356 | ] 357 | asc_true = ["fta_num"] 358 | ascend = [(col in asc_true) for col in sort_cols] 359 | for label, group in df.groupby([df.secs_elapsed, poss_id_reb]): 360 | if len(group) > 1: 361 | df.loc[group.index, :] = group.sort_values( 362 | sort_cols, ascending=ascend, kind="mergesort" 363 | ).values 364 | 365 | # 2nd pass: get rid of 'rebounds' after FTM, non-final FTA, etc. 366 | df = _clean_rebs(df) 367 | 368 | # makes sure off/def and poss_id are correct for subs after rearranging 369 | # some possessions above 370 | df.loc[df["is_sub"], ["off_team", "def_team", "poss_id"]] = np.nan 371 | df.off_team.fillna(method="bfill", inplace=True) 372 | df.def_team.fillna(method="bfill", inplace=True) 373 | df.poss_id.fillna(method="bfill", inplace=True) 374 | # make off_team and def_team NaN for jump balls 375 | if "is_jump_ball" in df.columns: 376 | df.loc[df["is_jump_ball"], ["off_team", "def_team"]] = np.nan 377 | 378 | # make sure 'off_team' is always the team shooting FTs, even on techs 379 | # (impt for keeping track of the score) 380 | if "is_tech_fta" in df.columns: 381 | tech_fta = df["is_tech_fta"] 382 | df.loc[tech_fta, "off_team"] = df.loc[tech_fta, "fta_team"] 383 | df.loc[tech_fta, "def_team"] = np.where( 384 | df.loc[tech_fta, "off_team"] == home, away, home 385 | ) 386 | df.drop("fta_team", axis=1, inplace=True) 387 | # redefine poss_id_reb 388 | new_poss = (df.off_team == df.home).diff().fillna(False) 389 | poss_id_reb = np.cumsum(new_poss | df.is_reb) 390 | 391 | # get rid of redundant subs 392 | for (se, tm, pnum), group in df[df.is_sub].groupby( 393 | [df.secs_elapsed, df.sub_team, poss_id_reb] 394 | ): 395 | if len(group) > 1: 396 | sub_in = set() 397 | sub_out = set() 398 | # first, figure out who's in and who's out after subs 399 | for i, row in group.iterrows(): 400 | if row["sub_in"] in sub_out: 401 | sub_out.remove(row["sub_in"]) 402 | else: 403 | sub_in.add(row["sub_in"]) 404 | if row["sub_out"] in sub_in: 405 | sub_in.remove(row["sub_out"]) 406 | else: 407 | sub_out.add(row["sub_out"]) 408 | assert len(sub_in) == len(sub_out) 409 | # second, add those subs 410 | n_subs = len(sub_in) 411 | for idx, p_in, p_out in zip(group.index[:n_subs], sub_in, sub_out): 412 | assert df.loc[idx, "is_sub"] 413 | df.loc[idx, "sub_in"] = p_in 414 | df.loc[idx, "sub_out"] = p_out 415 | df.loc[idx, "sub_team"] = tm 416 | df.loc[idx, "detail"] = f"{p_in} enters the game for {p_out}" 417 | # third, if applicable, remove old sub entries when there are 418 | # redundant subs 419 | n_extra = len(group) - len(sub_in) 420 | if n_extra: 421 | extra_idxs = group.index[-n_extra:] 422 | df.drop(extra_idxs, axis=0, inplace=True) 423 | 424 | df.reset_index(drop=True, inplace=True) 425 | 426 | # add column for pts and score 427 | df["pts"] = df["is_ftm"] + 2 * df["is_fgm"] + (df["is_fgm"] & df["is_three"]) 428 | df["hm_pts"] = np.where(df.off_team == df.home, df.pts, 0) 429 | df["aw_pts"] = np.where(df.off_team == df.away, df.pts, 0) 430 | df["hm_score"] = np.cumsum(df["hm_pts"]) 431 | 
df["aw_score"] = np.cumsum(df["aw_pts"]) 432 | 433 | # more helpful columns 434 | # "play" is differentiated from "poss" by counting OReb as new play 435 | # "plays" end with non-and1 FGA, TO, last non-tech FTA, or end of qtr 436 | # (or double lane viol) 437 | new_qtr = df.quarter.diff().shift(-1).fillna(False).astype(bool) # noqa 438 | and1 = ( # noqa 439 | df.is_fgm 440 | & df.is_pf.shift(-1).fillna(False) 441 | & df.is_fta.shift(-2).fillna(False) 442 | & ~df.secs_elapsed.diff().shift(-1).fillna(False).astype(bool) 443 | ) 444 | double_lane = df.get("viol_type") == "double lane" # noqa 445 | 446 | new_play = df.eval( 447 | "(is_fga & ~(@and1)) | is_to | @new_qtr |" 448 | "(is_fta & ~is_tech_fta & fta_num == tot_fta) |" 449 | "@double_lane" 450 | ) 451 | df["play_id"] = np.cumsum(new_play).shift(1).fillna(0) 452 | df["hm_off"] = df.off_team == df.home 453 | 454 | # get lineup data 455 | if dense_lineups: 456 | df = pd.concat((df, sportsref.nba.pbp.get_dense_lineups(df)), axis=1) 457 | if sparse_lineups: 458 | df = pd.concat((df, sportsref.nba.pbp.get_sparse_lineups(df)), axis=1) 459 | 460 | # TODO: add shot clock as a feature 461 | 462 | return df 463 | -------------------------------------------------------------------------------- /sportsref/nfl/pbp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | import sportsref 8 | 9 | RUSH_OPTS = { 10 | "left end": "LE", 11 | "left tackle": "LT", 12 | "left guard": "LG", 13 | "up the middle": "M", 14 | "middle": "M", 15 | "right end": "RE", 16 | "right tackle": "RT", 17 | "right guard": "RG", 18 | } 19 | PASS_OPTS = { 20 | "short left": "SL", 21 | "short middle": "SM", 22 | "short right": "SR", 23 | "deep left": "DL", 24 | "deep middle": "DM", 25 | "deep right": "DR", 26 | } 27 | 28 | 29 | def expand_details(df, detailCol="detail"): 30 | """Expands the details column of the given dataframe and returns the 31 | resulting DataFrame. 32 | 33 | :df: The input DataFrame. 34 | :detailCol: The detail column name. 35 | :returns: Returns DataFrame with new columns from pbp parsing. 36 | """ 37 | df = copy.deepcopy(df) 38 | df["detail"] = df[detailCol] 39 | dicts = [ 40 | sportsref.nfl.pbp.parse_play_details(detail) for detail in df["detail"].values 41 | ] 42 | # clean up unmatched details 43 | cols = {c for d in dicts if d for c in list(d.keys())} 44 | blankEntry = {c: np.nan for c in cols} 45 | newDicts = [d if d else blankEntry for d in dicts] 46 | # get details DataFrame and merge it with original to create main DataFrame 47 | details = pd.DataFrame(newDicts) 48 | df = pd.merge(df, details, left_index=True, right_index=True) 49 | # add isError column 50 | errors = [i for i, d in enumerate(dicts) if d is None] 51 | df["isError"] = False 52 | df.loc[errors, "isError"] = True 53 | # fill in some NaN's necessary for _clean_features 54 | df.loc[0, "qtr_time_remain"] = "15:00" 55 | df.qtr_time_remain.fillna(method="bfill", inplace=True) 56 | df.qtr_time_remain.fillna( 57 | pd.Series(np.where(df.quarter == 4, "0:00", "15:00")), inplace=True 58 | ) 59 | # use _clean_features to clean up and add columns 60 | new_df = df.apply(_clean_features, axis=1) 61 | return new_df 62 | 63 | 64 | @sportsref.decorators.memoize 65 | def parse_play_details(details): 66 | """Parses play details from play-by-play string and returns structured 67 | data. 
68 | 
69 |     :details: detail string for play
70 |     :returns: dictionary of play attributes
71 |     """
72 | 
73 |     # if input isn't a string, return None
74 |     if not isinstance(details, str):
75 |         return None
76 | 
77 |     rushOptRE = r"(?P<rushDir>{})".format(r"|".join(list(RUSH_OPTS.keys())))
78 |     passOptRE = r"(?P<passLoc>{})".format(r"|".join(list(PASS_OPTS.keys())))
79 | 
80 |     playerRE = r"\S{6,8}\d{2}"
81 | 
82 |     # initialize return dictionary - struct
83 |     struct = {}
84 | 
85 |     # handle challenges
86 |     # TODO: record the play both before & after an overturned challenge
87 |     challengeRE = re.compile(
88 |         r".+\. (?P<challenger>.+?) challenged.*? the play was "
89 |         r"(?P<callUpheld>upheld|overturned)\.",
90 |         re.IGNORECASE,
91 |     )
92 |     match = challengeRE.search(details)
93 |     if match:
94 |         struct["isChallenge"] = True
95 |         struct.update(match.groupdict())
96 |         # if overturned, only record updated play
97 |         if "overturned" in details:
98 |             overturnedIdx = details.index("overturned.")
99 |             newStart = overturnedIdx + len("overturned.")
100 |             details = details[newStart:].strip()
101 |     else:
102 |         struct["isChallenge"] = False
103 | 
104 |     # TODO: expand on laterals
105 |     struct["isLateral"] = details.find("lateral") != -1
106 | 
107 |     # create rushing regex
108 |     rusherRE = r"(?P<rusher>{0})".format(playerRE)
109 |     rushOptRE = r"(?: {})?".format(rushOptRE)
110 |     rushYardsRE = r"(?:(?:(?P<rushYds>\-?\d+) yards?)|(?:no gain))"
111 |     # cases: tackle, fumble, td, penalty
112 |     tackleRE = (
113 |         r"(?: \(tackle by (?P<tackler1>{0})"
114 |         r"(?: and (?P<tackler2>{0}))?\))?".format(playerRE)
115 |     )
116 |     # currently, plays with multiple fumbles record the original fumbler
117 |     # and the final fumble recoverer
118 |     fumbleRE = (
119 |         r"(?:"
120 |         r"\.? ?(?P<fumbler>{0}) fumbles"
121 |         r"(?: \(forced by (?P<fumbForcer>{0})\))?"
122 |         r"(?:.*, recovered by (?P<fumbRecoverer>{0}) at )?"
123 |         r"(?:, ball out of bounds at )?"
124 |         r"(?:(?P<fumbRecFieldSide>[a-z]+)?\-?(?P<fumbRecYdLine>\-?\d+))?"
125 |         r"(?: and returned for (?P<fumbRetYds>\-?\d*) yards)?"
126 |         r")?".format(playerRE)
127 |     )
128 |     tdSafetyRE = r"(?:(?P<isTD>, touchdown)|(?P<isSafety>, safety))?"
129 |     # TODO: offsetting penalties
130 |     penaltyRE = (
131 |         r"(?:.*?"
132 |         r"\. Penalty on (?P<penOn>{0}|): "
133 |         r"(?P<penalty>[^\(,]+)"
134 |         r"(?: \((?P<penDeclined>Declined)\)|"
135 |         r", (?P<penYds>\d*) yards?)"
136 |         r"(?: \(no play\))?"
137 |         r")?".format(playerRE)
138 |     )
139 | 
140 |     rushREstr = (r"{}{}(?: for {}{}{}{}{})?").format(
141 |         rusherRE, rushOptRE, rushYardsRE, tackleRE, fumbleRE, tdSafetyRE, penaltyRE
142 |     )
143 |     rushRE = re.compile(rushREstr, re.IGNORECASE)
144 | 
145 |     # create passing regex
146 |     # TODO: capture "defended by X" for defensive stats
147 |     passerRE = r"(?P<passer>{0})".format(playerRE)
148 |     sackRE = (
149 |         r"(?:sacked (?:by (?P<sacker1>{0})(?: and (?P<sacker2>{0}))? )?"
150 |         r"for (?P<sackYds>\-?\d+) yards?)".format(playerRE)
151 |     )
152 |     # create throw RE
153 |     completeRE = r"pass (?P<isComplete>(?:in)?complete)"
154 |     passOptRE = r"(?: {})?".format(passOptRE)
155 |     targetedRE = r"(?: (?:to |intended for )?(?P<target>{0}))?".format(playerRE)
156 |     passYardsRE = r"(?: for (?:(?P<passYds>\-?\d+) yards?|no gain))"
157 |     intRE = (
158 |         r"(?: is intercepted by (?P<interceptor>{0}) at ".format(playerRE)
159 |         + r"(?:(?P<intFieldSide>[a-z]*)?\-?(?P<intYdLine>\-?\d*))?"
160 |         + r"(?: and returned for (?P<intRetYds>\-?\d+) yards?\.?)?)?"
161 |     )
162 |     throwRE = r"(?:{}{}{}(?:(?:{}|{}){})?)".format(
163 |         completeRE, passOptRE, targetedRE, passYardsRE, intRE, tackleRE
164 |     )
165 |     passREstr = (r"{} (?:{}|{})(?:{}{}{})?").format(
166 |         passerRE, sackRE, throwRE, fumbleRE, tdSafetyRE, penaltyRE
167 |     )
168 |     passRE = re.compile(passREstr, re.IGNORECASE)
169 | 
170 |     # create kickoff regex
171 |     koKickerRE = r"(?P<koKicker>{0})".format(playerRE)
172 |     koYardsRE = (
173 |         r" kicks (?:off|(?P<isOnside>onside))" r" (?:(?P<koYds>\d+) yards?|no gain)"
174 |     )
175 |     nextREs = []
176 |     nextREs.append(
177 |         (
178 |             r", (?:returned|recovered) by (?P<koReturner>{0})(?: for "
179 |             r"(?:(?P<koRetYds>\-?\d+) yards?|no gain))?"
180 |         ).format(playerRE)
181 |     )
182 |     nextREs.append(
183 |         (
184 |             r"(?P<isMuffedCatch>, muffed catch by )(?P<muffedBy>{0}),"
185 |             r"(?: recovered by (?P<muffRecoverer>{0}))?"
186 |         ).format(playerRE)
187 |         + r"(?: and returned for (?:(?P<muffRetYds>\-?\d+) yards|no gain))?"
188 |     )
189 |     nextREs.append(r", recovered by (?P<koRecoverer>{0})".format(playerRE))
190 |     nextREs.append(r"(?P<oob>, out of bounds)")
191 |     nextREs.append(r"(?P<isTouchback>, touchback)")
192 |     # TODO: test the following line to fix a small subset of cases
193 |     # (ex: muff -> oob)
194 |     nextRE = "".join(r"(?:{})?".format(nre) for nre in nextREs)
195 |     kickoffREstr = r"{}{}{}{}{}{}{}".format(
196 |         koKickerRE, koYardsRE, nextRE, tackleRE, fumbleRE, tdSafetyRE, penaltyRE
197 |     )
198 |     kickoffRE = re.compile(kickoffREstr, re.IGNORECASE)
199 | 
200 |     # create timeout regex
201 |     timeoutREstr = r"Timeout #(?P<timeoutNum>\d) by (?P<timeoutTeam>.+)"
202 |     timeoutRE = re.compile(timeoutREstr, re.IGNORECASE)
203 | 
204 |     # create FG regex
205 |     fgKickerRE = r"(?P<fgKicker>{0})".format(playerRE)
206 |     fgBaseRE = r" (?P<fgDist>\d+) yard field goal" r" (?P<fgGood>good|no good)"
207 |     fgBlockRE = (
208 |         r"(?:, (?P<isBlocked>blocked) by "
209 |         r"(?P<fgBlocker>{0}))?".format(playerRE)
210 |         + r"(?:, recovered by (?P<fgBlockRecoverer>{0}))?".format(playerRE)
211 |         + r"(?: and returned for (?:(?P<fgBlockRetYds>\-?\d+) yards?|no gain))?"
212 |     )
213 |     fgREstr = r"{}{}{}{}{}".format(
214 |         fgKickerRE, fgBaseRE, fgBlockRE, tdSafetyRE, penaltyRE
215 |     )
216 |     fgRE = re.compile(fgREstr, re.IGNORECASE)
217 | 
218 |     # create punt regex
219 |     punterRE = r".*?(?P<punter>{0})".format(playerRE)
220 |     puntBlockRE = (
221 |         (
222 |             r" punts, (?P<isBlocked>blocked) by (?P<puntBlocker>{0})"
223 |             r"(?:, recovered by (?P<puntBlockRecoverer>{0})"
224 |         ).format(playerRE)
225 |         + r"(?: and returned (?:(?P<puntBlockRetYds>\-?\d+) yards|no gain))?)?"
226 |     )
227 |     puntYdsRE = r" punts (?P<puntYds>\d+) yards?"
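    # (note: as with the kickoff pattern above, the punt pattern below ORs
    # together several mutually exclusive outcomes -- fair catch, out of
    # bounds, muffed catch, or a normal return -- so at most one of those
    # groups is populated for a given play)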
228 |     nextREs = []
229 |     nextREs.append(
230 |         r", (?P<isFairCatch>fair catch) by (?P<fairCatcher>{0})".format(playerRE)
231 |     )
232 |     nextREs.append(r", (?P<oob>out of bounds)")
233 |     nextREs.append(
234 |         (
235 |             r"(?P<isMuffedCatch>, muffed catch by )(?P<muffedBy>{0}),"
236 |             r" recovered by (?P<muffRecoverer>{0})"
237 |         ).format(playerRE)
238 |         + r" and returned for "
239 |         + r"(?:(?P<muffRetYds>\d+) yards|no gain)"
240 |     )
241 |     nextREs.append(
242 |         r", returned by (?P<puntReturner>{0}) for ".format(playerRE)
243 |         + r"(?:(?P<puntRetYds>\-?\d+) yards?|no gain)"
244 |     )
245 |     nextRE = r"(?:{})?".format("|".join(nextREs))
246 |     puntREstr = r"{}(?:{}|{}){}{}{}{}{}".format(
247 |         punterRE,
248 |         puntBlockRE,
249 |         puntYdsRE,
250 |         nextRE,
251 |         tackleRE,
252 |         fumbleRE,
253 |         tdSafetyRE,
254 |         penaltyRE,
255 |     )
256 |     puntRE = re.compile(puntREstr, re.IGNORECASE)
257 | 
258 |     # create kneel regex
259 |     kneelREstr = (
260 |         r"(?P<kneelQB>{0}) kneels for ".format(playerRE)
261 |         + r"(?:(?P<kneelYds>\-?\d+) yards?|no gain)"
262 |     )
263 |     kneelRE = re.compile(kneelREstr, re.IGNORECASE)
264 | 
265 |     # create spike regex
266 |     spikeREstr = r"(?P<spikeQB>{0}) spiked the ball".format(playerRE)
267 |     spikeRE = re.compile(spikeREstr, re.IGNORECASE)
268 | 
269 |     # create XP regex
270 |     extraPointREstr = (
271 |         r"(?:(?P<xpKicker>{0}) kicks)? ?extra point " r"(?P<xpGood>good|no good)"
272 |     ).format(playerRE)
273 |     extraPointRE = re.compile(extraPointREstr, re.IGNORECASE)
274 | 
275 |     # create 2pt conversion regex
276 |     twoPointREstr = (
277 |         r"Two Point Attempt: (?P<twoPoint>.*?),?\s+conversion\s+"
278 |         r"(?P<twoPointSuccess>succeeds|fails)"
279 |     )
280 |     twoPointRE = re.compile(twoPointREstr, re.IGNORECASE)
281 | 
282 |     # create penalty regex
283 |     psPenaltyREstr = (
284 |         r"^Penalty on (?P<penOn>{0}|".format(playerRE)
285 |         + r"\w{3}): "
286 |         + r"(?P<penalty>[^\(,]+)(?: \((?P<penDeclined>Declined)\)|"
287 |         + r", (?P<penYds>\d*) yards?|"
288 |         + r".*?(?: \(no play\)))"
289 |     )
290 |     psPenaltyRE = re.compile(psPenaltyREstr, re.IGNORECASE)
291 | 
292 |     # try parsing as a kickoff
293 |     match = kickoffRE.search(details)
294 |     if match:
295 |         # parse as a kickoff
296 |         struct["isKickoff"] = True
297 |         struct.update(match.groupdict())
298 |         return struct
299 | 
300 |     # try parsing as a timeout
301 |     match = timeoutRE.search(details)
302 |     if match:
303 |         # parse as timeout
304 |         struct["isTimeout"] = True
305 |         struct.update(match.groupdict())
306 |         return struct
307 | 
308 |     # try parsing as a field goal
309 |     match = fgRE.search(details)
310 |     if match:
311 |         # parse as a field goal
312 |         struct["isFieldGoal"] = True
313 |         struct.update(match.groupdict())
314 |         return struct
315 | 
316 |     # try parsing as a punt
317 |     match = puntRE.search(details)
318 |     if match:
319 |         # parse as a punt
320 |         struct["isPunt"] = True
321 |         struct.update(match.groupdict())
322 |         return struct
323 | 
324 |     # try parsing as a kneel
325 |     match = kneelRE.search(details)
326 |     if match:
327 |         # parse as a kneel
328 |         struct["isKneel"] = True
329 |         struct.update(match.groupdict())
330 |         return struct
331 | 
332 |     # try parsing as a spike
333 |     match = spikeRE.search(details)
334 |     if match:
335 |         # parse as a spike
336 |         struct["isSpike"] = True
337 |         struct.update(match.groupdict())
338 |         return struct
339 | 
340 |     # try parsing as an XP
341 |     match = extraPointRE.search(details)
342 |     if match:
343 |         # parse as an XP
344 |         struct["isXP"] = True
345 |         struct.update(match.groupdict())
346 |         return struct
347 | 
348 |     # try parsing as a 2-point conversion
349 |     match = twoPointRE.search(details)
350 |     if match:
351 |         # parse as a 2-point conversion
352 |         struct["isTwoPoint"] = True
353 |         struct["twoPointSuccess"] = match.group("twoPointSuccess")
354
| realPlay = sportsref.nfl.pbp.parse_play_details(match.group("twoPoint")) 355 | if realPlay: 356 | struct.update(realPlay) 357 | return struct 358 | 359 | # try parsing as a pass 360 | match = passRE.search(details) 361 | if match: 362 | # parse as a pass 363 | struct["isPass"] = True 364 | struct.update(match.groupdict()) 365 | return struct 366 | 367 | # try parsing as a pre-snap penalty 368 | match = psPenaltyRE.search(details) 369 | if match: 370 | # parse as a pre-snap penalty 371 | struct["isPresnapPenalty"] = True 372 | struct.update(match.groupdict()) 373 | return struct 374 | 375 | # try parsing as a run 376 | match = rushRE.search(details) 377 | if match: 378 | # parse as a run 379 | struct["isRun"] = True 380 | struct.update(match.groupdict()) 381 | return struct 382 | 383 | return None 384 | 385 | 386 | def _clean_features(struct): 387 | """Cleans up the features collected in parse_play_details. 388 | 389 | :struct: Pandas Series of features parsed from details string. 390 | :returns: the same dict, but with cleaner features (e.g., convert bools, 391 | ints, etc.) 392 | """ 393 | struct = dict(struct) 394 | # First, clean up play type bools 395 | ptypes = [ 396 | "isKickoff", 397 | "isTimeout", 398 | "isFieldGoal", 399 | "isPunt", 400 | "isKneel", 401 | "isSpike", 402 | "isXP", 403 | "isTwoPoint", 404 | "isPresnapPenalty", 405 | "isPass", 406 | "isRun", 407 | ] 408 | for pt in ptypes: 409 | struct[pt] = struct[pt] if pd.notnull(struct.get(pt)) else False 410 | # Second, clean up other existing variables on a one-off basis 411 | struct["callUpheld"] = struct.get("callUpheld") == "upheld" 412 | struct["fgGood"] = struct.get("fgGood") == "good" 413 | struct["isBlocked"] = struct.get("isBlocked") == "blocked" 414 | struct["isComplete"] = struct.get("isComplete") == "complete" 415 | struct["isFairCatch"] = struct.get("isFairCatch") == "fair catch" 416 | struct["isMuffedCatch"] = pd.notnull(struct.get("isMuffedCatch")) 417 | struct["isNoPlay"] = ( 418 | " (no play)" in struct["detail"] 419 | and "penalty enforced in end zone" not in struct["detail"] 420 | if struct.get("detail") 421 | else False 422 | ) 423 | struct["isOnside"] = struct.get("isOnside") == "onside" 424 | struct["isSack"] = pd.notnull(struct.get("sackYds")) 425 | struct["isSafety"] = struct.get("isSafety") == ", safety" or ( 426 | struct.get("detail") and "enforced in end zone, safety" in struct["detail"] 427 | ) 428 | struct["isTD"] = struct.get("isTD") == ", touchdown" 429 | struct["isTouchback"] = struct.get("isTouchback") == ", touchback" 430 | struct["oob"] = pd.notnull(struct.get("oob")) 431 | struct["passLoc"] = PASS_OPTS.get(struct.get("passLoc"), np.nan) 432 | if struct["isPass"]: 433 | pyds = struct["passYds"] 434 | struct["passYds"] = pyds if pd.notnull(pyds) else 0 435 | if pd.notnull(struct["penalty"]): 436 | struct["penalty"] = struct["penalty"].strip() 437 | struct["penDeclined"] = struct.get("penDeclined") == "Declined" 438 | if struct["quarter"] == "OT": 439 | struct["quarter"] = 5 440 | struct["rushDir"] = RUSH_OPTS.get(struct.get("rushDir"), np.nan) 441 | if struct["isRun"]: 442 | ryds = struct["rushYds"] 443 | struct["rushYds"] = ryds if pd.notnull(ryds) else 0 444 | year = struct.get("season", np.nan) 445 | struct["timeoutTeam"] = sportsref.nfl.teams.team_ids(year).get( 446 | struct.get("timeoutTeam"), np.nan 447 | ) 448 | struct["twoPointSuccess"] = struct.get("twoPointSuccess") == "succeeds" 449 | struct["xpGood"] = struct.get("xpGood") == "good" 450 | 451 | # Third, ensure types are correct 452 
| bool_vars = [ 453 | "fgGood", 454 | "isBlocked", 455 | "isChallenge", 456 | "isComplete", 457 | "isFairCatch", 458 | "isFieldGoal", 459 | "isKickoff", 460 | "isKneel", 461 | "isLateral", 462 | "isNoPlay", 463 | "isPass", 464 | "isPresnapPenalty", 465 | "isPunt", 466 | "isRun", 467 | "isSack", 468 | "isSafety", 469 | "isSpike", 470 | "isTD", 471 | "isTimeout", 472 | "isTouchback", 473 | "isTwoPoint", 474 | "isXP", 475 | "isMuffedCatch", 476 | "oob", 477 | "penDeclined", 478 | "twoPointSuccess", 479 | "xpGood", 480 | ] 481 | int_vars = [ 482 | "down", 483 | "fgBlockRetYds", 484 | "fgDist", 485 | "fumbRecYdLine", 486 | "fumbRetYds", 487 | "intRetYds", 488 | "intYdLine", 489 | "koRetYds", 490 | "koYds", 491 | "muffRetYds", 492 | "pbp_score_aw", 493 | "pbp_score_hm", 494 | "passYds", 495 | "penYds", 496 | "puntBlockRetYds", 497 | "puntRetYds", 498 | "puntYds", 499 | "quarter", 500 | "rushYds", 501 | "sackYds", 502 | "timeoutNum", 503 | "ydLine", 504 | "yds_to_go", 505 | ] 506 | float_vars = ["exp_pts_after", "exp_pts_before", "home_wp"] 507 | string_vars = [ 508 | "challenger", 509 | "detail", 510 | "fairCatcher", 511 | "fgBlockRecoverer", 512 | "fgBlocker", 513 | "fgKicker", 514 | "fieldSide", 515 | "fumbForcer", 516 | "fumbRecFieldSide", 517 | "fumbRecoverer", 518 | "fumbler", 519 | "intFieldSide", 520 | "interceptor", 521 | "kneelQB", 522 | "koKicker", 523 | "koReturner", 524 | "muffRecoverer", 525 | "muffedBy", 526 | "passLoc", 527 | "passer", 528 | "penOn", 529 | "penalty", 530 | "puntBlockRecoverer", 531 | "puntBlocker", 532 | "puntReturner", 533 | "punter", 534 | "qtr_time_remain", 535 | "rushDir", 536 | "rusher", 537 | "sacker1", 538 | "sacker2", 539 | "spikeQB", 540 | "tackler1", 541 | "tackler2", 542 | "target", 543 | "timeoutTeam", 544 | "xpKicker", 545 | ] 546 | for var in bool_vars: 547 | struct[var] = struct.get(var) is True 548 | for var in int_vars: 549 | try: 550 | struct[var] = int(struct.get(var)) 551 | except (ValueError, TypeError): 552 | struct[var] = np.nan 553 | for var in float_vars: 554 | try: 555 | struct[var] = float(struct.get(var)) 556 | except (ValueError, TypeError): 557 | struct[var] = np.nan 558 | for var in string_vars: 559 | if var not in struct or pd.isnull(struct[var]) or var == "": 560 | struct[var] = np.nan 561 | 562 | # Fourth, create new helper variables based on parsed variables 563 | # creating fieldSide and ydline from location 564 | if struct["isXP"]: 565 | struct["fieldSide"] = struct["ydLine"] = np.nan 566 | else: 567 | fieldSide, ydline = _loc_to_features(struct.get("location")) 568 | struct["fieldSide"] = fieldSide 569 | struct["ydLine"] = ydline 570 | # creating secsElapsed (in entire game) from qtr_time_remain and quarter 571 | if pd.notnull(struct.get("qtr_time_remain")): 572 | qtr = struct["quarter"] 573 | mins, secs = list(map(int, struct["qtr_time_remain"].split(":"))) 574 | struct["secsElapsed"] = qtr * 900 - mins * 60 - secs 575 | # creating columns for turnovers 576 | struct["isInt"] = pd.notnull(struct.get("interceptor")) 577 | struct["isFumble"] = pd.notnull(struct.get("fumbler")) 578 | # create column for isPenalty 579 | struct["isPenalty"] = pd.notnull(struct.get("penalty")) 580 | # create columns for EPA 581 | struct["team_epa"] = struct["exp_pts_after"] - struct["exp_pts_before"] 582 | struct["opp_epa"] = struct["exp_pts_before"] - struct["exp_pts_after"] 583 | return pd.Series(struct) 584 | 585 | 586 | def _loc_to_features(loc): 587 | """Converts a location string "{Half}, {YardLine}" into a tuple of those 588 | values, the 
second being an int. 589 | 590 | :l: The string from the play by play table representing location. 591 | :returns: A tuple that separates out the values, making them missing 592 | (np.nan) when necessary. 593 | 594 | """ 595 | if loc: 596 | if isinstance(loc, str): 597 | loc = loc.strip() 598 | if " " in loc: 599 | r = loc.split() 600 | r[0] = r[0].lower() 601 | r[1] = int(r[1]) 602 | else: 603 | r = (np.nan, int(loc)) 604 | elif isinstance(loc, float): 605 | return (np.nan, 50) 606 | else: 607 | r = (np.nan, np.nan) 608 | return r 609 | 610 | 611 | def _add_team_columns(features): 612 | """Function that adds 'team' and 'opp' columns to the features by iterating 613 | through the rows in order. A precondition is that the features dicts are in 614 | order in a continuous game sense and that all rows are from the same game. 615 | 616 | :features: A DataFrame with each row representing each play (in order). 617 | :returns: A similar DataFrame but with 'team' and 'opp' columns added. 618 | """ 619 | features = features.to_dict("records") 620 | curTm = curOpp = None 621 | playAfterKickoff = False 622 | # fill in team and opp columns 623 | for row in features: 624 | # if it's a kickoff or the play after a kickoff, 625 | # figure out who has possession manually 626 | if row["isKickoff"] or playAfterKickoff: 627 | curTm, curOpp = _team_and_opp(row) 628 | else: 629 | curTm, curOpp = _team_and_opp(row, curTm, curOpp) 630 | row["team"], row["opp"] = curTm, curOpp 631 | # set playAfterKickoff 632 | playAfterKickoff = row["isKickoff"] 633 | 634 | features = pd.DataFrame(features) 635 | features.team.fillna(method="bfill", inplace=True) 636 | features.opp.fillna(method="bfill", inplace=True) 637 | # ffill for last row 638 | features.team.fillna(method="ffill", inplace=True) 639 | features.opp.fillna(method="ffill", inplace=True) 640 | return features 641 | 642 | 643 | def _team_and_opp(struct, curTm=None, curOpp=None): 644 | """Given a dict representing a play and the current team with the ball, 645 | returns (team, opp) where team is the team with the ball and opp is the 646 | team without the ball at the end of the play. 647 | 648 | :struct: A Series/dict representing the play. 649 | :curTm: The current team with the ball; None means it's the first play of 650 | the game or the offensive team on the previous play's offensive team was 651 | somehow undetermined. 652 | :curOpp: The current team on defense; None means same as curTm. 653 | :returns: (team, opp) tuple where team and opp are the 3-character team IDs 654 | or the offensive and defensive teams respectively. 
655 | """ 656 | # if we don't know the current team, figure it out 657 | if pd.isnull(curTm): 658 | if struct["isRun"]: 659 | pID = struct["rusher"] 660 | elif struct["isPass"]: 661 | pID = struct["passer"] 662 | elif struct["isFieldGoal"]: 663 | pID = struct["fgKicker"] 664 | elif struct["isPunt"]: 665 | pID = struct["punter"] 666 | elif struct["isXP"]: 667 | pID = struct["xpKicker"] 668 | elif struct["isKickoff"]: 669 | pID = struct["koKicker"] 670 | elif struct["isSpike"]: 671 | pID = struct["spikeQB"] 672 | elif struct["isKneel"]: 673 | pID = struct["kneelQB"] 674 | else: 675 | pID = None 676 | curTm = curOpp = np.nan 677 | bs = sportsref.nfl.boxscores.BoxScore(struct["boxscore_id"]) 678 | if pID and len(pID) == 3: 679 | curTm = pID 680 | curOpp = bs.away() if bs.home() == curTm else bs.home() 681 | elif pID: 682 | player = sportsref.nfl.Player(pID) 683 | gamelog = player.gamelog(kind="B") 684 | curTm = gamelog.loc[ 685 | gamelog.boxscore_id == struct["boxscore_id"], "team_id" 686 | ].item() 687 | curOpp = bs.home() if bs.home() != curTm else bs.away() 688 | 689 | return curTm, curOpp 690 | 691 | # use row's class to determine when possession changes 692 | if struct["has_class_divider"]: 693 | return curOpp, curTm 694 | else: 695 | return curTm, curOpp 696 | 697 | 698 | def _add_team_features(df): 699 | """Adds extra convenience features based on teams with and without 700 | possession, with the precondition that the there are 'team' and 'opp' 701 | specified in row. 702 | 703 | :df: A DataFrame representing a game's play-by-play data after 704 | _clean_features has been called and 'team' and 'opp' have been added by 705 | _add_team_columns. 706 | :returns: A dict with new features in addition to previous features. 707 | """ 708 | assert df.team.notnull().all() 709 | 710 | homeOnOff = df["team"] == df["home"] 711 | # create column for distToGoal 712 | df["distToGoal"] = np.where( 713 | df["team"] != df["fieldSide"], df["ydLine"], 100 - df["ydLine"] 714 | ) 715 | df["distToGoal"] = np.where(df["isXP"] | df["isTwoPoint"], 2, df["distToGoal"]) 716 | # create column for each team's WP 717 | df["team_wp"] = np.where(homeOnOff, df["home_wp"], 100.0 - df["home_wp"]) 718 | df["opp_wp"] = 100.0 - df["team_wp"] 719 | # create columns for each team's WPA 720 | df["team_wpa"] = np.where(homeOnOff, df["home_wpa"], -df["home_wpa"]) 721 | df["opp_wpa"] = -df["team_wpa"] 722 | # create column for offense and defense scores if not already there 723 | assert df["boxscore_id"].nunique() == 1 724 | bs_id = df["boxscore_id"].values[0] 725 | bs = sportsref.nfl.boxscores.BoxScore(bs_id) 726 | df["team_score"] = np.where( 727 | df["team"] == bs.home(), df["pbp_score_hm"], df["pbp_score_aw"] 728 | ) 729 | df["opp_score"] = np.where( 730 | df["team"] == bs.home(), df["pbp_score_aw"], df["pbp_score_hm"] 731 | ) 732 | 733 | return df 734 | -------------------------------------------------------------------------------- /sportsref/nba/pbp.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import sportsref 7 | 8 | HM_LINEUP_COLS = ["hm_player{}".format(i) for i in range(1, 6)] 9 | AW_LINEUP_COLS = ["aw_player{}".format(i) for i in range(1, 6)] 10 | ALL_LINEUP_COLS = AW_LINEUP_COLS + HM_LINEUP_COLS 11 | 12 | PLAYER_RE = r"\w{0,7}\d{2}" 13 | 14 | # parsing field goal attempts 15 | shot_re = ( 16 | rf"(?P{PLAYER_RE}) " 17 | r"(?Pmakes|misses) " 18 | r"(?P2|3)\-pt " 19 | r"(?Pjump shot|hook shot|layup|dunk) " 20 | 
r"(?:from (?P\d+) ft|at rim)" 21 | ) 22 | assist_re = rf" \(assist by (?P{PLAYER_RE})\)" 23 | block_re = rf" \(block by (?P{PLAYER_RE})\)" 24 | SHOT_RE = re.compile(rf"{shot_re}(?:{assist_re}|{block_re})?", flags=re.I) 25 | 26 | # parsing jump balls 27 | jump_re = ( 28 | rf"Jump ball: (?P{PLAYER_RE}) vs\. (?P{PLAYER_RE})" 29 | rf"(?: \((?P{PLAYER_RE}) gains possession\))?" 30 | ) 31 | JUMP_RE = re.compile(jump_re, flags=re.I) 32 | 33 | # parsing rebounds 34 | reb_re = rf"(?POffensive|Defensive) rebound by (?P{PLAYER_RE}|Team)" 35 | REB_RE = re.compile(reb_re, flags=re.I) 36 | 37 | # parsing free throws 38 | ft_re = ( 39 | rf"(?P{PLAYER_RE}) (?Pmakes|misses) " 40 | r"(?Ptechnical )?(?Pflagrant )?" 41 | r"(?Pclear path )?free throw" 42 | r"(?: (?P\d+) of (?P\d+))?" 43 | ) 44 | FT_RE = re.compile(ft_re, flags=re.I) 45 | 46 | # parsing substitutions 47 | sub_re = rf"(?P{PLAYER_RE}) enters the game for (?P{PLAYER_RE})" 48 | SUB_RE = re.compile(sub_re, flags=re.I) 49 | 50 | # parsing turnovers 51 | to_reasons = rf"(?P[^;]+)(?:; steal by (?P{PLAYER_RE}))?" 52 | to_re = rf"Turnover by (?P{PLAYER_RE}|Team) \((?:{to_reasons})\)" 53 | TO_RE = re.compile(to_re, flags=re.I) 54 | 55 | # parsing shooting fouls 56 | shot_foul_re = ( 57 | r"Shooting(?P block)? foul " 58 | rf"by (?P{PLAYER_RE})" 59 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 60 | ) 61 | SHOT_FOUL_RE = re.compile(shot_foul_re, flags=re.I) 62 | 63 | # parsing offensive fouls 64 | off_foul_re = ( 65 | r"Offensive(?P charge)? foul " 66 | rf"by (?P{PLAYER_RE})" 67 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 68 | ) 69 | OFF_FOUL_RE = re.compile(off_foul_re, flags=re.I) 70 | 71 | # parsing personal fouls 72 | foul_re = ( 73 | r"Personal (?Ptake )?(?Pblock )?" 74 | rf"foul by (?P{PLAYER_RE})(?: \(drawn by " 75 | rf"(?P{PLAYER_RE})\))?" 76 | ) 77 | FOUL_RE = re.compile(foul_re, flags=re.I) 78 | 79 | # parsing loose ball fouls 80 | loose_ball_re = ( 81 | rf"Loose ball foul by (?P{PLAYER_RE})" 82 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 83 | ) 84 | LOOSE_BALL_RE = re.compile(loose_ball_re, flags=re.I) 85 | 86 | # parsing away from play fouls 87 | away_from_ball_re = ( 88 | rf"Away from play foul by (?P{PLAYER_RE})" 89 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 90 | ) 91 | AWAY_FROM_BALL_RE = re.compile(away_from_ball_re, flags=re.I) 92 | 93 | # parsing inbound fouls 94 | inbound_re = ( 95 | rf"Inbound foul by (?P{PLAYER_RE})" 96 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 97 | ) 98 | INBOUND_RE = re.compile(inbound_re, flags=re.I) 99 | 100 | # parsing flagrant fouls 101 | flagrant_re = ( 102 | rf"Flagrant foul type (?P1|2) by (?P{PLAYER_RE})" 103 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 104 | ) 105 | FLAGRANT_RE = re.compile(flagrant_re, flags=re.I) 106 | 107 | # parsing clear path fouls 108 | clear_path_re = ( 109 | rf"Clear path foul by (?P{PLAYER_RE})" 110 | rf"(?: \(drawn by (?P{PLAYER_RE})\))?" 111 | ) 112 | CLEAR_PATH_RE = re.compile(clear_path_re, flags=re.I) 113 | 114 | # parsing timeouts 115 | timeout_re = r"(?P.*?) (?:full )?timeout" 116 | TIMEOUT_RE = re.compile(timeout_re, flags=re.I) 117 | 118 | # parsing technical fouls 119 | tech_re = ( 120 | r"(?PHanging )?" 121 | r"(?PTaunting )?" 122 | r"(?PIll def )?" 123 | r"(?PDelay )?" 124 | r"(?PNon unsport )?" 125 | r"tech(?:nical)? 
foul by " 126 | rf"(?P{PLAYER_RE}|Team)" 127 | ) 128 | TECH_RE = re.compile(tech_re, flags=re.I) 129 | 130 | # parsing ejections 131 | eject_re = rf"(?P{PLAYER_RE}|Team) ejected from game" 132 | EJECT_RE = re.compile(eject_re, flags=re.I) 133 | 134 | # parsing defensive 3 seconds techs 135 | def3_tech_re = ( 136 | r"(?:Def 3 sec tech foul|Defensive three seconds)" 137 | rf" by (?P{PLAYER_RE})" 138 | ) 139 | DEF3_TECH_RE = re.compile(def3_tech_re, flags=re.I) 140 | 141 | # parsing violations 142 | viol_re = rf"Violation by (?P{PLAYER_RE}|Team) \((?P.*)\)" 143 | VIOL_RE = re.compile(viol_re, flags=re.I) 144 | 145 | 146 | def sparse_lineup_cols(df): 147 | regex = "{}_in".format(PLAYER_RE) 148 | return [c for c in df.columns if re.match(regex, c)] 149 | 150 | 151 | def parse_play(boxscore_id, details, is_home): 152 | """Parse play details from a play-by-play string describing a play. 153 | 154 | Assuming valid input, this function returns structured data in a dictionary 155 | describing the play. If the play detail string was invalid, this function 156 | returns None. 157 | 158 | :param boxscore_id: the boxscore ID of the play 159 | :param details: detail string for the play 160 | :param is_home: bool indicating whether the offense is at home 161 | :param returns: dictionary of play attributes or None if invalid 162 | :rtype: dictionary or None 163 | """ 164 | # if input isn't a string, return None 165 | if not details or not isinstance(details, str): 166 | return None 167 | 168 | bs = sportsref.nba.BoxScore(boxscore_id) 169 | aw, hm = bs.away(), bs.home() 170 | season = sportsref.nba.Season(bs.season()) 171 | hm_roster = set(bs.basic_stats().query("is_home == True").player_id.values) 172 | 173 | play = {} 174 | play["detail"] = details 175 | play["home"] = hm 176 | play["away"] = aw 177 | play["is_home_play"] = is_home 178 | 179 | match = re.match(SHOT_RE, details) 180 | if match: 181 | play["is_fga"] = True 182 | play.update(match.groupdict()) 183 | play["shot_dist"] = play["shot_dist"] if play["shot_dist"] is not None else 0 184 | play["shot_dist"] = int(play["shot_dist"]) 185 | play["is_fgm"] = play["is_fgm"] == "makes" 186 | play["is_three"] = play["is_three"] == "3" 187 | play["is_assist"] = pd.notnull(play.get("assister")) 188 | play["is_block"] = pd.notnull(play.get("blocker")) 189 | shooter_home = play["shooter"] in hm_roster 190 | play["off_team"] = hm if shooter_home else aw 191 | play["def_team"] = aw if shooter_home else hm 192 | return play 193 | 194 | match = re.match(JUMP_RE, details) 195 | if match: 196 | play["is_jump_ball"] = True 197 | play.update(match.groupdict()) 198 | return play 199 | 200 | match = re.match(REB_RE, details) 201 | if match: 202 | play["is_reb"] = True 203 | play.update(match.groupdict()) 204 | play["is_oreb"] = play["is_oreb"].lower() == "offensive" 205 | play["is_dreb"] = not play["is_oreb"] 206 | play["is_team_rebound"] = play["rebounder"] == "Team" 207 | if play["is_team_rebound"]: 208 | play["reb_team"], other = (hm, aw) if is_home else (aw, hm) 209 | else: 210 | reb_home = play["rebounder"] in hm_roster 211 | play["reb_team"], other = (hm, aw) if reb_home else (aw, hm) 212 | play["off_team"] = play["reb_team"] if play["is_oreb"] else other 213 | play["def_team"] = play["reb_team"] if play["is_dreb"] else other 214 | return play 215 | 216 | match = re.match(FT_RE, details) 217 | if match: 218 | play["is_fta"] = True 219 | play.update(match.groupdict()) 220 | play["is_ftm"] = play["is_ftm"] == "makes" 221 | play["is_tech_fta"] = 
bool(play["is_tech_fta"]) 222 | play["is_flag_fta"] = bool(play["is_flag_fta"]) 223 | play["is_clearpath_fta"] = bool(play["is_clearpath_fta"]) 224 | play["is_pf_fta"] = not play["is_tech_fta"] 225 | if play["tot_fta"]: 226 | play["tot_fta"] = int(play["tot_fta"]) 227 | if play["fta_num"]: 228 | play["fta_num"] = int(play["fta_num"]) 229 | ft_home = play["ft_shooter"] in hm_roster 230 | play["fta_team"] = hm if ft_home else aw 231 | if not play["is_tech_fta"]: 232 | play["off_team"] = hm if ft_home else aw 233 | play["def_team"] = aw if ft_home else hm 234 | return play 235 | 236 | match = re.match(SUB_RE, details) 237 | if match: 238 | play["is_sub"] = True 239 | play.update(match.groupdict()) 240 | sub_home = play["sub_in"] in hm_roster or play["sub_out"] in hm_roster 241 | play["sub_team"] = hm if sub_home else aw 242 | return play 243 | 244 | match = re.match(TO_RE, details) 245 | if match: 246 | play["is_to"] = True 247 | play.update(match.groupdict()) 248 | play["to_type"] = play["to_type"].lower() 249 | if play["to_type"] == "offensive foul": 250 | return None 251 | play["is_steal"] = pd.notnull(play["stealer"]) 252 | play["is_travel"] = play["to_type"] == "traveling" 253 | play["is_shot_clock_viol"] = play["to_type"] == "shot clock" 254 | play["is_oob"] = play["to_type"] == "step out of bounds" 255 | play["is_three_sec_viol"] = play["to_type"] == "3 sec" 256 | play["is_backcourt_viol"] = play["to_type"] == "back court" 257 | play["is_off_goaltend"] = play["to_type"] == "offensive goaltending" 258 | play["is_double_dribble"] = play["to_type"] == "dbl dribble" 259 | play["is_discont_dribble"] = play["to_type"] == "discontinued dribble" 260 | play["is_carry"] = play["to_type"] == "palming" 261 | if play["to_by"] == "Team": 262 | play["off_team"] = hm if is_home else aw 263 | play["def_team"] = aw if is_home else hm 264 | else: 265 | to_home = play["to_by"] in hm_roster 266 | play["off_team"] = hm if to_home else aw 267 | play["def_team"] = aw if to_home else hm 268 | return play 269 | 270 | match = re.match(SHOT_FOUL_RE, details) 271 | if match: 272 | play["is_pf"] = True 273 | play["is_shot_foul"] = True 274 | play.update(match.groupdict()) 275 | play["is_block_foul"] = bool(play["is_block_foul"]) 276 | foul_on_home = play["fouler"] in hm_roster 277 | play["off_team"] = aw if foul_on_home else hm 278 | play["def_team"] = hm if foul_on_home else aw 279 | play["foul_team"] = play["def_team"] 280 | return play 281 | 282 | match = re.match(OFF_FOUL_RE, details) 283 | if match: 284 | play["is_pf"] = True 285 | play["is_off_foul"] = True 286 | play["is_to"] = True 287 | play["to_type"] = "offensive foul" 288 | play.update(match.groupdict()) 289 | play["is_charge"] = bool(play["is_charge"]) 290 | play["fouler"] = play["to_by"] 291 | foul_on_home = play["fouler"] in hm_roster 292 | play["off_team"] = hm if foul_on_home else aw 293 | play["def_team"] = aw if foul_on_home else hm 294 | play["foul_team"] = play["off_team"] 295 | return play 296 | 297 | match = re.match(FOUL_RE, details) 298 | if match: 299 | play["is_pf"] = True 300 | play.update(match.groupdict()) 301 | play["is_take_foul"] = bool(play["is_take_foul"]) 302 | play["is_block_foul"] = bool(play["is_block_foul"]) 303 | foul_on_home = play["fouler"] in hm_roster 304 | play["off_team"] = aw if foul_on_home else hm 305 | play["def_team"] = hm if foul_on_home else aw 306 | play["foul_team"] = play["def_team"] 307 | return play 308 | 309 | # TODO: parsing double personal fouls 310 | # double_foul_re = (r'Double personal foul by (?P{0}) 
and ' 311 | # r'(?P{0})').format(PLAYER_RE) 312 | # m = re.match(double_Foul_re, details) 313 | # if m: 314 | # p['is_pf'] = True 315 | # p.update(m.groupdict()) 316 | # p['off_team'] = 317 | 318 | match = re.match(LOOSE_BALL_RE, details) 319 | if match: 320 | play["is_pf"] = True 321 | play["is_loose_ball_foul"] = True 322 | play.update(match.groupdict()) 323 | foul_home = play["fouler"] in hm_roster 324 | play["foul_team"] = hm if foul_home else aw 325 | return play 326 | 327 | # parsing punching fouls 328 | # TODO 329 | 330 | match = re.match(AWAY_FROM_BALL_RE, details) 331 | if match: 332 | play["is_pf"] = True 333 | play["is_away_from_play_foul"] = True 334 | play.update(match.groupdict()) 335 | foul_on_home = play["fouler"] in hm_roster 336 | # TODO: figure out who had the ball based on previous play 337 | play["foul_team"] = hm if foul_on_home else aw 338 | return play 339 | 340 | match = re.match(INBOUND_RE, details) 341 | if match: 342 | play["is_pf"] = True 343 | play["is_inbound_foul"] = True 344 | play.update(match.groupdict()) 345 | foul_on_home = play["fouler"] in hm_roster 346 | play["off_team"] = aw if foul_on_home else hm 347 | play["def_team"] = hm if foul_on_home else aw 348 | play["foul_team"] = play["def_team"] 349 | return play 350 | 351 | match = re.match(FLAGRANT_RE, details) 352 | if match: 353 | play["is_pf"] = True 354 | play["is_flagrant"] = True 355 | play.update(match.groupdict()) 356 | foul_on_home = play["fouler"] in hm_roster 357 | play["foul_team"] = hm if foul_on_home else aw 358 | return play 359 | 360 | match = re.match(CLEAR_PATH_RE, details) 361 | if match: 362 | play["is_pf"] = True 363 | play["is_clear_path_foul"] = True 364 | play.update(match.groupdict()) 365 | foul_on_home = play["fouler"] in hm_roster 366 | play["off_team"] = aw if foul_on_home else hm 367 | play["def_team"] = hm if foul_on_home else aw 368 | play["foul_team"] = play["def_team"] 369 | return play 370 | 371 | match = re.match(TIMEOUT_RE, details) 372 | if match: 373 | play["is_timeout"] = True 374 | play.update(match.groupdict()) 375 | is_official_to = play["timeout_team"].lower() == "official" 376 | name_to_id = season.team_names_to_ids() 377 | play["timeout_team"] = ( 378 | "Official" 379 | if is_official_to 380 | else name_to_id.get(hm, name_to_id.get(aw, play["timeout_team"])) 381 | ) 382 | return play 383 | 384 | match = re.match(TECH_RE, details) 385 | if match: 386 | play["is_tech_foul"] = True 387 | play.update(match.groupdict()) 388 | play["is_hanging"] = bool(play["is_hanging"]) 389 | play["is_taunting"] = bool(play["is_taunting"]) 390 | play["is_ill_def"] = bool(play["is_ill_def"]) 391 | play["is_delay"] = bool(play["is_delay"]) 392 | play["is_unsport"] = bool(play["is_unsport"]) 393 | foul_on_home = play["tech_fouler"] in hm_roster 394 | play["foul_team"] = hm if foul_on_home else aw 395 | return play 396 | 397 | match = re.match(EJECT_RE, details) 398 | if match: 399 | play["is_ejection"] = True 400 | play.update(match.groupdict()) 401 | if play["ejectee"] == "Team": 402 | play["ejectee_team"] = hm if is_home else aw 403 | else: 404 | eject_home = play["ejectee"] in hm_roster 405 | play["ejectee_team"] = hm if eject_home else aw 406 | return play 407 | 408 | match = re.match(DEF3_TECH_RE, details) 409 | if match: 410 | play["is_tech_foul"] = True 411 | play["is_def_three_secs"] = True 412 | play.update(match.groupdict()) 413 | foul_on_home = play["tech_fouler"] in hm_roster 414 | play["off_team"] = aw if foul_on_home else hm 415 | play["def_team"] = hm if 
foul_on_home else aw 416 | play["foul_team"] = play["def_team"] 417 | return play 418 | 419 | match = re.match(VIOL_RE, details) 420 | if match: 421 | play["is_viol"] = True 422 | play.update(match.groupdict()) 423 | if play["viol_type"] == "kicked_ball": 424 | play["is_to"] = True 425 | play["to_by"] = play["violator"] 426 | if play["violator"] == "Team": 427 | play["viol_team"] = hm if is_home else aw 428 | else: 429 | viol_home = play["violator"] in hm_roster 430 | play["viol_team"] = hm if viol_home else aw 431 | return play 432 | 433 | play["is_error"] = True 434 | return play 435 | 436 | 437 | def clean_features(df): 438 | """Fixes up columns of the passed DataFrame, such as casting T/F columns to 439 | boolean and filling in NaNs for team and opp. 440 | 441 | :param df: DataFrame of play-by-play data. 442 | :returns: Dataframe with cleaned columns. 443 | """ 444 | df = pd.DataFrame(df) 445 | 446 | bool_vals = set([True, False, None, np.nan]) 447 | sparse_cols = sparse_lineup_cols(df) 448 | for col in df: 449 | 450 | # make indicator columns boolean type (and fill in NaNs) 451 | if set(df[col].unique()[:5]) <= bool_vals: 452 | df[col] = df[col] == True # noqa 453 | 454 | # fill NaN's in sparse lineup columns to 0 455 | elif col in sparse_cols: 456 | df[col] = df[col].fillna(0) 457 | 458 | # fix free throw columns on technicals 459 | df.loc[df.is_tech_fta, ["fta_num", "tot_fta"]] = 1 460 | 461 | # fill in NaN's/fix off_team and def_team columns 462 | df.off_team.fillna(method="bfill", inplace=True) 463 | df.def_team.fillna(method="bfill", inplace=True) 464 | df.off_team.fillna(method="ffill", inplace=True) 465 | df.def_team.fillna(method="ffill", inplace=True) 466 | 467 | return df 468 | 469 | 470 | def clean_multigame_features(df): 471 | """TODO: Docstring for clean_multigame_features. 472 | 473 | :df: TODO 474 | :returns: TODO 475 | """ 476 | df = pd.DataFrame(df) 477 | if df.index.value_counts().max() > 1: 478 | df.reset_index(drop=True, inplace=True) 479 | 480 | df = clean_features(df) 481 | 482 | # if it's many games in one DataFrame, make poss_id and play_id unique 483 | for col in ("play_id", "poss_id"): 484 | diffs = df[col].diff().fillna(0) 485 | if (diffs < 0).any(): 486 | new_col = np.cumsum(diffs.astype(bool)) # noqa 487 | df.eval("{} = @new_col".format(col), inplace=True) 488 | 489 | return df 490 | 491 | 492 | def get_period_starters(df): 493 | """TODO""" 494 | 495 | def players_from_play(play): 496 | """Figures out what players are in the game based on the players 497 | mentioned in a play. Returns away and home players as two sets. 498 | 499 | :param play: A dictionary representing a parsed play. 
500 | :returns: (aw_players, hm_players) 501 | :rtype: tuple of lists 502 | """ 503 | # if it's a tech FT from between periods, don't count this play 504 | if play["clock_str"] == "12:00.0" and ( 505 | play.get("is_tech_foul") or play.get("is_tech_fta") 506 | ): 507 | return [], [] 508 | 509 | stats = sportsref.nba.BoxScore(play["boxscore_id"]).basic_stats() 510 | home_grouped = stats.groupby("is_home") 511 | hm_roster = set(home_grouped.player_id.get_group(True).values) 512 | aw_roster = set(home_grouped.player_id.get_group(False).values) 513 | player_keys = [ 514 | "assister", 515 | "away_jumper", 516 | "blocker", 517 | "drew_foul", 518 | "fouler", 519 | "ft_shooter", 520 | "gains_poss", 521 | "home_jumper", 522 | "rebounder", 523 | "shooter", 524 | "stealer", 525 | "sub_in", 526 | "sub_out", 527 | "to_by", 528 | ] 529 | players = [p for p in play[player_keys] if pd.notnull(p)] 530 | 531 | aw_players = [p for p in players if p in aw_roster] 532 | hm_players = [p for p in players if p in hm_roster] 533 | return aw_players, hm_players 534 | 535 | # create a mapping { quarter => (away_starters, home_starters) } 536 | n_periods = df.quarter.nunique() 537 | period_starters = [(set(), set()) for _ in range(n_periods)] 538 | 539 | # fill out this mapping quarter by quarter 540 | for qtr, qtr_grp in df.groupby(df.quarter): 541 | aw_starters, hm_starters = period_starters[qtr - 1] 542 | exclude = set() 543 | # loop through sets of plays that happen at the "same time" 544 | for label, time_grp in qtr_grp.groupby(qtr_grp.secs_elapsed): 545 | # first, if they sub in and weren't already starters, exclude them 546 | sub_ins = set(time_grp.sub_in.dropna().values) 547 | exclude.update(sub_ins - aw_starters - hm_starters) 548 | # second, figure out new starters from each play at this time 549 | for i, row in time_grp.iterrows(): 550 | aw_players, hm_players = players_from_play(row) 551 | # update overall sets for the quarter 552 | aw_starters.update(aw_players) 553 | hm_starters.update(hm_players) 554 | # remove excluded (subbed-in) players 555 | hm_starters -= exclude 556 | aw_starters -= exclude 557 | # check whether we have found all starters 558 | if len(hm_starters) > 5 or len(aw_starters) > 5: 559 | import ipdb 560 | 561 | ipdb.set_trace() 562 | if len(hm_starters) >= 5 and len(aw_starters) >= 5: 563 | break 564 | 565 | if len(hm_starters) != 5 or len(aw_starters) != 5: 566 | print( 567 | "WARNING: wrong number of starters for a team in Q{} of {}".format( 568 | qtr, df.boxscore_id.iloc[0] 569 | ) 570 | ) 571 | 572 | return period_starters 573 | 574 | 575 | def get_sparse_lineups(df): 576 | """TODO: Docstring for get_sparse_lineups. 
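    (In brief, based on the code below: returns a DataFrame with one
    "{player_id}_in" indicator column per player, set to 1 when that player
    is on the court for the home team, -1 when on the court for the away
    team, and 0 otherwise; dense lineups are computed first via
    get_dense_lineups if the lineup columns are not already present in df.)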
577 | 578 | :param df: TODO 579 | :returns: TODO 580 | """ 581 | 582 | # get the lineup data using get_dense_lineups if necessary 583 | if set(ALL_LINEUP_COLS) - set(df.columns): 584 | lineup_df = get_dense_lineups(df) 585 | else: 586 | lineup_df = df[ALL_LINEUP_COLS] 587 | 588 | # create the sparse representation 589 | hm_lineups = lineup_df[HM_LINEUP_COLS].values 590 | aw_lineups = lineup_df[AW_LINEUP_COLS].values 591 | # +1 for home, -1 for away 592 | hm_df = pd.DataFrame( 593 | [ 594 | {"{}_in".format(player_id): 1 for player_id in lineup} 595 | for lineup in hm_lineups 596 | ], 597 | dtype=int, 598 | ) 599 | aw_df = pd.DataFrame( 600 | [ 601 | {"{}_in".format(player_id): -1 for player_id in lineup} 602 | for lineup in aw_lineups 603 | ], 604 | dtype=int, 605 | ) 606 | sparse_df = pd.concat((hm_df, aw_df), axis=1).fillna(0) 607 | return sparse_df 608 | 609 | 610 | def get_dense_lineups(df): 611 | """Returns a new DataFrame based on the one it is passed. Specifically, it 612 | adds five columns for each team (ten total), where each column has the ID 613 | of a player on the court during the play. Assumes the DataFrame corresponds 614 | to only a single game (one unique boxscore ID). 615 | 616 | This information is figured out sequentially from the game's substitution 617 | data in the passed DataFrame, so the DataFrame passed as an argument must 618 | be from a specific BoxScore (rather than a DataFrame of non-consecutive 619 | plays). That is, the DataFrame must be of the form returned by 620 | :func:`nba.BoxScore.pbp `. 621 | 622 | .. note:: Note that the lineups reflect the teams in the game when the play 623 | happened, not after the play. For example, if a play is a substitution, 624 | the lineups for that play will be the lineups before the substituion 625 | occurs. 626 | 627 | :param df: A DataFrame of a game's play-by-play data. 628 | :returns: A DataFrame with additional lineup columns. 629 | 630 | """ 631 | assert df["boxscore_id"].nunique() == 1 632 | 633 | def lineup_dict(aw_lineup, hm_lineup): 634 | """Returns a dictionary of lineups to be converted to columns. 635 | Specifically, the columns are 'aw_player1' through 'aw_player5' and 636 | 'hm_player1' through 'hm_player5'. 637 | 638 | :param aw_lineup: The away team's current lineup. 639 | :param hm_lineup: The home team's current lineup. 640 | :returns: A dictionary of lineups. 641 | """ 642 | return { 643 | "{}_player{}".format(tm, i + 1): player 644 | for tm, lineup in zip(["aw", "hm"], [aw_lineup, hm_lineup]) 645 | for i, player in enumerate(lineup) 646 | } 647 | 648 | def handle_sub(row, aw_lineup, hm_lineup): 649 | """Modifies the aw_lineup and hm_lineup lists based on the substitution 650 | that takes place in the given row.""" 651 | assert row["is_sub"] 652 | sub_lineup = hm_lineup if row["sub_team"] == row["home"] else aw_lineup 653 | try: 654 | # make the sub 655 | idx = sub_lineup.index(row["sub_out"]) 656 | sub_lineup[idx] = row["sub_in"] 657 | except ValueError: 658 | # if the sub was double-entered and it's already been executed... 
659 | if row["sub_in"] in sub_lineup and row["sub_out"] not in sub_lineup: 660 | return aw_lineup, hm_lineup 661 | # otherwise, let's print and pretend this never happened 662 | print( 663 | "ERROR IN SUB IN {}, Q{}, {}: {}".format( 664 | row["boxscore_id"], row["quarter"], row["clock_str"], row["detail"] 665 | ) 666 | ) 667 | raise 668 | return aw_lineup, hm_lineup 669 | 670 | per_starters = get_period_starters(df) 671 | cur_qtr = 0 672 | aw_lineup, hm_lineup = [], [] 673 | df = df.reset_index(drop=True) 674 | lineups = [{} for _ in range(df.shape[0])] 675 | 676 | # loop through select plays to determine lineups 677 | sub_or_per_start = df.is_sub | df.quarter.diff().astype(bool) 678 | for i, row in df.loc[sub_or_per_start].iterrows(): 679 | if row["quarter"] > cur_qtr: 680 | # first row in a quarter 681 | assert row["quarter"] == cur_qtr + 1 682 | # first, finish up the last quarter's lineups 683 | if cur_qtr > 0 and not df.loc[i - 1, "is_sub"]: 684 | lineups[i - 1] = lineup_dict(aw_lineup, hm_lineup) 685 | # then, move on to the quarter, and enter the starting lineups 686 | cur_qtr += 1 687 | aw_lineup, hm_lineup = list(map(list, per_starters[cur_qtr - 1])) 688 | lineups[i] = lineup_dict(aw_lineup, hm_lineup) 689 | # if the first play in the quarter is a sub, handle that 690 | if row["is_sub"]: 691 | aw_lineup, hm_lineup = handle_sub(row, aw_lineup, hm_lineup) 692 | else: 693 | # during the quarter 694 | # update lineups first then change lineups based on subs 695 | lineups[i] = lineup_dict(aw_lineup, hm_lineup) 696 | if row["is_sub"]: 697 | aw_lineup, hm_lineup = handle_sub(row, aw_lineup, hm_lineup) 698 | 699 | # create and clean DataFrame 700 | lineup_df = pd.DataFrame(lineups) 701 | if lineup_df.iloc[-1].isnull().all(): 702 | lineup_df.iloc[-1] = lineup_dict(aw_lineup, hm_lineup) 703 | lineup_df = lineup_df.groupby(df.quarter).fillna(method="bfill") 704 | 705 | # fill in NaN's based on minutes played 706 | bool_mat = lineup_df.isnull() 707 | mask = bool_mat.any(axis=1) 708 | if mask.any(): 709 | bs = sportsref.nba.BoxScore(df.boxscore_id[0]) 710 | # first, get the true minutes played from the box score 711 | stats = sportsref.nba.BoxScore(df.boxscore_id.iloc[0]).basic_stats() 712 | true_mp = ( 713 | pd.Series( 714 | stats.query("mp > 0")[["player_id", "mp"]] 715 | .set_index("player_id") 716 | .to_dict()["mp"] 717 | ) 718 | * 60 719 | ) 720 | # next, calculate minutes played based on the lineup data 721 | calc_mp = pd.Series( 722 | { 723 | p: ( 724 | df.secs_elapsed.diff() * [p in row for row in lineup_df.values] 725 | ).sum() 726 | for p in stats.query("mp > 0").player_id.values 727 | } 728 | ) 729 | # finally, figure which players are missing minutes 730 | diff = true_mp - calc_mp 731 | players_missing = diff.loc[diff.abs() >= 150] 732 | hm_roster = bs.basic_stats().query("is_home == True").player_id.values 733 | missing_df = pd.DataFrame( 734 | { 735 | "secs": players_missing.values, 736 | "is_home": players_missing.index.isin(hm_roster), 737 | }, 738 | index=players_missing.index, 739 | ) 740 | 741 | if missing_df.empty: 742 | # TODO: log this as a warning (or error?) 
743 | print( 744 | "There are NaNs in the lineup data, but no players were " 745 | "found to be missing significant minutes" 746 | ) 747 | else: 748 | for is_home, group in missing_df.groupby("is_home"): 749 | player_id = group.index.item() 750 | tm_cols = ( 751 | sportsref.nba.pbp.HM_LINEUP_COLS 752 | if is_home 753 | else sportsref.nba.pbp.AW_LINEUP_COLS 754 | ) 755 | row_mask = lineup_df[tm_cols].isnull().any(axis=1) 756 | lineup_df.loc[row_mask, tm_cols] = ( 757 | lineup_df.loc[row_mask, tm_cols].fillna(player_id).values 758 | ) 759 | 760 | return lineup_df 761 | --------------------------------------------------------------------------------
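For orientation, here is a minimal usage sketch (illustrative only, not part of the package) tying the classes above together; the boxscore ID is one mentioned in a TODO comment in nba/boxscores.py, and live network access to basketball-reference.com is assumed.

# Minimal usage sketch (illustrative; not part of the repo).
import sportsref

box = sportsref.nba.BoxScore("201604130PHO")   # boxscore ID referenced in a comment above
players = box.basic_stats()                    # per-player box score rows
plays = box.pbp(dense_lineups=True)            # play-by-play with lineup columns
print(plays[["quarter", "clock_str", "hm_score", "aw_score"]].tail())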