├── doc ├── .nojekyll ├── source │ ├── _static │ │ ├── .gitignore │ │ └── validation_plot.png │ ├── modules.rst │ ├── nflwin.rst │ ├── installation.rst │ ├── index.rst │ ├── dev.rst │ ├── nfldb.rst │ ├── model.rst │ └── conf.py ├── index.html ├── make.bat └── Makefile ├── .coveragerc ├── nflwin ├── models │ └── .gitignore ├── _version.py ├── __init__.py ├── tests │ ├── test_model.py │ ├── test_utilities.py │ └── test_preprocessing.py ├── utilities.py ├── preprocessing.py └── model.py ├── MANIFEST.in ├── .gitignore ├── rtd_requirements.txt ├── environment.yml ├── run_tests.sh ├── LICENSE ├── README.rst ├── .travis.yml ├── make_default_model.py ├── setup.py └── increment_version.sh /doc/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/source/_static/.gitignore: -------------------------------------------------------------------------------- 1 | !*.png -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = nflwin/tests/* -------------------------------------------------------------------------------- /nflwin/models/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /nflwin/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = u"1.0.1" 2 | -------------------------------------------------------------------------------- /nflwin/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README* LICENSE 2 | include nflwin/models/default_model.nflwin* -------------------------------------------------------------------------------- /doc/source/modules.rst: -------------------------------------------------------------------------------- 1 | nflwin 2 | ====== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | nflwin 8 | -------------------------------------------------------------------------------- /doc/source/_static/validation_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndrewRook/NFLWin/HEAD/doc/source/_static/validation_plot.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.png 3 | dist/ 4 | *.egg-info/ 5 | 6 | *.cache/ 7 | *.coverage 8 | 9 | old_pywpa/ 10 | secrets.txt 11 | 12 | doc/build/ -------------------------------------------------------------------------------- /rtd_requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.11.0 2 | numpydoc==0.5 3 | pandas==0.18.0 4 | scikit-learn==0.17.1 5 | scipy==0.17.0 6 | Sphinx==1.4.1 7 | sphinx-rtd-theme==0.1.9 8 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: nflwin 2 | dependencies: 3 | - numpy=1.11.0 4 | - pandas=0.18.0 5 | - psycopg2=2.6.1 6 | - pytest=2.9.1 7 | - pytest-cov=2.2.1 8 | - scikit-learn=0.17.1 9 | - scipy=0.17.0 10 | - sqlalchemy=1.0.12 11 | - pip: 12 | - nfldb==0.2.15 13 | - nflgame==1.2.19 14 | 15 | -------------------------------------------------------------------------------- /doc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Redirect 5 | 6 | 7 | 10 | 11 | 12 | 13 | If you are not redirected automatically, follow the link here. 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /doc/source/nflwin.rst: -------------------------------------------------------------------------------- 1 | nflwin package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | nflwin.model module 8 | ------------------- 9 | 10 | .. automodule:: nflwin.model 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | nflwin.preprocessing module 16 | --------------------------- 17 | 18 | .. automodule:: nflwin.preprocessing 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | nflwin.utilities module 24 | ----------------------- 25 | 26 | .. automodule:: nflwin.utilities 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: nflwin 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | mflag=" -m" 5 | mflagval=" not requires_db" 6 | python_version=`python -c "import sys; print(sys.version_info.major)"` 7 | pytest_command="py.test" 8 | if [ $python_version == "3" ]; then 9 | pytest_command="pytest" 10 | fi 11 | 12 | while getopts ":d" opt; do 13 | case $opt in 14 | d) 15 | echo "Running all tests..." 16 | mflagval='' 17 | mflag='' 18 | ;; 19 | \?) 
20 | echo "" 21 | echo "" 22 | echo "Invalid option: -$OPTARG" 23 | echo "Usage:" 24 | echo "-----------------" 25 | echo "-d: run tests which require nfldb database access" 26 | echo "" 27 | ;; 28 | esac 29 | done 30 | 31 | python -m ${pytest_command}${mflag}"${mflagval}" --cov=nflwin --cov-config .coveragerc --cov-report term-missing nflwin/tests/ 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) [2015] [Andrew Schechtman-Rook] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | NFLWin 2 | =============== 3 | 4 | |Build Status| |Doc Status| 5 | 6 | 7 | Estimate Win Probability (WP) for plays in NFL games: 8 | 9 | .. code-block:: python 10 | 11 | >>> import pandas as pd 12 | >>> from nflwin.model import WPModel 13 | >>> standard_model = WPModel.load_model() 14 | >>> plays = pd.DataFrame({ 15 | ... "quarter": ["Q1", "Q2", "Q4"], 16 | ... "seconds_elapsed": [0, 0, 600], 17 | ... "offense_team": ["NYJ", "NYJ", "NE"], 18 | ... "yardline": [-20, 20, 35], 19 | ... "down": [1, 3, 3], 20 | ... "yards_to_go": [10, 2, 10], 21 | ... "home_team": ["NYJ", "NYJ", "NYJ"], 22 | ... "away_team": ["NE", "NE", "NE"], 23 | ... "curr_home_score": [0, 0, 21], 24 | ... "curr_away_score": [0, 0, 10] 25 | ... }) 26 | >>> standard_model.predict_wp(plays) 27 | array([ 0.58300397, 0.64321796, 0.18195466]) 28 | 29 | For full documentation, including information about methods and accuracy, click `here `_. 30 | 31 | License 32 | --------------- 33 | MIT. See `license file `_. 34 | 35 | .. |Build Status| image:: https://travis-ci.org/AndrewRook/NFLWin.svg?branch=master 36 | :target: https://travis-ci.org/AndrewRook/NFLWin 37 | :alt: Build Status 38 | .. 
|Doc Status| image:: https://readthedocs.org/projects/nflwin/badge/?version=latest 39 | :target: http://nflwin.readthedocs.io/en/latest/?badge=latest 40 | :alt: Documentation Status 41 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | #From http://conda.pydata.org/docs/travis.html 2 | language: python 3 | python: 4 | # We don't actually use the Travis Python, but this keeps it organized. 5 | - "2.7" 6 | install: 7 | - sudo apt-get update 8 | # We do this conditionally because it saves us some downloading if the 9 | # version is the same. 10 | - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then 11 | wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; 12 | else 13 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 14 | fi 15 | - bash miniconda.sh -b -p $HOME/miniconda 16 | - export PATH="$HOME/miniconda/bin:$PATH" 17 | - hash -r 18 | - conda config --set always_yes yes --set changeps1 no 19 | - conda update -q conda 20 | # Useful for debugging any issues with conda 21 | - conda info -a 22 | 23 | - conda create -n nflwin_py2 -y -c conda-forge python=2 numpy scipy pandas pytest pytest-cov scikit-learn=0.19 24 | - source activate nflwin_py2 25 | #- pip install nfldb==0.2.15 nflgame==1.2.19 26 | - python setup.py install 27 | 28 | - source deactivate 29 | - conda create -n nflwin_py3 -y -c conda-forge python=3 numpy scipy pandas pytest pytest-cov scikit-learn=0.19 30 | - source activate nflwin_py3 31 | - python setup.py install 32 | 33 | script: 34 | # Your test script goes here 35 | - source activate nflwin_py3 36 | - ./run_tests.sh 37 | - source activate nflwin_py2 38 | - ./run_tests.sh 39 | -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | =============== 3 | NFLWin only supports Python 2, as nfldb is currently incompatible 4 | with Python 3. The bulk of NFLWin should work natively with Python 3, 5 | however that is currently untested. Pull requests ensuring this 6 | compatibility would be welcome. 7 | 8 | 9 | Releases 10 | ---------------------- 11 | Stable releases of NFLWin are available on PyPI:: 12 | 13 | $ pip install nflwin 14 | 15 | The default install provides exactly the tools necessary to make 16 | predictions using the standard WP model as well as make new 17 | models. However it does not include the dependencies necessary for 18 | :ref:`using nfldb `, producing diagnostic plots, or contributing to the 19 | package. 20 | 21 | Installing NFLWin with those extra dependencies is accomplished by 22 | adding a parameter in square brackets:: 23 | 24 | $ pip install nflwin[plotting] #Adds matplotlib for plotting 25 | $ pip install nflwin[nfldb] #Dependencies for using nfldb 26 | $ pip install nflwin[dev] #Everything you need to develop on NFLWin 27 | 28 | .. note:: 29 | NFLWin depends on the scipy library, which is notoriously difficult 30 | to install via pip or from source. One option if you're having 31 | difficulty getting scipy installed is to use the `Conda 32 | `_ package manager. 
After installing 33 | Conda, you can create a new environment and install dependencies 34 | manually before pip installing NFLWin:: 35 | 36 | $ conda create -n nflwin-env python=2.7 numpy scipy scikit-learn pandas 37 | 38 | Bleeding Edge 39 | --------------------------- 40 | If you want the most recent stable version you can install directly 41 | from GitHub:: 42 | 43 | $ pip install git+https://github.com/AndrewRook/NFLWin.git@master#egg=nflwin 44 | 45 | You can append the arguments for the extra dependencies in the same 46 | way as for the installation from PyPI. 47 | 48 | .. note:: 49 | GitHub installs **do not** come with the default model. If you want 50 | to use a GitHub install with the default model, you'll need to 51 | install NFLWin from PyPI somewhere else and then copy the model 52 | into the model directory from your GitHub install. If you need to 53 | figure out where that directory is, print 54 | ``model.WPModel.model_directory``. 55 | -------------------------------------------------------------------------------- /make_default_model.py: -------------------------------------------------------------------------------- 1 | """A simple script to create, train, validate, and save the default model""" 2 | from __future__ import division, print_function 3 | 4 | import datetime as dt 5 | import time 6 | import os 7 | 8 | from nflwin import model 9 | 10 | def main(): 11 | start = time.time() 12 | win_probability_model = model.WPModel() 13 | 14 | training_seasons = [2009, 2010, 2011, 2012, 2013, 2014] 15 | validation_seasons = [2015] 16 | season_types = ["Regular", "Postseason"] 17 | 18 | win_probability_model.train_model(training_seasons=training_seasons, 19 | training_season_types=season_types) 20 | print("Took {0:.2f}s to build model".format(time.time() - start)) 21 | 22 | start = time.time() 23 | max_deviation, residual_area = win_probability_model.validate_model(validation_seasons=validation_seasons, 24 | validation_season_types=season_types) 25 | print("Took {0:.2f}s to validate model, with a max residual of {1:.2f} and a residual area of {2:.2f}" 26 | .format(time.time() - start, max_deviation, residual_area)) 27 | 28 | win_probability_model.save_model() 29 | 30 | ax = win_probability_model.plot_validation(label="max deviation={0:.2f}, \n" 31 | "residual total area={1:.2f}" 32 | "".format(max_deviation, residual_area)) 33 | curr_datetime = dt.datetime.now() 34 | ax.set_title("Model Generated At: " + curr_datetime.strftime("%Y-%m-%d %H:%M:%S")) 35 | ax.legend(loc="lower right", fontsize=10) 36 | ax.text(0.02, 0.98, ("Data from: {0:s}\n" 37 | "Training season(s): {1:s}\n" 38 | "Validation season(s): {2:s}" 39 | "".format(", ".join(season_types), 40 | ", ".join(str(year) for year in training_seasons), 41 | ", ".join(str(year) for year in validation_seasons))), 42 | ha="left", va="top", fontsize=10, transform=ax.transAxes) 43 | 44 | this_filepath = os.path.dirname(os.path.abspath(__file__)) 45 | save_filepath = os.path.join(this_filepath, "doc", "source", "_static", "validation_plot.png") 46 | ax.figure.savefig(save_filepath) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import tarfile 4 | import warnings 5 | from setuptools import setup, find_packages 6 | from setuptools.command.install import install as _install 7 | 8 | 
################################################################### 9 | #Boilerplate I modified from the internet 10 | 11 | VERSION_FILE = "nflwin/_version.py" 12 | version_string = open(VERSION_FILE, "r").read() 13 | version_re = r"^__version__ = [u]{0,1}['\"]([^'\"]*)['\"]" 14 | version_match = re.search(version_re, version_string, re.M) 15 | if version_match: 16 | VERSION = version_match.group(1) 17 | else: 18 | raise RuntimeError("Unable to find version string in {0}".format(VERSION_FILE)) 19 | 20 | NAME = "nflwin" 21 | PACKAGES = find_packages(where=".") 22 | META_PATH = os.path.join(NAME, "__init__.py") 23 | KEYWORDS = ['NFL','WP','Win Probability'] 24 | CLASSIFIERS = [ 25 | "Development Status :: 4 - Beta", 26 | "Natural Language :: English", 27 | "License :: OSI Approved :: MIT License", 28 | "Operating System :: OS Independent", 29 | "Programming Language :: Python", 30 | "Programming Language :: Python :: 2", 31 | "Programming Language :: Python :: 2.7", 32 | ] 33 | INSTALL_REQUIRES = ['numpy', 34 | 'scipy', 35 | 'pandas', 36 | 'scikit-learn', 37 | 'joblib',] 38 | 39 | EXTRAS_REQUIRE = { 40 | "plotting": ["matplotlib"], 41 | "nfldb": ["nfldb", "sqlalchemy"], 42 | "dev": ["matplotlib", "nfldb", "sqlalchemy", "pytest", "pytest-cov", "sphinx", "numpydoc"] 43 | } 44 | 45 | PACKAGE_DATA = {"nflwin": ["models/default_model.nflwin*"]} 46 | 47 | HERE = os.path.abspath(os.path.dirname(__file__)) 48 | README = None 49 | with open(os.path.join(HERE, 'README.rst'),'r') as f: 50 | README = f.read() 51 | 52 | ################################################################### 53 | 54 | if __name__ == "__main__": 55 | setup( 56 | name=NAME, 57 | description='A Python implementation of NFL Win Probability (WP)', 58 | license='MIT', 59 | url='https://github.com/AndrewRook/NFLWin', 60 | version=VERSION, 61 | author='Andrew Schechtman-Rook', 62 | author_email='footballastronomer@gmail.com', 63 | maintainer='Andrew Schechtman-Rook', 64 | maintainer_email='footballastronomer@gmail.com', 65 | keywords=KEYWORDS, 66 | long_description=README, 67 | packages=PACKAGES, 68 | package_data=PACKAGE_DATA, 69 | classifiers=CLASSIFIERS, 70 | install_requires=INSTALL_REQUIRES, 71 | extras_require=EXTRAS_REQUIRE 72 | ) 73 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | NFLWin 3 | ================================== 4 | 5 | NFLWin is designed from the ground up to provide two things: 6 | 7 | * A simple-to-use interface for users to compute Win Probabilities 8 | (WP) for NFL plays based on a built-in WP model. 9 | * A robust framework for improving estimates of WP. 10 | 11 | NFLWin builds on `scikit-learn's `_ 12 | ``fit``-``transform`` idiom, allowing for pipelines that take in raw 13 | box score data and return estimated WPs - all data 14 | preprocessing takes place behind the scenes. Additionally, 15 | these preprocessing steps can be easily reordered, replaced, and/or 16 | extended, allowing for rapid iteration and prototyping of potential 17 | improvements to the WP model. 18 | 19 | NFLWin also has built-in support for efficiently querying data from 20 | `nfldb `_ directly into a format 21 | useable by the built-in WP model, although the model is fully 22 | data-source-agnostic as long as the data is formatted properly for the 23 | model to parse. 
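For example, because the fitted estimator is held as an ordinary scikit-learn object, its preprocessing steps can be listed (and, with care, replaced) like those of any other Pipeline. The snippet below is a minimal sketch, assuming the ``model`` attribute of the loaded default model holds the standard Pipeline; the exact step names may differ depending on how the pipeline was constructed:

.. code-block:: python

    >>> from nflwin.model import WPModel
    >>> standard_model = WPModel.load_model()
    >>> # The fitted estimator lives on the ``model`` attribute; when it is a
    >>> # Pipeline, ``steps`` is a list of (name, transformer_or_estimator) pairs.
    >>> for name, step in standard_model.model.steps: #doctest: +SKIP
    ...     print(name, type(step).__name__)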
24 | 25 | Quickstart 26 | --------------- 27 | 28 | NFLWin is ``pip``-installable:: 29 | 30 | $ pip install nflwin 31 | 32 | .. note:: NFLWin depends on `SciPy `_, which 33 | is notoriously difficult to install properly via 34 | ``pip``. You may wish to use the `Conda 35 | `_ package manager to install 36 | Scipy before installing NFLWin. 37 | 38 | When installed via ``pip``, NFLWin comes with a working Win Probability model out-of-the-box: 39 | 40 | .. code-block:: python 41 | 42 | >>> from nflwin.model import WPModel 43 | >>> standard_model = WPModel.load_model() 44 | 45 | The default model can be inspected to learn what data it requires: 46 | 47 | .. code-block:: python 48 | 49 | >>> standard_model.column_descriptions 50 | {'home_team': 'Abbreviation for the home team', 'yardline': "The yardline, given by (yards from own goalline - 50). -49 is your own 1 while 49 is the opponent's 1.", 'seconds_elapsed': 'Seconds elapsed in the quarter', 'down': 'The current down', 'curr_away_score': 'Abbreviation for the visiting team', 'offense_team': 'Abbreviation for the offensive team', 'yards_to_go': 'Yards to a first down (or the endzone)', 'quarter': 'The quarter'} 51 | 52 | 53 | 54 | NFLWin operates on `Pandas `_ DataFrames: 55 | 56 | .. code-block:: python 57 | 58 | >>> import pandas as pd 59 | >>> plays = pd.DataFrame({ 60 | ... "quarter": ["Q1", "Q2", "Q4"], 61 | ... "seconds_elapsed": [0, 0, 600], 62 | ... "offense_team": ["NYJ", "NYJ", "NE"], 63 | ... "yardline": [-20, 20, 35], 64 | ... "down": [1, 3, 3], 65 | ... "yards_to_go": [10, 2, 10], 66 | ... "home_team": ["NYJ", "NYJ", "NYJ"], 67 | ... "away_team": ["NE", "NE", "NE"], 68 | ... "curr_home_score": [0, 0, 21], 69 | ... "curr_away_score": [0, 0, 10] 70 | ... }) 71 | 72 | Once data is loaded, using the model to predict WP is easy: 73 | 74 | .. code-block:: python 75 | 76 | >>> standard_model.predict_wp(plays) 77 | array([ 0.58300397, 0.64321796, 0.18195466]) 78 | 79 | Current Default Model 80 | --------------------- 81 | 82 | .. image:: _static/validation_plot.png 83 | 84 | Why NFLWin? 85 | -------------- 86 | Put simply, there are no other options: while WP models have been 87 | widely used in NFL analytics for years, the analytics community has 88 | almost totally dropped the ball in making these models available for the 89 | general public or even explaining their algorithms at all. 90 | 91 | For a (much) longer explanation, see the `PhD Football blog 92 | `_. 93 | 94 | 95 | Resources 96 | ------------ 97 | 98 | .. toctree:: 99 | :maxdepth: 2 100 | :hidden: 101 | :caption: Links 102 | 103 | installation.rst 104 | model.rst 105 | Using nfldb 106 | Developer Documentation 107 | Full API Documentation 108 | 109 | -------------------------------------------------------------------------------- /increment_version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################## 4 | # This script intelligently increments NFLWin's version, 5 | # based on the rules of semantic versioning. 6 | # It does the following: 7 | # 1. Parse command line arguments to determine whether to 8 | # increment major, minor, or patch version. 9 | # 2. Makes sure it's not on the master branch. 10 | # 3. Makes sure there aren't any changes that have been 11 | # staged but not committed. 12 | # 4. Makes sure there aren't any changes that have been 13 | # committed but not pushed. 14 | # 5. Makes sure all unit tests pass. 15 | # 6. 
Compares current version in nflwin/_version.py to most recent 16 | # git tag to make sure they're the same. 17 | # 7. Figures out what the new version should be. 18 | # 8. Updates nflwin/_version.py to the new version. 19 | # 9. Uploads package to PyPI. 20 | #10. Adds and commits nflwin/_version.py with commit message 21 | # "bumped [TYPE] version to [VERSION]", where [TYPE] is major, minor, or patch. 22 | #11. Tags latest commit with version number (no 'v'). 23 | #12. Pushes commit and tag. 24 | ######################################################################## 25 | 26 | set -e 27 | 28 | #Parse command line arguments: 29 | if [ "$#" -ne 1 ]; then 30 | echo "Syntax: ./increment_version.sh [major|minor|patch]" 31 | exit 1 32 | fi 33 | 34 | VERSION_TYPE=`echo "$1" | tr '[:upper:]' '[:lower:]'` 35 | 36 | if [ "$VERSION_TYPE" != "major" -a "$VERSION_TYPE" != "minor" -a "$VERSION_TYPE" != "patch" ]; then 37 | echo "Version type must be one of 'major', 'minor', or 'patch'" 38 | exit 1 39 | fi 40 | 41 | #Ensure we're not on master: 42 | CURRENT_BRANCH=`git rev-parse --abbrev-ref HEAD` 43 | if [ "$CURRENT_BRANCH" == "master" ]; then 44 | echo "Must not be on master branch" 45 | exit 1 46 | fi 47 | 48 | #Make sure there aren't any staged changes: 49 | STAGED_CHANGES_FLAG=`git status | grep "Changes to be committed" | wc -l` 50 | if [ $STAGED_CHANGES_FLAG -ne 0 ]; then 51 | echo "Must not have any staged changes" 52 | exit 1 53 | fi 54 | 55 | #Make sure there aren't any unpushed changes: 56 | git pull #Do this first to sync things 57 | 58 | UP_TO_DATE_FLAG=`git status | sed -n 2p | grep "Your branch is up-to-date with" | wc -l` 59 | if [ $UP_TO_DATE_FLAG -eq 0 ]; then 60 | echo "Must not have any unpushed changes" 61 | exit 1 62 | fi 63 | 64 | #Make sure all unit tests pass: 65 | ./run_tests.sh #Will return 1 if any tests fail, thus triggering the set -e flag. 66 | 67 | #Get version in nflwin/_version.py 68 | VERSION_PY=`grep "^__version__" nflwin/_version.py | awk '{print $NF}' | tr -d \" | tr -d u` 69 | 70 | #Get version in git: 71 | VERSION_GIT=`git describe --tags $(git rev-list --tags --max-count=1)` 72 | 73 | #Ensure versions are the same: 74 | if [ $VERSION_PY != $VERSION_GIT ]; then 75 | echo "Versions must match! Python version=${VERSION_PY}, git version=${VERSION_GIT}" 76 | exit 1 77 | fi 78 | 79 | #Determines what new version should be: 80 | MAJOR=`echo $VERSION_PY | awk -F"." '{print $1}'` 81 | MINOR=`echo $VERSION_PY | awk -F"." '{print $2}'` 82 | PATCH=`echo $VERSION_PY | awk -F"." '{print $3}'` 83 | if [ $VERSION_TYPE == "patch" ]; then 84 | PATCH=$(expr $PATCH + 1) 85 | elif [ $VERSION_TYPE == "minor" ]; then 86 | MINOR=$(expr $MINOR + 1) 87 | PATCH=0 88 | else 89 | MAJOR=$(expr $MAJOR + 1) 90 | MINOR=0 91 | PATCH=0 92 | fi 93 | NEW_VERSION="$MAJOR.$MINOR.$PATCH" 94 | 95 | #Update nflwin/_version.py: 96 | sed -i.bak "s/${VERSION_PY}/${NEW_VERSION}/" nflwin/_version.py 97 | rm nflwin/_version.py.bak 98 | 99 | #Upload package to PyPI: 100 | python setup.py sdist upload -r pypi 101 | 102 | #Stage and commit nflwin/_version.py 103 | git add nflwin/_version.py 104 | git commit -m "bumped ${VERSION_TYPE} version to ${NEW_VERSION}" 105 | 106 | #Tag the commit: 107 | git tag -a ${NEW_VERSION} -m "bumped ${VERSION_TYPE}" 108 | 109 | #Push the commit and tag: 110 | git push 111 | git push origin ${NEW_VERSION} 112 | 113 | echo "finished!" 
114 | 115 | exit 0 116 | -------------------------------------------------------------------------------- /doc/source/dev.rst: -------------------------------------------------------------------------------- 1 | For Developers 2 | ========================= 3 | 4 | This section of the documentation covers things that will be useful for those already contributing to NFLWin. 5 | 6 | .. note:: 7 | Unless stated otherwise assume that all filepaths given in this section start at the root directory for the repo. 8 | 9 | Testing Documentation 10 | ------------------------------------------ 11 | 12 | Documentation for NFLWin is hosted at `Read the Docs `_, and is built automatically when changes are made on the master branch or a release is cut. However, oftentimes it's valuable to display NFLWin's documentation locally as you're writing. To do this, run the following:: 13 | 14 | $ ./build_local_documentation.sh 15 | 16 | When that command finishes, open up ``doc/index.html`` in your browser of choice to see the site. 17 | 18 | Updating the Default Model 19 | -------------------------------------- 20 | 21 | NFLWin comes with a pre-trained model, but if the code generating that model is updated **the model itself is not**. So you have to update it yourself. The good news, however, is that there's a script for that:: 22 | 23 | $ python make_default_model.py 24 | 25 | .. note:: 26 | This script hardcodes in the seasons to use for training and 27 | testing samples. After each season those will likely need to be 28 | updated to use the most up-to-date data. 29 | 30 | .. note:: 31 | This script requires ``matplotlib`` in order to run, as it produces a 32 | validation plot for the documentation. 33 | 34 | Cutting a New Release 35 | ---------------------------------- 36 | NFLWin uses `semantic versioning `_, which basically boils down to the following (taken directly from the webpage linked earlier in this sentence): 37 | 38 | Given a version number MAJOR.MINOR.PATCH, increment the: 39 | 40 | 1. MAJOR version when you make incompatible API changes, 41 | 2. MINOR version when you add functionality in a backwards-compatible manner, and 42 | 3. PATCH version when you make backwards-compatible bug fixes. 43 | 44 | Basically, unless you change something drastic you leave the major version alone (the exception being going to version 1.0.0, which indicates the first release where the interface is considered "stable"). 45 | 46 | The trick here is to note that information about a new release must live in a few places: 47 | 48 | * In ``nflwin/_version.py`` as the value of the ``__version__`` variable. 49 | * As a tagged commit. 50 | * As a release on GitHub. 51 | * As an upload to PyPI. 52 | * (If necessary) as a documented release on Read the Docs. 53 | 54 | Changing the version in one place but not in others can have relatively minor but fairly annoying consequences. To help manage the release cutting process there is a shell script that automates significant parts of this process:: 55 | 56 | $ ./increment_version.sh [major|minor|patch] 57 | 58 | This script does a bunch of things, namely: 59 | 60 | 1. Parse command line arguments to determine whether to 61 | increment major, minor, or patch version. 62 | 2. Makes sure it's not on the master branch. 63 | 3. Makes sure there aren't any changes that have been 64 | staged but not committed. 65 | 4. Makes sure there aren't any changes that have been 66 | committed but not pushed. 67 | 5. Makes sure all unit tests pass. 68 | 6. 
Compares current version in nflwin/_version.py to most recent 69 | git tag to make sure they're the same. 70 | 7. Figures out what the new version should be. 71 | 8. Updates nflwin/_version.py to the new version. 72 | 9. Uploads package to PyPI. 73 | 10. Adds and commits nflwin/_version.py with commit message 74 | "bumped [TYPE] version to [VERSION]", where [TYPE] is major, minor, or patch. 75 | 11. Tags latest commit with version number (no 'v'). 76 | 12. Pushes commit and tag. 77 | 78 | It will exit if **anything** returns with a non-zero exit status, and since it waits until the very end to upload anything to PyPI or GitHub if you do run into an error in most cases you can fix it and then just re-run the script. 79 | 80 | The process for cutting a release is as follows: 81 | 82 | 1. Make double sure that you're on a branch that's not ``master`` and you're ready to cut a new release (general good practice is to branch off from master *just* for the purpose of making a new release). 83 | 2. Run the ``increment_version.sh`` script. 84 | 3. Fix any errors, then rerun the script until it passes. 85 | 4. Make a PR on GitHub into master, and merge it in (self-merge is ok if branch is just updating version). 86 | 5. Make release notes for new release on GitHub. 87 | 6. (If necessary) go to Read the Docs and activate the new release. 88 | -------------------------------------------------------------------------------- /doc/source/nfldb.rst: -------------------------------------------------------------------------------- 1 | .. _nfldb-install: 2 | 3 | Using Data From nfldb 4 | ======================================= 5 | 6 | NFLWin comes with robust support for querying data from `nfldb 7 | `_, a package designed to 8 | facilitate downloading and accessing play-by-play data. There are 9 | functions to query the nfldb database in :py:mod:`nflwin.utilities`, 10 | and :py:class:`nflwin.model.WPModel` has keyword arguments that allow 11 | you to directly use nfldb data to fit and validate a WP model. Using 12 | nfldb is totally optional: a default model is already fit and ready to 13 | use, and NFLWin is fully compatible with any source for play-by-play 14 | data. However, nfldb is one of the few free sources of up-to-date NFL 15 | data and so it may be a useful resource to have. 16 | 17 | 18 | Installing nfldb 19 | -------------------------------- 20 | 21 | nfldb is pip-installable, and can be installed as an extra dependency 22 | (``pip install nflwin[nfldb]``). Without setting up the nfldb 23 | Postgres database first, however, the pip install will succeed but 24 | nfldb will be unuseable. What's more, trying to set up the database 25 | *after* installing nfldb may fail as well. 26 | 27 | The nfldb wiki has `fairly decent installation instructions 28 | `_, but I know 29 | that when I went through the installation process I had to interpret 30 | and adjust several steps. I'd at least recommend reading through the 31 | wiki first, but in case it's useful 32 | I've listed the steps I followed below (for reference I was on Mac OS 10.10). 33 | 34 | 35 | Installing Postgres 36 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 37 | I had an old install kicking around, so I first had to clean that up. 
38 | Since I was using `Homebrew `_:: 39 | 40 | $ brew uninstall -force postgresql 41 | $ rm -rf /usr/local/var/postgres/ # where I'd installed the prior DB 42 | 43 | Then install a fresh version:: 44 | 45 | $ brew update 46 | $ brew install postgresql 47 | 48 | 49 | Start Postgres and Create a Default DB 50 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 51 | You can choose to run Postgres at startup, but I don't use it that 52 | often so I choose not to do those steps - I just run it in the 53 | foreground with this command:: 54 | 55 | $ postgres -D /usr/local/var/postgres 56 | 57 | Or in the background with this command:: 58 | 59 | $ pg_ctl -D /usr/local/var/postgres -l logfile start 60 | 61 | If you don't create a default database based on your username, 62 | launching Postgres will fail with a ``psql: FATAL: database 63 | "USERNAME" does not exist`` error:: 64 | 65 | $ createdb `whoami` 66 | 67 | Check that the install and configuration went well by launching 68 | Postgres as your default user:: 69 | 70 | $ psql 71 | psql (9.5.2) 72 | Type "help" for help. 73 | 74 | USERNAME=# 75 | 76 | Next, add a password:: 77 | 78 | USERNAME=# ALTER ROLE "USERNAME" WITH ENCRYPTED PASSWORD 'choose a 79 | superuser password'; 80 | USERNAME=# \q; 81 | 82 | Edit the ``pg_hba.conf``file found in your database (in my case the 83 | file was 84 | ``/usr/local/var/postgres/pg_hba.conf``), and change all instances of 85 | ``trust`` to ``md5``. 86 | 87 | Create nfldb Postgres User and Database 88 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 89 | Start by making a user:: 90 | 91 | $ createuser -U USERNAME -E -P nfldb 92 | 93 | where you replace ``USERNAME`` with your actual username. Make up a 94 | new password. Then make the nfldb database:: 95 | 96 | $ createdb -U USERNAME -O nfldb nfldb 97 | 98 | You'll need to enter the password for the USERNAME account. Next, add 99 | the fuzzy string matching extension:: 100 | 101 | $ psql -U USERNAME -c 'CREATE EXTENSION fuzzystrmatch;' nfldb 102 | 103 | You should now be able to connect the nfldb user to the nfldb 104 | database:: 105 | 106 | $ psql -U nfldb nfldb 107 | 108 | From this point you should be able to follow along with the 109 | instructions from `nfldb 110 | `_. 111 | 112 | Using nfldb 113 | ---------------------- 114 | 115 | Once nfldb is properly installed, you can use it with NFLwin in a 116 | couple of different ways. 117 | 118 | Querying Data 119 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 120 | nfldb comes with a robust set of options to query its database, but 121 | they tend to be designed more for ad hoc querying of small amounts of 122 | data or computing aggregate statistics. It's possible to use built-in 123 | nfldb queries to get the data NFLWin needs, but it's *slow*. So NFLWin 124 | has built in support for bulk queries of nfldb in the 125 | :py:mod:`nflwin.utilities` module:: 126 | 127 | >>> from nflwin import utilities 128 | >>> data = utilities.get_nfldb_play_data(season_years=[2010], 129 | ... 
season_types=["Regular", "Postseason"]) 130 | >>> data.head() 131 | gsis_id drive_id play_id offense_team yardline down yards_to_go \ 132 | 0 2010090900 1 35 MIN -20.0 0 0 133 | 1 2010090900 1 57 NO -27.0 1 10 134 | 2 2010090900 1 81 NO 1.0 1 10 135 | 3 2010090900 1 109 NO 13.0 1 10 136 | 4 2010090900 1 135 NO 13.0 2 10 137 | 138 | home_team away_team offense_won quarter seconds_elapsed curr_home_score \ 139 | 0 NO MIN False Q1 0.0 0 140 | 1 NO MIN True Q1 4.0 0 141 | 2 NO MIN True Q1 39.0 0 142 | 3 NO MIN True Q1 79.0 0 143 | 4 NO MIN True Q1 84.0 0 144 | 145 | curr_away_score 146 | 0 0 147 | 1 0 148 | 2 0 149 | 3 0 150 | 4 0 151 | 152 | You can see the `docstring `_ for more details, but basically ``get_nfldb_play_data`` queries 153 | the nfldb database directly for columns relevant to estimating WP, 154 | does some simple parsing/preprocessing to get them in the right format, 155 | then returns them as a dataframe. Keyword arguments control what parts 156 | of seasons are queried. 157 | 158 | Integration with WPModel 159 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 160 | 161 | While you can train NFLWin's win probability model 162 | (:py:class:`nflwin.model.WPModel`) with whatever data you want, it 163 | comes with keyword arguments that allow you to query nfldb 164 | directly. For instance, to train the default model on the 2009 and 2010 165 | regular seasons from nfldb, you'd enter the following:: 166 | 167 | >>> from nflwin.model import WPModel 168 | >>> model = WPModel() 169 | >>> model.create_default_pipeline() #doctest: +ELLIPSIS 170 | Pipeline(...) 171 | >>> model.train_model(source_data="nfldb", 172 | ... training_seasons=[2009, 2010], 173 | ... training_season_types=["Regular"]) 174 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. epub3 to make an epub3 31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 32 | echo. text to make text files 33 | echo. man to make manual pages 34 | echo. texinfo to make Texinfo files 35 | echo. gettext to make PO message catalogs 36 | echo. changes to make an overview over all changed/added/deprecated items 37 | echo. xml to make Docutils-native XML files 38 | echo. pseudoxml to make pseudoxml-XML files for display purposes 39 | echo. linkcheck to check all external links for integrity 40 | echo. doctest to run all doctests embedded in the documentation if enabled 41 | echo. 
coverage to run coverage check of the documentation if enabled 42 | echo. dummy to check syntax errors of document sources 43 | goto end 44 | ) 45 | 46 | if "%1" == "clean" ( 47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 48 | del /q /s %BUILDDIR%\* 49 | goto end 50 | ) 51 | 52 | 53 | REM Check if sphinx-build is available and fallback to Python version if any 54 | %SPHINXBUILD% 1>NUL 2>NUL 55 | if errorlevel 9009 goto sphinx_python 56 | goto sphinx_ok 57 | 58 | :sphinx_python 59 | 60 | set SPHINXBUILD=python -m sphinx.__init__ 61 | %SPHINXBUILD% 2> nul 62 | if errorlevel 9009 ( 63 | echo. 64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 65 | echo.installed, then set the SPHINXBUILD environment variable to point 66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 67 | echo.may add the Sphinx directory to PATH. 68 | echo. 69 | echo.If you don't have Sphinx installed, grab it from 70 | echo.http://sphinx-doc.org/ 71 | exit /b 1 72 | ) 73 | 74 | :sphinx_ok 75 | 76 | 77 | if "%1" == "html" ( 78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 79 | if errorlevel 1 exit /b 1 80 | echo. 81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 82 | goto end 83 | ) 84 | 85 | if "%1" == "dirhtml" ( 86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 87 | if errorlevel 1 exit /b 1 88 | echo. 89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 90 | goto end 91 | ) 92 | 93 | if "%1" == "singlehtml" ( 94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 95 | if errorlevel 1 exit /b 1 96 | echo. 97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 98 | goto end 99 | ) 100 | 101 | if "%1" == "pickle" ( 102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 103 | if errorlevel 1 exit /b 1 104 | echo. 105 | echo.Build finished; now you can process the pickle files. 106 | goto end 107 | ) 108 | 109 | if "%1" == "json" ( 110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 111 | if errorlevel 1 exit /b 1 112 | echo. 113 | echo.Build finished; now you can process the JSON files. 114 | goto end 115 | ) 116 | 117 | if "%1" == "htmlhelp" ( 118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 119 | if errorlevel 1 exit /b 1 120 | echo. 121 | echo.Build finished; now you can run HTML Help Workshop with the ^ 122 | .hhp project file in %BUILDDIR%/htmlhelp. 123 | goto end 124 | ) 125 | 126 | if "%1" == "qthelp" ( 127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 131 | .qhcp project file in %BUILDDIR%/qthelp, like this: 132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\NFLWin.qhcp 133 | echo.To view the help file: 134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\NFLWin.ghc 135 | goto end 136 | ) 137 | 138 | if "%1" == "devhelp" ( 139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 140 | if errorlevel 1 exit /b 1 141 | echo. 142 | echo.Build finished. 143 | goto end 144 | ) 145 | 146 | if "%1" == "epub" ( 147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 148 | if errorlevel 1 exit /b 1 149 | echo. 150 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 151 | goto end 152 | ) 153 | 154 | if "%1" == "epub3" ( 155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 156 | if errorlevel 1 exit /b 1 157 | echo. 158 | echo.Build finished. 
The epub3 file is in %BUILDDIR%/epub3. 159 | goto end 160 | ) 161 | 162 | if "%1" == "latex" ( 163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 164 | if errorlevel 1 exit /b 1 165 | echo. 166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdf" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "latexpdfja" ( 181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 182 | cd %BUILDDIR%/latex 183 | make all-pdf-ja 184 | cd %~dp0 185 | echo. 186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 187 | goto end 188 | ) 189 | 190 | if "%1" == "text" ( 191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 192 | if errorlevel 1 exit /b 1 193 | echo. 194 | echo.Build finished. The text files are in %BUILDDIR%/text. 195 | goto end 196 | ) 197 | 198 | if "%1" == "man" ( 199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 200 | if errorlevel 1 exit /b 1 201 | echo. 202 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 203 | goto end 204 | ) 205 | 206 | if "%1" == "texinfo" ( 207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 208 | if errorlevel 1 exit /b 1 209 | echo. 210 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 211 | goto end 212 | ) 213 | 214 | if "%1" == "gettext" ( 215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 216 | if errorlevel 1 exit /b 1 217 | echo. 218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 219 | goto end 220 | ) 221 | 222 | if "%1" == "changes" ( 223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 224 | if errorlevel 1 exit /b 1 225 | echo. 226 | echo.The overview file is in %BUILDDIR%/changes. 227 | goto end 228 | ) 229 | 230 | if "%1" == "linkcheck" ( 231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 232 | if errorlevel 1 exit /b 1 233 | echo. 234 | echo.Link check complete; look for any errors in the above output ^ 235 | or in %BUILDDIR%/linkcheck/output.txt. 236 | goto end 237 | ) 238 | 239 | if "%1" == "doctest" ( 240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 241 | if errorlevel 1 exit /b 1 242 | echo. 243 | echo.Testing of doctests in the sources finished, look at the ^ 244 | results in %BUILDDIR%/doctest/output.txt. 245 | goto end 246 | ) 247 | 248 | if "%1" == "coverage" ( 249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 250 | if errorlevel 1 exit /b 1 251 | echo. 252 | echo.Testing of coverage in the sources finished, look at the ^ 253 | results in %BUILDDIR%/coverage/python.txt. 254 | goto end 255 | ) 256 | 257 | if "%1" == "xml" ( 258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 259 | if errorlevel 1 exit /b 1 260 | echo. 261 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 262 | goto end 263 | ) 264 | 265 | if "%1" == "pseudoxml" ( 266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 267 | if errorlevel 1 exit /b 1 268 | echo. 269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 270 | goto end 271 | ) 272 | 273 | if "%1" == "dummy" ( 274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy 275 | if errorlevel 1 exit /b 1 276 | echo. 277 | echo.Build finished. Dummy builder generates no files. 
278 | goto end 279 | ) 280 | 281 | :end 282 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | @echo " dummy to check syntax errors of document sources" 51 | 52 | .PHONY: clean 53 | clean: 54 | rm -rf $(BUILDDIR)/* 55 | 56 | .PHONY: html 57 | html: 58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 61 | 62 | .PHONY: dirhtml 63 | dirhtml: 64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 67 | 68 | .PHONY: singlehtml 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
73 | 74 | .PHONY: pickle 75 | pickle: 76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 77 | @echo 78 | @echo "Build finished; now you can process the pickle files." 79 | 80 | .PHONY: json 81 | json: 82 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 83 | @echo 84 | @echo "Build finished; now you can process the JSON files." 85 | 86 | .PHONY: htmlhelp 87 | htmlhelp: 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 92 | 93 | .PHONY: qthelp 94 | qthelp: 95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 96 | @echo 97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/NFLWin.qhcp" 100 | @echo "To view the help file:" 101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/NFLWin.qhc" 102 | 103 | .PHONY: applehelp 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | .PHONY: devhelp 113 | devhelp: 114 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 115 | @echo 116 | @echo "Build finished." 117 | @echo "To view the help file:" 118 | @echo "# mkdir -p $$HOME/.local/share/devhelp/NFLWin" 119 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/NFLWin" 120 | @echo "# devhelp" 121 | 122 | .PHONY: epub 123 | epub: 124 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 125 | @echo 126 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 127 | 128 | .PHONY: epub3 129 | epub3: 130 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 131 | @echo 132 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 133 | 134 | .PHONY: latex 135 | latex: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo 138 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 139 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 140 | "(use \`make latexpdf' here to do that automatically)." 141 | 142 | .PHONY: latexpdf 143 | latexpdf: 144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 145 | @echo "Running LaTeX files through pdflatex..." 146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 148 | 149 | .PHONY: latexpdfja 150 | latexpdfja: 151 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 152 | @echo "Running LaTeX files through platex and dvipdfmx..." 153 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 154 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 155 | 156 | .PHONY: text 157 | text: 158 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 159 | @echo 160 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 161 | 162 | .PHONY: man 163 | man: 164 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 165 | @echo 166 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 167 | 168 | .PHONY: texinfo 169 | texinfo: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo 172 | @echo "Build finished. 
The Texinfo files are in $(BUILDDIR)/texinfo." 173 | @echo "Run \`make' in that directory to run these through makeinfo" \ 174 | "(use \`make info' here to do that automatically)." 175 | 176 | .PHONY: info 177 | info: 178 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 179 | @echo "Running Texinfo files through makeinfo..." 180 | make -C $(BUILDDIR)/texinfo info 181 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 182 | 183 | .PHONY: gettext 184 | gettext: 185 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 186 | @echo 187 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 188 | 189 | .PHONY: changes 190 | changes: 191 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 192 | @echo 193 | @echo "The overview file is in $(BUILDDIR)/changes." 194 | 195 | .PHONY: linkcheck 196 | linkcheck: 197 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 198 | @echo 199 | @echo "Link check complete; look for any errors in the above output " \ 200 | "or in $(BUILDDIR)/linkcheck/output.txt." 201 | 202 | .PHONY: doctest 203 | doctest: 204 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 205 | @echo "Testing of doctests in the sources finished, look at the " \ 206 | "results in $(BUILDDIR)/doctest/output.txt." 207 | 208 | .PHONY: coverage 209 | coverage: 210 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 211 | @echo "Testing of coverage in the sources finished, look at the " \ 212 | "results in $(BUILDDIR)/coverage/python.txt." 213 | 214 | .PHONY: xml 215 | xml: 216 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 217 | @echo 218 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 219 | 220 | .PHONY: pseudoxml 221 | pseudoxml: 222 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 223 | @echo 224 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 225 | 226 | .PHONY: dummy 227 | dummy: 228 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 229 | @echo 230 | @echo "Build finished. Dummy builder generates no files." 231 | -------------------------------------------------------------------------------- /doc/source/model.rst: -------------------------------------------------------------------------------- 1 | Creating a New WP Model 2 | ============================== 3 | While NFLWin ships with a fairly robust default model, there is always 4 | room for improvement. Maybe there's a new dataset you want to use to 5 | train the model, a new feature you want to add, or a new machine 6 | learning model you want to evaluate. 7 | 8 | Good news! NFLWin makes it easy to train a new model, whether you just 9 | want to refresh the data or to do an entire refit from scratch. We'll 10 | start with the simplest case: 11 | 12 | Default Model, New Data 13 | ----------------------- 14 | Refreshing the data with NFLWin is a snap. If you want to change the 15 | data used by the default model but keep the source as nfldb, all you 16 | have to do is override the default keyword arguments when calling the 17 | :meth:`~nflwin.model.WPModel.train_model` and :meth:`~nflwin.model.WPModel.validate_model` 18 | methods. For instance, if for some insane reason you wanted to train on the 2009 and 2010 regular 19 | seasons and validate on the 2011 and 2012 playoffs, you would do the following: 20 | 21 | .. 
code-block:: python 22 | 23 | >>> from nflwin.model import WPModel 24 | >>> new_data_model = WPModel() 25 | >>> new_data_model.train_model(training_seasons=[2009, 2010], training_season_types=["Regular"]) 26 | >>> new_data_model.validate_model(validation_seasons=[2011, 2012], validation_season_types=["Postseason"]) 27 | (21.355462918011327, 565.56909036318007) 28 | 29 | If you want to supply your own data, that's easy too - simply set the 30 | `source_data` kwarg of :meth:`~nflwin.model.WPModel.train_model` and 31 | :meth:`~nflwin.model.WPModel.validate_model` to be a Pandas DataFrame of your training and validation data (respectively): 32 | 33 | .. 34 | from nflwin.utilities import get_nfldb_play_data 35 | training_data = get_nfldb_play_data(season_years=[2012, 2013]) 36 | validation_data = get_nfldb_play_data(season_years=[2014]) 37 | 38 | .. code-block:: python 39 | 40 | >>> from nflwin.model import WPModel 41 | >>> new_data_model = WPModel() 42 | >>> training_data.head() 43 | gsis_id drive_id play_id offense_team yardline down yards_to_go \ 44 | 0 2012090500 1 35 DAL -15.0 0 0 45 | 1 2012090500 1 57 NYG -34.0 1 10 46 | 2 2012090500 1 79 NYG -34.0 2 10 47 | 3 2012090500 1 103 NYG -29.0 3 5 48 | 4 2012090500 1 125 NYG -29.0 4 5 49 | 50 | home_team away_team offense_won quarter seconds_elapsed curr_home_score \ 51 | 0 NYG DAL True Q1 0.0 0 52 | 1 NYG DAL False Q1 4.0 0 53 | 2 NYG DAL False Q1 11.0 0 54 | 3 NYG DAL False Q1 55.0 0 55 | 4 NYG DAL False Q1 62.0 0 56 | 57 | curr_away_score 58 | 0 0 59 | 1 0 60 | 2 0 61 | 3 0 62 | 4 0 63 | >>> new_data_model.train_model(source_data=training_data) 64 | >>> validation_data.head() 65 | gsis_id drive_id play_id offense_team yardline down yards_to_go \ 66 | 0 2014090400 1 36 SEA -15.0 0 0 67 | 1 2014090400 1 58 GB -37.0 1 10 68 | 2 2014090400 1 79 GB -31.0 2 4 69 | 3 2014090400 1 111 GB -26.0 1 10 70 | 4 2014090400 1 132 GB -11.0 1 10 71 | 72 | home_team away_team offense_won quarter seconds_elapsed curr_home_score \ 73 | 0 SEA GB True Q1 0.0 0 74 | 1 SEA GB False Q1 4.0 0 75 | 2 SEA GB False Q1 30.0 0 76 | 3 SEA GB False Q1 49.0 0 77 | 4 SEA GB False Q1 88.0 0 78 | 79 | curr_away_score 80 | 0 0 81 | 1 0 82 | 2 0 83 | 3 0 84 | 4 0 85 | >>> new_data_model.validate_model(source_data=validation_data) 86 | (8.9344062502671591, 265.7971863696315) 87 | 88 | Building a New Model 89 | -------------------- 90 | If you want to construct a totally new model, that's possible 91 | too. Just instantiate 92 | :class:`~nflwin.model.WPModel`, then replace the 93 | :attr:`~nflwin.model.WPModel.model` attribute with either a 94 | scikit-learn `classifier 95 | `_ 96 | or `Pipeline 97 | `_. From 98 | that point :meth:`~nflwin.model.WPModel.train_model` and 99 | :meth:`~nflwin.model.WPModel.validate_model` should work as normal. 100 | 101 | .. note:: 102 | If you create your own model, the 103 | :attr:`~nflwin.model.WPModel.column_descriptions` attribute will no longer be 104 | accurate unless you update it manually. 105 | 106 | .. note:: 107 | If your model uses a data structure other than a Pandas DataFrame, 108 | you will not be able to use the ``source_data="nfldb"`` default 109 | kwarg of :meth:`~nflwin.model.WPModel.train_model` and 110 | :meth:`~nflwin.model.WPModel.validate_model`. If you want to use nfldb 111 | data, query it through :func:`nflwin.utilities.get_nfldb_play_data` 112 | first and convert it from a DataFrame to the format required by your model. 
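As a concrete illustration, here is a minimal sketch of dropping in a different scikit-learn estimator. The choice of ``GradientBoostingClassifier`` and the seasons used are only placeholders; in practice you would typically wrap the classifier in a Pipeline together with the preprocessors described in the next section, so that raw play-by-play columns (team abbreviations, quarter strings, and so on) are converted into numeric features first:

.. code-block:: python

    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> from nflwin.model import WPModel
    >>> custom_model = WPModel()
    >>> # Swap in your own classifier (or a full Pipeline ending in one):
    >>> custom_model.model = GradientBoostingClassifier()
    >>> # Training and validation then work the same way as for the default model:
    >>> custom_model.train_model(training_seasons=[2009, 2010],
    ...                          training_season_types=["Regular"]) #doctest: +SKIP
    >>> custom_model.validate_model(validation_seasons=[2011],
    ...                             validation_season_types=["Regular"]) #doctest: +SKIP

Once trained, the new model can be saved and reloaded with the methods described in the Model I/O section below.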
113 | 114 | Using NFLWin's Preprocessors 115 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 116 | While you can completely roll your own WP model from scratch, NFLWin 117 | comes with several classes designed to aid in preprocessing your 118 | data. These can be found in the appropriately named 119 | :mod:`~nflwin.preprocessing` module. Each of these preprocessors inherits 120 | from scikit-learn's BaseEstimator class, and therefore is fully 121 | compatible with scikit-learn Pipelines. Available preprocessors 122 | include: 123 | 124 | * :class:`~nflwin.preprocessing.ComputeElapsedTime`: Convert the time 125 | elapsed in a quarter into the total seconds elapsed in the game. 126 | * :class:`~nflwin.preprocessing.ComputeIfOffenseIsHome`: Create an 127 | indicator variable for whether or not the offense is the home team. 128 | * :class:`~nflwin.preprocessing.CreateScoreDifferential`: Create a 129 | column indicating the difference between the offense and defense 130 | point totals (offense-defense). Uses home team and away team plus 131 | an indicator giving if the offense is the home team to compute. 132 | * :class:`~nflwin.preprocessing.MapToInt`: Map a column of values to 133 | integers. Useful for string columns (e.g. a quarter column with "Q1", "Q2", etc). 134 | * :class:`~nflwin.preprocessing.CheckColumnNames`: Ensure that only the desired data gets passed to 135 | the model in the right order. Useful to guarantee that the 136 | underlying numpy arrays in a Pandas DataFrame used for model 137 | validation are in the same order as they were when the model was 138 | trained. 139 | 140 | To see examples of these preprocessors in use to build a model, look 141 | at :meth:`nflwin.model.WPModel.create_default_pipeline`. 142 | 143 | Model I/O 144 | --------- 145 | To save a model to disk, use the 146 | :meth:`nflwin.model.WPModel.save_model` method. 147 | 148 | .. note:: 149 | If you do not provide 150 | a filename, the default model will be overwritten and in order to 151 | recover it you will need to reinstall NFLWin (which will then 152 | overwrite any non-default models you have saved). 153 | 154 | To load a model from disk, use the 155 | :meth:`nflwin.model.WPModel.load_model` class method. By default this 156 | will load the standard model that comes bundled with pip installs of 157 | NFLWin. Simply specify the ``filename`` kwarg to load a non-standard 158 | model. 159 | 160 | .. note:: 161 | By default, models are saved to and loaded from the path given by 162 | :attr:`nflwin.model.WPModel.model_directory`, which by default is 163 | located inside your NFLWin install. 164 | 165 | Estimating Quality of Fit 166 | ------------------------- 167 | When you care about measuring the probability of a classification 168 | model rather than getting a yes/no prediction it is challenging to 169 | estimate its quality. This is an area I'm actively looking to improve 170 | upon, but for now NFLWin does the following. 171 | 172 | First, it takes the probabilities given by the model for each play in the 173 | validation set, then produces a `kernel density estimate 174 | `_ (KDE) of all 175 | the plays as well as just the ones that were predicted 176 | correctly. The ratio of these two KDEs is the actual WP measured 177 | from the test data set at a given *predicted* WP. 
While all of this is 178 | measured in :meth:`~nflwin.model.WPModel.validate_model`, you can plot 179 | it for yourself by calling the 180 | :meth:`~nflwin.model.WPModel.plot_validation` method, which will 181 | generate a plot like that shown on the home page. 182 | 183 | From there NFLWin computes both the maximum deviation at any given 184 | percentage and the total area between the estimated WP from the model 185 | and what would be expected if the model was perfect - that's what is 186 | actually returned by 187 | :meth:`~nflwin.model.WPModel.validate_model`. This is obviously not 188 | ideal given that it's not directly estimating uncertainties in 189 | the model, but it's the best I've been able to come up with so far. If anyone 190 | has an idea for how to do this better I would welcome it enthusiastically. 191 | -------------------------------------------------------------------------------- /nflwin/tests/test_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | import os 4 | import collections 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | 10 | from nflwin import model 11 | 12 | 13 | class TestDefaults(object): 14 | """Tests for defaults.""" 15 | 16 | def test_column_descriptions_set(self): 17 | wpmodel = model.WPModel() 18 | assert isinstance(wpmodel.column_descriptions, collections.Mapping) 19 | 20 | class TestModelTrain(object): 21 | """Tests for the train_model method.""" 22 | 23 | def test_bad_string(self): 24 | wpmodel = model.WPModel() 25 | with pytest.raises(ValueError): 26 | wpmodel.train_model(source_data="this is a bad string") 27 | 28 | def test_dataframe_input(self): 29 | wpmodel = model.WPModel() 30 | test_data = {'offense_won': {0: True, 1: False, 2: False, 31 | 3: False, 4: False, 5: True, 32 | 6: True, 7: True, 8: True, 9: False}, 33 | 'home_team': {0: 'NYG', 1: 'NYG', 2: 'NYG', 3: 'NYG', 34 | 4: 'NYG', 5: 'NYG', 6: 'NYG', 7: 'NYG', 35 | 8: 'NYG', 9: 'NYG'}, 36 | 'away_team': {0: 'DAL', 1: 'DAL', 2: 'DAL', 3: 'DAL', 37 | 4: 'DAL', 5: 'DAL', 6: 'DAL', 7: 'DAL', 38 | 8: 'DAL', 9: 'DAL'}, 39 | 'gsis_id': {0: '2012090500', 1: '2012090500', 2: '2012090500', 40 | 3: '2012090500', 4: '2012090500', 5: '2012090500', 41 | 6: '2012090500', 7: '2012090500', 8: '2012090500', 42 | 9: '2012090500'}, 43 | 'play_id': {0: 35, 1: 57, 2: 79, 3: 103, 4: 125, 5: 150, 44 | 6: 171, 7: 190, 8: 212, 9: 252}, 45 | 'seconds_elapsed': {0: 0.0, 1: 4.0, 2: 11.0, 3: 55.0, 4: 62.0, 46 | 5: 76.0, 6: 113.0, 7: 153.0, 8: 159.0, 9: 171.0}, 47 | 'down': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 1, 6: 2, 7: 3, 8: 4, 9: 1}, 48 | 'curr_home_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 49 | 'offense_team': {0: 'DAL', 1: 'NYG', 2: 'NYG', 3: 'NYG', 50 | 4: 'NYG', 5: 'DAL', 6: 'DAL', 7: 'DAL', 51 | 8: 'DAL', 9: 'NYG'}, 52 | 'curr_away_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 53 | 'yardline': {0: -15.0, 1: -34.0, 2: -34.0, 3: -29.0, 54 | 4: -29.0, 5: -26.0, 6: -23.0, 7: -31.0, 8: -31.0, 9: -37.0}, 55 | 'drive_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2, 9: 3}, 56 | 'yards_to_go': {0: 0, 1: 10, 2: 10, 3: 5, 4: 5, 5: 10, 6: 7, 7: 15, 8: 15, 9: 10}, 57 | 'quarter': {0: 'Q1', 1: 'Q1', 2: 'Q1', 3: 'Q1', 4: 'Q1', 58 | 5: 'Q1', 6: 'Q1', 7: 'Q1', 8: 'Q1', 9: 'Q1'} 59 | } 60 | test_df = pd.DataFrame(test_data) 61 | wpmodel.train_model(source_data=test_df) 62 | 63 | class TestModelValidate(object): 64 | """Tests for the validate_model 
method.""" 65 | 66 | def setup_method(self, method): 67 | self.test_data = {'offense_won': {0: True, 1: False, 2: False, 68 | 3: False, 4: False, 5: True, 69 | 6: True, 7: True, 8: True, 9: False}, 70 | 'home_team': {0: 'NYG', 1: 'NYG', 2: 'NYG', 3: 'NYG', 71 | 4: 'NYG', 5: 'NYG', 6: 'NYG', 7: 'NYG', 72 | 8: 'NYG', 9: 'NYG'}, 73 | 'away_team': {0: 'DAL', 1: 'DAL', 2: 'DAL', 3: 'DAL', 74 | 4: 'DAL', 5: 'DAL', 6: 'DAL', 7: 'DAL', 75 | 8: 'DAL', 9: 'DAL'}, 76 | 'gsis_id': {0: '2012090500', 1: '2012090500', 2: '2012090500', 77 | 3: '2012090500', 4: '2012090500', 5: '2012090500', 78 | 6: '2012090500', 7: '2012090500', 8: '2012090500', 79 | 9: '2012090500'}, 80 | 'play_id': {0: 35, 1: 57, 2: 79, 3: 103, 4: 125, 5: 150, 81 | 6: 171, 7: 190, 8: 212, 9: 252}, 82 | 'seconds_elapsed': {0: 0.0, 1: 4.0, 2: 11.0, 3: 55.0, 4: 62.0, 83 | 5: 76.0, 6: 113.0, 7: 153.0, 8: 159.0, 9: 171.0}, 84 | 'down': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 1, 6: 2, 7: 3, 8: 4, 9: 1}, 85 | 'curr_home_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 86 | 'offense_team': {0: 'DAL', 1: 'NYG', 2: 'NYG', 3: 'NYG', 87 | 4: 'NYG', 5: 'DAL', 6: 'DAL', 7: 'DAL', 88 | 8: 'DAL', 9: 'NYG'}, 89 | 'curr_away_score': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}, 90 | 'yardline': {0: -15.0, 1: -34.0, 2: -34.0, 3: -29.0, 91 | 4: -29.0, 5: -26.0, 6: -23.0, 7: -31.0, 8: -31.0, 9: -37.0}, 92 | 'drive_id': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2, 9: 3}, 93 | 'yards_to_go': {0: 0, 1: 10, 2: 10, 3: 5, 4: 5, 5: 10, 6: 7, 7: 15, 8: 15, 9: 10}, 94 | 'quarter': {0: 'Q1', 1: 'Q1', 2: 'Q1', 3: 'Q1', 4: 'Q1', 95 | 5: 'Q1', 6: 'Q1', 7: 'Q1', 8: 'Q1', 9: 'Q1'} 96 | } 97 | self.test_df = pd.DataFrame(self.test_data) 98 | 99 | def test_bad_string(self): 100 | wpmodel = model.WPModel() 101 | wpmodel.train_model(source_data=self.test_df) 102 | with pytest.raises(ValueError): 103 | wpmodel.validate_model(source_data="this is bad data") 104 | 105 | 106 | def test_dataframe_input(self): 107 | wpmodel = model.WPModel() 108 | wpmodel.train_model(source_data=self.test_df) 109 | wpmodel.validate_model(source_data=self.test_df) 110 | 111 | class TestTestDistribution(object): 112 | """Tests the _test_distribution static method of WPModel.""" 113 | 114 | def test_simple_case(self): 115 | input_probabilities = [0.1, 0.2, 0.3] 116 | input_predicted_win_percents = [0.1, 0.2, 0.3] 117 | input_num_plays_used = [10, 10, 10] 118 | 119 | expected_output = 1.0 120 | 121 | assert (expected_output - 122 | model.WPModel._test_distribution(input_probabilities, 123 | input_predicted_win_percents, 124 | input_num_plays_used) 125 | ) < 1e-5 126 | 127 | def test_more_complicated_case(self): 128 | input_probabilities = [0.1, 0.2, 0.4] 129 | input_predicted_win_percents = [0.1, 0.2, 0.3] 130 | input_num_plays_used = [10, 10, 100000] 131 | 132 | expected_output = 0.0 133 | 134 | assert (expected_output - 135 | model.WPModel._test_distribution(input_probabilities, 136 | input_predicted_win_percents, 137 | input_num_plays_used) 138 | ) < 1e-5 139 | 140 | 141 | class TestModelIO(object): 142 | """Tests functions that deal with model saving and loading""" 143 | 144 | def teardown_method(self, method): 145 | 146 | try: 147 | os.remove(self.expected_path) 148 | except OSError: 149 | pass 150 | 151 | def test_model_save_default(self): 152 | instance = model.WPModel() 153 | model_name = "test_model_asljasljt.nflwin" 154 | instance._default_model_filename = model_name 155 | 156 | self.expected_path = os.path.join( 157 | os.path.join( 158 | 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models") 159 | , model_name) 160 | 161 | assert os.path.isfile(self.expected_path) is False 162 | 163 | instance.save_model() 164 | 165 | assert os.path.isfile(self.expected_path) is True 166 | 167 | def test_model_save_specified(self): 168 | instance = model.WPModel() 169 | model_name = "test_model_qerooiua.nflwin" 170 | 171 | self.expected_path = os.path.join( 172 | os.path.join( 173 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models") 174 | , model_name) 175 | 176 | assert os.path.isfile(self.expected_path) is False 177 | 178 | instance.save_model(filename=model_name) 179 | 180 | assert os.path.isfile(self.expected_path) is True 181 | 182 | def test_model_load_default(self): 183 | instance = model.WPModel() 184 | model_name = "test_model_asljasljt.nflwin" 185 | instance._default_model_filename = model_name 186 | 187 | self.expected_path = os.path.join( 188 | os.path.join( 189 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models") 190 | , model_name) 191 | 192 | assert os.path.isfile(self.expected_path) is False 193 | 194 | instance.save_model() 195 | 196 | WPModel_class = model.WPModel 197 | WPModel_class._default_model_filename = model_name 198 | 199 | loaded_instance = WPModel_class.load_model() 200 | 201 | assert isinstance(loaded_instance, model.WPModel) 202 | 203 | def test_model_load_specified(self): 204 | instance = model.WPModel() 205 | model_name = "test_model_qerooiua.nflwin" 206 | 207 | self.expected_path = os.path.join( 208 | os.path.join( 209 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models") 210 | , model_name) 211 | 212 | assert os.path.isfile(self.expected_path) is False 213 | 214 | instance.save_model(filename=model_name) 215 | 216 | loaded_instance = model.WPModel.load_model(filename=model_name) 217 | 218 | assert isinstance(loaded_instance, model.WPModel) 219 | 220 | 221 | -------------------------------------------------------------------------------- /nflwin/utilities.py: -------------------------------------------------------------------------------- 1 | """Utility functions that don't fit in the main modules""" 2 | from __future__ import print_function, division 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def connect_nfldb(): 9 | """Connect to the nfldb database. 10 | 11 | Rather than using the builtin method we make our own, 12 | since we're going to use SQLAlchemy as the engine. However, 13 | we can still make use of the information in the nfldb config 14 | file to get information like username and password, which 15 | means this function doesn't need any arguments. 16 | 17 | Parameters 18 | ---------- 19 | None 20 | 21 | Returns 22 | ------- 23 | SQLAlchemy engine object 24 | A connected engine, ready to be used to query the DB. 25 | 26 | Raises 27 | ------ 28 | IOError 29 | If it can't find the config file. 30 | """ 31 | import nfldb 32 | import sqlalchemy as sql 33 | db_config, paths_tried = nfldb.db.config() 34 | if db_config is None: 35 | raise IOError("get_play_data: could not find database config! 
Looked" 36 | " in these places: {0}".format(paths_tried)) 37 | db_config["drivername"] = "postgres" 38 | db_config["username"] = db_config["user"] 39 | del db_config["user"] 40 | del db_config["timezone"] 41 | 42 | engine = sql.create_engine(sql.engine.url.URL(**db_config)) 43 | 44 | return engine 45 | 46 | 47 | def get_nfldb_play_data(season_years=None, season_types=("Regular", "Postseason")): 48 | """Get play-by-play data from the nfldb database. 49 | 50 | We use a specialized query and then postprocessing because, while possible to 51 | do using the objects created by ``nfldb``, it is *orders of magnitude slower*. 52 | This is due to the more general nature of ``nfldb``, which is not really designed 53 | for this kind of data mining. Since we need to get a lot of data in a single way, 54 | it's much simpler to interact at a lower level with the underlying postgres 55 | database. 56 | 57 | 58 | Parameters 59 | ---------- 60 | season_years : list (default=None) 61 | A list of all years to get data for (earliest year in nfldb is 2009). 62 | If ``None``, get data from all available seasons. 63 | season_types : list (default=["Regular", "Postseason"]) 64 | A list of all parts of seasons to get data for (acceptable values are 65 | "Preseason", "Regular", and "Postseason"). If ``None``, get data from 66 | all three season types. 67 | 68 | Returns 69 | ------- 70 | Pandas DataFrame 71 | The play by play data, with the following columns: 72 | 73 | * **gsis_id:** The official NFL GSIS_ID for the game. 74 | * **drive_id:** The id of the drive, starts at 1 and increases by 1 for each new drive. 75 | * **play_id:** The id of the play in ``nfldb``. Note that sequential plays have 76 | increasing but not necessarily sequential values. With ``drive_id`` and ``gsis_id``, 77 | works as a unique identifier for a given play. 78 | * **quarter:** The quarter, prepended with "Q" (e.g. ``Q1`` means the first quarter). 79 | Overtime periods are denoted as ``OT``, ``OT2``, and theoretically ``OT3`` if one were to 80 | ever be played. 81 | * **seconds_elapsed:** seconds elapsed since the start of the quarter. 82 | * **offense_team:** The abbreviation of the team currently with possession of the ball. 83 | * **yardline:** The current field position. Goes from -49 to 49, where negative numbers 84 | indicate that the team with possession is on its own side of the field. 85 | * **down:** The down. kickoffs, extra points, and similar have a down of 0. 86 | * **yards_to_go:** How many yards needed in order to get a first down (or touchdown). 87 | * **home_team:** The abbreviation of the home team. 88 | * **away_team:** The abbreviation of the away team. 89 | * **curr_home_score:** The home team's score at the start of the play. 90 | * **curr_away_score:** The away team's score at the start of the play. 91 | * **offense_won:** A boolean - ``True`` if the offense won the game, ``False`` otherwise. (The 92 | database query skips tied games.) 93 | 94 | Notes 95 | ----- 96 | ``gsis_id``, ``drive_id``, and ``play_id`` are not necessary to make the model, but 97 | are included because they can be useful for computing things like WPA. 
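
    Examples
    --------
    A minimal usage sketch, assuming a working nfldb install and database
    (the seasons and season types queried here are arbitrary)::

        from nflwin.utilities import get_nfldb_play_data

        plays = get_nfldb_play_data(season_years=[2014, 2015],
                                    season_types=["Regular"])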
98 | """ 99 | 100 | engine = connect_nfldb() 101 | 102 | sql_string = _make_nfldb_query_string(season_years=season_years, season_types=season_types) 103 | 104 | plays_df = pd.read_sql(sql_string, engine) 105 | 106 | #Fix yardline, quarter and time elapsed: 107 | def yardline_time_fix(row): 108 | try: 109 | yardline = float(row['yardline'][1:-1]) 110 | except TypeError: 111 | yardline = np.nan 112 | split_time = row['time'].split(",") 113 | return yardline, split_time[0][1:], float(split_time[1][:-1]) 114 | 115 | plays_df[['yardline', 'quarter', 'seconds_elapsed']] = pd.DataFrame(plays_df.apply(yardline_time_fix, axis=1).values.tolist()) 116 | plays_df.drop('time', axis=1, inplace=True) 117 | 118 | #Set NaN downs (kickoffs, etc) to 0: 119 | plays_df['down'] = plays_df['down'].fillna(value=0).astype(np.int8) 120 | 121 | 122 | #Aggregate scores: 123 | plays_df = _aggregate_nfldb_scores(plays_df) 124 | 125 | return plays_df 126 | 127 | def _aggregate_nfldb_scores(play_df): 128 | """Aggregate the raw nfldb data to get the score of every play.""" 129 | 130 | # First, add the yardline of the subsequent play to the df 131 | play_df['next_yardline'] = play_df['yardline'].shift(-1) 132 | 133 | #Set up the dictionary to keep track of things: 134 | curr_home_score = 0 135 | curr_away_score = 0 136 | curr_gsis_id = play_df.iloc[0].gsis_id 137 | argdict = {"curr_home_score": 0, "curr_away_score": 0, "curr_gsis_id": play_df.iloc[0].gsis_id} 138 | 139 | #Define an internal function to actually compute the score of a given play: 140 | def compute_current_scores(play, argdict): 141 | #If new game, set scores to zero: 142 | if play.gsis_id != argdict['curr_gsis_id']: 143 | argdict['curr_home_score'] = 0 144 | argdict['curr_away_score'] = 0 145 | argdict['curr_gsis_id'] = play.gsis_id 146 | 147 | #Get current score at start of play: 148 | home_score_to_return = argdict['curr_home_score'] 149 | away_score_to_return = argdict['curr_away_score'] 150 | 151 | #Check if an extra point is missing from the data: 152 | if play.offense_play_points == 6 and play.next_yardline < 0: 153 | play.offense_play_points += 1 154 | if play.defense_play_points == 6 and play.next_yardline < 0: 155 | play.defense_play_points += 1 156 | 157 | #Update scores, if necessary: 158 | if play.offense_team == play.home_team: 159 | argdict['curr_home_score'] += play.offense_play_points 160 | argdict['curr_away_score'] += play.defense_play_points 161 | else: 162 | argdict['curr_home_score'] += play.defense_play_points 163 | argdict['curr_away_score'] += play.offense_play_points 164 | return home_score_to_return, away_score_to_return 165 | 166 | #Apply function to data: 167 | #TODO (AndrewRook): Make the .apply function go faster, currently it's a large bottleneck 168 | aggregate_scores = play_df.apply(compute_current_scores, axis=1, args=(argdict,)) 169 | aggregate_scores = pd.DataFrame(aggregate_scores.values.tolist()) 170 | play_df[['curr_home_score', 'curr_away_score']] = aggregate_scores 171 | 172 | #Drop unnecessary columns: 173 | play_df.drop(labels=["next_yardline", "offense_play_points", "defense_play_points"], 174 | axis=1, inplace=True) 175 | 176 | return play_df 177 | 178 | 179 | def _make_nfldb_query_string(season_years=None, season_types=None): 180 | """Construct the query string to get all the play data. 181 | 182 | This way is a little more compact and robust than specifying 183 | the string in the function that uses it. 
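
    Parameters
    ----------
    season_years : list (default=None)
        Years to restrict the query to. If ``None``, no year constraint
        is added to the WHERE clause.
    season_types : list (default=None)
        Season types ("Preseason", "Regular", and/or "Postseason") to
        restrict the query to. If ``None``, no season type constraint
        is added.

    Returns
    -------
    string
        The full SQL query, ready to be passed to ``pd.read_sql``,
        ordered by ``gsis_id``, ``drive_id``, and ``play_id``.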
184 | 185 | """ 186 | 187 | play_fields = ['gsis_id', 'drive_id', 'play_id', 188 | 'time', 'pos_team AS offense_team', 'yardline', 'down', 189 | 'yards_to_go'] 190 | 191 | offense_play_points = ("GREATEST(" 192 | "(agg_play.fumbles_rec_tds * 6), " 193 | "(agg_play.kicking_rec_tds * 6), " 194 | "(agg_play.passing_tds * 6), " 195 | "(agg_play.receiving_tds * 6), " 196 | "(agg_play.rushing_tds * 6), " 197 | "(agg_play.kicking_xpmade * 1), " 198 | "(agg_play.passing_twoptm * 2), " 199 | "(agg_play.receiving_twoptm * 2), " 200 | "(agg_play.rushing_twoptm * 2), " 201 | "(agg_play.kicking_fgm * 3)) " 202 | "AS offense_play_points") 203 | defense_play_points = ("GREATEST(" 204 | "(agg_play.defense_frec_tds * 6), " 205 | "(agg_play.defense_int_tds * 6), " 206 | "(agg_play.defense_misc_tds * 6), " 207 | "(agg_play.kickret_tds * 6), " 208 | "(agg_play.puntret_tds * 6), " 209 | "(agg_play.defense_safe * 2)) " 210 | "AS defense_play_points") 211 | 212 | game_fields = ("game.home_team, game.away_team, " 213 | "((game.home_score > game.away_score AND play.pos_team = game.home_team) " 214 | "OR (game.away_score > game.home_score AND play.pos_team = game.away_team)) AS offense_won") 215 | 216 | where_clause = ("WHERE game.home_score != game.away_score " 217 | "AND game.finished = TRUE " 218 | "AND play.pos_team != 'UNK' " 219 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final')") 220 | 221 | if season_years is not None: 222 | where_clause += " AND game.season_year" 223 | if len(season_years) == 1: 224 | where_clause += " = {0}".format(season_years[0]) 225 | else: 226 | where_clause += (" in ({0})" 227 | "".format(",".join([str(year) for year in season_years]))) 228 | if season_types is not None: 229 | where_clause += " AND game.season_type" 230 | if len(season_types) == 1: 231 | where_clause += " = '{0}'".format(season_types[0]) 232 | else: 233 | where_clause += " in ('{0}')".format("','".join(season_types)) 234 | 235 | query_string = "SELECT " 236 | query_string += "play." + ", play.".join(play_fields) 237 | query_string += ", " + offense_play_points 238 | query_string += ", " + defense_play_points 239 | query_string += ", " + game_fields 240 | query_string += " FROM play INNER JOIN agg_play" 241 | query_string += (" ON play.gsis_id = agg_play.gsis_id" 242 | " AND play.drive_id = agg_play.drive_id" 243 | " AND play.play_id = agg_play.play_id") 244 | query_string += " INNER JOIN game on play.gsis_id = game.gsis_id" 245 | query_string += " " + where_clause 246 | query_string += " ORDER BY play.gsis_id, play.drive_id, play.play_id;" 247 | 248 | return query_string 249 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # NFLWin documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Jun 16 22:35:58 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. 
If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | PROJECT_DIRECTORY = os.path.dirname( 22 | os.path.dirname( 23 | os.path.dirname( 24 | os.path.abspath(__file__) 25 | ) 26 | ) 27 | ) 28 | sys.path.insert(0, PROJECT_DIRECTORY) 29 | 30 | # -- General configuration ------------------------------------------------ 31 | 32 | # If your documentation needs a minimal Sphinx version, state it here. 33 | #needs_sphinx = '1.0' 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | extensions = [ 39 | 'sphinx.ext.autodoc', 40 | 'sphinx.ext.doctest', 41 | 'sphinx.ext.viewcode', 42 | 'sphinx.ext.githubpages', 43 | 'sphinx.ext.autosummary', 44 | 'numpydoc', 45 | ] 46 | 47 | #some magic (http://stackoverflow.com/questions/12206334/sphinx-autosummary-toctree-contains-reference-to-nonexisting-document-warnings) to suppress spurious warnings: 48 | numpydoc_show_class_members = False 49 | 50 | # Add any paths that contain templates here, relative to this directory. 51 | templates_path = ['_templates'] 52 | 53 | # The suffix(es) of source filenames. 54 | # You can specify multiple suffix as a list of string: 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The encoding of source files. 59 | #source_encoding = 'utf-8-sig' 60 | 61 | # The master toctree document. 62 | master_doc = 'index' 63 | 64 | # General information about the project. 65 | project = u'NFLWin' 66 | copyright = u'2016, Andrew Schechtman-Rook' 67 | author = u'Andrew Schechtman-Rook' 68 | 69 | # The version info for the project you're documenting, acts as replacement for 70 | # |version| and |release|, also used in various other places throughout the 71 | # built documents. 72 | # 73 | # The short X.Y version. 74 | from nflwin import __version__ 75 | version = __version__ 76 | # The full version, including alpha/beta/rc tags. 77 | release = __version__ 78 | 79 | # The language for content autogenerated by Sphinx. Refer to documentation 80 | # for a list of supported languages. 81 | # 82 | # This is also used if you do content translation via gettext catalogs. 83 | # Usually you set "language" from the command line for these cases. 84 | language = None 85 | 86 | # There are two options for replacing |today|: either, you set today to some 87 | # non-false value, then it is used: 88 | #today = '' 89 | # Else, today_fmt is used as the format for a strftime call. 90 | #today_fmt = '%B %d, %Y' 91 | 92 | # List of patterns, relative to source directory, that match files and 93 | # directories to ignore when looking for source files. 94 | # This patterns also effect to html_static_path and html_extra_path 95 | exclude_patterns = [] 96 | 97 | # The reST default role (used for this markup: `text`) to use for all 98 | # documents. 99 | #default_role = None 100 | 101 | # If true, '()' will be appended to :func: etc. cross-reference text. 102 | #add_function_parentheses = True 103 | 104 | # If true, the current module name will be prepended to all description 105 | # unit titles (such as .. function::). 106 | #add_module_names = True 107 | 108 | # If true, sectionauthor and moduleauthor directives will be shown in the 109 | # output. They are ignored by default. 110 | #show_authors = False 111 | 112 | # The name of the Pygments (syntax highlighting) style to use. 
113 | pygments_style = 'sphinx' 114 | 115 | # A list of ignored prefixes for module index sorting. 116 | #modindex_common_prefix = [] 117 | 118 | # If true, keep warnings as "system message" paragraphs in the built documents. 119 | #keep_warnings = False 120 | 121 | # If true, `todo` and `todoList` produce output, else they produce nothing. 122 | todo_include_todos = False 123 | 124 | 125 | # -- Options for HTML output ---------------------------------------------- 126 | 127 | # The theme to use for HTML and HTML Help pages. See the documentation for 128 | # a list of builtin themes. 129 | html_theme = 'sphinx_rtd_theme' 130 | 131 | # Theme options are theme-specific and customize the look and feel of a theme 132 | # further. For a list of options available for each theme, see the 133 | # documentation. 134 | #html_theme_options = {} 135 | 136 | # Add any paths that contain custom themes here, relative to this directory. 137 | #html_theme_path = [] 138 | 139 | # The name for this set of Sphinx documents. 140 | # " v documentation" by default. 141 | #html_title = u'NFLWin v0.1.0' 142 | 143 | # A shorter title for the navigation bar. Default is the same as html_title. 144 | #html_short_title = None 145 | 146 | # The name of an image file (relative to this directory) to place at the top 147 | # of the sidebar. 148 | #html_logo = None 149 | 150 | # The name of an image file (relative to this directory) to use as a favicon of 151 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 152 | # pixels large. 153 | #html_favicon = None 154 | 155 | # Add any paths that contain custom static files (such as style sheets) here, 156 | # relative to this directory. They are copied after the builtin static files, 157 | # so a file named "default.css" will overwrite the builtin "default.css". 158 | html_static_path = ['_static'] 159 | 160 | # Add any extra paths that contain custom files (such as robots.txt or 161 | # .htaccess) here, relative to this directory. These files are copied 162 | # directly to the root of the documentation. 163 | #html_extra_path = [] 164 | 165 | # If not None, a 'Last updated on:' timestamp is inserted at every page 166 | # bottom, using the given strftime format. 167 | # The empty string is equivalent to '%b %d, %Y'. 168 | #html_last_updated_fmt = None 169 | 170 | # If true, SmartyPants will be used to convert quotes and dashes to 171 | # typographically correct entities. 172 | #html_use_smartypants = True 173 | 174 | # Custom sidebar templates, maps document names to template names. 175 | #html_sidebars = {} 176 | 177 | # Additional templates that should be rendered to pages, maps page names to 178 | # template names. 179 | #html_additional_pages = {} 180 | 181 | # If false, no module index is generated. 182 | #html_domain_indices = True 183 | 184 | # If false, no index is generated. 185 | #html_use_index = True 186 | 187 | # If true, the index is split into individual pages for each letter. 188 | #html_split_index = False 189 | 190 | # If true, links to the reST sources are added to the pages. 191 | #html_show_sourcelink = True 192 | 193 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 194 | #html_show_sphinx = True 195 | 196 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 197 | #html_show_copyright = True 198 | 199 | # If true, an OpenSearch description file will be output, and all pages will 200 | # contain a tag referring to it. 
The value of this option must be the 201 | # base URL from which the finished HTML is served. 202 | #html_use_opensearch = '' 203 | 204 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 205 | #html_file_suffix = None 206 | 207 | # Language to be used for generating the HTML full-text search index. 208 | # Sphinx supports the following languages: 209 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 210 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 211 | #html_search_language = 'en' 212 | 213 | # A dictionary with options for the search language support, empty by default. 214 | # 'ja' uses this config value. 215 | # 'zh' user can custom change `jieba` dictionary path. 216 | #html_search_options = {'type': 'default'} 217 | 218 | # The name of a javascript file (relative to the configuration directory) that 219 | # implements a search results scorer. If empty, the default will be used. 220 | #html_search_scorer = 'scorer.js' 221 | 222 | # Output file base name for HTML help builder. 223 | htmlhelp_basename = 'NFLWindoc' 224 | 225 | # -- Options for LaTeX output --------------------------------------------- 226 | 227 | latex_elements = { 228 | # The paper size ('letterpaper' or 'a4paper'). 229 | #'papersize': 'letterpaper', 230 | 231 | # The font size ('10pt', '11pt' or '12pt'). 232 | #'pointsize': '10pt', 233 | 234 | # Additional stuff for the LaTeX preamble. 235 | #'preamble': '', 236 | 237 | # Latex figure (float) alignment 238 | #'figure_align': 'htbp', 239 | } 240 | 241 | # Grouping the document tree into LaTeX files. List of tuples 242 | # (source start file, target name, title, 243 | # author, documentclass [howto, manual, or own class]). 244 | latex_documents = [ 245 | (master_doc, 'NFLWin.tex', u'NFLWin Documentation', 246 | u'Andrew Schechtman-Rook', 'manual'), 247 | ] 248 | 249 | # The name of an image file (relative to this directory) to place at the top of 250 | # the title page. 251 | #latex_logo = None 252 | 253 | # For "manual" documents, if this is true, then toplevel headings are parts, 254 | # not chapters. 255 | #latex_use_parts = False 256 | 257 | # If true, show page references after internal links. 258 | #latex_show_pagerefs = False 259 | 260 | # If true, show URL addresses after external links. 261 | #latex_show_urls = False 262 | 263 | # Documents to append as an appendix to all manuals. 264 | #latex_appendices = [] 265 | 266 | # If false, no module index is generated. 267 | #latex_domain_indices = True 268 | 269 | 270 | # -- Options for manual page output --------------------------------------- 271 | 272 | # One entry per manual page. List of tuples 273 | # (source start file, name, description, authors, manual section). 274 | man_pages = [ 275 | (master_doc, 'nflwin', u'NFLWin Documentation', 276 | [author], 1) 277 | ] 278 | 279 | # If true, show URL addresses after external links. 280 | #man_show_urls = False 281 | 282 | 283 | # -- Options for Texinfo output ------------------------------------------- 284 | 285 | # Grouping the document tree into Texinfo files. List of tuples 286 | # (source start file, target name, title, author, 287 | # dir menu entry, description, category) 288 | texinfo_documents = [ 289 | (master_doc, 'NFLWin', u'NFLWin Documentation', 290 | author, 'NFLWin', 'One line description of project.', 291 | 'Miscellaneous'), 292 | ] 293 | 294 | # Documents to append as an appendix to all manuals. 295 | #texinfo_appendices = [] 296 | 297 | # If false, no module index is generated. 
298 | #texinfo_domain_indices = True 299 | 300 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 301 | #texinfo_show_urls = 'footnote' 302 | 303 | # If true, do not generate a @detailmenu in the "Top" node's menu. 304 | #texinfo_no_detailmenu = False 305 | 306 | #Run apidoc if inside a ReadTheDocs environment: 307 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 308 | if on_rtd: 309 | os.system("sphinx-apidoc -f -o doc/source nflwin/ nflwin/tests") 310 | -------------------------------------------------------------------------------- /nflwin/tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | try: 4 | import nfldb 5 | nfldb_missing=False 6 | except ImportError: 7 | nfldb_missing=True 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import pytest 12 | 13 | import nflwin.utilities as utils 14 | 15 | class TestGetNFLDBPlayData(object): 16 | """Testing the ability to get play data from nfldb.""" 17 | 18 | #TODO (AndrewRook): Need to test if the sql query actually works 19 | 20 | def setup_method(self, method): 21 | self.test_df = pd.DataFrame({ 22 | 'gsis_id': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 23 | 'drive_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 24 | 'play_id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], 25 | 'time': ["(Q1,0)", "(Q1,152)", "(Q1,354)", "(Q1,354)", "(Q2,0)", 26 | "(OT,840)", "(OT,840)", "(OT2,875)", "(OT3,900)", "(OT,900)"], 27 | 'offense_team': ["HOU", "KC", "KC", "HOU", "HOU", "UNK", "DEN", "DEN", "CAR", "UNK"], 28 | 'yardline': ["(-15)", "(35)", "(-15)", "(-30)", "(-26)", 29 | None, "(48)", "(-15)", "(-18)", None], 30 | 'down': [np.nan, np.nan, np.nan, 1.0, 2.0, np.nan, 1.0, np.nan, 1.0, np.nan], 31 | 'yards_to_go': [0, 0, 0, 10, 6, 0, 2, 0, 10, 0], 32 | 'offense_play_points': [0, 1, 0, 0, 0, 0, 6, 0, 0, 0], 33 | 'defense_play_points': [6, 0, 0, 0, 0, 0, 0, 0, 0, 0], 34 | 'home_team': ["HOU", "HOU", "HOU", "HOU", "HOU", "DEN", "DEN", "DEN", "DEN", "DEN"], 35 | 'away_team': ["KC", "KC", "KC", "KC", "KC", "CAR", "CAR", "CAR", "CAR", "CAR"], 36 | 'offense_won': [False, False, False, False, False, True, True, True, True, True] 37 | }) 38 | 39 | def test_standard_play_mock(self,monkeypatch): 40 | def mockreturn_engine(): 41 | return True 42 | def mockreturn_query_string(season_years, season_types): 43 | return True 44 | def mockreturn_read_sql(sql_string, engine): 45 | return self.test_df 46 | monkeypatch.setattr(utils, 'connect_nfldb', mockreturn_engine) 47 | monkeypatch.setattr(utils, '_make_nfldb_query_string', mockreturn_query_string) 48 | monkeypatch.setattr(pd, 'read_sql', mockreturn_read_sql) 49 | 50 | expected_df = pd.DataFrame({ 51 | 'gsis_id': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 52 | 'drive_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 53 | 'play_id': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5], 54 | 'seconds_elapsed': [0.0, 152.0, 354.0, 354.0, 0.0, 55 | 840.0, 840.0, 875.0, 900.0, 900.0], 56 | 'offense_team': ["HOU", "KC", "KC", "HOU", "HOU", "UNK", "DEN", "DEN", "CAR", "UNK"], 57 | 'yardline': [-15, 35, -15, -30, -26, 58 | np.nan, 48, -15, -18, np.nan], 59 | 'down': [0, 0, 0, 1, 2, 0, 1, 0, 1, 0], 60 | 'yards_to_go': [0, 0, 0, 10, 6, 0, 2, 0, 10, 0], 61 | 'home_team': ["HOU", "HOU", "HOU", "HOU", "HOU", "DEN", "DEN", "DEN", "DEN", "DEN"], 62 | 'away_team': ["KC", "KC", "KC", "KC", "KC", "CAR", "CAR", "CAR", "CAR", "CAR"], 63 | 'offense_won': [False, False, False, False, False, True, True, True, True, True], 64 | 'quarter': ["Q1", "Q1", "Q1", "Q1", "Q2", "OT", "OT", 
"OT2", "OT3", "OT"], 65 | 'curr_home_score': [0, 0, 0, 0, 0, 0, 0, 7, 7, 7], 66 | 'curr_away_score': [0, 6, 7, 7, 7, 0, 0, 0, 0, 0] 67 | }) 68 | expected_df['down'] = expected_df['down'].astype(np.int8) 69 | 70 | pd.util.testing.assert_frame_equal(utils.get_nfldb_play_data().sort_index(axis=1), 71 | expected_df.sort_index(axis=1)) 72 | 73 | @pytest.mark.requires_db 74 | def test_2015_playoffs_query(self): 75 | queried_df = utils.get_nfldb_play_data(season_years=[2015], season_types=["Postseason"]) 76 | expected_df = pd.DataFrame({ 77 | 'gsis_id': ['2016010900', '2016010900', '2016010900', '2016010900', '2016010900'], 78 | 'drive_id': [1, 1, 2, 2, 2], 79 | 'play_id': [36, 54, 70, 88, 109], 80 | 'seconds_elapsed': [0., 11., 11., 11., 45.], 81 | 'offense_team': ['HOU', 'KC', 'KC', 'HOU', 'HOU'], 82 | 'yardline': [-15., 35, -15, -30, -26], 83 | 'down': [0, 0, 0, 1, 2], 84 | 'yards_to_go': [0, 0, 0, 10, 6], 85 | 'home_team': ['HOU', 'HOU', 'HOU', 'HOU', 'HOU'], 86 | 'away_team': ['KC', 'KC', 'KC', 'KC', 'KC'], 87 | 'offense_won': [False, True, True, False, False], 88 | 'quarter': ['Q1', 'Q1', 'Q1', 'Q1', 'Q1'], 89 | 'curr_home_score': [0, 0, 0, 0, 0], 90 | 'curr_away_score': [0, 6, 7, 7, 7] 91 | }) 92 | expected_df['down'] = expected_df['down'].astype(np.int8) 93 | pd.util.testing.assert_frame_equal(queried_df[:5].sort_index(axis=1), 94 | expected_df.sort_index(axis=1), check_column_type=False) 95 | 96 | @pytest.mark.requires_db 97 | def test_2009_regular_season_query(self): 98 | queried_df = utils.get_nfldb_play_data(season_years=[2009], season_types=["Regular"]) 99 | expected_df = pd.DataFrame({ 100 | 'gsis_id': ['2009091000', '2009091000', '2009091000', '2009091000', '2009091000'], 101 | 'drive_id': [1, 1, 1, 1, 1], 102 | 'play_id': [46, 68, 92, 113, 139], 103 | 'seconds_elapsed': [0., 7, 44, 85, 93], 104 | 'offense_team': ['TEN', 'PIT', 'PIT', 'PIT', 'PIT'], 105 | 'yardline': [-20., -8, -3, -6, -6], 106 | 'down': [0, 1, 2, 3, 4], 107 | 'yards_to_go': [0, 10, 5, 8, 8], 108 | 'home_team': ['PIT', 'PIT', 'PIT', 'PIT', 'PIT'], 109 | 'away_team': ['TEN', 'TEN', 'TEN', 'TEN', 'TEN'], 110 | 'offense_won': [False, True, True, True, True], 111 | 'quarter': ['Q1', 'Q1', 'Q1', 'Q1', 'Q1'], 112 | 'curr_home_score': [0, 0, 0, 0, 0], 113 | 'curr_away_score': [0, 0, 0, 0, 0] 114 | }) 115 | expected_df['down'] = expected_df['down'].astype(np.int8) 116 | pd.util.testing.assert_frame_equal(queried_df[:5].sort_index(axis=1), 117 | expected_df.sort_index(axis=1), check_column_type=False) 118 | 119 | @pytest.mark.requires_db 120 | class TestConnectNFLDB(object): 121 | """testing the connect_nfldb function""" 122 | def setup_method(self, method): 123 | self.curr_config_home = nfldb.db._config_home 124 | 125 | def teardown_method(self, method): 126 | nfldb.db._config_home = self.curr_config_home 127 | 128 | def test_no_config_error(self): 129 | nfldb.db._config_home = "/boogaboogabooga" 130 | 131 | with pytest.raises(IOError): 132 | utils.connect_nfldb() 133 | 134 | @pytest.mark.requires_db 135 | def test_engine_works(self): 136 | engine = utils.connect_nfldb() 137 | test_query = ("SELECT play.description " 138 | "from play " 139 | "WHERE play.gsis_id = '2009080950' AND play.play_id=721;") 140 | 141 | plays_df = pd.read_sql(test_query, engine) 142 | 143 | assert (plays_df.iloc[0]['description'] == 144 | u'(6:55) L.White left guard for 3 yards, TOUCHDOWN.') 145 | 146 | class TestMakeNFLDBQueryString(object): 147 | """testing the _make_nfldb_query_string function""" 148 | 149 | def test_no_args(self): 150 | 
expected_string = ("SELECT play.gsis_id, play.drive_id, " 151 | "play.play_id, play.time, play.pos_team AS offense_team, " 152 | "play.yardline, play.down, play.yards_to_go, " 153 | "GREATEST(" 154 | "(agg_play.fumbles_rec_tds * 6), " 155 | "(agg_play.kicking_rec_tds * 6), " 156 | "(agg_play.passing_tds * 6), " 157 | "(agg_play.receiving_tds * 6), " 158 | "(agg_play.rushing_tds * 6), " 159 | "(agg_play.kicking_xpmade * 1), " 160 | "(agg_play.passing_twoptm * 2), " 161 | "(agg_play.receiving_twoptm * 2), " 162 | "(agg_play.rushing_twoptm * 2), " 163 | "(agg_play.kicking_fgm * 3)) AS offense_play_points, " 164 | "GREATEST(" 165 | "(agg_play.defense_frec_tds * 6), " 166 | "(agg_play.defense_int_tds * 6), " 167 | "(agg_play.defense_misc_tds * 6), " 168 | "(agg_play.kickret_tds * 6), " 169 | "(agg_play.puntret_tds * 6), " 170 | "(agg_play.defense_safe * 2)) AS defense_play_points, " 171 | "game.home_team, game.away_team, " 172 | "((game.home_score > game.away_score AND play.pos_team = game.home_team) OR " 173 | "(game.away_score > game.home_score AND play.pos_team = game.away_team))" 174 | " AS offense_won " 175 | "FROM play INNER JOIN agg_play " 176 | "ON play.gsis_id = agg_play.gsis_id " 177 | "AND play.drive_id = agg_play.drive_id " 178 | "AND play.play_id = agg_play.play_id " 179 | "INNER JOIN game on play.gsis_id = game.gsis_id " 180 | "WHERE game.home_score != game.away_score AND game.finished = TRUE " 181 | "AND play.pos_team != 'UNK' " 182 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') " 183 | "ORDER BY play.gsis_id, play.drive_id, play.play_id;") 184 | assert expected_string == utils._make_nfldb_query_string() 185 | 186 | def test_single_year(self): 187 | """Test that adding a single year constraint works""" 188 | expected_substring = ("WHERE game.home_score != game.away_score " 189 | "AND game.finished = TRUE " 190 | "AND play.pos_team != 'UNK' " 191 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') " 192 | "AND game.season_year = 2013") 193 | assert expected_substring in utils._make_nfldb_query_string(season_years=[2013]) 194 | 195 | def test_single_season_type(self): 196 | """Test that adding a single season type constraint works""" 197 | expected_substring = ("WHERE game.home_score != game.away_score " 198 | "AND game.finished = TRUE " 199 | "AND play.pos_team != 'UNK' " 200 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') " 201 | "AND game.season_type = 'Regular'") 202 | assert expected_substring in utils._make_nfldb_query_string(season_types=["Regular"]) 203 | 204 | def test_multiple_year(self): 205 | """Test that adding a multiple year constraint works""" 206 | expected_substring = ("WHERE game.home_score != game.away_score " 207 | "AND game.finished = TRUE " 208 | "AND play.pos_team != 'UNK' " 209 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') " 210 | "AND game.season_year in (2013,2010)") 211 | assert expected_substring in utils._make_nfldb_query_string(season_years=[2013, 2010]) 212 | 213 | def test_multiple_season_type(self): 214 | """Test that adding a single season type constraint works""" 215 | expected_substring = ("WHERE game.home_score != game.away_score " 216 | "AND game.finished = TRUE " 217 | "AND play.pos_team != 'UNK' " 218 | "AND (play.time).phase not in ('Pregame', 'Half', 'Final') " 219 | "AND game.season_type in ('Regular','Postseason'") 220 | assert expected_substring in utils._make_nfldb_query_string(season_types=["Regular", "Postseason"]) 221 | 222 | 223 | class TestAggregateNFLDBScores(object): 224 | 
"""Testing the _aggregate_nfldb_scores function""" 225 | 226 | def test_single_game_offense_points(self): 227 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0], 228 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 229 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'], 230 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'], 231 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'], 232 | 'offense_play_points': [0, 0, 3, 0, 0, 6, 1, 0], 233 | 'defense_play_points': [0, 0, 0, 0, 0, 0, 0, 0] 234 | }) 235 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0], 236 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 237 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'], 238 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'], 239 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'] 240 | }) 241 | #Have to append the score columns manually: 242 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0), 243 | (0, 0), 244 | (0, 0), 245 | (3, 0), 246 | (3, 0), 247 | (3, 0), 248 | (3, 6), 249 | (3, 7),]) 250 | 251 | input_df = utils._aggregate_nfldb_scores(input_df) 252 | pd.util.testing.assert_frame_equal(input_df, expected_df) 253 | 254 | def test_single_game_defense_points(self): 255 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0], 256 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 257 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'], 258 | 'away_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'], 259 | 'home_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'], 260 | 'offense_play_points': [0, 0, 3, 0, 0, 6, 1, 0], 261 | 'defense_play_points': [0, 0, 0, 0, 0, 0, 0, 0] 262 | }) 263 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0], 264 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 265 | 'offense_team': ['KC', 'KC', 'KC', 'KC', 'NE', 'NE', 'NE', 'NE'], 266 | 'away_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'], 267 | 'home_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'] 268 | }) 269 | #Have to append the score columns manually: 270 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0), 271 | (0, 0), 272 | (0, 0), 273 | (0, 3), 274 | (0, 3), 275 | (0, 3), 276 | (6, 3), 277 | (7, 3),]) 278 | 279 | input_df = utils._aggregate_nfldb_scores(input_df) 280 | pd.util.testing.assert_frame_equal(input_df, expected_df) 281 | 282 | def test_multiple_games(self): 283 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 1, 1, 1, 1, 1], 284 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 285 | 'offense_team': ['KC', 'KC', 'KC', 'NYJ', 'NE', 'NE', 'NE', 'NE'], 286 | 'home_team': ['KC', 'KC', 'KC', 'NYJ', 'NYJ', 'NYJ', 'NYJ', 'NYJ'], 287 | 'away_team': ['DEN', 'DEN', 'DEN', 'NE', 'NE', 'NE', 'NE', 'NE'], 288 | 'offense_play_points': [0, 0, 3, 0, 0, 6, 1, 0], 289 | 'defense_play_points': [0, 0, 0, 0, 0, 0, 0, 0] 290 | }) 291 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 1, 1, 1, 1, 1], 292 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 293 | 'offense_team': ['KC', 'KC', 'KC', 'NYJ', 'NE', 'NE', 'NE', 'NE'], 294 | 'home_team': ['KC', 'KC', 'KC', 'NYJ', 'NYJ', 'NYJ', 'NYJ', 'NYJ'], 295 | 'away_team': ['DEN', 'DEN', 'DEN', 'NE', 'NE', 'NE', 'NE', 'NE'] 296 | }) 297 | #Have to append the score columns manually: 298 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0), 299 | (0, 0), 300 | (0, 0), 301 | (0, 0), 302 | (0, 0), 303 | (0, 0), 304 | (0, 6), 305 | (0, 7),]) 306 | 307 | input_df = 
utils._aggregate_nfldb_scores(input_df) 308 | pd.util.testing.assert_frame_equal(input_df, expected_df) 309 | 310 | def test_missing_xp(self): 311 | input_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0], 312 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 313 | 'offense_team': ['KC', 'KC', 'KC', 'NE', 'KC', 'KC', 'KC', 'KC'], 314 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'], 315 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'], 316 | 'offense_play_points': [0, 0, 0, 0, 0, 0, 6, 0], 317 | 'defense_play_points': [0, 0, 6, 0, 0, 0, 0, 0] 318 | }) 319 | expected_df = pd.DataFrame({'gsis_id': [0, 0, 0, 0, 0, 0, 0, 0], 320 | 'yardline': [0, 0, 0, -15, 0, 0, 0, -15], 321 | 'offense_team': ['KC', 'KC', 'KC', 'NE', 'KC', 'KC', 'KC', 'KC'], 322 | 'home_team': ['KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC', 'KC'], 323 | 'away_team': ['NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE', 'NE'] 324 | }) 325 | #Have to append the score columns manually: 326 | expected_df[['curr_home_score', 'curr_away_score']] = pd.DataFrame([(0, 0), 327 | (0, 0), 328 | (0, 0), 329 | (0, 7), 330 | (0, 7), 331 | (0, 7), 332 | (0, 7), 333 | (7, 7),]) 334 | 335 | input_df = utils._aggregate_nfldb_scores(input_df) 336 | pd.util.testing.assert_frame_equal(input_df, expected_df) 337 | 338 | 339 | 340 | 341 | 342 | -------------------------------------------------------------------------------- /nflwin/preprocessing.py: -------------------------------------------------------------------------------- 1 | """Tools to get raw data ready for modeling.""" 2 | from __future__ import print_function, division 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.base import BaseEstimator 8 | from sklearn.preprocessing import OneHotEncoder 9 | from sklearn.utils.validation import NotFittedError 10 | 11 | class ComputeElapsedTime(BaseEstimator): 12 | """Compute the total elapsed time from the start of the game. 13 | 14 | Parameters 15 | ---------- 16 | quarter_colname : string 17 | Which column indicates what quarter it is. 18 | quarter_time_colname : string 19 | Which column indicates how much time has elapsed in the current quarter. 20 | quarter_to_second_mapping : dict (default=``{"Q1": 0, "Q2": 900, "Q3": 1800, "Q4": 2700, 21 | "OT": 3600, "OT2": 4500, "OT3": 5400}``) 22 | What mapping to use between the string values in the quarter column and the seconds they 23 | correspond to. Mostly useful if your data had quarters listed as something like "Quarter 1" 24 | or "q1" instead of the values from ``nfldb``. 25 | total_time_colname : string (default="total_elapsed_time") 26 | What column name to store the total elapsed time under. 27 | copy : boolean (default=True) 28 | Whether to add the new column in place. 29 | """ 30 | def __init__(self, quarter_colname, quarter_time_colname, 31 | quarter_to_second_mapping={"Q1": 0, "Q2": 900, "Q3": 1800, "Q4": 2700, 32 | "OT": 3600, "OT2": 4500, "OT3": 5400}, 33 | total_time_colname="total_elapsed_time", copy=True): 34 | self.quarter_colname = quarter_colname 35 | self.quarter_time_colname = quarter_time_colname 36 | self.quarter_to_second_mapping = quarter_to_second_mapping 37 | self.total_time_colname = total_time_colname 38 | self.copy = copy 39 | 40 | def fit(self, X, y=None): 41 | return self 42 | 43 | 44 | def transform(self, X, y=None): 45 | """Create the new column. 46 | 47 | Parameters 48 | ---------- 49 | X : Pandas DataFrame, of shape(number of plays, number of features) 50 | NFL play data. 
51 | y : Numpy array, with length = number of plays, or None 52 | 1 if the home team won, 0 if not. 53 | (Used as part of Scikit-learn's ``Pipeline``) 54 | 55 | Returns 56 | ------- 57 | X : Pandas DataFrame, of shape(number of plays, number of features + 1) 58 | The input DataFrame, with the new column added. 59 | 60 | Raises 61 | ------ 62 | KeyError 63 | If ``quarter_colname`` or ``quarter_time_colname`` don't exist, or 64 | if ``total_time_colname`` **does** exist. 65 | TypeError 66 | If the total time elapsed is not a numeric column, which typically indicates 67 | that the mapping did not apply to every row. 68 | """ 69 | 70 | if self.quarter_colname not in X.columns: 71 | raise KeyError("ComputeElapsedTime: quarter_colname {0} does not exist in dataset." 72 | .format(self.quarter_colname)) 73 | if self.quarter_time_colname not in X.columns: 74 | raise KeyError("ComputeElapsedTime: quarter_time_colname {0} does not exist in dataset." 75 | .format(self.quarter_time_colname)) 76 | 77 | if self.total_time_colname in X.columns: 78 | raise KeyError("ComputeElapsedTime: total_time_colname {0} already exists in dataset." 79 | .format(self.total_time_colname)) 80 | 81 | if self.copy: 82 | X = X.copy() 83 | 84 | try: 85 | time_elapsed = X[self.quarter_colname].replace(self.quarter_to_second_mapping) + X[self.quarter_time_colname] 86 | except TypeError: 87 | raise TypeError("ComputeElapsedTime: Total time elapsed not numeric. Check your mapping from quarter name to time.") 88 | 89 | X[self.total_time_colname] = time_elapsed.astype(np.int) 90 | 91 | return X 92 | 93 | 94 | class ComputeIfOffenseIsHome(BaseEstimator): 95 | """Determine if the team currently with possession is the home team. 96 | 97 | 98 | Parameters 99 | ---------- 100 | offense_team_colname : string 101 | Which column indicates what team was on offense. 102 | home_team_colname : string 103 | Which column indicates what team was the home team. 104 | offense_home_team_colname : string (default="is_offense_home") 105 | What column to store whether or not the offense was the home team. 106 | copy : boolean (default=True) 107 | Whether to add the new column in place. 108 | """ 109 | def __init__(self, offense_team_colname, 110 | home_team_colname, 111 | offense_home_team_colname="is_offense_home", 112 | copy=True): 113 | self.offense_team_colname = offense_team_colname 114 | self.home_team_colname = home_team_colname 115 | self.offense_home_team_colname = offense_home_team_colname 116 | self.copy = copy 117 | 118 | def fit(self, X, y=None): 119 | return self 120 | 121 | def transform(self, X, y=None): 122 | """Create the new column. 123 | 124 | Parameters 125 | ---------- 126 | X : Pandas DataFrame, of shape(number of plays, number of features) 127 | NFL play data. 128 | y : Numpy array, with length = number of plays, or None 129 | 1 if the home team won, 0 if not. 130 | (Used as part of Scikit-learn's ``Pipeline``) 131 | 132 | Returns 133 | ------- 134 | X : Pandas DataFrame, of shape(number of plays, number of features + 1) 135 | The input DataFrame, with the new column added. 136 | 137 | Raises 138 | ------ 139 | KeyError 140 | If ``offense_team_colname`` or ``home_team_colname`` don't exist, or 141 | if ``offense_home_team_colname`` **does** exist. 142 | """ 143 | 144 | if self.home_team_colname not in X.columns: 145 | raise KeyError("ComputeIfOffenseWon: home_team_colname {0} does not exist in dataset." 
146 | .format(self.home_team_colname)) 147 | if self.offense_team_colname not in X.columns: 148 | raise KeyError("ComputeIfOffenseWon: offense_team_colname {0} does not exist in dataset." 149 | .format(self.offense_team_colname)) 150 | 151 | if self.offense_home_team_colname in X.columns: 152 | raise KeyError("ComputeIfOffenseWon: offense_home_team_colname {0} already exists in dataset." 153 | .format(self.offense_home_team_colname)) 154 | 155 | if self.copy: 156 | X = X.copy() 157 | 158 | X[self.offense_home_team_colname] = (X[self.home_team_colname] == X[self.offense_team_colname]) 159 | 160 | return X 161 | 162 | 163 | class MapToInt(BaseEstimator): 164 | """Map a column of values to integers. 165 | 166 | Mapping to integer is nice if you know a column 167 | only has a few specific values in it, but you need 168 | to convert it to integers before one-hot encoding it. 169 | 170 | Parameters 171 | ---------- 172 | colname : string 173 | The name of the column to perform the mapping on. 174 | copy : boolean (default=True) 175 | If ``False``, apply the mapping in-place. 176 | 177 | Attributes 178 | ---------- 179 | mapping : dict 180 | Keys are the unique values of the column, values are the 181 | integers those values will be mapped to. 182 | 183 | Note 184 | ---- 185 | The ``transform`` method DOES NOT CHECK to see if the input 186 | DataFrame only contains values in ``mapping``. Any values not 187 | in ``mapping`` will be left alone, which can cause subtle bugs 188 | if you're not careful. 189 | """ 190 | 191 | def __init__(self, colname, copy=True): 192 | self.colname = colname 193 | self.copy = copy 194 | self.mapping = None 195 | 196 | def fit(self, X, y=None): 197 | """Find all unique strings and construct the mapping. 198 | 199 | Parameters 200 | ---------- 201 | X : Pandas DataFrame, of shape(number of plays, number of features) 202 | NFL play data. 203 | y : Numpy array, with length = number of plays, or None 204 | 1 if the home team won, 0 if not. 205 | (Used as part of Scikit-learn's ``Pipeline``) 206 | 207 | Returns 208 | ------- 209 | self : For compatibility with Scikit-learn's ``Pipeline``. 210 | 211 | Raises 212 | ------ 213 | KeyError 214 | If ``colname`` is not in ``X``. 215 | 216 | """ 217 | if self.colname not in X.columns: 218 | raise KeyError("MapStringsToInt: Required column {0} " 219 | "not present in data".format(self.colname)) 220 | unique_values = X[self.colname].unique() 221 | 222 | self.mapping = {unique_values[i]: i for i in range(len(unique_values))} 223 | 224 | try: 225 | del self.mapping[np.nan] 226 | except KeyError: 227 | pass 228 | 229 | return self 230 | 231 | def transform(self, X, y=None): 232 | """Apply the mapping to the data. 233 | 234 | Parameters 235 | ---------- 236 | X : Pandas DataFrame, of shape(number of plays, number of features) 237 | NFL play data. 238 | y : Numpy array, with length = number of plays, or None 239 | 1 if the home team won, 0 if not. 240 | (Used as part of Scikit-learn's ``Pipeline``) 241 | 242 | Returns 243 | ------- 244 | X : Pandas DataFrame, of shape(number of plays, number of features) 245 | The input DataFrame, with the mapping applied. 246 | 247 | Raises 248 | ------ 249 | NotFittedError 250 | If ``transform`` is called before ``fit``. 251 | KeyError 252 | If ``colname`` is not in ``X``. 
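
        Examples
        --------
        A small illustrative sketch (the column name and data are made up)::

            import pandas as pd
            from nflwin.preprocessing import MapToInt

            df = pd.DataFrame({"quarter": ["Q1", "Q1", "Q2"]})
            mapper = MapToInt("quarter")
            df = mapper.fit(df).transform(df)  # maps "Q1" -> 0, "Q2" -> 1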
253 | """ 254 | if not self.mapping: 255 | raise NotFittedError("MapStringsToInt: Must fit before transform.") 256 | 257 | if self.colname not in X.columns: 258 | raise KeyError("MapStringsToInt: Required column {0} " 259 | "not present in data".format(self.colname)) 260 | 261 | if self.copy: 262 | X = X.copy() 263 | 264 | X[self.colname].replace(self.mapping, inplace=True) 265 | 266 | return X 267 | 268 | 269 | class OneHotEncoderFromDataFrame(BaseEstimator): 270 | """One-hot encode a DataFrame. 271 | 272 | This cleaner wraps the standard scikit-learn OneHotEncoder, 273 | handling the transfer between column name and column index. 274 | 275 | Parameters 276 | ---------- 277 | categorical_feature_names : "all" or array of column names. 278 | Specify what features are treated as categorical. 279 | * "all" (default): All features are treated as categorical. 280 | * array of column names: Array of categorical feature names. 281 | dtype : number type, default=np.float. 282 | Desired dtype of output. 283 | handle_unknown : str, "error" (default) or "ignore". 284 | Whether to raise an error or ignore if an unknown categorical feature 285 | is present during transform. 286 | copy : boolean (default=True) 287 | If ``False``, apply the encoding in-place. 288 | """ 289 | 290 | @property 291 | def dtype(self): 292 | return self._dtype 293 | @dtype.setter 294 | def dtype(self, dtype): 295 | self._dtype = dtype 296 | self.onehot.dtype = self._dtype 297 | 298 | @property 299 | def handle_unknown(self): 300 | return self._handle_unknown 301 | @handle_unknown.setter 302 | def handle_unknown(self, handle_unknown): 303 | self._handle_unknown = handle_unknown 304 | self.onehot.handle_unknown = self._handle_unknown 305 | 306 | def __init__(self, 307 | categorical_feature_names="all", 308 | dtype=np.float, 309 | handle_unknown="error", 310 | copy=True): 311 | self.onehot = OneHotEncoder(sparse=False, n_values="auto", 312 | categorical_features="all") #We'll subset the DF 313 | self.categorical_feature_names = categorical_feature_names 314 | self.dtype = dtype 315 | self.handle_unknown = handle_unknown 316 | self.copy = copy 317 | 318 | def fit(self, X, y=None): 319 | """Convert the column names to indices, then compute the one hot encoding. 320 | 321 | Parameters 322 | ---------- 323 | X : Pandas DataFrame, of shape(number of plays, number of features) 324 | NFL play data. 325 | y : Numpy array, with length = number of plays, or None 326 | 1 if the home team won, 0 if not. 327 | (Used as part of Scikit-learn's ``Pipeline``) 328 | 329 | Returns 330 | ------- 331 | self : For compatibility with Scikit-learn's ``Pipeline``. 332 | """ 333 | 334 | if self.categorical_feature_names == "all": 335 | self.categorical_feature_names = X.columns 336 | 337 | #Get all columns that need to be encoded: 338 | data_to_encode = X[self.categorical_feature_names] 339 | 340 | 341 | self.onehot.fit(data_to_encode) 342 | 343 | return self 344 | 345 | def transform(self, X, y=None): 346 | """Apply the encoding to the data. 347 | 348 | Parameters 349 | ---------- 350 | X : Pandas DataFrame, of shape(number of plays, number of features) 351 | NFL play data. 352 | y : Numpy array, with length = number of plays, or None 353 | 1 if the home team won, 0 if not. 354 | (Used as part of Scikit-learn's ``Pipeline``) 355 | 356 | Returns 357 | ------- 358 | X : Pandas DataFrame, of shape(number of plays, number of new features) 359 | The input DataFrame, with the encoding applied. 
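        Examples
        --------
        A minimal sketch that pairs the encoder with ``MapToInt``, since the wrapped
        scikit-learn ``OneHotEncoder`` expects integer-valued categories; ``play_df``
        and the column name are illustrative::

            from sklearn.pipeline import Pipeline
            from nflwin import preprocessing

            pipe = Pipeline([("to_int", preprocessing.MapToInt("down")),
                             ("onehot", preprocessing.OneHotEncoderFromDataFrame(
                                 categorical_feature_names=["down"]))])
            pipe.fit(play_df)
            encoded_df = pipe.transform(play_df)  # "down" dropped; onehot_col1, onehot_col2, ... added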
360 | """ 361 | if self.copy: 362 | X = X.copy() 363 | 364 | data_to_transform = X[self.categorical_feature_names] 365 | transformed_data = self.onehot.transform(data_to_transform) 366 | 367 | #TODO (AndrewRook): Find good column names for the encoded columns. 368 | colnames = ["onehot_col{0}".format(i+1) for i in range(transformed_data.shape[1])] 369 | #Create a dataframe from the transformed columns (setting the index is critical for 370 | #merging with data containing non-standard indexes) 371 | transformed_df = pd.DataFrame(transformed_data, columns=colnames, index=X.index) 372 | 373 | X.drop(self.categorical_feature_names, axis=1, inplace=True) 374 | X[transformed_df.columns] = transformed_df 375 | 376 | return X 377 | 378 | 379 | 380 | class CreateScoreDifferential(BaseEstimator): 381 | """Convert offense and defense scores into a differential (offense - defense). 382 | 383 | Parameters 384 | ---------- 385 | home_score_colname : string 386 | The name of the column containing the score of the home team. 387 | away_score_colname : string 388 | The name of the column containing the score of the away team. 389 | offense_home_colname : string 390 | The name of the column indicating if the offense is home. 391 | score_differential_colname : string (default=``"score_differential"``) 392 | The name of column containing the score differential. Must not already 393 | exist in the DataFrame. 394 | copy : boolean (default = ``True``) 395 | If ``False``, add the score differential in place. 396 | """ 397 | def __init__(self, home_score_colname, 398 | away_score_colname, 399 | offense_home_colname, 400 | score_differential_colname="score_differential", 401 | copy=True): 402 | self.home_score_colname = home_score_colname 403 | self.away_score_colname = away_score_colname 404 | self.offense_home_colname = offense_home_colname 405 | self.score_differential_colname = score_differential_colname 406 | self.copy = copy 407 | 408 | def fit(self, X, y=None): 409 | return self 410 | 411 | def transform(self, X, y=None): 412 | """Create the score differential column. 413 | 414 | Parameters 415 | ---------- 416 | X : Pandas DataFrame, of shape(number of plays, number of features) 417 | NFL play data. 418 | y : Numpy array, with length = number of plays, or None 419 | 1 if the home team won, 0 if not. 420 | (Used as part of Scikit-learn's ``Pipeline``) 421 | 422 | Returns 423 | ------- 424 | X : Pandas DataFrame, of shape(number of plays, number of features + 1) 425 | The input DataFrame, with the score differential column added. 426 | """ 427 | try: 428 | score_differential = ((X[self.home_score_colname] - X[self.away_score_colname]) * 429 | (2 * X[self.offense_home_colname] - 1)) 430 | except KeyError: 431 | raise KeyError("CreateScoreDifferential: data missing required column. Must " 432 | "include columns named {0}, {1}, and {2}".format(self.home_score_colname, 433 | self.away_score_colname, 434 | self.offense_home_colname)) 435 | if self.score_differential_colname in X.columns: 436 | raise KeyError("CreateScoreDifferential: column {0} already in DataFrame, and can't " 437 | "be used for the score differential".format(self.score_differential_colname)) 438 | 439 | if self.copy: 440 | X = X.copy() 441 | 442 | X[self.score_differential_colname] = score_differential 443 | 444 | return X 445 | 446 | 447 | 448 | class CheckColumnNames(BaseEstimator): 449 | """Make sure user has the right column names, in the right order. 
450 | 451 | This is a useful first step to make sure that nothing 452 | is going to break downstream, but can also be used effectively 453 | to drop columns that are no longer necessary. 454 | 455 | Parameters 456 | ---------- 457 | column_names : ``None``, or list of strings 458 | A list of column names that need to be present in the scoring 459 | data. All other columns will be stripped out. The order of the 460 | columns will be applied to any scoring 461 | data as well, in order to handle the fact that pandas lets 462 | you play fast and loose with column order. If ``None``, 463 | will obtain every column in the DataFrame passed to the 464 | ``fit`` method. 465 | copy : boolean (default=``True``) 466 | If ``False``, add the score differential in place. 467 | 468 | """ 469 | def __init__(self, column_names=None, copy=True): 470 | self.column_names = column_names 471 | self.copy = copy 472 | self._fit = True 473 | self.user_specified_columns = False 474 | if self.column_names is None: 475 | self._fit = False 476 | else: 477 | self.user_specified_columns = True 478 | 479 | 480 | def fit(self, X, y=None): 481 | """Grab the column names from a Pandas DataFrame. 482 | 483 | Parameters 484 | ---------- 485 | X : Pandas DataFrame, of shape(number of plays, number of features) 486 | NFL play data. 487 | y : Numpy array, with length = number of plays, or None 488 | 1 if the home team won, 0 if not. 489 | (Used as part of Scikit-learn's ``Pipeline``) 490 | 491 | Returns 492 | ------- 493 | self : For compatibility with Scikit-learn's ``Pipeline``. 494 | """ 495 | if not self.user_specified_columns: 496 | self.column_names = X.columns 497 | self._fit = True 498 | 499 | return self 500 | 501 | def transform(self, X, y=None): 502 | """Apply the column ordering to the data. 503 | 504 | Parameters 505 | ---------- 506 | X : Pandas DataFrame, of shape(number of plays, number of features) 507 | NFL play data. 508 | y : Numpy array, with length = number of plays, or None 509 | 1 if the home team won, 0 if not. 510 | (Used as part of Scikit-learn's ``Pipeline``) 511 | 512 | Returns 513 | ------- 514 | X : Pandas DataFrame, of shape(number of plays, ``len(column_names)``) 515 | The input DataFrame, properly ordered and with extraneous 516 | columns dropped 517 | 518 | Raises 519 | ------ 520 | KeyError 521 | If the input data frame doesn't have all the columns specified 522 | by ``column_names``. 523 | NotFittedError 524 | If ``transform`` is called before ``fit``. 525 | """ 526 | if not self._fit: 527 | raise NotFittedError("CheckColumnName: Call 'fit' before 'transform") 528 | 529 | if self.copy: 530 | X = X.copy() 531 | 532 | try: 533 | 534 | return X[self.column_names] 535 | except KeyError: 536 | raise KeyError("CheckColumnName: DataFrame does not have required columns. 
" 537 | "Must contain at least {0}".format(self.column_names)) 538 | -------------------------------------------------------------------------------- /nflwin/model.py: -------------------------------------------------------------------------------- 1 | """Tools for creating and running the model.""" 2 | from __future__ import print_function, division 3 | 4 | import os 5 | 6 | import numpy as np 7 | from scipy import integrate 8 | from scipy import stats 9 | 10 | import joblib 11 | 12 | from sklearn.ensemble import RandomForestClassifier 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.calibration import CalibratedClassifierCV 15 | from sklearn.model_selection import train_test_split, GridSearchCV 16 | from sklearn.metrics import brier_score_loss 17 | from sklearn.neighbors import KernelDensity 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.utils.validation import NotFittedError 20 | 21 | from . import preprocessing, utilities 22 | 23 | class WPModel(object): 24 | """The object that computes win probabilities. 25 | 26 | In addition to holding the model itself, it defines some columns names likely to be 27 | used in the model as parameters to allow other users to more easily figure out which 28 | columns go into the model. 29 | 30 | Parameters 31 | ---------- 32 | copy_data : boolean (default=``True``) 33 | Whether or not to copy data when fitting and applying the model. Running the model 34 | in-place (``copy_data=False``) will be faster and have a smaller memory footprint, 35 | but if not done carefully can lead to data integrity issues. 36 | 37 | Attributes 38 | ---------- 39 | model : A Scikit-learn pipeline (or equivalent) 40 | The actual model used to compute WP. Upon initialization it will be set to 41 | a default model, but can be overridden by the user. 42 | column_descriptions : dictionary 43 | A dictionary whose keys are the names of the columns used in the model, and the values are 44 | string descriptions of what the columns mean. Set at initialization to be the default model, 45 | if you create your own model you'll need to update this attribute manually. 46 | training_seasons : A list of ints, or ``None`` (default=``None``) 47 | If the model was trained using data downloaded from nfldb, a list of the seasons 48 | used to train the model. If nfldb was **not** used, an empty list. If no model 49 | has been trained yet, ``None``. 50 | training_season_types : A list of strings or ``None`` (default=``None``) 51 | Same as ``training_seasons``, except for the portions of the seasons used in training the 52 | model ("Preseason", "Regular", and/or "Postseason"). 53 | validation_seasons : same as ``training_seasons``, but for validation data. 54 | validation_season_types : same as ``training_season_types``, but for validation data. 55 | sample_probabilities : A numpy array of floats or ``None`` (default=``None``) 56 | After the model has been validated, contains the sampled predicted probabilities used to 57 | compute the validation statistic. 58 | predicted_win_percents : A numpy array of floats or ``None`` (default=``None``) 59 | After the model has been validated, contains the actual probabilities in the test 60 | set at each probability in ``sample_probabilities``. 61 | num_plays_used : A numpy array of floats or ``None`` (default=``None``) 62 | After the model has been validated, contains the number of plays used to compute each 63 | element of ``predicted_win_percents``. 
64 | model_directory : string 65 | The directory where all models will be saved to or loaded from. 66 | 67 | """ 68 | model_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models") 69 | _default_model_filename = "default_model.nflwin" 70 | 71 | def __init__(self, 72 | copy_data=True 73 | ): 74 | self.copy_data = copy_data 75 | 76 | self.model = self.create_default_pipeline() 77 | self._training_seasons = None 78 | self._training_season_types = None 79 | self._validation_seasons = None 80 | self._validation_season_types = None 81 | 82 | self._sample_probabilities = None 83 | self._predicted_win_percents = None 84 | self._num_plays_used = None 85 | 86 | 87 | @property 88 | def training_seasons(self): 89 | return self._training_seasons 90 | @property 91 | def training_seasons_types(self): 92 | return self._training_season_types 93 | @property 94 | def validation_seasons(self): 95 | return self._validation_seasons 96 | @property 97 | def validation_seasons_types(self): 98 | return self._validation_season_types 99 | 100 | @property 101 | def sample_probabilities(self): 102 | return self._sample_probabilities 103 | @property 104 | def predicted_win_percents(self): 105 | return self._predicted_win_percents 106 | @property 107 | def num_plays_used(self): 108 | return self._num_plays_used 109 | 110 | def train_model(self, 111 | source_data="nfldb", 112 | training_seasons=(2009, 2010, 2011, 2012, 2013, 2014), 113 | training_season_types=("Regular", "Postseason"), 114 | target_colname="offense_won"): 115 | """Train the model. 116 | 117 | Once a modeling pipeline is set up (either the default or something 118 | custom-generated), historical data needs to be fed into it in order to 119 | "fit" the model so that it can then be used to predict future results. 120 | This method implements a simple wrapper around the core Scikit-learn functionality 121 | which does this. 122 | 123 | The default is to use data from the nfldb database, however that can be changed 124 | to a simple Pandas DataFrame if desired (for instance if you wish to use data 125 | from another source). 126 | 127 | There is no particular output from this function, rather the parameters governing 128 | the fit of the model are saved inside the model object itself. If you want to get an 129 | estimate of the quality of the fit, use the ``validate_model`` method after running 130 | this method. 131 | 132 | Notes 133 | ----- 134 | If you are loading in the default model, **there is no need to re-run this method**. 135 | In fact, doing so will likely result in weird errors and could corrupt the model if you 136 | were to try to save it back to disk. 137 | 138 | Parameters 139 | ---------- 140 | source_data : the string ``"nfldb"`` or a Pandas DataFrame (default=``"nfldb"``) 141 | The data to be used to train the model. If ``"nfldb"``, will query the nfldb 142 | database for the training data (note that this requires a correctly configured 143 | installation of nfldb's database). 144 | training_seasons : list of ints (default=``[2009, 2010, 2011, 2012, 2013, 2014]``) 145 | What seasons to use to train the model if getting data from the nfldb database. 146 | If ``source_data`` is not ``"nfldb"``, this argument will be ignored. 147 | **NOTE:** it is critical not to use all possible data in order to train the 148 | model - some will need to be reserved for a final validation (see the 149 | ``validate_model`` method). A good dataset to reserve 150 | for validation is the most recent one or two NFL seasons. 
151 | training_season_types : list of strings (default=``["Regular", "Postseason"]``) 152 | If querying from the nfldb database, what parts of the seasons to use. 153 | Options are "Preseason", "Regular", and "Postseason". If ``source_data`` is not 154 | ``"nfldb"``, this argument will be ignored. 155 | target_colname : string or integer (default=``"offense_won"``) 156 | The name of the target variable column. 157 | 158 | Returns 159 | ------- 160 | ``None`` 161 | """ 162 | self._training_seasons = [] 163 | self._training_season_types = [] 164 | if isinstance(source_data, str): 165 | if source_data == "nfldb": 166 | source_data = utilities.get_nfldb_play_data(season_years=training_seasons, 167 | season_types=training_season_types) 168 | self._training_seasons = training_seasons 169 | self._training_season_types = training_season_types 170 | else: 171 | raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'") 172 | target_col = source_data[target_colname] 173 | feature_cols = source_data.drop(target_colname, axis=1) 174 | self.model.fit(feature_cols, target_col) 175 | 176 | def validate_model(self, 177 | source_data="nfldb", 178 | validation_seasons=(2015,), 179 | validation_season_types=("Regular", "Postseason"), 180 | target_colname="offense_won"): 181 | """Validate the model. 182 | 183 | Once a modeling pipeline is trained, a different dataset must be fed into the trained model 184 | to validate the quality of the fit. 185 | This method implements a simple wrapper around the core Scikit-learn functionality 186 | which does this. 187 | 188 | The default is to use data from the nfldb database, however that can be changed 189 | to a simple Pandas DataFrame if desired (for instance if you wish to use data 190 | from another source). 191 | 192 | The output of this method is a p value which represents the confidence at which 193 | we can reject the null hypothesis that the model predicts the appropriate win 194 | probabilities. This number is computed by first smoothing the predicted win probabilities of both all test data and 195 | just the data where the offense won with a gaussian `kernel density 196 | estimate `_ 197 | with standard deviation = 0.01. Once the data is smooth, ratios at each percentage point from 1% to 99% are computed (i.e. 198 | what fraction of the time did the offense win when the model says they have a 1% chance of winning, 2% chance, etc.). Each of 199 | these ratios should be well approximated by the binomial distribution, since they are essentially independent (not perfectly 200 | but hopefully close enough) weighted coin flips, giving a p value. From there `Fisher's method `_ 201 | is used to combine the p values into a global p value. A p value close to zero means that the model is unlikely to be 202 | properly predicting the correct win probabilities. A p value close to one, **while not proof that the model is correct**, 203 | means that the model is at least not inconsistent with the hypothesis that it predicts good win probabilities. 204 | 205 | Parameters 206 | ---------- 207 | source_data : the string ``"nfldb"`` or a Pandas DataFrame (default=``"nfldb"``) 208 | The data to be used to train the model. If ``"nfldb"``, will query the nfldb 209 | database for the training data (note that this requires a correctly configured 210 | installation of nfldb's database). 211 | training_seasons : list of ints (default=``[2015]``) 212 | What seasons to use to validate the model if getting data from the nfldb database. 
213 |             If ``source_data`` is not ``"nfldb"``, this argument will be ignored. 214 |             **NOTE:** it is critical not to use the same data to validate the model as was used 215 |             in the fit. Generally a good data set to use for validation is one from a time 216 |             period more recent than was used to train the model. For instance, if the model was trained 217 |             on data from 2009-2014, data from the 2015 season would be a sensible choice to validate the model. 218 |         validation_season_types : list of strings (default=``["Regular", "Postseason"]``) 219 |             If querying from the nfldb database, what parts of the seasons to use. 220 |             Options are "Preseason", "Regular", and "Postseason". If ``source_data`` is not 221 |             ``"nfldb"``, this argument will be ignored. 222 |         target_colname : string or integer (default=``"offense_won"``) 223 |             The name of the target variable column. 224 | 225 |         Returns 226 |         ------- 227 |         tuple of two floats 228 |             ``(max_deviation, residual_area)``: the largest gap between predicted and observed win percentages, and 229 |             the area under the |predicted - observed| residual curve. (The combined p value described above is not returned by the current implementation; that computation is commented out in favor of these statistics.) 230 | 231 |         Raises 232 |         ------ 233 |         NotFittedError 234 |             If the model hasn't been fit. 235 | 236 |         Notes 237 |         ----- 238 |         Probabilities are computed between 1 and 99 percent because a single incorrect prediction at 100% or 0% automatically drives 239 |         the global p value to zero. Since the model is being smoothed this situation can occur even when there are no model predictions 240 |         at those extreme values, and therefore leads to erroneous p values. 241 | 242 |         While it seems reasonable (to me at least), I am not totally certain that this approach is entirely correct. 243 |         It's certainly sub-optimal in that you would ideally reject the null hypothesis that the model predictions 244 |         **aren't** appropriate, but that seems to be a much harder problem (and one that would need much more test 245 |         data to beat down the uncertainties involved). I'm also not sure if using Fisher's method is appropriate here, 246 |         and I wonder if it might be necessary to Monte Carlo this. I would welcome input from others on better ways to do this. 
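        Examples
        --------
        A minimal sketch, assuming a working nfldb installation (the season lists
        are illustrative)::

            from nflwin.model import WPModel

            model = WPModel()
            model.train_model(training_seasons=[2009, 2010, 2011, 2012, 2013])
            max_deviation, residual_area = model.validate_model(validation_seasons=[2014])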
247 | 248 | """ 249 | 250 | if self.training_seasons is None: 251 | raise NotFittedError("Must fit model before validating.") 252 | 253 | self._validation_seasons = [] 254 | self._validation_season_types = [] 255 | if isinstance(source_data, str): 256 | if source_data == "nfldb": 257 | source_data = utilities.get_nfldb_play_data(season_years=validation_seasons, 258 | season_types=validation_season_types) 259 | self._validation_seasons = validation_seasons 260 | self._validation_season_types = validation_season_types 261 | else: 262 | raise ValueError("WPModel: if source_data is a string, it must be 'nfldb'") 263 | 264 | target_col = source_data[target_colname] 265 | feature_cols = source_data.drop(target_colname, axis=1) 266 | predicted_probabilities = self.model.predict_proba(feature_cols)[:,1] 267 | 268 | self._sample_probabilities, self._predicted_win_percents, self._num_plays_used = ( 269 | WPModel._compute_predicted_percentages(target_col.values, predicted_probabilities)) 270 | 271 | #Compute the maximal deviation from a perfect prediction as well as the area under the 272 | #curve of the residual between |predicted - perfect|: 273 | max_deviation, residual_area = self._compute_prediction_statistics(self.sample_probabilities, 274 | self.predicted_win_percents) 275 | return max_deviation, residual_area 276 | 277 | #Compute p-values for each where null hypothesis is that distributions are same, then combine 278 | #them all to make sure data is not inconsistent with accurate predictions. 279 | # combined_pvalue = self._test_distribution(self.sample_probabilities, 280 | # self.predicted_win_percents, 281 | # self.num_plays_used) 282 | 283 | # return combined_pvalue 284 | 285 | @staticmethod 286 | def _compute_prediction_statistics(sample_probabilities, predicted_win_percents): 287 | """Take the KDE'd model estimates, then compute statistics. 288 | 289 | Returns 290 | ------- 291 | A tuple of (``max_deviation``, ``residual_area``), where ``max_deviation`` 292 | is the largest discrepancy between the model and expectation at any WP, 293 | and ``residual_area`` is the total area under the curve of |predicted WP - expected WP|. 294 | """ 295 | abs_deviations = np.abs(predicted_win_percents - sample_probabilities) 296 | max_deviation = np.max(abs_deviations) 297 | residual_area = integrate.simps(abs_deviations, 298 | sample_probabilities) 299 | return (max_deviation, residual_area) 300 | 301 | 302 | def predict_wp(self, plays): 303 | """Estimate the win probability for a set of plays. 304 | 305 | Basically a simple wrapper around ``WPModel.model.predict_proba``, 306 | takes in a DataFrame and then spits out an array of predicted 307 | win probabilities. 308 | 309 | Parameters 310 | ---------- 311 | plays : Pandas DataFrame 312 | The input data to use to make the predictions. 313 | 314 | Returns 315 | ------- 316 | Numpy array, of length ``len(plays)`` 317 | Predicted probability that the offensive team in each play 318 | will go on to win the game. 319 | 320 | Raises 321 | ------ 322 | NotFittedError 323 | If the model hasn't been fit. 324 | """ 325 | if self.training_seasons is None: 326 | raise NotFittedError("Must fit model before predicting WP.") 327 | 328 | return self.model.predict_proba(plays)[:,1] 329 | 330 | 331 | def plot_validation(self, axis=None, **kwargs): 332 | """Plot the validation data. 333 | 334 | Parameters 335 | ---------- 336 | axis : matplotlib.pyplot.axis object or ``None`` (default=``None``) 337 | If provided, the validation line will be overlaid on ``axis``. 
338 | Otherwise, a new figure and axis will be generated and plotted on. 339 | **kwargs 340 | Arguments to ``axis.plot``. 341 | 342 | Returns 343 | ------- 344 | matplotlib.pylot.axis 345 | The axis the plot was made on. 346 | 347 | Raises 348 | ------ 349 | NotFittedError 350 | If the model hasn't been fit **and** validated. 351 | """ 352 | 353 | if self.sample_probabilities is None: 354 | raise NotFittedError("Must validate model before plotting.") 355 | 356 | import matplotlib.pyplot as plt 357 | if axis is None: 358 | axis = plt.figure().add_subplot(111) 359 | axis.plot([0, 100], [0, 100], ls="--", lw=2, color="black") 360 | axis.set_xlabel("Predicted WP") 361 | axis.set_ylabel("Actual WP") 362 | axis.plot(self.sample_probabilities, 363 | self.predicted_win_percents, 364 | **kwargs) 365 | 366 | return axis 367 | 368 | 369 | @staticmethod 370 | def _test_distribution(sample_probabilities, predicted_win_percents, num_plays_used): 371 | """Based off assuming the data at each probability is a Bernoulli distribution.""" 372 | 373 | #Get the p-values: 374 | p_values = [stats.binom_test(np.round(predicted_win_percents[i] * num_plays_used[i]), 375 | np.round(num_plays_used[i]), 376 | p=sample_probabilities[i]) for i in range(len(sample_probabilities))] 377 | combined_p_value = stats.combine_pvalues(p_values)[1] 378 | return(combined_p_value) 379 | 380 | @staticmethod 381 | def _compute_predicted_percentages(actual_results, predicted_win_probabilities): 382 | """Compute the sample percentages from a validation data set. 383 | """ 384 | kde_offense_won = KernelDensity(kernel='gaussian', bandwidth=0.01).fit( 385 | (predicted_win_probabilities[(actual_results == 1)])[:, np.newaxis]) 386 | kde_total = KernelDensity(kernel='gaussian', bandwidth=0.01).fit( 387 | predicted_win_probabilities[:, np.newaxis]) 388 | sample_probabilities = np.linspace(0.01, 0.99, 99) 389 | number_density_offense_won = np.exp(kde_offense_won.score_samples(sample_probabilities[:, np.newaxis])) * np.sum((actual_results)) 390 | number_density_total = np.exp(kde_total.score_samples(sample_probabilities[:, np.newaxis])) * len(actual_results) 391 | number_offense_won = number_density_offense_won * np.sum(actual_results) / np.sum(number_density_offense_won) 392 | number_total = number_density_total * len(actual_results) / np.sum(number_density_total) 393 | predicted_win_percents = number_offense_won / number_total 394 | 395 | return 100.*sample_probabilities, 100.*predicted_win_percents, number_total 396 | 397 | def create_default_pipeline(self): 398 | """Create the default win probability estimation pipeline. 399 | 400 | 401 | Returns 402 | ------- 403 | Scikit-learn pipeline 404 | The default pipeline, suitable for computing win probabilities 405 | but by no means the best possible model. 406 | 407 | This can be run any time a new default pipeline is required, 408 | and either set to the ``model`` attribute or used independently. 
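        Examples
        --------
        A minimal sketch of resetting a model to a freshly built default pipeline::

            from nflwin.model import WPModel

            wp_model = WPModel()
            wp_model.model = wp_model.create_default_pipeline()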
409 | """ 410 | 411 | steps = [] 412 | 413 | offense_team_colname = "offense_team" 414 | home_team_colname = "home_team" 415 | home_score_colname = "curr_home_score" 416 | away_score_colname = "curr_away_score" 417 | down_colname = "down" 418 | quarter_colname = "quarter" 419 | time_colname = "seconds_elapsed" 420 | yardline_colname = "yardline" 421 | yards_to_go_colname="yards_to_go" 422 | 423 | self.column_descriptions = { 424 | offense_team_colname: "Abbreviation for the offensive team", 425 | home_team_colname: "Abbreviation for the home team", 426 | away_score_colname: "Abbreviation for the visiting team", 427 | down_colname: "The current down", 428 | yards_to_go_colname: "Yards to a first down (or the endzone)", 429 | quarter_colname: "The quarter", 430 | time_colname: "Seconds elapsed in the quarter", 431 | yardline_colname: ("The yardline, given by (yards from own goalline - 50). " 432 | "-49 is your own 1 while 49 is the opponent's 1.") 433 | } 434 | 435 | is_offense_home = preprocessing.ComputeIfOffenseIsHome(offense_team_colname, 436 | home_team_colname, 437 | copy=self.copy_data) 438 | steps.append(("compute_offense_home", is_offense_home)) 439 | score_differential = preprocessing.CreateScoreDifferential(home_score_colname, 440 | away_score_colname, 441 | is_offense_home.offense_home_team_colname, 442 | copy=self.copy_data) 443 | steps.append(("create_score_differential", score_differential)) 444 | steps.append(("map_downs_to_int", preprocessing.MapToInt(down_colname, copy=self.copy_data))) 445 | total_time_elapsed = preprocessing.ComputeElapsedTime(quarter_colname, time_colname, copy=self.copy_data) 446 | steps.append(("compute_total_time_elapsed", total_time_elapsed)) 447 | steps.append(("remove_unnecessary_columns", preprocessing.CheckColumnNames( 448 | column_names=[is_offense_home.offense_home_team_colname, 449 | score_differential.score_differential_colname, 450 | total_time_elapsed.total_time_colname, 451 | yardline_colname, 452 | yards_to_go_colname, 453 | down_colname], 454 | copy=self.copy_data))) 455 | steps.append(("encode_categorical_columns", preprocessing.OneHotEncoderFromDataFrame( 456 | categorical_feature_names=[down_colname], 457 | copy=self.copy_data))) 458 | 459 | search_grid = {'base_estimator__penalty': ['l1', 'l2'], 460 | 'base_estimator__C': [0.01, 0.1, 1, 10, 100] 461 | } 462 | base_model = LogisticRegression() 463 | calibrated_model = CalibratedClassifierCV(base_model, cv=2, method="isotonic") 464 | #grid_search_model = GridSearchCV(calibrated_model, search_grid, 465 | # scoring=self._brier_loss_scorer) 466 | steps.append(("compute_model", calibrated_model)) 467 | 468 | pipe = Pipeline(steps) 469 | return pipe 470 | 471 | def save_model(self, filename=None): 472 | """Save the WPModel instance to disk. 473 | 474 | All models are saved to the same place, with the installed 475 | NFLWin library (given by ``WPModel.model_directory``). 476 | 477 | Parameters 478 | ---------- 479 | filename : string (default=None): 480 | The filename to use for the saved model. If this parameter 481 | is not specified, save to the default filename. Note that if a model 482 | already lists with this filename, it will be overwritten. Note also that 483 | this is a filename only, **not** a full path. If a full path is specified 484 | it is likely (albeit not guaranteed) to cause errors. 
485 | 486 | Returns 487 | ------- 488 | ``None`` 489 | """ 490 | 491 | if filename is None: 492 | filename = self._default_model_filename 493 | joblib.dump(self, os.path.join(self.model_directory, filename)) 494 | 495 | @classmethod 496 | def load_model(cls, filename=None): 497 | """Load a saved WPModel. 498 | 499 | Parameters 500 | ---------- 501 | Same as ``save_model``. 502 | 503 | Returns 504 | ------- 505 | ``nflwin.WPModel`` instance. 506 | """ 507 | if filename is None: 508 | filename = cls._default_model_filename 509 | 510 | return joblib.load(os.path.join(cls.model_directory, filename)) 511 | 512 | @staticmethod 513 | def _brier_loss_scorer(estimator, X, y): 514 | """Use the Brier loss to estimate model score. 515 | 516 | For use in GridSearchCV, instead of accuracy. 517 | """ 518 | predicted_positive_probabilities = estimator.predict_proba(X)[:, 1] 519 | return 1. - brier_score_loss(y, predicted_positive_probabilities) 520 | -------------------------------------------------------------------------------- /nflwin/tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from sklearn.utils.validation import NotFittedError 7 | from sklearn.pipeline import Pipeline 8 | 9 | from nflwin import preprocessing 10 | 11 | class TestPipelines(object): 12 | """Testing if pipelining cleaning steps works.""" 13 | def test_map_to_int_to_onehot(self): 14 | fit_df = pd.DataFrame({"quarter": ["Q1", "Q1", "Q1", "Q2", "Q2"]}) 15 | transform_df = fit_df.copy() 16 | 17 | mti = preprocessing.MapToInt("quarter", copy=True) 18 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["quarter"], copy=True) 19 | pipe = Pipeline(steps=[("one", mti), ("two", ohe)]) 20 | pipe.fit(fit_df) 21 | output_df = pipe.transform(transform_df) 22 | 23 | expected_df = pd.DataFrame({"onehot_col1": [1.0, 1, 1, 0, 0], "onehot_col2": [0.0, 0, 0, 1, 1]}) 24 | pd.util.testing.assert_frame_equal(output_df, expected_df) 25 | 26 | class TestComputeElapsedTime(object): 27 | """Testing if we can properly map quarters and time elapsed to a total time elapsed.""" 28 | 29 | def test_bad_quarter_colname_produces_error(self): 30 | input_df = pd.DataFrame({"blahblahblah": ["Q1", "Q2", "Q3", "Q4", "OT"], 31 | "time_elapsed": [200, 0, 50, 850, 40]}) 32 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed") 33 | cet.fit(input_df) 34 | 35 | with pytest.raises(KeyError): 36 | cet.transform(input_df) 37 | 38 | def test_bad_time_elapsed_colname_produces_error(self): 39 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"], 40 | "blahblahblah": [200, 0, 50, 850, 40]}) 41 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed") 42 | cet.fit(input_df) 43 | 44 | with pytest.raises(KeyError): 45 | cet.transform(input_df) 46 | 47 | def test_preexisting_output_colname_produces_error(self): 48 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"], 49 | "time_elapsed": [200, 0, 50, 850, 40], 50 | "total_time_elapsed": [0, 0, 0, 0, 0]}) 51 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed", 52 | total_time_colname="total_time_elapsed") 53 | cet.fit(input_df) 54 | 55 | with pytest.raises(KeyError): 56 | cet.transform(input_df) 57 | 58 | def test_incomplete_quarter_mapping(self): 59 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT1"], 60 | "time_elapsed": [200, 0, 50, 850, 40]}) 
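        # "Q3" is deliberately missing from the mapping passed below, so the quarter column
        # keeps a string value and ComputeElapsedTime.transform() raises the TypeError it
        # uses to flag an incomplete quarter-to-seconds mapping.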
61 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed", 62 | quarter_to_second_mapping={ 63 | "Q1": 0, 64 | "Q2": 900, 65 | "Q4": 2700, 66 | "OT1":3600} ) 67 | cet.fit(input_df) 68 | 69 | with pytest.raises(TypeError): 70 | cet.transform(input_df) 71 | 72 | def test_simple_working_case(self): 73 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"], 74 | "time_elapsed": [200, 0, 50, 850, 40]}) 75 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed") 76 | cet.fit(input_df) 77 | 78 | transformed_df = cet.transform(input_df) 79 | expected_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"], 80 | "time_elapsed": [200, 0, 50, 850, 40], 81 | "total_elapsed_time": [200, 900, 1850, 3550, 3640]}) 82 | pd.util.testing.assert_frame_equal(transformed_df, expected_df) 83 | 84 | def test_inplace_transform(self): 85 | input_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"], 86 | "time_elapsed": [200, 0, 50, 850, 40]}) 87 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed", copy=False) 88 | cet.fit(input_df) 89 | 90 | cet.transform(input_df) 91 | expected_df = pd.DataFrame({"quarter": ["Q1", "Q2", "Q3", "Q4", "OT"], 92 | "time_elapsed": [200, 0, 50, 850, 40], 93 | "total_elapsed_time": [200, 900, 1850, 3550, 3640]}) 94 | pd.util.testing.assert_frame_equal(input_df, expected_df) 95 | 96 | def test_custom_mapping(self): 97 | input_df = pd.DataFrame({"quarter": ["quarter1", "Q2", "Q3", "Q4", "OT1"], 98 | "time_elapsed": [200, 0, 50, 850, 40]}) 99 | cet = preprocessing.ComputeElapsedTime("quarter", "time_elapsed", 100 | quarter_to_second_mapping={ 101 | "quarter1": 0, 102 | "Q2": 500, 103 | "Q3": 1800, 104 | "Q4": 2700, 105 | "OT1":3600}) 106 | cet.fit(input_df) 107 | 108 | transformed_df = cet.transform(input_df) 109 | expected_df = pd.DataFrame({"quarter": ["quarter1", "Q2", "Q3", "Q4", "OT1"], 110 | "time_elapsed": [200, 0, 50, 850, 40], 111 | "total_elapsed_time": [200, 500, 1850, 3550, 3640]}) 112 | pd.util.testing.assert_frame_equal(transformed_df, expected_df) 113 | 114 | 115 | class TestComputeIfOffenseIsHome(object): 116 | """Testing if we can correctly compute if the offense is the home team.""" 117 | 118 | def test_bad_offense_colname_produces_error(self): 119 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"], 120 | "blahblahblah": ["a", "b", "a"]}) 121 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team") 122 | ciow.fit(input_df) 123 | 124 | with pytest.raises(KeyError): 125 | ciow.transform(input_df) 126 | 127 | def test_bad_home_team_colname_produces_error(self): 128 | input_df = pd.DataFrame({"blahblahblah": ["a", "a", "a"], 129 | "offense_team": ["a", "b", "a"]}) 130 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team") 131 | ciow.fit(input_df) 132 | 133 | with pytest.raises(KeyError): 134 | ciow.transform(input_df) 135 | 136 | def test_existing_offense_home_team_colname_produces_error(self): 137 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"], 138 | "offense_team": ["a", "b", "a"]}) 139 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team", 140 | offense_home_team_colname="home_team") 141 | ciow.fit(input_df) 142 | 143 | with pytest.raises(KeyError): 144 | ciow.transform(input_df) 145 | 146 | def test_correct_answer_with_copy(self): 147 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"], 148 | "offense_team": ["a", "b", "a"]}) 149 | expected_input_df = input_df.copy() 150 | expected_transformed_df = pd.DataFrame({"home_team": ["a", 
"a", "a"], 151 | "offense_team": ["a", "b", "a"], 152 | "offense_home_team": [True, False, True]}) 153 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team", 154 | offense_home_team_colname="offense_home_team", 155 | copy=True) 156 | transformed_df = ciow.transform(input_df) 157 | pd.util.testing.assert_frame_equal(input_df.sort_index(axis=1), expected_input_df.sort_index(axis=1)) 158 | pd.util.testing.assert_frame_equal(transformed_df.sort_index(axis=1), expected_transformed_df.sort_index(axis=1)) 159 | 160 | def test_correct_answer_without_copy(self): 161 | input_df = pd.DataFrame({"home_team": ["a", "a", "a"], 162 | "offense_team": ["a", "b", "a"]}) 163 | expected_transformed_df = pd.DataFrame({"home_team": ["a", "a", "a"], 164 | "offense_team": ["a", "b", "a"], 165 | "offense_home_team": [True, False, True]}) 166 | ciow = preprocessing.ComputeIfOffenseIsHome("offense_team", "home_team", 167 | offense_home_team_colname="offense_home_team", 168 | copy=False) 169 | ciow.transform(input_df) 170 | pd.util.testing.assert_frame_equal(input_df.sort_index(axis=1), expected_transformed_df.sort_index(axis=1)) 171 | 172 | 173 | class TestMapToInt(object): 174 | """Testing if the integer mapper works.""" 175 | 176 | def test_fit_bad_colname_produces_error(self): 177 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 178 | "six", "two", "one", "one"]}) 179 | mti = preprocessing.MapToInt("blahblahblah") 180 | 181 | with pytest.raises(KeyError): 182 | mti.fit(input_df) 183 | 184 | 185 | def test_mapping_without_nans(self): 186 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 187 | "six", "two", "one", "one"]}) 188 | mti = preprocessing.MapToInt("one") 189 | mti.fit(input_df) 190 | expected_output = {"one": 0, "two": 1, "four": 2, "six": 3} 191 | assert mti.mapping == expected_output 192 | 193 | def test_mapping_with_nans(self): 194 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 195 | "six", np.nan, "one", "one"]}) 196 | mti = preprocessing.MapToInt("one") 197 | mti.fit(input_df) 198 | expected_output = {"one": 0, "two": 1, "four": 2, "six": 3} 199 | assert mti.mapping == expected_output 200 | 201 | def test_transform_before_fit_produces_error(self): 202 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 203 | "six", "two", "one", "one"]}) 204 | mti = preprocessing.MapToInt("one") 205 | 206 | with pytest.raises(NotFittedError): 207 | mti.transform(input_df) 208 | 209 | def test_transform_bad_colname_produces_error(self): 210 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 211 | "six", "two", "one", "one"]}) 212 | mti = preprocessing.MapToInt("one") 213 | mti.fit(input_df) 214 | transform_df = pd.DataFrame({"blahblahblah": ["one", "two", "one", "four", 215 | "six", "two", "one", "one"]}) 216 | with pytest.raises(KeyError): 217 | mti.transform(transform_df) 218 | 219 | def test_transform_without_nans(self): 220 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 221 | "six", "two", "one", "one"]}) 222 | mti = preprocessing.MapToInt("one") 223 | mti.fit(input_df) 224 | transformed_df = mti.transform(input_df) 225 | expected_df = pd.DataFrame({"one": [0, 1, 0, 2, 3, 1, 0, 0]}) 226 | pd.util.testing.assert_frame_equal(transformed_df, expected_df) 227 | 228 | def test_transform_with_nans(self): 229 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 230 | "six", "two", np.nan, "one"]}) 231 | mti = preprocessing.MapToInt("one") 232 | mti.fit(input_df) 233 | transformed_df = 
mti.transform(input_df) 234 | expected_df = pd.DataFrame({"one": [0, 1, 0, 2, 3, 1, np.nan, 0]}) 235 | pd.util.testing.assert_frame_equal(transformed_df, expected_df) 236 | 237 | def test_transform_inplace(self): 238 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 239 | "six", "two", "one", "one"]}) 240 | mti = preprocessing.MapToInt("one", copy=False) 241 | mti.fit(input_df) 242 | mti.transform(input_df) 243 | expected_df = pd.DataFrame({"one": [0, 1, 0, 2, 3, 1, 0, 0]}) 244 | pd.util.testing.assert_frame_equal(input_df, expected_df) 245 | 246 | def test_transform_copy(self): 247 | input_df = pd.DataFrame({"one": ["one", "two", "one", "four", 248 | "six", "two", "one", "one"]}) 249 | expected_df = input_df.copy() 250 | mti = preprocessing.MapToInt("one", copy=True) 251 | mti.fit(input_df) 252 | transformed_data = mti.transform(input_df) 253 | pd.util.testing.assert_frame_equal(input_df, expected_df) 254 | 255 | 256 | 257 | 258 | class TestOneHotEncoderFromDataFrame(object): 259 | """Testing if the one-hot encoder wrapper works.""" 260 | 261 | def setup_method(self, method): 262 | self.data = pd.DataFrame({"one": [1, 2, 3, 1], 263 | "two": [2, 2, 2, 5], 264 | "three": [0, 5, 0, 5]}) 265 | self.data = self.data[["one", "two", "three"]] 266 | 267 | def test_correct_dtype_passed(self): 268 | ohe = preprocessing.OneHotEncoderFromDataFrame(dtype=np.int) 269 | assert ohe.dtype == np.int 270 | 271 | def test_correct_handle_unknown_string_passed(self): 272 | ohe = preprocessing.OneHotEncoderFromDataFrame(handle_unknown="ignore") 273 | assert ohe.handle_unknown == "ignore" 274 | 275 | def test_encode_all_columns(self): 276 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names="all") 277 | ohe.fit(self.data) 278 | transformed_data = ohe.transform(self.data) 279 | expected_data = pd.DataFrame({"onehot_col1": [1., 0, 0, 1], 280 | "onehot_col2": [0., 1, 0, 0], 281 | "onehot_col3": [0., 0, 1, 0], 282 | "onehot_col4": [1., 1, 1, 0], 283 | "onehot_col5": [0., 0, 0, 1], 284 | "onehot_col6": [1., 0, 1, 0], 285 | "onehot_col7": [0., 1, 0, 1]}) 286 | 287 | pd.util.testing.assert_frame_equal(transformed_data.sort_index(axis=1), 288 | expected_data.sort_index(axis=1)) 289 | 290 | def test_encode_some_columns(self): 291 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"]) 292 | ohe.fit(self.data) 293 | transformed_data = ohe.transform(self.data) 294 | expected_data = pd.DataFrame({"two": [2, 2, 2, 5], 295 | "onehot_col1": [1., 0, 0, 1], 296 | "onehot_col2": [0., 1, 0, 0], 297 | "onehot_col3": [0., 0, 1, 0], 298 | "onehot_col4": [1., 0, 1, 0], 299 | "onehot_col5": [0., 1, 0, 1]}) 300 | 301 | pd.util.testing.assert_frame_equal(transformed_data.sort_index(axis=1), 302 | expected_data.sort_index(axis=1)) 303 | 304 | def test_copy_data_works(self): 305 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"], 306 | copy=True) 307 | ohe.fit(self.data) 308 | transformed_data = ohe.transform(self.data) 309 | expected_data = pd.DataFrame({"one": [1, 2, 3, 1], 310 | "two": [2, 2, 2, 5], 311 | "three": [0, 5, 0, 5]}) 312 | 313 | pd.util.testing.assert_frame_equal(self.data.sort_index(axis=1), 314 | expected_data.sort_index(axis=1)) 315 | 316 | 317 | def test_inplace_transform_works(self): 318 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"], 319 | copy=False) 320 | data = self.data.copy() 321 | ohe.fit(self.data) 322 | ohe.transform(self.data) 323 | expected_data = 
pd.DataFrame({"two": [2, 2, 2, 5], 324 | "onehot_col1": [1., 0, 0, 1], 325 | "onehot_col2": [0., 1, 0, 0], 326 | "onehot_col3": [0., 0, 1, 0], 327 | "onehot_col4": [1., 0, 1, 0], 328 | "onehot_col5": [0., 1, 0, 1]}) 329 | 330 | pd.util.testing.assert_frame_equal(self.data.sort_index(axis=1), 331 | expected_data.sort_index(axis=1)) 332 | 333 | def test_encoding_subset_columns(self): 334 | ohe = preprocessing.OneHotEncoderFromDataFrame(categorical_feature_names=["one", "three"], 335 | copy=True) 336 | shifted_data = self.data[2:] 337 | ohe.fit(shifted_data) 338 | transformed_data = ohe.transform(shifted_data) 339 | self.data = pd.DataFrame({"one": [1, 2, 3, 1], 340 | "two": [2, 2, 2, 5], 341 | "three": [0, 5, 0, 5]}) 342 | expected_data = pd.DataFrame({"two": [2, 5], 343 | "onehot_col1": [0., 1], 344 | "onehot_col2": [1., 0], 345 | "onehot_col3": [1., 0], 346 | "onehot_col4": [0., 1]}, 347 | index=[2, 3]) 348 | print(transformed_data) 349 | print(expected_data) 350 | pd.util.testing.assert_frame_equal(transformed_data.sort_index(axis=1), 351 | expected_data.sort_index(axis=1)) 352 | 353 | 354 | 355 | 356 | class TestCreateScoreDifferential(object): 357 | """Testing if score differentials are properly created.""" 358 | 359 | def test_bad_home_score_colname(self): 360 | csd = preprocessing.CreateScoreDifferential("badcol", "away_score", "offense_home") 361 | data = pd.DataFrame({"home_score": [1, 2, 3, 4], 362 | "away_score": [10, 0, 5, 15], 363 | "offense_home": [True, True, True, True]}) 364 | with pytest.raises(KeyError): 365 | csd.transform(data) 366 | 367 | def test_bad_away_score_colname(self): 368 | csd = preprocessing.CreateScoreDifferential("home_score", "badcol", "offense_home") 369 | data = pd.DataFrame({"home_score": [1, 2, 3, 4], 370 | "away_score": [10, 0, 5, 15], 371 | "offense_home": [True, True, True, True]}) 372 | with pytest.raises(KeyError): 373 | csd.fit(data) 374 | csd.transform(data) 375 | 376 | def test_bad_offense_home_colname(self): 377 | csd = preprocessing.CreateScoreDifferential("home_score", "away_score", "badcol") 378 | data = pd.DataFrame({"home_score": [1, 2, 3, 4], 379 | "away_score": [10, 0, 5, 15], 380 | "offense_home": [True, True, True, True]}) 381 | with pytest.raises(KeyError): 382 | csd.fit(data) 383 | csd.transform(data) 384 | 385 | def test_differential_column_already_exists(self): 386 | csd = preprocessing.CreateScoreDifferential("home_score", 387 | "away_score", 388 | "offense_home", 389 | score_differential_colname="used_col") 390 | data = pd.DataFrame({"home_score": [1, 2, 3, 4], 391 | "away_score": [10, 0, 5, 15], 392 | "offense_home": [True, True, True, True], 393 | "used_col": [0, 0, 0, 0]}) 394 | with pytest.raises(KeyError): 395 | csd.fit(data) 396 | csd.transform(data) 397 | 398 | def test_differential_works_offense_is_home(self): 399 | csd = preprocessing.CreateScoreDifferential("home_score", 400 | "away_score", 401 | "offense_home", 402 | score_differential_colname="score_diff") 403 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 404 | "away_score": [10, 0, 5, 15], 405 | "offense_home": [True, True, True, True]}) 406 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 407 | "away_score": [10, 0, 5, 15], 408 | "offense_home": [True, True, True, True], 409 | "score_diff": [-9, 2, -2, -11]}) 410 | 411 | csd.fit(input_data) 412 | transformed_data = csd.transform(input_data) 413 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1), 414 | transformed_data.sort_index(axis=1)) 415 | 416 | def 
test_differential_works_offense_is_away(self): 417 | csd = preprocessing.CreateScoreDifferential("home_score", 418 | "away_score", 419 | "offense_home", 420 | score_differential_colname="score_diff") 421 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 422 | "away_score": [10, 0, 5, 15], 423 | "offense_home": [False, False, False, False]}) 424 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 425 | "away_score": [10, 0, 5, 15], 426 | "offense_home": [False, False, False, False], 427 | "score_diff": [9, -2, 2, 11]}) 428 | 429 | csd.fit(input_data) 430 | transformed_data = csd.transform(input_data) 431 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1), 432 | transformed_data.sort_index(axis=1)) 433 | 434 | def test_differential_works_offense_is_mix(self): 435 | csd = preprocessing.CreateScoreDifferential("home_score", 436 | "away_score", 437 | "offense_home", 438 | score_differential_colname="score_diff") 439 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 440 | "away_score": [10, 0, 5, 15], 441 | "offense_home": [True, True, False, False]}) 442 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 443 | "away_score": [10, 0, 5, 15], 444 | "offense_home": [True, True, False, False], 445 | "score_diff": [-9, 2, 2, 11]}) 446 | 447 | csd.fit(input_data) 448 | transformed_data = csd.transform(input_data) 449 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1), 450 | transformed_data.sort_index(axis=1)) 451 | 452 | def test_differential_with_copied_data(self): 453 | csd = preprocessing.CreateScoreDifferential("home_score", 454 | "away_score", 455 | "offense_home", 456 | score_differential_colname="score_diff", 457 | copy=True) 458 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 459 | "away_score": [10, 0, 5, 15], 460 | "offense_home": [True, True, True, True]}) 461 | expected_input_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 462 | "away_score": [10, 0, 5, 15], 463 | "offense_home": [True, True, True, True]}) 464 | expected_transformed_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 465 | "away_score": [10, 0, 5, 15], 466 | "offense_home": [True, True, True, True], 467 | "score_diff": [-9, 2, -2, -11]}) 468 | 469 | csd.fit(input_data) 470 | transformed_data = csd.transform(input_data) 471 | pd.util.testing.assert_frame_equal(expected_input_data.sort_index(axis=1), 472 | input_data.sort_index(axis=1)) 473 | pd.util.testing.assert_frame_equal(expected_transformed_data.sort_index(axis=1), 474 | transformed_data.sort_index(axis=1)) 475 | 476 | def test_differential_with_inplace_data(self): 477 | csd = preprocessing.CreateScoreDifferential("home_score", 478 | "away_score", 479 | "offense_home", 480 | score_differential_colname="score_diff", 481 | copy=False) 482 | input_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 483 | "away_score": [10, 0, 5, 15], 484 | "offense_home": [True, True, True, True]}) 485 | expected_data = pd.DataFrame({"home_score": [1, 2, 3, 4], 486 | "away_score": [10, 0, 5, 15], 487 | "offense_home": [True, True, True, True], 488 | "score_diff": [-9, 2, -2, -11]}) 489 | csd.fit(input_data) 490 | csd.transform(input_data) 491 | pd.util.testing.assert_frame_equal(expected_data.sort_index(axis=1), 492 | input_data.sort_index(axis=1)) 493 | 494 | 495 | 496 | 497 | class TestCheckColumnNames(object): 498 | """Testing whether column names are properly checked.""" 499 | 500 | def test_transform_called_before_fit(self): 501 | ccn = preprocessing.CheckColumnNames() 502 | data = pd.DataFrame() 503 | 504 | 
with pytest.raises(NotFittedError): 505 | ccn.transform(data) 506 | 507 | def test_transform_data_has_wrong_columns(self): 508 | ccn = preprocessing.CheckColumnNames() 509 | input_data = pd.DataFrame({"one": [1, 2], 510 | "two": [3, 4]}) 511 | ccn.fit(input_data) 512 | test_data = pd.DataFrame({"one": [1, 2], 513 | "three": [3, 4]}) 514 | 515 | with pytest.raises(KeyError): 516 | ccn.transform(test_data) 517 | 518 | def test_transform_reorders_columns(self): 519 | ccn = preprocessing.CheckColumnNames() 520 | input_data = pd.DataFrame({"one": [1, 2], 521 | "two": [3, 4], 522 | "three": [5, 6]}) 523 | test_data = pd.DataFrame({"one": [7, 8], 524 | "two": [9, 10], 525 | "three": [11, 12]}) 526 | expected_data = test_data.copy() 527 | #Ensure columns are in a particular order: 528 | input_data = input_data[["one", "two", "three"]] 529 | test_data = test_data[["two", "one", "three"]] 530 | expected_data = expected_data[["one", "two", "three"]] 531 | 532 | with pytest.raises(AssertionError): 533 | pd.util.testing.assert_frame_equal(test_data, expected_data) 534 | 535 | ccn.fit(input_data) 536 | pd.util.testing.assert_frame_equal(ccn.transform(test_data), expected_data) 537 | 538 | 539 | def test_transform_drops_unnecessary_columns(self): 540 | ccn = preprocessing.CheckColumnNames() 541 | input_data = pd.DataFrame({"one": [1, 2], 542 | "two": [3, 4], 543 | "three": [5, 6]}) 544 | test_data = pd.DataFrame({"one": [7, 8], 545 | "two": [9, 10], 546 | "three": [11, 12], 547 | "four": [13, 14]}) 548 | expected_data = pd.DataFrame({"one": [7, 8], 549 | "two": [9, 10], 550 | "three": [11, 12]}) 551 | #Ensure columns are in a particular order: 552 | input_data = input_data[["one", "two", "three"]] 553 | expected_data = expected_data[["one", "two", "three"]] 554 | 555 | ccn.fit(input_data) 556 | pd.util.testing.assert_frame_equal(ccn.transform(test_data), expected_data) 557 | 558 | 559 | def test_transform_with_user_specified_colums(self): 560 | ccn = preprocessing.CheckColumnNames(column_names=["c", "b", "a"]) 561 | input_data = pd.DataFrame({"e": [-2, -1, 0], 562 | "a": [1, 2, 3], 563 | "b": [4, 5, 6], 564 | "c": [7, 8, 9], 565 | "d": [10, 11, 12]}) 566 | expected_data = pd.DataFrame({"c": [7, 8, 9], 567 | "b": [4, 5, 6], 568 | "a": [1, 2, 3]}) 569 | expected_data = expected_data[["c", "b", "a"]] 570 | transformed_data = ccn.transform(input_data) 571 | pd.util.testing.assert_frame_equal(expected_data, transformed_data) 572 | --------------------------------------------------------------------------------
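A hedged end-to-end sketch of how the pieces above fit together, using the column names defined in ``WPModel.create_default_pipeline`` (team abbreviations and play values are illustrative, the bundled default model must be installed, and exactly which values are accepted depends on the data it was trained on):

    import pandas as pd
    from nflwin.model import WPModel

    wp = WPModel.load_model()  # loads nflwin/models/default_model.nflwin by default
    plays = pd.DataFrame({"offense_team": ["KC"], "home_team": ["NE"],
                          "curr_home_score": [21], "curr_away_score": [17],
                          "quarter": ["Q4"], "seconds_elapsed": [640],
                          "down": [3], "yards_to_go": [7], "yardline": [15]})
    print(wp.predict_wp(plays))  # array of P(offense wins), one entry per play

Column order does not matter here: the ``CheckColumnNames`` step in the default pipeline reorders the columns and drops any extras before the classifier sees them.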